Camera pose estimation giving wrong results

I am trying to estimate relative camera movement based on matching points in two different images. Much like described here: Camera pose estimation: How do I interpret rotation and translation matrices?

But the estimated translation and rotation do not make sense.

I use synthetic input to make sure all points are valid and perfectly positioned.

10 x 10 x 10 points evenly spread within a cube. (Cube plotted with blue front face, red back face, lighter top and darker bottom)

zeroProjection Camera is in front of the cube pointing into to the front face.

rotate90projection Camera to the left of the cube pointing into the left side face.

I plot the two projections. You can easily verify visually that the camera has panned 90 degrees and moved diagonally in the x-z plane between the two projections.

In the code, the rotation (in degrees) is given as (0, -90, 0)

The translation is (0.7071, 0, 0.7071), camera movement distance is exactly 1.

I then do findEssentialMat() and recoverPose() on the 2d point sets to get translation and rotation estimates.

I expect to see the same translation and rotation I used to generate the images, but the estimates are completely wrong:

rotation estimate: (-74.86565284711004, -48.52201867665918, 121.26023708879158)
translation estimate: [[0.96576997]
 [0.17203598]
 [0.19414426]]

How can recover the actual (0, -90, 0), (0.7071, 0, 0,7071) transformation?

Complete code that displays the two cube images and prints out estimates:

import cv2
import numpy as np
import math


def cameraMatrix(f, w, h):
    return np.array([
                     [f, 0, w/2],
                     [0, f, h/2],
                     [0, 0, 1]])


n = 10
f = 300
w = 640
h = 480
K = cameraMatrix(f, w, h)


def cube(x=0, y=0, z=0, radius=1):
    c = np.zeros((n * n * n, 3), dtype=np.float32)
    for i in range(0, n):
        for j in range(0, n):
            for k in range(0, n):
                index = i + j * n + k * n * n
                c[index] = [i, j, k]
    c = 2 * c / (n - 1) - 1
    c *= radius
    c += [x, y, z]
    return c


def project3dTo2dArray(points3d, K, rotation, translation):
    imagePoints, _ = cv2.projectPoints(points3d,
                                       rotation,
                                       translation,
                                       K,
                                       np.array([]))
    p2d = imagePoints.reshape((imagePoints.shape[0],2))
    return p2d


def estimate_pose(projectionA, projectionB):
    E, _ = cv2.findEssentialMat(projectionA, projectionB, focal = f)
    _, r, t, _ = cv2.recoverPose(E,  projectionA, projectionB)
    angles, _, _, _, _, _ = cv2.RQDecomp3x3(r)
    print('rotation estimate:', angles)
    print('translation estimate:', t)


def main():
    c = cube(0, 0, math.sqrt(.5), 0.1)
    rotation = np.array([[0], [0], [0]], dtype=np.float32)
    translation = np.array([[0], [0], [0]], dtype=np.float32)
    zeroProjection = project3dTo2dArray(c, K, rotation, translation)
    displayCube(w, h, zeroProjection)

    rotation = np.array([[0], [-90], [0]], dtype=np.float32)
    translation = np.array([[math.sqrt(.5)], [0], [math.sqrt(.5)]], dtype=np.float32)
    print('applying rotation: ', rotation)
    print('applying translation: ', translation)
    rotate90projection = project3dTo2dArray(c, K, rotation * math.pi / 180, translation)
    displayCube(w, h, rotate90projection)

    estimate_pose(zeroProjection, rotate90projection)


def displayCube(w, h, points):
    img = np.zeros((h, w, 3), dtype=np.uint8)

    plotCube(img, points)

    cv2.imshow('img', img)
    k = cv2.waitKey(0) & 0xff
    if k == ord('q'):
        exit(0)


def plotCube(img, points):
    # Red back face
    cv2.line(img, tuple(points[n*n*(n-1)]),         tuple(points[n*n*(n-1)+n-1]),         (0, 0, 255), 2)
    cv2.line(img, tuple(points[n*n*(n-1)+n*(n-1)]), tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (0, 0, 128), 2)
    cv2.line(img, tuple(points[n*n*(n-1)]),         tuple(points[n*n*(n-1)+n*(n-1)]),     (0, 0, 200), 2)
    cv2.line(img, tuple(points[n*n*(n-1)+n-1]),     tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (0, 0, 200), 2)

    # gray connectors
    cv2.line(img, tuple(points[0]), tuple(points[n*n*(n-1)]), (150, 150, 150), 2)
    cv2.line(img, tuple(points[n-1]), tuple(points[n*n*(n-1)+n-1]), (150, 150, 150), 2)
    cv2.line(img, tuple(points[n*(n-1)]), tuple(points[n*n*(n-1)+n*(n-1)]), (100, 100, 100), 2)
    cv2.line(img, tuple(points[n*(n-1)+n-1]), tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (100, 100, 100), 2)

    # Blue front face
    cv2.line(img, tuple(points[0]),       tuple(points[n-1]),         (255, 0, 0), 2)
    cv2.line(img, tuple(points[n*(n-1)]), tuple(points[n*(n-1)+n-1]), (128, 0, 0), 2)
    cv2.line(img, tuple(points[0]),       tuple(points[n*(n-1)]),     (200, 0, 0), 2)
    cv2.line(img, tuple(points[n-1]),     tuple(points[n*(n-1)+n-1]), (200, 0, 0), 2)


main()

Solution

Turned out to be a few minor bugs in my code (like wrong principal point). Working code below shows 3 images.

First is a cube displayed in front of the camera. Second is the same cube, but different projection. Camera has been moved 1 unit and rotated around all 3 axes. Camera translation and rotation is estimated from the two projections. Third shows the cube projected using the rotation and translation estimates.

The code works since the second and third images are similar.

import cv2
import numpy as np
import math


def cameraMatrix(f, w, h):
    return np.array([
                     [f, 0, w/2],
                     [0, f, h/2],
                     [0, 0, 1]])


n = 10
f = 300
w = 640
h = 480
K = cameraMatrix(f, w, h)


def cube(x=0, y=0, z=0, radius=1):
    c = np.zeros((n * n * n, 3), dtype=np.float32)
    for i in range(0, n):
        for j in range(0, n):
            for k in range(0, n):
                index = i + j * n + k * n * n
                c[index] = [i, j, k]
    c = 2 * c / (n - 1) - 1
    c *= radius
    c += [x, y, z]
    return c


def project3dTo2dArray(points3d, K, rotation, translation):
    imagePoints, _ = cv2.projectPoints(points3d,
                                       rotation,
                                       translation,
                                       K,
                                       np.array([]))
    p2d = imagePoints.reshape((imagePoints.shape[0],2))
    return p2d


def estimate_pose(projectionA, projectionB):
    principal_point = (w/2,h/2)
    E, m = cv2.findEssentialMat(projectionA, projectionB, focal = f, pp = principal_point, method=cv2.RANSAC, threshold=1, prob=0.999)
    _, r, t, _ = cv2.recoverPose(E,  projectionA, projectionB, focal = f, pp = principal_point, mask = m)
    angles, _, _, _, _, _ = cv2.RQDecomp3x3(r)
    return angles, t


def main():
    c = cube(0, 0, math.sqrt(.5), 0.1)
    rotation = np.array([[0], [0], [0]], dtype=np.float32)
    translation = np.array([[0], [0], [0]], dtype=np.float32)
    zeroProjection = project3dTo2dArray(c, K, rotation, translation)
    displayCube(w, h, zeroProjection)

    rotation = np.array([[10], [-30], [5]], dtype=np.float32)
    translation = np.array([[math.sqrt(.7)], [0], [math.sqrt(.3)]], dtype=np.float32)

    print('applying rotation: ', rotation)
    print('applying translation: ', translation)
    movedprojection = project3dTo2dArray(c, K, rotation * math.pi / 180, translation)
    displayCube(w, h, movedprojection)

    estRot, estTra= estimate_pose(zeroProjection, movedprojection)
    print('rotation estimate:', estRot)
    print('translation estimate:', estTra)

    rotation = np.array([[estRot[0]], [estRot[1]], [estRot[2]]], dtype=np.float32)
    translation = np.array([[estTra[0]], [estTra[1]], [estTra[2]]], dtype=np.float32)
    estimateProjection = project3dTo2dArray(c, K, rotation * math.pi / 180, translation)
    displayCube(w, h, estimateProjection)


def displayCube(w, h, points):
    img = np.zeros((h, w, 3), dtype=np.uint8)

    plotCube(img, points)

    cv2.imshow('img', img)
    k = cv2.waitKey(0) & 0xff
    if k == ord('q'):
        exit(0)


def plotCube(img, points):
    # Red back face
    cv2.line(img, tuple(points[n*n*(n-1)]),         tuple(points[n*n*(n-1)+n-1]),         (0, 0, 255), 2)
    cv2.line(img, tuple(points[n*n*(n-1)+n*(n-1)]), tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (0, 0, 128), 2)
    cv2.line(img, tuple(points[n*n*(n-1)]),         tuple(points[n*n*(n-1)+n*(n-1)]),     (0, 0, 200), 2)
    cv2.line(img, tuple(points[n*n*(n-1)+n-1]),     tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (0, 0, 200), 2)

    # gray connectors
    cv2.line(img, tuple(points[0]), tuple(points[n*n*(n-1)]), (150, 150, 150), 2)
    cv2.line(img, tuple(points[n-1]), tuple(points[n*n*(n-1)+n-1]), (150, 150, 150), 2)
    cv2.line(img, tuple(points[n*(n-1)]), tuple(points[n*n*(n-1)+n*(n-1)]), (100, 100, 100), 2)
    cv2.line(img, tuple(points[n*(n-1)+n-1]), tuple(points[n*n*(n-1)+n*(n-1)+n-1]), (100, 100, 100), 2)

    # Blue front face
    cv2.line(img, tuple(points[0]),       tuple(points[n-1]),         (255, 0, 0), 2)
    cv2.line(img, tuple(points[n*(n-1)]), tuple(points[n*(n-1)+n-1]), (128, 0, 0), 2)
    cv2.line(img, tuple(points[0]),       tuple(points[n*(n-1)]),     (200, 0, 0), 2)
    cv2.line(img, tuple(points[n-1]),     tuple(points[n*(n-1)+n-1]), (200, 0, 0), 2)


main()