/**
 * Implementation of Newton-Schulz iteration
 * @author Doug James
 */
static class FSchulz3f
{
    Matrix3f S       = new Matrix3f();
    Matrix3f invUEst = new Matrix3f();
    Matrix3f U0      = new Matrix3f();// U(t)   (symmetric estimate)

    FSchulz3f() { reset();  }
    
    private void reset() {  U0.setIdentity();   }

    /** ESTIMATE! */
    public void getRotation(Matrix3f R) { R.set(S);  }

    /** Tracks rotation of "A" using a temporally coherent Newton-Schulz
     * estimate.
     * @return error estimate if enabled.
     */
    public float updateFast(Matrix3f A)
    {
        /// INVERT LAST STRETCH USING SYMMETRY:
        invUEst.set(U0);
        fastSymmetricInverse(invUEst);

        /// REMOVE STRETCH ESTIMATE:
        S.mul(A, invUEst); /// NOTE: A*symMtx could be optimized slightly <skip>

        /// NEWTON-SCHULZ ITERATION:  S <--- S + S*(I - S'*S)/2 = S*(3I - S'S)/2
        fastSchulzUpdate(S);
        fastSchulzUpdate(S);
        fastSchulzUpdate(S);/// hardcode 3 iterates

        /// COMPUTE NEW STRETCH ESTIMATE (EXPLOIT SYMMETRY):
        fastSymmetric_mulTransposeLeft(U0, S, A); /// symm(S'*A)

        /// RETURN SQUARED ERROR ESTIMATE:  ||I-S^T*S||_F^2
        return fastSchulzErrorSq(S);
    }

    /** ||I-S^T*S||_F^2 */
    static float fastSchulzErrorSq(Matrix3f S)
    {
        float t1 = S.m00*S.m00;
        float t2 = S.m10*S.m10;
        float t3 = S.m20*S.m20;
        float t5 = (t1+t2+t3- 1.f);
        float t6 = t5*t5;
        float t11 = (S.m00*S.m01+S.m10*S.m11+S.m20*S.m21);
        float t12 = t11*t11;
        float t18 = (S.m00*S.m02+S.m10*S.m12+S.m20*S.m22);
        float t19 = t18*t18;
        float t21 = S.m01*S.m01;
        float t22 = S.m11*S.m11;
        float t23 = S.m21*S.m21;
        float t25 = (t21+t22+t23-1.f);
        float t26 = t25*t25;
        float t31 = (S.m01*S.m02+S.m11*S.m12+S.m21*S.m22 );
        float t32 = t31*t31;
        float t34 = S.m02*S.m02;
        float t35 = S.m12*S.m12;
        float t36 = S.m22*S.m22;
        float t38 = (t34+t35+t36-1.f);
        float t39 = t38*t38;
        float t40 = t6+2.f*t12+2.f*t19+t26+2.f*t32+t39;
        return t40;  ///wasn't that easy ;)
    }

    /** Fast implementation of: U = symmetric( S^T A ). */
    static void fastSymmetric_mulTransposeLeft(Matrix3f U, Matrix3f S, Matrix3f A)
    {
        U.m00 = S.m00*A.m00+S.m10*A.m10+S.m20*A.m20;
        U.m01 = S.m00*A.m01+S.m10*A.m11+S.m20*A.m21;
        U.m02 = S.m00*A.m02+S.m10*A.m12+S.m20*A.m22;
        U.m10 = U.m01; 
        U.m11 = S.m01*A.m01+S.m11*A.m11+S.m21*A.m21;
        U.m12 = S.m01*A.m02+S.m11*A.m12+S.m21*A.m22;
        U.m20 = U.m02; 
        U.m21 = U.m12; 
        U.m22 = S.m02*A.m02+S.m12*A.m12+S.m22*A.m22;
    }

    /**
     * Fast (24-multiply) inverse of a symmetric 3x3 matrix using
     * Maple codegen , C(inverse(A),optimized). No checks for
     * symmetry are done--only upper triangular block is used.
     * @param A Input matrix that is overwritten by inv(A).
     * @return det(A) for singularity detection
     */
    static float fastSymmetricInverse(Matrix3f A)
    {
        /// COPY A.mij fields since must overwrite A incrementally:
        float m00 = A.m00;
        float m01 = A.m01;
        float m02 = A.m02;
        float m11 = A.m11;
        float m12 = A.m12;
        float m22 = A.m22;

        float t2  = m12*m12;
        float t4  = m00*m11;
        float t7  = m01*m01;
        float t9  = m01*m02;
        float t12 = m02*m02;
        float det = t4*m22-m00*t2-t7*m22+2.f*t9*m12-t12*m11;
        float t15 = 1.f/det;
        float t20 = (m01*m22-m02*m12)*t15;
        float t24 = (m01*m12-m02*m11)*t15;
        float t30 = (m00*m12-t9)*t15;
        A.m00 = (m11*m22-t2)*t15;
        A.m01 = -t20;
        A.m02 = t24;
        A.m10 = -t20;
        A.m11 = (m00*m22-t12)*t15;
        A.m12 = -t30;
        A.m20 = t24;
        A.m21 = -t30;
        A.m22 = (t4-t7)*t15;
        return det;
    }

    static void fastSchulzUpdate(Matrix3f S)
    {
        /// COPY S.mij FIELDS SINCE OVERWRITING S INCREMENTALLY:
        float s00 = S.m00;  float s01 = S.m01;  float s02 = S.m02;
        float s10 = S.m10;  float s11 = S.m11;  float s12 = S.m12;
        float s20 = S.m20;  float s21 = S.m21;  float s22 = S.m22;

        float t1  = s00*s00;
        float t3  = s10*s10;
        float t5  = s20*s20;
        float t7  = 1.5f-0.5f*t1-0.5f*t3-0.5f*t5;
        float t15 = -0.5f*s00*s01-0.5f*s10*s11-0.5f*s20*s21;
        float t23 = -0.5f*s00*s02-0.5f*s10*s12-0.5f*s20*s22;
        float t27 = s01*s01;
        float t29 = s11*s11;
        float t31 = s21*s21;
        float t33 = 1.5f-0.5f*t27-0.5f*t29-0.5f*t31;
        float t41 = -0.5f*s01*s02-0.5f*s11*s12-0.5f*s21*s22;
        float t46 = s02*s02;
        float t48 = s12*s12;
        float t50 = s22*s22;
        float t52 = 1.5f-0.5f*t46-0.5f*t48-0.5f*t50;
        S.m00 = s00*t7+s01*t15+s02*t23;
        S.m01 = s00*t15+s01*t33+s02*t41;
        S.m02 = s00*t23+s01*t41+s02*t52;
        S.m10 = s10*t7+s11*t15+s12*t23;
        S.m11 = s10*t15+s11*t33+s12*t41;
        S.m12 = s10*t23+s11*t41+s12*t52;
        S.m20 = s20*t7+s21*t15+s22*t23;
        S.m21 = s20*t15+s21*t33+s22*t41;
        S.m22 = s20*t23+s21*t41+s22*t52;
}
