/*
==========================================================================================
 Cg Acceleration Research

 Optimized:  Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
 Original:   Eugene Lee (el77 [at] cornell [dot] edu)
------------------------------------------------------------------------------------------
 Reachability.cg

 Computes 5x5 reachability.
==========================================================================================
*/


/**
 * Output of the vertex programs in this file and input of the fragment
 * programs: the clip-space position plus a set of precomputed sampling
 * coordinates.
 *
 * Every half4 in texCoords carries two 2D coordinates, one in .xy and one in
 * .zw, so the eight entries give room for up to sixteen sample positions.
 * Each vertex program fills only the entries its fragment program reads.
 */
struct vertexInfo {
    float4 pos              : POSITION;     // Clip-space vertex position
    half4  texCoords[8];    // Interpolated sample coordinates, two (x, y) pairs per half4
};



/**
 * Vertex program paired with NeighborReach.
 *
 * Transforms the vertex into clip space and emits the texture coordinates of
 * the left, right and (0, +1) neighbours of the current texel so that the
 * fragment stage receives them already interpolated.
 *
 * Cost reported by the author: 8 instructions, 1 R-reg.
 *
 * @param ModelViewProj  model-view-projection matrix (state.matrix.mvp)
 * @param uv             texture coordinate of the vertex
 * @param pos            object-space vertex position
 * @param OUT            clip-space position and neighbour coordinates
 */
void NeighborReachVert( 
                uniform float4x4 ModelViewProj      : state.matrix.mvp,
                in  half2 uv                        : TEXCOORD0,
                in  float4 pos                      : POSITION,
                out vertexInfo OUT) 
{
    // Rectangle textures are addressed in texels (coordinates are not
    // normalized), so the neighbour offsets are plain integers.
    const half4 center = uv.xyxy;

    // Vertex position in clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Two coordinates per half4: .xy = (-1, 0), .zw = (1, 0)
    OUT.texCoords[0]    = center + half4(-1, 0,  1, 0);

    // .xy = (0, 1); .zw is left unwritten
    OUT.texCoords[1].xy = uv + half2(0, 1);
}



/**
 * Vertex program paired with Reachability.
 *
 * Transforms the vertex into clip space and precomputes the fifteen sampling
 * coordinates of a 5-wide, 3-tall window (x offsets -2..2, y offsets 0..2)
 * around the current texel, packed two per half4.
 *
 * Cost reported by the author: 19 instructions, 2 R-regs.
 *
 * @param ModelViewProj  model-view-projection matrix (state.matrix.mvp)
 * @param uv             texture coordinate of the vertex
 * @param pos            object-space vertex position
 * @param OUT            clip-space position and window coordinates
 */
void ReachabilityVert(  
                uniform float4x4 ModelViewProj      : state.matrix.mvp,
                in  half2 uv                        : TEXCOORD0,
                in  float4 pos                      : POSITION,
                out vertexInfo OUT) 
{
    // Rectangle textures are addressed in texels (coordinates are not
    // normalized), so the window offsets are plain integers.
    const half4 center = uv.xyxy;

    // Vertex position in clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Row y + 0
    OUT.texCoords[0]    = center + half4(-2, 0, -1, 0);     // (-2, 0) (-1, 0)
    OUT.texCoords[1].xy = uv;                               // ( 0, 0)
    OUT.texCoords[1].zw = uv + half2( 1, 0);                // ( 1, 0)

    // Last texel of row y + 0 shares a register with the first of row y + 1
    OUT.texCoords[2]    = center + half4( 2, 0, -2, 1);     // ( 2, 0) (-2, 1)

    // Row y + 1
    OUT.texCoords[3]    = center + half4(-1, 1,  0, 1);     // (-1, 1) ( 0, 1)
    OUT.texCoords[4]    = center + half4( 1, 1,  2, 1);     // ( 1, 1) ( 2, 1)

    // Row y + 2
    OUT.texCoords[5]    = center + half4(-2, 2, -1, 2);     // (-2, 2) (-1, 2)
    OUT.texCoords[6]    = center + half4( 0, 2,  1, 2);     // ( 0, 2) ( 1, 2)
    OUT.texCoords[7].xy = uv + half2( 2, 2);                // ( 2, 2)
}




/**
 * Vertex program paired with CopyReachability.
 *
 * Transforms the vertex into clip space and precomputes the ten sampling
 * coordinates of the two rows above the current texel (x offsets -2..2,
 * y offsets -2 and -1), packed two per half4.
 *
 * Cost reported by the author: 17 instructions, 2 R-regs.
 *
 * @param ModelViewProj  model-view-projection matrix (state.matrix.mvp)
 * @param uv             texture coordinate of the vertex
 * @param pos            object-space vertex position
 * @param OUT            clip-space position and window coordinates
 */
void CopyReachabilityVert(  
                uniform float4x4 ModelViewProj      : state.matrix.mvp,
                in  half2 uv                        : TEXCOORD0,
                in  float4 pos                      : POSITION,
                out vertexInfo OUT) 
{
    // Rectangle textures are addressed in texels (coordinates are not
    // normalized), so the window offsets are plain integers.
    const half4 center = uv.xyxy;

    // Vertex position in clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Row y - 2
    OUT.texCoords[0] = center + half4(-2, -2, -1, -2);      // (-2,-2) (-1,-2)
    OUT.texCoords[1] = center + half4( 0, -2,  1, -2);      // ( 0,-2) ( 1,-2)

    // Last texel of row y - 2 shares a register with the first of row y - 1
    OUT.texCoords[2] = center + half4( 2, -2, -2, -1);      // ( 2,-2) (-2,-1)

    // Row y - 1
    OUT.texCoords[3] = center + half4(-1, -1,  0, -1);      // (-1,-1) ( 0,-1)
    OUT.texCoords[4] = center + half4( 1, -1,  2, -1);      // ( 1,-1) ( 2,-1)
}





// New:      # 19 instructions, 2 R-regs, 2 H-regs
// Original: # 28 instructions, 1 R-regs, 2 H-regs
/**
 * Fragment program that looks up the link between the current texel and its
 * left, right and (0, +1) neighbours.
 *
 * The green channel of pixelClass is read at the four positions, scaled to
 * 0..255 and rounded; pairs of those values then index the three lookup
 * tables.
 *
 * @param pos               window position of the fragment, used to fetch the
 *                          origin texel
 * @param IN                neighbour coordinates produced by NeighborReachVert
 * @param pixelClass        per-texel class, stored in the green channel
 * @param neighborTableLR   table sampled at (origin, right)
 * @param neighborTableRL   table sampled at (left, origin)
 * @param neighborTableVER  table sampled at (bottom, origin)
 * @return x, y, z: the three table values divided by 255;
 *         w: 1 when the origin class is at least 14.5, otherwise 0
 */
half4 NeighborReach(in half2 pos : WPOS,
    in  vertexInfo IN,
    const uniform samplerRECT pixelClass,
    const uniform samplerRECT neighborTableLR,
    const uniform samplerRECT neighborTableRL,
    const uniform samplerRECT neighborTableVER) : COLOR
{
    // x = origin, y = left (-1, 0), z = right (1, 0), w = bottom (0, 1)
    half4 classes = half4(texRECT(pixelClass, pos).g,
                          texRECT(pixelClass, IN.texCoords[0].xy).g,
                          texRECT(pixelClass, IN.texCoords[0].zw).g,
                          texRECT(pixelClass, IN.texCoords[1].xy).g);

    // Recover the integer class stored in the 8-bit channel
    classes = round(classes * 255);

    half4 links;
    links.x = texRECT(neighborTableRL,  classes.yx).x;      // (left, origin)
    links.y = texRECT(neighborTableLR,  classes.xz).x;      // (origin, right)
    links.z = texRECT(neighborTableVER, classes.wx).x;      // (bottom, origin)
    links.w = step(14.5, classes.x);                        // flag derived from the origin class

    // Only the table values are rescaled; the flag in w is left as 0 or 1
    return links / half4(255.0h.xxx, 1);
}


// New:      # 140 instructions, 13 R-regs, 2 H-regs
// Original: # 165 instructions, 10 R-regs, 3 H-regs
/**
 * Fragment program that builds the reachability bit masks for the lower half
 * of the 5x5 window around the current texel.
 *
 * neighborTable is sampled at the fifteen coordinates supplied by the vertex
 * stage (x offsets -2..2, y offsets 0..2); every sample is scaled to 0..255
 * and rounded. Values for texels further away are derived by chaining lookups
 * through chainTable and merging the two alternative paths through orTable.
 *
 * Each reachabilityXYZ variable holds one component per hexadecimal digit in
 * its name. The lookups are consistent with the digits numbering the window
 * in row-major order, 0 = (-2, 0) ... E = (2, 2), with 2 (the origin) skipped.
 * NOTE(review): this reading assumes the r, g and b channels of neighborTable
 * are the left, right and (0, +1) links - confirm against the pass that
 * writes neighborTable.
 *
 * Output layout (before the final division by 255):
 *   r: always 0
 *   g: bit weights 4, 8, 16 from reachability013 and 32, 64, 128 from
 *      reachability456
 *   b: bit weights 1, 2, 4, 8 from reachability789A and 16, 32, 64, 128 from
 *      reachabilityBCDE
 *   a: alpha of the origin sample
 * A bit is set when the corresponding value is at least 8.
 *
 * NOTE(review): pos and pixelClass are not read by this program.
 * NOTE(review): nr10 and nr14 are fetched but never used.
 *
 * @param pos            window position of the fragment (unused)
 * @param IN             sample coordinates produced by the vertex stage
 * @param neighborTable  per-texel link values in r, g, b, plus a value in a
 * @param pixelClass     unused
 * @param orTable        table combining two chained values
 * @param chainTable     table combining a value with the next link
 * @return the packed masks described above, divided by 255
 */
half4 Reachability(in half2 pos : WPOS,
    in vertexInfo IN,
    const uniform samplerRECT neighborTable,
    const uniform samplerRECT pixelClass,
    const uniform samplerRECT orTable,
    const uniform samplerRECT chainTable) : COLOR 
{


    half4 color = half4(0, 0, 0, 0);
    
    // Formerly a single array (half reachability[15]), now split into vectors
    // so that each group can be thresholded with one step/dot pair.
    //half reachability[15];
    half3 reachability013;
    half3 reachability456;
    half4 reachability789A;
    half4 reachabilityBCDE;

    // Scratch pair holding the two alternative paths fed to orTable
    half2 argument;

    // Row y + 0. Only the origin sample keeps its alpha channel.
    const half3 nr00 = round(texRECT(neighborTable, IN.texCoords[0].xy).rgb * 255);     // -2, 0
    const half3 nr01 = round(texRECT(neighborTable, IN.texCoords[0].zw).rgb * 255);     // -1, 0
    const half4 nr02 = round(texRECT(neighborTable, IN.texCoords[1].xy) * 255);         //  0, 0
    const half3 nr03 = round(texRECT(neighborTable, IN.texCoords[1].zw).rgb * 255);     //  1, 0
    const half3 nr04 = round(texRECT(neighborTable, IN.texCoords[2].xy).rgb * 255);     //  2, 0

    // Row y + 1
    const half3 nr05 = round(texRECT(neighborTable, IN.texCoords[2].zw).rgb * 255);     // -2, 1
    const half3 nr06 = round(texRECT(neighborTable, IN.texCoords[3].xy).rgb * 255);     // -1, 1
    const half3 nr07 = round(texRECT(neighborTable, IN.texCoords[3].zw).rgb * 255);     //  0, 1
    const half3 nr08 = round(texRECT(neighborTable, IN.texCoords[4].xy).rgb * 255);     //  1, 1
    const half3 nr09 = round(texRECT(neighborTable, IN.texCoords[4].zw).rgb * 255);     //  2, 1

    // Row y + 2 (nr10 and nr14 are not referenced below)
    const half3 nr10 = round(texRECT(neighborTable, IN.texCoords[5].xy).rgb * 255);     // -2, 2
    const half3 nr11 = round(texRECT(neighborTable, IN.texCoords[5].zw).rgb * 255);     // -1, 2
    const half3 nr12 = round(texRECT(neighborTable, IN.texCoords[6].xy).rgb * 255);     //  0, 2
    const half3 nr13 = round(texRECT(neighborTable, IN.texCoords[6].zw).rgb * 255);     //  1, 2
    const half3 nr14 = round(texRECT(neighborTable, IN.texCoords[7].xy).rgb * 255);     //  2, 2

    // ROW 0
    // Cell 1 is the origin's r link; cell 0 chains it with the r link of (-1, 0)
    reachability013.y = nr02.r;
    reachability013.x = texRECT(chainTable, half2(nr02.r, nr01.r)).x;

    // Cell 3 is the origin's g link; cell 4 chains it with the g link of (1, 0)
    reachability013.z = nr02.g;
    reachability456.x = texRECT(chainTable, half2(nr02.g, nr03.g)).x;



    // To mask latency
    // Cells 0, 1, 3 -> bit weights 4, 8, 16 of the green channel
    color.g += dot(step(8.0h.xxx, reachability013),  half3(4,8,16));


    
    // ROW 1
    // Cell 7 is the origin's b link
    reachability789A.x = nr02.b;

    // Cell 6: via (-1, 0) then its b link, or via (0, 1) then its r link
    argument.x = texRECT(chainTable, half2(nr02.r, nr01.b)).x;
    argument.y = texRECT(chainTable, half2(nr02.b, nr07.r)).x;
    reachability456.z = texRECT(orTable, argument).x;

    // Cell 5: via cell 0 then the b link of (-2, 0), or via cell 6 then the r link of (-1, 1)
    argument.x = texRECT(chainTable, half2(reachability013.x, nr00.b)).x;
    argument.y = texRECT(chainTable, half2(reachability456.z, nr06.r)).x;
    reachability456.y = texRECT(orTable, argument).x;


    // To mask latency
    // Cells 4, 5, 6 -> bit weights 32, 64, 128 of the green channel
    color.g += dot(step(8.0h.xxx, reachability456),  half3(32,64,128));


    // Cell 8: via (1, 0) then its b link, or via (0, 1) then its g link
    argument.x = texRECT(chainTable, half2(nr02.g, nr03.b)).x;
    argument.y = texRECT(chainTable, half2(nr02.b, nr07.g)).x;
    reachability789A.y = texRECT(orTable, argument).x;

    // Cell 9: via cell 4 then the b link of (2, 0), or via cell 8 then the g link of (1, 1)
    argument.x = texRECT(chainTable, half2(reachability456.x, nr04.b)).x;
    argument.y = texRECT(chainTable, half2(reachability789A.y, nr08.g)).x;
    reachability789A.z = texRECT(orTable, argument).x;



    // ROW 2
    // Cell C: the origin's b link chained with the b link of (0, 1)
    // NOTE(review): the only lookup built with float2; all the others use half2
    reachabilityBCDE.y = texRECT(chainTable, float2(nr02.b, nr07.b)).x;

    // Cell B: via cell 6 then the b link of (-1, 1), or via cell C then the r link of (0, 2)
    argument.x = texRECT(chainTable, half2(reachability456.z, nr06.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.r)).x;
    reachabilityBCDE.x = texRECT(orTable, argument).x;

    // Cell A: via cell 5 then the b link of (-2, 1), or via cell B then the r link of (-1, 2)
    argument.x = texRECT(chainTable, half2(reachability456.y, nr05.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.x, nr11.r)).x;
    reachability789A.w = texRECT(orTable, argument).x;


    // To mask latency
    // Cells 7, 8, 9, A -> bit weights 1, 2, 4, 8 of the blue channel
    color.b += dot(step(8.0h.xxxx, reachability789A), half4(1,2,4,8));



    // Cell D: via cell 8 then the b link of (1, 1), or via cell C then the g link of (0, 2)
    argument.x = texRECT(chainTable, half2(reachability789A.y, nr08.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.g)).x;
    reachabilityBCDE.z = texRECT(orTable, argument).x;

    // Cell E: via cell 9 then the b link of (2, 1), or via cell D then the g link of (1, 2)
    argument.x = texRECT(chainTable, half2(reachability789A.z, nr09.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.z, nr13.g)).x;
    reachabilityBCDE.w = texRECT(orTable, argument).x;


    // To mask latency
    // Cells B, C, D, E -> bit weights 16, 32, 64, 128 of the blue channel
    color.b += dot(step(8.0h.xxxx, reachabilityBCDE), half4(16,32,64,128));

    // Carry the origin's alpha through unchanged (rescaled with the rest below)
    color.a = nr02.a;
    
    // Back to the normalized 0..1 range of an 8-bit target
    return color / 255.0;
}



// New:         # 66 instructions, 2 R-regs, 4 H-regs     
//              # 70 instructions, 2 R-regs, 5 H-regs - with if branch
// Original:    # 114 instructions, 3 R-regs, 4 H-regs
/**
 * Fragment program that completes the reachability mask of a texel with the
 * bits stored by the texels of the two rows above it.
 *
 * It starts from the texel's own reachability value (scaled to 0..255) and
 * adds one bit taken from each of ten neighbours. The offsets are the ones
 * written by CopyReachabilityVert:
 *
 *   red   += 1   * bit 7 of blue  at (-2,-2)
 *   red   += 2   * bit 6 of blue  at (-1,-2)
 *   red   += 4   * bit 5 of blue  at ( 0,-2)
 *   red   += 8   * bit 4 of blue  at ( 1,-2)
 *   red   += 16  * bit 3 of blue  at ( 2,-2)
 *   red   += 32  * bit 2 of blue  at (-2,-1)
 *   red   += 64  * bit 1 of blue  at (-1,-1)
 *   red   += 128 * bit 0 of blue  at ( 0,-1)
 *   green += 1   * bit 7 of green at ( 1,-1)
 *   green += 2   * bit 6 of green at ( 2,-1)
 *
 * A texel whose own alpha is greater than zero is forced to (255, 255, 255).
 *
 * @param pos           window position of the fragment, used to fetch the
 *                      texel's own value
 * @param IN            neighbour coordinates produced by CopyReachabilityVert
 * @param pixelClass    not read by this program
 * @param reachability  packed reachability masks to read from
 * @return the completed red, green and blue masks divided by 255
 */
half3 CopyReachability(in half2 pos : WPOS, 
                       in vertexInfo IN,
                       const uniform samplerRECT pixelClass, 
                       const uniform samplerRECT reachability) : COLOR 
{
    
    // The texel's own mask, as integers in 0..255
    half4 outColor = round(texRECT(reachability, pos) * 255);



    // The bit extraction is vectorized: four neighbours are processed at once

    // First block: blue channel of (-2,-2), (-1,-2), (0,-2), (1,-2)
    half4 neighborA;
    neighborA.x = texRECT(reachability, IN.texCoords[0].xy).b;
    neighborA.y = texRECT(reachability, IN.texCoords[0].zw).b;
    neighborA.z = texRECT(reachability, IN.texCoords[1].xy).b;
    neighborA.w = texRECT(reachability, IN.texCoords[1].zw).b;

    // Multiply and round
    neighborA = round(neighborA * 255);

    // Drop the bits above the one of interest: y keeps bits 6..0,
    // z keeps bits 6..0 (reduced further below), w keeps bits 4..0
    neighborA.yzw = fmod(neighborA.yzw, half3(128, 128, 32));

    // z needs a second reduction so that only bits 5..0 remain
    neighborA.z = fmod(neighborA.z, 64);

    // Test the top remaining bit of every component: bits 7, 6, 5, 4
    neighborA = step(half4(128, 64, 32, 16), neighborA);
    outColor.r += dot(neighborA, half4(1,2,4,8));

    // Second block: blue channel of (2,-2), (-2,-1), (-1,-1), (0,-1)
    half4 neighborB;
    neighborB.x = texRECT(reachability, IN.texCoords[2].xy).b;
    neighborB.y = texRECT(reachability, IN.texCoords[2].zw).b;
    neighborB.z = texRECT(reachability, IN.texCoords[3].xy).b;
    neighborB.w = texRECT(reachability, IN.texCoords[3].zw).b;

    // Multiply and round
    neighborB = round(neighborB * 255);

    // Keep bits 3..0, 2..0, 1..0 and 0 respectively
    neighborB = fmod(neighborB, half4(16, 8, 4, 2));

    // Test the top remaining bit of every component: bits 3, 2, 1, 0.
    // The 0.9 threshold separates the values 0 and 1 left in w.
    neighborB = step(half4(8, 4, 2, 0.9), neighborB);
    outColor.r += dot(neighborB, half4(16,32,64,128));


    // round(tex * 255) < 128 ? 0 : 1   (bit 7 of green)
    half4 neighbor;
    neighbor = round(texRECT(reachability, IN.texCoords[4].xy) * 255);      //  1,-1
    outColor.g += step(128, neighbor.g);

    // (round(tex * 255) mod 128) < 64 ? 0 : 2   (bit 6 of green)
    // The (2,-1) neighbour lives in texCoords[4].zw; texCoords[0].zw is the
    // (-1,-2) neighbour, which is already consumed by the first block.
    neighbor = round(texRECT(reachability, IN.texCoords[4].zw) * 255);      //  2,-1
    neighbor.g -= 128 * step(128, neighbor.g);
    outColor.g += step(64, neighbor.g) * 2;



    // Texels flagged through their alpha channel get every bit set
    if(outColor.a > 0) {
        outColor.rgb = half3(255, 255, 255);
    }


    // Back to the normalized 0..1 range of an 8-bit target
    return outColor.rgb / 255.0;
}