/* ==========================================================================================
    Cg Acceleration Research

    Optimized: Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
    Original:  Eugene Lee (el77 [at] cornell [dot] edu)
   ------------------------------------------------------------------------------------------
    Reachability.cg

    Computes 5x5 reachability.

    Three passes are defined here, each as a vertex/fragment program pair:
      NeighborReachVert    / NeighborReach     - per-pixel links to the left, right and bottom
      ReachabilityVert     / Reachability      - rows 0..2 of the 5x5 window, packed as bits
      CopyReachabilityVert / CopyReachability  - rows -2..-1, copied from the neighbours' bits

    The instruction/register counts quoted above each program are the original
    authors' measurements and have not been re-measured.
   ========================================================================================== */

struct vertexInfo {
    float4 pos : POSITION;
    half4 texCoords[8];     // To hold all the interpolated texture coordinates
};


/**
 * Vertex shader for the neighbor reach, interpolates coordinates.
 *
 * Writes three sample positions, packed two per half4:
 *   texCoords[0].xy = (-1, 0)   texCoords[0].zw = (1, 0)   texCoords[1].xy = (0, 1)
 * The remaining entries of texCoords are left unwritten.
 *
 * # 8 instructions, 1 R-regs
 */
void NeighborReachVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    // Transformed position of the vertex into clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Using TEXTURE_RECTANGLE, coords are not normalized, so neighbours are
    // addressed with whole-texel offsets.
    OUT.texCoords[0].xy = uv + half2(-1, 0);    // -1, 0
    OUT.texCoords[0].zw = uv + half2( 1, 0);    //  1, 0
    OUT.texCoords[1].xy = uv + half2( 0, 1);    //  0, 1
}


/**
 * Vertex shader for the reachability that performs the multiple texture
 * coordinates interpolation in advance.
 *
 * Writes the 15 sample positions of rows 0..2 of the 5x5 window (x in -2..2),
 * packed two per half4 in row-major order; texCoords[7].zw is left unwritten.
 *
 * # 19 instructions, 2 R-regs
 */
void ReachabilityVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    // Transformed position of the vertex into clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Using TEXTURE_RECTANGLE, coords are not normalized
    // Row 0
    OUT.texCoords[0].xy = uv + half2(-2, 0);    // -2, 0
    OUT.texCoords[0].zw = uv + half2(-1, 0);    // -1, 0
    OUT.texCoords[1].xy = uv;                   //  0, 0
    OUT.texCoords[1].zw = uv + half2( 1, 0);    //  1, 0
    OUT.texCoords[2].xy = uv + half2( 2, 0);    //  2, 0
    // Row 1
    OUT.texCoords[2].zw = uv + half2(-2, 1);    // -2, 1
    OUT.texCoords[3].xy = uv + half2(-1, 1);    // -1, 1
    OUT.texCoords[3].zw = uv + half2( 0, 1);    //  0, 1
    OUT.texCoords[4].xy = uv + half2( 1, 1);    //  1, 1
    OUT.texCoords[4].zw = uv + half2( 2, 1);    //  2, 1
    // Row 2
    OUT.texCoords[5].xy = uv + half2(-2, 2);    // -2, 2
    OUT.texCoords[5].zw = uv + half2(-1, 2);    // -1, 2
    OUT.texCoords[6].xy = uv + half2( 0, 2);    //  0, 2
    OUT.texCoords[6].zw = uv + half2( 1, 2);    //  1, 2
    OUT.texCoords[7].xy = uv + half2( 2, 2);    //  2, 2
}


/**
 * Vertex shader for the reachability copy that performs the multiple texture
 * coordinates interpolation in advance.
 *
 * Writes the 10 sample positions of rows -2..-1 of the 5x5 window (x in -2..2),
 * packed two per half4 in row-major order; texCoords[5..7] are left unwritten.
 *
 * # 17 instructions, 2 R-regs
 */
void CopyReachabilityVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    // Transformed position of the vertex into clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Using TEXTURE_RECTANGLE, coords are not normalized
    // Row -2
    OUT.texCoords[0].xy = uv + half2(-2,-2);    // -2,-2
    OUT.texCoords[0].zw = uv + half2(-1,-2);    // -1,-2
    OUT.texCoords[1].xy = uv + half2( 0,-2);    //  0,-2
    OUT.texCoords[1].zw = uv + half2( 1,-2);    //  1,-2
    OUT.texCoords[2].xy = uv + half2( 2,-2);    //  2,-2
    // Row -1
    OUT.texCoords[2].zw = uv + half2(-2,-1);    // -2,-1
    OUT.texCoords[3].xy = uv + half2(-1,-1);    // -1,-1
    OUT.texCoords[3].zw = uv + half2( 0,-1);    //  0,-1
    OUT.texCoords[4].xy = uv + half2( 1,-1);    //  1,-1
    OUT.texCoords[4].zw = uv + half2( 2,-1);    //  2,-1
}


/**
 * Fragment program for the neighbor reach.
 *
 * Reads the green channel of pixelClass at the current pixel and at its left,
 * right and bottom neighbours, rescales each to 0..255, and uses pairs of those
 * values to index the three neighbor tables.
 *
 * Output (x, y, z are divided by 255 before being returned, w is not):
 *   x = neighborTableRL [left,   origin]
 *   y = neighborTableLR [origin, right ]
 *   z = neighborTableVER[bottom, origin]
 *   w = 1 when the origin value is >= 14.5, otherwise 0
 *
 * NOTE(review): the meaning of the class values and of the 14.5 threshold is
 * defined by whoever builds pixelClass and the tables - confirm there.
 *
 * New:      # 19 instructions, 2 R-regs, 2 H-regs
 * Original: # 28 instructions, 1 R-regs, 2 H-regs
 */
half4 NeighborReach(in half2 pos : WPOS,
                    in vertexInfo IN,
                    const uniform samplerRECT pixelClass,
                    const uniform samplerRECT neighborTableLR,
                    const uniform samplerRECT neighborTableRL,
                    const uniform samplerRECT neighborTableVER) : COLOR
{
    half4 outColor;
    half4 olrb;     // origin, left, right, bottom

    olrb.x = texRECT(pixelClass, pos).g;
    olrb.y = texRECT(pixelClass, IN.texCoords[0].xy).g;    // -1, 0
    olrb.z = texRECT(pixelClass, IN.texCoords[0].zw).g;    //  1, 0
    olrb.w = texRECT(pixelClass, IN.texCoords[1].xy).g;    //  0, 1

    // Back to integer values so they can be used as table coordinates
    olrb = round(olrb * 255);

    outColor.x = texRECT(neighborTableRL,  olrb.yx).x;     // half2(left, origin)
    outColor.y = texRECT(neighborTableLR,  olrb.xz).x;     // half2(origin, right)
    outColor.z = texRECT(neighborTableVER, olrb.wx).x;     // half2(bottom, origin)
    outColor.w = step(14.5, olrb.x);                       // origin

    return outColor / half4(255.0h.xxx, 1);
}


/**
 * Fragment program for the reachability of rows 0..2 of the 5x5 window.
 *
 * Samples neighborTable at the positions prepared by ReachabilityVert, then
 * combines the values through chainTable (two-argument lookup) and orTable
 * (two-argument lookup over two chainTable results). Each of the 14 results is
 * turned into one bit by comparing it against 8 and the bits are packed as:
 *
 *   color.g :   4 = r0    8 = r1   16 = r3   32 = r4   64 = r5  128 = r6
 *   color.b :   1 = r7    2 = r8    4 = r9    8 = rA   16 = rB   32 = rC
 *              64 = rD  128 = rE
 *   color.a :   alpha of neighborTable at the centre pixel
 *   color.r :   left at 0
 *
 * NOTE(review): r0..rE presumably enumerate the window in row-major order with
 * the centre pixel (index 2) skipped - confirm against the table generator.
 *
 * `pos` and `pixelClass` are not used here; they are kept because they are part
 * of the program's interface.
 *
 * New:      # 140 instructions, 13 R-regs, 2 H-regs
 * Original: # 165 instructions, 10 R-regs, 3 H-regs
 */
half4 Reachability(in half2 pos : WPOS,
                   in vertexInfo IN,
                   const uniform samplerRECT neighborTable,
                   const uniform samplerRECT pixelClass,
                   const uniform samplerRECT orTable,
                   const uniform samplerRECT chainTable) : COLOR
{
    half4 color = half4(0, 0, 0, 0);

    half3 reachability013;
    half3 reachability456;
    half4 reachability789A;
    half4 reachabilityBCDE;
    half2 argument;

    // Neighbor links, rescaled to 0..255. texCoords[5].xy (-2, 2) and
    // texCoords[7].xy (2, 2) are not sampled: nothing below reads them.
    const half3 nr00 = round(texRECT(neighborTable, IN.texCoords[0].xy).rgb * 255);   // -2, 0
    const half3 nr01 = round(texRECT(neighborTable, IN.texCoords[0].zw).rgb * 255);   // -1, 0
    const half4 nr02 = round(texRECT(neighborTable, IN.texCoords[1].xy)     * 255);   //  0, 0
    const half3 nr03 = round(texRECT(neighborTable, IN.texCoords[1].zw).rgb * 255);   //  1, 0
    const half3 nr04 = round(texRECT(neighborTable, IN.texCoords[2].xy).rgb * 255);   //  2, 0
    const half3 nr05 = round(texRECT(neighborTable, IN.texCoords[2].zw).rgb * 255);   // -2, 1
    const half3 nr06 = round(texRECT(neighborTable, IN.texCoords[3].xy).rgb * 255);   // -1, 1
    const half3 nr07 = round(texRECT(neighborTable, IN.texCoords[3].zw).rgb * 255);   //  0, 1
    const half3 nr08 = round(texRECT(neighborTable, IN.texCoords[4].xy).rgb * 255);   //  1, 1
    const half3 nr09 = round(texRECT(neighborTable, IN.texCoords[4].zw).rgb * 255);   //  2, 1
    const half3 nr11 = round(texRECT(neighborTable, IN.texCoords[5].zw).rgb * 255);   // -1, 2
    const half3 nr12 = round(texRECT(neighborTable, IN.texCoords[6].xy).rgb * 255);   //  0, 2
    const half3 nr13 = round(texRECT(neighborTable, IN.texCoords[6].zw).rgb * 255);   //  1, 2

    // ROW 0
    reachability013.y = nr02.r;
    reachability013.x = texRECT(chainTable, half2(nr02.r, nr01.r)).x;
    reachability013.z = nr02.g;
    reachability456.x = texRECT(chainTable, half2(nr02.g, nr03.g)).x;

    // To mask latency
    color.g += dot(step(8.0h.xxx, reachability013), half3(4,8,16));

    // ROW 1
    reachability789A.x = nr02.b;

    argument.x = texRECT(chainTable, half2(nr02.r, nr01.b)).x;
    argument.y = texRECT(chainTable, half2(nr02.b, nr07.r)).x;
    reachability456.z = texRECT(orTable, argument).x;

    argument.x = texRECT(chainTable, half2(reachability013.x, nr00.b)).x;
    argument.y = texRECT(chainTable, half2(reachability456.z, nr06.r)).x;
    reachability456.y = texRECT(orTable, argument).x;

    // To mask latency
    color.g += dot(step(8.0h.xxx, reachability456), half3(32,64,128));

    argument.x = texRECT(chainTable, half2(nr02.g, nr03.b)).x;
    argument.y = texRECT(chainTable, half2(nr02.b, nr07.g)).x;
    reachability789A.y = texRECT(orTable, argument).x;

    argument.x = texRECT(chainTable, half2(reachability456.x, nr04.b)).x;
    argument.y = texRECT(chainTable, half2(reachability789A.y, nr08.g)).x;
    reachability789A.z = texRECT(orTable, argument).x;

    // ROW 2
    // half2 like every other lookup in this program
    reachabilityBCDE.y = texRECT(chainTable, half2(nr02.b, nr07.b)).x;

    argument.x = texRECT(chainTable, half2(reachability456.z, nr06.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.r)).x;
    reachabilityBCDE.x = texRECT(orTable, argument).x;

    argument.x = texRECT(chainTable, half2(reachability456.y, nr05.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.x, nr11.r)).x;
    reachability789A.w = texRECT(orTable, argument).x;

    // To mask latency
    color.b += dot(step(8.0h.xxxx, reachability789A), half4(1,2,4,8));

    argument.x = texRECT(chainTable, half2(reachability789A.y, nr08.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.g)).x;
    reachabilityBCDE.z = texRECT(orTable, argument).x;

    argument.x = texRECT(chainTable, half2(reachability789A.z, nr09.b)).x;
    argument.y = texRECT(chainTable, half2(reachabilityBCDE.z, nr13.g)).x;
    reachabilityBCDE.w = texRECT(orTable, argument).x;

    // To mask latency
    color.b += dot(step(8.0h.xxxx, reachabilityBCDE), half4(16,32,64,128));

    color.a = nr02.a;

    return color / 255.0;
}


/**
 * Fragment program that fills in rows -2..-1 of the 5x5 window by copying one
 * bit from each of the ten pixels above the current one.
 *
 * Starts from the current pixel's own value in `reachability` (rescaled to
 * 0..255) and adds, for each neighbour sampled at the positions prepared by
 * CopyReachabilityVert, a single bit extracted from that neighbour's packed
 * blue or green channel:
 *
 *   neighbour   source bit     destination
 *   (-2,-2)     b & 128        r +=   1
 *   (-1,-2)     b &  64        r +=   2
 *   ( 0,-2)     b &  32        r +=   4
 *   ( 1,-2)     b &  16        r +=   8
 *   ( 2,-2)     b &   8        r +=  16
 *   (-2,-1)     b &   4        r +=  32
 *   (-1,-1)     b &   2        r +=  64
 *   ( 0,-1)     b &   1        r += 128
 *   ( 1,-1)     g & 128        g +=   1
 *   ( 2,-1)     g &  64        g +=   2
 *
 * If the current pixel's alpha is > 0 the result is forced to (255, 255, 255).
 * The result is divided by 255 before being returned.
 *
 * `pixelClass` is not used here; it is kept because it is part of the
 * program's interface.
 *
 * New:      # 66 instructions, 2 R-regs, 4 H-regs
 *           # 70 instructions, 2 R-regs, 5 H-regs - with if branch
 * Original: # 114 instructions, 3 R-regs, 4 H-regs
 */
half3 CopyReachability(in half2 pos : WPOS,
                       in vertexInfo IN,
                       const uniform samplerRECT pixelClass,
                       const uniform samplerRECT reachability) : COLOR
{
    half4 outColor = round(texRECT(reachability, pos) * 255);

    // Parallelize: four neighbours are processed per vector operation.

    // First block
    half4 neighborA;
    neighborA.x = texRECT(reachability, IN.texCoords[0].xy).b;     // -2,-2
    neighborA.y = texRECT(reachability, IN.texCoords[0].zw).b;     // -1,-2
    neighborA.z = texRECT(reachability, IN.texCoords[1].xy).b;     //  0,-2
    neighborA.w = texRECT(reachability, IN.texCoords[1].zw).b;     //  1,-2

    // Multiply and round
    neighborA = round(neighborA * 255);

    // First fmod operations: drop the bits above the one being tested
    neighborA.yzw = fmod(neighborA.yzw, half3(128, 128, 32));
    // One extra fmod (z needs mod 64 in total)
    neighborA.z = fmod(neighborA.z, 64);

    // Values at once
    neighborA = step(half4(128, 64, 32, 16), neighborA);
    outColor.r += dot(neighborA, half4(1,2,4,8));

    // Second block
    half4 neighborB;
    neighborB.x = texRECT(reachability, IN.texCoords[2].xy).b;     //  2,-2
    neighborB.y = texRECT(reachability, IN.texCoords[2].zw).b;     // -2,-1
    neighborB.z = texRECT(reachability, IN.texCoords[3].xy).b;     // -1,-1
    neighborB.w = texRECT(reachability, IN.texCoords[3].zw).b;     //  0,-1

    // Multiply and round
    neighborB = round(neighborB * 255);

    // Fmod operations
    neighborB = fmod(neighborB, half4(16, 8, 4, 2));

    // Values at once
    neighborB = step(half4(8, 4, 2, 0.9), neighborB);
    outColor.r += dot(neighborB, half4(16,32,64,128));

    // (round(tex*255)) < 128 ? 0 : 1
    half4 neighbor;
    neighbor = round(texRECT(reachability, IN.texCoords[4].xy) * 255);     //  1,-1
    outColor.g += step(128, neighbor.g);

    // (round(tex*255)) mod 128 < 64 ? 0 : 2
    // The (2,-1) neighbour lives in texCoords[4].zw (see CopyReachabilityVert);
    // texCoords[0].zw is (-1,-2) and must not be sampled here.
    neighbor = round(texRECT(reachability, IN.texCoords[4].zw) * 255);     //  2,-1
    neighbor.g -= 128 * step(128, neighbor.g);
    outColor.g += step(64, neighbor.g) * 2;

    if (outColor.a > 0) {
        outColor.rgb = half3(255, 255, 255);
    }

    return outColor.rgb / 255.0;
}