/* ==========================================================================================
    Cg Acceleration Research
    Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
   ------------------------------------------------------------------------------------------
    depthCull.cg

    Depth Cull shaders using MRT. Requires fp40 profile.
   ========================================================================================== */

/**
 * Output of the depth vertex shader: the clip-space position plus the
 * texture coordinates of a 3x3 neighbourhood, packed two per half4
 * (one pair in .xy, one pair in .zw).
 */
struct depthVertexInfo
{
    float4 pos : POSITION;
    half4  texCoords[5];    // Holds all the interpolated texture coordinates
};


/**
 * Vertex shader for the depth filter. It computes the nine texture
 * coordinates of the 3x3 neighbourhood up front so that they arrive
 * already interpolated at the fragment shader.
 *
 * Packing produced (s = one texel):
 *   texCoords[0] = { uv + (-s,  s), uv + ( 0,  s) }
 *   texCoords[1] = { uv + ( s,  s), uv + (-s,  0) }
 *   texCoords[2] = { uv           , uv + ( s,  0) }
 *   texCoords[3] = { uv + (-s, -s), uv + ( 0, -s) }
 *   texCoords[4] = { uv + ( s, -s), (zw not written) }
 *
 * Original author's measurement: vp40, 15 instructions, 2 R-regs.
 */
void depthVertMain(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out depthVertexInfo OUT)
{
    // Step of one texel: with TEXTURE_RECTANGLE the coordinates are not normalized
    static const half texel = 1.0h;

    // Swizzle source for every offset combination: x = -s, y = 0, z = +s
    static const half3 delta = half3(-texel, 0, texel);

    // Vertex position transformed into clip coordinates
    OUT.pos = mul(ModelViewProj, pos);

    // Top row
    OUT.texCoords[0].xy = uv + delta.xz;    // uv + (-s,  s)
    OUT.texCoords[0].zw = uv + delta.yz;    // uv + ( 0,  s)
    OUT.texCoords[1].xy = uv + delta.zz;    // uv + ( s,  s)

    // Middle row
    OUT.texCoords[1].zw = uv + delta.xy;    // uv + (-s,  0)
    OUT.texCoords[2].xy = uv;               // uv
    OUT.texCoords[2].zw = uv + delta.zy;    // uv + ( s,  0)

    // Bottom row
    OUT.texCoords[3].xy = uv + delta.xx;    // uv + (-s, -s)
    OUT.texCoords[3].zw = uv + delta.yx;    // uv + ( 0, -s)
    OUT.texCoords[4].xy = uv + delta.zx;    // uv + ( s, -s)
}


/**
 * Fragment shader that performs the depth culling. It relies on the custom
 * vertex shader above to precalculate every texture coordinate, so the nine
 * taps are fetched directly instead of evaluating uv + half2(offset, -offset)
 * per fragment.
 *
 * depthTex   - rectangle texture holding the depth values to filter.
 * zTransform - the four factors for scaling the projected z-values; applied
 *              to the averaged depth as (avg * x + z) / (avg * y + w).
 * IN         - interpolated output of depthVertMain.
 * depth      - receives the computed boundary value.
 * outputs    - three colour targets, either cleared or set to all ones.
 *
 * Original author's measurement: fp40, 38 instructions, 2 R-regs, 2 H-regs.
 */
void depthMain(
    uniform samplerRECT depthTex,
    uniform float4 zTransform,
    in depthVertexInfo IN,
    out float depth : DEPTH,
    out half4 outputs[3] : COLOR0 )
{
    // NOTE: an early "if (z == 1) { discard; }" is disabled in this shader.
    // The original author remarked that without it the depth is also blurred
    // on the very same pass.

    // Fetch the 3x3 neighbourhood. The eight surrounding taps are split over
    // two float4 so that they can be processed as vectors.
    float4 ringA = float4(
        texRECT( depthTex, IN.texCoords[0].xy ).r,      // uv + (-s,  s)
        texRECT( depthTex, IN.texCoords[0].zw ).r,      // uv + ( 0,  s)
        texRECT( depthTex, IN.texCoords[1].xy ).r,      // uv + ( s,  s)
        texRECT( depthTex, IN.texCoords[1].zw ).r );    // uv + (-s,  0)

    float center = texRECT( depthTex, IN.texCoords[2].xy ).r;   // uv

    float4 ringB = float4(
        texRECT( depthTex, IN.texCoords[2].zw ).r,      // uv + ( s,  0)
        texRECT( depthTex, IN.texCoords[3].xy ).r,      // uv + (-s, -s)
        texRECT( depthTex, IN.texCoords[3].zw ).r,      // uv + ( 0, -s)
        texRECT( depthTex, IN.texCoords[4].xy ).r );    // uv + ( s, -s)

    // Invalid samples have a depth of 1. Build a 1/0 mask per tap (1 when the
    // sample is below 1) and use it to zero out the invalid ones.
    half4 maskA = ringA < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
    half4 maskB = ringB < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
    ringA *= maskA;
    ringB *= maskB;

    // Add everything up at once: a 4x4 matrix whose first two rows are the
    // masked depths and whose last two rows are the masks, multiplied by a
    // vector of ones, yields the four row sums in a single operation.
    float4x4 stacked = float4x4(ringA, ringB, maskA, maskB);
    float4 rowSums = mul(stacked, 1.0f.xxxx);

    // x: sum of depths (centre tap included), y: number of elements
    // (the centre tap always counts as one).
    float2 sumCount = float2(center + rowSums.x, 1 + rowSums.z) + rowSums.yw;

    // The z values read from the depth buffer do not map linearly to the
    // model's depth, so the limit for the cull test is derived from the
    // average through zTransform.
    float average = sumCount.x / sumCount.y;
    float2 scaled = (average.xx * zTransform.xy) + zTransform.zw;
    float boundary = scaled.x / scaled.y;

    if (center > boundary)
    {
        // Culled: clear each buffer to its corresponding clear colour.
        // Clearing all of them to the same colour with a single instruction
        // would be faster, but this is the logic of the application.
        outputs[0] = half4(0,0,0,0);
        outputs[1] = half4(16/255.0,0,0,0);    // Clear with invalid point flag
        outputs[2] = half4(0,0,0,0);
    }
    else
    {
        outputs[0] = 1.0h.xxxx;
        outputs[1] = 1.0h.xxxx;
        outputs[2] = 1.0h.xxxx;
    }

    // Author's notes on the required render state: the depth test must be
    // enabled for the depth texture to be written, and the depth function
    // must be GL_ALWAYS so that previous pixels are erased while the colour
    // buffers can still be written. The depth has to be the last thing
    // written, otherwise everything gets messed up.
    depth = boundary;
}