/*
==========================================================================================
Cg Acceleration Research
Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
------------------------------------------------------------------------------------------
depthCull.cg
Depth Cull shaders using MRT. Requires fp40 profile.
==========================================================================================
*/
// Output of the depth vertex shader (depthVertMain) and varying input of the
// depth fragment shader (depthMain).
struct depthVertexInfo {
float4 pos : POSITION; // Vertex position after the model-view-projection transform
// Interpolated texture coordinates for the 3x3 depth filter. Each half4 packs
// two 2D coordinates (.xy and .zw), giving nine coordinates in total;
// texCoords[4].zw is never written by depthVertMain nor read by depthMain.
half4 texCoords[5];
};
/**
 * Depth-filter vertex shader.
 *
 * Transforms the vertex by the model-view-projection matrix and precomputes
 * the nine texture coordinates of the 3x3 neighbourhood around uv, so the
 * fragment stage only has to fetch them instead of adding offsets per pixel.
 *
 * Coordinate packing (s = 1, because TEXTURE_RECTANGLE coordinates are not
 * normalized):
 *   texCoords[0] = { uv + (-s,  s), uv + ( 0,  s) }
 *   texCoords[1] = { uv + ( s,  s), uv + (-s,  0) }
 *   texCoords[2] = { uv           , uv + ( s,  0) }
 *   texCoords[3] = { uv + (-s, -s), uv + ( 0, -s) }
 *   texCoords[4] = { uv + ( s, -s), <not written> }
 *
 * @param ModelViewProj  matrix bound to state.matrix.mvp
 * @param uv             base texture coordinate of the vertex
 * @param pos            vertex position, multiplied by ModelViewProj
 * @param OUT            clip-space position plus the packed coordinates
 *
 * The original author reported "vp40: # 15 instructions, 2 R-regs" for the
 * earlier one-pair-per-statement formulation; the figure has not been
 * re-measured for this version.
 */
void depthVertMain( uniform float4x4 ModelViewProj : state.matrix.mvp,
                    in half2 uv : TEXCOORD0,
                    in float4 pos : POSITION,
                    out depthVertexInfo OUT) {
    // One step in rectangle-texture coordinate units.
    static const half texelStep = 1.0h;

    // Position in clip coordinates.
    OUT.pos = mul(ModelViewProj, pos);

    // The base coordinate duplicated into both halves of a half4, so that
    // two neighbours are produced by a single vector add.
    const half4 basePair = uv.xyxy;

    // Upper row: (-s, s) and (0, s).
    OUT.texCoords[0] = basePair + half4(-texelStep, texelStep, 0, texelStep);
    // Upper-right corner (s, s) and left neighbour (-s, 0).
    OUT.texCoords[1] = basePair + half4(texelStep, texelStep, -texelStep, 0);
    // Center is passed through untouched; right neighbour is (s, 0).
    OUT.texCoords[2].xy = uv;
    OUT.texCoords[2].zw = uv + half2(texelStep, 0);
    // Lower row: (-s, -s) and (0, -s).
    OUT.texCoords[3] = basePair + half4(-texelStep, -texelStep, 0, -texelStep);
    // Lower-right corner (s, -s); the .zw half of this slot stays unused.
    OUT.texCoords[4].xy = uv + half2(texelStep, -texelStep);
}
/**
 * Depth-cull fragment shader (MRT, three color targets plus depth).
 *
 * Reads the 3x3 neighbourhood of the depth texture using the coordinates
 * precomputed by the vertex shader, averages the samples, maps that average
 * through zTransform to obtain a depth boundary, and compares the center
 * sample against it:
 *   - center >  boundary: the three outputs receive their clear colors
 *     (outputs[1] carries the "invalid point" flag 16/255 in red);
 *   - otherwise: the three outputs are set to 1.
 * In both cases the boundary value is what gets written to the depth output.
 *
 * Averaging rule: a neighbour contributes only when its depth is below 1
 * (the original author's convention is that invalid values have depth 1).
 * The center sample is always included in both the sum and the count.
 *
 * @param depthTex    rectangle texture holding the depth values
 * @param zTransform  the four factors for scaling the projected z-values;
 *                    applied as (avg * x + z) / (avg * y + w)
 * @param IN          interpolated output of depthVertMain
 * @param depth       fragment depth output
 * @param outputs     the three color render targets, starting at COLOR0
 *
 * The original author reported "fp40: # 38 instructions, 2 R-regs, 2 H-regs"
 * for the earlier formulation; the figure has not been re-measured here.
 *
 * NOTE(review): an early `if (z == 1) { discard; }` used to exist and is
 * disabled. Per the original author's remark, without it the depth is also
 * blurred in this same pass; confirm that this is still the intended
 * behavior before re-enabling or deleting the idea for good.
 */
void depthMain( uniform samplerRECT depthTex,
                uniform float4 zTransform, // The four factors for scaling the projected z-values
                in depthVertexInfo IN,
                out float depth : DEPTH,
                out half4 outputs[3] : COLOR0 ) {
    // Fetch the neighbourhood. The eight surrounding samples are grouped in
    // two float4 so they can be masked and summed as vectors.
    float4 neighborsA = float4(
        texRECT( depthTex, IN.texCoords[0].xy ).r,   // uv + (-s,  s)
        texRECT( depthTex, IN.texCoords[0].zw ).r,   // uv + ( 0,  s)
        texRECT( depthTex, IN.texCoords[1].xy ).r,   // uv + ( s,  s)
        texRECT( depthTex, IN.texCoords[1].zw ).r ); // uv + (-s,  0)

    float centerDepth = texRECT( depthTex, IN.texCoords[2].xy ).r; // uv

    float4 neighborsB = float4(
        texRECT( depthTex, IN.texCoords[2].zw ).r,   // uv + ( s,  0)
        texRECT( depthTex, IN.texCoords[3].xy ).r,   // uv + (-s, -s)
        texRECT( depthTex, IN.texCoords[3].zw ).r,   // uv + ( 0, -s)
        texRECT( depthTex, IN.texCoords[4].xy ).r ); // uv + ( s, -s)

    // Per-sample validity mask: 1 where the depth is below 1, 0 otherwise.
    half4 maskA = neighborsA < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
    half4 maskB = neighborsB < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;

    // Zero out the samples that did not pass the mask.
    neighborsA = neighborsA * maskA;
    neighborsB = neighborsB * maskB;

    // Sum everything with one matrix-vector product: rows 0-1 hold the masked
    // depths, rows 2-3 hold the masks, so multiplying by a vector of ones
    // yields the four row sums at once.
    float4x4 sumMatrix = float4x4(neighborsA, neighborsB, maskA, maskB);
    float4 rowSums = mul(sumMatrix, 1.0f.xxxx);

    // Total depth and number of contributing samples, center included.
    float depthSum = (centerDepth + rowSums.x) + rowSums.y;
    float sampleCount = (1 + rowSums.z) + rowSums.w;
    float average = depthSum / sampleCount;

    // Depth-buffer values do not map linearly to the model's depth, so the
    // comparison boundary is derived from the average via zTransform.
    float2 scaled = (average.xx * zTransform.xy) + zTransform.zw;
    float boundary = scaled.x / scaled.y;

    if (centerDepth > boundary) {
        // Culled: reset each target to its own clear color. A single shared
        // color would be cheaper, but the application expects these values.
        outputs[0] = half4(0,0,0,0);
        outputs[1] = half4(16/255.0,0,0,0); // Clear with invalid point flag
        outputs[2] = half4(0,0,0,0);
    }
    else {
        outputs[0] = 1.0h.xxxx;
        outputs[1] = 1.0h.xxxx;
        outputs[2] = 1.0h.xxxx;
    }

    // Original author's notes on the required GL state: the depth test must
    // be enabled for the depth texture to be written, and its function must
    // be GL_ALWAYS so previous pixels are overwritten while color writes are
    // still allowed. The depth store is kept as the last write on purpose;
    // the author observed corrupted results otherwise.
    depth = boundary;
}