/*
==========================================================================================
 Cg Acceleration Research

 Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
------------------------------------------------------------------------------------------
 depthCull.cg

 Depth Cull shaders using MRT. Requires fp40 profile.
==========================================================================================
*/


// Output of the depth vertex shader (depthVertMain) and input of the depth
// fragment shader (depthMain).
struct depthVertexInfo {
    float4 pos                  : POSITION;
    // Holds all the interpolated texture coordinates. Each half4 packs two 2D
    // coordinates (.xy and .zw); depthVertMain fills nine of them, one per tap
    // of the 3x3 neighbourhood, and leaves texCoords[4].zw unwritten.
    // NOTE(review): no explicit binding semantic is given for this array; it
    // presumably relies on the compiler assigning interpolators - confirm.
    half4  texCoords[5];        // To hold all the interpolated texture coordinates
};



/**
 * Vertex shader for the depth filter. It precomputes the texture coordinates
 * of the whole 3x3 neighbourhood around `uv`, so that the fragment shader
 * receives them already interpolated and only has to fetch.
 *
 * Packing of OUT.texCoords (s = one texel):
 *   [0].xy = uv + (-s,  s)    [0].zw = uv + ( 0,  s)
 *   [1].xy = uv + ( s,  s)    [1].zw = uv + (-s,  0)
 *   [2].xy = uv               [2].zw = uv + ( s,  0)
 *   [3].xy = uv + (-s, -s)    [3].zw = uv + ( 0, -s)
 *   [4].xy = uv + ( s, -s)    [4].zw is not written
 *
 * vp40 figures reported for the original formulation: 15 instructions,
 * 2 R-regs (re-measure if the exact count matters).
 */
void depthVertMain( uniform float4x4 ModelViewProj  : state.matrix.mvp,
                in  half2 uv                        : TEXCOORD0,
                in  float4 pos                      : POSITION,
                out depthVertexInfo OUT) {

    // Texel step. With TEXTURE_RECTANGLE the coordinates are not normalized,
    // so one texel is exactly one unit.
    static const half  s = 1.0h;

    // Offsets of the neighbours, packed two per half4 as (dx, dy, dx, dy)
    static const half4 upperLeftAndUp    = half4(-s,  s,  0,  s);
    static const half4 upperRightAndLeft = half4( s,  s, -s,  0);
    static const half2 right             = half2( s,  0);
    static const half4 lowerLeftAndDown  = half4(-s, -s,  0, -s);
    static const half2 lowerRight        = half2( s, -s);

    // Clip-space position of the vertex
    OUT.pos = mul(ModelViewProj, pos);

    // The same base coordinate replicated in both halves of a half4
    const half4 uvPair = uv.xyxy;

    // Emit the nine coordinates, row by row of the 3x3 kernel
    OUT.texCoords[0]    = uvPair + upperLeftAndUp;
    OUT.texCoords[1]    = uvPair + upperRightAndLeft;
    OUT.texCoords[2].xy = uv;
    OUT.texCoords[2].zw = uv + right;
    OUT.texCoords[3]    = uvPair + lowerLeftAndDown;
    OUT.texCoords[4].xy = uv + lowerRight;

}



/**
 * Fragment shader to perform the depth culling. It uses a custom vertex shader to precalculate
 * all the texture coordinates, so instead of computing uv + half2(offset, -offset) per fragment
 * it just fetches with the interpolated values.
 *
 * Steps:
 *   1. Read the red channel of depthTex at the nine coordinates of IN.texCoords.
 *   2. Average the center sample together with the neighbours whose value is below 1.
 *   3. Map that average through zTransform:
 *          boundary = (average * zTransform.x + zTransform.z)
 *                   / (average * zTransform.y + zTransform.w)
 *   4. If the center depth is greater than boundary, write the per-target clear colors;
 *      otherwise write 1 to every channel of the three targets.
 *   5. Write boundary as the fragment depth (in both cases).
 *
 * Parameters:
 *   depthTex   - rectangle texture holding the depth values (only .r is read)
 *   zTransform - the four factors used in step 3
 *   IN         - interpolated output of the depth vertex shader
 *   depth      - output fragment depth
 *   outputs    - the three color outputs, bound starting at COLOR0
 *
 * fp40: # 38 instructions, 2 R-regs, 2 H-regs
 */
void depthMain( uniform samplerRECT depthTex,
                uniform float4 zTransform,      // The four factors for scaling the projected z-values
                in  depthVertexInfo IN,
                out float depth                 : DEPTH,
                out half4 outputs[3]            : COLOR0 ) {


    // Working values for the 3x3 depth filter: z is the center sample,
    // alfa and beta hold the eight surrounding samples.
    float z;
    float4 alfa;
    float4 beta;


    // NOTE: an early-out for invalid center samples, `if (z == 1) { discard; }`,
    // used to live in this shader and is currently disabled. The original author
    // remarked that WITHOUT it the depth is also blurred on this very same pass.
    // (If it is ever restored it has to go after z has been fetched below.)


    // Lookup the values
    alfa.x = texRECT( depthTex, IN.texCoords[0].xy ).r;     // uv + (-s,  s)
    alfa.y = texRECT( depthTex, IN.texCoords[0].zw ).r;     // uv + ( 0,  s)
    alfa.z = texRECT( depthTex, IN.texCoords[1].xy ).r;     // uv + ( s,  s)
    
    alfa.w = texRECT( depthTex, IN.texCoords[1].zw ).r;     // uv + (-s,  0)
    z      = texRECT( depthTex, IN.texCoords[2].xy ).r;     // uv
    beta.x = texRECT( depthTex, IN.texCoords[2].zw ).r;     // uv + ( s,  0)
    
    beta.y = texRECT( depthTex, IN.texCoords[3].xy ).r;     // uv + (-s, -s)
    beta.z = texRECT( depthTex, IN.texCoords[3].zw ).r;     // uv + ( 0, -s)
    beta.w = texRECT( depthTex, IN.texCoords[4].xy ).r;     // uv + ( s, -s)
    
    
    // Invalid values have a depth of 1, so build a per-sample mask to get rid
    // of them: 1 where the sample is below 1, 0 otherwise.
    half4 alfaF = alfa < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
    half4 betaF = beta < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
    
    // Use those masks to keep only the proper values (invalid samples become 0)
    alfa *= alfaF;
    beta *= betaF;
    
    // To make a fast add of all values, construct a matrix with 4 rows, 4 columns.
    // The first two rows hold the 8 surrounding depth values, and the other
    // two hold their masks. Multiplying by a vector of ones yields the four row
    // sums at once: x = sum(alfa), y = sum(beta), z = sum(alfaF), w = sum(betaF).
    float4x4 values  = float4x4(alfa, beta, alfaF, betaF);
    float4   sumPart = mul(values, 1.0f.xxxx);
    
    // The sum of values will be in x, the number of elements in y.
    // The center sample is always included (it contributes z and a count of 1),
    // so the count is never zero.
    float2 sumCount = float2(z + sumPart.x, 1 + sumPart.z) + sumPart.yw;
    
    // To perform the depth cull, the boundary of the test must be
    // calculated, because the z values read from the depth buffer
    // do not map linearly with the model's depth.
    float average       = sumCount.x / sumCount.y;

    // Apply the zTransform factors: numerator in vecTmp.x, denominator in vecTmp.y
    float2 vecTmp  = (average.xx * zTransform.xy) + zTransform.zw;
    float boundary = vecTmp.x / vecTmp.y;
    

    // Cull the fragment when its own depth lies beyond the computed boundary
    if (z > boundary) {
        
        // Clear each buffer to its corresponding clear color. It is
        // faster to clear all them to the same color, with a single
        // instruction, but this is the logic of the application

        outputs[0] = half4(0,0,0,0);
        outputs[1] = half4(16/255.0,0,0,0);     // Clear with invalid point flag
        outputs[2] = half4(0,0,0,0);


        // NOTE(review): a disabled `z = 1;` used to sit here; the depth written
        // below is the boundary value in this branch too.
    }
    else {
        // Fragment kept: every channel of the three targets is set to 1
        outputs[0] = 1.0h.xxxx;
        outputs[1] = 1.0h.xxxx;
        outputs[2] = 1.0h.xxxx;
    }
    // The value written as depth is the boundary, regardless of the branch taken
    z = boundary;
    
    
    // Depth test must be enabled for the depth texture to be written.
    // In order to erase the previous pixels, and at the same time allow writing
    // in the color buffer, the DepthTest function must be GL_ALWAYS.
    // This way, both the color and the depth will have been written.
    
    // If the depth is not the last thing written, everything gets messed up
    depth = z;

}