|
Features

Appendix
This appendix
contains a C++ implementation with all
four methods: non-compact culling, compact scalar culling, compact SIMD
culling, and compact SIMD culling with prefetch instructions.
The traditional,
non-compact structure for storing triangle data is 32 bytes and is the
following:
struct STriangleData
{
float px,py,pz;
// position on the triangle
float nx,ny,nz;
// the normal of the triangle
WORD v1,v2,v3;
// indices to the vertex pool
WORD stub;
// padding to fill a cache line and
// avoid alignment stalls
};
The compact
structure is is only 16 bytes. We store the normal in fixed-point representation
instead of float. The tn field is a precalculated dot product of the position*normal
of the triangle. Here it is:
struct SCompactTriangleData
{
float tn; //
precalculated dot product
// (position*normal)
signed short nx,ny,nz;
// the normal of the triangle
WORD v1,v2,v3;
// indices to the vertex pool
};
The compact
SIMD structure has the same fields as the compact structure, except that
each is four elements wide. The F32vec4 and Is16vec4 classes are defined
in the Intel Vector Class Libraries. These libraries provide a C++ interface
for using MMX Technology and Streaming SIMD Extensions. Here is the compact
SIMD structure:
struct SCompactSIMDTriangleData
{
F32vec4 tn;
// 4 items of precalculated dot
// product (position*normal)
Is16vec4 nx,ny,nz;
// 4 items of the triangle's normal
WORD v[4][3];
// 4 items of indices to the vertex pool
};
The traditional
backface culling routine is as follows:
int C3DObject::backfaceCulling()
{
// With 1 bit per
vertex calculate number of
// dwords in cull flag array
DWORD cullFlagsLen
= (m_nVerts + 31) / 32;
int i;
int ff =
0; // front facing triangles
// Mark all vertices
as "invisible"
for (i=0
; i<cullFlagsLen ; i++) {
m_vertsCulled[i]
= 0;
}
}
for
(i=0; i<m_nTris; i++) // for each triangle
{
float Tx,Ty,Tz;
float TN;
// Calculate viewing
vector
Tx = m_tris[i].px
- VOx;
Ty = m_tris[i].py
- VOy;
Tz = m_tris[i].pz
- VOz;
// Calculate dot
product (between viewing vector
// and triangle normal)
TN = Tx*m_tris[i].nx
+ Ty*m_tris[i].ny + Tz*m_tris[i].nz;
if (TN <= 0.0f)
// is front facing
{
ff++;
// mark the
triangle 3 vertices as
// "visible"
SELECT_VTX(m_vertsCulled,m_tris[i].v1);
SELECT_VTX(m_vertsCulled,m_tris[i].v2);
SELECT_VTX(m_vertsCulled,m_tris[i].v3);
}
}
return ff;
}
The
compact backface culling routine
The compact
backface culling routine is much the same as the previous one, except
for the fixed-point calculations and the use of the precalculated dot
product.
int CCompactCull3DObject::backfaceCulling()
{
DWORD nVerts =
getVertsCount();
DWORD nTris
= getTrisCount();
DWORD *vertsCulled
= getVertsCulledBitArray();
DWORD cullFlagsLen
= (nVerts + 31) / 32;
int i;
int ff =
0; // front facing triangles
// Mark all vertices
as "invisible"
for (i=0;i<cullFlagsLen;i++)
{
vertsCulled[i]
= 0;
}
for (i=0 ; i<nTris
; i++) // for each triangle
{
float Nx,Ny,Nz;
float TN;
// Convert the normal from fixed point to float
// normal components
become floats between -SHRT_MAX to +SHRT_MAX
Nx =
(float)m_triData[i].nx;
Ny =
(float)m_triData[i].ny;
Nz =
(float)m_triData[i].nz;
// Calculate
dot product
// view*normal
is multiplied by
// 1/SHRT_MAX to adjust to original
// normal values
TN =
m_triData[i].tn - (VOx*Nx + VOy*Ny + VOz*Nz)*OOSHRT_MAX;
if (TN <=
0.0f) // is front facing
{
ff++;
// mark
the triangle 3 vertices
// as "visible"
SELECT_VTX(vertsCulled,m_triData[i].v1);
SELECT_VTX(vertsCulled,m_triData[i].v2);
SELECT_VTX(vertsCulled,m_triData[i].v3);
}
return ff;
}
}
The
compact SIMD backface culling routine
The compact
SIMD backface culling routine is much the same as the compact culling
routine, except for the use of the vector classes for parallelism. Note
that we can't use an if (TN <= 0) expression here because the results
may not be the same for all four elements. Instead we collect the sign
bits of the dot product and check for each of the four elements separately.
int CCompactSIMDCull3DObject::backfaceCulling()
{
static F32vec4
_OOSHRT_MAX_ = F32vec4(OOSHRT_MAX);
DWORD nVerts
= getVertsCount();
DWORD nTris
= (getTrisCount() + 3)/4;
DWORD *vertsCulled
= getVertsCulledBitArray();
DWORD cullFlagsLen
= (nVerts + 31) / 32;
int i;
int ff =
0; // front facing triangles
// Mark all vertices
as "invisible"
for (i=0;i<cullFlagsLen;i++)
{
vertsCulled[i]
= 0;
}
// Duplicate each
of the components of the
// viewing vector 4 times
F32vec4 _VOx_
= VOx, _VOy_ = VOy, _VOz_ = VOz;
for (i=0; i<nTris;
i++) // each triangle
{
F32vec4 Nx,Ny,Nz;
F32vec4
TN;
int mask;
// Convert
triangle normal from fixed
// point (16-bit integer) to float
Nx =
Is16vec4ToF32vec4( m_triData[i].nx);
Ny =
Is16vec4ToF32vec4( m_triData[i].ny);
Nz =
Is16vec4ToF32vec4( m_triData[i].nz);
// Calculate
dot product
TN =
m_triData[i].tn - (_VOx_*Nx + _VOy_*Ny + _VOz_*Nz)*_OOSHRT_MAX_;
// Collect
sign bits of dot products
// If all dot
products are positive mask // gets the value of 0
otherwise it gets
// a value between 0x1-0xF
mask
= move_mask(TN);
if (mask >
0)
// any of the 4 dot products is negative
{
for (int
j=0; j<4; j++)
// do 4 times
{
if
(mask & 1)
// dot product is negative
{
ff++;
//
mark the
// triangle 3
// vertices as
// "visible"
SELECT_VTX(vertsCulled,
m_triData[i].v[j][0]);
SELECT_VTX(vertsCulled,
m_triData[i].v[j][1]);
SELECT_VTX(vertsCulled,
m_triData[i].v[j][2]);
}
//
shift right
mask
>>= 1;
}
}
}
_m_empty();
return
ff;
}
|