TITLE Modified form of Perlin's Noise Basis function using MMX(TM) technology
;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT
;***********************************************************************
; Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA
;Static working storage and packed constants for SIMD_Octave.
;KEY for comments
;P0, P1, P# = Pixel number 0, Pixel number 1, Pixel number # respectively.
;Pix = Pixel
;DU = Derivative of the variable U.
;DDU = Derivative of the variable DU.
;Texel = A point in the texture to be mapped onto the screen. Given by U, V.
;Note: Even though the assembly writes four pixel values through each pass of the
;inner loop, only two of the pixels are directly calculated. The other two pixels
;are averaged from neighboring pixels. According to the current scheme,
; |--- 16 bit ---|
; +-----------------------------------------------------------+
; | Pixel #0 | Pixel #1 | Pixel #2 | Pixel #3 |
; +-----------------------------------------------------------+
;Pixels #1 and #3 are directly calculated. Pixel #2 is averaged from Pixel #1 and
;pixel #3. Pixel #0 is averaged from Pixel #1 and the previous pixel before #0.
;
;Also, the programmer realizes that the pixels are labeled 0, 1, 2, 3 instead
;of 3, 2, 1, 0, which would follow the conventional format of Intel Architecture.
;This was an oversight and was not noticed until it was too late.
;
;The 64-bit variables below each hold parameters for two texels:
;32 bits per texel * two texels = 64 bits. This enables us
;to work with two pixels at one time using MMX technology.
; u, v           - current texel coordinates; advanced on every inner-loop pass.
; du, dv         - per-pass increments; SIMD_Octave stores 4*du_init / 4*dv_init here.
; ddu, ddv       - not referenced by SIMD_Octave in this listing.
; firstU, firstV - start-of-scan-line coordinates, reloaded into u/v at each octave.
ALIGN 8
u QWORD ?
du QWORD ?
ddu QWORD ?
v QWORD ?
dv QWORD ?
ddv QWORD ?
firstU QWORD ?
firstV QWORD ?
;octShift and turbShift are per-octave shift counts. SIMD_Octave writes only the
;first DWORD of each but reads them as 64-bit MMX shift-count operands, so the
;second DWORD of each pair must remain zero.
; octShift  - set to 14 on entry and decremented after every octave.
; turbShift - set to 0 on entry and incremented after every octave.
octShift DWORD 0, 0
turbShift DWORD 0, 0
;Since the program only calculates odd pixel values, the even pixel values
;must be averaged. Therefore, for each pass through the inner loop, four
;pixels will be drawn. In order to draw the first pixel, the pixel before
;it must be known for the averaging. This pixel color is contained here.
;It is static, so its value carries over from one SIMD_Octave call to the next.
prev_color DWORD 255
;Various masks. Set up to filter out unwanted bits in MMX registers.
;"quad" constants repeat a 16-bit value four times; "double" constants repeat
;a 32-bit value twice.
;mask_quad_255, mask_double_FFFF and mask_four_255 are not referenced by
;SIMD_Octave in this listing; mask_four_255 has the same value as mask_quad_255.
ALIGN 8
mask_32_to_15 QWORD 00007FFF00007FFFh
mask_quad_1 QWORD 0001000100010001h
mask_quad_255 QWORD 00FF00FF00FF00FFh
mask_quad_256 QWORD 0100010001000100h
mask_quad_510 QWORD 01FE01FE01FE01FEh
mask_quad_511 QWORD 01FF01FF01FF01FFh
mask_quad_1536 QWORD 0600060006000600h
mask_double_255 QWORD 000000FF000000FFh
mask_double_FFFF QWORD 0000FFFF0000FFFFh
mask_double_65536 QWORD 0001000000010000h
mask_four_255 QWORD 00FF00FF00FF00FFh
DSEG ENDS
;***********************************************************************
; Constant Segment Declarations
;***********************************************************************
.const
;***********************************************************************
; Code Segment Declarations
;***********************************************************************
.code
COMMENT^
void SIMD_Octave(unsigned long u_init, unsigned long v_init,
long du_init, long dv_init, unsigned long Num_Pix,
unsigned _int16* turb_buffer, unsigned long num_octaves);
^
;-----------------------------------------------------------------------
; SIMD_Octave - accumulate num_octaves octaves of the modified Perlin
;               noise basis function into turb_buffer.
; ABI:   32-bit C calling convention (PROC ... C), arguments on the stack.
; In:    u_init, v_init   = starting texel coordinates
;        du_init, dv_init = per-pixel texel increments
;        num_pixels       = inner-loop pass count. Each pass writes FOUR
;                           16-bit values (8 bytes) and the count is used
;                           without dividing by 4.
;                           NOTE(review): the argument is therefore consumed
;                           as a number of 4-pixel groups - confirm against
;                           the caller.
;        turb_buffer      = pointer to the 16-bit output buffer. Each octave
;                           is ADDED to what is already stored there
;                           (presumably the caller clears it first - confirm).
;        num_octaves      = number of octaves to accumulate
; Out:   turb_buffer updated in place; words of every qword end up in
;        p3:p2:p1:p0 order after the final rearrangement loop.
; Saves: ebx, ecx, edi, esi (USES). MMX state is cleared with EMMS.
; Notes: Works through the static DSEG variables (u, v, du, dv, firstU,
;        firstV, octShift, turbShift, prev_color), so it is not reentrant.
;        If num_pixels or num_octaves is zero the routine returns at once
;        without touching the buffer; without that check the DEC/JNZ and
;        INC/CMP/JNZ loop counters would wrap and run ~2^32 iterations.
;-----------------------------------------------------------------------
SIMD_Octave PROC NEAR C USES ebx ecx edi esi,
u_init:DWORD, v_init:DWORD, du_init:DWORD, dv_init:DWORD,
num_pixels:DWORD, turb_buffer:DWORD, num_octaves:DWORD
    ;Reject empty work before any MMX instruction is executed.
    CMP     num_pixels, 0
    JE      octave_exit
    CMP     num_octaves, 0
    JE      octave_exit
    ;Initialization
    MOVD    MM0, u_init
    MOVD    MM1, v_init
    PUNPCKLDQ MM0, MM0              ;U p1 = u, p3 = u
    MOVD    MM2, du_init
    PUNPCKLDQ MM1, MM1              ;V p1 = v, p3 = v
    MOVD    MM3, dv_init
    PADDD   MM0, MM2                ;U p1 = u, p3 = u + du
    PADDD   MM1, MM3                ;V p1 = v, p3 = v + dv
    PADDD   MM0, MM2                ;U p1 = u, p3 = u + 2du
    PADDD   MM1, MM3                ;V p1 = v, p3 = v + 2dv
    PUNPCKLDQ MM2, MM2
    PUNPCKLDQ MM3, MM3
    PADDD   MM0, MM2                ;U p1 = u + du, p3 = u + 3du
    MOV     [turbShift],0           ;turbShift is the octave number 0,1,2,....
    XOR     ESI,ESI                 ;ESI = octave counter
    MOVQ    DWORD PTR firstU , MM0
    PADDD   MM1, MM3                ;V p1 = v + dv, p3 = v + 3dv
    MOV     [octShift],14           ;octShift is (14 - esi (octave number))
    PSLLD   MM2, 2                  ;DU p1 = 4du, p3 = 4du
    MOVQ    DWORD PTR firstV, MM1
    PSLLD   MM3, 2                  ;DV p1 = 4dv, p3 = 4dv
    MOVQ    DWORD PTR du, MM2
    MOVQ    DWORD PTR dv, MM3
start_octave :
    MOV     EBX, prev_color         ;EBX = color used to average pixel #0
    MOV     EDI, turb_buffer        ;EDI will always be pointer to screen buffer
    MOV     ECX, num_pixels
    SUB     EDI, 8                  ;pre-decrement; the loop adds 8 before it stores
    ;Get the UV parameters in MMX(TM) technology form.
    ;Note: UV texel values are stored in 10.22 fixed integer format.
    ;After setup (2 x 32 bit, high dword listed first):
    ;MM0 = | U texel for pix #1 = u + du | U texel for pix #3 = u + 3du |
    ;MM1 = | V texel for pix #1 = v + dv | V texel for pix #3 = v + 3dv |
    ;This is because the first four pixels drawn on the screen will have the
    ;U and V texel values of:
    ;Pixel #0 = u + 0du
    ;Pixel #1 = u + 1du
    ;Pixel #2 = u + 2du
    ;Pixel #3 = u + 3du
    ;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.
    MOVQ    MM0, DWORD PTR firstU
    MOVQ    MM1, DWORD PTR firstV
    MOVQ    DWORD PTR u, MM0
    MOVQ    DWORD PTR v, MM1
start_scan_line:
    ;First, the program converts the u and v texel coordinates
    ;from 10.22 format to 8.8 format. 10.22 format is used for
    ;decimal accuracy but only 16 of the 32 bits are actually used.
    ;Because the final format will fit in a 16 bit result, u and v
    ;values are converted from 4, 32 bit packed values
    ;to 4, 16 bit packed values that will fit in one MMX register. Output (4 x 16 bit):
    ;MM0 = | U texel - p1 | U texel - p3 | V texel - p1 | V texel - p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;u_16bit = u_init >> 14;
    ;v_16bit = v_init >> 14;
    ;(The shift count comes from octShift: 14 for the first octave, one less
    ;for each following octave.)
    MOVQ    MM1, DWORD PTR u
    MOVQ    MM3, DWORD PTR octShift
    MOVQ    MM0, DWORD PTR v
    PSRLD   MM1, MM3                ;Convert from 10.22 to 10.8 (first octave)
    MOVQ    MM2, DWORD PTR mask_32_to_15 ;Uses 15 instead of 16 because of signed saturation.
    PSRLD   MM0, MM3                ;Convert from 10.22 to 10.8 (first octave)
    PAND    MM1, MM2                ;Convert from 10.8 to 7.8 integer format
    PAND    MM0, MM2                ;Convert from 10.8 to 7.8 integer format
    MOVQ    MM3, DWORD PTR mask_quad_1
    PACKSSDW MM0, MM1               ;Pack the result into one register
    ;Calculation of the bx0, by0, bx1, by1 values for both pixels.
    ;Output (8-bit values in 16-bit lanes):
    ;MM2 = | BX0 p1 | BX0 p3 | BY0 p1 | BY0 p3 |
    ;MM3 = | BX1 p1 | BX1 p3 | BY1 p1 | BY1 p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;bx0 = u_16bit >> 8;
    ;by0 = v_16bit >> 8;
    ;bx1 = bx0 + 1;
    ;by1 = by0 + 1;
    MOVQ    MM1, DWORD PTR u        ;Used for incrementing u for next 4 pix.
    MOVQ    MM2, MM0
    PSRLW   MM2, 8
    PADDD   MM1, DWORD PTR du       ;Used for incrementing u for next 4 pix.
    PADDUSB MM3, MM2                ;mm3 = 0:BX1(1):0:BX1(3):0:BY1(1):0:BY1(3)
    ;Calculation of the rx0, ry0 values for both pixels.
    ;Final output (8-bit values in 16-bit lanes):
    ;MM0 = | RX0 p1 | RX0 p3 | RY0 p1 | RY0 p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;rx0 = u_16bit & 255;
    ;ry0 = v_16bit & 255;
    PSLLW   MM0, 8
    MOVQ    MM4, MM3
    MOVQ    MM6, DWORD PTR mask_quad_1
    PUNPCKHWD MM4, MM2              ;MM4 = 0:BX0(1):0:BX1(1):0:BX0(3):0:BX1(3)
    PUNPCKLWD MM3, MM2              ;MM3 = 0:BY0(1):0:BY1(1):0:BY0(3):0:BY1(3)
    PMULLW  MM4, MM4                ;MM4 = BX0^2(1):BX1^2(1):BX0^2(3):BX1^2(3)
    PSRLW   MM0, 8                  ;MM0 = rx0 and ry0 param for pix 1, 3
    ;This section includes calculation of b00, b01, b10, b11. Output (4 x 16 bit):
    ;MM4 = | b01 for p1 | b11 for p1 | b01 for p3 | b11 for p3 |
    ;MM5 = | b00 for p1 | b10 for p1 | b00 for p3 | b10 for p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;b00 = random1((random1(bx0) + by0));
    ;b01 = random1((random1(bx0) + by1));
    ;b10 = random1((random1(bx1) + by0));
    ;b11 = random1((random1(bx1) + by1));
    MOVQ    MM2, MM3
    PUNPCKLDQ MM3, MM3              ;MM3 = 0:BY0(3):0:BY1(3):0:BY0(3):0:BY1(3)
    PUNPCKHDQ MM2, MM2              ;MM2 = 0:BY0(1):0:BY1(1):0:BY0(1):0:BY1(1)
    MOVQ    MM5, MM4
    MOVQ    DWORD PTR u, MM1        ;Used for incrementing u for next 4 pix.
    PUNPCKLWD MM4, MM4              ;MM4 = BX0^2(3):BX0^2(3):BX1^2(3):BX1^2(3)
    PUNPCKHWD MM5, MM5              ;MM5 = BX0^2(1):BX0^2(1):BX1^2(1):BX1^2(1)
    PADDW   MM4, MM3
    PADDW   MM5, MM2
    ;This section calculates g_b00_0, g_b01_0, g_b10_0, g_b11_0 (and the
    ;matching _1 values) for pix 1 and 3. Output (4 x 16 bit):
    ;MM2 = | g_b00_1 p3 | g_b01_1 p3 | g_b10_1 p3 | g_b11_1 p3 |
    ;MM3 = | g_b00_1 p1 | g_b01_1 p1 | g_b10_1 p1 | g_b11_1 p1 |
    ;MM4 = | g_b00_0 p3 | g_b01_0 p3 | g_b10_0 p3 | g_b11_0 p3 |
    ;MM5 = | g_b00_0 p1 | g_b01_0 p1 | g_b10_0 p1 | g_b11_0 p1 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;g_b00_0 = (random2(b00) & 511) - 256;
    ;g_b01_0 = (random2(b01) & 511) - 256;
    ;g_b10_0 = (random2(b10) & 511) - 256;
    ;g_b11_0 = (random2(b11) & 511) - 256;
    ;g_b00_1 = (random2(b00 + 1) & 511) - 256;
    ;g_b01_1 = (random2(b01 + 1) & 511) - 256;
    ;g_b10_1 = (random2(b10 + 1) & 511) - 256;
    ;g_b11_1 = (random2(b11 + 1) & 511) - 256;
    PMULLW  MM4, MM4                ;random1
    PMULLW  MM5, MM5                ;random1
    MOVQ    MM2, MM6
    MOVQ    MM3, MM6
    PADDUSW MM2, MM4
    PMULLW  MM2, MM2                ;random2
    PADDUSW MM3, MM5
    MOVQ    MM1, DWORD PTR mask_quad_256
    PMULLW  MM3, MM3                ;random2
    MOVQ    MM7, DWORD PTR mask_quad_511
    PMULLW  MM4, MM4                ;random2
    PMULLW  MM5, MM5                ;random2
    PSRLW   MM2, 2
    PSRLW   MM3, 2
    PAND    MM2, MM7
    PSRLW   MM4, 2
    PAND    MM3, MM7
    PSRLW   MM5, 2
    PAND    MM4, MM7
    PAND    MM5, MM7
    PSUBW   MM2, MM1                ;MM2 = g_b##_1 for pixel #3
    PSUBW   MM3, MM1                ;MM3 = g_b##_1 for pixel #1
    PSUBW   MM4, MM1                ;MM4 = g_b##_0 for pixel #3
    PSUBW   MM5, MM1                ;MM5 = g_b##_0 for pixel #1
    ;Take above data for g_b00_0, g_b01_0, g_b10_0, g_b11_0 for pix 1 and 3
    ;and rearrange the packed values in the MMX registers.
    ;Output (4 x 16 bit):
    ;MM2 = | g_b00_0 p3 | g_b00_1 p3 | g_b01_0 p3 | g_b01_1 p3 |
    ;MM3 = | g_b00_0 p1 | g_b00_1 p1 | g_b01_0 p1 | g_b01_1 p1 |
    ;MM6 = | g_b10_0 p3 | g_b10_1 p3 | g_b11_0 p3 | g_b11_1 p3 |
    ;MM7 = | g_b10_0 p1 | g_b10_1 p1 | g_b11_0 p1 | g_b11_1 p1 |
    MOVQ    MM6, MM2
    MOVQ    MM7, MM3
    PUNPCKHWD MM2, MM4              ;MM2 = g_b00_# and g_b01_# for pix #3
    PUNPCKLWD MM6, MM4              ;MM6 = g_b10_# and g_b11_# for pix #3
    PUNPCKHWD MM3, MM5              ;MM3 = g_b00_# and g_b01_# for pix #1
    MOVQ    MM4, MM0                ;Preparing for rx1 and ry1 calculation
    PUNPCKLWD MM7, MM5              ;MM7 = g_b10_# and g_b11_# for pix #1
    ;Calculation of the rx1, ry1 values for both pixels. Final output (4 x 16 bit):
    ;MM4 = | RX1 p1 | RX1 p3 | RY1 p1 | RY1 p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;rx1 = rx0 - 256;
    ;ry1 = ry0 - 256;
    PSUBW   MM4, MM1                ;MM4 = rx1 and ry1 parameters
    ;Setup for the calculation of u1 and u2 for pix #1. Final output (4 x 16 bit):
    ;MM1 = | RX0 p1 | RY0 p1 | RX0 p1 | RY1 p1 |
    MOVQ    MM5, MM0
    MOVQ    MM1, MM4
    PSRLD   MM5, 16
    PSRAD   MM1, 16
    PSLLQ   MM1, 32
    PUNPCKHDQ MM1, MM5
    PACKSSDW MM1, MM1
    PACKSSDW MM5, MM5
    PUNPCKLDQ MM1, MM5
    ;Calculation for U1 and U2 for pixel #1 -> After multiplication... Output (2 x 32 bit):
    ;MM3 = | U1 for pixel #1 | U2 for pixel #1 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;u1 = rx0 * g_b00_0 + ry0 * g_b00_1;
    ;u2 = rx0 * g_b01_0 + ry1 * g_b01_1;
    PMADDWD MM3, MM1                ;MM3 = u1 and u2 for pixel #1
    ;Setup for the calculation of v1 and v2 for pix #1. Final output (4 x 16 bit):
    ;MM5 = | RX1 p1 | RY0 p1 | RX1 p1 | RY1 p1 |
    MOVQ    MM5, MM4
    PSRAD   MM5, 16
    MOVQ    MM1, MM0
    PSRLD   MM1, 16
    PSLLQ   MM1, 32
    PUNPCKHDQ MM1, MM5
    PACKSSDW MM1, MM1
    PACKSSDW MM5, MM5
    PUNPCKLDQ MM5, MM1
    ;Calculation for V1 and V2 for pixel #1 -> After multiplication... Output (2 x 32 bit):
    ;MM7 = | V1 for pixel #1 | V2 for pixel #1 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;v1 = rx1 * g_b00_0 + ry0 * g_b00_1;
    ;v2 = rx1 * g_b01_0 + ry1 * g_b01_1;
    PMADDWD MM7, MM5                ;MM7 = v1 and v2 for pixel #1
    ;Setup for the calculation of u1 and u2 for pix #3. Final output (4 x 16 bit):
    ;MM1 = | RX0 p3 | RY0 p3 | RX0 p3 | RY1 p3 |
    MOVQ    MM5, MM0
    PSLLD   MM5, 16
    PSRLD   MM5, 16
    MOVQ    MM1, MM4
    PSLLD   MM1, 16
    PSRAD   MM1, 16
    PUNPCKLDQ MM1, MM1
    PUNPCKHDQ MM1, MM5
    PACKSSDW MM1, MM1
    PACKSSDW MM5, MM5
    PUNPCKLDQ MM1, MM5
    ;Calculation for U1 and U2 for pixel #3 -> After multiplication... Output (2 x 32 bit):
    ;MM2 = | U1 for pixel #3 | U2 for pixel #3 |
    PMADDWD MM2, MM1                ;MM2 = u1 and u2 for pixel #3
    ;Setup for the calculation of v1 and v2 for pix #3. Final output (4 x 16 bit):
    ;MM4 = | RX1 p3 | RY0 p3 | RX1 p3 | RY1 p3 |
    PSLLD   MM4, 16
    PSRAD   MM4, 16
    MOVQ    MM5, MM0
    PSLLD   MM5, 16
    PSRAD   MM5, 16
    PUNPCKLDQ MM5, MM5
    PUNPCKHDQ MM5, MM4
    PACKSSDW MM5, MM5
    PACKSSDW MM4, MM4
    PUNPCKLDQ MM4, MM5
    ;Calculation for V1 and V2 for pixel #3 -> After multiplication... Output (2 x 32 bit):
    ;MM6 = | V1 for pixel #3 | V2 for pixel #3 |
    PMADDWD MM6, MM4                ;MM6 = v1 and v2 for pixel #3
    ;Calculation for SX and SY for pixels #1 and #3. Output (4 x 16 bit):
    ;MM1 = | SX p1 | SX p3 | SY p1 | SY p3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;sx = (((rx0 * rx0) >> 1) * ((1536 - (rx0 << 2))))>> 16;
    ;sy = (((ry0 * ry0) >> 1) * ((1536 - (ry0 << 2))))>> 16;
    MOVQ    MM5, MM0
    PMULLW  MM5, MM5
    MOVQ    MM4, MM0
    MOVQ    MM1, DWORD PTR mask_quad_1536
    PSLLW   MM4, 2
    PSUBD   MM6, MM2                ;V1 - U1 and V2 - U2 for P3
    PSUBD   MM7, MM3                ;V1 - U1 and V2 - U2 for P1
    PSUBW   MM1, MM4
    PSRLW   MM5, 1
    PMULHW  MM1, MM5                ;MM1 = sx and sy param for pix 1, 3
    ;Calculation of A and B for pixel #1 and #3. Output (2 x 32 bit):
    ;MM7 = | A for pixel #1 | B for pixel #1 |
    ;MM6 = | A for pixel #3 | B for pixel #3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;a = u1 + sx * ((v1 - u1) >> 8);
    ;b = u2 + sx * ((v2 - u2) >> 8);
    PSRAD   MM7, 8
    PSRAD   MM6, 8
    MOVQ    MM4, MM1
    MOVQ    MM5, MM1
    PSRLQ   MM4, 16
    PUNPCKLWD MM1, MM1
    PUNPCKHDQ MM4, MM4
    PMADDWD MM7, MM4
    PSLLD   MM5, 16
    MOVQ    MM4, DWORD PTR v        ;Used for incrementing v for next 4 pix
    PSRLD   MM5, 16
    PUNPCKHDQ MM5, MM5
    PADDD   MM4, DWORD PTR dv       ;Used for incrementing v for next 4 pix
    PADDD   MM7, MM3                ;MM7 = a and b parameter for pix #1
    PMADDWD MM6, MM5
    MOVQ    MM3, DWORD PTR mask_double_65536
    PSRLD   MM1, 16
    MOVQ    DWORD PTR v, MM4        ;Used for incrementing v for next 4 pix
    ;Calculation of color indexes for pixel #1 and #3. Output (2 x 32 bit):
    ;MM6 = | Color index for pixel #1 | Color index for pixel #3 |
    ;This code correlates to the following "C" code in the "C_Noise()" function.
    ;color = (a + 65536 + sy * ((b - a) >> 8)) >> 9;
    ;NOTE(review): the original listing also loaded mask_quad_510 and
    ;mask_double_255 into MM4 in this section, but MM4 was never read before
    ;being overwritten, so those loads are dropped. No clamp is applied to
    ;the color index here - confirm that none was intended.
    PADDD   MM6, MM2                ;MM6 = a and b parameter for pix #3
    MOVQ    MM2, MM6
    PUNPCKLDQ MM6, MM7
    MOVD    MM0, ebx                ;Move the last color written into MM0
    PUNPCKHDQ MM2, MM7
    PADDD   MM3, MM2
    PSUBD   MM6, MM2
    PSRAD   MM6, 8
    PMADDWD MM6, MM1
    PADDD   MM6, MM3
    PSRLD   MM6, 9                  ;MM6 = color for pix #1 and #3
    ;Since the color values have been calculated for pixels 1 and 3,
    ;pixels 0 and 2 still need to be determined. Pixel 0 is calculated by
    ;(prev_pixel + pixel #1) / 2 and pixel 2 is calculated by (pixel #1 +
    ;pixel #3) / 2. Output (4 x 16 bit):
    ;MM3 = | Color p0 index | Color p1 index | Color p2 index | Color p3 index |
    PACKSSDW MM6, MM6
    MOVQ    MM7, MM6
    MOVQ    MM3, MM6
    PSRLD   MM7, 16
    PUNPCKLWD MM7, MM0
    PADDW   MM6, MM7
    PSRLW   MM6, 1
    PUNPCKLWD MM3, MM6
    ADD     EDI, 8
    ;MM3 now holds the four values for this pass. Scale them down by the
    ;octave number (right shift by turbShift), add them to what the buffer
    ;already holds from the previous octaves, and write the sum back.
    ;The looping construct may look strange but it is done to allow for the
    ;calculation of the pixel colors at the end of the scan line.
    MOVD    EBX ,MM3                ;low word of EBX = pixel #3, used for the next pixel #0 average
    PSRLW   MM3,[turbShift]
    PADDW   MM3,[EDI]
    MOVQ    [EDI], MM3              ;Write out the 4 pix to the buffer.
    DEC     ECX
    JNZ     start_scan_line
    INC     ESI
    INC     [turbShift]
    DEC     [octShift]
    CMP     ESI, num_octaves
    JNZ     start_octave
    MOV     prev_color, EBX         ;Low word of EBX is the color index of pixel #3. Store it.
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;; here we rearrange the turb buffer
    ;; buffer[i] = p0:p1:p2:p3 --> buffer[i] = p3:p2:p1:p0
    MOV     EDI, turb_buffer
    MOV     ECX, num_pixels
flipLoop:
    MOVQ    MM5, [EDI]
    MOVQ    MM4, MM5
    PUNPCKHDQ MM5, MM5              ;MM5 = p0:p1:p0:p1
    MOVQ    MM7, MM5                ;MM7 = p0:p1:p0:p1
    PSRLD   MM5, 16
    MOVQ    MM6, MM4
    PUNPCKLWD MM5, MM7              ;MM5 = *:*:p1:p0
    PSRLQ   MM6, 16                 ;MM6 = 0:p0:p1:p2
    PUNPCKLWD MM6, MM4              ;MM6 = *:*:p3:p2
    PUNPCKLDQ MM5, MM6              ;MM5 = p3:p2:p1:p0
    MOVQ    [EDI], MM5
    ADD     EDI, 8
    DEC     ECX
    JNZ     flipLoop
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    EMMS                            ;Clear out the MMX registers and set appropriate flags.
octave_exit:
    RET                             ;End of function
SIMD_Octave ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
END
Appendix B - Wood (Sqrt) Code Listing
TITLE wood textures using MMX(TM) technology
;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT
KLAM equ 0
;***********************************************************************
; Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA
;Static working storage and packed constants for _SIMD_Wood_Sqrt.
;Lookup tables and the turbulence buffer are defined in another module.
;_SIMD_Wood_Sqrt reads _sqrtTable and _woodTable and both reads and
;overwrites _turbulenceBuf; it does not reference _marbleTable.
extrn _marbleTable : ptr sword
extrn _woodTable : ptr sword
extrn _sqrtTable : ptr sword
extrn _turbulenceBuf : ptr sword
;_4du and _4dv each hold one increment replicated for two texels:
;32 bit per texel * two texels = 64 bits. This enables us
;to work with two pixels at one time using MMX technology.
;_SIMD_Wood_Sqrt stores 4*du / 4*dv here before entering its loop.
ALIGN 8
_4du QWORD ?
_4dv QWORD ?
;result is not referenced by _SIMD_Wood_Sqrt.
result dd 0
;Various masks. Set up to filter out unwanted bits in MMX registers.
;Notes on the values (names do not always match the bits they select):
; const_FFFF_Minus_High_sqrt = 0FFFFh - 07FFh, used to clip to [0 : 7FFh].
; const_FFFF_Minus_High_Wood = 0FFFFh - 176Fh, used to clip to [0 : 176Fh].
; mask_odd_indexes clears bit 0 of every word, i.e. it forces indexes even.
; mask_high_words keeps the LOW 16 bits of each dword.
; mask_low_words keeps the HIGH 16 bits of each dword.
;_SIMD_Wood_Sqrt uses const_quad_10, const_quad_15, both const_FFFF_Minus_*
;values, mask_odd_indexes and mask_low_words; it does not reference the rest.
ALIGN 8
const_quad_10 QWORD 000a000a000a000ah
const_quad_15 QWORD 000f000f000f000fh
const_FFFF_Minus_High_sqrt QWORD 0f800f800f800f800h
const_FFFF_Minus_High_Wood QWORD 0e890e890e890e890h
mask_odd_indexes QWORD 0fffefffefffefffeh
mask_high_words QWORD 00000ffff0000ffffh
mask_low_words QWORD 0ffff0000ffff0000h
mask_all_1 QWORD 0ffffffffffffffffh
mask_clear_word_1 QWORD 0000000000000ffffh
const_quad_735 QWORD 002df02df02df02dfh
mask_quad_green QWORD 0800080008000800h
const_quad_1500 QWORD 05dc05dc05dc05dch
DSEG ENDS
;***********************************************************************
; Constant Segment Declarations
;***********************************************************************
.const
;***********************************************************************
; Code Segment Declarations
;***********************************************************************
.code
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; SIMD_Wood_Sqrt(u_init : DWORD, v_init : DWORD, du : DWORD, dv : DWORD,
;;;; num_pixels : DWORD)
;-----------------------------------------------------------------------
; _SIMD_Wood_Sqrt(u_init, v_init, du, dv, num_pixels)
; Replaces the turbulence values in _turbulenceBuf, four at a time, with
; wood colors looked up as
;     woodTable[(10 * sqrtTable[(u*u + v*v) >> 10] + 15 * turbulence) >> 2]
; ABI:   arguments are read from the stack; the caller removes them
;        (plain RET).
; In:    [esp+4..] = u_init, v_init, du, dv, num_pixels (all DWORD).
;        The equates below are the argument offsets from ESP AFTER the
;        16-byte register save area has been allocated.
; Out:   _turbulenceBuf overwritten in place with 16-bit colors.
; Saves: eax, ecx, edx, edi (restored on exit). MMX state cleared by EMMS.
; Notes: num_pixels is divided by 4; a remainder is ignored. When
;        num_pixels < 4 the routine returns without doing anything -
;        without that check ECX would be 0 on loop entry and DEC/JNZ would
;        run ~2^32 iterations past the end of the buffer.
;-----------------------------------------------------------------------
wood_u_init = 20
wood_v_init = 24
wood_du = 28
wood_dv = 32
wood_num_pixels = 36
_SIMD_Wood_Sqrt PROC NEAR
    sub     esp , 16
    mov     [esp ], edi
    mov     [esp + 4], edx
    mov     [esp + 8], ecx
    mov     [esp + 12], eax
    MOV     ECX, wood_num_pixels[esp]
    SHR     ECX, 2                      ; ECX= # of times to draw 4 pixels at once
    JZ      wood_exit                   ; fewer than 4 pixels: nothing to do
    LEA     EDI, _turbulenceBuf
    MOVD    MM4, wood_du[esp]           ; 0:du
    MOVD    MM0, wood_u_init[esp]       ; 0:u
    PSLLQ   MM4, 32                     ; du:0
    PUNPCKLDQ MM0, MM0                  ; u:u
    MOVD    MM5, wood_dv[esp]           ; 0:dv
    PADDD   MM0, MM4                    ; u + du:u
    MOVD    MM1, wood_v_init[esp]       ; 0:v
    PUNPCKHDQ MM4, MM4                  ; du:du
    PUNPCKLDQ MM1, MM1                  ; v:v
    PSLLQ   MM5, 32                     ; dv:0
    PADDD   MM1, MM5                    ; v + dv:v
    PUNPCKHDQ MM5, MM5                  ; dv:dv
    MOVQ    MM2, MM0                    ; u + du:u
    MOVQ    MM3, MM1                    ; v + dv:v
    PADDD   MM4, MM4                    ; 2du:2du
    PADDD   MM5, MM5                    ; 2dv:2dv
    PADDD   MM2, MM4                    ; u + 3du:u+2du
    PADDD   MM3, MM5                    ; v + 3dv:v+2dv
    PADDD   MM4, MM4                    ; 4du:4du
    PADDD   MM5, MM5                    ; 4dv:4dv
    MOVQ    dword ptr _4du, mm4
    MOVQ    dword ptr _4dv, mm5
    ;; during the loop the following hold
    ;; mm0 = u1 : u0
    ;; mm2 = u3 : u2
    ;; mm1 = v1 : v0
    ;; mm3 = v3 : v2
    ;; _4du = 4du : 4du
    ;; _4dv = 4dv : 4dv
    ;; ecx = number of 4-pixel groups still to process (> 0)
    ;; edi = address of the current group in _turbulenceBuf
wood_loop:
    MOVQ    MM5, MM1                    ; v1 : v0
    MOVQ    MM4, MM0                    ; u1 : u0
    MOVQ    MM6, MM3                    ; v3 : v2
    PSLLD   MM5, 2                      ; shift left by 2 (16 -14)
    MOVQ    MM7, MM2                    ; u3 : u2
    PSRLD   MM4, 14                     ; shift right by 14
    PAND    MM5, dword ptr mask_low_words ; mm5 = v1: 0 : v0 : 0
    PSLLD   MM6, 2                      ; shift left by 2 (16 -14)
    PADDD   MM0, dword ptr _4du         ; u1 + 4du : u0 + 4du
    POR     MM4, MM5                    ; mm4 = v1:u1:v0:u0
    PAND    MM6, dword ptr mask_low_words
    PSRLD   MM7, 14                     ; shift right by 14
    PMADDWD MM4, MM4                    ; res1 = (u1*u1 + v1*v1) : res0 = (u0*u0 + v0*v0)
    POR     MM7, MM6                    ; mm7 = v3:u3:v2:u2
    PADDD   MM1, dword ptr _4dv         ; v1 + 4dv : v0 + 4dv
    PMADDWD MM7, MM7                    ; res3 = (u3*u3 + v3*v3) : res2 = (u2*u2 + v2*v2)
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;; Pack the four r^2 values to words, taking each result after a
    ;; 10-bit right shift. So that PACKSSDW only packs and does not
    ;; saturate, each value is shifted left so that those 16 bits sit in
    ;; the high word and is then shifted right arithmetically by 16.
    ;; The left shift is by 6 bits (16 - 10) in place of the 10-bit
    ;; right shift.
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    PADDD   MM2, dword ptr _4du         ; u3 + 4du : u2 + 4du
    PSLLD   MM4, 6                      ; shift left by 6 (16 -10)
    PADDD   MM3, dword ptr _4dv         ; v3 + 4dv : v2 + 4dv
    PSRAD   MM4, 16                     ; extend sign bit for PACKSSDW
    MOVQ    MM5, [EDI]                  ; turbulence
    PSLLD   MM7, 6                      ; shift left by 6 (16 -10)
    PMULLW  MM5, dword ptr const_quad_15 ; turb = 15 * turb
    PSRAD   MM7, 16                     ; extend sign bit for PACKSSDW
    MOVQ    MM6, dword ptr const_FFFF_Minus_High_sqrt
    ;;;;;; finally pack them correctly
    ;;;;;;; mm4 = (res3:res2:res1:res0) >> 10 and packed
    PACKSSDW MM4, MM7
    ;;; clip the values against the range [0 : 7FFh ]
    ;;; which is the size of the sqrt table (2048 entries)
    PADDUSW MM4, MM6                    ; mm6 = const_FFFF_Minus_High_sqrt
    PSUBUSW MM4, MM6                    ; mm6 = const_FFFF_Minus_High_sqrt
    PAND    MM4, dword ptr mask_odd_indexes ; clear bit 0: indexes become even
    MOVD    EAX, MM4
    MOV     EDX, EAX
    AND     EAX, 0ffffh                 ; eax = res0
    SHR     EDX, 16                     ; edx = res1
    PSRLQ   MM4, 32
    MOVD    MM7, [ _sqrtTable + eax*2]  ; read from the sqrt table
    PUNPCKLWD MM7, [ _sqrtTable + edx*2] ; *:*:sqrt(res1):sqrt(res0)
    MOVD    EAX, MM4
    MOV     EDX, EAX
    AND     EAX, 0ffffh                 ; eax = res2
    SHR     EDX, 16                     ; edx = res3
    MOVD    MM6, [ _sqrtTable + eax*2]  ; read from the sqrt table
    PUNPCKLWD MM6, [ _sqrtTable + edx*2] ; *:*:sqrt(res3):sqrt(res2)
    PUNPCKLDQ MM7, MM6                  ; sqrt(res3):sqrt(res2):sqrt(res1):sqrt(res0)
    ;;;;; mm7 = 10 * (sqrt(res3):sqrt(res2):sqrt(res1):sqrt(res0))
    PMULLW  MM7, dword ptr const_quad_10
    ADD     EDI, 8
    ;;;;; wood_indx = 10 * sqrt(res) + 15 * turbulence
    MOVQ    MM6, dword ptr const_FFFF_Minus_High_Wood
    PADDW   MM7, MM5
    PSRLW   MM7, 2                      ; wood_indx >>= 2
    ;;; clip the values against the range [0 : 176Fh ]
    ;;; which is the size of the wood table (6000 entries).
    PADDUSW MM7, MM6                    ; mm6 = const_FFFF_Minus_High_Wood
    PSUBUSW MM7, MM6                    ; mm6 = const_FFFF_Minus_High_Wood
    PAND    MM7, dword ptr mask_odd_indexes ; clear bit 0: indexes become even
    MOVD    EAX, MM7                    ; indx1:indx0
    MOV     EDX, EAX
    AND     EAX, 0ffffh                 ; eax = indx0
    SHR     EDX, 16                     ; edx = indx1
    MOVD    MM6, [ _woodTable + eax*2]  ; read wood colors from table
    PSRLQ   MM7, 32
    PUNPCKLWD MM6, [ _woodTable + edx*2] ; *:*:wood1:wood0
    MOVD    EAX, MM7                    ; indx3:indx2
    MOV     EDX, EAX
    AND     EAX, 0ffffh                 ; eax = indx2
    SHR     EDX, 16                     ; edx = indx3
    MOVD    MM7, [ _woodTable + eax*2]  ; read wood colors from table
    PUNPCKLWD MM7, [ _woodTable + edx*2] ; *:*:wood3:wood2
    PUNPCKLDQ MM6, MM7                  ; wood3:wood2:wood1:wood0
    MOVQ    [EDI-8], MM6                ; store the colors over the turbulence values
    DEC     ECX
    JNZ     wood_loop
    EMMS                                ; Clear out the MMX registers and set appropriate flags.
wood_exit:
    MOV     EAX, [ESP + 12]
    MOV     ECX, [ESP + 8]
    MOV     EDX, [ESP + 4]
    MOV     EDI, [ESP ]
    ADD     ESP, 16
    RET                                 ; end of function
_SIMD_Wood_Sqrt ENDP
Appendix C - Marble Code Listing
TITLE Marble textures using MMX(TM) technology
;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT
;KLAM selects, at assembly time, whether the constant 10:10:10:10 is kept
;in MM6 (non-zero) or used as a memory operand (zero).
KLAM equ 0
;***********************************************************************
; Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA
extrn _marbleTable : ptr sword
extrn _woodTable : ptr sword
extrn _sqrtTable : ptr sword
extrn _turbulenceBuf : ptr sword
;_4du and _4dv are 64 bit: 32 bit per texel * two texels = 64 bits.
;_SIMD_Marble keeps its increments in registers and does not reference
;_4du, _4dv or result.
ALIGN 8
_4du QWORD ?
_4dv QWORD ?
result dd 0
;Various masks. Set up to filter out unwanted bits in MMX registers.
;_SIMD_Marble uses only const_quad_10 and mask_odd_indexes.
;mask_odd_indexes clears bit 0 of every word, i.e. it forces indexes even.
ALIGN 8
const_quad_10 QWORD 000a000a000a000ah
const_quad_15 QWORD 000f000f000f000fh
const_FFFF_Minus_High_sqrt QWORD 0f800f800f800f800h
const_FFFF_Minus_High_Wood QWORD 0e890e890e890e890h
mask_odd_indexes QWORD 0fffefffefffefffeh
mask_high_words QWORD 00000ffff0000ffffh
mask_low_words QWORD 0ffff0000ffff0000h
mask_all_1 QWORD 0ffffffffffffffffh
mask_clear_word_1 QWORD 0000000000000ffffh
const_quad_735 QWORD 002df02df02df02dfh
mask_quad_green QWORD 0800080008000800h
const_quad_1500 QWORD 05dc05dc05dc05dch
DSEG ENDS
;***********************************************************************
; Constant Segment Declarations
;***********************************************************************
.const
;***********************************************************************
; Code Segment Declarations
;***********************************************************************
.code
;***********************************************************************
;;;; SIMD_Marble uses the contents of _turbulenceBuf which was filled
;;;; before by SIMD_Octave with num_octaves of perlin noise.
;;;; The marble approximation is
;;;;     marb(u,v) = sin(u + turb(u,v)),
;;;; we use a pre-computed sine table to accelerate it; this also enables
;;;; the usage of MMX technology.
;;;; The table '_marbleTable' actually holds the marble value itself,
;;;; which is a manipulation of the sine output.
;;;; In each iteration 4 pixels are calculated; 'num_pixels' is a
;;;; multiple of 4.
;***********************************************************************
;;; SIMD_Marble(u_init:DWORD, du:DWORD, num_pixels:DWORD )
;;; In:    arguments on the stack; the caller removes them (plain RET).
;;;        The equates below are the argument offsets from ESP AFTER the
;;;        16-byte register save area has been allocated.
;;; Out:   _turbulenceBuf overwritten in place with 16-bit marble values.
;;; Saves: eax, ecx, ebx, edi (restored on exit). MMX state cleared by EMMS.
;;; Notes: when num_pixels < 4 the routine returns without doing anything;
;;;        without that check ECX would be 0 on loop entry and DEC/JNZ
;;;        would run ~2^32 iterations past the end of the buffer.
;;;        NOTE(review): unlike the wood routine, the table index is not
;;;        clipped before the lookup, so _marbleTable presumably covers
;;;        every even 16-bit index - confirm against the table definition.
marb_u_init = 20
marb_du = 24
marb_num_pixels = 28
_SIMD_Marble PROC NEAR
    SUB     ESP, 16
    MOV     [ESP + 12], EAX
    MOV     [ESP + 8], ECX
    MOV     [ESP + 4], EBX
    MOV     [ESP ], EDI
    MOV     ECX, marb_num_pixels[esp]   ; number of pixels in scanline
    SHR     ECX, 2                      ; ecx = # of times to draw 4 pixels at once
    JZ      marb_exit                   ; fewer than 4 pixels: nothing to do
    LEA     EDI, _turbulenceBuf         ; already calculated turbulence
    MOVD    MM2, marb_du[esp]           ; mm2 = 0:du
    MOVD    MM0, marb_u_init[esp]       ; mm0 = 0:u
    PSLLQ   MM2, 32                     ; mm2 = du:0
    PUNPCKLDQ MM0, MM0                  ; u : u
    PADDD   MM0, MM2                    ; u + du : u
    PUNPCKHDQ MM2, MM2                  ; du : du
    MOVQ    MM1, MM0                    ; u + du : u
    PADDD   MM2, MM2                    ; 2du : 2du
    PADDD   MM1, MM2                    ; u + 3du : u + 2du
    PADDD   MM2, MM2                    ; 4du : 4du
if (KLAM)
    MOVQ    MM6, dword ptr const_quad_10
endif
    ;; during the loop the following hold
    ;; mm0 = u1 : u0
    ;; mm1 = u3 : u2
    ;; mm2 = 4du: 4du
    ;; if KLAM is defined then
    ;; mm6 = 10:10:10:10
    ;; on P55C it is paired on the u pipe
    ;; so we can PAND with memory
marb_loop:
    MOVQ    MM5, [EDI]                  ; mm5 = turb3:turb2:turb1:turb0
    MOVQ    MM3, MM0                    ; mm3 = u1:u0
    MOVQ    MM4, MM1                    ; mm4 = u3:u2
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;;;; the following lines pack u3,u2,u1,u0 from two registers
    ;;;; to one register including shift right by 14 .
    ;;;; in order to make packssdw not to change the numbers
    ;;;; but only pack them we do shift left by 16 and then
    ;;;; shift right arithmetic by 16 to extend the sign bit .
    ;;;; The 16 bits shift left is done by 2 bits shift left
    ;;;; instead of 14 bits shift right.
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    PSLLD   MM3, 2                      ; shift left by 2 (16 -14)
if ( KLAM )
    PMULLW  MM5, MM6                    ; turb = 10 * turb
else
    PMULLW  MM5, dword ptr const_quad_10 ; turb = 10 * turb
endif
    PSLLD   MM4, 2                      ; shift left by 2 (16 -14)
    PADDD   MM0, MM2                    ; increment each of u1:u0 by 4du for next iteration
    PSRAD   MM3,16                      ; extend sign bit for PACKSSDW
    PSRAD   MM4,16                      ; extend sign bit for PACKSSDW
    ADD     EDI,8                       ; increment edi for next iteration
    PACKSSDW MM3, MM4                   ; mm3 = (u3:u2:u1:u0) >> 14 and packed
    PADDD   MM1, MM2                    ; increment each of u3:u2 by 4du for next iteration
    PADDW   MM3, MM5                    ; marble indexes are: (u >> 14) + (10 * turb)
    ;;;;; now read the colors from the marble table
    ;;;;; the input to this part is mm3 = indx3:indx2:indx1:indx0
    ;;;;; the output is mem[edi-8] = pixel3 :pixel2 :pixel1 :pixel0
    PAND    MM3, dword ptr mask_odd_indexes ; clear bit 0: indexes become even
    MOVD    EAX, MM3                    ; eax = indx1:indx0
    MOV     EBX, EAX                    ; ebx = indx1:indx0
    AND     EAX, 0ffffh                 ; eax = indx0
    SHR     EBX, 16                     ; ebx = indx1
    PSRLQ   MM3, 32                     ; mm3 = 0:0:indx3:indx2
    MOVD    MM4, [ _marbleTable + eax*2] ; read from the marble table
    PUNPCKLWD MM4, [ _marbleTable + ebx*2] ; *:*:marb1:marb0
    MOVD    EAX, MM3                    ; eax = indx3:indx2
    MOV     EBX, EAX                    ; ebx = indx3:indx2
    AND     EAX, 0ffffh                 ; eax = indx2
    SHR     EBX, 16                     ; ebx = indx3
    MOVD    MM5, [ _marbleTable + eax*2] ; read from the marble table
    PUNPCKLWD MM5, [ _marbleTable + ebx*2] ; *:*:marb3:marb2
    PUNPCKLDQ MM4, MM5                  ; marb3:marb2:marb1:marb0
    MOVQ    [EDI-8], MM4                ; store the 4 pixels over the turbulence values
    DEC     ECX
    JNZ     marb_loop
    EMMS                                ; Clear out the MMX registers and set appropriate flags.
marb_exit:
    MOV     EAX, [ESP + 12]
    MOV     ECX, [ESP + 8]
    MOV     EBX, [ESP + 4]
    MOV     EDI, [ESP ]
    ADD     ESP, 16
    RET                                 ; end of function
_SIMD_Marble ENDP
Appendix D - DDU and DDV Code Listing
;Get the UV parameters in MMX(TM) technology form.
;Note (from the original listing): u/v, du/dv and ddu/ddv texel values are
;stored in 10.22 fixed integer format.
;
;Each 64-bit result holds the value for pixel #1 in its HIGH dword and the
;value for pixel #3 in its LOW dword. Pixels #0 and #2 are averaged from
;their neighbours elsewhere, so only pixels #1 and #3 are set up here.
;
;Texel values of the first four pixels:
;   Pixel #0 = u
;   Pixel #1 = u + du
;   Pixel #2 = u + 2du + ddu
;   Pixel #3 = u + 3du + 3ddu
;
;Results written by this fragment (high dword : low dword):
;   u   = u + du      : u + 3du + 3ddu
;   v   = v + dv      : v + 3dv + 3ddv
;   du  = 4du + 10ddu : 4du + 18ddu      (step added to u per 4-pixel group)
;   dv  = 4dv + 10ddv : 4dv + 18ddv
;   ddu = 16ddu       : 16ddu            (step added to du per 4-pixel group)
;   ddv = 16ddv       : 16ddv
;The per-group step values for all four pixels are
;   du:  #0 = 4du + 6ddu, #1 = 4du + 10ddu, #2 = 4du + 14ddu, #3 = 4du + 18ddu
;   ddu: 16ddu for every pixel
;
;On exit: MM0 = du pair, MM1 = dv pair, MM2 = ddu pair, MM3 = ddv pair,
;         ECX = ECX >> 2.

;---- Phase 1a: U start values (MM0 accumulates, MM2 is the addend) ----
        MOVD      MM0, u_init                   ;0 : u
        SHR       ECX, 2                        ;ECX = # of times to draw 4 pixels at once
        PUNPCKLDQ MM0, MM0                      ;u : u
        MOVD      MM2, du_init                  ;0 : du
        PADDD     MM0, MM2                      ;u : u + du
        PADDD     MM0, MM2                      ;u : u + 2du
        PUNPCKLDQ MM2, MM2                      ;du : du
        PADDD     MM0, MM2                      ;u + du : u + 3du
        MOVD      MM2, ddu_init                 ;0 : ddu
        PADDD     MM0, MM2                      ;u + du : u + 3du + ddu
        PADDD     MM0, MM2                      ;u + du : u + 3du + 2ddu
        PADDD     MM0, MM2                      ;u + du : u + 3du + 3ddu

;---- Phase 1b: V start values (MM1 accumulates, MM3 is the addend) ----
        MOVD      MM1, v_init                   ;0 : v
        PUNPCKLDQ MM1, MM1                      ;v : v
        MOVD      MM3, dv_init                  ;0 : dv
        PADDD     MM1, MM3                      ;v : v + dv
        PADDD     MM1, MM3                      ;v : v + 2dv
        PUNPCKLDQ MM3, MM3                      ;dv : dv
        PADDD     MM1, MM3                      ;v + dv : v + 3dv
        MOVD      MM3, ddv_init                 ;0 : ddv
        PADDD     MM1, MM3                      ;v + dv : v + 3dv + ddv
        PADDD     MM1, MM3                      ;v + dv : v + 3dv + 2ddv
        PADDD     MM1, MM3                      ;v + dv : v + 3dv + 3ddv

        MOVQ      DWORD PTR u, MM0
        MOVQ      DWORD PTR v, MM1

;---- Phase 2a: DU per-group step ----
        MOVD      MM0, du_init                  ;0 : du
        PUNPCKLDQ MM0, MM0                      ;du : du
        PSLLD     MM0, 2                        ;4du : 4du
        MOVD      MM2, ddu_init                 ;0 : ddu
        PUNPCKLDQ MM2, MM2                      ;ddu : ddu
        PSLLD     MM2, 1                        ;2ddu : 2ddu
        PADDD     MM0, MM2                      ;4du + 2ddu : 4du + 2ddu
        PSLLD     MM2, 2                        ;8ddu : 8ddu
        PADDD     MM0, MM2                      ;4du + 10ddu : 4du + 10ddu
        MOVD      MM2, ddu_init                 ;0 : ddu
        PSLLD     MM2, 3                        ;0 : 8ddu
        PADDD     MM0, MM2                      ;4du + 10ddu : 4du + 18ddu

;---- Phase 2b: DV per-group step ----
        MOVD      MM1, dv_init                  ;0 : dv
        PUNPCKLDQ MM1, MM1                      ;dv : dv
        PSLLD     MM1, 2                        ;4dv : 4dv
        MOVD      MM3, ddv_init                 ;0 : ddv
        PUNPCKLDQ MM3, MM3                      ;ddv : ddv
        PSLLD     MM3, 1                        ;2ddv : 2ddv
        PADDD     MM1, MM3                      ;4dv + 2ddv : 4dv + 2ddv
        PSLLD     MM3, 2                        ;8ddv : 8ddv
        PADDD     MM1, MM3                      ;4dv + 10ddv : 4dv + 10ddv
        MOVD      MM3, ddv_init                 ;0 : ddv
        PSLLD     MM3, 3                        ;0 : 8ddv
        PADDD     MM1, MM3                      ;4dv + 10ddv : 4dv + 18ddv

        MOVQ      DWORD PTR du, MM0
        MOVQ      DWORD PTR dv, MM1

;---- Phase 3: DDU / DDV per-group step (reuses 0 : 8dd* left in MM2 / MM3) ----
        PSLLD     MM2, 1                        ;0 : 16ddu
        PUNPCKLDQ MM2, MM2                      ;16ddu : 16ddu
        MOVQ      DWORD PTR ddu, MM2
        PSLLD     MM3, 1                        ;0 : 16ddv
        PUNPCKLDQ MM3, MM3                      ;16ddv : 16ddv
        MOVQ      DWORD PTR ddv, MM3
Appendix E - Z-Buffer Scanline Algorithm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; z is calculated along the scan line z = z_init + i * dz_init
;;;
;;; For every group of 4 pixels the routine compares the 4 depth words in
;;; z_buffer (Za) with 4 freshly interpolated depth words (Zb), keeps the
;;; smaller one (signed 16-bit compare) in z_buffer, and merges the matching
;;; colors: color_line where Zb < Za, color_buffer otherwise.
;;;
;;; num_pixels is processed in groups of 4 (num_pixels >> 2); a remainder of
;;; 1..3 pixels is not processed and num_pixels < 4 processes nothing.
;;;
;;; NOTE(review): the merged colors are stored back through color_line
;;; ([edi-8]), not color_buffer - confirm this is the intended destination.
;;; NOTE(review): z_line is never referenced and EBX is saved by USES but not
;;; used; the "[ebx]" remnant in the loop suggests Zb was once loaded from
;;; z_line.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MMX_INCZbuffer PROC NEAR C USES edi esi ecx eax ebx,
z_init: DWORD, dz_init: DWORD,
num_pixels: DWORD, z_line: PTR SWORD, color_line: PTR SWORD,
z_buffer: PTR SWORD, color_buffer: PTR SWORD
        MOVD      MM1, dz_init
        MOVD      MM4, z_init
        MOVD      MM5, dz_init
        PSLLD     MM1, 16                       ;0:0:dz:0
        PAND      MM4, DWORD PTR mask_clear_byte_1 ;presumably keeps only the low word of z - mask not shown here
        MOVQ      MM6, DWORD PTR mask_all_1     ;used below with PXOR to invert the compare mask
        PUNPCKLWD MM4, MM4                      ;0:0:z:z
        PADDSW    MM4, MM1                      ;0:0:z + dz:z
        PUNPCKLWD MM5, MM5                      ;x:x:dz:dz (upper words are shifted out below)
        MOVQ      MM3, MM4                      ;0:0:z + dz:z
        PSLLW     MM5, 1                        ;x:x:2dz:2dz
        PSLLQ     MM3, 32                       ;z + dz:z:0:0
        MOV       EAX, z_buffer
        PSLLQ     MM5, 32                       ;2dz:2dz:0:0
        MOV       EDI, color_line
        PADDSW    MM3, MM5                      ;z + 3dz:z + 2dz:0:0
        PUNPCKHDQ MM5, MM5                      ;2dz:2dz:2dz:2dz
        POR       MM4, MM3                      ;z + 3dz:z + 2dz:z + dz:z
        PSLLW     MM5, 1                        ;4dz:4dz:4dz:4dz
        MOV       ECX, color_buffer
        MOV       ESI, num_pixels
        SHR       ESI, 2                        ;ESI = number of 4-pixel groups
        JZ        zLoopDone                     ;fewer than 4 pixels: skip the loop, which
                                                ;tests its counter only at the bottom and
                                                ;would otherwise wrap around from 0
zLoop:
        MOVQ      MM0, [eax]                    ;mm0 = Za x4 (depths currently in z_buffer)
        MOVQ      MM1, MM4                      ;mm1 = Zb x4 (interpolated depths)
        MOVQ      MM2, MM0                      ;mm2 = Za x4 (will become the mask)
        PADDSW    MM4, MM5                      ;advance the interpolated depths by 4dz
        PCMPGTW   MM2, MM1                      ;mm2 = ffff where Za > Zb, else 0000 (per word)
        ADD       EAX, 8
        MOVQ      MM3, MM2                      ;(after pxor) mm3 = ~mm2
        PAND      MM1, MM2                      ;mm1 = the Zb's which are less than the Za's
        PXOR      MM3, MM6                      ;DWORD PTR mask_all_1
        ADD       ECX, 8
        PAND      MM0, MM3                      ;mm0 = the Za's which are less than or equal to the Zb's
        ADD       EDI, 8
        POR       MM0, MM1                      ;mm0 = the wanted Z's
        MOVQ      [eax-8], MM0                  ;(store Z's back to z_buffer)
        MOVQ      MM0, [ecx-8]                  ;mm0 = Ca x4 (from color_buffer)
        MOVQ      MM1, [edi-8]                  ;mm1 = Cb x4 (from color_line)
        PAND      MM1, MM2                      ;mm1 = the Cb's where Zb won (Zb < Za)
        PAND      MM0, MM3                      ;mm0 = the Ca's where Za won (Za <= Zb)
        POR       MM0, MM1                      ;the wanted C's
        MOVQ      [edi-8], MM0                  ;(store merged colors through color_line)
        DEC       ESI
        JNZ       zLoop
zLoopDone:
        EMMS
        RET
MMX_INCZbuffer ENDP
Appendix F - Optimized Z-Buffer Code Listing
; Z-buffer test and merge for one group of 4 pixels.
;
; Inputs, as used by the code below:
;   MM1      = colors of the 4 new pixels (consumed here, not loaded here)
;   EDI      = pointer to the 4 destination pixels (read, then overwritten)
;   low_z    = two dword Z values for the lower two pixels
;   high_z   = two dword Z values for the upper two pixels
;   z_inc    = value added to both low_z and high_z for the next group
;   z_buffer = pointer variable to the 4 word Z values being tested
; Effects:
;   [EDI] and [z_buffer] receive the merged colors / Z values,
;   low_z and high_z are advanced by z_inc, z_buffer is advanced by 8 bytes.
; Registers: MM1, MM2, MM3, MM4, MM6, MM7 are modified. ESI is used as a
; scratch pointer but is saved and restored. EDI is only used as an address.

        PUSH      ESI                           ;Save ESI
        MOV       ESI, z_buffer                 ;ESI = pointer to the four Z values in the Z-Buffer

; -- build the four new 16-bit Z values and the next low_z --
        MOVQ      MM4, low_z                    ;MM4 = Z values of the two lower pixels
        MOVQ      MM2, high_z                   ;MM2 = Z values of the two upper pixels
        MOVQ      MM6, MM4                      ;Keep a full-precision copy of low_z
        MOVQ      MM7, z_inc                    ;MM7 = Z increment
        PSRAD     MM4, 16                       ;Keep the upper 16 bits (drop the fractional part)
        PSRAD     MM2, 16                       ;Keep the upper 16 bits (drop the fractional part)
        PACKSSDW  MM4, MM2                      ;MM4 = all four new Z values as words
        PADDD     MM6, MM7                      ;MM6 = low_z + z_inc

; -- compare against the stored Z values and compute the next high_z --
        MOVQ      MM2, [ESI]                    ;MM2 = old Z values currently in the Z-Buffer
        PCMPGTW   MM2, MM4                      ;MM2 = mask: ffff where old Z > new Z (signed words)
        MOVQ      MM7, high_z                   ;Reload full-precision high_z
        PADDD     MM7, z_inc                    ;MM7 = high_z + z_inc

; -- merge colors: new color where the mask is set, existing color elsewhere --
        MOVQ      MM3, MM2                      ;Copy of the mask
        PANDN     MM3, [EDI]                    ;MM3 = existing colors where the new pixel lost
        PAND      MM1, MM2                      ;MM1 = new colors where the new pixel won
        POR       MM1, MM3                      ;MM1 = merged colors for the 4 pixels
        MOVQ      high_z, MM7                   ;Update the high_z variable
        MOVQ      low_z, MM6                    ;Update the low_z variable

; -- merge Z values the same way --
        MOVQ      MM3, MM2                      ;Copy of the mask
        PANDN     MM3, [ESI]                    ;MM3 = old Z values where the new pixel lost
        PAND      MM2, MM4                      ;MM2 = new Z values where the new pixel won
        POR       MM2, MM3                      ;MM2 = merged Z values
        MOVQ      [EDI], MM1                    ;Write out the 4 pixels
        MOVQ      [ESI], MM2                    ;Update the Z-Buffer with the 4 merged values

        ADD       z_buffer, 8                   ;Advance the z_buffer pointer by 8 bytes (4 pixels)
        POP       ESI                           ;Restore ESI
Appendix G - Wood (Linear) Code Listing
;***********************************************************************
;;; This is the wood implementation by linear curves in the u_v plane.
;;;; In each iteration 4 pixels are calculated; 'num_pixels' is expected to
;;;; be a multiple of 4. The loop count is num_pixels >> 2, so a remainder
;;;; of 1..3 pixels is not processed, and num_pixels < 4 processes nothing.
;***********************************************************************
;;;; SIMD_Wood_Linear(u_init: DWORD, v_init: DWORD, du: DWORD, dv : DWORD, num_pixels:DWORD)
;;;;
;;;; In:    stack arguments, addressed relative to ESP after the 16-byte
;;;;        register save area has been reserved. The wood_* offset symbols
;;;;        are not defined in this listing and are expected to be defined
;;;;        elsewhere in the file.
;;;; Out:   the words at _turbulenceBuf are replaced, 4 at a time, by the
;;;;        values read from _woodTable.
;;;; Saves: EAX, ECX, EDX, EDI (restored before return).
;;;; Uses:  MM0-MM7 and the memory quadword _4dv; EMMS is executed on exit.
_SIMD_Wood_Linear PROC NEAR
        SUB       ESP, 16
        MOV       [ESP + 12], EAX
        MOV       [ESP + 8], ECX
        MOV       [ESP + 4], EDX
        MOV       [ESP ], EDI
        MOV       ECX, wood_num_pixels[esp]
        MOVD      MM4, wood_du[esp]             ; 0:du
        SHR       ECX, 2                        ; ECX = # of times to draw 4 pixels at once
        JZ        wood_done                     ; fewer than 4 pixels: skip the loop, which
                                                ; tests its counter only at the bottom and
                                                ; would otherwise wrap around from 0
        LEA       EDI, _turbulenceBuf
        MOVD      MM0, wood_u_init[esp]         ; 0:u
        PSLLQ     MM4, 32                       ; du:0
        PUNPCKLDQ MM0, MM0                      ; u:u
        MOVD      MM5, wood_dv[esp]             ; 0:dv
        PADDD     MM0, MM4                      ; u + du:u
        MOVD      MM1, wood_v_init[esp]         ; 0:v
        PUNPCKHDQ MM4, MM4                      ; du:du
        PUNPCKLDQ MM1, MM1                      ; v:v
        PSLLQ     MM5, 32                       ; dv:0
        PADDD     MM1, MM5                      ; v + dv:v
        PUNPCKHDQ MM5, MM5                      ; dv:dv
        MOVQ      MM2, MM0                      ; u + du:u
        MOVQ      MM3, MM1                      ; v + dv:v
        PADDD     MM4, MM4                      ; 2du:2du
        PADDD     MM5, MM5                      ; 2dv:2dv
        PADDD     MM2, MM4                      ; u + 3du:u + 2du
        PADDD     MM3, MM5                      ; v + 3dv:v + 2dv
        PADDD     MM4, MM4                      ; 4du:4du
        PADDD     MM5, MM5                      ; 4dv:4dv
        MOVQ      dword ptr _4dv, MM5           ; 4dv is kept in memory; MM5 is reused in the loop
;; during the loop the following hold
;;   mm0  = u1 : u0
;;   mm2  = u3 : u2
;;   mm1  = v1 : v0
;;   mm3  = v3 : v2
;;   mm4  = 4du : 4du
;;   _4dv = 4dv : 4dv
wood_loop:
        MOVQ      MM5, MM0                      ; u1 : u0
        MOVQ      MM6, MM2                      ; u3 : u2
;; As in the marble code, the goal is (value >> 14) packed from 4 dwords
;; into 4 words of one MMX(TM) register. PACKSSDW saturates, so each dword
;; is first reduced to a sign-extended 16-bit value: shift left by 2
;; (= 16 - 14), then shift right arithmetic by 16.
        PADDD     MM0, MM4                      ; u1 + 4du : u0 + 4du (for next iteration)
        PSLLD     MM5, 2                        ; shift left by 2 (16 - 14)
        PADDD     MM2, MM4                      ; u3 + 4du : u2 + 4du (for next iteration)
        PSLLD     MM6, 2                        ; shift left by 2 (16 - 14)
        MOVQ      MM7, MM1                      ; v1 : v0
        PSRAD     MM5, 16                       ; extend sign bit for PACKSSDW
        PADDD     MM1, dword ptr _4dv           ; v1 + 4dv : v0 + 4dv (for next iteration)
        PSRAD     MM6, 16                       ; extend sign bit for PACKSSDW
;;;;;;;; finally pack them
;;;;;;;; mm5 = (u3:u2:u1:u0) >> 14, packed to words
        PACKSSDW  MM5, MM6
        MOVQ      MM6, MM3                      ; v3 : v2
        PSLLD     MM7, 2                        ; shift left by 2 (16 - 14)
        PADDD     MM3, dword ptr _4dv           ; v3 + 4dv : v2 + 4dv (for next iteration)
        PSLLD     MM6, 2                        ; shift left by 2 (16 - 14)
        PSRAD     MM6, 16                       ; extend sign bit for PACKSSDW
        PSRAD     MM7, 16                       ; extend sign bit for PACKSSDW
;;;;;;;; mm7 = (v3:v2:v1:v0) >> 14, packed to words
        PACKSSDW  MM7, MM6
;; Absolute difference of the packed words, treating them as unsigned:
;; each saturating subtraction yields 0 in the lanes where it would go
;; negative, so OR-ing both directions gives |u - v| per word.
        MOVQ      MM6, MM5                      ; copy of the packed u's
        PSUBUSW   MM5, MM7                      ; max(u - v, 0)
        PSUBUSW   MM7, MM6                      ; max(v - u, 0)
        MOVQ      MM6, [EDI]                    ; turbulence
        POR       MM5, MM7                      ; mm5 = abs(v3 - u3 : v2 - u2 : v1 - u1 : v0 - u0)
;;;;;; wood_indx = (10 * abs(u - v) + 15 * turbulence(u,v)) >> 2
        PMULLW    MM6, dword ptr const_quad_15  ; turb = 15 * turb
        PMULLW    MM5, dword ptr const_quad_10  ; | u - v | * 10
        MOVQ      MM7, dword ptr const_FFFF_Minus_High_Wood
        PADDW     MM5, MM6                      ; 10 * | u - v | + 15 * turb(u,v)
        ADD       EDI, 8
        PSRLW     MM5, 2                        ; wood_indx >>= 2
;;; Now clip the values against the range [0 : 176Fh],
;;; which is the size of the wood table (6000 entries).
;;; PADDUSW saturates each value above 176Fh to FFFF,
;;; PSUBUSW then undoes the offset, leaving min(value, 176Fh).
        PADDUSW   MM5, MM7                      ; mm7 = const_FFFF_Minus_High_Wood
        PSUBUSW   MM5, MM7                      ; mm7 = const_FFFF_Minus_High_Wood
;;;; On P55C each unaligned load of 4 bytes (MOVD) causes a penalty,
;;;; so odd indexes are not read (the table's element size is a word).
;;;; Each MOVD / PUNPCKLWD memory operand reads 4 bytes starting at the
;;;; indexed entry; only the low word of each read ends up in the result.
        PAND      MM5, dword ptr mask_odd_indexes
        MOVD      EAX, MM5                      ; eax = indx1:indx0
        MOV       EDX, EAX                      ; edx = indx1:indx0
        AND       EAX, 0ffffh                   ; eax = indx0
        SHR       EDX, 16                       ; edx = indx1
        PSRLQ     MM5, 32                       ; mm5 = 0:0:indx3:indx2
        MOVD      MM6, [ _woodTable + eax*2]    ; read wood colors from table
        PUNPCKLWD MM6, [ _woodTable + edx*2]    ; x:x:wood1:wood0 (upper two words unused)
        MOVD      EAX, MM5                      ; eax = indx3:indx2
        MOV       EDX, EAX                      ; edx = indx3:indx2
        AND       EAX, 0ffffh                   ; eax = indx2
        SHR       EDX, 16                       ; edx = indx3
        MOVD      MM5, [ _woodTable + eax*2]    ; read wood colors from table
        PUNPCKLWD MM5, [ _woodTable + edx*2]    ; x:x:wood3:wood2 (upper two words unused)
        PUNPCKLDQ MM6, MM5                      ; mm6 = wood3:wood2:wood1:wood0
        MOVQ      [EDI-8], MM6                  ; store the colors over the turbulence values
        DEC       ECX
        JNZ       wood_loop
wood_done:
        EMMS                                    ; clear the MMX state before returning
        MOV       EAX, [ESP + 12]
        MOV       ECX, [ESP + 8]
        MOV       EDX, [ESP + 4]
        MOV       EDI, [ESP ]
        ADD       ESP, 16
        RET
; end of function
_SIMD_Wood_Linear ENDP