Appendix A - fBm Code Listing

TITLE Modified form of Perlin's Noise Basis function using MMX(TM) technology

;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list

.586
.model FLAT


;***********************************************************************
;     Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA

;Data used by SIMD_Octave.  Everything that is loaded or stored with a
;64-bit MMX move is kept on an 8-byte boundary (see the ALIGN 8 directives).

;KEY for comments
;P0, P1, P# = Pixel number 0, Pixel number 1, Pixel number # respectively.
;Pix        = Pixel
;DU         = Derivative of the variable U.
;DDU        = Derivative of the variable DU.
;Texel      = A point in the texture to be mapped onto the screen.  Given by U, V.

;Note: Even though the assembly writes four pixel values through each pass of the
;inner loop, only two of the pixels are directly calculated.  The other two pixels
;are averaged from neighboring pixels.  According to the current scheme,
;      |--- 16 bit ---|
;      +-----------------------------------------------------------+
;      | Pixel #0     | Pixel #1     | Pixel #2    | Pixel #3      |
;      +-----------------------------------------------------------+
;Pixels #1 and #3 are directly calculated.  Pixel #2 is averaged from Pixel #1 and
;pixel #3.  Pixel #0 is averaged from Pixel #1 and the previous pixel before #0.
;
;Also note that the pixels are labeled 0, 1, 2, 3 from the most significant word
;down, instead of 3, 2, 1, 0 as in the conventional Intel Architecture format.
;This was an oversight that was not noticed until it was too late to change.

;Variables u, v, du, dv, ddu, ddv each contain parameters for two
;texels.  Since u, v, ..., ddv are 64 bit, each texel parameter is
;32 bit.  (32 bit per texel * two texels = 64 bits).  This enables us
;to work with two pixels at one time using MMX technology.
;SIMD_Octave stores 4*du_init / 4*dv_init (one copy per dword) in du / dv.
ALIGN 8
u                 QWORD ?
du                QWORD ?
ddu               QWORD ?

v                 QWORD ?
dv                QWORD ?
ddv               QWORD ?

;u and v values of the first pixel pair, saved by SIMD_Octave so that each
;octave pass can restart from the same starting texel.
firstU            QWORD ?
firstV            QWORD ?


;Per-octave shift counts.  Each is declared as two DWORDs (8 bytes) because
;SIMD_Octave reads it with a 64-bit MMX operand; only the low DWORD is ever
;written, the high DWORD stays zero.
;  octShift  = right-shift applied to u and v; SIMD_Octave sets it to 14 and
;              decrements it after every octave.
;  turbShift = right-shift applied to the computed values before they are
;              added into the output buffer; it is the octave number 0,1,2,...
octShift          DWORD 0, 0
turbShift         DWORD 0, 0
;Since the program only calculates odd pixel values, the even pixel values
;must be averaged.  Therefore, for each pass through the inner loop, four
;pixels will be drawn.  In order to draw the first pixel, the pixel before
;it must be known for the averaging.  This pixel color is contained here.
prev_color        DWORD 255

;Various masks.  Set up to filter out unwanted bits in MMX registers.
;"quad" constants repeat a 16-bit value in all four words; "double"
;constants repeat a 32-bit value in both dwords.
ALIGN 8
mask_32_to_15     QWORD 00007FFF00007FFFh ;keeps the low 15 bits of each dword
mask_quad_1       QWORD 0001000100010001h ;4 x 1
mask_quad_255     QWORD 00FF00FF00FF00FFh ;4 x 255
mask_quad_256     QWORD 0100010001000100h ;4 x 256
mask_quad_510     QWORD 01FE01FE01FE01FEh ;4 x 510
mask_quad_511     QWORD 01FF01FF01FF01FFh ;4 x 511
mask_quad_1536    QWORD 0600060006000600h ;4 x 1536
mask_double_255   QWORD 000000FF000000FFh ;2 x 255
mask_double_FFFF  QWORD 0000FFFF0000FFFFh ;2 x 65535 (low word of each dword)
mask_double_65536 QWORD 0001000000010000h ;2 x 65536
mask_four_255     QWORD 00FF00FF00FF00FFh ;same value as mask_quad_255

DSEG ENDS


;***********************************************************************
;     Constant Segment Declarations
;***********************************************************************
.const

;***********************************************************************
;     Code Segment Declarations
;***********************************************************************
.code

;-----------------------------------------------------------------------
; void SIMD_Octave(unsigned long u_init, unsigned long v_init,
;                  long du_init, long dv_init, unsigned long Num_Pix,
;                  unsigned _int16* turb_buffer, unsigned long num_octaves);
;
; Purpose: for each of num_octaves octaves, evaluates the modified Perlin
;          noise basis along one span and ADDS (PADDW) the result, shifted
;          right by the octave number, into turb_buffer.  Afterwards every
;          QWORD of turb_buffer has its four words reversed (flipLoop).
; ABI:     32-bit, MASM "PROC C" (arguments on the stack, MASM-generated
;          prologue/epilogue).  EBX, ECX, EDI, ESI are preserved via USES.
; In:      u_init, v_init   start texel (10.22 fixed point per the notes below)
;          du_init, dv_init per-pixel texel step
;          num_pixels       number of inner-loop passes; every pass produces
;                           FOUR 16-bit values (one QWORD), so turb_buffer
;                           must hold num_pixels * 8 bytes
;          turb_buffer      output buffer; values are accumulated into it
;                           (NOTE(review): presumably zeroed by the caller
;                           before the call - confirm)
;          num_octaves      number of octave passes
; Out:     turb_buffer updated; prev_color updated with the last value written.
; Clobb:   EAX is not touched; MM0-MM7 are used and released with EMMS.
; Robustness: if num_pixels or num_octaves is zero the routine returns at
;          once.  Previously the DEC/JNZ and INC/CMP/JNZ loop counters wrapped
;          around and the loops ran 2^32 times, writing far past the buffer.
;
; Register layouts in the comments list the MOST significant element first.
; "p1" and "p3" are the two pixels that are computed directly in each pass.
;-----------------------------------------------------------------------
SIMD_Octave PROC NEAR C USES ebx ecx edi esi,
           u_init:DWORD, v_init:DWORD, du_init:DWORD, dv_init:DWORD,
           num_pixels:DWORD, turb_buffer:DWORD,   num_octaves:DWORD

;Nothing to do for an empty span or zero octaves.  Checked before any MMX
;instruction executes, so the exit path does not need EMMS.
CMP         num_pixels, 0
JE          octave_done
CMP         num_octaves, 0
JE          octave_done

;Initialization
MOVD        MM0, u_init

MOVD        MM1, v_init
PUNPCKLDQ   MM0, MM0      ;U p1 = u, p3 = u

MOVD        MM2, du_init
PUNPCKLDQ   MM1, MM1      ;V p1 = v, p3 = v

MOVD        MM3, dv_init
PADDD       MM0, MM2      ;U p1 = u, p3 = u + du

PADDD       MM1, MM3      ;V p1 = v, p3 = v + dv
PADDD       MM0, MM2      ;U p1 = u, p3 = u + 2du

PADDD       MM1, MM3      ;V p1 = v, p3 = v + 2dv
PUNPCKLDQ   MM2, MM2      ;MM2 = du : du

PUNPCKLDQ   MM3, MM3      ;MM3 = dv : dv
PADDD       MM0, MM2      ;U p1 = u + du, p3 = u + 3du

MOV         [turbShift],0 ;turbShift is the octave number 0,1,2,....
XOR         ESI,ESI       ;ESI = octave counter

MOVQ        DWORD PTR firstU , MM0
PADDD       MM1, MM3      ;V p1 = v + dv, p3 = v + 3dv

MOV         [octShift],14 ;octShift is (14 - octave number)
PSLLD       MM2, 2        ;DU p1 = 4du, p3 = 4du

MOVQ        DWORD PTR firstV, MM1
PSLLD       MM3, 2        ;DV p1 = 4dv, p3 = 4dv

MOVQ        DWORD PTR du, MM2   ;step for the next group of 4 pixels

MOVQ        DWORD PTR dv, MM3


;=======================================================================
; Outer loop: one pass per octave.
;=======================================================================
start_octave :
MOV         EBX, prev_color  ;colour used to average pixel #0 of the first group
MOV         EDI, turb_buffer ;EDI will always be pointer to the output buffer

MOV         ECX, num_pixels  ;ECX = remaining 4-pixel groups
SUB         EDI, 8           ;pre-decrement: the inner loop adds 8 before storing

;Get the UV parameters in MMX(TM) technology form.
;Note: UV texel values are stored in 10.22 fixed integer format.
;Every octave restarts from the saved first texel pair:
;  u = | U for pix #1 = u + du | U for pix #3 = u + 3du |
;  v = | V for pix #1 = v + dv | V for pix #3 = v + 3dv |
;This is because the first four pixels drawn will have the texel values
;u + 0du, u + 1du, u + 2du, u + 3du, and only pixels #1 and #3 are computed;
;pixels #0 and #2 are averaged.

MOVQ        MM0, DWORD PTR firstU
MOVQ        MM1, DWORD PTR firstV
MOVQ        DWORD PTR u, MM0
MOVQ        DWORD PTR v, MM1


;=======================================================================
; Inner loop: one pass per group of four pixels.
;=======================================================================
start_scan_line:
;First, the program shifts the u and v texel coordinates right by octShift
;(14 for octave 0, i.e. 10.22 -> 8.8 as in the "C" code below), keeps the
;low 15 bits and packs the four 32-bit values into four 16-bit words.
;Output:
;MM0 = | U texel - p1 | U texel - p3 | V texel - p1 | V texel - p3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;u_16bit = u_init >> 14;
;v_16bit = v_init >> 14;

MOVQ        MM1, DWORD PTR u

MOVQ        MM3, DWORD PTR octShift      ;64-bit shift count (high dword is 0)

MOVQ        MM0, DWORD PTR v
PSRLD       MM1, MM3                     ;u >> octShift

MOVQ        MM2, DWORD PTR mask_32_to_15 ;Uses 15 instead of 16 because of signed saturation.
PSRLD       MM0, MM3                     ;v >> octShift

PAND        MM1, MM2                     ;keep 15 bits so PACKSSDW cannot saturate
PAND        MM0, MM2

MOVQ        MM3, DWORD PTR mask_quad_1
PACKSSDW    MM0, MM1                     ;Pack the result into one register

;Calculation of the bx0, by0, bx1, by1 values for both pixels.  Output
;(each value occupies the low byte of its word):
;MM2 = | BX0 p1 | BX0 p3 | BY0 p1 | BY0 p3 |
;MM3 = | BX1 p1 | BX1 p3 | BY1 p1 | BY1 p3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;bx0 = u_16bit >> 8;
;by0 = v_16bit >> 8;
;bx1 = bx0 + 1;
;by1 = by0 + 1;

MOVQ        MM1, DWORD PTR u             ;Used for incrementing u for next 4 pix.
MOVQ        MM2, MM0

PSRLW       MM2, 8                       ;MM2 = bx0 / by0

PADDD       MM1, DWORD PTR du            ;Used for incrementing u for next 4 pix.
PADDUSB     MM3, MM2                     ;MM3 = 1 + bx0/by0 (unsigned byte saturation)


;Calculation of the rx0, ry0 values for both pixels.  Final output:
;MM0 = | RX0 p1 | RX0 p3 | RY0 p1 | RY0 p3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;rx0 = u_16bit & 255;
;ry0 = v_16bit & 255;
;(the "& 255" is done as a shift left by 8 followed by a shift right by 8)

PSLLW       MM0, 8
MOVQ        MM4, MM3

MOVQ        MM6, DWORD PTR mask_quad_1   ;the constant 1, used below for "b + 1"
PUNPCKHWD   MM4, MM2                     ;MM4 = BX0(1):BX1(1):BX0(3):BX1(3)

PUNPCKLWD   MM3, MM2                     ;MM3 = BY0(1):BY1(1):BY0(3):BY1(3)
PMULLW      MM4, MM4                     ;MM4 = BX0^2(1):BX1^2(1):BX0^2(3):BX1^2(3)

PSRLW       MM0, 8                       ;MM0 = rx0 and ry0 param for pix 1, 3

;This section includes calculation of b00, b01, b10, b11.  random1() is
;implemented as a 16-bit square (PMULLW reg, reg).  Output, available after
;the two PMULLW instructions at the start of the next section:
;MM4 = | b00 for p3 | b01 for p3 | b10 for p3 | b11 for p3 |
;MM5 = | b00 for p1 | b01 for p1 | b10 for p1 | b11 for p1 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;b00 = random1((random1(bx0) + by0));
;b01 = random1((random1(bx0) + by1));
;b10 = random1((random1(bx1) + by0));
;b11 = random1((random1(bx1) + by1));
MOVQ        MM2, MM3

PUNPCKLDQ   MM3, MM3                    ;MM3  = BY0(3):BY1(3):BY0(3):BY1(3)

PUNPCKHDQ   MM2, MM2                    ;MM2  = BY0(1):BY1(1):BY0(1):BY1(1)
MOVQ        MM5, MM4

MOVQ        DWORD PTR u, MM1            ;Used for incrementing u for next 4 pix.
PUNPCKLWD   MM4, MM4                    ;MM4  = BX0^2(3):BX0^2(3):BX1^2(3):BX1^2(3)

PUNPCKHWD   MM5, MM5                    ;MM5  = BX0^2(1):BX0^2(1):BX1^2(1):BX1^2(1)
PADDW       MM4, MM3                    ;random1(bx) + by, pixel #3

PADDW       MM5, MM2                    ;random1(bx) + by, pixel #1

;This section calculates the gradient components g_b##_0 and g_b##_1 for
;pix 1 and 3.  random2() is again a 16-bit square; the code then shifts the
;result right by 2, masks it with 511 and subtracts 256.
;Output:
;MM2 = | g_b00_1 p3 | g_b01_1 p3 | g_b10_1 p3 | g_b11_1 p3 |
;MM3 = | g_b00_1 p1 | g_b01_1 p1 | g_b10_1 p1 | g_b11_1 p1 |
;MM4 = | g_b00_0 p3 | g_b01_0 p3 | g_b10_0 p3 | g_b11_0 p3 |
;MM5 = | g_b00_0 p1 | g_b01_0 p1 | g_b10_0 p1 | g_b11_0 p1 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;g_b00_0 = (random2(b00) & 511) - 256;
;g_b01_0 = (random2(b01) & 511) - 256;
;g_b10_0 = (random2(b10) & 511) - 256;
;g_b11_0 = (random2(b11) & 511) - 256;
;g_b00_1 = (random2(b00 + 1) & 511) - 256;
;g_b01_1 = (random2(b01 + 1) & 511) - 256;
;g_b10_1 = (random2(b10 + 1) & 511) - 256;
;g_b11_1 = (random2(b11 + 1) & 511) - 256;
PMULLW      MM4, MM4       ;random1 -> b## for pixel #3

PMULLW      MM5, MM5       ;random1 -> b## for pixel #1
MOVQ        MM2, MM6

MOVQ        MM3, MM6
PADDUSW     MM2, MM4       ;b## + 1, pixel #3

PMULLW      MM2, MM2       ;random2
PADDUSW     MM3, MM5       ;b## + 1, pixel #1

MOVQ        MM1, DWORD PTR mask_quad_256
PMULLW      MM3, MM3       ;random2


MOVQ        MM7, DWORD PTR mask_quad_511
PMULLW      MM4, MM4       ;random2

PMULLW      MM5, MM5       ;random2
PSRLW       MM2, 2

PSRLW       MM3, 2
PAND        MM2, MM7

PSRLW       MM4, 2
PAND        MM3, MM7

PSRLW       MM5, 2
PAND        MM4, MM7

PAND        MM5, MM7
PSUBW       MM2, MM1       ;MM2 = g_b##_1 for pixel #3

PSUBW       MM3, MM1       ;MM3 = g_b##_1 for pixel #1
PSUBW       MM4, MM1       ;MM4 = g_b##_0 for pixel #3

PSUBW       MM5, MM1       ;MM5 = g_b##_0 for pixel #1

;Take the above gradient data and interleave it so that each register holds
;the (_0, _1) pairs that one PMADDWD needs.
;Output:
;MM2 = | g_b00_0 p3 | g_b00_1 p3 | g_b01_0 p3 | g_b01_1 p3 |
;MM3 = | g_b00_0 p1 | g_b00_1 p1 | g_b01_0 p1 | g_b01_1 p1 |
;MM6 = | g_b10_0 p3 | g_b10_1 p3 | g_b11_0 p3 | g_b11_1 p3 |
;MM7 = | g_b10_0 p1 | g_b10_1 p1 | g_b11_0 p1 | g_b11_1 p1 |

MOVQ        MM6, MM2

MOVQ        MM7, MM3
PUNPCKHWD   MM2, MM4       ;MM2 = g_b00_# and g_b01_# for pix #3

PUNPCKLWD   MM6, MM4       ;MM6 = g_b10_# and g_b11_# for pix #3

PUNPCKHWD   MM3, MM5       ;MM3 = g_b00_# and g_b01_# for pix #1
MOVQ        MM4, MM0       ;Preparing for rx1 and ry1 calculation

PUNPCKLWD   MM7, MM5       ;MM7 = g_b10_# and g_b11_# for pix #1

;Calculation of the rx1, ry1 values for both pixels.  Final output:
;MM4 = | RX1 p1 | RX1 p3 | RY1 p1 | RY1 p3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;rx1 = rx0 - 256;
;ry1 = ry0 - 256;

PSUBW       MM4, MM1       ;MM4 = rx1 and ry1 parameters (MM1 = 4 x 256)


;Setup for the calculation of u1 and u2 for pix #1.  Final output:
;MM1 = | RX0 p1 | RY0 p1 | RX0 p1 | RY1 p1 |

MOVQ        MM5, MM0
MOVQ        MM1, MM4

PSRLD       MM5, 16        ;MM5 = RX0 p1 : RY0 p1 (as dwords)

PSRAD       MM1, 16        ;MM1 = RX1 p1 : RY1 p1 (sign-extended dwords)

PSLLQ       MM1, 32        ;MM1 = RY1 p1 : 0

PUNPCKHDQ   MM1, MM5       ;MM1 = RX0 p1 : RY1 p1

PACKSSDW    MM1, MM1       ;MM1 = RX0:RY1:RX0:RY1 (words)

PACKSSDW    MM5, MM5       ;MM5 = RX0:RY0:RX0:RY0 (words)

PUNPCKLDQ   MM1, MM5       ;MM1 = RX0:RY0:RX0:RY1

;Calculation for U1 and U2 for pixel #1 -> After multiplication... Output:
;MM3 = | U1 for pixel #1 | U2 for pixel #1 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;u1 = rx0 * g_b00_0 + ry0 * g_b00_1;
;u2 = rx0 * g_b01_0 + ry1 * g_b01_1;
PMADDWD     MM3, MM1       ;MM3 = u1 and u2 for pixel #1

;Setup for the calculation of v1 and v2 for pix #1.  Final output:
;MM5 = | RX1 p1 | RY0 p1 | RX1 p1 | RY1 p1 |

MOVQ        MM5, MM4

PSRAD       MM5, 16        ;MM5 = RX1 p1 : RY1 p1 (sign-extended dwords)
MOVQ        MM1, MM0


PSRLD       MM1, 16        ;MM1 = RX0 p1 : RY0 p1

PSLLQ       MM1, 32        ;MM1 = RY0 p1 : 0

PUNPCKHDQ   MM1, MM5       ;MM1 = RX1 p1 : RY0 p1

PACKSSDW    MM1, MM1       ;MM1 = RX1:RY0:RX1:RY0 (words)

PACKSSDW    MM5, MM5       ;MM5 = RX1:RY1:RX1:RY1 (words)

PUNPCKLDQ   MM5, MM1       ;MM5 = RX1:RY0:RX1:RY1

;Calculation for V1 and V2 for pixel #1 -> After multiplication... Output:
;MM7 = | V1 for pixel #1 | V2 for pixel #1 |
;As coded (MM7 holds the g_b10 / g_b11 pairs):
;v1 = rx1 * g_b10_0 + ry0 * g_b10_1;
;v2 = rx1 * g_b11_0 + ry1 * g_b11_1;

PMADDWD     MM7, MM5       ;MM7 = v1 and v2 for pixel #1

;Setup for the calculation of u1 and u2 for pix #3.  Final output:
;MM1 = | RX0 p3 | RY0 p3 | RX0 p3 | RY1 p3 |

MOVQ        MM5, MM0

PSLLD       MM5, 16

PSRLD       MM5, 16        ;MM5 = RX0 p3 : RY0 p3 (as dwords)
MOVQ        MM1, MM4

PSLLD       MM1, 16

PSRAD       MM1, 16        ;MM1 = RX1 p3 : RY1 p3 (sign-extended dwords)

PUNPCKLDQ   MM1, MM1       ;MM1 = RY1 p3 : RY1 p3

PUNPCKHDQ   MM1, MM5       ;MM1 = RX0 p3 : RY1 p3

PACKSSDW    MM1, MM1       ;MM1 = RX0:RY1:RX0:RY1 (words)

PACKSSDW    MM5, MM5       ;MM5 = RX0:RY0:RX0:RY0 (words)

PUNPCKLDQ   MM1, MM5       ;MM1 = RX0:RY0:RX0:RY1

;Calculation for U1 and U2 for pixel #3 -> After multiplication... Output:
;MM2 = | U1 for pixel #3 | U2 for pixel #3 |
PMADDWD     MM2, MM1       ;MM2 = u1 and u2 for pixel #3


;Setup for the calculation of v1 and v2 for pix #3.  Final output:
;MM4 = | RX1 p3 | RY0 p3 | RX1 p3 | RY1 p3 |

PSLLD       MM4, 16

PSRAD       MM4, 16        ;MM4 = RX1 p3 : RY1 p3 (sign-extended dwords)
MOVQ        MM5, MM0

PSLLD       MM5, 16

PSRAD       MM5, 16        ;MM5 = RX0 p3 : RY0 p3

PUNPCKLDQ   MM5, MM5       ;MM5 = RY0 p3 : RY0 p3

PUNPCKHDQ   MM5, MM4       ;MM5 = RX1 p3 : RY0 p3

PACKSSDW    MM5, MM5       ;MM5 = RX1:RY0:RX1:RY0 (words)

PACKSSDW    MM4, MM4       ;MM4 = RX1:RY1:RX1:RY1 (words)

PUNPCKLDQ   MM4, MM5       ;MM4 = RX1:RY0:RX1:RY1

;Calculation for V1 and V2 for pixel #3 -> After multiplication... Output:
;MM6 = | V1 for pixel #3 | V2 for pixel #3 |
PMADDWD     MM6, MM4       ;MM6 = v1 and v2 for pixel #3

;Calculation for SX and SY for pixels #1 and #3, Output:
;MM1 = | SX p1 | SX p3 | SY p1 | SY p3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;sx = (((rx0 * rx0) >> 1) * ((1536 - (rx0 << 2))))>> 16;
;sy = (((ry0 * ry0) >> 1) * ((1536 - (ry0 << 2))))>> 16;
MOVQ        MM5, MM0

PMULLW      MM5, MM5       ;r * r
MOVQ        MM4, MM0

MOVQ        MM1, DWORD PTR mask_quad_1536
PSLLW       MM4, 2         ;r << 2

PSUBD       MM6, MM2       ;V1 - U1 and V2 - U2 for P3
PSUBD       MM7, MM3       ;V1 - U1 and V2 - U2 for P1

PSUBW       MM1, MM4       ;1536 - (r << 2)
PSRLW       MM5, 1         ;(r * r) >> 1

PMULHW      MM1, MM5       ;MM1 = sx and sy param for pix 1, 3 (high 16 bits of product)

;Calculation of A and B for pixel #1 and #3. Output:
;MM7 = | A for pixel #1 | B for pixel #1 |
;MM6 = | A for pixel #3 | B for pixel #3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;a = u1 + sx * ((v1 - u1) >> 8);
;b = u2 + sx * ((v2 - u2) >> 8);
;The multiply is done with PMADDWD against (0 : sx) word pairs, so only the
;low 16 bits of each shifted difference take part in the product.
PSRAD       MM7, 8

PSRAD       MM6, 8

MOVQ        MM4, MM1
MOVQ        MM5, MM1

PSRLQ       MM4, 16        ;MM4 = 0 : SX p1 : SX p3 : SY p1

PUNPCKLWD   MM1, MM1       ;MM1 = SY p1 : SY p1 : SY p3 : SY p3

PUNPCKHDQ   MM4, MM4       ;MM4 = 0 : SX p1 : 0 : SX p1

PMADDWD     MM7, MM4       ;sx * ((v - u) >> 8), pixel #1
PSLLD       MM5, 16

MOVQ        MM4, DWORD PTR v  ;Used for incrementing v for next 4 pix
PSRLD       MM5, 16           ;MM5 = 0 : SX p3 : 0 : SY p3

PUNPCKHDQ   MM5, MM5          ;MM5 = 0 : SX p3 : 0 : SX p3
PADDD       MM4, DWORD PTR dv ;Used for incrementing v for next 4 pix

PADDD       MM7, MM3          ;MM7 = a and b parameter for pix #1
PMADDWD     MM6, MM5          ;sx * ((v - u) >> 8), pixel #3

MOVQ        MM3, DWORD PTR mask_double_65536
PSRLD       MM1, 16           ;MM1 = 0 : SY p1 : 0 : SY p3

MOVQ        DWORD PTR v, MM4  ;Used for incrementing v for next 4 pix

;Calculation of the values for pixel #1 and #3. Output:
;MM6 = | value for pixel #1 | value for pixel #3 |
;This code correlates to the following "C" code in the "C_Noise()" function.
;color = (a + 65536 + sy * ((b - a) >> 8)) >> 9;
PADDD       MM6, MM2                ;MM6 = a and b parameter for pix #3

MOVQ        MM2, MM6

PUNPCKLDQ   MM6, MM7                ;MM6 = b p1 : b p3

MOVD        MM0, ebx                ;Move the last color written into MM0
PUNPCKHDQ   MM2, MM7                ;MM2 = a p1 : a p3

PADDD       MM3, MM2                ;a + 65536
PSUBD       MM6, MM2                ;b - a

PSRAD       MM6, 8

PMADDWD     MM6, MM1                ;sy * ((b - a) >> 8)

PADDD       MM6, MM3

PSRLD       MM6, 9                  ;MM6 = color for pix #1 and #3

;Since the color values have been calculated for pixels 1 and 3,
;pixels 0 and 2 still need to be determined.  Pixel 0 is calculated by
;(prev_pixel + pixel #1) / 2 and pixel 2 is calculated by (pixel #1 +
;pixel #3) / 2.  Output:
;MM3 = | p0 value | p1 value | p2 value | p3 value |

PACKSSDW    MM6, MM6                ;MM6 = p1 : p3 : p1 : p3

MOVQ        MM7, MM6
MOVQ        MM3, MM6

PSRLD       MM7, 16                 ;MM7 = 0 : p1 : 0 : p1

PUNPCKLWD   MM7, MM0                ;low words: MM7 = ... : prev : p1

PADDW       MM6, MM7                ;low words: (p1 + prev) : (p3 + p1)

PSRLW       MM6, 1                  ;low words: p0 : p2

PUNPCKLWD   MM3, MM6                ;MM3 = p0 : p1 : p2 : p3
ADD         EDI, 8                  ;advance to this group's QWORD

;Remember the last value for the next group's averaging, scale the four
;values by the octave (right shift by turbShift) and accumulate them into
;the output buffer.
;Decrease the counter and loop back to do four more pixels if necessary.

MOVD        EBX ,MM3                ;EBX = p2 : p3 (low word = pixel #3)

PSRLW       MM3,[turbShift]         ;divide by 2^octave

PADDW       MM3,[EDI]               ;add to what previous octaves stored
MOVQ      [EDI], MM3                ;Write out the 4 values.

DEC         ECX
JNZ         start_scan_line

;End of one octave: next octave number, one bit less of right shift on u/v.
INC         ESI
INC         [turbShift]

DEC         [octShift]

CMP         ESI, num_octaves
JNZ         start_octave

MOV         prev_color, EBX ;low word of EBX is the value of pixel #3. Store it.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; here we rearrange the turb buffer
;; buffer[i] = p0:p1:p2:p3 --> buffer[i] = p3:p2:p1:p0
;; (so that p0 ends up in the lowest-addressed word)

MOV         EDI, turb_buffer
MOV         ECX, num_pixels

flipLoop:
MOVQ        MM5, [EDI]

MOVQ        MM4, MM5
PUNPCKHDQ   MM5, MM5      ;MM5 = p0:p1:p0:p1

MOVQ        MM7, MM5      ;MM7 = p0:p1:p0:p1
PSRLD       MM5, 16       ;MM5 = 0:p0:0:p0

MOVQ        MM6, MM4
PUNPCKLWD   MM5, MM7      ;MM5 = *:*:p1:p0

PSRLQ       MM6, 16       ;MM6 = 0:p0:p1:p2

PUNPCKLWD   MM6, MM4      ;MM6 = *:*:p3:p2

PUNPCKLDQ   MM5, MM6      ;MM5 = p3:p2:p1:p0

MOVQ      [EDI], MM5
ADD         EDI, 8
DEC         ECX
JNZ         flipLoop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

EMMS                      ;Clear out the MMX registers and set appropriate flags.

octave_done:              ;early-exit target (no MMX state to clear on that path)
RET                       ;End of function

SIMD_Octave ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

END






Appendix B - Wood (Sqrt) Code Listing

TITLE wood textures using MMX(TM) technology

;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT

KLAM equ 0
;***********************************************************************
;     Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA

;Data used by _SIMD_Wood_Sqrt.

;Tables and buffers defined outside this listing.  _SIMD_Wood_Sqrt addresses
;_sqrtTable and _woodTable as [table + index*2] and takes the address of
;_turbulenceBuf with LEA, i.e. all three are used as arrays of 16-bit values.
;_marbleTable is declared here but not referenced by _SIMD_Wood_Sqrt.
extrn _marbleTable   : ptr sword
extrn _woodTable     : ptr sword
extrn _sqrtTable     : ptr sword
extrn _turbulenceBuf : ptr sword


;_4du and _4dv each hold one 32-bit step value duplicated in both dwords
;(4*du : 4*du and 4*dv : 4*dv).  They are added to the packed u and v
;registers once per loop pass, which advances two texels at a time per
;register using MMX technology.
;"result" is not referenced by _SIMD_Wood_Sqrt.
ALIGN 8
_4du   QWORD ?
_4dv   QWORD ?
result dd 0

;Various constants and masks for the MMX registers.  Each 16-bit value is
;repeated in all four words unless noted otherwise.
ALIGN 8

const_quad_10               QWORD 000a000a000a000ah   ;4 x 10  (weight of sqrt)
const_quad_15               QWORD 000f000f000f000fh   ;4 x 15  (weight of turbulence)
;Clamp helpers: adding the constant with unsigned saturation and subtracting
;it again clips a word to [0, 0FFFFh - constant].
const_FFFF_Minus_High_sqrt  QWORD 0f800f800f800f800h  ;0FFFFh - 07FFh  -> clip to [0, 07FFh]
const_FFFF_Minus_High_Wood  QWORD 0e890e890e890e890h  ;0FFFFh - 176Fh  -> clip to [0, 176Fh]

mask_odd_indexes            QWORD 0fffefffefffefffeh  ;clears bit 0 of every word (forces even indexes)
mask_high_words             QWORD 00000ffff0000ffffh  ;keeps the LOW word of each dword (clears the high word)
mask_low_words              QWORD 0ffff0000ffff0000h  ;keeps the HIGH word of each dword (clears the low word)

mask_all_1                  QWORD 0ffffffffffffffffh  ;all bits set
mask_clear_word_1           QWORD 0000000000000ffffh  ;keeps only the lowest word of the qword
const_quad_735              QWORD 002df02df02df02dfh  ;4 x 735
mask_quad_green             QWORD 0800080008000800h  ;4 x 0800h
const_quad_1500             QWORD 05dc05dc05dc05dch   ;4 x 1500

DSEG ENDS

;***********************************************************************
;     Constant Segment Declarations
;***********************************************************************
.const

;***********************************************************************
;     Code Segment Declarations
;***********************************************************************
.code

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; SIMD_Wood_Sqrt(u_init : DWORD, v_init : DWORD, du : DWORD, dv : DWORD,
;;;;               num_pixels : DWORD)
;;;;
;;;; Purpose: for every pixel computes
;;;;              wood_indx = (10 * sqrtTable[(u*u + v*v) >> 10]
;;;;                           + 15 * turbulence) >> 2
;;;;          looks the index up in _woodTable and writes the resulting
;;;;          16-bit colour back over the turbulence value in _turbulenceBuf.
;;;;          Four pixels (one QWORD) are processed per loop pass.
;;;; ABI:     32-bit, arguments on the stack, caller cleans up (plain RET).
;;;;          EAX, ECX, EDX and EDI are saved and restored; MM0-MM7 are used
;;;;          and released with EMMS.
;;;; In:      u_init, v_init  start texel; du, dv per-pixel step
;;;;          num_pixels      pixel count; only num_pixels / 4 groups are
;;;;                          drawn, a remainder of 1-3 pixels is ignored
;;;; Robustness: when num_pixels < 4 there is no complete group and the
;;;;          routine returns without touching the buffer.  Previously the
;;;;          DEC/JNZ loop counter started at 0, wrapped around and the loop
;;;;          ran 2^32 times, writing far past the buffer.
;;;;
;;;; Register layouts in the comments list the MOST significant element first.

;Argument offsets relative to ESP *after* the 16-byte register save area has
;been allocated: 16 bytes of saved registers + 4 bytes of return address.
wood_u_init     =  20
wood_v_init     =  24
wood_du         =  28
wood_dv         =  32
wood_num_pixels =  36

_SIMD_Wood_Sqrt PROC NEAR

sub   esp , 16                   ; room for the four saved registers

mov  [esp     ], edi
mov  [esp +  4], edx
mov  [esp +  8], ecx
mov  [esp + 12], eax

MOV        ECX, wood_num_pixels[esp]
SHR        ECX, 2                ; ECX= # of times to draw 4 pixels at once
JZ         wood_done             ; fewer than 4 pixels: nothing to draw

LEA        EDI, _turbulenceBuf   ; EDI = read/write cursor in the buffer

MOVD       MM4, wood_du[esp]     ; 0:du

MOVD       MM0, wood_u_init[esp] ; 0:u
PSLLQ      MM4, 32               ; du:0

PUNPCKLDQ  MM0, MM0              ; u:u

MOVD       MM5, wood_dv[esp]     ; 0:dv
PADDD      MM0, MM4              ; u + du:u

MOVD       MM1, wood_v_init[esp] ; 0:v
PUNPCKHDQ  MM4, MM4              ; du:du

PUNPCKLDQ  MM1, MM1              ; v:v

PSLLQ      MM5, 32               ; dv:0

PADDD      MM1, MM5              ; v + dv:v
PUNPCKHDQ  MM5, MM5              ; dv:dv

MOVQ       MM2, MM0              ; u + du:u
MOVQ       MM3, MM1              ; v + dv:v

PADDD      MM4, MM4              ; 2du:2du
PADDD      MM5, MM5              ; 2dv:2dv

PADDD      MM2, MM4              ; u + 3du:u+2du
PADDD      MM3, MM5              ; v + 3dv:v+2dv

PADDD      MM4, MM4              ; 4du:4du
PADDD      MM5, MM5              ; 4dv:4dv

MOVQ       dword ptr _4du, mm4

MOVQ       dword ptr _4dv, mm5


;; during the loop the following hold
    ;; mm0  = u1   : u0
    ;; mm2  = u3   : u2
    ;; mm1  = v1   : v0
    ;; mm3  = v3   : v2
    ;; _4du  = 4du : 4du
    ;; _4dv  = 4dv : 4dv
    ;; ecx  = number of 4-pixel groups still to do (> 0)
    ;; edi  = address of the current group in _turbulenceBuf

wood_loop:

;;; Build (v >> 14) : (u >> 14) word pairs.  For v this is done as a shift
;;; left by 2 followed by keeping the high word of each dword.
;;; NOTE(review): the u half is not masked, so any bits of (u >> 14) above
;;; bit 15 would be OR-ed into the v word - presumably u is small enough
;;; for that not to happen; confirm against the caller.
MOVQ       MM5, MM1              ; v1 : v0
MOVQ       MM4, MM0              ; u1 : u0

MOVQ       MM6, MM3              ; v3 : v2
PSLLD      MM5, 2                ; shift left by 2 (16 -14)

MOVQ       MM7, MM2              ; u3 : u2
PSRLD      MM4, 14               ; shift right by 14

PAND       MM5, dword ptr mask_low_words ; mm5 = v1: 0 : v0 : 0
PSLLD      MM6, 2                        ; shift left by 2 (16 -14)

PADDD      MM0, dword ptr _4du   ; u1 + 4du : u0 + 4du
POR        MM4, MM5              ; mm4 = v1:u1:v0:u0

PAND       MM6, dword ptr mask_low_words ; mm6 = v3: 0 : v2 : 0
PSRLD      MM7, 14               ; shift right by 14

PMADDWD    MM4, MM4              ; res1 = (u1*u1  + v1*v1) : res0 = (u0*u0 + v0*v0)
POR        MM7, MM6              ; mm7 = v3:u3:v2:u2

PADDD      MM1, dword ptr _4dv   ; v1 + 4dv : v0 + 4dv
PMADDWD    MM7, MM7              ; res3 = (u3*u3  + v3*v3) : res2 = (u2*u2 + v2*v2)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;  Pack the four r^2 values to words, taking the results after a
;;;;;;;;  10 bit shift right.  The sequence "shift right by 10, shift left
;;;;;;;;  by 16, arithmetic shift right by 16" is shortened to "shift left
;;;;;;;;  by 6 (16 - 10), arithmetic shift right by 16".
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
PADDD      MM2, dword ptr _4du   ; u3 + 4du : u2 + 4du
PSLLD      MM4, 6                ; shift left by 6 (16 -10)

PADDD      MM3, dword ptr _4dv   ; v3 + 4dv : v2 + 4dv
PSRAD      MM4, 16               ; extend sign bit for PACKSSDW

MOVQ       MM5, [EDI]            ; turbulence
PSLLD      MM7, 6                ; shift left by 6 (16 -10)

PMULLW     MM5, dword ptr const_quad_15 ; turb  = 15 * turb
PSRAD      MM7, 16                      ; extend sign bit for PACKSSDW

MOVQ       MM6, dword ptr const_FFFF_Minus_High_sqrt
;;;;;;  finally pack them correctly
;;;;;;; mm4 = ( res3:res2:res1:res0) >> 10 and packed
PACKSSDW   MM4, MM7

;;; clip the values against the range [0 : 7FFh ]
;;; which is the size of the sqrt table (2048 entries)
PADDUSW    MM4, MM6              ; saturating add of (FFFFh - 7FFh) ...

PSUBUSW    MM4, MM6              ; ... and subtract it again

;;; force every index to be even; together with the *2 scaling below this
;;; makes every table address a multiple of 4 for the 32-bit table reads
PAND       MM4, dword ptr mask_odd_indexes


MOVD       EAX, MM4              ; res1:res0

MOV        EDX, EAX
AND        EAX, 0ffffh           ; eax =  res0

SHR        EDX, 16               ; edx =  res1
PSRLQ      MM4, 32

MOVD       MM7, [ _sqrtTable +  eax*2]  ; read from the sqrt table
PUNPCKLWD  MM7, [ _sqrtTable +  edx*2]  ; *:*:sqrt(res1):sqrt(res0)
MOVD       EAX, MM4              ; res3:res2

MOV        EDX, EAX
AND        EAX, 0ffffh           ; eax =  res2

SHR        EDX, 16               ; edx =  res3
MOVD       MM6, [ _sqrtTable +  eax*2]  ; read from the sqrt table
PUNPCKLWD  MM6, [ _sqrtTable +  edx*2]  ; *:*:sqrt(res3):sqrt(res2)
PUNPCKLDQ  MM7, MM6              ; sqrt(res3):sqrt(res2):sqrt(res1):sqrt(res0)

;;;;; mm7 = 10 * (sqrt(res3):sqrt(res2):sqrt(res1):sqrt(res0))
PMULLW     MM7, dword ptr const_quad_10
ADD        EDI, 8                ; advance now; the store below uses [EDI-8]

;;;;; wood_indx = 10 * sqrt(res) + 15 * turbulence
MOVQ       MM6, dword ptr const_FFFF_Minus_High_Wood
PADDW      MM7, MM5

PSRLW      MM7, 2                ; wood_indx >>= 2

;;; clip the values against the range [0 : 176Fh ]
;;; which is the size of the wood table (6000 entries).
PADDUSW    MM7, MM6  ; saturating add of (FFFFh - 176Fh) ...
PSUBUSW    MM7, MM6  ; ... and subtract it again

PAND       MM7, dword ptr mask_odd_indexes ; even indexes only (see above)

MOVD       EAX, MM7              ; indx1:indx0

MOV        EDX, EAX
AND        EAX, 0ffffh           ; eax = indx0

SHR        EDX, 16               ; edx = indx1

MOVD       MM6, [ _woodTable + eax*2] ; read wood colors from table
PSRLQ      MM7, 32

PUNPCKLWD  MM6, [ _woodTable + edx*2] ; *:*:wood1:wood0

MOVD       EAX, MM7              ; indx3:indx2

MOV        EDX, EAX
AND        EAX, 0ffffh           ; eax = indx2

SHR        EDX, 16               ; edx = indx3

MOVD       MM7,  [ _woodTable + eax*2]   ; read wood colors from table
PUNPCKLWD  MM7,  [ _woodTable + edx*2]   ; *:*:wood3:wood2
PUNPCKLDQ  MM6, MM7                      ; wood3:wood2:wood1:wood0

MOVQ   [EDI-8], MM6              ; store the colors into turb_buffer

DEC        ECX
JNZ        wood_loop

EMMS                 ; Clear out the MMX registers and set appropriate flags.

wood_done:           ; early-exit target (no MMX state to clear on that path)
MOV  EAX, [ESP + 12]
MOV  ECX, [ESP +  8]
MOV  EDX, [ESP +  4]
MOV  EDI, [ESP     ]

ADD  ESP, 16

RET                        ; end of function
_SIMD_Wood_Sqrt ENDP






Appendix C - Marble Code Listing

TITLE Marble textures using MMX(TM) technology

;prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc
.list
.586
.model FLAT

KLAM		       equ 0
;***********************************************************************
;     Data Segment Declarations
;***********************************************************************
;.DATA
DSEG SEGMENT PARA
; Data segment for the marble / wood / z-buffer listings: external lookup
; tables, 64-bit scratch variables and the packed-word constants that the
; MMX code uses as memory operands.


; Tables and buffers defined outside this file.  The code indexes them as
; arrays of 16-bit words (address = table + index*2).
extrn _marbleTable   : ptr sword
extrn _woodTable     : ptr sword
extrn _sqrtTable     : ptr sword
extrn _turbulenceBuf : ptr sword


;Variables, u, v, du, dv  each contain parameters for two
;texels.  Since u, v, ...  are 64 bit, then each texel parameter is
;32 bit.  (32 bit per texel * two texels = 64 bits).  This enables us
;to work with two pixels at one time using MMX technology.
ALIGN 8
_4du	 QWORD ?
_4dv	 QWORD ?
result dd 0


;Various masks.  Set up to filter out unwanted bits in MMX registers.
;Each constant is 64 bits wide so that it can be used directly as the
;memory operand of an MMX instruction; ALIGN 8 keeps those loads aligned.
ALIGN 8
const_quad_10               QWORD 000a000a000a000ah    ; 10 in each of the four words
const_quad_15               QWORD 000f000f000f000fh    ; 15 in each of the four words
const_FFFF_Minus_High_sqrt  QWORD 0f800f800f800f800h   ; 0FFFFh - 07FFh per word (clip limit 07FFh)
const_FFFF_Minus_High_Wood  QWORD 0e890e890e890e890h   ; 0FFFFh - 176Fh per word (clip limit 176Fh)
                                                       ; used as PADDUSW/PSUBUSW pair to clip wood indexes

mask_odd_indexes            QWORD 0fffefffefffefffeh   ; clears bit 0 of each word (forces even indexes)
mask_high_words             QWORD 00000ffff0000ffffh   ; keeps the low word of each dword (high word cleared)
mask_low_words              QWORD 0ffff0000ffff0000h   ; keeps the high word of each dword (low word cleared)

mask_all_1                  QWORD 0ffffffffffffffffh   ; all ones; PXOR with it inverts a mask
mask_clear_word_1           QWORD 0000000000000ffffh   ; keeps only the lowest word of the quadword
const_quad_735              QWORD 002df02df02df02dfh   ; 735 (02DFh) in each of the four words
mask_quad_green             QWORD 0800080008000800h    ; bit 11 set in each word
                                                       ; NOTE(review): pixel-format meaning not shown here - confirm
const_quad_1500             QWORD 05dc05dc05dc05dch    ; 1500 (05DCh) in each of the four words

DSEG ENDS

;***********************************************************************
;     Constant Segment Declarations
;***********************************************************************
.const

;***********************************************************************
;     Code Segment Declarations
;***********************************************************************
.code
;***********************************************************************
;;;; SIMD_Marble uses the contents of _turbulenceBuf, which was filled
;;;; beforehand by SIMD_Octave with num_octaves of Perlin noise.
;;;; The marble approximation is
;;;;     marb(u,v) = sin(u + turb(u,v))
;;;; A pre-computed table replaces the sine evaluation, which also enables
;;;; the use of MMX technology.  The table '_marbleTable' actually holds
;;;; the marble value itself, which is a manipulation of the sine output.
;;;; Each iteration calculates 4 pixels, so 'num_pixels' should be a
;;;; multiple of 4; a remainder is ignored, and a count below 4 returns
;;;; without touching the buffer.
;***********************************************************************
;;; SIMD_Marble(u_init:DWORD, du:DWORD, num_pixels:DWORD )
;;;
;;; In:   arguments on the stack.  The offsets below are relative to ESP
;;;       after the 16-byte register save area has been allocated
;;;       (16 bytes of saved registers + 4 bytes of return address).
;;; Out:  the first (num_pixels / 4) * 4 words of _turbulenceBuf are each
;;;       replaced by the word read from _marbleTable at word index
;;;       ((u >> 14) + 10 * turb) AND 0FFFEh.
;;; Regs: EAX, ECX, EBX and EDI are saved and restored.  MM0-MM5 (and MM6
;;;       when KLAM is non-zero) are used; EMMS is executed before return.

marb_u_init     =  20
marb_du         =  24
marb_num_pixels =  28

_SIMD_Marble PROC NEAR

SUB         ESP, 16                     ; room for the four saved registers

MOV  [ESP + 12], EAX
MOV  [ESP +  8], ECX

MOV  [ESP +  4], EBX
MOV  [ESP     ], EDI

MOV         ECX, marb_num_pixels[esp]   ; number of pixels in scanline
LEA         EDI, _turbulenceBuf         ; already calculated turbulence

MOVD        MM2, marb_du[esp]           ; mm2 = 0:du

MOVD        MM0, marb_u_init[esp]       ; mm0 = 0:u
PSLLQ       MM2, 32                     ; mm2 = du:0

SHR         ECX, 2                      ; ecx = # of times to draw 4 pixels at once
JZ          marb_done                   ; fewer than 4 pixels: nothing to do.  Without
                                        ; this check DEC/JNZ below would wrap ECX and
                                        ; run ~2^32 iterations past the buffer.
PUNPCKLDQ   MM0, MM0                    ; u : u

PADDD       MM0, MM2                    ; u + du : u
PUNPCKHDQ   MM2, MM2                    ; du : du

MOVQ        MM1, MM0                    ; u + du : u
PADDD       MM2, MM2                    ; 2du : 2du

PADDD       MM1, MM2                    ; u + 3du : u + 2du
PADDD       MM2, MM2                    ; 4du : 4du

if (KLAM)
MOVQ        MM6, dword ptr const_quad_10
endif

;; During the loop the following hold:
;;   mm0 = u1  : u0
;;   mm1 = u3  : u2
;;   mm2 = 4du : 4du
;;   if KLAM is non-zero then mm6 = 10:10:10:10; otherwise the constant
;;   is used as a memory operand (on P55C it is paired on the u pipe,
;;   so we can PAND with memory).

marb_loop:

MOVQ        MM5, [EDI]                  ; mm5 = turb3:turb2:turb1:turb0
MOVQ        MM3, MM0                    ; mm3 = u1:u0

MOVQ        MM4, MM1                    ; mm4 = u3:u2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; The following lines pack u3,u2,u1,u0 from two registers into one
;;;; register, including the shift right by 14.
;;;; So that PACKSSDW only packs the numbers and never saturates them,
;;;; each dword is shifted left by 16 and then shifted right
;;;; arithmetically by 16, which sign-extends the 16-bit result.
;;;; Combined with the wanted shift right by 14, the shift left by 16
;;;; becomes a shift left by 2 (16 - 14).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
PSLLD      MM3, 2                       ; shift left by 2 (16 - 14)

if ( KLAM )
PMULLW     MM5, MM6                     ; turb = 10 * turb
else
PMULLW     MM5, dword ptr const_quad_10 ; turb = 10 * turb
endif
PSLLD      MM4, 2                       ; shift left by 2 (16 - 14)

PADDD      MM0, MM2                     ; increment each of u1:u0 by 4du for next iteration
PSRAD      MM3,16                       ; extend sign bit for PACKSSDW

PSRAD      MM4,16                       ; extend sign bit for PACKSSDW
ADD        EDI,8                        ; increment edi for next iteration

PACKSSDW   MM3, MM4                     ; mm3 = (u3:u2:u1:u0) >> 14, packed to words
PADDD      MM1, MM2                     ; increment each of u3:u2 by 4du for next iteration

PADDW      MM3, MM5                     ; marble indexes are: (u >> 14) + (10 * turb)

;;;;; Now read the colors from the marble table.
;;;;; Input:  mm3 = indx3:indx2:indx1:indx0
;;;;; Output: mem[edi-8] = pixel3:pixel2:pixel1:pixel0
;;;;; Bit 0 of every index is cleared first so that each 4-byte MOVD read
;;;;; from the word table starts on a dword-aligned element.
PAND       MM3, dword ptr mask_odd_indexes


MOVD       EAX, MM3                     ; eax = indx1:indx0

MOV        EBX, EAX                     ; ebx = indx1:indx0
AND        EAX, 0ffffh                  ; eax = indx0

SHR        EBX, 16                      ; ebx = indx1
PSRLQ      MM3, 32                      ; mm3 = 0:0:indx3:indx2

MOVD       MM4, [ _marbleTable +  eax*2] ; read from the marble table
PUNPCKLWD  MM4, [ _marbleTable +  ebx*2] ; low dword = marb1:marb0
MOVD       EAX, MM3                     ; eax = indx3:indx2

MOV        EBX, EAX                     ; ebx = indx3:indx2
AND        EAX, 0ffffh                  ; eax = indx2

SHR        EBX, 16                      ; ebx = indx3
MOVD       MM5, [ _marbleTable +  eax*2] ; read from the marble table
PUNPCKLWD  MM5, [ _marbleTable +  ebx*2] ; low dword = marb3:marb2
PUNPCKLDQ  MM4, MM5                     ; marb3:marb2:marb1:marb0
MOVQ   [EDI-8], MM4                     ; store the 4 pixels over the turbulence values

DEC        ECX
JNZ        marb_loop

marb_done:
EMMS                                    ; Clear out the MMX registers and set appropriate flags.

MOV        EAX, [ESP + 12]
MOV        ECX, [ESP +  8]
MOV        EBX, [ESP +  4]
MOV        EDI, [ESP     ]

ADD        ESP, 16

RET                                     ; end of function
_SIMD_Marble ENDP






Appendix D - DDU and DDV Code Listing

;Code fragment: converts the scalar inputs u_init, v_init, du_init, dv_init,
;ddu_init and ddv_init into the packed 64-bit variables u, v, du, dv, ddu and
;ddv used by the 4-pixels-per-iteration inner loop.  Only pixels #1 and #3 of
;each group of four are tracked; in every register the pixel #1 value sits in
;the high dword and the pixel #3 value in the low dword.
;
;Get the UV parameters in MMX(TM) technology form.
;Note: UV texel values are stored in 10.22 fixed integer format.
;This sets up the U parameters for pixels 1 and 3 in MM0 register and
;V parameter in MM1 register.  After setup, the registers will contain:
;      |--------- 32 bit ------------|
;      +-------------------------------------------------------------------+
;MM0 = | U texel for pix #1 = u + du | U texel for pix #3 = u + 3du + 3ddu |
;      +-------------------------------------------------------------------+
;      +-------------------------------------------------------------------+
;MM1 = | V texel for pix #1 = v + dv | V texel for pix #3 = v + 3dv + 3ddv |
;      +-------------------------------------------------------------------+
;This is because the first four pixels drawn on the screen will have the
;U and V texel values of:
;Pixel #0 = u
;Pixel #1 = u + du
;Pixel #2 = u + 2du + ddu
;Pixel #3 = u + 3du + 3ddu
;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.

MOVD      MM0, u_init     ;U p1 = 0, p3 = u
SHR       ECX, 2          ;ECX= # of times to draw 4 pixels at once

MOVD      MM1, v_init     ;V p1 = 0, p3 = v
PUNPCKLDQ MM0, MM0        ;U p1 = u, p3 = u

MOVD      MM2, du_init    ;MM2 = 0:du, so the next adds only touch p3
PUNPCKLDQ MM1, MM1        ;V p1 = v, p3 = v

MOVD      MM3, dv_init    ;MM3 = 0:dv, so the next adds only touch p3
PADDD     MM0, MM2        ;U p1 = u, p3 = u + du

PADDD     MM1, MM3        ;V p1 = v, p3 = v + dv
PADDD     MM0, MM2        ;U p1 = u, p3 = u + 2du

PADDD     MM1, MM3        ;V p1 = v, p3 = v + 2dv
PUNPCKLDQ MM2, MM2        ;MM2 = du:du, the next add touches p1 and p3

PUNPCKLDQ MM3, MM3        ;MM3 = dv:dv, the next add touches p1 and p3
PADDD     MM0, MM2        ;U p1 = u + du, p3 = u + 3du

MOVD      MM2, ddu_init   ;MM2 = 0:ddu, added three times to p3 only
PADDD     MM1, MM3        ;V p1 = v + dv, p3 = v + 3dv

MOVD      MM3, ddv_init   ;MM3 = 0:ddv, added three times to p3 only
PADDD     MM0, MM2        ;U p1 = u + du, p3 = u + 3du + ddu

PADDD     MM1, MM3        ;V p1 = v + dv, p3 = v + 3dv + ddv
PADDD     MM0, MM2        ;U p1 = u + du, p3 = u + 3du + 2ddu

PADDD     MM1, MM3        ;V p1 = v + dv, p3 = v + 3dv + 2ddv
PADDD     MM0, MM2        ;U p1 = u + du, p3 = u + 3du + 3ddu

MOVQ      DWORD PTR u, MM0
PADDD     MM1, MM3        ;V p1 = v + dv, p3 = v + 3dv + 3ddv

MOVQ      DWORD PTR v, MM1
;Get the du dv parameters in MMX(TM) technology form
;Note: du dv texel values are stored in 10.22 fixed integer format.
;This sets up the du parameters for pixels 1 and 3 in MM0 register and
;dv parameter in MM1 register.  After setup, the registers will contain:
;      |--------- 32 bit --------------|
;      +---------------------------------------------------------------+
;MM0 = | DU texel for p1 = 4du + 10ddu | DU texel for p3 = 4du + 18ddu |
;      +---------------------------------------------------------------+
;      +---------------------------------------------------------------+
;MM1 = | DV texel for p1 = 4dv + 10ddv | DV texel for p3 = 4dv + 18ddv |
;      +---------------------------------------------------------------+
;This is because after the first four pixels are drawn on the screen, the
;loop repeats to draw the next four pixels.  In order to get the next u, v
;texel coordinates, appropriate du, dv values need to be summed to u and v.
;The correct starting values of du and dv are:
;Pixel #0 = 4du +  6ddu   ;Note: these have been mathematically proven.
;Pixel #1 = 4du + 10ddu
;Pixel #2 = 4du + 14ddu
;Pixel #3 = 4du + 18ddu
;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.

MOVD      MM0, du_init    ;DU p1 = 0, p3 = du

MOVD      MM1, dv_init    ;DV p1 = 0, p3 = dv
PUNPCKLDQ MM0, MM0        ;DU p1 = du, p3 = du

PUNPCKLDQ MM1, MM1        ;DV p1 = dv, p3 = dv

MOVD      MM2, ddu_init   ;DDU p1 = 0, p3 = ddu
PSLLD     MM0, 2          ;DU p1 = 4du, p3 = 4du

MOVD      MM3, ddv_init   ;DDV p1 = 0, p3 = ddv
PSLLD     MM1, 2          ;DV p1 = 4dv, p3 = 4dv

PUNPCKLDQ MM2, MM2        ;DDU p1 = ddu, p3 = ddu

PUNPCKLDQ MM3, MM3        ;DDV p1 = ddv, p3 = ddv

PSLLD     MM2, 1          ;DDU p1 = 2ddu, p3 = 2ddu

PSLLD     MM3, 1          ;DDV p1 = 2ddv, p3 = 2ddv
PADDD     MM0, MM2        ;DU p1 = 4du + 2ddu, p3 = 4du + 2ddu

PADDD     MM1, MM3        ;DV p1 = 4dv + 2ddv, p3 = 4dv + 2ddv
PSLLD     MM2, 2          ;DDU p1 = 8ddu, p3 = 8ddu

PSLLD     MM3, 2          ;DDV p1 = 8ddv, p3 = 8ddv
PADDD     MM0, MM2        ;DU p1 = 4du + 10ddu, p3 = 4du + 10ddu

MOVD      MM2, ddu_init   ;DDU p1 = 0, p3 = ddu
PADDD     MM1, MM3        ;DV p1 = 4dv + 10ddv, p3 = 4dv + 10ddv

MOVD      MM3, ddv_init   ;DDV p1 = 0, p3 = ddv
PSLLD     MM2, 3          ;DDU p1 = 0, p3 = 8ddu

PSLLD     MM3, 3          ;DDV p1 = 0, p3 = 8ddv
PADDD     MM0, MM2        ;DU p1 = 4du + 10ddu, p3 = 4du + 18ddu

PADDD     MM1, MM3        ;DV p1 = 4dv + 10ddv, p3 = 4dv + 18ddv
PSLLD     MM2, 1          ;DDU p1 = 0, p3 = 16ddu

MOVQ      DWORD PTR du, MM0
PUNPCKLDQ MM2, MM2        ;DDU p1 = 16ddu, p3 = 16ddu

MOVQ      DWORD PTR dv, MM1

;Get the ddu ddv parameters in MMX(TM) technology form
;Note: ddu ddv texel values are stored in 10.22 fixed integer format.
;This sets up the ddu parameters for pixels 1 and 3 in MM2 register and
;ddv parameter in MM3 register.  After setup, the registers will contain:
;      |--------- 32 bit ---------|
;      +-----------------------------------------------------+
;MM2 = | DDU texel for p1 = 16ddu | DDU texel for p3 = 16ddu |
;      +-----------------------------------------------------+
;      +-----------------------------------------------------+
;MM3 = | DDV texel for p1 = 16ddv | DDV texel for p3 = 16ddv |
;      +-----------------------------------------------------+
;This is because after the first four pixels are drawn on the screen, the
;loop repeats to draw the next four pixels.  In order to get the next du, dv
;texel coordinates, appropriate ddu, ddv values need to be summed to du and dv.
;The correct values of ddu and ddv are:
;Pixel #0 = 16ddu         ;Note: these have been mathematically proven.
;Pixel #1 = 16ddu
;Pixel #2 = 16ddu
;Pixel #3 = 16ddu
;We are only interested in pixels #1 and #3 because pixels #0 and #2 are averaged.
PSLLD     MM3, 1          ;DDV p1 = 0, p3 = 16ddv

MOVQ      DWORD PTR ddu, MM2
PUNPCKLDQ MM3, MM3        ;DDV p1 = 16ddv, p3 = 16ddv

MOVQ      DWORD PTR ddv, MM3






Appendix E - Z-Buffer Scanline Algorithm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; MMX_INCZbuffer - z-buffer test for one scanline, 4 pixels per iteration.
;;; z is calculated along the scan line: z = z_init + i * dz_init
;;; (16-bit words, advanced with signed saturating adds).
;;;
;;; For every pixel the incrementally computed z (Zb) is compared, as a
;;; signed word, with the value in z_buffer (Za):
;;;   - z_buffer receives the smaller of the two;
;;;   - the colour written back is color_line's own colour where Zb < Za,
;;;     otherwise the colour from color_buffer.
;;; num_pixels should be a multiple of 4; a remainder is ignored and a
;;; count below 4 returns without touching any buffer.
;;;
;;; NOTE(review): z_line is not referenced by the code.
;;; NOTE(review): the merged colours are stored to color_line (EDI), not to
;;;               color_buffer (ECX) - confirm that this is the intended
;;;               destination.
;;; NOTE(review): mask_clear_byte_1 is not defined in this listing; from the
;;;               0:0:z:z comment it presumably keeps only the low word of
;;;               z_init - confirm against its definition.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MMX_INCZbuffer PROC NEAR C USES edi esi ecx eax ebx,
               z_init: DWORD, dz_init: DWORD,
               num_pixels: DWORD, z_line: PTR SWORD, color_line: PTR SWORD,
               z_buffer: PTR SWORD, color_buffer: PTR SWORD

MOVD       MM1, dz_init
MOVD       MM4,  z_init

MOVD       MM5, dz_init
PSLLD      MM1, 16          ;0:0:dz:0

PAND       MM4, DWORD PTR mask_clear_byte_1

MOVQ       MM6, DWORD PTR  mask_all_1   ;all ones, used to invert the compare mask
PUNPCKLWD  MM4, MM4         ;0:0:z:z

PADDSW     MM4, MM1         ;0:0:z + dz:z
PUNPCKLWD  MM5, MM5         ;low dword = dz:dz (upper dword is shifted out below)

MOVQ       MM3, MM4         ;0:0:z + dz:z
PSLLW      MM5, 1           ;low dword = 2dz:2dz

PSLLQ      MM3, 32          ;z + dz:z:0:0
MOV        EAX, z_buffer

PSLLQ      MM5, 32          ;2dz:2dz:0:0
MOV        EDI,  color_line

PADDSW     MM3, MM5         ;z+3dz:z+2dz:0:0
PUNPCKHDQ  MM5, MM5         ;2dz:2dz:2dz:2dz

POR        MM4, MM3         ;z+3dz:z+2dz:z + dz:z
PSLLW      MM5, 1           ;4dz:4dz:4dz:4dz

MOV        ECX, color_buffer
MOV        ESI, num_pixels

SHR        ESI, 2           ;esi = number of 4-pixel groups
JZ         zDone            ;fewer than 4 pixels: nothing to do.  Without this
                            ;check DEC/JNZ below would wrap ESI and run ~2^32
                            ;iterations past the buffers.

;; During the loop:
;;   eax = z_buffer pointer, ecx = color_buffer pointer, edi = color_line pointer
;;   mm4 = z values of the current 4 pixels, mm5 = 4dz x 4, mm6 = all ones
zLoop:
MOVQ       MM0, [eax]       ;mm0 = Za,Za,Za,Za (load from z_buffer)
MOVQ       MM1, MM4         ;mm1 = Zb,Zb,Zb,Zb (computed z of this scanline)

MOVQ       MM2, MM0         ;mm2 = Za,Za,Za,Za (will be the mask)
PADDSW     MM4, MM5         ;advance z by 4dz for the next iteration

PCMPGTW    MM2, MM1         ;mm2 = mask per word: FFFF where Za > Zb, else 0000
ADD        EAX, 8

MOVQ       MM3, MM2         ;(after pxor) mm3 = ~mm2
PAND       MM1, MM2         ;mm1 = only the Zb's which are less than the Za's

PXOR       MM3, MM6         ;mm3 = ~mask (mm6 = mask_all_1)
ADD        ECX, 8

PAND       MM0, MM3         ;mm0 = the Za's which are less than or EQUAL to the Zb's
ADD        EDI, 8

POR        MM0, MM1         ;mm0 = the wanted Z's
MOVQ   [eax-8], MM0         ;(store Z's)

MOVQ       MM0, [ecx-8]     ;mm0 = Ca,Ca,Ca,Ca (from color_buffer)
MOVQ       MM1, [edi-8]     ;mm1 = Cb,Cb,Cb,Cb (from color_line)

PAND       MM1, MM2         ;mm1 = the Cb's of the 'Good' Zb's (Zb < Za)
PAND       MM0, MM3         ;mm0 = the Ca's of the 'Good' Za's (Za <= Zb)

POR        MM0, MM1         ;the wanted C's
MOVQ   [edi-8], MM0         ;(store)

DEC        ESI
JNZ        zLoop

zDone:
EMMS
RET

MMX_INCZbuffer ENDP






Appendix F - Optimized Z-Buffer Code Listing

; Code fragment: z-buffer test and conditional write for one group of 4 pixels.
;
; Inputs read by the code:
;   low_z, high_z : 64-bit variables, each holding two 32-bit Z values whose
;                   upper 16 bits are kept for the comparison
;   z_inc         : 64-bit Z increment added to both low_z and high_z
;   z_buffer      : variable holding the pointer to the current 4 Z-Buffer words
;   EDI           : pointer to the 4 destination pixels
;   MM1           : expected to hold the 4 new pixel colors on entry (it is
;                   used without being loaded here) - TODO confirm with caller
;
; Note, registers MM1, MM2, MM3, MM4, MM6, MM7 are modified by this routine.
; ESI is saved and restored; EDI is only used as a pointer and is not changed.

MOVQ      MM4, low_z     ;Move the two low Z values into MM4 (LSD)

MOVQ      MM2, high_z    ;Move the two high Z values into MM2 (MSD)
MOVQ      MM6, MM4       ;Make a copy of the low Z values

MOVQ      MM7, z_inc     ;Move the Z-incremental into a register for future use.
PSRAD     MM4, 16        ;Discard the fractional part of the two Z values

PUSH      ESI            ;Save ESI
PSRAD     MM2, 16        ;Discard the fractional part of the two Z values

MOV       ESI, z_buffer  ;ESI = pointer to four Z values being looked at in Z-Buffer.
PACKSSDW  MM4, MM2       ;Mesh all four new Z values into one register (4 words)

MOVQ      MM2, [ESI]     ;MM2 = the old Z values currently in the Z-Buffer.
PADDD     MM6, MM7       ;Add DZ to Z (low pair)

MOVQ      MM7, high_z    ;Save a copy of high_z
PCMPGTW   MM2, MM4       ;MM2 = mask per word: FFFF where old Z > new Z (signed), else 0

PADDD     MM7, z_inc     ;Add DZ to Z (high pair)
MOVQ      MM3, MM2       ;Save a copy of the compare results

PANDN     MM3, [EDI]     ;MM3 = existing pixel colors where the new pixel loses.
PAND      MM1, MM2       ;MM1 = new pixel colors where the new pixel wins.

MOVQ   high_z, MM7       ;Update the high_z variable
POR       MM1, MM3       ;"OR" old and new contents together for the 4 pixel colors.

MOVQ    low_z, MM6       ;Update the low_z variable
MOVQ      MM3, MM2       ;Save a copy of the compare results

PANDN     MM3, [ESI]     ;MM3 = existing Z values where the new pixel loses.
PAND      MM2, MM4       ;MM2 = new Z values where the new pixel wins.

MOVQ    [EDI], MM1       ;Write out the 4 pixels to video memory.
POR       MM2, MM3       ;"OR" old and new contents together for the 4 Z values.

MOVQ    [ESI], MM2       ;Update the Z-Buffer with the 4 new values.

ADD  z_buffer, 8         ;z_buffer pointer is incremented eight bytes (4 pixels).
POP       ESI            ;Restore ESI






Appendix G - Wood (Linear) Code Listing

;***********************************************************************
;;;  This is the wood implementation by linear curves in the u_v plane.
;;;; Each iteration calculates 4 pixels, so 'num_pixels' should be a
;;;; multiple of 4; a remainder is ignored, and a count below 4 returns
;;;; without touching the buffer.
;***********************************************************************
;;;; SIMD_Wood_Linear(u_init: DWORD, v_init: DWORD, du: DWORD, dv : DWORD, num_pixels:DWORD)
;;;;
;;;; In:   arguments on the stack, addressed through the wood_* offsets
;;;;       (defined outside this procedure) relative to ESP after the
;;;;       16-byte register save area has been allocated.
;;;; Out:  the first (num_pixels / 4) * 4 words of _turbulenceBuf are each
;;;;       replaced by the word read from _woodTable at word index
;;;;       clip((10 * |u - v| + 15 * turb) >> 2, 0..176Fh) AND 0FFFEh,
;;;;       where u and v are the texel coordinates shifted right by 14.
;;;; Regs: EAX, ECX, EDX and EDI are saved and restored.  MM0-MM7 and the
;;;;       variable _4dv are used; EMMS is executed before return.

_SIMD_Wood_Linear PROC NEAR

SUB   ESP, 16                         ; room for the four saved registers

MOV  [ESP + 12], EAX
MOV  [ESP +  8], ECX
MOV  [ESP +  4], EDX
MOV  [ESP     ], EDI

MOV         ECX, wood_num_pixels[esp]

MOVD        MM4, wood_du[esp]     ; 0:du

SHR         ECX, 2                ; ECX= # of times to draw 4 pixels at once
JZ          wood_lin_done         ; fewer than 4 pixels: nothing to do.  Without
                                  ; this check DEC/JNZ below would wrap ECX and
                                  ; run ~2^32 iterations past the buffer.
LEA         EDI, _turbulenceBuf

MOVD        MM0, wood_u_init[esp] ; 0:u
PSLLQ       MM4, 32               ; du:0

PUNPCKLDQ   MM0, MM0              ; u:u

MOVD        MM5, wood_dv[esp]     ; 0:dv
PADDD       MM0, MM4              ; u + du:u

MOVD        MM1, wood_v_init[esp] ; 0:v
PUNPCKHDQ   MM4, MM4              ; du:du

PUNPCKLDQ   MM1, MM1              ; v:v

PSLLQ       MM5, 32               ; dv:0

PADDD       MM1, MM5              ; v + dv:v
PUNPCKHDQ   MM5, MM5              ; dv:dv

MOVQ        MM2, MM0              ; u + du:u
MOVQ        MM3, MM1              ; v + dv:v

PADDD       MM4, MM4              ; 2du:2du
PADDD       MM5, MM5              ; 2dv:2dv

PADDD       MM2, MM4              ; u + 3du:u+2du
PADDD       MM3, MM5              ; v + 3dv:v+2dv

PADDD       MM4, MM4              ; 4du:4du
PADDD       MM5, MM5              ; 4dv:4dv

MOVQ        dword ptr _4dv, MM5   ; mm5 is needed as scratch inside the loop

;; During the loop the following hold:
;;   mm0  = u1  : u0
;;   mm2  = u3  : u2
;;   mm1  = v1  : v0
;;   mm3  = v3  : v2
;;   mm4  = 4du : 4du
;;   _4dv = 4dv : 4dv

wood_loop:

MOVQ       MM5, MM0               ; u1 : u0
MOVQ       MM6, MM2               ; u3 : u2
;; As in the marble code, each dword has to be shifted right by 14 and
;; the 4 dwords packed into 4 words of one MMX(TM) register.  This is
;; done with a shift left followed by an arithmetic shift right by 16,
;; so that PACKSSDW never saturates; the shift left is by 2 (16 - 14).
PADDD      MM0, MM4               ; u1 + 4du : u0 + 4du
PSLLD      MM5, 2                 ; shift left by 2 (16 - 14)

PADDD      MM2, MM4               ; u3 + 4du : u2 + 4du
PSLLD      MM6, 2                 ; shift left by 2 (16 - 14)

MOVQ       MM7, MM1               ; v1 : v0
PSRAD      MM5, 16                ; extend sign bit for PACKSSDW

PADDD      MM1,  dword ptr _4dv   ; v1 + 4dv : v0 + 4dv
PSRAD      MM6, 16                ; extend sign bit for PACKSSDW

;;;;;;;;  finally pack them correctly
;;;;;;;;  mm5 = (u3:u2:u1:u0) >> 14, packed to words
PACKSSDW   MM5, MM6

MOVQ       MM6, MM3               ; v3 : v2
PSLLD      MM7, 2                 ; shift left by 2 (16 - 14)

PADDD      MM3, dword ptr _4dv    ; v3 + 4dv : v2 + 4dv
PSLLD      MM6, 2                 ; shift left by 2 (16 - 14)

PSRAD      MM6, 16                ; extend sign bit for PACKSSDW
PSRAD      MM7, 16                ; extend sign bit for PACKSSDW
;;;;;;;;  mm7 = (v3:v2:v1:v0) >> 14, packed to words
PACKSSDW   MM7, MM6

;; Unsigned absolute difference for words: one of the two saturating
;; subtractions is zero, the other is |u - v|, so OR-ing them gives the result.
MOVQ       MM6, MM5               ; keep a copy of u
PSUBUSW    MM5, MM7               ; mm5 = max(u - v, 0)
PSUBUSW    MM7, MM6               ; mm7 = max(v - u, 0)

MOVQ       MM6, [EDI]             ; turbulence
POR        MM5, MM7               ; mm5 = abs(v3 - u3 : v2 - u2 : v1 - u1 : v0 - u0)

;;;;;; wood_indx = (10 * abs(u-v) + 15 * turbulence(u,v)) >> 2
PMULLW     MM6, dword ptr const_quad_15 ; turb  = 15 * turb
PMULLW     MM5, dword ptr const_quad_10 ; | u - v | * 10

MOVQ       MM7, dword ptr const_FFFF_Minus_High_Wood
PADDW      MM5, MM6               ; 10 * abs(u - v) + 15 * turb(u,v)

ADD        EDI, 8
PSRLW      MM5, 2                 ; wood_indx >>= 2

;;; Now clip the values against the range [0 : 176Fh],
;;; which is the size of the wood table (6000 entries).
;;; PADDUSW saturates each value above 176Fh to FFFFh;
;;; PSUBUSW undoes the offset.
PADDUSW    MM5, MM7               ; mm7 =  const_FFFF_Minus_High_Wood
PSUBUSW    MM5, MM7               ; mm7 =  const_FFFF_Minus_High_Wood

;;;; On P55C each unaligned load of 4 bytes (MOVD) causes a penalty,
;;;; so we don't read odd indexes (the table's element size is a word).
PAND       MM5, dword ptr mask_odd_indexes


MOVD       EAX, MM5                       ; eax = indx1:indx0

MOV        EDX, EAX                       ; edx = indx1:indx0
AND        EAX, 0ffffh                    ; eax = indx0

SHR        EDX, 16                        ; edx = indx1
PSRLQ      MM5, 32                        ; mm5 = 0:0:indx3:indx2

MOVD       MM6, [ _woodTable + eax*2]     ; read wood colors from table
PUNPCKLWD  MM6, [ _woodTable + edx*2]     ; low dword = wood1:wood0
MOVD       EAX, MM5                       ; eax = indx3:indx2

MOV        EDX, EAX                       ; edx = indx3:indx2
AND        EAX, 0ffffh                    ; eax = indx2

SHR        EDX, 16                        ; edx = indx3
MOVD       MM5, [ _woodTable + eax*2]     ; read wood colors from table
PUNPCKLWD  MM5, [ _woodTable + edx*2]     ; low dword = wood3:wood2
PUNPCKLDQ  MM6, MM5                       ; mm6 = wood3:wood2:wood1:wood0
MOVQ   [EDI-8], MM6                       ; store the colors over the turbulence values

DEC        ECX
JNZ        wood_loop

wood_lin_done:
EMMS                    ; Clear out the MMX registers and set appropriate flags.

MOV        EAX, [ESP + 12]
MOV        ECX, [ESP +  8]
MOV        EDX, [ESP +  4]
MOV        EDI, [ESP     ]

ADD        ESP, 16

RET                     ; end of function

_SIMD_Wood_Linear ENDP