User's Manual

120 Optimized Matrix Multiplication
AMD Athlon Processor x86 Code Optimization
22007E/0 - November 1999
/* Function XForm performs a fully generalized 3D transform on an array
of vertices pointed to by "v" and stores the transformed vertices in
the location pointed to by "res". Each vertex consists of four floats.
The 4x4 transform matrix is pointed to by "m". The matrix elements are
also floats. The argument "numverts" indicates how many vertices have
to be transformed. The computation performed for each vertex is:
res->x = v->x*m[0][0] + v->y*m[1][0] + v->z*m[2][0] + v->w*m[3][0]
res->y = v->x*m[0][1] + v->y*m[1][1] + v->z*m[2][1] + v->w*m[3][1]
res->z = v->x*m[0][2] + v->y*m[1][2] + v->z*m[2][2] + v->w*m[3][2]
res->w = v->x*m[0][3] + v->y*m[1][3] + v->z*m[2][3] + v->w*m[3][3]
*/
/* Byte offsets of the elements of the 4x4 float transform matrix.
   Mrc is the offset of element m[r][c] from the start of the matrix:
   the values advance by 4 bytes per column and 16 bytes per row, i.e.
   offset = (r*4 + c) * 4, which matches a row-major array of 4-byte
   floats.
   NOTE(review): presumably used as displacements from the matrix
   pointer inside XForm's inline assembly; the uses are not visible in
   this excerpt - confirm against the full listing. */
/* Row 0: m[0][0] .. m[0][3] */
#define M00 0
#define M01 4
#define M02 8
#define M03 12
/* Row 1: m[1][0] .. m[1][3] */
#define M10 16
#define M11 20
#define M12 24
#define M13 28
/* Row 2: m[2][0] .. m[2][3] */
#define M20 32
#define M21 36
#define M22 40
#define M23 44
/* Row 3: m[3][0] .. m[3][3] */
#define M30 48
#define M31 52
#define M32 56
#define M33 60
void XForm (float *res, const float *v, const float *m, int numverts)
{
_asm {
MOV EDX, [V] ;EDX = source vector ptr
MOV EAX, [M] ;EAX = matrix ptr
MOV EBX, [RES] ;EBX = destination vector ptr
MOV ECX, [NUMVERTS] ;ECX = number of vertices to transform
;3DNow! version of fully general 3D vertex transformation.
;Optimal for AMD Athlon (completes in 16 cycles)
FEMMS ;clear MMX state
ALIGN 16 ;for optimal branch alignment