User's Manual

Optimized Matrix Multiplication 121
22007E/0 - November 1999 AMD Athlon Processor x86 Code Optimization
;-----------------------------------------------------------------------
; Vertex transform loop (3DNow!): for each 4-float vector v, compute
;   res->x = v->x*m[0][0] + v->y*m[1][0] + v->z*m[2][0] + v->w*m[3][0]
; and likewise for y, z, w using matrix columns 1, 2, 3.
;
; Register roles, as indicated by the comments in this loop (the setup
; code is outside this excerpt -- confirm against it):
;   EAX = matrix base; M00..M32 presumably are byte offsets of element
;         pairs, defined elsewhere
;   EDX = v   (source vector, advanced 16 bytes per iteration)
;   EBX = res (destination vector, advanced 16 bytes per iteration)
;   ECX = numverts (remaining iteration count)
;   MM0-MM7 are all overwritten.
;
; Comment notation: "hi | lo" are the upper and lower 32-bit halves of
; the 64-bit MMX register.
;
; NOTE(review): loads, multiplies and adds are interleaved, presumably
; for scheduling -- do not reorder without measuring.
; NOTE(review): the loop tests ECX only after the body has run, so it
; appears to assume numverts > 0 on entry -- confirm at the caller.
;-----------------------------------------------------------------------
$$xform:
ADD EBX, 16 ;res++ (advanced early; the stores below use [EBX-16]/[EBX-8])
MOVQ MM0, QWORD PTR [EDX] ;v->y | v->x
MOVQ MM1, QWORD PTR [EDX+8] ;v->w | v->z
ADD EDX, 16 ;v++
MOVQ MM2, MM0 ;v->y | v->x (copy, so x and y can be broadcast separately)
MOVQ MM3, QWORD PTR [EAX+M00] ;m[0][1] | m[0][0]
PUNPCKLDQ MM0, MM0 ;v->x | v->x (broadcast low half)
MOVQ MM4, QWORD PTR [EAX+M10] ;m[1][1] | m[1][0]
PFMUL MM3, MM0 ;v->x*m[0][1] | v->x*m[0][0]
PUNPCKHDQ MM2, MM2 ;v->y | v->y (broadcast high half)
PFMUL MM4, MM2 ;v->y*m[1][1] | v->y*m[1][0]
MOVQ MM5, QWORD PTR [EAX+M02] ;m[0][3] | m[0][2]
MOVQ MM7, QWORD PTR [EAX+M12] ;m[1][3] | m[1][2]
MOVQ MM6, MM1 ;v->w | v->z (copy, so z and w can be broadcast separately)
PFMUL MM5, MM0 ;v->x*m[0][3] | v->x*m[0][2]
MOVQ MM0, QWORD PTR [EAX+M20] ;m[2][1] | m[2][0] (MM0 reused; v->x no longer needed)
PUNPCKLDQ MM1, MM1 ;v->z | v->z (broadcast low half)
PFMUL MM7, MM2 ;v->y*m[1][3] | v->y*m[1][2]
MOVQ MM2, QWORD PTR [EAX+M22] ;m[2][3] | m[2][2] (MM2 reused; v->y no longer needed)
PFMUL MM0, MM1 ;v->z*m[2][1] | v->z*m[2][0]
PFADD MM3, MM4 ;v->x*m[0][1]+v->y*m[1][1] |
; v->x*m[0][0]+v->y*m[1][0]
MOVQ MM4, QWORD PTR [EAX+M30] ;m[3][1] | m[3][0]
PFMUL MM2, MM1 ;v->z*m[2][3] | v->z*m[2][2]
PFADD MM5, MM7 ;v->x*m[0][3]+v->y*m[1][3] |
; v->x*m[0][2]+v->y*m[1][2]
MOVQ MM1, QWORD PTR [EAX+M32] ;m[3][3] | m[3][2] (MM1 reused; v->z no longer needed)
PUNPCKHDQ MM6, MM6 ;v->w | v->w (broadcast high half)
PFADD MM3, MM0 ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1] |
; v->x*m[0][0]+v->y*m[1][0]+v->z*m[2][0]
PFMUL MM4, MM6 ;v->w*m[3][1] | v->w*m[3][0]
PFMUL MM1, MM6 ;v->w*m[3][3] | v->w*m[3][2]
PFADD MM5, MM2 ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3] |
; v->x*m[0][2]+v->y*m[1][2]+v->z*m[2][2]
PFADD MM3, MM4 ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1]+
; v->w*m[3][1] | v->x*m[0][0]+v->y*m[1][0]+
; v->z*m[2][0]+v->w*m[3][0]
MOVQ [EBX-16], MM3 ;store res->y | res->x (EBX already points past this result)
PFADD MM5, MM1 ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3]+
; v->w*m[3][3] | v->x*m[0][2]+v->y*m[1][2]+
; v->z*m[2][2]+v->w*m[3][2]
MOVQ [EBX-8], MM5 ;store res->w | res->z
DEC ECX ;numverts--
; NOTE(review): the label is defined as $$xform but referenced as $$XFORM;
; this relies on the assembler treating labels case-insensitively -- confirm.
JNZ $$XFORM ;until numverts == 0
FEMMS ;clear MMX state
}
}