User's Manual

48 Use the 3DNow! PREFETCH and PREFETCHW Instructions
AMD Athlon Processor x86 Code Optimization
22007E/0 - November 1999
;-----------------------------------------------------------------------
; Element-wise product of two arrays of doubles, unrolled 8x, with
; 3DNow! software prefetch:
;
;       a[i] = b[i] * c[i]
;
; Register roles (all live for the whole loop):
;       ECX = biased index, counts up from -LARGE_NUM towards 0
;       EAX = base address of array_a (destination)
;       EDX = base address of array_b (source)
;       EBX = base address of array_c (source)
;
; The array_c base is kept in EBX, not ECX: ECX already holds the
; biased index, and loading the address into it would destroy the
; index used by every [base+ECX*8+...] operand and by the loop test.
; NOTE(review): EBX is callee-saved in the usual 32-bit calling
; conventions -- save/restore it if this fragment sits in a procedure.
;
; Assumptions about symbols defined outside this fragment (confirm):
;       ARR_SIZE  = LARGE_NUM * 8, so base+ECX*8+ARR_SIZE is element i
;       LARGE_NUM is a nonzero multiple of 8; the loop only terminates
;                 when ADD ECX, 8 lands exactly on zero
;-----------------------------------------------------------------------
CACHELINE       EQU     64              ;cache-line size in bytes (64 on
                                        ; the AMD Athlon -- adjust if
                                        ; targeting another processor)
PREFETCH_AHEAD  EQU     2*CACHELINE     ;prefetch distance: two lines

        MOV     ECX, (-LARGE_NUM)       ;biased index, runs up to 0
        MOV     EAX, OFFSET array_a     ;EAX = &array_a[0]
        MOV     EDX, OFFSET array_b     ;EDX = &array_b[0]
        MOV     EBX, OFFSET array_c     ;EBX = &array_c[0]

$loop:
        ; Each iteration consumes 8 doubles = 64 bytes from every array.
        ; The prefetch addresses include the index so they advance with
        ; the loop instead of touching the same line on every pass.
        PREFETCHW [EAX+ECX*8+ARR_SIZE+PREFETCH_AHEAD] ;a[]: will be written
        PREFETCH  [EDX+ECX*8+ARR_SIZE+PREFETCH_AHEAD] ;b[]: read only
        PREFETCH  [EBX+ECX*8+ARR_SIZE+PREFETCH_AHEAD] ;c[]: read only

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE]      ;b[i]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE]      ;b[i]*c[i]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE]      ;a[i] = b[i]*c[i]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+8]    ;b[i+1]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+8]    ;b[i+1]*c[i+1]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+8]    ;a[i+1] = b[i+1]*c[i+1]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+16]   ;b[i+2]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+16]   ;b[i+2]*c[i+2]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+16]   ;a[i+2] = b[i+2]*c[i+2]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+24]   ;b[i+3]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+24]   ;b[i+3]*c[i+3]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+24]   ;a[i+3] = b[i+3]*c[i+3]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+32]   ;b[i+4]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+32]   ;b[i+4]*c[i+4]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+32]   ;a[i+4] = b[i+4]*c[i+4]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+40]   ;b[i+5]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+40]   ;b[i+5]*c[i+5]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+40]   ;a[i+5] = b[i+5]*c[i+5]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+48]   ;b[i+6]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+48]   ;b[i+6]*c[i+6]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+48]   ;a[i+6] = b[i+6]*c[i+6]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+56]   ;b[i+7]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+56]   ;b[i+7]*c[i+7]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+56]   ;a[i+7] = b[i+7]*c[i+7]

        ADD     ECX, 8                  ;advance index by the 8 products done
        JNZ     $loop                   ;index reaches 0 => all elements done
END