User's Manual

116 Use MMX Instructions for Block Copies and Block Fills
AMD Athlon Processor x86 Code Optimization
22007E/0—November 1999
$xfer:
movq mm0, [eax]
add edx, 64
movq mm1, [eax+8]
add eax, 64
movq mm2, [eax-48]
movq [edx-64], mm0
movq mm0, [eax-40]
movq [edx-56], mm1
movq mm1, [eax-32]
movq [edx-48], mm2
movq mm2, [eax-24]
movq [edx-40], mm0
movq mm0, [eax-16]
movq [edx-32], mm1
movq mm1, [eax-8]
movq [edx-24], mm2
movq [edx-16], mm0
dec ecx
movq [edx-8], mm1
jnz $xfer
femms
}
/* block fill (destination QWORD aligned)
 *
 * Writes the 8-byte pattern read from fill_data to memory starting at
 * dst_ptr, 64 bytes (eight QWORD stores) per loop iteration.
 * The iteration count is blk_size / 64, so a trailing remainder of
 * blk_size % 64 bytes is NOT filled by this loop.
 * Uses edx, ecx, mm0 and the flags.
 */
__asm {
    mov   edx, [dst_ptr]        /* edx = current destination address */
    mov   ecx, [blk_size]
    shr   ecx, 6                /* ecx = number of 64-byte chunks; ZF set if zero */
    jz    $fill_done            /* blk_size < 64: skip the loop, otherwise dec would
                                   wrap ecx to 0xFFFFFFFF and the loop would run away */
    movq  mm0, [fill_data]      /* mm0 = 8-byte fill pattern */
    align 16
$fill:
    movq  [edx], mm0
    movq  [edx+8], mm0
    movq  [edx+16], mm0
    movq  [edx+24], mm0
    movq  [edx+32], mm0
    movq  [edx+40], mm0
    add   edx, 64               /* advance now; the last two stores use negative offsets */
    movq  [edx-16], mm0
    dec   ecx                   /* sets ZF for the jnz below; movq does not touch flags */
    movq  [edx-8], mm0          /* must be movq: integer mov cannot take an MMX register */
    jnz   $fill
$fill_done:
    femms                       /* fast exit from MMX state (3DNow! instruction) */
}