user manual

Use MMX Instructions for Block Copies and Block Fills 117
22007E/0 - November 1999 AMD Athlon Processor x86 Code Optimization
AMD Athlon Processor Specific Code
The following example code, written for the inline assembler of
Microsoft Visual C, is suitable for moving/filling a quadword-aligned
block of data in the following situations:
- AMD Athlon processor specific code where the destination
  of the block copy is in non-cacheable memory space.
- AMD Athlon processor specific code where the destination
  of the block copy is in cacheable space, but no immediate
  re-use of the data at the destination is expected.
Example 2:
/* block copy (source and destination QWORD aligned) */
__asm {
mov eax, [src_ptr]
mov edx, [dst_ptr]
mov ecx, [blk_size]
shr ecx, 6
align 16
$xfer_nc:
prefetchnta [eax+256]
movq mm0, [eax]
add edx, 64
movq mm1, [eax+8]
add eax, 64
movq mm2, [eax-48]
movntq [edx-64], mm0
movq mm0, [eax-40]
movntq [edx-56], mm1
movq mm1, [eax-32]
movntq [edx-48], mm2
movq mm2, [eax-24]
movntq [edx-40], mm0
movq mm0, [eax-16]
movntq [edx-32], mm1
movq mm1, [eax-8]
movntq [edx-24], mm2
movntq [edx-16], mm0
dec ecx
movntq [edx-8], mm1
jnz $xfer_nc
femms
sfence
}