copymemory
Publié : lun. 27/déc./2004 12:08
J'ai récupéré ça sur le forum anglais (optimisé pour AMD, mais ça semble marcher sur pas mal de processeurs Intel).
Ça semble marcher bien plus vite que la fonction de PB, mais ça ne marche pas chez moi (Centrino 1,7 GHz).
Et pour vous?
Ça semble marcher bien plus vite que la fonction de PB, mais ça ne marche pas chez moi (Centrino 1,7 GHz).
Et pour vous?
Code : Tout sélectionner
; -----------------------------------------------------------------------
; CopyMemoryAMD(*src, *dst, size)
;
; Copies `size` bytes from *src to *dst with MMX loads/stores.
; Target: 32-bit x86 only (pointers are held in esi/edi); the CPU must
; support MMX plus prefetchnta / movntq / sfence.
;
; Strategy, chosen from the number of 64-byte blocks to copy:
;   < #TINY_BLOCK_COPY bytes          : unrolled movsd + rep movsb only
;   < #IN_CACHE_COPYBIG blocks        : movq load / movq store
;   < #UNCACHED_COPYPREFETCH blocks   : prefetchnta + movntq stores
;   otherwise                         : block prefetch of #CACHEBLOCK
;                                       blocks, then movntq stores
;
; Register roles inside the routine:
;   esi = source pointer, edi = destination pointer
;   ebx = bytes still to copy (its low bits drive the tail copies)
;   ecx = scratch / 64-byte block counter, eax/edx = scratch
;
; ebx, esi and edi are saved on entry and restored on exit; eax, ecx,
; edx, mm0-mm7 and the flags are clobbered.
; `size` is treated as an unsigned byte count. The copy runs forward
; (CLD), so overlapping buffers are not supported.
; -----------------------------------------------------------------------
Procedure CopyMemoryAMD(*src, *dst, size)
  #CACHEBLOCK = $80                               ; 64-byte blocks per block-prefetch chunk
  #CACHEBLOCKPREFETCH = #CACHEBLOCK/2             ; prefetch loop is unrolled 2x
  #CACHEBLOCKTOP = #CACHEBLOCK*64                 ; chunk size in bytes
  #UNCACHED_COPY = 197*1024                       ; bytes: above this use block prefetch
  #UNCACHED_COPYPREFETCH = #UNCACHED_COPY/64      ; same threshold in 64-byte blocks
  #TINY_BLOCK_COPY = 64                           ; bytes: below this skip the MMX paths
  #IN_CACHE_COPY = 64*1024                        ; bytes: above this bypass the cache
  #IN_CACHE_COPYBIG = #IN_CACHE_COPY/64           ; same threshold in 64-byte blocks
  len = size                                      ; BYTE count: every threshold and shift below works in bytes
  ; Read the arguments into the volatile registers first. This is done
  ; before the PUSHes in case procedure variables are addressed relative
  ; to esp, which the PUSHes would shift.
  MOV eax, *src                                   ; source array
  MOV edx, *dst                                   ; destination array
  MOV ecx, len                                    ; number of bytes to copy
  PUSH ebx                                        ; preserve the non-volatile registers we use
  PUSH esi
  PUSH edi
  MOV esi, eax
  MOV edi, edx
  MOV ebx, ecx                                    ; keep a copy of count
  CLD
  CMP ecx, #TINY_BLOCK_COPY
  JB l_memcpy_ic_3                                ; tiny? skip mmx copy
  CMP ecx, 32*1024                                ; don't align between 32k-64k because
  JBE l_memcpy_do_align                           ; it appears to be slower
  CMP ecx, 64*1024
  JBE l_memcpy_align_done
memcpy_do_align:
  MOV ecx, 8                                      ; a trick that's faster than rep movsb...
  SUB ecx, edi                                    ; align destination to qword
  And ecx, 7                                      ; %111 ; 0-7 bytes needed to reach alignment
  SUB ebx, ecx                                    ; update copy count
  NEG ecx                                         ; set up to jump into the array
  ADD ecx, l_memcpy_align_done                    ; each movsb is one byte long, so label-n runs n of them
  JMP ecx                                         ; jump to array of movsb's
  ;align 4
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
memcpy_align_done:                                ; destination is qword aligned (when the align step ran)
  MOV ecx, ebx                                    ; number of bytes left to copy
  SHR ecx, 6                                      ; get 64-byte block count
  JZ l_memcpy_ic_2                                ; finish the last few bytes
  CMP ecx, #IN_CACHE_COPYBIG                      ; too big 4 cache? use uncached copy
  JAE l_memcpy_uc_test
  ; !align 16
memcpy_ic_1:                                      ; 64-byte block copies, in-cache copy
  !prefetchnta [esi+(200*64/34+192)]              ; start reading ahead
  !movq mm0, [esi+0]                              ; read 64 bits
  !movq mm1, [esi+8]
  !movq [edi+0], mm0                              ; write 64 bits
  !movq [edi+8], mm1                              ; note: the normal movq writes the
  !movq mm2, [esi+16]                             ; data to cache; a cache line will be
  !movq mm3, [esi+24]                             ; allocated as needed, to store the data
  !movq [edi+16], mm2
  !movq [edi+24], mm3
  !movq mm0, [esi+32]
  !movq mm1, [esi+40]
  !movq [edi+32], mm0
  !movq [edi+40], mm1
  !movq mm2, [esi+48]
  !movq mm3, [esi+56]
  !movq [edi+48], mm2
  !movq [edi+56], mm3
  ADD esi, 64                                     ; update source pointer
  ADD edi, 64                                     ; update destination pointer
  DEC ecx                                         ; count down
  JNZ l_memcpy_ic_1                               ; last 64-byte block?
memcpy_ic_2:
  MOV ecx, ebx                                    ; has valid low 6 bits of the byte count
memcpy_ic_3:
  SHR ecx, 2                                      ; dword count
  And ecx, 15                                     ; %1111 ; only the 0-15 remainder dwords: the table below has 16 entries
  NEG ecx                                         ; set up to jump into the array
  ADD ecx, l_memcpy_last_few                      ; each movsd is one byte long, so label-n runs n of them
  JMP ecx                                         ; jump to array of movsd's
memcpy_uc_test:
  CMP ecx, #UNCACHED_COPYPREFETCH                 ; big enough? use block prefetch copy
  JAE l_memcpy_bp_1
memcpy_64_test:
  Or ecx, ecx                                     ; tail end of block prefetch will jump here
  JZ l_memcpy_ic_2                                ; no more 64-byte blocks left
memcpy_uc_1:                                      ; 64-byte blocks, uncached copy
  !prefetchnta [esi+(200*64/34+192)]              ; start reading ahead
  !movq mm0, [esi+0]                              ; read 64 bits
  ADD edi, 64                                     ; update destination pointer
  !movq mm1, [esi+8]
  ADD esi, 64                                     ; update source pointer
  !movq mm2, [esi-48]
  !movntq [edi-64], mm0                           ; write 64 bits, bypassing the cache
  !movq mm0, [esi-40]                             ; note: movntq also prevents the CPU
  !movntq [edi-56], mm1                           ; from READING the destination address
  !movq mm1, [esi-32]                             ; into the cache, only to be over-written
  !movntq [edi-48], mm2                           ; so that also helps performance
  !movq mm2, [esi-24]
  !movntq [edi-40], mm0
  !movq mm0, [esi-16]
  !movntq [edi-32], mm1
  !movq mm1, [esi-8]
  !movntq [edi-24], mm2
  !movntq [edi-16], mm0
  DEC ecx
  !movntq [edi-8], mm1                            ; movntq leaves the flags from DEC intact
  JNZ l_memcpy_uc_1                               ; last 64-byte block?
  JMP l_memcpy_ic_2                               ; almost done
memcpy_bp_1:                                      ; large blocks, block prefetch copy
  CMP ecx, #CACHEBLOCK                            ; big enough to run another prefetch loop?
  JL l_memcpy_64_test                             ; no, back to regular uncached copy
  MOV eax, #CACHEBLOCKPREFETCH                    ; block prefetch loop, unrolled 2X
  ADD esi, #CACHEBLOCKTOP                         ; move to the top of the block
  ; !align 16
memcpy_bp_2:
  MOV edx, [esi-64]                               ; grab one address per cache line
  MOV edx, [esi-128]                              ; grab one address per cache line
  SUB esi, 128                                    ; go reverse order
  DEC eax                                         ; count down the cache lines
  JNZ l_memcpy_bp_2                               ; keep grabbing more lines into cache
  MOV eax, #CACHEBLOCK                            ; now that it's in cache, do the copy
  ; !align 16
memcpy_bp_3:
  !movq mm0, [esi]                                ; read 64 bits
  !movq mm1, [esi+ 8]
  !movq mm2, [esi+16]
  !movq mm3, [esi+24]
  !movq mm4, [esi+32]
  !movq mm5, [esi+40]
  !movq mm6, [esi+48]
  !movq mm7, [esi+56]
  ADD esi, 64                                     ; update source pointer
  !movntq [edi], mm0                              ; write 64 bits, bypassing cache
  !movntq [edi+ 8], mm1                           ; note: movntq also prevents the CPU
  !movntq [edi+16], mm2                           ; from READING the destination address
  !movntq [edi+24], mm3                           ; into the cache, only to be over-written,
  !movntq [edi+32], mm4                           ; so that also helps performance
  !movntq [edi+40], mm5
  !movntq [edi+48], mm6
  !movntq [edi+56], mm7
  ADD edi, 64                                     ; update dest pointer
  DEC eax                                         ; count down
  JNZ l_memcpy_bp_3                               ; keep copying
  SUB ecx, #CACHEBLOCK                            ; update the 64-byte block count
  JMP l_memcpy_bp_1                               ; keep processing chunks
  ; The smallest copy uses the X86 "movsd" instruction, in an optimized
  ; form which is an "unrolled loop". Then it handles the last few bytes.
  ; Exactly 16 one-byte movsd instructions must sit directly in front of
  ; memcpy_last_few: the computed jump above lands 0-15 bytes before it.
  ; !align 4
  !movsd
  !movsd                                          ; perform last 1-15 dword copies
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd                                          ; perform last 1-7 dword copies
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
memcpy_last_few:                                  ; dword aligned from before movsd's
  MOV ecx, ebx                                    ; has valid low 2 bits of the byte count
  And ecx, 3                                      ; %11 ; the last few cows must come home
  JZ l_memcpy_final                               ; no more, let's leave
  REP movsb                                       ; the last 1, 2, or 3 bytes
memcpy_final:                                     ; every path ends here, so the POPs always balance the PUSHes
  !emms                                           ; clean up the state
  !sfence                                         ; flush the write buffer
  POP edi                                         ; restore the registers saved on entry
  POP esi
  POP ebx
EndProcedure
; Benchmark: time 10000 copies of a 1 MB buffer and of a 100 KB buffer with
; CopyMemoryAMD and with the built-in CopyMemory, then show the elapsed
; milliseconds of each run in a message box.
source.l=AllocateMemory(1024000)
destination.l=AllocateMemory(1024000)
; Fill the whole source buffer with random data: offsets 0 .. size-1.
For a=0 To 1024000-1
PokeB(source+a,Random(15))
Next
time.l=ElapsedMilliseconds()
For a=1 To 10000
CopyMemoryAMD(source,destination,1024000)
Next
onek_AMD.s=Str((ElapsedMilliseconds()-time))
time.l=ElapsedMilliseconds()
For a=1 To 10000
CopyMemory(source,destination,1024000)
Next
onek_PB.s=Str((ElapsedMilliseconds()-time))
; Release the 1 MB buffers before the variables are reused for the 100 KB test.
FreeMemory(source)
FreeMemory(destination)
source.l=AllocateMemory(102400)
destination.l=AllocateMemory(102400)
; Fill the whole source buffer with random data: offsets 0 .. size-1.
For a=0 To 102400-1
PokeB(source+a,Random(15))
Next
time.l=ElapsedMilliseconds()
For a=1 To 10000
CopyMemoryAMD(source,destination,102400)
Next
hundredk_AMD.s=Str((ElapsedMilliseconds()-time))
time.l=ElapsedMilliseconds()
For a=1 To 10000
CopyMemory(source,destination,102400)
Next
hundredk_PB.s=Str((ElapsedMilliseconds()-time))
FreeMemory(source)
FreeMemory(destination)
; Build the report shown to the user.
results.s="--- 1 MB tranfer test ---"+#LFCR$
results.s+"AMD Function : "+ onek_AMD +#LFCR$
results.s+"Pure Function : "+ onek_PB +#LFCR$
results.s+#LFCR$
results.s+"--- 100kb tranfer test ---"+#LFCR$
results.s+"AMD Function : "+ hundredk_AMD +#LFCR$
results.s+"Pure Function : "+ hundredk_PB +#LFCR$
MessageRequester("Test Results",results.s)