Author: fireball Date: Tue May 26 20:35:01 2009 New Revision: 41126
URL: http://svn.reactos.org/svn/reactos?rev=41126&view=rev Log: - Make memcpy a duplicate of a memmove (confirmed by testing), there is a dependency on this behaviour. One less "msvcrt_winetest.exe string" failure.
Removed: trunk/reactos/media/doc/memcpy_optimize.txt Modified: trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s trunk/reactos/lib/sdk/crt/mem/memcpy.c trunk/reactos/lib/sdk/crt/mem/memmove.c
Modified: trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s URL: http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/i386/memcpy... ============================================================================== --- trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s [iso-8859-1] (original) +++ trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s [iso-8859-1] Tue May 26 20:35:01 2009 @@ -1,7 +1,7 @@ /* * void *memcpy (void *to, const void *from, size_t count) * - * Some optimization research can be found in media/doc/memcpy_optimize.txt + * NOTE: This code is a duplicate of memmove function from memmove_asm.s */
.globl _memcpy @@ -9,26 +9,39 @@ _memcpy: push %ebp mov %esp,%ebp + push %esi push %edi - mov 0x8(%ebp),%edi - mov 0xc(%ebp),%esi - mov 0x10(%ebp),%ecx + + mov 8(%ebp),%edi + mov 12(%ebp),%esi + mov 16(%ebp),%ecx + + cmp %esi,%edi + jbe .CopyUp + mov %ecx,%eax + add %esi,%eax + cmp %eax,%edi + jb .CopyDown + +.CopyUp: cld + cmp $16,%ecx jb .L1 mov %ecx,%edx test $3,%edi je .L2 /* - * Make the destination dword aligned + * Make the destination dword aligned */ - mov %edi,%ecx - neg %ecx - and $3,%ecx - sub %ecx,%edx - rep movsb - mov %edx,%ecx + mov %edi,%ecx + and $3,%ecx + sub $5,%ecx + not %ecx + sub %ecx,%edx + rep movsb + mov %edx,%ecx .L2: shr $2,%ecx rep movsl @@ -39,9 +52,63 @@ je .L3 rep movsb .L3: + mov 8(%ebp),%eax pop %edi pop %esi - mov 0x8(%ebp),%eax leave ret
+.CopyDown: + std + + add %ecx,%edi + add %ecx,%esi + + cmp $16,%ecx + jb .L4 + mov %ecx,%edx + test $3,%edi + je .L5 + +/* + * Make the destination dword aligned + */ + mov %edi,%ecx + and $3,%ecx + sub %ecx,%edx + dec %esi + dec %edi + rep movsb + mov %edx,%ecx + + sub $3,%esi + sub $3,%edi +.L6: + shr $2,%ecx + rep movsl + mov %edx,%ecx + and $3,%ecx + je .L7 + add $3,%esi + add $3,%edi +.L8: + rep movsb +.L7: + cld + mov 8(%ebp),%eax + pop %edi + pop %esi + leave + ret +.L5: + sub $4,%edi + sub $4,%esi + jmp .L6 + +.L4: + test %ecx,%ecx + je .L7 + dec %esi + dec %edi + jmp .L8 +
Modified: trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s URL: http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/i386/memmov... ============================================================================== --- trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s [iso-8859-1] (original) +++ trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s [iso-8859-1] Tue May 26 20:35:01 2009 @@ -1,9 +1,7 @@ -/* - * $Id$ - */ - /* * void *memmove (void *to, const void *from, size_t count) + * + * NOTE: This code is duplicated in memcpy_asm.s */
.globl _memmove
Modified: trunk/reactos/lib/sdk/crt/mem/memcpy.c URL: http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/memcpy.c?re... ============================================================================== --- trunk/reactos/lib/sdk/crt/mem/memcpy.c [iso-8859-1] (original) +++ trunk/reactos/lib/sdk/crt/mem/memcpy.c [iso-8859-1] Tue May 26 20:35:01 2009 @@ -1,16 +1,36 @@ -/* - * $Id$ - */ - #include <string.h>
+/* NOTE: This code is a duplicate of memmove implementation! */ void* memcpy(void* to, const void* from, size_t count) { - register char *f = (char *)from; - register char *t = (char *)to; - register int i = count; + char *char_dest = (char *)dest; + char *char_src = (char *)src;
- while (i-- > 0) - *t++ = *f++; - return to; + if ((char_dest <= char_src) || (char_dest >= (char_src+count))) + { + /* non-overlapping buffers */ + while(count > 0) + { + *char_dest = *char_src; + char_dest++; + char_src++; + count--; + } + } + else + { + /* overlaping buffers */ + char_dest = (char *)dest + count - 1; + char_src = (char *)src + count - 1; + + while(count > 0) + { + *char_dest = *char_src; + char_dest--; + char_src--; + count--; + } + } + + return dest; }
Modified: trunk/reactos/lib/sdk/crt/mem/memmove.c URL: http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/memmove.c?r... ============================================================================== --- trunk/reactos/lib/sdk/crt/mem/memmove.c [iso-8859-1] (original) +++ trunk/reactos/lib/sdk/crt/mem/memmove.c [iso-8859-1] Tue May 26 20:35:01 2009 @@ -1,10 +1,6 @@ -/* - * $Id$ - */ - #include <string.h>
- +/* NOTE: This code is duplicated in memcpy function */ void * memmove(void *dest,const void *src,size_t count) { char *char_dest = (char *)dest;
Removed: trunk/reactos/media/doc/memcpy_optimize.txt URL: http://svn.reactos.org/svn/reactos/trunk/reactos/media/doc/memcpy_optimize.t... ============================================================================== --- trunk/reactos/media/doc/memcpy_optimize.txt [iso-8859-1] (original) +++ trunk/reactos/media/doc/memcpy_optimize.txt (removed) @@ -1,55 +1,0 @@ -Surfing the Internet, I stumbled upon http://www.sciencemark.org where you -can download a benchmark program that (amongst others) can benchmark different -x86 memcpy implementations. Running that benchmark on my machine revealed that -the fastest implementation was roughly twice as fast as the "rep movsl" -implementation (lib/string/i386/memcpy_asm.s) that ReactOS uses. -To test the alternate implementations in a ReactOS setting, I first -instrumented the existing memcpy implementation to log with which arguments -it was being called. I then booted ReactOS, started a background compile in it -(to generate some I/O) and played a game of Solitaire (to generate graphics -operations). After losing the game, I shut down ReactOS. I then extracted -the memcpy calls roughly between the start of Explorer (to get rid of one time -startup effects) and shutdown. The resulting call profile is attached below. -I then used that profile to make calls to the existing memcpy and an alternate -implementation (I selected the "MMX registry copy with SSE prefetching"), -taking care to use different source and destination regions to remove caching -effects. The profile consisted of roughly 250000 calls to memcpy, I found -that I had to execute the profile 10000 times to get "reasonable" time values. -To compensate for the overhead of the test program, I also ran a test where -the whole memcpy routine consisted of a single instruction: "ret". The test -results, after applying a correction for the overhead: - -rep movl 70.5 sec -mmx registers 58.3 sec -Speed increase: 17% - -(Test machine: AMD Athlon MP 2800+ running Linux). 
-Although the relative speed increase is nice (17%), we also have to look at the -absolute speed increase. Remember that the 70.5 sec for the "rep movl" case -was obtained by running the whole profile 10000 times. This means that all the -memcpy's executed during the profiling run of ReactOS together took only -0.00705 seconds. So the conclusion has to be that we're simply not spending -a significant amount of time in memcpy (BTW, our memcpy implementation is -shared between kernel and user mode, of the total of 250000 memcpy calls about -90% were made from kernel mode and 10% from user mode), so optimizing memcpy -(although possible) will not result in a significantly better performance of -ReactOS as a whole. -Just for fun, I then used only the part of the profile where the memory area -was larger than 128 bytes. The MMX implementation actually only runs for sizes -over 128 bytes, for smaller sizes it deferred to the "rep movl" implementation. -According to the profile, the vast majority of memcpy calls is made with a -size smaller than 128 bytes (96.8%). - -rep movl 52.9 sec -mmx registers 27.1 sec -Speed increase 48% - -This is more or less in line with the results I got from the membench benchmark -from http://www.sciencemark.org. - -Final conclusion: Although optimizing memcpy is useful (and feasible) for -transfer of large blocks, the usage pattern in ReactOS consists mostly of -small blocks. The resulting absolute speed increase doesn't justify the -increased code complexity. - -2005/12/03 GvG