Author: fireball
Date: Tue May 26 20:35:01 2009
New Revision: 41126
URL:
http://svn.reactos.org/svn/reactos?rev=41126&view=rev
Log:
- Make memcpy a duplicate of memmove (confirmed by testing); there is a dependency on
this behaviour. One less "msvcrt_winetest.exe string" failure.
Removed:
trunk/reactos/media/doc/memcpy_optimize.txt
Modified:
trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s
trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s
trunk/reactos/lib/sdk/crt/mem/memcpy.c
trunk/reactos/lib/sdk/crt/mem/memmove.c
Modified: trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/i386/memcp…
==============================================================================
--- trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s [iso-8859-1] (original)
+++ trunk/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s [iso-8859-1] Tue May 26 20:35:01 2009
@@ -1,7 +1,7 @@
/*
* void *memcpy (void *to, const void *from, size_t count)
*
- * Some optimization research can be found in media/doc/memcpy_optimize.txt
+ * NOTE: This code is a duplicate of memmove function from memmove_asm.s
*/
.globl _memcpy
@@ -9,26 +9,39 @@
_memcpy:
push %ebp
mov %esp,%ebp
+
push %esi
push %edi
- mov 0x8(%ebp),%edi
- mov 0xc(%ebp),%esi
- mov 0x10(%ebp),%ecx
+
+ mov 8(%ebp),%edi
+ mov 12(%ebp),%esi
+ mov 16(%ebp),%ecx
+
+ cmp %esi,%edi
+ jbe .CopyUp
+ mov %ecx,%eax
+ add %esi,%eax
+ cmp %eax,%edi
+ jb .CopyDown
+
+.CopyUp:
cld
+
cmp $16,%ecx
jb .L1
mov %ecx,%edx
test $3,%edi
je .L2
/*
- * Make the destination dword aligned
+ * Make the destination dword aligned
*/
- mov %edi,%ecx
- neg %ecx
- and $3,%ecx
- sub %ecx,%edx
- rep movsb
- mov %edx,%ecx
+ mov %edi,%ecx
+ and $3,%ecx
+ sub $5,%ecx
+ not %ecx
+ sub %ecx,%edx
+ rep movsb
+ mov %edx,%ecx
.L2:
shr $2,%ecx
rep movsl
@@ -39,9 +52,63 @@
je .L3
rep movsb
.L3:
+ mov 8(%ebp),%eax
pop %edi
pop %esi
- mov 0x8(%ebp),%eax
leave
ret
+.CopyDown:
+ std
+
+ add %ecx,%edi
+ add %ecx,%esi
+
+ cmp $16,%ecx
+ jb .L4
+ mov %ecx,%edx
+ test $3,%edi
+ je .L5
+
+/*
+ * Make the destination dword aligned
+ */
+ mov %edi,%ecx
+ and $3,%ecx
+ sub %ecx,%edx
+ dec %esi
+ dec %edi
+ rep movsb
+ mov %edx,%ecx
+
+ sub $3,%esi
+ sub $3,%edi
+.L6:
+ shr $2,%ecx
+ rep movsl
+ mov %edx,%ecx
+ and $3,%ecx
+ je .L7
+ add $3,%esi
+ add $3,%edi
+.L8:
+ rep movsb
+.L7:
+ cld
+ mov 8(%ebp),%eax
+ pop %edi
+ pop %esi
+ leave
+ ret
+.L5:
+ sub $4,%edi
+ sub $4,%esi
+ jmp .L6
+
+.L4:
+ test %ecx,%ecx
+ je .L7
+ dec %esi
+ dec %edi
+ jmp .L8
+
Modified: trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/i386/memmo…
==============================================================================
--- trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s [iso-8859-1] (original)
+++ trunk/reactos/lib/sdk/crt/mem/i386/memmove_asm.s [iso-8859-1] Tue May 26 20:35:01
2009
@@ -1,9 +1,7 @@
-/*
- * $Id$
- */
-
/*
* void *memmove (void *to, const void *from, size_t count)
+ *
+ * NOTE: This code is duplicated in memcpy_asm.s
*/
.globl _memmove
Modified: trunk/reactos/lib/sdk/crt/mem/memcpy.c
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/memcpy.c?r…
==============================================================================
--- trunk/reactos/lib/sdk/crt/mem/memcpy.c [iso-8859-1] (original)
+++ trunk/reactos/lib/sdk/crt/mem/memcpy.c [iso-8859-1] Tue May 26 20:35:01 2009
@@ -1,16 +1,36 @@
-/*
- * $Id$
- */
-
#include <string.h>
+/* NOTE: This code is a duplicate of memmove implementation! */
void* memcpy(void* to, const void* from, size_t count)
{
- register char *f = (char *)from;
- register char *t = (char *)to;
- register int i = count;
+ char *char_dest = (char *)dest;
+ char *char_src = (char *)src;
- while (i-- > 0)
- *t++ = *f++;
- return to;
+ if ((char_dest <= char_src) || (char_dest >= (char_src+count)))
+ {
+ /* non-overlapping buffers */
+ while(count > 0)
+ {
+ *char_dest = *char_src;
+ char_dest++;
+ char_src++;
+ count--;
+ }
+ }
+ else
+ {
+ /* overlaping buffers */
+ char_dest = (char *)dest + count - 1;
+ char_src = (char *)src + count - 1;
+
+ while(count > 0)
+ {
+ *char_dest = *char_src;
+ char_dest--;
+ char_src--;
+ count--;
+ }
+ }
+
+ return dest;
}
Modified: trunk/reactos/lib/sdk/crt/mem/memmove.c
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/lib/sdk/crt/mem/memmove.c?…
==============================================================================
--- trunk/reactos/lib/sdk/crt/mem/memmove.c [iso-8859-1] (original)
+++ trunk/reactos/lib/sdk/crt/mem/memmove.c [iso-8859-1] Tue May 26 20:35:01 2009
@@ -1,10 +1,6 @@
-/*
- * $Id$
- */
-
#include <string.h>
-
+/* NOTE: This code is duplicated in memcpy function */
void * memmove(void *dest,const void *src,size_t count)
{
char *char_dest = (char *)dest;
Removed: trunk/reactos/media/doc/memcpy_optimize.txt
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/media/doc/memcpy_optimize.…
==============================================================================
--- trunk/reactos/media/doc/memcpy_optimize.txt [iso-8859-1] (original)
+++ trunk/reactos/media/doc/memcpy_optimize.txt (removed)
@@ -1,55 +1,0 @@
-Surfing the Internet, I stumbled upon
http://www.sciencemark.org where you
-can download a benchmark program that (amongst others) can benchmark different
-x86 memcpy implementations. Running that benchmark on my machine revealed that
-the fastest implementation was roughly twice as fast as the "rep movsl"
-implementation (lib/string/i386/memcpy_asm.s) that ReactOS uses.
-To test the alternate implementations in a ReactOS setting, I first
-instrumented the existing memcpy implementation to log with which arguments
-it was being called. I then booted ReactOS, started a background compile in it
-(to generate some I/O) and played a game of Solitaire (to generate graphics
-operations). After losing the game, I shut down ReactOS. I then extracted
-the memcpy calls roughly between the start of Explorer (to get rid of one time
-startup effects) and shutdown. The resulting call profile is attached below.
-I then used that profile to make calls to the existing memcpy and an alternate
-implementation (I selected the "MMX registry copy with SSE prefetching"),
-taking care to use different source and destination regions to remove caching
-effects. The profile consisted of roughly 250000 calls to memcpy; I found
-that I had to execute the profile 10000 times to get "reasonable" time values.
-To compensate for the overhead of the test program, I also ran a test where
-the whole memcpy routine consisted of a single instruction: "ret". The test
-results, after applying a correction for the overhead:
-
-rep movsl 70.5 sec
-mmx registers 58.3 sec
-Speed increase: 17%
-
-(Test machine: AMD Athlon MP 2800+ running Linux).
-Although the relative speed increase is nice (17%), we also have to look at the
-absolute speed increase. Remember that the 70.5 sec for the "rep movsl" case
-was obtained by running the whole profile 10000 times. This means that all the
-memcpy's executed during the profiling run of ReactOS together took only
-0.00705 seconds. So the conclusion has to be that we're simply not spending
-a significant amount of time in memcpy (BTW, our memcpy implementation is
-shared between kernel and user mode, of the total of 250000 memcpy calls about
-90% were made from kernel mode and 10% from user mode), so optimizing memcpy
-(although possible) will not result in significantly better performance of
-ReactOS as a whole.
-Just for fun, I then used only the part of the profile where the memory area
-was larger than 128 bytes. The MMX implementation actually only runs for sizes
-over 128 bytes; for smaller sizes it defers to the "rep movsl"
implementation.
-According to the profile, the vast majority of memcpy calls is made with a
-size smaller than 128 bytes (96.8%).
-
-rep movsl 52.9 sec
-mmx registers 27.1 sec
-Speed increase 48%
-
-This is more or less in line with the results I got from the membench benchmark
-from
http://www.sciencemark.org.
-
-Final conclusion: Although optimizing memcpy is useful (and feasible) for
-transfer of large blocks, the usage pattern in ReactOS consists mostly of
-small blocks. The resulting absolute speed increase doesn't justify the
-increased code complexity.
-
-2005/12/03 GvG