Speed up 24bpp color fill for both small and large fills
Modified: trunk/reactos/subsys/win32k/dib/dib24bpp.c

Modified: trunk/reactos/subsys/win32k/dib/dib24bpp.c
--- trunk/reactos/subsys/win32k/dib/dib24bpp.c	2005-06-11 12:13:28 UTC (rev 15858)
+++ trunk/reactos/subsys/win32k/dib/dib24bpp.c	2005-06-11 13:36:00 UTC (rev 15859)
@@ -392,11 +392,97 @@
 {
   ULONG DestY;	
 
+#ifdef _M_IX86
+  PBYTE xaddr = DestSurface->pvScan0 + DestRect->top * DestSurface->lDelta + (DestRect->left << 1) + DestRect->left;
+  PBYTE addr;
+  ULONG Count;
+  ULONG xCount=DestRect->right - DestRect->left;
+
   for (DestY = DestRect->top; DestY< DestRect->bottom; DestY++)
+  {
+    Count = xCount;
+    addr = xaddr;    
+    xaddr = (PBYTE)((ULONG_PTR)addr + DestSurface->lDelta);
+
+    if (Count < 8)
+    {
+      /* For small fills, don't bother doing anything fancy */
+      while (Count--)
+        {
+          *(PUSHORT)(addr) = color;
+          addr += 2;
+          *(addr) = color >> 16;
+          addr += 1;
+        }
+    }
+  else
+    {
+      /* Align to 4-byte address */
+      while (0 != ((ULONG_PTR) addr & 0x3))
+        {
+          *(PUSHORT)(addr) = color;
+          addr += 2;
+          *(addr) = color >> 16;
+          addr += 1;
+          Count--;
+        }
+      /* If the color we need to fill with is 0ABC, then the final mem pattern
+       * (note little-endianness) would be:
+       *
+       * |C.B.A|C.B.A|C.B.A|C.B.A|   <- pixel borders
+       * |C.B.A.C|B.A.C.B|A.C.B.A|   <- ULONG borders
+       *
+       * So, taking endianness into account again, we need to fill with these
+       * ULONGs: CABC BCAB ABCA */
+
+       /* This is about 30% faster than the generic C code below */
+       __asm__ __volatile__ (
+"      movl %1, %%ecx\n"
+"      andl $0xffffff, %%ecx\n"         /* 0ABC */
+"      movl %%ecx, %%ebx\n"             /* Construct BCAB in ebx */
+"      shrl $8, %%ebx\n"
+"      movl %%ecx, %%eax\n"
+"      shll $16, %%eax\n"
+"      orl  %%eax, %%ebx\n"
+"      movl %%ecx, %%edx\n"             /* Construct ABCA in edx */
+"      shll $8, %%edx\n"
+"      movl %%ecx, %%eax\n"
+"      shrl $16, %%eax\n"
+"      orl  %%eax, %%edx\n"
+"      movl %%ecx, %%eax\n"             /* Construct CABC in eax */
+"      shll $24, %%eax\n"
+"      orl  %%ecx, %%eax\n"
+"      movl %2, %%ecx\n"                /* Load count */
+"      shr  $2, %%ecx\n"
+"      movl %3, %%edi\n"                /* Load dest */
+".FL1:\n"
+"      movl %%eax, (%%edi)\n"           /* Store 4 pixels, 12 bytes */
+"      movl %%ebx, 4(%%edi)\n"
+"      movl %%edx, 8(%%edi)\n"
+"      addl $12, %%edi\n"
+"      dec  %%ecx\n"
+"      jnz  .FL1\n"
+"      movl %%edi, %0\n"
+  : "=m"(addr)
+  : "m"(color), "m"(Count), "m"(addr)
+  : "%eax", "%ebx", "%ecx", "%edx", "%edi");
+   Count = Count & 0x03;
+      while (0 != Count--)
+        {
+          *(PUSHORT)(addr) = color;
+          addr += 2;
+          *(addr) = color >> 16;
+          addr += 1;
+        }
+    }
+  }
+#else
+
+  for (DestY = DestRect->top; DestY< DestRect->bottom; DestY++)
     {			 				
       DIB_24BPP_HLine(DestSurface, DestRect->left, DestRect->right, DestY, color);			  				
     }
-
+#endif
   return TRUE;
 }