Speed up 24bpp fills
Modified: trunk/reactos/subsys/win32k/dib/dib24bpp.c

Modified: trunk/reactos/subsys/win32k/dib/dib24bpp.c
--- trunk/reactos/subsys/win32k/dib/dib24bpp.c	2005-06-09 18:33:45 UTC (rev 15847)
+++ trunk/reactos/subsys/win32k/dib/dib24bpp.c	2005-06-09 19:04:32 UTC (rev 15848)
@@ -38,16 +38,100 @@
 DIB_24BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
 {
   PBYTE addr = SurfObj->pvScan0 + y * SurfObj->lDelta + (x1 << 1) + x1;
-  LONG cx = x1;
+  ULONG Count = x2 - x1;
+#ifndef _M_IX86
+  ULONG MultiCount;
+  ULONG Fill[3];
+#endif
 
-  c &= 0xFFFFFF;
-  while(cx < x2) {
-    *(PUSHORT)(addr) = c & 0xFFFF;
-    addr += 2;
-    *(addr) = c >> 16;
-    addr += 1;
-    ++cx;
-  }
+  if (Count < 8)
+    {
+      /* For small fills, don't bother doing anything fancy */
+      while (Count--)
+        {
+          *(PUSHORT)(addr) = c;
+          addr += 2;
+          *(addr) = c >> 16;
+          addr += 1;
+        }
+    }
+  else
+    {
+      /* Align to 4-byte address */
+      while (0 != ((ULONG_PTR) addr & 0x3))
+        {
+          *(PUSHORT)(addr) = c;
+          addr += 2;
+          *(addr) = c >> 16;
+          addr += 1;
+          Count--;
+        }
+      /* If the color we need to fill with is 0ABC, then the final mem pattern
+       * (note little-endianness) would be:
+       *
+       * |C.B.A|C.B.A|C.B.A|C.B.A|   <- pixel borders
+       * |C.B.A.C|B.A.C.B|A.C.B.A|   <- ULONG borders
+       *
+       * So, taking endianness into account again, we need to fill with these
+       * ULONGs: CABC BCAB ABCA */
+#ifdef _M_IX86
+       /* This is about 30% faster than the generic C code below */
+       __asm__ __volatile__ (
+"      movl %1, %%ecx\n"
+"      andl $0xffffff, %%ecx\n"         /* 0ABC */
+"      movl %%ecx, %%ebx\n"             /* Construct BCAB in ebx */
+"      shrl $8, %%ebx\n"
+"      movl %%ecx, %%eax\n"
+"      shll $16, %%eax\n"
+"      orl  %%eax, %%ebx\n"
+"      movl %%ecx, %%edx\n"             /* Construct ABCA in edx */
+"      shll $8, %%edx\n"
+"      movl %%ecx, %%eax\n"
+"      shrl $16, %%eax\n"
+"      orl  %%eax, %%edx\n"
+"      movl %%ecx, %%eax\n"             /* Construct CABC in eax */
+"      shll $24, %%eax\n"
+"      orl  %%ecx, %%eax\n"
+"      movl %2, %%ecx\n"                /* Load count */
+"      shr  $2, %%ecx\n"
+"      movl %3, %%edi\n"                /* Load dest */
+".L1:\n"
+"      movl %%eax, (%%edi)\n"           /* Store 4 pixels, 12 bytes */
+"      movl %%ebx, 4(%%edi)\n"
+"      movl %%edx, 8(%%edi)\n"
+"      addl $12, %%edi\n"
+"      dec  %%ecx\n"
+"      jnz  .L1\n"
+"      movl %%edi, %0\n"
+  : "=m"(addr)
+  : "m"(c), "m"(Count), "m"(addr)
+  : "%eax", "%ebx", "%ecx", "%edx", "%edi");
+#else
+      c = c & 0xffffff;                /* 0ABC */
+      Fill[0] = c | (c << 24);         /* CABC */
+      Fill[1] = (c >> 8) | (c << 16);  /* BCAB */
+      Fill[2] = (c << 8) | (c >> 16);  /* ABCA */
+      MultiCount = Count / 4;
+      do
+        {
+          *(PULONG)addr = Fill[0];
+          addr += 4;
+          *(PULONG)addr = Fill[1];
+          addr += 4;
+          *(PULONG)addr = Fill[2];
+          addr += 4;
+        }
+      while (0 != --MultiCount);
+#endif
+      Count = Count & 0x03;
+      while (0 != Count--)
+        {
+          *(PUSHORT)(addr) = c;
+          addr += 2;
+          *(addr) = c >> 16;
+          addr += 1;
+        }
+    }
 }
 
 VOID
@@ -308,12 +392,12 @@
 {
   ULONG DestY;	
 
-	 for (DestY = DestRect->top; DestY< DestRect->bottom; DestY++)
-	{			 				
-		DIB_24BPP_HLine(DestSurface, DestRect->left, DestRect->right, DestY, color);			  				
-	}
+  for (DestY = DestRect->top; DestY< DestRect->bottom; DestY++)
+    {			 				
+      DIB_24BPP_HLine(DestSurface, DestRect->left, DestRect->right, DestY, color);			  				
+    }
 
-return TRUE;
+  return TRUE;
 }
 
 //NOTE: If you change something here, please do the same in other dibXXbpp.c files!