Speed up 16bpp fills by a factor 2
Modified: trunk/reactos/subsys/win32k/dib/dib16bpp.c

Modified: trunk/reactos/subsys/win32k/dib/dib16bpp.c
--- trunk/reactos/subsys/win32k/dib/dib16bpp.c	2005-01-15 02:44:25 UTC (rev 13052)
+++ trunk/reactos/subsys/win32k/dib/dib16bpp.c	2005-01-15 09:46:55 UTC (rev 13053)
@@ -40,15 +40,52 @@
 VOID
 DIB_16BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
 {
-  PBYTE byteaddr = SurfObj->pvScan0 + y * SurfObj->lDelta;
-  PWORD addr = (PWORD)byteaddr + x1;
+  PDWORD addr = (PDWORD)((PWORD)(SurfObj->pvScan0 + y * SurfObj->lDelta) + x1);
+
+#ifdef _M_IX86
+  /* This is about 10% faster than the generic C code below */
+  LONG Count = x2 - x1;
+
+  __asm__(
+"  cld\n"
+"  andl $0xffff, %0\n"  /* If the pixel value is "abcd", put "abcdabcd" in %eax */
+"  mov  %0, %%eax\n"
+"  shl  $16, %%eax\n"
+"  or   %0, %%eax\n"
+"  test $0x01, %%edi\n" /* Align to fullword boundary */
+"  jz   .L1\n"
+"  stosw\n"
+"  dec  %1\n"
+"  jz   .L2\n"
+".L1:\n"
+"  mov  %1,%%ecx\n"     /* Setup count of fullwords to fill */
+"  shr  $1,%%ecx\n"
+"  rep stosl\n"         /* The actual fill */
+"  test $0x01, %1\n"    /* One left to do at the right side? */
+"  jz   .L2\n"
+"  stosw\n"
+".L2:\n"
+  : /* no output */
+  : "r"(c), "r"(Count), "D"(addr)
+  : "%eax", "%ecx");
+#else /* _M_IX86 */
   LONG cx = x1;
+  DWORD cc;
 
-  while(cx < x2) {
-    *addr = (WORD)c;
-    ++addr;
-    ++cx;
+  if (0 != (cx & 0x01)) {
+    *((PWORD) addr) = c;
+    cx++;
+    addr = (PDWORD)((PWORD)(addr) + 1);
   }
+  cc = ((c & 0xffff) << 16) | (c & 0xffff);
+  while(cx + 1 < x2) {
+    *addr++ = cc;
+    cx += 2;
+  }
+  if (cx < x2) {
+    *((PWORD) addr) = c;
+  }
+#endif /* _M_IX86 */
 }
 
 VOID