Speed up 16bpp fills by a factor of 2
Modified: trunk/reactos/subsys/win32k/dib/dib16bpp.c
_____
Modified: trunk/reactos/subsys/win32k/dib/dib16bpp.c
--- trunk/reactos/subsys/win32k/dib/dib16bpp.c 2005-01-15 02:44:25 UTC (rev 13052)
+++ trunk/reactos/subsys/win32k/dib/dib16bpp.c 2005-01-15 09:46:55 UTC (rev 13053)
@@ -40,15 +40,52 @@
VOID
DIB_16BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
{
- PBYTE byteaddr = SurfObj->pvScan0 + y * SurfObj->lDelta;
- PWORD addr = (PWORD)byteaddr + x1;
+ PDWORD addr = (PDWORD)((PWORD)(SurfObj->pvScan0 + y *
SurfObj->lDelta) + x1);
+
+#ifdef _M_IX86
+ /* This is about 10% faster than the generic C code below */
+ LONG Count = x2 - x1;
+
+ __asm__(
+" cld\n"
+" andl $0xffff, %0\n" /* If the pixel value is "abcd", put
"abcdabcd"
in %eax */
+" mov %0, %%eax\n"
+" shl $16, %%eax\n"
+" or %0, %%eax\n"
+" test $0x01, %%edi\n" /* Align to fullword boundary */
+" jz .L1\n"
+" stosw\n"
+" dec %1\n"
+" jz .L2\n"
+".L1:\n"
+" mov %1,%%ecx\n" /* Setup count of fullwords to fill */
+" shr $1,%%ecx\n"
+" rep stosl\n" /* The actual fill */
+" test $0x01, %1\n" /* One left to do at the right side? */
+" jz .L2\n"
+" stosw\n"
+".L2:\n"
+ : /* no output */
+ : "r"(c), "r"(Count), "D"(addr)
+ : "%eax", "%ecx");
+#else /* _M_IX86 */
LONG cx = x1;
+ DWORD cc;
- while(cx < x2) {
- *addr = (WORD)c;
- ++addr;
- ++cx;
+ if (0 != (cx & 0x01)) {
+ *((PWORD) addr) = c;
+ cx++;
+ addr = (PDWORD)((PWORD)(addr) + 1);
}
+ cc = ((c & 0xffff) << 16) | (c & 0xffff);
+ while(cx + 1 < x2) {
+ *addr++ = cc;
+ cx += 2;
+ }
+ if (cx < x2) {
+ *((PWORD) addr) = c;
+ }
+#endif /* _M_IX86 */
}
VOID