Below is my C code, based on the C code previously shown here, and the
assembly generated by VC.
This function, like most, does not benefit much from asm coding, although
some cycles can be saved, most notably inside the loop (a compare and an
additional branch in the VC-generated code).
Some algorithms can benefit a lot from asm, though. For example the Fletcher
checksum or incrementing/decrementing variables larger than the register
size, where the use of the carry flag can save many cycles. Also, a function
whose execution time is very critical may deserve asm coding, but I think in
this case it is not worth it, as the saving in percentage is tiny (any
compiler I know will use rep stosd for the inner loop, which has the largest
weight in the total time).
/*
 * Fill the rectangle *prcl of the surface *pso with iColor, writing one
 * ULONG per pixel.  A rectangle with right <= left or bottom <= top is
 * treated as empty and nothing is written.  Always returns TRUE.
 */
BOOLEAN DIB_32BPP_ColorFill(SURFOBJ* pso, RECTL* prcl, ULONG iColor)
{
    LONG lDelta = pso->lDelta;  /* byte step from one scanline to the next */
    LONG cx;                    /* pixels per row */
    LONG cy;                    /* rows still to fill */
    char *pulLine;              /* start of the current row inside the rect */
    ULONG *pulDst;
    ULONG cLeft;

    /* Address of the top-left pixel: base + top * stride + left * 4. */
    pulLine = (char *)pso->pvScan0;
    pulLine += prcl->top * lDelta;
    pulLine += prcl->left << 2;

    cx = prcl->right - prcl->left;
    if (cx <= 0)
    {
        return TRUE;
    }

    cy = prcl->bottom - prcl->top;
    if (cy <= 0)
    {
        return TRUE;
    }

    while (cy-- != 0)
    {
        pulDst = (ULONG *)pulLine;
        cLeft = cx;
        while (cLeft-- != 0)
        {
            *pulDst = iColor;
            pulDst++;
        }
        pulLine += lDelta;
    }

    return TRUE;
}
; Compiler-generated listing for DIB_32BPP_ColorFill (compile flags /Ogtpy).
;
; NOTE(review): this listing never reads the stack arguments. Every input is
; loaded from an absolute address (ds:N or [0]) and the value stored by
; rep stosd is zeroed with xor eax, eax, so it looks as if the compiler
; specialised the function for constant zero arguments -- confirm before
; treating it as the general-case code.
;
; Register roles as used below:
;   esi = address of the first dword of the current row
;   ebp = per-row increment added to esi
;   edx = number of dwords written per row
;   ebx = rows remaining
;   al  = return value (1)
; ebp, esi, ebx and edi are pushed before use and popped on the way out.
PUBLIC  ?DIB_32BPP_ColorFill@@YAEPAU_SURFOBJ@@PAU_RECTL@@K@Z ; DIB_32BPP_ColorFill
; Function compile flags: /Ogtpy
_TEXT   SEGMENT
?DIB_32BPP_ColorFill@@YAEPAU_SURFOBJ@@PAU_RECTL@@K@Z PROC ; DIB_32BPP_ColorFill
; Line 52
        mov     ecx, DWORD PTR ds:4             ; same dword is subtracted again for the row count below
; Line 54
        mov     edx, DWORD PTR ds:8
        push    ebp
        mov     ebp, DWORD PTR ds:36            ; ebp = per-row increment
        imul    ecx, ebp                        ; ecx = byte offset of the first row
        xor     eax, eax
        mov     eax, DWORD PTR [eax]            ; eax was just zeroed: loads the dword at address 0
        push    esi
        lea     esi, DWORD PTR [ecx+eax*4]      ; row offset + 4 bytes per column
        add     esi, DWORD PTR ds:32            ; esi = address of the first dword to write
        sub     edx, eax                        ; edx = dwords per row
; Line 55
        test    edx, edx
; Line 56
        jle     SHORT $LN22@DIB_32BPP_          ; width <= 0: nothing to fill
        push    ebx
; Line 58
        mov     ebx, DWORD PTR ds:12
        sub     ebx, DWORD PTR ds:4             ; ebx = number of rows
; Line 59
        test    ebx, ebx
; Line 60
        jle     SHORT $LN21@DIB_32BPP_          ; height <= 0: nothing to fill
        push    edi
        npad    4
; Row loop: one iteration per row, ebx counts down to zero.
$LL18@DIB_32BPP_:
; Line 64
        dec     ebx
; Line 66
        test    edx, edx                        ; re-checked every row although edx > 0 was established above
        je      SHORT $LN2@DIB_32BPP_
        mov     ecx, edx                        ; rep count = dwords per row
        xor     eax, eax                        ; value stored by stosd is zero in this listing
        mov     edi, esi                        ; stosd writes through edi
        rep stosd
$LN2@DIB_32BPP_:
        add     esi, ebp                        ; advance to the next row
        test    ebx, ebx
        jne     SHORT $LL18@DIB_32BPP_
        pop     edi
; Exit paths pop only the registers that were pushed before the branch here.
$LN21@DIB_32BPP_:
        pop     ebx
$LN22@DIB_32BPP_:
        pop     esi
; Line 72
        mov     al, 1                           ; return value
        pop     ebp
; Line 73
        ret     0
?DIB_32BPP_ColorFill@@YAEPAU_SURFOBJ@@PAU_RECTL@@K@Z ENDP ; DIB_32BPP_ColorFill
In asm I would write the loop as:
        ; Fill cy rows of _cx dwords each with iColor, starting at pulLine.
        ; Requires: 32-bit code, direction flag clear (stosd must move edi
        ;           upward), and cy > 0 -- the row counter is only tested
        ;           after a row has been written.
        ; Clobbers: eax, ebx, ecx, edx, edi, flags.
        mov     eax, iColor             ; eax = value stored by stosd
        mov     ebx, pulLine            ; ebx = start of the current row
        mov     edx, cy                 ; edx = rows remaining
L1:
        mov     edi, ebx                ; full 32-bit pointer; stosd writes through edi
        mov     ecx, _cx                ; full 32-bit count; rep consumes ecx
        rep stosd
        add     ebx, lDelta             ; advance the row pointer, not the row counter
        dec     edx
        jnz     L1
Jose Catena
DIGIWAVES S.L.