If this is meant to be optimized, inline asm should be used, as follows:

Reuel ben Yisrael schrieb:
+.globl _UlongByteSwap
+ 
+.intel_syntax noprefix
+
+/* FUNCTIONS ***************************************************************/
+
+_UlongByteSwap:
+                       push  ebp          // save the caller's frame pointer
+                       mov   ebp,esp      // set up our frame: [ebp+4] = return address, [ebp+8] = argument
+                       mov   eax,[ebp+8]  // eax = the ULONG argument
+                       bswap eax          // reverse the four bytes of eax
+                       pop   ebp          // restore the caller's frame pointer
+                       ret                // swapped value is left in eax
 

    
This should work:

/*
 * _UlongByteSwap - reverse the byte order of a 32-bit value.
 *
 * Frameless variant: nothing is pushed before the load, so on entry
 * [esp] holds the return address and the argument sits at [esp+4]
 * (it is only at +8 when a saved ebp has been pushed first).
 *
 * In:   [esp+4] = ULONG to swap
 * Out:  eax     = byte-swapped value
 * Only eax is modified; bswap leaves the flags untouched.
 */
_UlongByteSwap:
                       mov   eax,[esp+4]  // load the ULONG (first stack argument)
                       bswap eax          // swap the ULONG
                       ret
  

/*
 * Reverse the byte order of a 32-bit value with one BSWAP instruction.
 * x is used as a read-write register operand: it is loaded into a
 * general-purpose register, swapped in place, and returned.
 */
static force_inline ULONG UlongByteSwap(ULONG x)
{
    asm volatile("bswap %0;" : "+r" (x));
    return x;
}


+.globl _UlonglongByteSwap
+ 
+.intel_syntax noprefix
+
+/* FUNCTIONS ***************************************************************/
+
+_UlonglongByteSwap:
+                       push  ebp          // save the caller's frame pointer
+                       mov   ebp,esp      // set up our frame; the 8-byte argument starts at [ebp+8]
+                       mov   edx,[ebp+8]  // edx = dword at the lower address, i.e. the LOW half of the ULONGLONG (little-endian)
+                       mov   eax,[ebp+12] // eax = dword at the higher address, i.e. the HIGH half of the ULONGLONG
+                       bswap edx          // byte-reverse the original low half; in edx it is the high half of edx:eax
+                       bswap eax          // byte-reverse the original high half; in eax it is the low half of edx:eax
+                       pop   ebp          // restore the caller's frame pointer
+                       ret                // edx:eax now holds all eight bytes in reversed order
 

    
/*
 * _UlonglongByteSwap - reverse the byte order of a 64-bit value.
 *
 * Frameless variant: nothing is pushed before the loads, so on entry
 * [esp] holds the return address and the 8-byte argument occupies
 * [esp+4] (low dword) and [esp+8] (high dword).
 *
 * The halves are deliberately loaded crosswise: the low dword goes to
 * edx and the high dword to eax, so that after each is byte-reversed
 * edx:eax holds the complete 64-bit swap.
 *
 * In:   [esp+4] = low dword, [esp+8] = high dword of the ULONGLONG
 * Out:  edx:eax = byte-swapped value
 * Only eax and edx are modified; bswap leaves the flags untouched.
 */
_UlonglongByteSwap:
                       mov   edx,[esp+4]  // low dword of the argument -> high half of the result
                       mov   eax,[esp+8]  // high dword of the argument -> low half of the result
                       bswap edx          // reverse the bytes of the former low dword
                       bswap eax          // reverse the bytes of the former high dword
                       ret
  

/*
 * Reverse the byte order of a 64-bit value.
 *
 * A full 64-bit swap needs two steps: byte-reverse each 32-bit half AND
 * exchange the halves. The halves are split and recombined in C, and only
 * the two BSWAPs are done in asm. This avoids the "A" constraint, which
 * denotes the edx:eax pair only on 32-bit targets, and lets the compiler
 * pick any two general-purpose registers.
 *
 * Assumes ULONG is 32 bits wide and ULONGLONG is 64 bits wide.
 * The asm has no side effects besides its outputs, so it is not volatile.
 */
static force_inline ULONGLONG UlonglongByteSwap(ULONGLONG x)
{
    ULONG lo = (ULONG)x;          /* original low 32 bits  */
    ULONG hi = (ULONG)(x >> 32);  /* original high 32 bits */

    asm (
        "bswap %0;"
        "bswap %1;"
        : "=r" (lo), "=r" (hi)
        : "0" (lo), "1" (hi)
    );

    /* The reversed low half becomes the high half of the result. */
    return ((ULONGLONG)lo << 32) | hi;
}
  
+_UshortByteSwap:
+                       push  ebp          // save the caller's frame pointer
+                       mov   ebp,esp      // set up our frame: [ebp+4] = return address, [ebp+8] = argument
+                       mov   eax,[ebp+8]  // eax = stack slot holding the USHORT (value in the low 16 bits)
+                       bswap eax          // reverse all four bytes: the swapped USHORT now sits in the upper 16 bits (author's note: used instead of xchg for speed)
+                       rol   eax,16       // rotate the upper 16 bits back down into ax
+                       pop   ebp          // restore the caller's frame pointer
+                       ret                // swapped USHORT is left in ax
 

    

/*
 * _UshortByteSwap - exchange the two bytes of a 16-bit value.
 *
 * Frameless variant: nothing is pushed before the load, so on entry
 * [esp] holds the return address and the argument slot is at [esp+4].
 *
 * bswap reverses all four bytes of eax, which leaves the swapped USHORT
 * in the upper 16 bits; the rotate by 16 brings it back down into ax.
 *
 * In:   [esp+4] = stack slot holding the USHORT in its low 16 bits
 * Out:  ax      = byte-swapped value (upper half of eax is not meaningful)
 * Only eax and the flags (via rol) are modified.
 */
_UshortByteSwap:
                       mov   eax,[esp+4]  // load the USHORT (first stack argument)
                       bswap eax          // swapped USHORT ends up in the upper 16 bits
                       rol   eax,16       // move it back into ax
                       ret


Or, to save a byte:

/*
 * _UshortByteSwap - exchange the two bytes of a 16-bit value
 * (byte-saving variant using two 8-bit moves).
 *
 * Frameless: nothing is pushed before the load, so on entry [esp] holds
 * the return address and the argument slot is at [esp+4].
 *
 * edx is used as the scratch register instead of ebx: ebx is callee-saved
 * in the 32-bit x86 calling conventions and would have to be pushed and
 * popped, whereas edx may be overwritten freely. The encoding size is the
 * same as with ebx.
 *
 * In:   [esp+4] = stack slot holding the USHORT in its low 16 bits
 * Out:  ax      = byte-swapped value (upper half of eax is left as it was)
 * Only ax and edx are modified; flags are untouched.
 */
_UshortByteSwap:
                       mov   edx,[esp+4]  // load the USHORT (first stack argument)
                       mov   al, dh       // high byte of the input -> low byte of the result
                       mov   ah, dl       // low byte of the input -> high byte of the result
                       ret
  

/*
 * Exchange the two bytes of a 16-bit value by rotating it left by 8 bits.
 * x is used as a read-write register operand: it is rotated in place
 * and returned.
 */
static force_inline USHORT UshortByteSwap(USHORT x)
{
    asm volatile("rolw $8, %0;" : "+r" (x));
    return x;
}