Alex,
I managed to shave off another comparison in favor of a shift:
/*
 * Return the zero-based index of the most significant set bit of i
 * (floor(log2(i)) for i > 0), found by a binary search over bit ranges:
 * each step tests whether anything is set above the lower half of the
 * remaining range and, if so, shifts it down and records the distance.
 *
 * i is handled as an unsigned bit pattern.  With a signed value, an input
 * whose sign bit is set would fail every comparison and end up returning
 * the (implementation-defined) result of right-shifting a negative number;
 * here such an input reports the sign bit's own position instead.
 *
 * An input of 0 has no set bit; 0 is returned for it, the same as for 1,
 * so callers that care must test for zero themselves.
 *
 * The ranges tested cover 32 bits, so int is assumed to be at most
 * 32 bits wide.
 */
int highest_bit ( int i )
{
    unsigned int bits = (unsigned int) i;
    int ret = 0;

    if ( bits > 0xffff ) {
        bits >>= 16;
        ret = 16;
    }
    if ( bits > 0xff ) {
        bits >>= 8;
        ret += 8;
    }
    if ( bits > 0xf ) {
        bits >>= 4;
        ret += 4;
    }
    if ( bits > 0x3 ) {
        bits >>= 2;
        ret += 2;
    }

    /* bits is now in 0..3, so bits >> 1 is exactly the final result bit;
     * this replaces a fifth comparison with a shift. */
    return ret + (int) ( bits >> 1 );
}
FWIW, rdtsc reports that the build compiled for speed runs 2x as fast as
the build compiled for size.
I've thought about ways to possibly optimize pipeline throughput for
this, but too many of the calculations depend on the results of the
immediately preceding calculations to get any real advantage.
Here's updated micro-code for BSR, if you really think AMD & Intel will
actually do it...
(* BSR: store in `register` the index of the most significant set bit of
   the r/m operand, located by halving the search range at each step. *)
(* A zero source has no set bit: signal it through ZF and leave the
   destination undefined. *)
IF r/m = 0
THEN
   ZF := 1;
   register := UNDEFINED;
ELSE
   ZF := 0;
   (* The result is assembled one bit at a time, highest bit first. *)
   register := 0;
   temp := r/m;
   (* The upper-half test only applies to 32-bit operands. *)
   IF OperandSize = 32 THEN
     IF (temp & 0xFFFF0000) != 0 THEN
       temp >>= 16;
       register |= 16;
     FI;
   FI;
   (* Anything set in bits 15..8? Then continue the search in that byte. *)
   IF (temp & 0xFF00) != 0 THEN
     temp >>= 8;
     register |= 8;
   FI;
   (* Anything set in bits 7..4? *)
   IF (temp & 0xF0) != 0 THEN
     temp >>= 4;
     register |= 4;
   FI;
   (* Anything set in bits 3..2? *)
   IF (temp & 0xC) != 0 THEN
     temp >>= 2;
     register |= 2;
   FI;
   (* temp is now 1..3, so temp>>1 is the last result bit; a shift is
      used here in place of one more comparison. *)
   register |= (temp>>1);
FI;