Register a SA Forums Account here!
JOINING THE SA FORUMS WILL REMOVE THIS BIG AD, THE ANNOYING UNDERLINED ADS, AND STUPID INTERSTITIAL ADS!!!

You can: log in, read the tech support FAQ, or request your lost password. This dumb message (and those ads) will appear on every screen until you register! Get rid of this crap by registering your own SA Forums Account and joining roughly 150,000 Goons, for the one-time price of $9.95! We charge money because it costs us money per month for bills, and since we don't believe in showing ads to our users, we try to make the money back through forum registrations.
 
  • Post
  • Reply
Ornedan
Nov 4, 2009


Cybernetic Crumb
I'm trying to figure out the fastest way to reverse way to reverse bits in a byte.

For context I'm trying to make some graphics stuff (dithering) go really fast and I finally hit a point where writing pixels through the whole BufferedImage, ColorModel, Raster, SampleModel and DataBuffer sequence dominates the per-pixel time. So it's time to write specialised code for common combinations that fucks directly with the image bits and chop off another 50% of time required.

Ways to reverse bits I've found and tried so far:

Java code:
import java.util.Random;
import java.util.function.Function;

public class ReverseBitsInByteTest
{
  public static byte reverseStdlib(byte b)
  {
    return (byte)(Integer.reverse(b & 0xFF) >> 24);
  }

  public static byte reverseSwaps(byte b)
  {
    int r = (b & 0b11110000) >> 4 | (b & 0b00001111) << 4;
    r     = (r & 0b11001100) >> 2 | (r & 0b00110011) << 2;
    r     = (r & 0b10101010) >> 1 | (r & 0b01010101) << 1;

    return (byte)r;
  }

  private static final int[] lookupReverseNybble =
  {
    0x0,0x8,0x4,0xc,0x2,0xa,0x6,0xe,0x1,0x9,0x5,0xd,0x3,0xb,0x7,0xf,
  };

  public static byte reverseNybbles(byte b)
  {
    return (byte)((lookupReverseNybble[b & 0b00001111] << 4) | lookupReverseNybble[(b & 0b11110000) >> 4]);
  }

  private static final int[] lookupReverseByte =
  {
    0x0,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,
    0xf0,0x8,0x88,0x48,0xc8,0x28,0xa8,0x68,0xe8,0x18,0x98,0x58,0xd8,0x38,0xb8,
    0x78,0xf8,0x4,0x84,0x44,0xc4,0x24,0xa4,0x64,0xe4,0x14,0x94,0x54,0xd4,0x34,
    0xb4,0x74,0xf4,0xc,0x8c,0x4c,0xcc,0x2c,0xac,0x6c,0xec,0x1c,0x9c,0x5c,0xdc,
    0x3c,0xbc,0x7c,0xfc,0x2,0x82,0x42,0xc2,0x22,0xa2,0x62,0xe2,0x12,0x92,0x52,
    0xd2,0x32,0xb2,0x72,0xf2,0xa,0x8a,0x4a,0xca,0x2a,0xaa,0x6a,0xea,0x1a,0x9a,
    0x5a,0xda,0x3a,0xba,0x7a,0xfa,0x6,0x86,0x46,0xc6,0x26,0xa6,0x66,0xe6,0x16,
    0x96,0x56,0xd6,0x36,0xb6,0x76,0xf6,0xe,0x8e,0x4e,0xce,0x2e,0xae,0x6e,0xee,
    0x1e,0x9e,0x5e,0xde,0x3e,0xbe,0x7e,0xfe,0x1,0x81,0x41,0xc1,0x21,0xa1,0x61,
    0xe1,0x11,0x91,0x51,0xd1,0x31,0xb1,0x71,0xf1,0x9,0x89,0x49,0xc9,0x29,0xa9,
    0x69,0xe9,0x19,0x99,0x59,0xd9,0x39,0xb9,0x79,0xf9,0x5,0x85,0x45,0xc5,0x25,
    0xa5,0x65,0xe5,0x15,0x95,0x55,0xd5,0x35,0xb5,0x75,0xf5,0xd,0x8d,0x4d,0xcd,
    0x2d,0xad,0x6d,0xed,0x1d,0x9d,0x5d,0xdd,0x3d,0xbd,0x7d,0xfd,0x3,0x83,0x43,
    0xc3,0x23,0xa3,0x63,0xe3,0x13,0x93,0x53,0xd3,0x33,0xb3,0x73,0xf3,0xb,0x8b,
    0x4b,0xcb,0x2b,0xab,0x6b,0xeb,0x1b,0x9b,0x5b,0xdb,0x3b,0xbb,0x7b,0xfb,0x7,
    0x87,0x47,0xc7,0x27,0xa7,0x67,0xe7,0x17,0x97,0x57,0xd7,0x37,0xb7,0x77,0xf7,
    0xf,0x8f,0x4f,0xcf,0x2f,0xaf,0x6f,0xef,0x1f,0x9f,0x5f,0xdf,0x3f,0xbf,0x7f,
    0xff,
  };

  public static byte reverseLookup(byte b)
  {
    return (byte)lookupReverseByte[b & 0xFF];
  }

  /** http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
  public static byte reverseBithack3(byte b)
  {
    return (byte)((((b & 0xFF) * 0x0202020202l) & 0x010884422010l) % 1023);
  }

  /** http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits */
  public static byte reverseBithack4(byte b)
  {
    return (byte)((((b & 0xFF) * 0x80200802l) & 0x0884422110l) * 0x0101010101l >> 32);
  }


  public static String toPaddedBinStr(int b, int pad)
  {
    String str = Integer.toBinaryString(b);
    while(str.length() < pad)
      str = '0' + str;
    return str;
  }

  public static String toPaddedBinStr(long b, int pad)
  {
    String str = Long.toBinaryString(b);
    while(str.length() < pad)
      str = '0' + str;
    return str;
  }

  public static void check(String name,Function<Byte,Byte> func)
  {
    for(int b = 0; b <= 0xFF; b++)
    {
      int rev = func.apply((byte)b) & 0xFF;
      String binStr = toPaddedBinStr(b,8);
      String revStr = toPaddedBinStr(rev,8);
      boolean correct =
        new StringBuilder(binStr).reverse().toString().equals(revStr);

      if( !correct)
        System.out.printf("%s failed: %s => %s\n",name,binStr,revStr);
    }
  }

  public static void main(String[] args) throws Throwable
  {
    // Correctness checks
    check("reverseStdlib",ReverseBitsInByteTest::reverseStdlib);
    check("reverseSwaps",ReverseBitsInByteTest::reverseSwaps);
    check("reverseNybbles",ReverseBitsInByteTest::reverseNybbles);
    check("reverseLookup",ReverseBitsInByteTest::reverseLookup);
    check("reverseBithack3",ReverseBitsInByteTest::reverseBithack3);
    check("reverseBithack4",ReverseBitsInByteTest::reverseBithack4);

    // Speed testing
    byte[] data = new byte[1000000];
    new Random().nextBytes(data);

    long beginStdlib = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseStdlib(data[i]);
    long endStdlib = System.currentTimeMillis();

    long beginSwaps = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseSwaps(data[i]);
    long endSwaps = System.currentTimeMillis();

    long beginNybbles = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseNybbles(data[i]);
    long endNybbles = System.currentTimeMillis();

    long beginLookup = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseLookup(data[i]);
    long endLookup = System.currentTimeMillis();

    long beginBithack3 = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseBithack3(data[i]);
    long endBithack3 = System.currentTimeMillis();

    long beginBithack4 = System.currentTimeMillis();
    for(int repeats = 0; repeats < 1000; repeats++)
      for(int i = 0; i < data.length; i++)
        data[i] = reverseBithack4(data[i]);
    long endBithack4 = System.currentTimeMillis();


    System.out.printf("Stdlib:   %d msec\n",endStdlib - beginStdlib);
    System.out.printf("Swaps:    %d msec\n",endSwaps - beginSwaps);
    System.out.printf("Nybbles:  %d msec\n",endNybbles - beginNybbles);
    System.out.printf("Lookup:   %d msec\n",endLookup - beginLookup);
    System.out.printf("Bithack3: %d msec\n",endBithack3 - beginBithack3);
    System.out.printf("Bithack4: %d msec\n",endBithack4 - beginBithack4);
  }
}

The weird thing I'm running into is that I get completely different results depending on whether that test class is compiled with javac on command line or Eclipse:

pre:
javac               | Eclipse
--------------------|--------------------
Stdlib:   1736 msec | Stdlib:   1661 msec
Swaps:    3528 msec | Swaps:    1531 msec
Nybbles:  2829 msec | Nybbles:  780 msec
Lookup:   2366 msec | Lookup:   459 msec
Bithack3: 3513 msec | Bithack3: 1087 msec
Bithack4: 2810 msec | Bithack4: 586 msec
Are these some optimisation flags Eclipse enables by default that could cause the difference?

e: oh wtf, whichever is the first one I measure time for is as fast in the javac-compiled bytecode as in Eclipse-compiled. So the javac results change if I reorder the speed tests.

Ornedan fucked around with this message at 15:03 on Jul 9, 2023

Adbot
ADBOT LOVES YOU

Ornedan
Nov 4, 2009


Cybernetic Crumb
Java code:
import java.util.concurrent.TimeUnit;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;


@State(Scope.Thread)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
public class ReverseBitsBenchmark
{
  public static byte reverseStdlib(byte b)
  {
    return (byte)(Integer.reverse(b) >> 24);
  }

  public static byte reverseSwaps(byte b)
  {
    int r = (b & 0b11110000) >>> 4 | (b & 0b00001111) << 4;
    r = (r & 0b11001100) >>> 2 | (r & 0b00110011) << 2;
    r = (r & 0b10101010) >>> 1 | (r & 0b01010101) << 1;

    return (byte)r;
  }

  private static final int[] lookupReverseNybble =
  {
    0x0,0x8,0x4,0xc,0x2,0xa,0x6,0xe,0x1,0x9,0x5,0xd,0x3,0xb,0x7,0xf,
  };

  public static byte reverseNybbles(byte b)
  {
    return (byte)((lookupReverseNybble[b & 0b00001111] << 4) |
      lookupReverseNybble[(b & 0b11110000) >> 4]);
  }

  private static final byte[] lookupReverseByte =
  {
    (byte)0x0,(byte)0x80,(byte)0x40,(byte)0xc0,(byte)0x20,(byte)0xa0,
    (byte)0x60,(byte)0xe0,(byte)0x10,(byte)0x90,(byte)0x50,(byte)0xd0,
    (byte)0x30,(byte)0xb0,(byte)0x70,(byte)0xf0,(byte)0x8,(byte)0x88,
    (byte)0x48,(byte)0xc8,(byte)0x28,(byte)0xa8,(byte)0x68,(byte)0xe8,
    (byte)0x18,(byte)0x98,(byte)0x58,(byte)0xd8,(byte)0x38,(byte)0xb8,
    (byte)0x78,(byte)0xf8,(byte)0x4,(byte)0x84,(byte)0x44,(byte)0xc4,
    (byte)0x24,(byte)0xa4,(byte)0x64,(byte)0xe4,(byte)0x14,(byte)0x94,
    (byte)0x54,(byte)0xd4,(byte)0x34,(byte)0xb4,(byte)0x74,(byte)0xf4,
    (byte)0xc,(byte)0x8c,(byte)0x4c,(byte)0xcc,(byte)0x2c,(byte)0xac,
    (byte)0x6c,(byte)0xec,(byte)0x1c,(byte)0x9c,(byte)0x5c,(byte)0xdc,
    (byte)0x3c,(byte)0xbc,(byte)0x7c,(byte)0xfc,(byte)0x2,(byte)0x82,
    (byte)0x42,(byte)0xc2,(byte)0x22,(byte)0xa2,(byte)0x62,(byte)0xe2,
    (byte)0x12,(byte)0x92,(byte)0x52,(byte)0xd2,(byte)0x32,(byte)0xb2,
    (byte)0x72,(byte)0xf2,(byte)0xa,(byte)0x8a,(byte)0x4a,(byte)0xca,
    (byte)0x2a,(byte)0xaa,(byte)0x6a,(byte)0xea,(byte)0x1a,(byte)0x9a,
    (byte)0x5a,(byte)0xda,(byte)0x3a,(byte)0xba,(byte)0x7a,(byte)0xfa,
    (byte)0x6,(byte)0x86,(byte)0x46,(byte)0xc6,(byte)0x26,(byte)0xa6,
    (byte)0x66,(byte)0xe6,(byte)0x16,(byte)0x96,(byte)0x56,(byte)0xd6,
    (byte)0x36,(byte)0xb6,(byte)0x76,(byte)0xf6,(byte)0xe,(byte)0x8e,
    (byte)0x4e,(byte)0xce,(byte)0x2e,(byte)0xae,(byte)0x6e,(byte)0xee,
    (byte)0x1e,(byte)0x9e,(byte)0x5e,(byte)0xde,(byte)0x3e,(byte)0xbe,
    (byte)0x7e,(byte)0xfe,(byte)0x1,(byte)0x81,(byte)0x41,(byte)0xc1,
    (byte)0x21,(byte)0xa1,(byte)0x61,(byte)0xe1,(byte)0x11,(byte)0x91,
    (byte)0x51,(byte)0xd1,(byte)0x31,(byte)0xb1,(byte)0x71,(byte)0xf1,
    (byte)0x9,(byte)0x89,(byte)0x49,(byte)0xc9,(byte)0x29,(byte)0xa9,
    (byte)0x69,(byte)0xe9,(byte)0x19,(byte)0x99,(byte)0x59,(byte)0xd9,
    (byte)0x39,(byte)0xb9,(byte)0x79,(byte)0xf9,(byte)0x5,(byte)0x85,
    (byte)0x45,(byte)0xc5,(byte)0x25,(byte)0xa5,(byte)0x65,(byte)0xe5,
    (byte)0x15,(byte)0x95,(byte)0x55,(byte)0xd5,(byte)0x35,(byte)0xb5,
    (byte)0x75,(byte)0xf5,(byte)0xd,(byte)0x8d,(byte)0x4d,(byte)0xcd,
    (byte)0x2d,(byte)0xad,(byte)0x6d,(byte)0xed,(byte)0x1d,(byte)0x9d,
    (byte)0x5d,(byte)0xdd,(byte)0x3d,(byte)0xbd,(byte)0x7d,(byte)0xfd,
    (byte)0x3,(byte)0x83,(byte)0x43,(byte)0xc3,(byte)0x23,(byte)0xa3,
    (byte)0x63,(byte)0xe3,(byte)0x13,(byte)0x93,(byte)0x53,(byte)0xd3,
    (byte)0x33,(byte)0xb3,(byte)0x73,(byte)0xf3,(byte)0xb,(byte)0x8b,
    (byte)0x4b,(byte)0xcb,(byte)0x2b,(byte)0xab,(byte)0x6b,(byte)0xeb,
    (byte)0x1b,(byte)0x9b,(byte)0x5b,(byte)0xdb,(byte)0x3b,(byte)0xbb,
    (byte)0x7b,(byte)0xfb,(byte)0x7,(byte)0x87,(byte)0x47,(byte)0xc7,
    (byte)0x27,(byte)0xa7,(byte)0x67,(byte)0xe7,(byte)0x17,(byte)0x97,
    (byte)0x57,(byte)0xd7,(byte)0x37,(byte)0xb7,(byte)0x77,(byte)0xf7,
    (byte)0xf,(byte)0x8f,(byte)0x4f,(byte)0xcf,(byte)0x2f,(byte)0xaf,
    (byte)0x6f,(byte)0xef,(byte)0x1f,(byte)0x9f,(byte)0x5f,(byte)0xdf,
    (byte)0x3f,(byte)0xbf,(byte)0x7f,(byte)0xff,

  };

  public static byte reverseLookup(byte b)
  {
    return (byte)lookupReverseByte[b & 0xFF];
  }

  /**
   * http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv
   */
  public static byte reverseBithack3(byte b)
  {
    return (byte)((((b & 0xFF) * 0x0202020202l) & 0x010884422010l) % 1023);
  }

  /**
   * http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
   */
  public static byte reverseBithack4(byte b)
  {
    return (byte)((((b & 0xFF) * 0x80200802l) & 0x0884422110l) *
      0x0101010101l >> 32);
  }

  // --- Benchmarks ---

  private byte x = (byte)0b10100110;

  @Benchmark
  public byte testStdlib()
  {
    return reverseStdlib(x);
  }

  @Benchmark
  public byte testSwaps()
  {
    return reverseSwaps(x);
  }

  @Benchmark
  public byte testNybbles()
  {
    return reverseNybbles(x);
  }

  @Benchmark
  public byte testLookup()
  {
    return reverseLookup(x);
  }

  @Benchmark
  public byte testBithack3()
  {
    return reverseBithack3(x);
  }

  @Benchmark
  public byte testBithack4()
  {
    return reverseBithack4(x);
  }
}
pre:
Benchmark                          Mode  Cnt  Score   Error  Units
ReverseBitsBenchmark.testBithack3  avgt   25  1.417 ± 0.016  ns/op
ReverseBitsBenchmark.testBithack4  avgt   25  0.889 ± 0.010  ns/op
ReverseBitsBenchmark.testLookup    avgt   25  0.721 ± 0.009  ns/op
ReverseBitsBenchmark.testNybbles   avgt   25  1.073 ± 0.026  ns/op
ReverseBitsBenchmark.testStdlib    avgt   25  1.987 ± 0.107  ns/op
ReverseBitsBenchmark.testSwaps     avgt   25  1.874 ± 0.031  ns/op
Tests with JMH match the Eclipse-compiled results. So a lookup table is best, and the divisionless bithack is a close second.


I did poke the javac vs Eclipse thing a bit further. Moving each of the test loops into a separate method made javac output as fast as Eclipse output. So I'm guessing the javac-generated bytecode does something dumb with the 2nd and later loops.

  • 1
  • 2
  • 3
  • 4
  • 5
  • Post
  • Reply