statistical encoding

This commit is contained in:
Arndt 2015-10-11 19:27:33 +02:00
parent f8dee5b7d1
commit ccf6641bad
41 changed files with 4543 additions and 1965 deletions

View file

@ -1,96 +1,150 @@
package btools.util;
public final class BitCoderContext
public class BitCoderContext
{
private byte[] ab;
private int idx = -1;
private int bm = 0x100; // byte mask
private int b;
public BitCoderContext( byte[] ab )
{
private byte[] ab;
private int idx = -1;
private int bm = 0x100 ; // byte mask
private int b;
public BitCoderContext( byte[] ab )
{
this.ab = ab;
}
// Writes a distance using a variable number of bits
// (poor mans huffman tree): a run of 0-bits closed by a 1-bit
// selects a bucket, then the offset inside that bucket follows.
// 1 -> 0
// 01 -> 1 + following 1-bit word ( 1..2 )
// 001 -> 3 + following 2-bit word ( 3..6 )
// 0001 -> 7 + following 3-bit word ( 7..14 ) etc.
public void encodeVarBits( int value )
{
  int bucketMax = 0; // largest offset inside the current bucket
  int offset = value;
  for ( ; offset > bucketMax; bucketMax += bucketMax + 1 )
  {
    // one 0-bit per skipped bucket
    encodeBit( false );
    offset -= bucketMax + 1;
  }
  encodeBit( true );
  encode( bucketMax, offset );
}
// Counterpart of encodeVarBits: reads the bucket prefix, then the offset
public int decodeVarBits()
{
  int bucketMax = 0;
  int base = 0; // smallest value of the current bucket
  for ( ; !decodeBit(); bucketMax += bucketMax + 1 )
  {
    base += bucketMax + 1;
  }
  return base + decode( bucketMax );
}
// Appends one bit; a new, zeroed byte is started once the byte mask has left bit 7
public void encodeBit( boolean value )
{
  if ( bm == 0x100 )
  {
    bm = 1;
    idx++;
    ab[idx] = 0;
  }
  if ( value )
  {
    ab[idx] = (byte) ( ab[idx] | bm );
  }
  bm = bm << 1;
}
// Reads one bit; the next byte is fetched once the byte mask has left bit 7
public boolean decodeBit()
{
  if ( bm == 0x100 )
  {
    bm = 1;
    idx++;
    b = ab[idx];
  }
  final boolean bitSet = ( b & bm ) != 0;
  bm = bm << 1;
  return bitSet;
}
// Writes the low bits of value, least significant first;
// one bit is written for every significant bit of max
public void encode( int max, int value )
{
  for ( int mask = 1, rest = max; rest != 0; rest >>= 1, mask <<= 1 )
  {
    if ( bm == 0x100 )
    {
      // current byte is full: continue in a fresh, zeroed one
      bm = 1;
      idx++;
      ab[idx] = 0;
    }
    if ( ( value & mask ) != 0 )
    {
      ab[idx] = (byte) ( ab[idx] | bm );
    }
    bm <<= 1;
  }
}
// Number of bytes touched so far (idx is the index of the last one)
public int getEncodedLength()
{
  final int lastByteIndex = idx;
  return lastByteIndex + 1;
}
// Reads one bit for every significant bit of max and
// assembles them, least significant first, into the result
public int decode( int max )
{
  int result = 0;
  for ( int mask = 1, rest = max; rest != 0; rest >>= 1, mask <<= 1 )
  {
    if ( bm == 0x100 )
    {
      // current byte is used up: fetch the next one
      bm = 1;
      idx++;
      b = ab[idx];
    }
    if ( ( b & bm ) != 0 )
    {
      result |= mask;
    }
    bm <<= 1;
  }
  return result;
}
this.ab = ab;
}
/**
 * Writes a value using a variable number of bits (poor mans
 * huffman tree): a run of 0-bits closed by a 1-bit selects a
 * bucket, then the offset inside that bucket follows.
 *
 * 1 -> 0
 * 01 -> 1 + following 1-bit word ( 1..2 )
 * 001 -> 3 + following 2-bit word ( 3..6 )
 * 0001 -> 7 + following 3-bit word ( 7..14 ) etc.
 *
 * @param value the value to write
 * @see #decodeVarBits
 */
public final void encodeVarBits( int value )
{
  int bucketMax = 0; // largest offset inside the current bucket
  int offset = value;
  while ( offset > bucketMax )
  {
    // one 0-bit per skipped bucket
    encodeBit( false );
    offset = offset - bucketMax - 1;
    bucketMax += bucketMax + 1;
  }
  encodeBit( true );
  encodeBounded( bucketMax, offset );
}
/**
 * Reads a value written by encodeVarBits: counts the leading
 * 0-bits to find the bucket, then adds the offset within it.
 *
 * @return the decoded value
 * @see #encodeVarBits
 */
public final int decodeVarBits()
{
  int bucketMax = 0;
  int base = 0; // smallest value of the current bucket
  for ( ; !decodeBit(); bucketMax += bucketMax + 1 )
  {
    base += bucketMax + 1;
  }
  final int offset = decodeBounded( bucketMax );
  return base + offset;
}
/**
 * Appends a single bit. Bits fill each byte from the least
 * significant position upwards; a new byte is zeroed before use.
 *
 * @param value the bit to write
 */
public final void encodeBit( boolean value )
{
  final boolean byteFull = ( bm == 0x100 );
  if ( byteFull )
  {
    bm = 1;
    idx++;
    ab[idx] = 0;
  }
  if ( value )
  {
    ab[idx] = (byte) ( ab[idx] | bm );
  }
  bm = bm << 1;
}
/**
 * Reads a single bit. Bits are taken from each byte starting at
 * the least significant position.
 *
 * @return true if the bit is set
 */
public final boolean decodeBit()
{
  final boolean byteConsumed = ( bm == 0x100 );
  if ( byteConsumed )
  {
    bm = 1;
    idx++;
    b = ab[idx];
  }
  final int masked = b & bm;
  bm = bm << 1;
  return masked != 0;
}
/**
* encode an integer in the range 0..max (inclusive).
* For max = 2^n-1, this just encodes n bits, but in general
* this is variable length encoding, with the shorter codes
* for the central value range
*
* Bits are written least significant first. A further bit is only
* written while setting it could still yield a number that does not
* exceed max, given the lower bits already written.
*
* @param max the largest value that can occur
* @param value the value to write; presumably expected to be in 0..max (not checked here)
* @see #decodeBounded
*/
public final void encodeBounded( int max, int value )
{
int im = 1; // integer mask
// max is reduced by every set bit written, so "im <= max" compares
// the next bit against what is left of the bound
// NOTE(review): im is an int; for max >= 2^30 the shift wraps it to a
// negative number and then to 0, so the loop would not stop by itself -
// confirm callers only pass small bounds
while (im <= max)
{
if ( bm == 0x100 )
{
// current byte is full: continue in a fresh, zeroed one
bm = 1;
ab[++idx] = 0;
}
if ( ( value & im ) != 0 )
{
ab[idx] |= bm;
max -= im;
}
bm <<= 1;
im <<= 1;
}
}
/**
* decode an integer in the range 0..max (inclusive).
*
* Bits are read least significant first. Reading stops as soon as
* setting the next bit on top of the bits read so far would exceed max.
*
* @param max the largest value that can occur
* @return the decoded value
* @see #encodeBounded
*/
public final int decodeBounded( int max )
{
int value = 0;
int im = 1; // integer mask
// NOTE(review): im is an int; for max >= 2^30 the shift wraps it to a
// negative number and then to 0, so the loop would not stop by itself -
// confirm callers only pass small bounds
while (( value | im ) <= max)
{
if ( bm == 0x100 )
{
// current byte is used up: fetch the next one
bm = 1;
b = ab[++idx];
}
if ( ( b & bm ) != 0 )
value |= im;
bm <<= 1;
im <<= 1;
}
return value;
}
/**
 * Reports how many bytes of the buffer have been touched so far,
 * counting a partially used byte as a whole one.
 *
 * @return the encoded length in bytes
 */
public final int getEncodedLength()
{
  // idx is the index of the last byte touched
  final int lastByteIndex = idx;
  return lastByteIndex + 1;
}
/**
 * Reports how many bits have been written or read so far.
 *
 * @return the encoded length in bits
 */
public final long getBitPosition()
{
  // widen before shifting: "idx << 3" evaluated as int overflows
  // once idx reaches 2^28, i.e. for buffers of 256 MB and more
  final long wholeBytes = ( (long) idx ) << 3;

  // bm holds a single set bit in 1..0x100; its position is the number
  // of bits used in the current byte (8 for the initial state and for
  // a completed byte, which makes up for idx lagging one byte behind)
  final int bitsInCurrentByte = Integer.numberOfTrailingZeros( bm );

  return wholeBytes + bitsInCurrentByte;
}
}

View file

@ -9,11 +9,7 @@ public final class ByteArrayUnifier
public ByteArrayUnifier( int size, boolean validateImmutability )
{
this.size = size;
if ( !Boolean.getBoolean( "disableByteArrayUnifification" ) )
{
byteArrayCache = new byte[size][];
}
byteArrayCache = new byte[size][];
if ( validateImmutability ) crcCrosscheck = new int[size];
}
@ -26,33 +22,40 @@ public final class ByteArrayUnifier
*/
public byte[] unify( byte[] ab )
{
if ( byteArrayCache == null ) return ab;
int n = ab.length;
int crc = Crc32.crc( ab, 0, n );
int idx = (crc & 0xfffffff) % size;
byte[] abc = byteArrayCache[idx];
if ( abc != null && abc.length == n )
return unify( ab, 0, ab.length );
}
public byte[] unify( byte[] ab, int offset, int len )
{
int crc = Crc32.crc( ab, offset, len );
int idx = ( crc & 0xfffffff ) % size;
byte[] abc = byteArrayCache[idx];
if ( abc != null && abc.length == len )
{
int i = 0;
while (i < len)
{
int i = 0;
while( i < n )
{
if ( ab[i] != abc[i] ) break;
i++;
}
if ( i == n ) return abc;
if ( ab[offset + i] != abc[i] )
break;
i++;
}
if ( crcCrosscheck != null )
if ( i == len )
return abc;
}
if ( crcCrosscheck != null )
{
if ( byteArrayCache[idx] != null )
{
if ( byteArrayCache[idx] != null )
{
byte[] abold = byteArrayCache[idx];
int crcold = Crc32.crc( abold, 0, abold.length );
if ( crcold != crcCrosscheck[idx] ) throw new IllegalArgumentException( "ByteArrayUnifier: immutablity validation failed!" );
}
crcCrosscheck[idx] = crc;
byte[] abold = byteArrayCache[idx];
int crcold = Crc32.crc( abold, 0, abold.length );
if ( crcold != crcCrosscheck[idx] )
throw new IllegalArgumentException( "ByteArrayUnifier: immutablity validation failed!" );
}
byteArrayCache[idx] = ab;
return ab;
crcCrosscheck[idx] = crc;
}
byte[] nab = new byte[len];
System.arraycopy( ab, offset, nab, 0, len );
byteArrayCache[idx] = nab;
return nab;
}
}

View file

@ -10,10 +10,19 @@ public class ByteDataReader
{
protected byte[] ab;
protected int aboffset;
protected int aboffsetEnd;
/**
 * Creates a reader over the given array. The end offset is set
 * to the array length, or to 0 if no array is given.
 *
 * @param byteArray the data to read from, may be null
 */
public ByteDataReader( byte[] byteArray )
{
  ab = byteArray;
  if ( byteArray == null )
  {
    aboffsetEnd = 0;
  }
  else
  {
    aboffsetEnd = byteArray.length;
  }
}
/**
 * Creates a reader over the given array that starts at the given
 * offset. The end offset is set to the array length, or to 0 if
 * no array is given.
 *
 * @param byteArray the data to read from, may be null
 * @param offset the position of the first byte to read
 */
public ByteDataReader( byte[] byteArray, int offset )
{
  ab = byteArray;
  aboffset = offset;
  if ( byteArray == null )
  {
    aboffsetEnd = 0;
  }
  else
  {
    aboffsetEnd = byteArray.length;
  }
}
public final int readInt()
@ -57,6 +66,41 @@ public class ByteDataReader
return (short)( (i1 << 8) | i0);
}
/**
 * Reads a size value and computes where a data section of that
 * size, starting right behind the size value, ends.
 *
 * @return the pointer to the first byte after that section
 */
public int getEndPointer()
{
  // the read advances aboffset, so it has to happen before the addition
  final int sectionSize = readVarLengthUnsigned();
  final int sectionStart = aboffset;
  return sectionStart + sectionSize;
}
/**
 * Reads all bytes between the current position and the given end pointer.
 *
 * @param endPointer the offset of the first byte behind the data to read
 * @return the bytes read, or null if the current position already equals endPointer
 */
public byte[] readDataUntil( int endPointer )
{
  final int remaining = endPointer - aboffset;
  if ( remaining != 0 )
  {
    final byte[] section = new byte[remaining];
    readFully( section );
    return section;
  }
  return null;
}
/**
 * Reads a length value followed by that many bytes.
 *
 * @return the bytes read, or null if the length value is 0
 */
public byte[] readVarBytes()
{
  final int count = readVarLengthUnsigned();
  if ( count != 0 )
  {
    final byte[] payload = new byte[count];
    readFully( payload );
    return payload;
  }
  return null;
}
public final int readVarLengthSigned()
{
int v = readVarLengthUnsigned();
@ -83,6 +127,11 @@ public class ByteDataReader
aboffset += ta.length;
}
/**
 * @return true while the read position is before the end offset
 */
public boolean hasMoreData()
{
  return aboffsetEnd > aboffset;
}
@Override
public String toString()
{

View file

@ -6,33 +6,30 @@
package btools.util;
public final class ByteDataWriter
public class ByteDataWriter extends ByteDataReader
{
private byte[] ab;
private int aboffset;
public ByteDataWriter( byte[] byteArray )
{
ab = byteArray;
super ( byteArray );
}
public void writeInt( int v )
{
ab[aboffset++] = (byte)( (v >> 24) & 0xff );
ab[aboffset++] = (byte)( (v >> 24) & 0xff );
ab[aboffset++] = (byte)( (v >> 16) & 0xff );
ab[aboffset++] = (byte)( (v >> 8) & 0xff );
ab[aboffset++] = (byte)( (v >> 8) & 0xff );
ab[aboffset++] = (byte)( (v ) & 0xff );
}
public void writeLong( long v )
{
ab[aboffset++] = (byte)( (v >> 56) & 0xff );
ab[aboffset++] = (byte)( (v >> 56) & 0xff );
ab[aboffset++] = (byte)( (v >> 48) & 0xff );
ab[aboffset++] = (byte)( (v >> 40) & 0xff );
ab[aboffset++] = (byte)( (v >> 40) & 0xff );
ab[aboffset++] = (byte)( (v >> 32) & 0xff );
ab[aboffset++] = (byte)( (v >> 24) & 0xff );
ab[aboffset++] = (byte)( (v >> 24) & 0xff );
ab[aboffset++] = (byte)( (v >> 16) & 0xff );
ab[aboffset++] = (byte)( (v >> 8) & 0xff );
ab[aboffset++] = (byte)( (v >> 8) & 0xff );
ab[aboffset++] = (byte)( (v ) & 0xff );
}
@ -51,31 +48,85 @@ public final class ByteDataWriter
ab[aboffset++] = (byte)( (v >> 8) & 0xff );
ab[aboffset++] = (byte)( (v ) & 0xff );
}
/**
 * Appends the complete given array at the current write position.
 *
 * @param sa the bytes to append
 */
public void write( byte[] sa )
{
  final int count = sa.length;
  System.arraycopy( sa, 0, ab, aboffset, count );
  aboffset = aboffset + count;
}
/**
 * Appends a section of the given array at the current write position.
 *
 * @param sa the array to copy from
 * @param offset the index of the first byte to copy
 * @param len the number of bytes to copy
 */
public void write( byte[] sa, int offset, int len )
{
  final int target = aboffset;
  System.arraycopy( sa, offset, ab, target, len );
  aboffset = aboffset + len;
}
public void ensureCapacity( int len )
public void writeVarBytes( byte[] sa )
{
// TODO
if ( sa == null )
{
writeVarLengthUnsigned( 0 );
}
else
{
int len = sa.length;
writeVarLengthUnsigned( len );
write( sa, 0, len );
}
}
/**
 * Writes a direction flag together with an optional byte array:
 * first a var-length number that carries the array length in its
 * upper bits and the flag in its lowest bit, then the bytes.
 *
 * @param isReverse the flag stored in the lowest bit
 * @param sa the bytes to write; null is written like an empty array
 */
public void writeModeAndDesc( boolean isReverse, byte[] sa )
{
  final int descLength = ( sa != null ) ? sa.length : 0;

  int header = descLength << 1;
  if ( isReverse )
  {
    header |= 1;
  }
  writeVarLengthUnsigned( header );

  if ( descLength == 0 )
  {
    return; // nothing follows the header
  }
  write( sa, 0, descLength );
}
/**
 * @return a new array holding a copy of all bytes before the current write position
 */
public byte[] toByteArray()
{
  final int used = aboffset;
  final byte[] copy = new byte[used];
  System.arraycopy( ab, 0, copy, 0, used );
  return copy;
}
/**
 * Reserves a single byte and returns its offset.
 * Used in conjunction with injectSize
 * to efficiently write a size prefix.
 *
 * @return the offset of the placeholder
 */
public int writeSizePlaceHolder()
{
  final int placeholderOffset = aboffset;
  aboffset = placeholderOffset + 1;
  return placeholderOffset;
}
/**
 * Replaces the one-byte placeholder at the given offset by the
 * var-length encoded size of the data written behind it. If the
 * size needs more than one byte, that data is moved back first.
 * Afterwards the write position is behind the moved data.
 *
 * @param sizeoffset the offset of the placeholder
 */
public void injectSize( int sizeoffset )
{
  final int payloadLength = aboffset - sizeoffset - 1;

  // number of bytes the size takes, at 7 bits per byte
  int prefixLength = 1;
  for ( int rest = payloadLength >> 7; rest != 0; rest >>= 7 )
  {
    prefixLength++;
  }

  if ( prefixLength > 1 )
  {
    // the placeholder is too small: make room by shifting the data
    System.arraycopy( ab, sizeoffset + 1, ab, sizeoffset + prefixLength, payloadLength );
  }

  // write the size over the placeholder, then jump behind the data again
  aboffset = sizeoffset;
  writeVarLengthUnsigned( payloadLength );
  aboffset = sizeoffset + prefixLength + payloadLength;
}
public int writeVarLengthSigned( int v )
{
return writeVarLengthUnsigned( v < 0 ? ( (-v) << 1 ) | 1 : v << 1 );
@ -83,29 +134,21 @@ public final class ByteDataWriter
public int writeVarLengthUnsigned( int v )
{
int start = aboffset;
do
{
int i7 = v & 0x7f;
v >>= 7;
if ( v != 0 ) i7 |= 0x80;
int start = aboffset;
do
{
int i7 = v & 0x7f;
v >>= 7;
if ( v != 0 ) i7 |= 0x80;
ab[aboffset++] = (byte)( i7 & 0xff );
}
while( v != 0 );
return aboffset - start;
}
while( v != 0 );
return aboffset - start;
}
/**
 * @return the current write position, i.e. the number of bytes before it
 */
public int size()
{
  final int position = aboffset;
  return position;
}
/**
 * @return all bytes of the backing array as decimal numbers, e.g. "[ 1, 2 ]"
 */
@Override
public String toString()
{
  final StringBuilder text = new StringBuilder( "[" );
  String separator = " ";
  for ( byte value : ab )
  {
    text.append( separator ).append( (int) value );
    separator = ", ";
  }
  return text.append( " ]" ).toString();
}
}
}

View file

@ -1,8 +1,5 @@
package btools.util;
import java.util.Random;
import java.util.HashSet;
import org.junit.Assert;
import org.junit.Test;
@ -11,18 +8,46 @@ public class BitCoderContextTest
@Test
public void varBitsEncodeDecodeTest()
{
byte[] ab = new byte[4000];
BitCoderContext ctx = new BitCoderContext( ab );
for( int i=0; i<1000; i++ )
{
ctx.encodeVarBits( i );
byte[] ab = new byte[4000];
BitCoderContext ctx = new BitCoderContext( ab );
for ( int i = 0; i < 1000; i++ )
{
ctx.encodeVarBits( i );
}
ctx = new BitCoderContext( ab );
for( int i=0; i<1000; i++ )
{
ctx = new BitCoderContext( ab );
for ( int i = 0; i < 1000; i++ )
{
int value = ctx.decodeVarBits();
Assert.assertTrue( "distance value mismatch", value == i );
}
}
/**
 * Writes every value 0..max for every max in 1..999 with encodeBounded,
 * then reads all of them back with decodeBounded and compares.
 */
@Test
public void boundedEncodeDecodeTest()
{
  byte[] buffer = new byte[581969];

  BitCoderContext encoder = new BitCoderContext( buffer );
  for ( int max = 1; max < 1000; max++ )
  {
    int val = 0;
    while ( val <= max )
    {
      encoder.encodeBounded( max, val );
      val++;
    }
  }

  // a second context over the same buffer starts reading at the first bit
  BitCoderContext decoder = new BitCoderContext( buffer );
  for ( int max = 1; max < 1000; max++ )
  {
    for ( int expected = 0; expected <= max; expected++ )
    {
      int actual = decoder.decodeBounded( max );
      if ( actual == expected )
      {
        continue;
      }
      Assert.fail( "mismatch at max=" + max + " " + actual + "<>" + expected );
    }
  }
}
}