From 82defe5c5663ca0c28563f8a111d327c87726267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Mon, 8 May 2017 10:40:48 +0300 Subject: [PATCH] common/compress: internalize encoders, add length wrappers --- common/bitutil/compress.go | 72 ++++++++++++++++-------- common/bitutil/compress_fuzz.go | 24 ++++---- common/bitutil/compress_test.go | 99 +++++++++++++++++++++------------ 3 files changed, 125 insertions(+), 70 deletions(-) diff --git a/common/bitutil/compress.go b/common/bitutil/compress.go index a806c0e8b2..c057cee4a6 100644 --- a/common/bitutil/compress.go +++ b/common/bitutil/compress.go @@ -19,21 +19,21 @@ package bitutil import "errors" var ( - // ErrMissingData is returned from decompression if the byte referenced by + // errMissingData is returned from decompression if the byte referenced by // the bitset header overflows the input data. - ErrMissingData = errors.New("missing bytes on input") + errMissingData = errors.New("missing bytes on input") - // ErrUnreferencedData is returned from decompression if not all bytes were used + // errUnreferencedData is returned from decompression if not all bytes were used // up from the input data after decompressing it. - ErrUnreferencedData = errors.New("extra bytes on input") + errUnreferencedData = errors.New("extra bytes on input") - // ErrExceededTarget is returned from decompression if the bitset header has + // errExceededTarget is returned from decompression if the bitset header has // more bits defined than the number of target buffer space available. - ErrExceededTarget = errors.New("target data size exceeded") + errExceededTarget = errors.New("target data size exceeded") - // ErrZeroContent is returned from decompression if a data byte referenced in + // errZeroContent is returned from decompression if a data byte referenced in // the bitset header is actually a zero byte. 
- ErrZeroContent = errors.New("zero byte in input content") + errZeroContent = errors.New("zero byte in input content") ) // The compression algorithm implemented by CompressBytes and DecompressBytes is @@ -55,8 +55,20 @@ var ( // nonZeroBytes(data) contains the non-zero bytes of data in the same order // CompressBytes compresses the input byte slice according to the sparse bitset -// representation algorithm. +// representation algorithm. If the result is bigger than the original input, no +// compression is done. func CompressBytes(data []byte) []byte { + if out := bitsetEncodeBytes(data); len(out) < len(data) { + return out + } + cpy := make([]byte, len(data)) + copy(cpy, data) + return cpy +} + +// bitsetEncodeBytes compresses the input byte slice according to the sparse +// bitset representation algorithm. +func bitsetEncodeBytes(data []byte) []byte { // Empty slices get compressed to nil if len(data) == 0 { return nil @@ -81,27 +93,41 @@ func CompressBytes(data []byte) []byte { if len(nonZeroBytes) == 0 { return nil } - return append(CompressBytes(nonZeroBitset), nonZeroBytes...) + return append(bitsetEncodeBytes(nonZeroBitset), nonZeroBytes...) } -// DecompressBytes decompresses data with a known target size. In addition to the -// decompressed output, the function returns the length of compressed input data -// corresponding to the output as the input slice may be longer. +// DecompressBytes decompresses data with a known target size. If the input data +// matches the size of the target, it means no compression was done in the first +// place. func DecompressBytes(data []byte, target int) ([]byte, error) { - out, size, err := decompressBytes(data, target) + if len(data) > target { + return nil, errExceededTarget + } + if len(data) == target { + cpy := make([]byte, len(data)) + copy(cpy, data) + return cpy, nil + } + return bitsetDecodeBytes(data, target) +} + +// bitsetDecodeBytes decompresses data with a known target size. 
+func bitsetDecodeBytes(data []byte, target int) ([]byte, error) { + out, size, err := bitsetDecodePartialBytes(data, target) if err != nil { return nil, err } if size != len(data) { - return nil, ErrUnreferencedData + return nil, errUnreferencedData } return out, nil } -// decompressBytes decompresses data with a known target size. In addition to the -// decompressed output, the function returns the length of compressed input data -// corresponding to the output as the input slice may be longer. -func decompressBytes(data []byte, target int) ([]byte, int, error) { +// bitsetDecodePartialBytes decompresses data with a known target size, but does +// not enforce consuming all the input bytes. In addition to the decompressed +// output, the function returns the length of compressed input data corresponding +// to the output as the input slice may be longer. +func bitsetDecodePartialBytes(data []byte, target int) ([]byte, int, error) { // Sanity check 0 targets to avoid infinite recursion if target == 0 { return nil, 0, nil @@ -119,7 +145,7 @@ func decompressBytes(data []byte, target int) ([]byte, int, error) { return decomp, 0, nil } // Decompress the bitset of set bytes and distribute the non zero bytes - nonZeroBitset, ptr, err := decompressBytes(data, (target+7)/8) + nonZeroBitset, ptr, err := bitsetDecodePartialBytes(data, (target+7)/8) if err != nil { return nil, ptr, err } @@ -127,14 +153,14 @@ func decompressBytes(data []byte, target int) ([]byte, int, error) { if nonZeroBitset[i/8]&(1<= len(data) { - return nil, 0, ErrMissingData + return nil, 0, errMissingData } if i >= len(decomp) { - return nil, 0, ErrExceededTarget + return nil, 0, errExceededTarget } // Make sure the data is valid and push into the slot if data[ptr] == 0 { - return nil, 0, ErrZeroContent + return nil, 0, errZeroContent } decomp[i] = data[ptr] ptr++ diff --git a/common/bitutil/compress_fuzz.go b/common/bitutil/compress_fuzz.go index 2b7fe29775..1b87f50edc 100644 --- 
a/common/bitutil/compress_fuzz.go +++ b/common/bitutil/compress_fuzz.go @@ -20,36 +20,36 @@ package bitutil import "bytes" -// Fuzz implements a go-fuzz fuzzer method to test various compression method +// Fuzz implements a go-fuzz fuzzer method to test various encoding method // invocations. func Fuzz(data []byte) int { if len(data) == 0 { return -1 } if data[0]%2 == 0 { - return fuzzCompress(data[1:]) + return fuzzEncode(data[1:]) } - return fuzzDecompress(data[1:]) + return fuzzDecode(data[1:]) } -// fuzzCompress implements a go-fuzz fuzzer method to test the bit compression and -// decompression algorithm. -func fuzzCompress(data []byte) int { - proc, _ := DecompressBytes(CompressBytes(data), len(data)) +// fuzzEncode implements a go-fuzz fuzzer method to test the bitset encoding and +// decoding algorithm. +func fuzzEncode(data []byte) int { + proc, _ := bitsetDecodeBytes(bitsetEncodeBytes(data), len(data)) if !bytes.Equal(data, proc) { panic("content mismatch") } return 0 } -// fuzzDecompress implements a go-fuzz fuzzer method to test the bit decompression -// and recompression algorithm. -func fuzzDecompress(data []byte) int { - blob, err := DecompressBytes(data, 1024) +// fuzzDecode implements a go-fuzz fuzzer method to test the bit decoding and +// reencoding algorithm. +func fuzzDecode(data []byte) int { + blob, err := bitsetDecodeBytes(data, 1024) if err != nil { return 0 } - if comp := CompressBytes(blob); !bytes.Equal(comp, data) { + if comp := bitsetEncodeBytes(blob); !bytes.Equal(comp, data) { panic("content mismatch") } return 0 diff --git a/common/bitutil/compress_test.go b/common/bitutil/compress_test.go index ef38bc7b38..805ab0369d 100644 --- a/common/bitutil/compress_test.go +++ b/common/bitutil/compress_test.go @@ -24,8 +24,8 @@ import ( "github.com/ethereum/go-ethereum/common/hexutil" ) -// Tests that data compression and decompression works correctly. 
-func TestCompressCycle(t *testing.T) { +// Tests that data bitset encoding and decoding works and is bijective. +func TestEncodingCycle(t *testing.T) { tests := []string{ // Tests generated by go-fuzz to maximize code coverage "0x000000000000000000", @@ -50,7 +50,7 @@ func TestCompressCycle(t *testing.T) { for i, tt := range tests { data := hexutil.MustDecode(tt) - proc, err := DecompressBytes(CompressBytes(data), len(data)) + proc, err := bitsetDecodeBytes(bitsetEncodeBytes(data), len(data)) if err != nil { t.Errorf("test %d: failed to decompress compressed data: %v", i, err) continue @@ -61,8 +61,8 @@ func TestCompressCycle(t *testing.T) { } } -// Tests that data decompression works -func TestDecompress(t *testing.T) { +// Tests that data bitset decoding and re-encoding works and is bijective. +func TestDecodingCycle(t *testing.T) { tests := []struct { size int input string @@ -71,22 +71,22 @@ func TestDecompress(t *testing.T) { {size: 0, input: "0x"}, // Crashers generated by go-fuzz - {size: 0, input: "0x0020", fail: ErrUnreferencedData}, - {size: 0, input: "0x30", fail: ErrUnreferencedData}, - {size: 1, input: "0x00", fail: ErrUnreferencedData}, - {size: 2, input: "0x07", fail: ErrMissingData}, - {size: 1024, input: "0x8000", fail: ErrZeroContent}, + {size: 0, input: "0x0020", fail: errUnreferencedData}, + {size: 0, input: "0x30", fail: errUnreferencedData}, + {size: 1, input: "0x00", fail: errUnreferencedData}, + {size: 2, input: "0x07", fail: errMissingData}, + {size: 1024, input: "0x8000", fail: errZeroContent}, // Tests generated by go-fuzz to maximize code coverage - {size: 29490, input: "0x343137343733323134333839373334323073333930783e3078333930783e70706336346c65303e", fail: ErrMissingData}, - {size: 59395, input: "0x00", fail: ErrUnreferencedData}, - {size: 52574, input: "0x70706336346c65c0de", fail: ErrExceededTarget}, - {size: 42264, input: "0x07", fail: ErrMissingData}, - {size: 52, input: "0xa5045bad48f4", fail: ErrExceededTarget}, - {size: 52574, 
input: "0xc0de", fail: ErrMissingData}, + {size: 29490, input: "0x343137343733323134333839373334323073333930783e3078333930783e70706336346c65303e", fail: errMissingData}, + {size: 59395, input: "0x00", fail: errUnreferencedData}, + {size: 52574, input: "0x70706336346c65c0de", fail: errExceededTarget}, + {size: 42264, input: "0x07", fail: errMissingData}, + {size: 52, input: "0xa5045bad48f4", fail: errExceededTarget}, + {size: 52574, input: "0xc0de", fail: errMissingData}, {size: 52574, input: "0x"}, - {size: 29490, input: "0x34313734373332313433383937333432307333393078073034333839373334323073333930783e3078333937333432307333393078073061333930783e70706336346c65303e", fail: ErrMissingData}, - {size: 29491, input: "0x3973333930783e30783e", fail: ErrMissingData}, + {size: 29490, input: "0x34313734373332313433383937333432307333393078073034333839373334323073333930783e3078333937333432307333393078073061333930783e70706336346c65303e", fail: errMissingData}, + {size: 29491, input: "0x3973333930783e30783e", fail: errMissingData}, {size: 1024, input: "0x808080608080"}, {size: 1024, input: "0x808470705e3632383337363033313434303137393130306c6580ef46806380635a80"}, @@ -101,37 +101,66 @@ func TestDecompress(t *testing.T) { for i, tt := range tests { data := hexutil.MustDecode(tt.input) - orig, err := DecompressBytes(data, tt.size) + orig, err := bitsetDecodeBytes(data, tt.size) if err != tt.fail { t.Errorf("test %d: failure mismatch: have %v, want %v", i, err, tt.fail) } if err != nil { continue } - if comp := CompressBytes(orig); !bytes.Equal(comp, data) { + if comp := bitsetEncodeBytes(orig); !bytes.Equal(comp, data) { t.Errorf("test %d: decompress/compress mismatch: have %x, want %x", i, comp, data) } } } +// TestCompression tests that compression works by returning either the bitset +// encoded input, or the actual input if the bitset version is longer. 
+func TestCompression(t *testing.T) { + // Check that the compression returns the bitset encoding if it is shorter + in := hexutil.MustDecode("0x4912385c0e7b64000000") + out := hexutil.MustDecode("0x80fe4912385c0e7b64") + + if data := CompressBytes(in); bytes.Compare(data, out) != 0 { + t.Errorf("encoding mismatch for sparse data: have %x, want %x", data, out) + } + if data, err := DecompressBytes(out, len(in)); err != nil || bytes.Compare(data, in) != 0 { + t.Errorf("decoding mismatch for sparse data: have %x, want %x, error %v", data, in, err) + } + // Check that the compression returns the input if the bitset encoding is longer + in = hexutil.MustDecode("0xdf7070533534333636313639343638373532313536346c1bc33339343837313070706336343035336336346c65fefb3930393233383838ac2f65fefb") + out = hexutil.MustDecode("0xdf7070533534333636313639343638373532313536346c1bc33339343837313070706336343035336336346c65fefb3930393233383838ac2f65fefb") + + if data := CompressBytes(in); bytes.Compare(data, out) != 0 { + t.Errorf("encoding mismatch for dense data: have %x, want %x", data, out) + } + if data, err := DecompressBytes(out, len(in)); err != nil || bytes.Compare(data, in) != 0 { + t.Errorf("decoding mismatch for dense data: have %x, want %x, error %v", data, in, err) + } + // Check that decompressing a longer input than the target fails + if _, err := DecompressBytes([]byte{0xc0, 0x01, 0x01}, 2); err != errExceededTarget { + t.Errorf("decoding error mismatch for long data: have %v, want %v", err, errExceededTarget) + } +} + // Crude benchmark for compressing random slices of bytes. 
-func BenchmarkCompress1KBVerySparse(b *testing.B) { benchmarkCompress(b, 1024, 0.0001) } -func BenchmarkCompress2KBVerySparse(b *testing.B) { benchmarkCompress(b, 2048, 0.0001) } -func BenchmarkCompress4KBVerySparse(b *testing.B) { benchmarkCompress(b, 4096, 0.0001) } +func BenchmarkEncoding1KBVerySparse(b *testing.B) { benchmarkEncoding(b, 1024, 0.0001) } +func BenchmarkEncoding2KBVerySparse(b *testing.B) { benchmarkEncoding(b, 2048, 0.0001) } +func BenchmarkEncoding4KBVerySparse(b *testing.B) { benchmarkEncoding(b, 4096, 0.0001) } -func BenchmarkCompress1KBSparse(b *testing.B) { benchmarkCompress(b, 1024, 0.001) } -func BenchmarkCompress2KBSparse(b *testing.B) { benchmarkCompress(b, 2048, 0.001) } -func BenchmarkCompress4KBSparse(b *testing.B) { benchmarkCompress(b, 4096, 0.001) } +func BenchmarkEncoding1KBSparse(b *testing.B) { benchmarkEncoding(b, 1024, 0.001) } +func BenchmarkEncoding2KBSparse(b *testing.B) { benchmarkEncoding(b, 2048, 0.001) } +func BenchmarkEncoding4KBSparse(b *testing.B) { benchmarkEncoding(b, 4096, 0.001) } -func BenchmarkCompress1KBDense(b *testing.B) { benchmarkCompress(b, 1024, 0.1) } -func BenchmarkCompress2KBDense(b *testing.B) { benchmarkCompress(b, 2048, 0.1) } -func BenchmarkCompress4KBDense(b *testing.B) { benchmarkCompress(b, 4096, 0.1) } +func BenchmarkEncoding1KBDense(b *testing.B) { benchmarkEncoding(b, 1024, 0.1) } +func BenchmarkEncoding2KBDense(b *testing.B) { benchmarkEncoding(b, 2048, 0.1) } +func BenchmarkEncoding4KBDense(b *testing.B) { benchmarkEncoding(b, 4096, 0.1) } -func BenchmarkCompress1KBSaturated(b *testing.B) { benchmarkCompress(b, 1024, 0.5) } -func BenchmarkCompress2KBSaturated(b *testing.B) { benchmarkCompress(b, 2048, 0.5) } -func BenchmarkCompress4KBSaturated(b *testing.B) { benchmarkCompress(b, 4096, 0.5) } +func BenchmarkEncoding1KBSaturated(b *testing.B) { benchmarkEncoding(b, 1024, 0.5) } +func BenchmarkEncoding2KBSaturated(b *testing.B) { benchmarkEncoding(b, 2048, 0.5) } +func 
BenchmarkEncoding4KBSaturated(b *testing.B) { benchmarkEncoding(b, 4096, 0.5) } -func benchmarkCompress(b *testing.B, bytes int, fill float64) { +func benchmarkEncoding(b *testing.B, bytes int, fill float64) { // Generate a random slice of bytes to compress random := rand.NewSource(0) // reproducible and comparable @@ -143,10 +172,10 @@ func benchmarkCompress(b *testing.B, bytes int, fill float64) { bit := uint(random.Int63() % 8) data[idx] |= 1 << bit } - // Reset the benchmark and measure compression/decompression + // Reset the benchmark and measure encoding/decoding b.ResetTimer() b.ReportAllocs() for i := 0; i < b.N; i++ { - DecompressBytes(CompressBytes(data), len(data)) + bitsetDecodeBytes(bitsetEncodeBytes(data), len(data)) } }