Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perf/compression zero in alphabet #883

Closed
wants to merge 15 commits into from
Closed
70 changes: 12 additions & 58 deletions std/compress/lzss_v1/compress.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
package lzss_v1

import (
"bytes"
"encoding/binary"
"fmt"
"github.com/consensys/gnark/std/compress"
"index/suffixarray"
"math/bits"

Expand All @@ -21,33 +20,30 @@ import (
// In fact, DEFLATE is LZSS + Huffman coding. It is implemented in gzip which is the standard tool for compressing programmatic data.
// For more information, refer to Bill Bird's fantastic undergraduate course on Data Compression
// In particular those on the LZ family: https://youtu.be/z1I1o7zySUI and DEFLATE: https://youtu.be/SJPvNi4HrWQ
func Compress(d []byte, settings Settings) (c []byte, err error) {
func Compress(d []byte, settings Settings) (c compress.Stream, err error) {
// d[i < 0] = Settings.BackRefSettings.Symbol by convention
var out bytes.Buffer
c.NbSymbs = 257

emitBackRef := func(offset, length int) {
out.WriteByte(0)
emit(&out, offset-1, settings.NbBytesAddress)
emit(&out, length-1, settings.NbBytesLength)
c.D = append(c.D, 256)
emit(&c.D, offset-1, settings.NbBytesAddress)
emit(&c.D, length-1, settings.NbBytesLength)
}
compressor := newCompressor(d, settings)
i := 0
i := int(settings.StartAt)
for i < len(d) {
addr, length := compressor.longestMostRecentBackRef(i)
if length == -1 {
// no backref found
if d[i] == 0 {
return nil, fmt.Errorf("could not find an RLE backref at index %d", i)
}
out.WriteByte(d[i])
c.D = append(c.D, int(d[i]))
i++
continue
}
emitBackRef(i-addr, length)
i += length
}

return out.Bytes(), nil
return
}

type compressor struct {
Expand Down Expand Up @@ -91,38 +87,6 @@ func (compressor *compressor) longestMostRecentBackRef(i int) (addr, length int)
windowStart := utils.Max(0, minBackRefAddr)
endWindow := utils.Min(i+brAddressRange, len(d))

if d[i] == 0 { // RLE; prune the options
// we can't encode 0 as is, so we must find a backref.

// runLen := compressor.countZeroes(i, brLengthRange) // utils.Min(getRunLength(d, i), brLengthRange)
runLen := utils.Min(compressor.longestZeroPrefix[i], brLengthRange)

backrefAddr := -1
backrefLen := -1
for j := i - 1; j >= windowStart; j-- {
n := utils.Min(compressor.longestZeroPrefix[j], runLen)
if n == 0 {
continue
}
// check if we can make this backref longer
m := matchLen(d[i+n:endWindow], d[j+n:]) + n

if m > backrefLen {
if m >= brLengthRange {
// we can stop we won't find a longer backref
return j, brLengthRange
}
backrefLen = m
backrefAddr = j
}
}
if (backrefLen == -1 && minBackRefAddr < 0) || (backrefLen != -1 && minBackRefAddr < 0 && backrefLen < -minBackRefAddr) {
backrefAddr = minBackRefAddr
backrefLen = utils.Min(runLen, -minBackRefAddr)
}
return backrefAddr, backrefLen
}

// else -->
// d[i] != 0

Expand Down Expand Up @@ -158,16 +122,6 @@ func (compressor *compressor) longestMostRecentBackRef(i int) (addr, length int)

}

// countZeroes returns the length of the run of zero bytes at the start of a,
// capped at maxCount.
func countZeroes(a []byte, maxCount int) (count int) {
	limit := maxCount
	if len(a) < limit {
		limit = len(a)
	}
	for count < limit && a[count] == 0 {
		count++
	}
	return
}

// matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two.
func matchLen(a, b []byte) (n int) {
Expand All @@ -189,10 +143,10 @@ func matchLen(a, b []byte) (n int) {

}

func emit(bb *bytes.Buffer, n int, nbBytes uint) {
func emit(bb *[]int, n int, nbBytes uint) {
for i := uint(0); i < nbBytes; i++ {
bb.WriteByte(byte(n))
n >>= 8
*bb = append(*bb, n%257)
n /= 257
}
if n != 0 {
panic("n does not fit in nbBytes")
Expand Down
114 changes: 26 additions & 88 deletions std/compress/lzss_v1/compress_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,13 @@ import (
"bytes"
"encoding/hex"
"fmt"
"io"
"github.com/stretchr/testify/assert"
"os"
"strings"
"testing"

"github.com/consensys/gnark/std/compress"
"github.com/consensys/gnark/std/compress/huffman"
"github.com/klauspost/compress/s2"
"github.com/klauspost/compress/zstd"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand All @@ -26,37 +23,41 @@ func testCompressionRoundTrip(t *testing.T, nbBytesAddress uint, d []byte, testC
d, err = os.ReadFile("../test_cases/" + testCaseName[0] + "/data.bin")
require.NoError(t, err)
}
const contextSize = 256
d = append(make([]byte, contextSize), d...)
settings := Settings{
BackRefSettings: BackRefSettings{
NbBytesAddress: nbBytesAddress,
NbBytesLength: 1,
},
StartAt: 256,
}
c, err := Compress(d, settings)
require.NoError(t, err)

if len(testCaseName) == 1 {
assert.NoError(t, os.WriteFile("../test_cases/"+testCaseName[0]+"/data.lzssv1", c, 0600))
assert.NoError(t, os.WriteFile("../test_cases/"+testCaseName[0]+"/data.lzssv1", c.Write(), 0600))
}
cStream := compress.NewStreamFromBytes(c)
cHuff := huffman.Encode(cStream)
fmt.Println("Size Compression ratio:", float64(len(d))/float64(len(c)))
fmt.Println("Estimated Compression ratio (with Huffman):", float64(8*len(d))/float64(len(cHuff.D)))
if len(c) > 1024 {
fmt.Printf("Compressed size: %dKB\n", int(float64(len(c)*100)/1024)/100)

cHuff := huffman.Encode(c)
fmt.Println("Size Compression ratio:", float64(len(d)-contextSize)/float64(c.Len()))
fmt.Println("Estimated Compression ratio (with Huffman):", float64(8*(len(d)-contextSize))/float64(len(cHuff.D)))
if c.Len() > 1024 {
fmt.Printf("Compressed size: %dKB\n", int(float64(c.Len()*100)/1024)/100)
fmt.Printf("Compressed size (with Huffman): %dKB\n", int(float64(len(cHuff.D)*100)/8192)/100)
}
require.NoError(t, err)

dBack, err := DecompressPureGo(c, settings)
require.NoError(t, err)

if len(c) < 1024 {
printHex(c)
assert.Equal(t, len(d)-contextSize, len(dBack))
for i := range dBack {
require.Equal(t, d[contextSize+i], dBack[i], i)
}

require.Equal(t, d, dBack)
//require.Equal(t, d[contextSize:], dBack)

// store huffman code lengths
lens := huffman.GetCodeLengths(cStream)
lens := huffman.GetCodeLengths(c)
var sbb strings.Builder
sbb.WriteString("symbol,code-length\n")
for i := range lens {
Expand Down Expand Up @@ -119,20 +120,6 @@ func TestLongBackrefBug(t *testing.T) {
testCompressionRoundTrip(t, 2, nil, "bug")
}

// printHex writes d to stdout as lowercase hex, 32 bytes per row,
// each row prefixed with its starting byte offset, e.g. "[32]: ...".
// Intended as a debugging aid for small compressed outputs.
func printHex(d []byte) {
	for i := range d {
		if i%32 == 0 {
			fmt.Printf("\n[%d]: ", i)
		}
		// %02x zero-pads single-digit bytes, replacing the manual
		// "prepend 0 when len==1" string surgery of the original.
		fmt.Printf("%02x", d[i])
	}
	fmt.Println()
}

func TestAverageBatch(t *testing.T) {
assert := require.New(t)

Expand Down Expand Up @@ -221,88 +208,39 @@ func BenchmarkAverageBatch(b *testing.B) {
}

type compressResult struct {
compressed []byte
compressed compress.Stream
inputSize int
outputSize int
ratio float64
}

// decompressWithS2 decompresses an s2-compressed byte slice fully into memory
// and returns the decompressed data together with any copy error.
func decompressWithS2(data []byte) ([]byte, error) {
	var out bytes.Buffer
	_, err := io.Copy(&out, s2.NewReader(bytes.NewReader(data)))
	return out.Bytes(), err
}

// compressWithS2 compresses data with the s2 codec and reports the resulting
// size statistics. Unlike the previous version, Write and Close errors are
// propagated instead of being silently dropped — Close flushes buffered data,
// so ignoring its error could report a truncated stream as a success.
func compressWithS2(data []byte) (compressResult, error) {
	var buf bytes.Buffer
	w := s2.NewWriter(&buf)
	if _, err := w.Write(data); err != nil {
		w.Close()
		return compressResult{}, err
	}
	if err := w.Close(); err != nil {
		return compressResult{}, err
	}

	res := compressResult{
		compressed: make([]byte, buf.Len()),
		inputSize:  len(data),
		outputSize: buf.Len(),
		ratio:      float64(len(data)) / float64(buf.Len()),
	}
	// Detach from buf's backing array so later buffer reuse cannot alias it.
	copy(res.compressed, buf.Bytes())
	return res, nil
}

// decompressWithZstd decompresses a zstd-compressed byte slice fully into
// memory. The decoder is closed before returning — the previous version
// leaked it, leaving the decoder's worker goroutines and buffers alive.
func decompressWithZstd(data []byte) ([]byte, error) {
	r, err := zstd.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	defer r.Close() // releases decoder goroutines and buffers

	var dst bytes.Buffer
	_, err = io.Copy(&dst, r)
	return dst.Bytes(), err
}

// compressWithZstd compresses data with the zstd codec and reports the
// resulting size statistics. Unlike the previous version, Write and Close
// errors are propagated instead of being silently dropped — Close flushes
// the final frame, so ignoring its error could report a truncated stream
// as a success.
func compressWithZstd(data []byte) (compressResult, error) {
	var buf bytes.Buffer

	w, err := zstd.NewWriter(&buf)
	if err != nil {
		return compressResult{}, err
	}
	if _, err := w.Write(data); err != nil {
		w.Close()
		return compressResult{}, err
	}
	if err := w.Close(); err != nil {
		return compressResult{}, err
	}

	res := compressResult{
		compressed: make([]byte, buf.Len()),
		inputSize:  len(data),
		outputSize: buf.Len(),
		ratio:      float64(len(data)) / float64(buf.Len()),
	}
	// Detach from buf's backing array so later buffer reuse cannot alias it.
	copy(res.compressed, buf.Bytes())
	return res, nil
}

func decompresslzss_v1(data []byte) ([]byte, error) {
// decompresslzss_v1 decompresses an lzss_v1 stream using the fixed settings
// shared by the benchmarks: 2-byte addresses, 1-byte lengths, and a 256-symbol
// context prefix (StartAt).
func decompresslzss_v1(data compress.Stream) ([]byte, error) {
	settings := Settings{
		BackRefSettings: BackRefSettings{
			NbBytesAddress: 2,
			NbBytesLength:  1,
		},
		StartAt: 256,
	}
	return DecompressPureGo(data, settings)
}

func compresslzss_v1(data []byte) (compressResult, error) {
const contextSize = 256
data = append(make([]byte, contextSize), data...)
c, err := Compress(data, Settings{
BackRefSettings: BackRefSettings{
NbBytesAddress: 2,
NbBytesLength: 1,
},
StartAt: 256,
})
if err != nil {
return compressResult{}, err
}
return compressResult{
compressed: c,
inputSize: len(data),
outputSize: len(c),
ratio: float64(len(data)) / float64(len(c)),
outputSize: c.Len(),
ratio: float64(len(data)) / float64(c.Len()),
}, nil
}
Loading