Skip to content

Commit

Permalink
deflate: Improve level 5+6 compression (#367)
Browse files Browse the repository at this point in the history
Improve deflate level 5+6 compression by checking an additional hash when the best found match ends.

This improves compression in most cases at an acceptable speed loss, which brings it well in line with the surrounding levels.

Level 5 tries one hash at length < 30 and level 6 tries 2 at all lengths.

Before/after pairs...

```
file	out	level	insize	outsize	millis	mb/s
github-june-2days-2019.json	gzkp	5	6273951764	963122453	31498	189.96
github-june-2days-2019.json	gzkp	5	6273951764	947567306	32795	182.45

github-june-2days-2019.json	gzkp	6	6273951764	949824639	34851	171.68
github-june-2days-2019.json	gzkp	6	6273951764	930428507	37312	160.35

nyc-taxi-data-10M.csv	gzkp	5	3325605752	785784479	24729	128.25
nyc-taxi-data-10M.csv	gzkp	5	3325605752	779343831	27189	116.65

nyc-taxi-data-10M.csv	gzkp	6	3325605752	775719630	26690	118.83
nyc-taxi-data-10M.csv	gzkp	6	3325605752	768153050	29905	106.05

enwik9	gzkp	5	1000000000	338823570	10477	91.02
enwik9	gzkp	5	1000000000	337489137	11353	84.00

enwik9	gzkp	6	1000000000	336549505	10791	88.37
enwik9	gzkp	6	1000000000	334933748	11961	79.73

gob-stream	gzkp	5	1911399616	309832207	8596	212.03
gob-stream	gzkp	5	1911399616	307765377	9101	200.28

gob-stream	gzkp	6	1911399616	308962175	9626	189.35
gob-stream	gzkp	6	1911399616	301609641	10305	176.88
```
  • Loading branch information
klauspost authored Apr 26, 2021
1 parent 5fe15d9 commit ba2263c
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
17 changes: 16 additions & 1 deletion flate/level5.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,27 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.

// Extend the 4-byte match as long as possible.
if l == 0 {
// Extend the 4-byte match as long as possible.
l = e.matchlenLong(s+4, t+4, src) + 4
} else if l == maxMatchLength {
l += e.matchlenLong(s+l, t+l, src)
}

// Try to locate a better match by checking the end of best match...
if sAt := s + l; l < 30 && sAt < sLimit {
eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
// Test current
t2 := eLong - e.cur - l
off := s - t2
if t2 >= 0 && off < maxMatchOffset && off > 0 {
if l2 := e.matchlenLong(s, t2, src); l2 > l {
t = t2
l = l2
}
}
}

// Extend backwards
for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
s--
Expand Down
25 changes: 25 additions & 0 deletions flate/level6.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,31 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
l += e.matchlenLong(s+l, t+l, src)
}

// Try to locate a better match by checking the end-of-match...
if sAt := s + l; sAt < sLimit {
eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]
// Test current
t2 := eLong.Cur.offset - e.cur - l
off := s - t2
if off < maxMatchOffset {
if off > 0 && t2 >= 0 {
if l2 := e.matchlenLong(s, t2, src); l2 > l {
t = t2
l = l2
}
}
// Test the previous entry (eLong.Prev) as the next candidate:
t2 = eLong.Prev.offset - e.cur - l
off := s - t2
if off > 0 && off < maxMatchOffset && t2 >= 0 {
if l2 := e.matchlenLong(s, t2, src); l2 > l {
t = t2
l = l2
}
}
}
}

// Extend backwards
for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
s--
Expand Down

0 comments on commit ba2263c

Please sign in to comment.