Skip to content

Commit

Permalink
s2: Improve better compression (#359)
Browse files Browse the repository at this point in the history
Index more entries at beginning and end of matches.

Before/after (in each pair, the first line is before this change and the second is after):
```
Reading d:\compress\enwik9...
Compressing... 1000000000 -> 426242923 [42.62%]; 225ms, 4244.2MB/s
Compressing... 1000000000 -> 418428483 [41.84%]; 230ms, 4154.8MB/s

Reading d:\compress\adresser.json...
Compressing... 7983034785 -> 457984083 [5.74%]; 507ms, 15018.9MB/s
Compressing... 7983034785 -> 453666648 [5.68%]; 494ms, 15424.7MB/s

Reading d:\compress\nyc-taxi-data-10M.csv...
Compressing... 3325605752 -> 935022950 [28.12%]; 462ms, 6870.2MB/s
Compressing... 3325605752 -> 917905514 [27.60%]; 460ms, 6889.8MB/s
```
  • Loading branch information
klauspost authored Apr 22, 2021
1 parent e3ae23b commit 6522991
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 13 deletions.
29 changes: 26 additions & 3 deletions s2/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -1284,48 +1284,71 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
cv := GP64()
INCL(base)
MOVQ(Mem{Base: src, Index: base, Scale: 1, Disp: 0}, cv)
hash0, hash1 := GP64(), GP64()
hash0, hash1, hash2, hash3 := GP64(), GP64(), GP64(), GP64()
MOVQ(cv, hash0) // src[base+1]
MOVQ(cv, hash1)
MOVQ(cv, hash2)
SHRQ(U8(8), hash1) // src[base+2]
bp1 := GP32() // base+1
MOVQ(hash1, hash3)
SHRQ(U8(16), hash2) // src[base+3]
bp1, bp2 := GP32(), GP32() // base+1, base+2
LEAL(Mem{Base: base, Disp: 1}, bp1)
LEAL(Mem{Base: base, Disp: 2}, bp2)

// Load s-2 early
MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv)

lHasher.hash(hash0)
lHasher.hash(hash3)
sHasher.hash(hash1)
sHasher.hash(hash2)
assert(func(ok LabelRef) {
CMPQ(hash0, U32(lTableSize))
JL(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash3, U32(lTableSize))
JL(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash1, U32(sTableSize))
JL(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash2, U32(sTableSize))
JL(ok)
})
MOVL(base, lTab.Idx(hash0, 4))
MOVL(bp1, lTab.Idx(hash3, 4))
MOVL(bp1, sTab.Idx(hash1, 4))
MOVL(bp2, sTab.Idx(hash2, 4))

// Index s-2 long, s-1 short...
// Index s-2 long, s-1 long+short...
MOVQ(cv, hash0) // src[s-2]
MOVQ(cv, hash1) // src[s-1]
SHRQ(U8(8), hash1)
MOVQ(hash1, hash3)
sm1, sm2 := GP32(), GP32() // s -1, s - 2
LEAL(Mem{Base: s, Disp: -2}, sm2)
LEAL(Mem{Base: s, Disp: -1}, sm1)
lHasher.hash(hash0)
sHasher.hash(hash1)
lHasher.hash(hash3)
assert(func(ok LabelRef) {
CMPQ(hash0, U32(lTableSize))
JL(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash3, U32(lTableSize))
JL(ok)
})
assert(func(ok LabelRef) {
CMPQ(hash1, U32(sTableSize))
JL(ok)
})
MOVL(sm2, lTab.Idx(hash0, 4))
MOVL(sm1, sTab.Idx(hash1, 4))
MOVL(sm1, lTab.Idx(hash3, 4))
}
JMP(LabelRef("search_loop_" + name))

Expand Down
4 changes: 4 additions & 0 deletions s2/encode_better.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
cv = load64(src, s)
continue
}

d += emitLiteral(dst[d:], src[nextEmit:base])
if repeat == offset {
d += emitRepeat(dst[d:], offset, s-base)
Expand All @@ -219,8 +220,11 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
}

Expand Down
105 changes: 95 additions & 10 deletions s2/encodeblock_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -5806,20 +5806,33 @@ match_nolit_dst_ok_encodeBetterBlockAsm:
MOVQ (DX)(SI*1), R8
MOVQ R8, R9
MOVQ R8, R10
MOVQ R8, R11
SHRQ $0x08, R10
LEAL 1(SI), R11
MOVQ R10, R12
SHRQ $0x10, R11
LEAL 1(SI), R13
LEAL 2(SI), R14
MOVQ -2(DX)(CX*1), R8
SHLQ $0x08, R9
IMULQ BP, R9
SHRQ $0x30, R9
SHLQ $0x08, R12
IMULQ BP, R12
SHRQ $0x30, R12
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x32, R10
SHLQ $0x20, R11
IMULQ DI, R11
SHRQ $0x32, R11
MOVL SI, 24(SP)(R9*4)
MOVL R11, 262168(SP)(R10*4)
MOVL R13, 24(SP)(R12*4)
MOVL R13, 262168(SP)(R10*4)
MOVL R14, 262168(SP)(R11*4)
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x08, R10
MOVQ R10, R12
LEAL -2(CX), R8
LEAL -1(CX), SI
SHLQ $0x08, R9
Expand All @@ -5828,8 +5841,12 @@ match_nolit_dst_ok_encodeBetterBlockAsm:
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x32, R10
SHLQ $0x08, R12
IMULQ BP, R12
SHRQ $0x30, R12
MOVL R8, 24(SP)(R9*4)
MOVL SI, 262168(SP)(R10*4)
MOVL SI, 24(SP)(R12*4)
JMP search_loop_encodeBetterBlockAsm

emit_remainder_encodeBetterBlockAsm:
Expand Down Expand Up @@ -6718,20 +6735,33 @@ match_nolit_dst_ok_encodeBetterBlockAsm4MB:
MOVQ (DX)(SI*1), R8
MOVQ R8, R9
MOVQ R8, R10
MOVQ R8, R11
SHRQ $0x08, R10
LEAL 1(SI), R11
MOVQ R10, R12
SHRQ $0x10, R11
LEAL 1(SI), R13
LEAL 2(SI), R14
MOVQ -2(DX)(CX*1), R8
SHLQ $0x08, R9
IMULQ BP, R9
SHRQ $0x30, R9
SHLQ $0x08, R12
IMULQ BP, R12
SHRQ $0x30, R12
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x32, R10
SHLQ $0x20, R11
IMULQ DI, R11
SHRQ $0x32, R11
MOVL SI, 24(SP)(R9*4)
MOVL R11, 262168(SP)(R10*4)
MOVL R13, 24(SP)(R12*4)
MOVL R13, 262168(SP)(R10*4)
MOVL R14, 262168(SP)(R11*4)
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x08, R10
MOVQ R10, R12
LEAL -2(CX), R8
LEAL -1(CX), SI
SHLQ $0x08, R9
Expand All @@ -6740,8 +6770,12 @@ match_nolit_dst_ok_encodeBetterBlockAsm4MB:
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x32, R10
SHLQ $0x08, R12
IMULQ BP, R12
SHRQ $0x30, R12
MOVL R8, 24(SP)(R9*4)
MOVL SI, 262168(SP)(R10*4)
MOVL SI, 24(SP)(R12*4)
JMP search_loop_encodeBetterBlockAsm4MB

emit_remainder_encodeBetterBlockAsm4MB:
Expand Down Expand Up @@ -7487,20 +7521,33 @@ match_nolit_dst_ok_encodeBetterBlockAsm12B:
MOVQ (DX)(SI*1), R8
MOVQ R8, R9
MOVQ R8, R10
MOVQ R8, R11
SHRQ $0x08, R10
LEAL 1(SI), R11
MOVQ R10, R12
SHRQ $0x10, R11
LEAL 1(SI), R13
LEAL 2(SI), R14
MOVQ -2(DX)(CX*1), R8
SHLQ $0x10, R9
IMULQ BP, R9
SHRQ $0x32, R9
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x32, R12
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x34, R10
SHLQ $0x20, R11
IMULQ DI, R11
SHRQ $0x34, R11
MOVL SI, 24(SP)(R9*4)
MOVL R11, 65560(SP)(R10*4)
MOVL R13, 24(SP)(R12*4)
MOVL R13, 65560(SP)(R10*4)
MOVL R14, 65560(SP)(R11*4)
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x08, R10
MOVQ R10, R12
LEAL -2(CX), R8
LEAL -1(CX), SI
SHLQ $0x10, R9
Expand All @@ -7509,8 +7556,12 @@ match_nolit_dst_ok_encodeBetterBlockAsm12B:
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x34, R10
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x32, R12
MOVL R8, 24(SP)(R9*4)
MOVL SI, 65560(SP)(R10*4)
MOVL SI, 24(SP)(R12*4)
JMP search_loop_encodeBetterBlockAsm12B

emit_remainder_encodeBetterBlockAsm12B:
Expand Down Expand Up @@ -8245,20 +8296,33 @@ match_nolit_dst_ok_encodeBetterBlockAsm10B:
MOVQ (DX)(SI*1), R8
MOVQ R8, R9
MOVQ R8, R10
MOVQ R8, R11
SHRQ $0x08, R10
LEAL 1(SI), R11
MOVQ R10, R12
SHRQ $0x10, R11
LEAL 1(SI), R13
LEAL 2(SI), R14
MOVQ -2(DX)(CX*1), R8
SHLQ $0x10, R9
IMULQ BP, R9
SHRQ $0x34, R9
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x34, R12
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x36, R10
SHLQ $0x20, R11
IMULQ DI, R11
SHRQ $0x36, R11
MOVL SI, 24(SP)(R9*4)
MOVL R11, 16408(SP)(R10*4)
MOVL R13, 24(SP)(R12*4)
MOVL R13, 16408(SP)(R10*4)
MOVL R14, 16408(SP)(R11*4)
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x08, R10
MOVQ R10, R12
LEAL -2(CX), R8
LEAL -1(CX), SI
SHLQ $0x10, R9
Expand All @@ -8267,8 +8331,12 @@ match_nolit_dst_ok_encodeBetterBlockAsm10B:
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x36, R10
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x34, R12
MOVL R8, 24(SP)(R9*4)
MOVL SI, 16408(SP)(R10*4)
MOVL SI, 24(SP)(R12*4)
JMP search_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
Expand Down Expand Up @@ -8993,20 +9061,33 @@ match_nolit_dst_ok_encodeBetterBlockAsm8B:
MOVQ (DX)(SI*1), R8
MOVQ R8, R9
MOVQ R8, R10
MOVQ R8, R11
SHRQ $0x08, R10
LEAL 1(SI), R11
MOVQ R10, R12
SHRQ $0x10, R11
LEAL 1(SI), R13
LEAL 2(SI), R14
MOVQ -2(DX)(CX*1), R8
SHLQ $0x10, R9
IMULQ BP, R9
SHRQ $0x36, R9
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x36, R12
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x38, R10
SHLQ $0x20, R11
IMULQ DI, R11
SHRQ $0x38, R11
MOVL SI, 24(SP)(R9*4)
MOVL R11, 4120(SP)(R10*4)
MOVL R13, 24(SP)(R12*4)
MOVL R13, 4120(SP)(R10*4)
MOVL R14, 4120(SP)(R11*4)
MOVQ R8, R9
MOVQ R8, R10
SHRQ $0x08, R10
MOVQ R10, R12
LEAL -2(CX), R8
LEAL -1(CX), SI
SHLQ $0x10, R9
Expand All @@ -9015,8 +9096,12 @@ match_nolit_dst_ok_encodeBetterBlockAsm8B:
SHLQ $0x20, R10
IMULQ DI, R10
SHRQ $0x38, R10
SHLQ $0x10, R12
IMULQ BP, R12
SHRQ $0x36, R12
MOVL R8, 24(SP)(R9*4)
MOVL SI, 4120(SP)(R10*4)
MOVL SI, 24(SP)(R12*4)
JMP search_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
Expand Down

0 comments on commit 6522991

Please sign in to comment.