Skip to content

Commit 970d4dc

Browse files
authored
zstd: Improve match speed of fastest level. (#241)
Inline matchlen code and slightly simplify it.

This loses a small amount of compression but gives a big speedup.

Before/after, best of 3 runs each:

```
file                         out   level  insize      outsize     millis  mb/s
enwik9                       zskp  1      1000000000  343831004   4202    226.91
enwik9                       zskp  1      1000000000  343848582   3682    258.97
github-june-2days-2019.json  zskp  1      6273951764  698824137   10787   554.67
github-june-2days-2019.json  zskp  1      6273951764  699045015   10474   571.23
github-ranks-backup.bin      zskp  1      1862623243  454018274   4833    367.54
github-ranks-backup.bin      zskp  1      1862623243  454072815   4568    388.82
rawstudio-mint14.tar         zskp  1      8558382592  3667295557  21060   387.55
rawstudio-mint14.tar         zskp  1      8558382592  3667489370  20207   403.90
nyc-taxi-data-10M.csv        zskp  1      3325605752  641244049   10954   289.53
nyc-taxi-data-10M.csv        zskp  1      3325605752  641339945   9668    328.01
gob-stream                   zskp  1      1911399616  234947276   3514    518.62
gob-stream                   zskp  1      1911399616  235022249   3354    543.36
```
1 parent 56999ed commit 970d4dc

File tree

2 files changed

+103
-13
lines changed

2 files changed

+103
-13
lines changed

zstd/enc_fast.go

Lines changed: 92 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package zstd
66

77
import (
88
"fmt"
9+
"math"
910
"math/bits"
1011

1112
"github.com/klauspost/compress/zstd/internal/xxhash"
@@ -173,9 +174,22 @@ encodeLoop:
173174
if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
174175
// Consider history as well.
175176
var seq seq
176-
lenght := 4 + e.matchlen(s+6, repIndex+4, src)
177+
var length int32
178+
// length = 4 + e.matchlen(s+6, repIndex+4, src)
179+
{
180+
a := src[s+6:]
181+
b := src[repIndex+4:]
182+
endI := len(a) & (math.MaxInt32 - 7)
183+
length = int32(endI) + 4
184+
for i := 0; i < endI; i += 8 {
185+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
186+
length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
187+
break
188+
}
189+
}
190+
}
177191

178-
seq.matchLen = uint32(lenght - zstdMinMatch)
192+
seq.matchLen = uint32(length - zstdMinMatch)
179193

180194
// We might be able to match backwards.
181195
// Extend as long as we can.
@@ -201,11 +215,11 @@ encodeLoop:
201215
println("repeat sequence", seq, "next s:", s)
202216
}
203217
blk.sequences = append(blk.sequences, seq)
204-
s += lenght + 2
218+
s += length + 2
205219
nextEmit = s
206220
if s >= sLimit {
207221
if debug {
208-
println("repeat ended", s, lenght)
222+
println("repeat ended", s, length)
209223

210224
}
211225
break encodeLoop
@@ -261,7 +275,20 @@ encodeLoop:
261275
}
262276

263277
// Extend the 4-byte match as long as possible.
264-
l := e.matchlen(s+4, t+4, src) + 4
278+
//l := e.matchlen(s+4, t+4, src) + 4
279+
var l int32
280+
{
281+
a := src[s+4:]
282+
b := src[t+4:]
283+
endI := len(a) & (math.MaxInt32 - 7)
284+
l = int32(endI) + 4
285+
for i := 0; i < endI; i += 8 {
286+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
287+
l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
288+
break
289+
}
290+
}
291+
}
265292

266293
// Extend backwards
267294
tMin := s - e.maxMatchOff
@@ -298,7 +325,20 @@ encodeLoop:
298325
if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
299326
// We have at least 4 byte match.
300327
// No need to check backwards. We come straight from a match
301-
l := 4 + e.matchlen(s+4, o2+4, src)
328+
//l := 4 + e.matchlen(s+4, o2+4, src)
329+
var l int32
330+
{
331+
a := src[s+4:]
332+
b := src[o2+4:]
333+
endI := len(a) & (math.MaxInt32 - 7)
334+
l = int32(endI) + 4
335+
for i := 0; i < endI; i += 8 {
336+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
337+
l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
338+
break
339+
}
340+
}
341+
}
302342

303343
// Store this, since we have it.
304344
nextHash := hash6(cv, hashLog)
@@ -416,10 +456,23 @@ encodeLoop:
416456
if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
417457
// Consider history as well.
418458
var seq seq
419-
// lenght := 4 + e.matchlen(s+6, repIndex+4, src)
420-
lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
459+
// length := 4 + e.matchlen(s+6, repIndex+4, src)
460+
// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
461+
var length int32
462+
{
463+
a := src[s+6:]
464+
b := src[repIndex+4:]
465+
endI := len(a) & (math.MaxInt32 - 7)
466+
length = int32(endI) + 4
467+
for i := 0; i < endI; i += 8 {
468+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
469+
length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
470+
break
471+
}
472+
}
473+
}
421474

422-
seq.matchLen = uint32(lenght - zstdMinMatch)
475+
seq.matchLen = uint32(length - zstdMinMatch)
423476

424477
// We might be able to match backwards.
425478
// Extend as long as we can.
@@ -445,11 +498,11 @@ encodeLoop:
445498
println("repeat sequence", seq, "next s:", s)
446499
}
447500
blk.sequences = append(blk.sequences, seq)
448-
s += lenght + 2
501+
s += length + 2
449502
nextEmit = s
450503
if s >= sLimit {
451504
if debug {
452-
println("repeat ended", s, lenght)
505+
println("repeat ended", s, length)
453506

454507
}
455508
break encodeLoop
@@ -502,7 +555,20 @@ encodeLoop:
502555

503556
// Extend the 4-byte match as long as possible.
504557
//l := e.matchlenNoHist(s+4, t+4, src) + 4
505-
l := int32(matchLen(src[s+4:], src[t+4:])) + 4
558+
// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
559+
var l int32
560+
{
561+
a := src[s+4:]
562+
b := src[t+4:]
563+
endI := len(a) & (math.MaxInt32 - 7)
564+
l = int32(endI) + 4
565+
for i := 0; i < endI; i += 8 {
566+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
567+
l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
568+
break
569+
}
570+
}
571+
}
506572

507573
// Extend backwards
508574
tMin := s - e.maxMatchOff
@@ -540,7 +606,20 @@ encodeLoop:
540606
// We have at least 4 byte match.
541607
// No need to check backwards. We come straight from a match
542608
//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
543-
l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
609+
// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
610+
var l int32
611+
{
612+
a := src[s+4:]
613+
b := src[o2+4:]
614+
endI := len(a) & (math.MaxInt32 - 7)
615+
l = int32(endI) + 4
616+
for i := 0; i < endI; i += 8 {
617+
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
618+
l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
619+
break
620+
}
621+
}
622+
}
544623

545624
// Store this, since we have it.
546625
nextHash := hash6(cv, hashLog)

zstd/zstd.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,17 @@ func printf(format string, a ...interface{}) {
8787
}
8888
}
8989

90+
// matchLenFast returns how many bytes of a and b match, comparing them one
// 8-byte word at a time.
// Only whole 8-byte words of a are examined: the trailing len(a)%8 bytes (up
// to 7) are never compared, so the result can be shorter than the true match.
// b is read at the same offsets as a.
// NOTE(review): presumably b must be at least as long as the compared part of
// a; load64 is defined elsewhere, so confirm its bounds behavior.
91+
func matchLenFast(a, b []byte) int {
92+
// Clear the low 3 bits to round len(a) down to a multiple of 8. The mask
// (0x7FFFFFF8) also clears every bit above bit 30.
endI := len(a) & (math.MaxInt32 - 7)
93+
for i := 0; i < endI; i += 8 {
94+
// A non-zero XOR means the two words differ somewhere.
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
95+
// >> binds tighter than +, so this is i + (TrailingZeros64(diff) >> 3):
// the word offset plus the count of zero low-order bytes in diff.
// NOTE(review): that equals the number of matching leading bytes only
// if load64 reads little-endian - confirm.
return i + bits.TrailingZeros64(diff)>>3
96+
}
97+
}
98+
// No differing word found in the compared range.
return endI
99+
}
100+
90101
// matchLen returns the maximum length.
91102
// a must be the shortest of the two.
92103
// The function also returns whether all bytes matched.

0 commit comments

Comments
 (0)