klauspost
diff --git a/s2/encode.go b/s2/encode.go
Lines changed: 44 additions & 0 deletions
diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go
Lines changed: 1 addition & 47 deletions
diff --git a/s2/encode_go.go b/s2/encode_go.go
Lines changed: 245 additions & 20 deletions
@@ -100,6 +100,50 @@ func EncodeBetter(dst, src []byte) []byte {
 	return dst[:d]
 }
 
+// EncodeSnappy compresses src into a single block and returns it. When dst
+// has enough capacity for the worst-case encoded size the result is a
+// sub-slice of dst; otherwise a freshly allocated slice is returned.
+//
+// The output is Snappy compatible and will likely decompress faster.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+// EncodeSnappy panics with ErrTooLarge if src is too large to be encoded.
+//
+// Decoding a block requires the same amount of memory as encoding it, and
+// blocks do not allow for concurrent decoding.
+// Also note that blocks do not contain CRC information, so corruption may go
+// undetected.
+//
+// If you need to encode larger amounts of data, consider using
+// the streaming interface which gives all of these features.
+func EncodeSnappy(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	switch {
+	case maxLen < 0:
+		panic(ErrTooLarge)
+	case cap(dst) < maxLen:
+		dst = make([]byte, maxLen)
+	default:
+		dst = dst[:maxLen]
+	}
+
+	// Every block begins with the decompressed length as a uvarint.
+	pos := binary.PutUvarint(dst, uint64(len(src)))
+
+	switch {
+	case len(src) == 0:
+		// Nothing but the header to write.
+		return dst[:pos]
+	case len(src) >= minNonLiteralBlockSize:
+		// Large enough to be worth searching for matches.
+		if written := encodeBlockSnappy(dst[pos:], src); written > 0 {
+			return dst[:pos+written]
+		}
+	}
+
+	// Input is either too short or not compressible: store it as one literal.
+	pos += emitLiteral(dst[pos:], src)
+	return dst[:pos]
+}
+
 // ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination.
 // If the destination is nil or too small, a new will be allocated.
 // The blocks are not validated, so garbage in = garbage out.
 
@@ -4,8 +4,6 @@
 
 package s2
 
-import "encoding/binary"
-
 func init() {
 	avxAvailable = cpu.avx()
 }
@@ -57,51 +55,7 @@ func encodeBlock(dst, src []byte) (d int) {
 	return encodeBlockAsm8B(dst, src)
 }
 
-// EncodeSnappy returns the encoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire encoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The output is Snappy compatible and will likely decompress faster.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-//
-// The blocks will require the same amount of memory to decode as encoding,
-// and does not make for concurrent decoding.
-// Also note that blocks do not contain CRC information, so corruption may be undetected.
-//
-// If you need to encode larger amounts of data, consider using
-// the streaming interface which gives all of these features.
-func EncodeSnappy(dst, src []byte) []byte {
-	if n := MaxEncodedLen(len(src)); n < 0 {
-		panic(ErrTooLarge)
-	} else if cap(dst) < n {
-		dst = make([]byte, n)
-	} else {
-		dst = dst[:n]
-	}
-
-	// The block starts with the varint-encoded length of the decompressed bytes.
-	d := binary.PutUvarint(dst, uint64(len(src)))
-
-	if len(src) == 0 {
-		return dst[:d]
-	}
-	if len(src) < minNonLiteralBlockSize {
-		d += emitLiteral(dst[d:], src)
-		return dst[:d]
-	}
-
-	n := encodeBlockSnappy(dst[d:], src)
-	if n > 0 {
-		d += n
-		return dst[:d]
-	}
-	// Not compressible
-	d += emitLiteral(dst[d:], src)
-	return dst[:d]
-}
-
-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
 // assumes that the varint-encoded length of the decompressed bytes has already
 // been written.
 //
 
@@ -3,29 +3,10 @@
 package s2
 
 import (
+	"bytes"
 	"math/bits"
-
-	"github.com/klauspost/compress/snappy"
 )
 
-// EncodeSnappy returns the encoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire encoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The output is Snappy compatible and will likely decompress faster.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-//
-// The blocks will require the same amount of memory to decode as encoding,
-// and does not make for concurrent decoding.
-// Also note that blocks do not contain CRC information, so corruption may be undetected.
-//
-// If you need to encode larger amounts of data, consider using
-// the streaming interface which gives all of these features.
-func EncodeSnappy(dst, src []byte) []byte {
-	return snappy.Encode(dst, src)
-}
-
 // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
 // assumes that the varint-encoded length of the decompressed bytes has already
 // been written.
@@ -188,6 +169,65 @@ func emitCopy(dst []byte, offset, length int) int {
 	return 2
 }
 
+// emitCopyNoRepeat encodes a back-reference of length bytes at distance offset
+// using only plain copy tags (never a repeat code) and returns the number of
+// bytes written to dst. A match that is too long for a single tag is split
+// into several consecutive copies that all use the same offset.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
+func emitCopyNoRepeat(dst []byte, offset, length int) int {
+	n := 0
+
+	if offset >= 65536 {
+		// The offset needs 4 bytes: each chunk is a tag byte followed by the
+		// little-endian offset, 5 bytes in total.
+		for length > 64 {
+			// Full chunk covering 64 bytes of the match.
+			dst[n+4] = uint8(offset >> 24)
+			dst[n+3] = uint8(offset >> 16)
+			dst[n+2] = uint8(offset >> 8)
+			dst[n+1] = uint8(offset)
+			dst[n+0] = 63<<2 | tagCopy4
+			n += 5
+			length -= 64
+		}
+		if length == 0 {
+			return n
+		}
+		// Final chunk carrying whatever is left of the match.
+		dst[n+0] = uint8(length-1)<<2 | tagCopy4
+		dst[n+1] = uint8(offset)
+		dst[n+2] = uint8(offset >> 8)
+		dst[n+3] = uint8(offset >> 16)
+		dst[n+4] = uint8(offset >> 24)
+		return n + 5
+	}
+
+	// The offset fits in 2 bytes.
+	for length > 64 {
+		// Emit a 60 byte copy as 3 bytes. Taking 60 rather than 64 leaves at
+		// least 5 bytes for the chunk that follows.
+		dst[n+2] = uint8(offset >> 8)
+		dst[n+1] = uint8(offset)
+		dst[n+0] = 59<<2 | tagCopy2
+		n += 3
+		length -= 60
+	}
+
+	if length >= 12 || offset >= 2048 {
+		// 3 byte form: tag with length-1, then the 16 bit offset.
+		dst[n+2] = uint8(offset >> 8)
+		dst[n+1] = uint8(offset)
+		dst[n+0] = uint8(length-1)<<2 | tagCopy2
+		return n + 3
+	}
+
+	// 2 byte form: the upper offset bits and length-4 share the tag byte.
+	dst[n+1] = uint8(offset)
+	dst[n+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	return n + 2
+}
+
 // matchLen returns how many bytes match in a and b
 //
 // It assumes that:
@@ -223,3 +263,188 @@ func matchLen(a []byte, b []byte) int {
 	}
 	return len(a) + checked
 }
+
+// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst,
+// using only literals and plain copies (via emitCopyNoRepeat) so that no
+// repeat codes end up in the output. It assumes that the varint-encoded
+// length of the decompressed bytes has already been written.
+//
+// It returns the number of bytes written to dst, or 0 when the encoded form
+// would exceed the internal size limit (dstLimit). In that case the contents
+// of dst are unspecified and the caller must store src some other way.
+func encodeBlockSnappy(dst, src []byte) (d int) {
+	// Initialize the hash table.
+	const (
+		tableBits    = 14
+		maxTableSize = 1 << tableBits
+	)
+
+	var table [maxTableSize]uint32
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// repeat is the offset of the most recent match. It starts at 1; the
+	// backwards extension below never consumes src[0], so the output still
+	// begins with a literal.
+	repeat := 1
+
+	for {
+		candidate := 0
+		for {
+			// Next src position to check. The step grows with the distance
+			// since the last emit, so poorly matching input is skipped faster.
+			nextS := s + (s-nextEmit)>>6 + 4
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hash0 := hash6(cv, tableBits)
+			hash1 := hash6(cv>>8, tableBits)
+			candidate = int(table[hash0])
+			candidate2 := int(table[hash1])
+			table[hash0] = uint32(s)
+			table[hash1] = uint32(s + 1)
+			hash2 := hash6(cv>>16, tableBits)
+
+			// Check whether the previous offset matches again at s+checkRep.
+			const checkRep = 1
+			if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+				base := s + checkRep
+				// Extend back
+				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+
+				// Bail if we exceed the maximum size. Without this check the
+				// repeat path could keep emitting literals and copies past
+				// dstLimit, and eventually past the end of dst.
+				if d+(base-nextEmit) > dstLimit {
+					return 0
+				}
+
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+
+				// Extend forward. This candidate shadows the outer one on
+				// purpose; it only tracks the repeat match.
+				candidate := s - repeat + 4 + checkRep
+				s += 4 + checkRep
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+
+				d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+				nextEmit = s
+				if s >= sLimit {
+					goto emitRemainder
+				}
+
+				cv = load64(src, s)
+				continue
+			}
+
+			if uint32(cv) == load32(src, candidate) {
+				break
+			}
+			candidate = int(table[hash2])
+			if uint32(cv>>8) == load32(src, candidate2) {
+				table[hash2] = uint32(s + 2)
+				candidate = candidate2
+				s++
+				break
+			}
+			table[hash2] = uint32(s + 2)
+			if uint32(cv>>16) == load32(src, candidate) {
+				s += 2
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards
+		for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+			candidate--
+			s--
+		}
+
+		// Bail if we exceed the maximum size.
+		if d+(s-nextEmit) > dstLimit {
+			return 0
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopyNoRepeat, and then see if another copy could be our
+		// next move. Repeat until we find no match for the input immediately
+		// after what was consumed by the last emitCopyNoRepeat call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+			repeat = base - candidate
+
+			// Extend the 4-byte match as long as possible.
+			s += 4
+			candidate += 4
+			for s <= len(src)-8 {
+				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+					s += bits.TrailingZeros64(diff) >> 3
+					break
+				}
+				s += 8
+				candidate += 8
+			}
+
+			d += emitCopyNoRepeat(dst[d:], repeat, s-base)
+			if false {
+				// Debug aid, disabled by default: validate the match.
+				a := src[base:s]
+				b := src[base-repeat : base-repeat+(s-base)]
+				if !bytes.Equal(a, b) {
+					panic("mismatch")
+				}
+			}
+
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			if d > dstLimit {
+				// Do we have space for more, if not bail.
+				return 0
+			}
+			// Check for an immediate match, otherwise start search at s+1.
+			// Positions s-2 and s are added to the table on the way.
+			x := load64(src, s-2)
+			m2Hash := hash6(x, tableBits)
+			currHash := hash6(x>>16, tableBits)
+			candidate = int(table[currHash])
+			table[m2Hash] = uint32(s - 2)
+			table[currHash] = uint32(s)
+			if uint32(x>>16) != load32(src, candidate) {
+				cv = load64(src, s+1)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}