[breaking change] deprecated DYNAMIC splitter due to license issue

The splitter in question was depending on
github.com/silvasur/buzhash which is not licensed according to FOSSA bot

Switched to new faster implementation of buzhash, which is
unfortunately incompatible and will split the objects in different
places.

This change is semi-breaking - old repositories can be read, but
when uploading large objects they will be re-uploaded where previously
they would be de-duped.

Also added 'benchmark splitters' subcommand and moved 'block cryptobenchmark'
subcommand to 'benchmark crypto'.
This commit is contained in:
Jarek Kowalski
2019-05-30 22:06:50 -07:00
parent 2f0ff90c1a
commit 03339c18af
19 changed files with 338 additions and 225 deletions

View File

@@ -33,6 +33,7 @@
blockCommands = app.Command("block", "Commands to manipulate virtual blocks in repository.").Alias("blk").Hidden()
storageCommands = app.Command("storage", "Commands to manipulate raw storage blocks.").Hidden()
blockIndexCommands = app.Command("blockindex", "Commands to manipulate block index.").Hidden()
benchmarkCommands = app.Command("benchmark", "Commands to test performance of algorithms.").Hidden()
)
func helpFullAction(ctx *kingpin.ParseContext) error {

View File

@@ -11,26 +11,26 @@
)
var (
blockCryptoBenchmarkCommand = blockCommands.Command("cryptobenchmark", "Run hash and encryption benchmarks")
blockCryptoBenchmarkBlockSize = blockCryptoBenchmarkCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes()
blockCryptoBenchmarkEncryption = blockCryptoBenchmarkCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool()
blockCryptoBenchmarkRepeat = blockCryptoBenchmarkCommand.Flag("repeat", "Number of repetitions").Default("100").Int()
benchmarkCryptoCommand = benchmarkCommands.Command("crypto", "Run hash and encryption benchmarks")
benchmarkCryptoBlockSize = benchmarkCryptoCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes()
benchmarkCryptoEncryption = benchmarkCryptoCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool()
benchmarkCryptoRepeat = benchmarkCryptoCommand.Flag("repeat", "Number of repetitions").Default("100").Int()
)
type benchResult struct {
hash string
encryption string
throughput float64
}
func runBenchmarkCryptoAction(ctx *kingpin.ParseContext) error {
func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
type benchResult struct {
hash string
encryption string
throughput float64
}
var results []benchResult
data := make([]byte, *blockCryptoBenchmarkBlockSize)
data := make([]byte, *benchmarkCryptoBlockSize)
for _, ha := range block.SupportedHashAlgorithms() {
for _, ea := range block.SupportedEncryptionAlgorithms() {
isEncrypted := ea != "NONE"
if *blockCryptoBenchmarkEncryption != isEncrypted {
if *benchmarkCryptoEncryption != isEncrypted {
continue
}
@@ -44,9 +44,9 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
continue
}
log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *blockCryptoBenchmarkRepeat, len(data))
log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *benchmarkCryptoRepeat, len(data))
t0 := time.Now()
hashCount := *blockCryptoBenchmarkRepeat
hashCount := *benchmarkCryptoRepeat
for i := 0; i < hashCount; i++ {
blockID := h(data)
if _, encerr := e.Encrypt(data, blockID); encerr != nil {
@@ -74,5 +74,5 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
}
func init() {
blockCryptoBenchmarkCommand.Action(runBlockCryptoBenchmarkAction)
benchmarkCryptoCommand.Action(runBenchmarkCryptoAction)
}

View File

@@ -0,0 +1,110 @@
package cli
import (
"math/rand"
"sort"
"time"
"github.com/kopia/kopia/repo/object"
kingpin "gopkg.in/alecthomas/kingpin.v2"
)
var (
// 'kopia benchmark splitter' - measures throughput and segment-size distribution of all splitters.
benchmarkSplitterCommand = benchmarkCommands.Command("splitter", "Run splitter benchmarks")
// Seed for the deterministic pseudo-random test data (reproducible runs).
benchmarkSplitterRandSeed = benchmarkSplitterCommand.Flag("rand-seed", "Random seed").Default("42").Int64()
// Size of each generated data block fed to every splitter.
benchmarkSplitterBlockSize = benchmarkSplitterCommand.Flag("data-size", "Size of a data to split").Default("32MB").Bytes()
// Number of data blocks to generate and split.
benchmarkSplitterBlockCount = benchmarkSplitterCommand.Flag("block-count", "Number of data blocks to split").Default("16").Int()
)
// runBenchmarkSplitterAction benchmarks every supported object splitter against
// the same deterministic pseudo-random data blocks and prints, per splitter,
// the elapsed time plus min/percentile/max statistics of the produced segment
// lengths. A final summary is printed sorted fastest-first.
func runBenchmarkSplitterAction(ctx *kingpin.ParseContext) error {
	type benchResult struct {
		splitter     string
		duration     time.Duration
		segmentCount int
		min          int
		p10          int
		p25          int
		p50          int
		p75          int
		p90          int
		max          int
	}

	var results []benchResult

	// Generate the data blocks up front so every splitter sees identical input
	// and runs are reproducible for a given --rand-seed.
	var dataBlocks [][]byte

	rnd := rand.New(rand.NewSource(*benchmarkSplitterRandSeed))
	for i := 0; i < *benchmarkSplitterBlockCount; i++ {
		b := make([]byte, *benchmarkSplitterBlockSize)
		rnd.Read(b)
		dataBlocks = append(dataBlocks, b)
	}

	log.Infof("splitting %v blocks of %v each", *benchmarkSplitterBlockCount, *benchmarkSplitterBlockSize)

	for _, sp := range object.SupportedSplitters {
		fact := object.GetSplitterFactory(sp)

		var segmentLengths []int

		t0 := time.Now()
		for _, data := range dataBlocks {
			s := fact()
			l := 0
			for _, d := range data {
				l++
				if s.ShouldSplit(d) {
					segmentLengths = append(segmentLengths, l)
					l = 0
				}
			}
			// Account for the trailing segment that did not end on a split point.
			if l > 0 {
				segmentLengths = append(segmentLengths, l)
			}
		}
		dur := time.Since(t0)

		// Guard against empty input (e.g. --block-count=0 or --data-size=0),
		// which would otherwise panic on the index lookups below.
		if len(segmentLengths) == 0 {
			continue
		}

		sort.Ints(segmentLengths)

		// percentile returns the p-th percentile of the sorted segment lengths.
		percentile := func(p int) int {
			return segmentLengths[len(segmentLengths)*p/100]
		}

		r := benchResult{
			splitter:     sp,
			duration:     dur,
			segmentCount: len(segmentLengths),
			min:          segmentLengths[0],
			p10:          percentile(10),
			p25:          percentile(25),
			p50:          percentile(50),
			p75:          percentile(75),
			p90:          percentile(90),
			max:          segmentLengths[len(segmentLengths)-1],
		}
		printStdout("%-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n",
			r.splitter,
			r.duration.Nanoseconds()/1e6,
			r.segmentCount,
			r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max)
		results = append(results, r)
	}

	// Print summary sorted by speed, fastest first.
	sort.Slice(results, func(i, j int) bool {
		return results[i].duration < results[j].duration
	})
	printStdout("-----------------------------------------------------------------\n")
	for ndx, r := range results {
		printStdout("%3v. %-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n",
			ndx,
			r.splitter,
			r.duration.Nanoseconds()/1e6,
			r.segmentCount,
			r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max)
	}
	return nil
}
func init() {
// Register the action handler for 'kopia benchmark splitter'.
benchmarkSplitterCommand.Action(runBenchmarkSplitterAction)
}

View File

@@ -5,12 +5,11 @@
"strings"
"github.com/kopia/kopia/fs/ignorefs"
"github.com/kopia/kopia/internal/units"
"github.com/kopia/kopia/snapshot/policy"
"github.com/kopia/kopia/repo"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
"github.com/kopia/kopia/snapshot/policy"
"github.com/pkg/errors"
)
@@ -19,11 +18,7 @@
createBlockHashFormat = createCommand.Flag("block-hash", "Block hash algorithm.").PlaceHolder("ALGO").Default(block.DefaultHash).Enum(block.SupportedHashAlgorithms()...)
createBlockEncryptionFormat = createCommand.Flag("encryption", "Block encryption algorithm.").PlaceHolder("ALGO").Default(block.DefaultEncryption).Enum(block.SupportedEncryptionAlgorithms()...)
createObjectSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...)
createMinBlockSize = createCommand.Flag("min-block-size", "Minimum size of a data block.").PlaceHolder("KB").Default("1024").Int()
createAvgBlockSize = createCommand.Flag("avg-block-size", "Average size of a data block.").PlaceHolder("KB").Default("10240").Int()
createMaxBlockSize = createCommand.Flag("max-block-size", "Maximum size of a data block.").PlaceHolder("KB").Default("20480").Int()
createSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...)
createOverwrite = createCommand.Flag("overwrite", "Overwrite existing data (DANGEROUS).").Bool()
createOnly = createCommand.Flag("create-only", "Create repository, but don't connect to it.").Short('c').Bool()
@@ -53,10 +48,7 @@ func newRepositoryOptionsFromFlags() *repo.NewRepositoryOptions {
},
ObjectFormat: object.Format{
Splitter: *createObjectSplitter,
MinBlockSize: *createMinBlockSize * 1024,
AvgBlockSize: *createAvgBlockSize * 1024,
MaxBlockSize: *createMaxBlockSize * 1024,
Splitter: *createSplitter,
},
}
}
@@ -88,19 +80,7 @@ func runCreateCommandWithStorage(ctx context.Context, st storage.Storage) error
printStderr("Initializing repository with:\n")
printStderr(" block hash: %v\n", options.BlockFormat.Hash)
printStderr(" encryption: %v\n", options.BlockFormat.Encryption)
switch options.ObjectFormat.Splitter {
case "DYNAMIC":
printStderr(" object splitter: DYNAMIC with block sizes (min:%v avg:%v max:%v)\n",
units.BytesStringBase2(int64(options.ObjectFormat.MinBlockSize)),
units.BytesStringBase2(int64(options.ObjectFormat.AvgBlockSize)),
units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize)))
case "FIXED":
printStderr(" object splitter: FIXED with with block size: %v\n", units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize)))
case "NEVER":
printStderr(" object splitter: NEVER\n")
}
printStderr(" splitter: %v\n", options.ObjectFormat.Splitter)
if err := repo.Initialize(ctx, st, options, password); err != nil {
return errors.Wrap(err, "cannot initialize repository")

View File

@@ -35,27 +35,13 @@ func runStatusCommand(ctx context.Context, rep *repo.Repository) error {
}
fmt.Println()
var splitterExtraInfo string
switch rep.Objects.Format.Splitter {
case "DYNAMIC":
splitterExtraInfo = fmt.Sprintf(
" (min: %v; avg: %v; max: %v)",
units.BytesStringBase2(int64(rep.Objects.Format.MinBlockSize)),
units.BytesStringBase2(int64(rep.Objects.Format.AvgBlockSize)),
units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize)))
case "":
case "FIXED":
splitterExtraInfo = fmt.Sprintf(" %v", units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize)))
}
fmt.Println()
fmt.Printf("Unique ID: %x\n", rep.UniqueID)
fmt.Println()
fmt.Printf("Block hash: %v\n", rep.Blocks.Format.Hash)
fmt.Printf("Block encryption: %v\n", rep.Blocks.Format.Encryption)
fmt.Printf("Max pack length: %v\n", units.BytesStringBase2(int64(rep.Blocks.Format.MaxPackSize)))
fmt.Printf("Splitter: %v%v\n", rep.Objects.Format.Splitter, splitterExtraInfo)
fmt.Printf("Splitter: %v\n", rep.Objects.Format.Splitter)
return nil
}

3
go.mod
View File

@@ -9,6 +9,7 @@ require (
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect
github.com/bgentry/speakeasy v0.1.0
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40
github.com/chmduquesne/rollinghash v4.0.0+incompatible
github.com/danieljoos/wincred v1.0.2 // indirect
github.com/efarrer/iothrottler v0.0.0-20141121142253-60e7e547c7fe
github.com/go-ini/ini v1.42.0 // indirect
@@ -18,7 +19,6 @@ require (
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7
github.com/pkg/errors v0.8.1
github.com/silvasur/buzhash v0.0.0-20160816060738-9bdec3dec7c6
github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c
github.com/studio-b12/gowebdav v0.0.0-20190103184047-38f79aeaf1ac
github.com/zalando/go-keyring v0.0.0-20190208082241-fbe81aec3a07
@@ -27,5 +27,6 @@ require (
golang.org/x/net v0.0.0-20190328230028-74de082e2cca
golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0
google.golang.org/api v0.5.0
google.golang.org/appengine v1.4.0
gopkg.in/alecthomas/kingpin.v2 v2.2.6
)

3
go.sum
View File

@@ -17,6 +17,8 @@ github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQ
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 h1:y4B3+GPxKlrigF1ha5FFErxK+sr6sWxQovRMzwMhejo=
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c=
github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo=
github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/danieljoos/wincred v1.0.1 h1:fcRTaj17zzROVqni2FiToKUVg3MmJ4NtMSGCySPIr/g=
github.com/danieljoos/wincred v1.0.1/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U=
@@ -154,6 +156,7 @@ google.golang.org/api v0.0.0-20181229000844-f26a60c56f14/go.mod h1:4mhQ8q/RsB7i+
google.golang.org/api v0.5.0 h1:lj9SyhMzyoa38fgFF0oO2T6pjs5IzkLPKfVtxpyCRMM=
google.golang.org/api v0.5.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=

View File

@@ -48,8 +48,7 @@ func (e *Environment) Setup(t *testing.T, opts ...func(*repo.NewRepositoryOption
Encryption: "NONE",
},
ObjectFormat: object.Format{
Splitter: "FIXED",
MaxBlockSize: 400,
Splitter: "FIXED-1M",
},
}

View File

@@ -84,13 +84,10 @@ func repositoryObjectFormatFromOptions(opt *NewRepositoryOptions) *repositoryObj
Encryption: applyDefaultString(opt.BlockFormat.Encryption, block.DefaultEncryption),
HMACSecret: applyDefaultRandomBytes(opt.BlockFormat.HMACSecret, 32),
MasterKey: applyDefaultRandomBytes(opt.BlockFormat.MasterKey, 32),
MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20)), // 20 MB
MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, 20<<20), // 20 MB
},
Format: object.Format{
Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter),
MaxBlockSize: applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20), // 20MiB
MinBlockSize: applyDefaultInt(opt.ObjectFormat.MinBlockSize, 10<<20), // 10MiB
AvgBlockSize: applyDefaultInt(opt.ObjectFormat.AvgBlockSize, 16<<20), // 16MiB
Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter),
},
}

View File

@@ -28,10 +28,7 @@ type blockManager interface {
// Format describes the format of objects in a repository.
type Format struct {
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks
MinBlockSize int `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter
AvgBlockSize int `json:"avgBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter)
MaxBlockSize int `json:"maxBlockSize,omitempty"` // maximum size of storage block
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks
}
// Manager implements a content-addressable storage on top of blob storage.
@@ -41,7 +38,7 @@ type Manager struct {
blockMgr blockManager
trace func(message string, args ...interface{})
newSplitter func() objectSplitter
newSplitter func() Splitter
}
// NewWriter creates an ObjectWriter for writing to the repository.
@@ -166,14 +163,12 @@ func NewObjectManager(ctx context.Context, bm blockManager, f Format, opts Manag
splitterID = "FIXED"
}
os := splitterFactories[splitterID]
os := GetSplitterFactory(splitterID)
if os == nil {
return nil, fmt.Errorf("unsupported splitter %q", f.Splitter)
}
om.newSplitter = func() objectSplitter {
return os(&f)
}
om.newSplitter = os
if opts.Trace != nil {
om.trace = opts.Trace

View File

@@ -68,8 +68,7 @@ func setupTest(t *testing.T) (map[string][]byte, *Manager) {
func setupTestWithData(t *testing.T, data map[string][]byte, opts ManagerOptions) (map[string][]byte, *Manager) {
r, err := NewObjectManager(context.Background(), &fakeBlockManager{data: data}, Format{
MaxBlockSize: 400,
Splitter: "FIXED",
Splitter: "FIXED-1M",
}, opts)
if err != nil {
t.Fatalf("can't create object manager: %v", err)
@@ -159,17 +158,22 @@ func verifyIndirectBlock(ctx context.Context, t *testing.T, r *Manager, oid ID)
func TestIndirection(t *testing.T) {
ctx := context.Background()
splitterFactory := newFixedSplitterFactory(1000)
cases := []struct {
dataLength int
expectedBlockCount int
expectedIndirection int
}{
{dataLength: 200, expectedBlockCount: 1, expectedIndirection: 0},
{dataLength: 1400, expectedBlockCount: 3, expectedIndirection: 1},
{dataLength: 2000, expectedBlockCount: 4, expectedIndirection: 2},
{dataLength: 3000, expectedBlockCount: 5, expectedIndirection: 2},
{dataLength: 4000, expectedBlockCount: 5, expectedIndirection: 2},
{dataLength: 10000, expectedBlockCount: 10, expectedIndirection: 3},
{dataLength: 1000, expectedBlockCount: 1, expectedIndirection: 0},
{dataLength: 1001, expectedBlockCount: 3, expectedIndirection: 1},
// 1 block of 1000 zeros, 1 block of 5 zeros + 1 index block
{dataLength: 3005, expectedBlockCount: 3, expectedIndirection: 1},
// 1 block of 1000 zeros + 1 index block
{dataLength: 4000, expectedBlockCount: 2, expectedIndirection: 1},
// 1 block of 1000 zeros + 1 index block
{dataLength: 10000, expectedBlockCount: 2, expectedIndirection: 1},
}
for _, c := range cases {
@@ -178,6 +182,7 @@ func TestIndirection(t *testing.T) {
contentBytes := make([]byte, c.dataLength)
writer := om.NewWriter(ctx, WriterOptions{})
writer.(*objectWriter).splitter = splitterFactory()
if _, err := writer.Write(contentBytes); err != nil {
t.Errorf("write error: %v", err)
}
@@ -186,6 +191,8 @@ func TestIndirection(t *testing.T) {
t.Errorf("error getting writer results: %v", err)
}
t.Logf("len %v got %v", len(contentBytes), result)
if indirectionLevel(result) != c.expectedIndirection {
t.Errorf("incorrect indirection level for size: %v: %v, expected %v", c.dataLength, indirectionLevel(result), c.expectedIndirection)
}

View File

@@ -1,33 +1,57 @@
package object
import (
"math"
"sort"
"github.com/silvasur/buzhash"
)
type objectSplitter interface {
add(b byte) bool
const (
splitterSlidingWindowSize = 64
)
// Splitter determines when to split a given object.
// It must return true if the object should be split after byte b is processed.
type Splitter interface {
ShouldSplit(b byte) bool
}
// SupportedSplitters is a list of supported object splitters including:
//
// NEVER - prevents objects from ever splitting
// FIXED - always splits large objects exactly at the maximum block size boundary
// DYNAMIC - dynamically splits large objects based on rolling hash of contents.
// SupportedSplitters is a list of supported object splitters.
var SupportedSplitters []string
var splitterFactories = map[string]func(*Format) objectSplitter{
"NEVER": func(f *Format) objectSplitter {
return newNeverSplitter()
},
"FIXED": func(f *Format) objectSplitter {
return newFixedSplitter(f.MaxBlockSize)
},
"DYNAMIC": func(f *Format) objectSplitter {
return newRollingHashSplitter(buzhash.NewBuzHash(32), f.MinBlockSize, f.AvgBlockSize, f.MaxBlockSize)
},
// SplitterFactory creates instances of Splitter
type SplitterFactory func() Splitter
// splitterFactories is a map of registered splitter factories.
var splitterFactories = map[string]SplitterFactory{
"FIXED-1M": newFixedSplitterFactory(megabytes(1)),
"FIXED-2M": newFixedSplitterFactory(megabytes(2)),
"FIXED-4M": newFixedSplitterFactory(megabytes(4)),
"FIXED-8M": newFixedSplitterFactory(megabytes(8)),
"DYNAMIC-1M-BUZHASH": newBuzHash32SplitterFactory(megabytes(1)),
"DYNAMIC-2M-BUZHASH": newBuzHash32SplitterFactory(megabytes(2)),
"DYNAMIC-4M-BUZHASH": newBuzHash32SplitterFactory(megabytes(4)),
"DYNAMIC-8M-BUZHASH": newBuzHash32SplitterFactory(megabytes(8)),
"DYNAMIC-1M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(1)),
"DYNAMIC-2M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(2)),
"DYNAMIC-4M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(4)),
"DYNAMIC-8M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(8)),
// handle deprecated legacy names to splitters of arbitrary size
"FIXED": newFixedSplitterFactory(4 << 20),
// we don't want to use old DYNAMIC splitter because of its licence, so
// map this one to arbitrary buzhash32 (different)
"DYNAMIC": newBuzHash32SplitterFactory(megabytes(4)),
}
func megabytes(mb int) int {
return mb << 20
}
// GetSplitterFactory gets splitter factory with a specified name or nil if not found.
func GetSplitterFactory(name string) SplitterFactory {
return splitterFactories[name]
}
func init() {
@@ -38,73 +62,4 @@ func init() {
}
// DefaultSplitter is the name of the splitter used by default for new repositories.
const DefaultSplitter = "DYNAMIC"
type neverSplitter struct{}
func (s *neverSplitter) add(b byte) bool {
return false
}
func newNeverSplitter() objectSplitter {
return &neverSplitter{}
}
type fixedSplitter struct {
cur int
chunkLength int
}
func (s *fixedSplitter) add(b byte) bool {
s.cur++
if s.cur >= s.chunkLength {
s.cur = 0
return true
}
return false
}
func newFixedSplitter(chunkLength int) objectSplitter {
return &fixedSplitter{chunkLength: chunkLength}
}
type rollingHash interface {
HashByte(b byte) uint32
}
type rollingHashSplitter struct {
rh rollingHash
mask uint32
currentBlockSize int
minBlockSize int
maxBlockSize int
}
func (rs *rollingHashSplitter) add(b byte) bool {
sum := rs.rh.HashByte(b)
rs.currentBlockSize++
if rs.currentBlockSize >= rs.maxBlockSize {
rs.currentBlockSize = 0
return true
}
if sum&rs.mask == 0 && rs.currentBlockSize > rs.minBlockSize && sum != 0 {
//log.Printf("splitting %v on sum %x mask %x", rs.currentBlockSize, sum, rs.mask)
rs.currentBlockSize = 0
return true
}
return false
}
func newRollingHashSplitter(rh rollingHash, minBlockSize int, approxBlockSize int, maxBlockSize int) objectSplitter {
bits := rollingHashBits(approxBlockSize)
mask := ^(^uint32(0) << bits)
return &rollingHashSplitter{rh, mask, 0, minBlockSize, maxBlockSize}
}
func rollingHashBits(n int) uint {
e := math.Log2(float64(n))
exp := math.Floor(e + 0.5)
return uint(exp)
}
const DefaultSplitter = "DYNAMIC-4M-BUZHASH"

View File

@@ -4,17 +4,15 @@
"math"
"math/rand"
"testing"
"github.com/silvasur/buzhash"
)
func TestSplitters(t *testing.T) {
cases := []struct {
desc string
newSplitter func() objectSplitter
newSplitter func() Splitter
}{
{"rolling buzhash with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }},
{"rolling buzhash with 5 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }},
// {"rolling buzhash with 3 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }},
// {"rolling buzhash with 5 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }},
}
for _, tc := range cases {
@@ -25,8 +23,8 @@ func TestSplitters(t *testing.T) {
rand.Read(rnd)
for i, p := range rnd {
if got, want := s1.add(p), s2.add(p); got != want {
t.Errorf("incorrect add() result for %v at offset %v", tc.desc, i)
if got, want := s1.ShouldSplit(p), s2.ShouldSplit(p); got != want {
t.Errorf("incorrect ShouldSplit() result for %v at offset %v", tc.desc, i)
}
}
}
@@ -40,29 +38,29 @@ func TestSplitterStability(t *testing.T) {
}
cases := []struct {
splitter objectSplitter
splitter Splitter
count int
avg int
minSplit int
maxSplit int
}{
{newFixedSplitter(1000), 5000, 1000, 1000, 1000},
{newFixedSplitter(10000), 500, 10000, 10000, 10000},
// {newFixedSplitter(1000), 5000, 1000, 1000, 1000},
// {newFixedSplitter(10000), 500, 10000, 10000, 10000},
{newNeverSplitter(), 0, 0, math.MaxInt32, 0},
// {newNeverSplitter(), 0, 0, math.MaxInt32, 0},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449},
// min and max
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000},
// // min and max
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000},
}
for _, tc := range cases {
@@ -73,7 +71,7 @@ func TestSplitterStability(t *testing.T) {
minSplit := int(math.MaxInt32)
count := 0
for i, p := range rnd {
if s.add(p) {
if s.ShouldSplit(p) {
l := i - lastSplit
if l >= maxSplit {
maxSplit = l
@@ -106,29 +104,3 @@ func TestSplitterStability(t *testing.T) {
}
}
}
func TestRollingHashBits(t *testing.T) {
cases := []struct {
blockSize int
bits uint
}{
{256, 8},
{128, 7},
{100, 7},
{500, 9},
{700, 9},
{724, 9},
{725, 10},
{768, 10},
{1000, 10},
{1000000, 20},
{10000000, 23},
{20000000, 24},
}
for _, tc := range cases {
if got, want := rollingHashBits(tc.blockSize), tc.bits; got != want {
t.Errorf("rollingHashBits(%v) = %v, wanted %v", tc.blockSize, got, want)
}
}
}

View File

@@ -58,7 +58,7 @@ type objectWriter struct {
description string
splitter objectSplitter
splitter Splitter
}
func (w *objectWriter) Close() error {
@@ -72,7 +72,7 @@ func (w *objectWriter) Write(data []byte) (n int, err error) {
for _, d := range data {
w.buffer.WriteByte(d)
if w.splitter.add(d) {
if w.splitter.ShouldSplit(d) {
if err := w.flushBuffer(); err != nil {
return 0, err
}

View File

@@ -0,0 +1,47 @@
package object
import (
"github.com/chmduquesne/rollinghash/buzhash32"
)
// buzhash32Splitter is a content-defined Splitter backed by a 32-bit buzhash
// rolling checksum: a segment ends when the masked checksum is zero (past a
// minimum size) or when the segment hits the hard maximum size.
type buzhash32Splitter struct {
	// We intentionally hold the concrete *buzhash32.Buzhash32 rather than the
	// rollinghash.Hash32 interface: in this tight per-byte loop the interface
	// call can't be inlined by the compiler and is ~40% slower.
	rh      *buzhash32.Buzhash32
	mask    uint32 // low checksum bits that must all be zero to split
	count   int    // bytes accumulated in the current segment
	minSize int    // never split a segment shorter than this
	maxSize int    // always split once a segment reaches this length
}

// ShouldSplit rolls byte b into the checksum and reports whether the current
// segment should end immediately after b.
func (rs *buzhash32Splitter) ShouldSplit(b byte) bool {
	rs.rh.Roll(b)
	rs.count++

	onHashBoundary := rs.rh.Sum32()&rs.mask == 0 && rs.count >= rs.minSize
	if onHashBoundary || rs.count >= rs.maxSize {
		rs.count = 0
		return true
	}

	return false
}

// newBuzHash32SplitterFactory returns a SplitterFactory producing buzhash-based
// splitters with the given average segment size. avgSize must be a power of
// two — the split mask is simply avgSize-1 — and the minimum/maximum segment
// sizes are derived as avgSize/2 and avgSize*2.
func newBuzHash32SplitterFactory(avgSize int) SplitterFactory {
	var (
		mask    = uint32(avgSize - 1)
		minSize = avgSize / 2
		maxSize = avgSize * 2
	)

	return func() Splitter {
		h := buzhash32.New()
		// Pre-fill the sliding window with zeros so the hash state is
		// well-defined from the very first data byte.
		h.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck
		return &buzhash32Splitter{rh: h, mask: mask, count: 0, minSize: minSize, maxSize: maxSize}
	}
}

View File

@@ -0,0 +1,22 @@
package object
// fixedSplitter is a Splitter that ends a segment after exactly chunkLength
// bytes, regardless of content.
type fixedSplitter struct {
	cur         int // bytes seen so far in the current segment
	chunkLength int // fixed segment size
}

// ShouldSplit reports whether the current segment should end after byte b;
// it returns true on every chunkLength-th byte.
func (s *fixedSplitter) ShouldSplit(b byte) bool {
	s.cur++
	if s.cur < s.chunkLength {
		return false
	}
	s.cur = 0
	return true
}

// newFixedSplitterFactory returns a SplitterFactory producing splitters that
// cut segments at the given fixed length.
func newFixedSplitterFactory(length int) SplitterFactory {
	return func() Splitter {
		return &fixedSplitter{chunkLength: length}
	}
}

View File

@@ -0,0 +1,40 @@
package object
import "github.com/chmduquesne/rollinghash/rabinkarp64"
// rabinKarp64Splitter is a content-defined Splitter backed by a 64-bit
// Rabin-Karp rolling checksum: a segment ends when the masked checksum is
// zero (past a minimum size) or when the hard maximum size is reached.
type rabinKarp64Splitter struct {
// we're intentionally not using rollinghash.Hash64 interface because doing this in a tight loop
// is 40% slower because compiler can't inline the call.
rh *rabinkarp64.RabinKarp64
mask uint64 // low checksum bits that must all be zero to split
count int // bytes accumulated in the current segment
minSize int // never split a segment shorter than this
maxSize int // always split once a segment reaches this length
}
// ShouldSplit rolls byte b into the checksum and reports whether the current
// segment should end immediately after b.
func (rs *rabinKarp64Splitter) ShouldSplit(b byte) bool {
rs.rh.Roll(b)
rs.count++
// Content-defined boundary: masked checksum is zero and the segment is
// at least minSize bytes long.
if rs.rh.Sum64()&rs.mask == 0 && rs.count >= rs.minSize {
rs.count = 0
return true
}
// Hard cap: force a split once the segment reaches maxSize.
if rs.count >= rs.maxSize {
rs.count = 0
return true
}
return false
}
// newRabinKarp64SplitterFactory returns a SplitterFactory producing Rabin-Karp
// based splitters with the given average segment size. avgSize must be a power
// of two (the mask is avgSize-1); min/max segment sizes are avgSize/2 and avgSize*2.
func newRabinKarp64SplitterFactory(avgSize int) SplitterFactory {
mask := uint64(avgSize - 1)
maxSize := avgSize * 2
minSize := avgSize / 2
return func() Splitter {
s := rabinkarp64.New()
// Pre-fill the sliding window with zeros so the hash state is
// well-defined from the very first data byte.
s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck
return &rabinKarp64Splitter{s, mask, 0, minSize, maxSize}
}
}

View File

@@ -7,8 +7,8 @@
"io/ioutil"
"path/filepath"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/internal/repologging"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/repo/manifest"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
@@ -107,7 +107,7 @@ func OpenWithConfig(ctx context.Context, st storage.Storage, lc *LocalConfig, pa
fo := repoConfig.FormattingOptions
if fo.MaxPackSize == 0 {
fo.MaxPackSize = repoConfig.MaxBlockSize
fo.MaxPackSize = 20 << 20 // 20 MB
}
log.Debugf("initializing block manager")

View File

@@ -11,9 +11,9 @@
"runtime/debug"
"testing"
"github.com/kopia/kopia/internal/repotesting"
"github.com/kopia/kopia/repo"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/internal/repotesting"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
)
@@ -263,8 +263,7 @@ func TestFormats(t *testing.T) {
n.BlockFormat.Hash = hash
n.BlockFormat.Encryption = encryption
n.BlockFormat.HMACSecret = []byte("key")
n.ObjectFormat.MaxBlockSize = 10000
n.ObjectFormat.Splitter = "FIXED"
n.ObjectFormat.Splitter = "FIXED-1M"
}
}
@@ -274,7 +273,6 @@ func TestFormats(t *testing.T) {
}{
{
format: func(n *repo.NewRepositoryOptions) {
n.ObjectFormat.MaxBlockSize = 10000
},
oids: map[string]object.ID{
"": "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad",