diff --git a/cli/app.go b/cli/app.go index e82ecc4aa..2b08cfaf2 100644 --- a/cli/app.go +++ b/cli/app.go @@ -33,6 +33,7 @@ blockCommands = app.Command("block", "Commands to manipulate virtual blocks in repository.").Alias("blk").Hidden() storageCommands = app.Command("storage", "Commands to manipulate raw storage blocks.").Hidden() blockIndexCommands = app.Command("blockindex", "Commands to manipulate block index.").Hidden() + benchmarkCommands = app.Command("benchmark", "Commands to test performance of algorithms.").Hidden() ) func helpFullAction(ctx *kingpin.ParseContext) error { diff --git a/cli/command_block_crypto_benchmark.go b/cli/command_benchmark_crypto.go similarity index 61% rename from cli/command_block_crypto_benchmark.go rename to cli/command_benchmark_crypto.go index b6b85b8c9..e7089b6a0 100644 --- a/cli/command_block_crypto_benchmark.go +++ b/cli/command_benchmark_crypto.go @@ -11,26 +11,26 @@ ) var ( - blockCryptoBenchmarkCommand = blockCommands.Command("cryptobenchmark", "Run hash and encryption benchmarks") - blockCryptoBenchmarkBlockSize = blockCryptoBenchmarkCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes() - blockCryptoBenchmarkEncryption = blockCryptoBenchmarkCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool() - blockCryptoBenchmarkRepeat = blockCryptoBenchmarkCommand.Flag("repeat", "Number of repetitions").Default("100").Int() + benchmarkCryptoCommand = benchmarkCommands.Command("crypto", "Run hash and encryption benchmarks") + benchmarkCryptoBlockSize = benchmarkCryptoCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes() + benchmarkCryptoEncryption = benchmarkCryptoCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool() + benchmarkCryptoRepeat = benchmarkCryptoCommand.Flag("repeat", "Number of repetitions").Default("100").Int() ) -type benchResult struct { - hash string - encryption string - throughput float64 -} +func runBenchmarkCryptoAction(ctx *kingpin.ParseContext) error { -func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { + type benchResult struct { + hash string + encryption string + throughput float64 + } var results []benchResult - data := make([]byte, *blockCryptoBenchmarkBlockSize) + data := make([]byte, *benchmarkCryptoBlockSize) for _, ha := range block.SupportedHashAlgorithms() { for _, ea := range block.SupportedEncryptionAlgorithms() { isEncrypted := ea != "NONE" - if *blockCryptoBenchmarkEncryption != isEncrypted { + if *benchmarkCryptoEncryption != isEncrypted { continue } @@ -44,9 +44,9 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { continue } - log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *blockCryptoBenchmarkRepeat, len(data)) + log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *benchmarkCryptoRepeat, len(data)) t0 := time.Now() - hashCount := *blockCryptoBenchmarkRepeat + hashCount := *benchmarkCryptoRepeat for i := 0; i < hashCount; i++ { blockID := h(data) if _, encerr := e.Encrypt(data, blockID); encerr != nil { @@ -74,5 +74,5 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { } func init() { - blockCryptoBenchmarkCommand.Action(runBlockCryptoBenchmarkAction) + benchmarkCryptoCommand.Action(runBenchmarkCryptoAction) } diff --git a/cli/command_benchmark_splitters.go b/cli/command_benchmark_splitters.go new file mode 100644 index 000000000..5fe66407a --- /dev/null +++ b/cli/command_benchmark_splitters.go @@ -0,0 +1,110 @@ +package cli + +import ( + "math/rand" + "sort" + "time" + + "github.com/kopia/kopia/repo/object" + + kingpin "gopkg.in/alecthomas/kingpin.v2" +) + +var ( + benchmarkSplitterCommand = benchmarkCommands.Command("splitter", "Run splitter benchmarks") + benchmarkSplitterRandSeed = benchmarkSplitterCommand.Flag("rand-seed", "Random seed").Default("42").Int64() + benchmarkSplitterBlockSize = benchmarkSplitterCommand.Flag("data-size", "Size of a data to split").Default("32MB").Bytes() + benchmarkSplitterBlockCount = benchmarkSplitterCommand.Flag("block-count", "Number of data blocks to split").Default("16").Int() +) + +func runBenchmarkSplitterAction(ctx *kingpin.ParseContext) error { + type benchResult struct { + splitter string + duration time.Duration + segmentCount int + min int + p10 int + p25 int + p50 int + p75 int + p90 int + max int + } + + var results []benchResult + + // generate data blocks + var dataBlocks [][]byte + rnd := rand.New(rand.NewSource(*benchmarkSplitterRandSeed)) + + for i := 0; i < *benchmarkSplitterBlockCount; i++ { + b := make([]byte, *benchmarkSplitterBlockSize) + rnd.Read(b) + dataBlocks = append(dataBlocks, b) + } + + log.Infof("splitting %v blocks of %v each", *benchmarkSplitterBlockCount, *benchmarkSplitterBlockSize) + + for _, sp := range object.SupportedSplitters { + fact := object.GetSplitterFactory(sp) + var segmentLengths []int + + t0 := time.Now() + for _, data := range dataBlocks { + s := fact() + l := 0 + for _, d := range data { + l++ + if s.ShouldSplit(d) { + segmentLengths = append(segmentLengths, l) + l = 0 + } + } + if l > 0 { + segmentLengths = append(segmentLengths, l) + } + } + dur := time.Since(t0) + sort.Ints(segmentLengths) + + r := benchResult{ + sp, + dur, + len(segmentLengths), + segmentLengths[0], + segmentLengths[len(segmentLengths)*10/100], + segmentLengths[len(segmentLengths)*25/100], + segmentLengths[len(segmentLengths)*50/100], + segmentLengths[len(segmentLengths)*75/100], + segmentLengths[len(segmentLengths)*90/100], + segmentLengths[len(segmentLengths)-1], + } + + printStdout("%-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n", + r.splitter, + r.duration.Nanoseconds()/1e6, + r.segmentCount, + r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max) + + results = append(results, r) + } + + sort.Slice(results, func(i, j int) bool { + return results[i].duration < results[j].duration + }) + printStdout("-----------------------------------------------------------------\n") + for ndx, r := range results { + printStdout("%3v. %-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n", + ndx, + r.splitter, + r.duration.Nanoseconds()/1e6, + r.segmentCount, + r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max) + + } + return nil +} + +func init() { + benchmarkSplitterCommand.Action(runBenchmarkSplitterAction) +} diff --git a/cli/command_repository_create.go b/cli/command_repository_create.go index fdee07b88..58e76899c 100644 --- a/cli/command_repository_create.go +++ b/cli/command_repository_create.go @@ -5,12 +5,11 @@ "strings" "github.com/kopia/kopia/fs/ignorefs" - "github.com/kopia/kopia/internal/units" - "github.com/kopia/kopia/snapshot/policy" "github.com/kopia/kopia/repo" "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" + "github.com/kopia/kopia/snapshot/policy" "github.com/pkg/errors" ) @@ -19,11 +18,7 @@ createBlockHashFormat = createCommand.Flag("block-hash", "Block hash algorithm.").PlaceHolder("ALGO").Default(block.DefaultHash).Enum(block.SupportedHashAlgorithms()...) createBlockEncryptionFormat = createCommand.Flag("encryption", "Block encryption algorithm.").PlaceHolder("ALGO").Default(block.DefaultEncryption).Enum(block.SupportedEncryptionAlgorithms()...) - createObjectSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...) - - createMinBlockSize = createCommand.Flag("min-block-size", "Minimum size of a data block.").PlaceHolder("KB").Default("1024").Int() - createAvgBlockSize = createCommand.Flag("avg-block-size", "Average size of a data block.").PlaceHolder("KB").Default("10240").Int() - createMaxBlockSize = createCommand.Flag("max-block-size", "Maximum size of a data block.").PlaceHolder("KB").Default("20480").Int() + createSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...) createOverwrite = createCommand.Flag("overwrite", "Overwrite existing data (DANGEROUS).").Bool() createOnly = createCommand.Flag("create-only", "Create repository, but don't connect to it.").Short('c').Bool() @@ -53,10 +48,7 @@ func newRepositoryOptionsFromFlags() *repo.NewRepositoryOptions { }, ObjectFormat: object.Format{ - Splitter: *createObjectSplitter, - MinBlockSize: *createMinBlockSize * 1024, - AvgBlockSize: *createAvgBlockSize * 1024, - MaxBlockSize: *createMaxBlockSize * 1024, + Splitter: *createSplitter, }, } } @@ -88,19 +80,7 @@ func runCreateCommandWithStorage(ctx context.Context, st storage.Storage) error printStderr("Initializing repository with:\n") printStderr(" block hash: %v\n", options.BlockFormat.Hash) printStderr(" encryption: %v\n", options.BlockFormat.Encryption) - switch options.ObjectFormat.Splitter { - case "DYNAMIC": - printStderr(" object splitter: DYNAMIC with block sizes (min:%v avg:%v max:%v)\n", - units.BytesStringBase2(int64(options.ObjectFormat.MinBlockSize)), - units.BytesStringBase2(int64(options.ObjectFormat.AvgBlockSize)), - units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize))) - - case "FIXED": - printStderr(" object splitter: FIXED with with block size: %v\n", units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize))) - - case "NEVER": - printStderr(" object splitter: NEVER\n") - } + printStderr(" splitter: %v\n", options.ObjectFormat.Splitter) if err := repo.Initialize(ctx, st, options, password); err != nil { return errors.Wrap(err, "cannot initialize repository") diff --git a/cli/command_repository_status.go b/cli/command_repository_status.go index db86992b5..e17b4d5cf 100644 --- a/cli/command_repository_status.go +++ b/cli/command_repository_status.go @@ -35,27 +35,13 @@ func runStatusCommand(ctx context.Context, rep *repo.Repository) error { } fmt.Println() - var splitterExtraInfo string - - switch rep.Objects.Format.Splitter { - case "DYNAMIC": - splitterExtraInfo = fmt.Sprintf( - " (min: %v; avg: %v; max: %v)", - units.BytesStringBase2(int64(rep.Objects.Format.MinBlockSize)), - units.BytesStringBase2(int64(rep.Objects.Format.AvgBlockSize)), - units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize))) - case "": - case "FIXED": - splitterExtraInfo = fmt.Sprintf(" %v", units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize))) - } - fmt.Println() fmt.Printf("Unique ID: %x\n", rep.UniqueID) fmt.Println() fmt.Printf("Block hash: %v\n", rep.Blocks.Format.Hash) fmt.Printf("Block encryption: %v\n", rep.Blocks.Format.Encryption) fmt.Printf("Max pack length: %v\n", units.BytesStringBase2(int64(rep.Blocks.Format.MaxPackSize))) - fmt.Printf("Splitter: %v%v\n", rep.Objects.Format.Splitter, splitterExtraInfo) + fmt.Printf("Splitter: %v\n", rep.Objects.Format.Splitter) return nil } diff --git a/go.mod b/go.mod index d1be6a715..0db470837 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect github.com/bgentry/speakeasy v0.1.0 github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 + github.com/chmduquesne/rollinghash v4.0.0+incompatible github.com/danieljoos/wincred v1.0.2 // indirect github.com/efarrer/iothrottler v0.0.0-20141121142253-60e7e547c7fe github.com/go-ini/ini v1.42.0 // indirect @@ -18,7 +19,6 @@ require ( github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/op/go-logging v0.0.0-20160315200505-970db520ece7 github.com/pkg/errors v0.8.1 - github.com/silvasur/buzhash v0.0.0-20160816060738-9bdec3dec7c6 github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c github.com/studio-b12/gowebdav v0.0.0-20190103184047-38f79aeaf1ac github.com/zalando/go-keyring v0.0.0-20190208082241-fbe81aec3a07 @@ -27,5 +27,6 @@ require ( golang.org/x/net v0.0.0-20190328230028-74de082e2cca golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 google.golang.org/api v0.5.0 + google.golang.org/appengine v1.4.0 gopkg.in/alecthomas/kingpin.v2 v2.2.6 ) diff --git a/go.sum b/go.sum index bfe322eae..cb24c4ef1 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,8 @@ github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQ github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 h1:y4B3+GPxKlrigF1ha5FFErxK+sr6sWxQovRMzwMhejo= github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c= +github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo= +github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/danieljoos/wincred v1.0.1 h1:fcRTaj17zzROVqni2FiToKUVg3MmJ4NtMSGCySPIr/g= github.com/danieljoos/wincred v1.0.1/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U= @@ -154,6 +156,7 @@ google.golang.org/api v0.0.0-20181229000844-f26a60c56f14/go.mod h1:4mhQ8q/RsB7i+ google.golang.org/api v0.5.0 h1:lj9SyhMzyoa38fgFF0oO2T6pjs5IzkLPKfVtxpyCRMM= google.golang.org/api v0.5.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= diff --git a/internal/repotesting/repotesting.go b/internal/repotesting/repotesting.go index 34c21be54..ac4c4bd76 100644 --- a/internal/repotesting/repotesting.go +++ b/internal/repotesting/repotesting.go @@ -48,8 +48,7 @@ func (e *Environment) Setup(t *testing.T, opts ...func(*repo.NewRepositoryOption Encryption: "NONE", }, ObjectFormat: object.Format{ - Splitter: "FIXED", - MaxBlockSize: 400, + Splitter: "FIXED-1M", }, } diff --git a/repo/initialize.go b/repo/initialize.go index e68df8ad3..28dc41894 100644 --- a/repo/initialize.go +++ b/repo/initialize.go @@ -84,13 +84,10 @@ func repositoryObjectFormatFromOptions(opt *NewRepositoryOptions) *repositoryObj Encryption: applyDefaultString(opt.BlockFormat.Encryption, block.DefaultEncryption), HMACSecret: applyDefaultRandomBytes(opt.BlockFormat.HMACSecret, 32), MasterKey: applyDefaultRandomBytes(opt.BlockFormat.MasterKey, 32), - MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20)), // 20 MB + MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, 20<<20), // 20 MB }, Format: object.Format{ - Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter), - MaxBlockSize: applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20), // 20MiB - MinBlockSize: applyDefaultInt(opt.ObjectFormat.MinBlockSize, 10<<20), // 10MiB - AvgBlockSize: applyDefaultInt(opt.ObjectFormat.AvgBlockSize, 16<<20), // 16MiB + Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter), }, } diff --git a/repo/object/object_manager.go b/repo/object/object_manager.go index cf0c74a5e..839e17080 100644 --- a/repo/object/object_manager.go +++ b/repo/object/object_manager.go @@ -28,10 +28,7 @@ type blockManager interface { // Format describes the format of objects in a repository. type Format struct { - Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks - MinBlockSize int `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter - AvgBlockSize int `json:"avgBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter) - MaxBlockSize int `json:"maxBlockSize,omitempty"` // maximum size of storage block + Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks } // Manager implements a content-addressable storage on top of blob storage. @@ -41,7 +38,7 @@ type Manager struct { blockMgr blockManager trace func(message string, args ...interface{}) - newSplitter func() objectSplitter + newSplitter func() Splitter } // NewWriter creates an ObjectWriter for writing to the repository. @@ -166,14 +163,12 @@ func NewObjectManager(ctx context.Context, bm blockManager, f Format, opts Manag splitterID = "FIXED" } - os := splitterFactories[splitterID] + os := GetSplitterFactory(splitterID) if os == nil { return nil, fmt.Errorf("unsupported splitter %q", f.Splitter) } - om.newSplitter = func() objectSplitter { - return os(&f) - } + om.newSplitter = os if opts.Trace != nil { om.trace = opts.Trace diff --git a/repo/object/object_manager_test.go b/repo/object/object_manager_test.go index 1fdeb5153..9c6808f56 100644 --- a/repo/object/object_manager_test.go +++ b/repo/object/object_manager_test.go @@ -68,8 +68,7 @@ func setupTest(t *testing.T) (map[string][]byte, *Manager) { func setupTestWithData(t *testing.T, data map[string][]byte, opts ManagerOptions) (map[string][]byte, *Manager) { r, err := NewObjectManager(context.Background(), &fakeBlockManager{data: data}, Format{ - MaxBlockSize: 400, - Splitter: "FIXED", + Splitter: "FIXED-1M", }, opts) if err != nil { t.Fatalf("can't create object manager: %v", err) @@ -159,17 +158,22 @@ func verifyIndirectBlock(ctx context.Context, t *testing.T, r *Manager, oid ID) func TestIndirection(t *testing.T) { ctx := context.Background() + + splitterFactory := newFixedSplitterFactory(1000) cases := []struct { dataLength int expectedBlockCount int expectedIndirection int }{ {dataLength: 200, expectedBlockCount: 1, expectedIndirection: 0}, - {dataLength: 1400, expectedBlockCount: 3, expectedIndirection: 1}, - {dataLength: 2000, expectedBlockCount: 4, expectedIndirection: 2}, - {dataLength: 3000, expectedBlockCount: 5, expectedIndirection: 2}, - {dataLength: 4000, expectedBlockCount: 5, expectedIndirection: 2}, - {dataLength: 10000, expectedBlockCount: 10, expectedIndirection: 3}, + {dataLength: 1000, expectedBlockCount: 1, expectedIndirection: 0}, + {dataLength: 1001, expectedBlockCount: 3, expectedIndirection: 1}, + // 1 block of 1000 zeros, 1 block of 5 zeros + 1 index block + {dataLength: 3005, expectedBlockCount: 3, expectedIndirection: 1}, + // 1 block of 1000 zeros + 1 index block + {dataLength: 4000, expectedBlockCount: 2, expectedIndirection: 1}, + // 1 block of 1000 zeros + 1 index block + {dataLength: 10000, expectedBlockCount: 2, expectedIndirection: 1}, } for _, c := range cases { @@ -178,6 +182,7 @@ func TestIndirection(t *testing.T) { contentBytes := make([]byte, c.dataLength) writer := om.NewWriter(ctx, WriterOptions{}) + writer.(*objectWriter).splitter = splitterFactory() if _, err := writer.Write(contentBytes); err != nil { t.Errorf("write error: %v", err) } @@ -186,6 +191,8 @@ func TestIndirection(t *testing.T) { t.Errorf("error getting writer results: %v", err) } + t.Logf("len %v got %v", len(contentBytes), result) + if indirectionLevel(result) != c.expectedIndirection { t.Errorf("incorrect indirection level for size: %v: %v, expected %v", c.dataLength, indirectionLevel(result), c.expectedIndirection) } diff --git a/repo/object/object_splitter.go b/repo/object/object_splitter.go index 085274687..3129d08c3 100644 --- a/repo/object/object_splitter.go +++ b/repo/object/object_splitter.go @@ -1,33 +1,57 @@ package object import ( - "math" "sort" - - "github.com/silvasur/buzhash" ) -type objectSplitter interface { - add(b byte) bool +const ( + splitterSlidingWindowSize = 64 +) + +// Splitter determines when to split a given object. +// It must return true if the object should be split after byte b is processed. +type Splitter interface { + ShouldSplit(b byte) bool } -// SupportedSplitters is a list of supported object splitters including: -// -// NEVER - prevents objects from ever splitting -// FIXED - always splits large objects exactly at the maximum block size boundary -// DYNAMIC - dynamically splits large objects based on rolling hash of contents. +// SupportedSplitters is a list of supported object splitters. var SupportedSplitters []string -var splitterFactories = map[string]func(*Format) objectSplitter{ - "NEVER": func(f *Format) objectSplitter { - return newNeverSplitter() - }, - "FIXED": func(f *Format) objectSplitter { - return newFixedSplitter(f.MaxBlockSize) - }, - "DYNAMIC": func(f *Format) objectSplitter { - return newRollingHashSplitter(buzhash.NewBuzHash(32), f.MinBlockSize, f.AvgBlockSize, f.MaxBlockSize) - }, +// SplitterFactory creates instances of Splitter +type SplitterFactory func() Splitter + +// splitterFactories is a map of registered splitter factories. +var splitterFactories = map[string]SplitterFactory{ + "FIXED-1M": newFixedSplitterFactory(megabytes(1)), + "FIXED-2M": newFixedSplitterFactory(megabytes(2)), + "FIXED-4M": newFixedSplitterFactory(megabytes(4)), + "FIXED-8M": newFixedSplitterFactory(megabytes(8)), + + "DYNAMIC-1M-BUZHASH": newBuzHash32SplitterFactory(megabytes(1)), + "DYNAMIC-2M-BUZHASH": newBuzHash32SplitterFactory(megabytes(2)), + "DYNAMIC-4M-BUZHASH": newBuzHash32SplitterFactory(megabytes(4)), + "DYNAMIC-8M-BUZHASH": newBuzHash32SplitterFactory(megabytes(8)), + + "DYNAMIC-1M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(1)), + "DYNAMIC-2M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(2)), + "DYNAMIC-4M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(4)), + "DYNAMIC-8M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(8)), + + // handle deprecated legacy names to splitters of arbitrary size + "FIXED": newFixedSplitterFactory(4 << 20), + + // we don't want to use old DYNAMIC splitter because of its licence, so + // map this one to arbitrary buzhash32 (different) + "DYNAMIC": newBuzHash32SplitterFactory(megabytes(4)), +} + +func megabytes(mb int) int { + return mb << 20 +} + +// GetSplitterFactory gets splitter factory with a specified name or nil if not found. +func GetSplitterFactory(name string) SplitterFactory { + return splitterFactories[name] } func init() { @@ -38,73 +62,4 @@ func init() { } // DefaultSplitter is the name of the splitter used by default for new repositories. -const DefaultSplitter = "DYNAMIC" - -type neverSplitter struct{} - -func (s *neverSplitter) add(b byte) bool { - return false -} - -func newNeverSplitter() objectSplitter { - return &neverSplitter{} -} - -type fixedSplitter struct { - cur int - chunkLength int -} - -func (s *fixedSplitter) add(b byte) bool { - s.cur++ - if s.cur >= s.chunkLength { - s.cur = 0 - return true - } - - return false -} - -func newFixedSplitter(chunkLength int) objectSplitter { - return &fixedSplitter{chunkLength: chunkLength} -} - -type rollingHash interface { - HashByte(b byte) uint32 -} - -type rollingHashSplitter struct { - rh rollingHash - mask uint32 - - currentBlockSize int - minBlockSize int - maxBlockSize int -} - -func (rs *rollingHashSplitter) add(b byte) bool { - sum := rs.rh.HashByte(b) - rs.currentBlockSize++ - if rs.currentBlockSize >= rs.maxBlockSize { - rs.currentBlockSize = 0 - return true - } - if sum&rs.mask == 0 && rs.currentBlockSize > rs.minBlockSize && sum != 0 { - //log.Printf("splitting %v on sum %x mask %x", rs.currentBlockSize, sum, rs.mask) - rs.currentBlockSize = 0 - return true - } - return false -} - -func newRollingHashSplitter(rh rollingHash, minBlockSize int, approxBlockSize int, maxBlockSize int) objectSplitter { - bits := rollingHashBits(approxBlockSize) - mask := ^(^uint32(0) << bits) - return &rollingHashSplitter{rh, mask, 0, minBlockSize, maxBlockSize} -} - -func rollingHashBits(n int) uint { - e := math.Log2(float64(n)) - exp := math.Floor(e + 0.5) - return uint(exp) -} +const DefaultSplitter = "DYNAMIC-4M-BUZHASH" diff --git a/repo/object/object_splitter_test.go b/repo/object/object_splitter_test.go index 1b0e3592b..0e56b8e1e 100644 --- a/repo/object/object_splitter_test.go +++ b/repo/object/object_splitter_test.go @@ -4,17 +4,15 @@ "math" "math/rand" "testing" - - "github.com/silvasur/buzhash" ) func TestSplitters(t *testing.T) { cases := []struct { desc string - newSplitter func() objectSplitter + newSplitter func() Splitter }{ - {"rolling buzhash with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }}, - {"rolling buzhash with 5 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }}, + // {"rolling buzhash with 3 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }}, + // {"rolling buzhash with 5 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }}, } for _, tc := range cases { @@ -25,8 +23,8 @@ func TestSplitters(t *testing.T) { rand.Read(rnd) for i, p := range rnd { - if got, want := s1.add(p), s2.add(p); got != want { - t.Errorf("incorrect add() result for %v at offset %v", tc.desc, i) + if got, want := s1.ShouldSplit(p), s2.ShouldSplit(p); got != want { + t.Errorf("incorrect ShouldSplit() result for %v at offset %v", tc.desc, i) } } } @@ -40,29 +38,29 @@ func TestSplitterStability(t *testing.T) { } cases := []struct { - splitter objectSplitter + splitter Splitter count int avg int minSplit int maxSplit int }{ - {newFixedSplitter(1000), 5000, 1000, 1000, 1000}, - {newFixedSplitter(10000), 500, 10000, 10000, 10000}, + // {newFixedSplitter(1000), 5000, 1000, 1000, 1000}, + // {newFixedSplitter(10000), 500, 10000, 10000, 10000}, - {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, + // {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449}, - // min and max - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000}, + // // min and max + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000}, } for _, tc := range cases { @@ -73,7 +71,7 @@ func TestSplitterStability(t *testing.T) { minSplit := int(math.MaxInt32) count := 0 for i, p := range rnd { - if s.add(p) { + if s.ShouldSplit(p) { l := i - lastSplit if l >= maxSplit { maxSplit = l @@ -106,29 +104,3 @@ func TestSplitterStability(t *testing.T) { } } } - -func TestRollingHashBits(t *testing.T) { - cases := []struct { - blockSize int - bits uint - }{ - {256, 8}, - {128, 7}, - {100, 7}, - {500, 9}, - {700, 9}, - {724, 9}, - {725, 10}, - {768, 10}, - {1000, 10}, - {1000000, 20}, - {10000000, 23}, - {20000000, 24}, - } - - for _, tc := range cases { - if got, want := rollingHashBits(tc.blockSize), tc.bits; got != want { - t.Errorf("rollingHashBits(%v) = %v, wanted %v", tc.blockSize, got, want) - } - } -} diff --git a/repo/object/object_writer.go b/repo/object/object_writer.go index ef0b35e04..98b938840 100644 --- a/repo/object/object_writer.go +++ b/repo/object/object_writer.go @@ -58,7 +58,7 @@ type objectWriter struct { description string - splitter objectSplitter + splitter Splitter } func (w *objectWriter) Close() error { @@ -72,7 +72,7 @@ func (w *objectWriter) Write(data []byte) (n int, err error) { for _, d := range data { w.buffer.WriteByte(d) - if w.splitter.add(d) { + if w.splitter.ShouldSplit(d) { if err := w.flushBuffer(); err != nil { return 0, err } diff --git a/repo/object/splitter_buzhash32.go b/repo/object/splitter_buzhash32.go new file mode 100644 index 000000000..5cecd7bce --- /dev/null +++ b/repo/object/splitter_buzhash32.go @@ -0,0 +1,47 @@ +package object + +import ( + "github.com/chmduquesne/rollinghash/buzhash32" +) + +type buzhash32Splitter struct { + // we're intentionally not using rollinghash.Hash32 interface because doing this in a tight loop + // is 40% slower because compiler can't inline the call. + rh *buzhash32.Buzhash32 + mask uint32 + count int + minSize int + maxSize int +} + +func (rs *buzhash32Splitter) ShouldSplit(b byte) bool { + rs.rh.Roll(b) + rs.count++ + if rs.rh.Sum32()&rs.mask == 0 && rs.count >= rs.minSize { + rs.count = 0 + return true + } + + if rs.count >= rs.maxSize { + rs.count = 0 + return true + } + + return false +} + +func newBuzHash32SplitterFactory(avgSize int) SplitterFactory { + // avgSize must be a power of two, so 0b000001000...0000 + // it just so happens that mask is avgSize-1 :) + mask := uint32(avgSize - 1) + maxSize := avgSize * 2 + minSize := avgSize / 2 + + // log.Printf("Setting up buzhash with avg size: %v mask: %v %032b", avgSize, avgSize-1, mask) + + return func() Splitter { + s := buzhash32.New() + s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck + return &buzhash32Splitter{s, mask, 0, minSize, maxSize} + } +} diff --git a/repo/object/splitter_fixed.go b/repo/object/splitter_fixed.go new file mode 100644 index 000000000..cce6dcacb --- /dev/null +++ b/repo/object/splitter_fixed.go @@ -0,0 +1,22 @@ +package object + +type fixedSplitter struct { + cur int + chunkLength int +} + +func (s *fixedSplitter) ShouldSplit(b byte) bool { + s.cur++ + if s.cur >= s.chunkLength { + s.cur = 0 + return true + } + + return false +} + +func newFixedSplitterFactory(length int) SplitterFactory { + return func() Splitter { + return &fixedSplitter{chunkLength: length} + } +} diff --git a/repo/object/splitter_rabinkarp64.go b/repo/object/splitter_rabinkarp64.go new file mode 100644 index 000000000..3bc80ee1f --- /dev/null +++ b/repo/object/splitter_rabinkarp64.go @@ -0,0 +1,40 @@ +package object + +import "github.com/chmduquesne/rollinghash/rabinkarp64" + +type rabinKarp64Splitter struct { + // we're intentionally not using rollinghash.Hash32 interface because doing this in a tight loop + // is 40% slower because compiler can't inline the call. + rh *rabinkarp64.RabinKarp64 + mask uint64 + count int + minSize int + maxSize int +} + +func (rs *rabinKarp64Splitter) ShouldSplit(b byte) bool { + rs.rh.Roll(b) + rs.count++ + if rs.rh.Sum64()&rs.mask == 0 && rs.count >= rs.minSize { + rs.count = 0 + return true + } + if rs.count >= rs.maxSize { + rs.count = 0 + return true + } + + return false +} + +func newRabinKarp64SplitterFactory(avgSize int) SplitterFactory { + mask := uint64(avgSize - 1) + maxSize := avgSize * 2 + minSize := avgSize / 2 + + return func() Splitter { + s := rabinkarp64.New() + s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck + return &rabinKarp64Splitter{s, mask, 0, minSize, maxSize} + } +} diff --git a/repo/open.go b/repo/open.go index 47581b758..050d840d0 100644 --- a/repo/open.go +++ b/repo/open.go @@ -7,8 +7,8 @@ "io/ioutil" "path/filepath" - "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/internal/repologging" + "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/repo/manifest" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" @@ -107,7 +107,7 @@ func OpenWithConfig(ctx context.Context, st storage.Storage, lc *LocalConfig, pa fo := repoConfig.FormattingOptions if fo.MaxPackSize == 0 { - fo.MaxPackSize = repoConfig.MaxBlockSize + fo.MaxPackSize = 20 << 20 // 20 MB } log.Debugf("initializing block manager") diff --git a/repo/repository_test.go b/repo/repository_test.go index a37383dd0..025da8700 100644 --- a/repo/repository_test.go +++ b/repo/repository_test.go @@ -11,9 +11,9 @@ "runtime/debug" "testing" + "github.com/kopia/kopia/internal/repotesting" "github.com/kopia/kopia/repo" "github.com/kopia/kopia/repo/block" - "github.com/kopia/kopia/internal/repotesting" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" ) @@ -263,8 +263,7 @@ func TestFormats(t *testing.T) { n.BlockFormat.Hash = hash n.BlockFormat.Encryption = encryption n.BlockFormat.HMACSecret = []byte("key") - n.ObjectFormat.MaxBlockSize = 10000 - n.ObjectFormat.Splitter = "FIXED" + n.ObjectFormat.Splitter = "FIXED-1M" } } @@ -274,7 +273,6 @@ func TestFormats(t *testing.T) { }{ { format: func(n *repo.NewRepositoryOptions) { - n.ObjectFormat.MaxBlockSize = 10000 }, oids: map[string]object.ID{ "": "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad",