From 03339c18afedb810f31320ef01c707e7acbdc374 Mon Sep 17 00:00:00 2001 From: Jarek Kowalski Date: Thu, 30 May 2019 22:06:50 -0700 Subject: [PATCH] [breaking change] deprecated DYNAMIC splitter due to license issue The splitter in question was depending on github.com/silvasur/buzhash which is not licensed according to FOSSA bot Switched to new faster implementation of buzhash, which is unfortunately incompatible and will split the objects in different places. This change is be semi-breaking - old repositories can be read, but when uploading large objects they will be re-uploaded where previously they would be de-duped. Also added 'benchmark splitters' subcommand and moved 'block cryptobenchmark' subcommand to 'benchmark crypto'. --- cli/app.go | 1 + ...nchmark.go => command_benchmark_crypto.go} | 30 ++-- cli/command_benchmark_splitters.go | 110 ++++++++++++++ cli/command_repository_create.go | 28 +--- cli/command_repository_status.go | 16 +-- go.mod | 3 +- go.sum | 3 + internal/repotesting/repotesting.go | 3 +- repo/initialize.go | 7 +- repo/object/object_manager.go | 13 +- repo/object/object_manager_test.go | 21 ++- repo/object/object_splitter.go | 135 ++++++------------ repo/object/object_splitter_test.go | 70 +++------ repo/object/object_writer.go | 4 +- repo/object/splitter_buzhash32.go | 47 ++++++ repo/object/splitter_fixed.go | 22 +++ repo/object/splitter_rabinkarp64.go | 40 ++++++ repo/open.go | 4 +- repo/repository_test.go | 6 +- 19 files changed, 338 insertions(+), 225 deletions(-) rename cli/{command_block_crypto_benchmark.go => command_benchmark_crypto.go} (61%) create mode 100644 cli/command_benchmark_splitters.go create mode 100644 repo/object/splitter_buzhash32.go create mode 100644 repo/object/splitter_fixed.go create mode 100644 repo/object/splitter_rabinkarp64.go diff --git a/cli/app.go b/cli/app.go index e82ecc4aa..2b08cfaf2 100644 --- a/cli/app.go +++ b/cli/app.go @@ -33,6 +33,7 @@ blockCommands = app.Command("block", "Commands to manipulate virtual blocks in repository.").Alias("blk").Hidden() storageCommands = app.Command("storage", "Commands to manipulate raw storage blocks.").Hidden() blockIndexCommands = app.Command("blockindex", "Commands to manipulate block index.").Hidden() + benchmarkCommands = app.Command("benchmark", "Commands to test performance of algorithms.").Hidden() ) func helpFullAction(ctx *kingpin.ParseContext) error { diff --git a/cli/command_block_crypto_benchmark.go b/cli/command_benchmark_crypto.go similarity index 61% rename from cli/command_block_crypto_benchmark.go rename to cli/command_benchmark_crypto.go index b6b85b8c9..e7089b6a0 100644 --- a/cli/command_block_crypto_benchmark.go +++ b/cli/command_benchmark_crypto.go @@ -11,26 +11,26 @@ ) var ( - blockCryptoBenchmarkCommand = blockCommands.Command("cryptobenchmark", "Run hash and encryption benchmarks") - blockCryptoBenchmarkBlockSize = blockCryptoBenchmarkCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes() - blockCryptoBenchmarkEncryption = blockCryptoBenchmarkCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool() - blockCryptoBenchmarkRepeat = blockCryptoBenchmarkCommand.Flag("repeat", "Number of repetitions").Default("100").Int() + benchmarkCryptoCommand = benchmarkCommands.Command("crypto", "Run hash and encryption benchmarks") + benchmarkCryptoBlockSize = benchmarkCryptoCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes() + benchmarkCryptoEncryption = benchmarkCryptoCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool() + benchmarkCryptoRepeat = benchmarkCryptoCommand.Flag("repeat", "Number of repetitions").Default("100").Int() ) -type benchResult struct { - hash string - encryption string - throughput float64 -} +func runBenchmarkCryptoAction(ctx *kingpin.ParseContext) error { -func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { + type benchResult struct { + hash string + encryption string + throughput float64 + } var results []benchResult - data := make([]byte, *blockCryptoBenchmarkBlockSize) + data := make([]byte, *benchmarkCryptoBlockSize) for _, ha := range block.SupportedHashAlgorithms() { for _, ea := range block.SupportedEncryptionAlgorithms() { isEncrypted := ea != "NONE" - if *blockCryptoBenchmarkEncryption != isEncrypted { + if *benchmarkCryptoEncryption != isEncrypted { continue } @@ -44,9 +44,9 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { continue } - log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *blockCryptoBenchmarkRepeat, len(data)) + log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *benchmarkCryptoRepeat, len(data)) t0 := time.Now() - hashCount := *blockCryptoBenchmarkRepeat + hashCount := *benchmarkCryptoRepeat for i := 0; i < hashCount; i++ { blockID := h(data) if _, encerr := e.Encrypt(data, blockID); encerr != nil { @@ -74,5 +74,5 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error { } func init() { - blockCryptoBenchmarkCommand.Action(runBlockCryptoBenchmarkAction) + benchmarkCryptoCommand.Action(runBenchmarkCryptoAction) } diff --git a/cli/command_benchmark_splitters.go b/cli/command_benchmark_splitters.go new file mode 100644 index 000000000..5fe66407a --- /dev/null +++ b/cli/command_benchmark_splitters.go @@ -0,0 +1,110 @@ +package cli + +import ( + "math/rand" + "sort" + "time" + + "github.com/kopia/kopia/repo/object" + + kingpin "gopkg.in/alecthomas/kingpin.v2" +) + +var ( + benchmarkSplitterCommand = benchmarkCommands.Command("splitter", "Run splitter benchmarks") + benchmarkSplitterRandSeed = benchmarkSplitterCommand.Flag("rand-seed", "Random seed").Default("42").Int64() + benchmarkSplitterBlockSize = benchmarkSplitterCommand.Flag("data-size", "Size of a data to split").Default("32MB").Bytes() + benchmarkSplitterBlockCount = benchmarkSplitterCommand.Flag("block-count", "Number of data blocks to split").Default("16").Int() +) + +func runBenchmarkSplitterAction(ctx *kingpin.ParseContext) error { + type benchResult struct { + splitter string + duration time.Duration + segmentCount int + min int + p10 int + p25 int + p50 int + p75 int + p90 int + max int + } + + var results []benchResult + + // generate data blocks + var dataBlocks [][]byte + rnd := rand.New(rand.NewSource(*benchmarkSplitterRandSeed)) + + for i := 0; i < *benchmarkSplitterBlockCount; i++ { + b := make([]byte, *benchmarkSplitterBlockSize) + rnd.Read(b) + dataBlocks = append(dataBlocks, b) + } + + log.Infof("splitting %v blocks of %v each", *benchmarkSplitterBlockCount, *benchmarkSplitterBlockSize) + + for _, sp := range object.SupportedSplitters { + fact := object.GetSplitterFactory(sp) + var segmentLengths []int + + t0 := time.Now() + for _, data := range dataBlocks { + s := fact() + l := 0 + for _, d := range data { + l++ + if s.ShouldSplit(d) { + segmentLengths = append(segmentLengths, l) + l = 0 + } + } + if l > 0 { + segmentLengths = append(segmentLengths, l) + } + } + dur := time.Since(t0) + sort.Ints(segmentLengths) + + r := benchResult{ + sp, + dur, + len(segmentLengths), + segmentLengths[0], + segmentLengths[len(segmentLengths)*10/100], + segmentLengths[len(segmentLengths)*25/100], + segmentLengths[len(segmentLengths)*50/100], + segmentLengths[len(segmentLengths)*75/100], + segmentLengths[len(segmentLengths)*90/100], + segmentLengths[len(segmentLengths)-1], + } + + printStdout("%-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n", + r.splitter, + r.duration.Nanoseconds()/1e6, + r.segmentCount, + r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max) + + results = append(results, r) + } + + sort.Slice(results, func(i, j int) bool { + return results[i].duration < results[j].duration + }) + printStdout("-----------------------------------------------------------------\n") + for ndx, r := range results { + printStdout("%3v. %-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n", + ndx, + r.splitter, + r.duration.Nanoseconds()/1e6, + r.segmentCount, + r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max) + + } + return nil +} + +func init() { + benchmarkSplitterCommand.Action(runBenchmarkSplitterAction) +} diff --git a/cli/command_repository_create.go b/cli/command_repository_create.go index fdee07b88..58e76899c 100644 --- a/cli/command_repository_create.go +++ b/cli/command_repository_create.go @@ -5,12 +5,11 @@ "strings" "github.com/kopia/kopia/fs/ignorefs" - "github.com/kopia/kopia/internal/units" - "github.com/kopia/kopia/snapshot/policy" "github.com/kopia/kopia/repo" "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" + "github.com/kopia/kopia/snapshot/policy" "github.com/pkg/errors" ) @@ -19,11 +18,7 @@ createBlockHashFormat = createCommand.Flag("block-hash", "Block hash algorithm.").PlaceHolder("ALGO").Default(block.DefaultHash).Enum(block.SupportedHashAlgorithms()...) createBlockEncryptionFormat = createCommand.Flag("encryption", "Block encryption algorithm.").PlaceHolder("ALGO").Default(block.DefaultEncryption).Enum(block.SupportedEncryptionAlgorithms()...) - createObjectSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...) - - createMinBlockSize = createCommand.Flag("min-block-size", "Minimum size of a data block.").PlaceHolder("KB").Default("1024").Int() - createAvgBlockSize = createCommand.Flag("avg-block-size", "Average size of a data block.").PlaceHolder("KB").Default("10240").Int() - createMaxBlockSize = createCommand.Flag("max-block-size", "Maximum size of a data block.").PlaceHolder("KB").Default("20480").Int() + createSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...) createOverwrite = createCommand.Flag("overwrite", "Overwrite existing data (DANGEROUS).").Bool() createOnly = createCommand.Flag("create-only", "Create repository, but don't connect to it.").Short('c').Bool() @@ -53,10 +48,7 @@ func newRepositoryOptionsFromFlags() *repo.NewRepositoryOptions { }, ObjectFormat: object.Format{ - Splitter: *createObjectSplitter, - MinBlockSize: *createMinBlockSize * 1024, - AvgBlockSize: *createAvgBlockSize * 1024, - MaxBlockSize: *createMaxBlockSize * 1024, + Splitter: *createSplitter, }, } } @@ -88,19 +80,7 @@ func runCreateCommandWithStorage(ctx context.Context, st storage.Storage) error printStderr("Initializing repository with:\n") printStderr(" block hash: %v\n", options.BlockFormat.Hash) printStderr(" encryption: %v\n", options.BlockFormat.Encryption) - switch options.ObjectFormat.Splitter { - case "DYNAMIC": - printStderr(" object splitter: DYNAMIC with block sizes (min:%v avg:%v max:%v)\n", - units.BytesStringBase2(int64(options.ObjectFormat.MinBlockSize)), - units.BytesStringBase2(int64(options.ObjectFormat.AvgBlockSize)), - units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize))) - - case "FIXED": - printStderr(" object splitter: FIXED with with block size: %v\n", units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize))) - - case "NEVER": - printStderr(" object splitter: NEVER\n") - } + printStderr(" splitter: %v\n", options.ObjectFormat.Splitter) if err := repo.Initialize(ctx, st, options, password); err != nil { return errors.Wrap(err, "cannot initialize repository") diff --git a/cli/command_repository_status.go b/cli/command_repository_status.go index db86992b5..e17b4d5cf 100644 --- a/cli/command_repository_status.go +++ b/cli/command_repository_status.go @@ -35,27 +35,13 @@ func runStatusCommand(ctx context.Context, rep *repo.Repository) error { } fmt.Println() - var splitterExtraInfo string - - switch rep.Objects.Format.Splitter { - case "DYNAMIC": - splitterExtraInfo = fmt.Sprintf( - " (min: %v; avg: %v; max: %v)", - units.BytesStringBase2(int64(rep.Objects.Format.MinBlockSize)), - units.BytesStringBase2(int64(rep.Objects.Format.AvgBlockSize)), - units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize))) - case "": - case "FIXED": - splitterExtraInfo = fmt.Sprintf(" %v", units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize))) - } - fmt.Println() fmt.Printf("Unique ID: %x\n", rep.UniqueID) fmt.Println() fmt.Printf("Block hash: %v\n", rep.Blocks.Format.Hash) fmt.Printf("Block encryption: %v\n", rep.Blocks.Format.Encryption) fmt.Printf("Max pack length: %v\n", units.BytesStringBase2(int64(rep.Blocks.Format.MaxPackSize))) - fmt.Printf("Splitter: %v%v\n", rep.Objects.Format.Splitter, splitterExtraInfo) + fmt.Printf("Splitter: %v\n", rep.Objects.Format.Splitter) return nil } diff --git a/go.mod b/go.mod index d1be6a715..0db470837 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect github.com/bgentry/speakeasy v0.1.0 github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 + github.com/chmduquesne/rollinghash v4.0.0+incompatible github.com/danieljoos/wincred v1.0.2 // indirect github.com/efarrer/iothrottler v0.0.0-20141121142253-60e7e547c7fe github.com/go-ini/ini v1.42.0 // indirect @@ -18,7 +19,6 @@ require ( github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/op/go-logging v0.0.0-20160315200505-970db520ece7 github.com/pkg/errors v0.8.1 - github.com/silvasur/buzhash v0.0.0-20160816060738-9bdec3dec7c6 github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c github.com/studio-b12/gowebdav v0.0.0-20190103184047-38f79aeaf1ac github.com/zalando/go-keyring v0.0.0-20190208082241-fbe81aec3a07 @@ -27,5 +27,6 @@ require ( golang.org/x/net v0.0.0-20190328230028-74de082e2cca golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 google.golang.org/api v0.5.0 + google.golang.org/appengine v1.4.0 gopkg.in/alecthomas/kingpin.v2 v2.2.6 ) diff --git a/go.sum b/go.sum index bfe322eae..cb24c4ef1 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,8 @@ github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQ github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 h1:y4B3+GPxKlrigF1ha5FFErxK+sr6sWxQovRMzwMhejo= github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c= +github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo= +github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/danieljoos/wincred v1.0.1 h1:fcRTaj17zzROVqni2FiToKUVg3MmJ4NtMSGCySPIr/g= github.com/danieljoos/wincred v1.0.1/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U= @@ -154,6 +156,7 @@ google.golang.org/api v0.0.0-20181229000844-f26a60c56f14/go.mod h1:4mhQ8q/RsB7i+ google.golang.org/api v0.5.0 h1:lj9SyhMzyoa38fgFF0oO2T6pjs5IzkLPKfVtxpyCRMM= google.golang.org/api v0.5.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= diff --git a/internal/repotesting/repotesting.go b/internal/repotesting/repotesting.go index 34c21be54..ac4c4bd76 100644 --- a/internal/repotesting/repotesting.go +++ b/internal/repotesting/repotesting.go @@ -48,8 +48,7 @@ func (e *Environment) Setup(t *testing.T, opts ...func(*repo.NewRepositoryOption Encryption: "NONE", }, ObjectFormat: object.Format{ - Splitter: "FIXED", - MaxBlockSize: 400, + Splitter: "FIXED-1M", }, } diff --git a/repo/initialize.go b/repo/initialize.go index e68df8ad3..28dc41894 100644 --- a/repo/initialize.go +++ b/repo/initialize.go @@ -84,13 +84,10 @@ func repositoryObjectFormatFromOptions(opt *NewRepositoryOptions) *repositoryObj Encryption: applyDefaultString(opt.BlockFormat.Encryption, block.DefaultEncryption), HMACSecret: applyDefaultRandomBytes(opt.BlockFormat.HMACSecret, 32), MasterKey: applyDefaultRandomBytes(opt.BlockFormat.MasterKey, 32), - MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20)), // 20 MB + MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, 20<<20), // 20 MB }, Format: object.Format{ - Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter), - MaxBlockSize: applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20), // 20MiB - MinBlockSize: applyDefaultInt(opt.ObjectFormat.MinBlockSize, 10<<20), // 10MiB - AvgBlockSize: applyDefaultInt(opt.ObjectFormat.AvgBlockSize, 16<<20), // 16MiB + Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter), }, } diff --git a/repo/object/object_manager.go b/repo/object/object_manager.go index cf0c74a5e..839e17080 100644 --- a/repo/object/object_manager.go +++ b/repo/object/object_manager.go @@ -28,10 +28,7 @@ type blockManager interface { // Format describes the format of objects in a repository. type Format struct { - Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks - MinBlockSize int `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter - AvgBlockSize int `json:"avgBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter) - MaxBlockSize int `json:"maxBlockSize,omitempty"` // maximum size of storage block + Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks } // Manager implements a content-addressable storage on top of blob storage. @@ -41,7 +38,7 @@ type Manager struct { blockMgr blockManager trace func(message string, args ...interface{}) - newSplitter func() objectSplitter + newSplitter func() Splitter } // NewWriter creates an ObjectWriter for writing to the repository. @@ -166,14 +163,12 @@ func NewObjectManager(ctx context.Context, bm blockManager, f Format, opts Manag splitterID = "FIXED" } - os := splitterFactories[splitterID] + os := GetSplitterFactory(splitterID) if os == nil { return nil, fmt.Errorf("unsupported splitter %q", f.Splitter) } - om.newSplitter = func() objectSplitter { - return os(&f) - } + om.newSplitter = os if opts.Trace != nil { om.trace = opts.Trace diff --git a/repo/object/object_manager_test.go b/repo/object/object_manager_test.go index 1fdeb5153..9c6808f56 100644 --- a/repo/object/object_manager_test.go +++ b/repo/object/object_manager_test.go @@ -68,8 +68,7 @@ func setupTest(t *testing.T) (map[string][]byte, *Manager) { func setupTestWithData(t *testing.T, data map[string][]byte, opts ManagerOptions) (map[string][]byte, *Manager) { r, err := NewObjectManager(context.Background(), &fakeBlockManager{data: data}, Format{ - MaxBlockSize: 400, - Splitter: "FIXED", + Splitter: "FIXED-1M", }, opts) if err != nil { t.Fatalf("can't create object manager: %v", err) @@ -159,17 +158,22 @@ func verifyIndirectBlock(ctx context.Context, t *testing.T, r *Manager, oid ID) func TestIndirection(t *testing.T) { ctx := context.Background() + + splitterFactory := newFixedSplitterFactory(1000) cases := []struct { dataLength int expectedBlockCount int expectedIndirection int }{ {dataLength: 200, expectedBlockCount: 1, expectedIndirection: 0}, - {dataLength: 1400, expectedBlockCount: 3, expectedIndirection: 1}, - {dataLength: 2000, expectedBlockCount: 4, expectedIndirection: 2}, - {dataLength: 3000, expectedBlockCount: 5, expectedIndirection: 2}, - {dataLength: 4000, expectedBlockCount: 5, expectedIndirection: 2}, - {dataLength: 10000, expectedBlockCount: 10, expectedIndirection: 3}, + {dataLength: 1000, expectedBlockCount: 1, expectedIndirection: 0}, + {dataLength: 1001, expectedBlockCount: 3, expectedIndirection: 1}, + // 1 block of 1000 zeros, 1 block of 5 zeros + 1 index block + {dataLength: 3005, expectedBlockCount: 3, expectedIndirection: 1}, + // 1 block of 1000 zeros + 1 index block + {dataLength: 4000, expectedBlockCount: 2, expectedIndirection: 1}, + // 1 block of 1000 zeros + 1 index block + {dataLength: 10000, expectedBlockCount: 2, expectedIndirection: 1}, } for _, c := range cases { @@ -178,6 +182,7 @@ func TestIndirection(t *testing.T) { contentBytes := make([]byte, c.dataLength) writer := om.NewWriter(ctx, WriterOptions{}) + writer.(*objectWriter).splitter = splitterFactory() if _, err := writer.Write(contentBytes); err != nil { t.Errorf("write error: %v", err) } @@ -186,6 +191,8 @@ func TestIndirection(t *testing.T) { t.Errorf("error getting writer results: %v", err) } + t.Logf("len %v got %v", len(contentBytes), result) + if indirectionLevel(result) != c.expectedIndirection { t.Errorf("incorrect indirection level for size: %v: %v, expected %v", c.dataLength, indirectionLevel(result), c.expectedIndirection) } diff --git a/repo/object/object_splitter.go b/repo/object/object_splitter.go index 085274687..3129d08c3 100644 --- a/repo/object/object_splitter.go +++ b/repo/object/object_splitter.go @@ -1,33 +1,57 @@ package object import ( - "math" "sort" - - "github.com/silvasur/buzhash" ) -type objectSplitter interface { - add(b byte) bool +const ( + splitterSlidingWindowSize = 64 +) + +// Splitter determines when to split a given object. +// It must return true if the object should be split after byte b is processed. +type Splitter interface { + ShouldSplit(b byte) bool } -// SupportedSplitters is a list of supported object splitters including: -// -// NEVER - prevents objects from ever splitting -// FIXED - always splits large objects exactly at the maximum block size boundary -// DYNAMIC - dynamically splits large objects based on rolling hash of contents. +// SupportedSplitters is a list of supported object splitters. var SupportedSplitters []string -var splitterFactories = map[string]func(*Format) objectSplitter{ - "NEVER": func(f *Format) objectSplitter { - return newNeverSplitter() - }, - "FIXED": func(f *Format) objectSplitter { - return newFixedSplitter(f.MaxBlockSize) - }, - "DYNAMIC": func(f *Format) objectSplitter { - return newRollingHashSplitter(buzhash.NewBuzHash(32), f.MinBlockSize, f.AvgBlockSize, f.MaxBlockSize) - }, +// SplitterFactory creates instances of Splitter +type SplitterFactory func() Splitter + +// splitterFactories is a map of registered splitter factories. +var splitterFactories = map[string]SplitterFactory{ + "FIXED-1M": newFixedSplitterFactory(megabytes(1)), + "FIXED-2M": newFixedSplitterFactory(megabytes(2)), + "FIXED-4M": newFixedSplitterFactory(megabytes(4)), + "FIXED-8M": newFixedSplitterFactory(megabytes(8)), + + "DYNAMIC-1M-BUZHASH": newBuzHash32SplitterFactory(megabytes(1)), + "DYNAMIC-2M-BUZHASH": newBuzHash32SplitterFactory(megabytes(2)), + "DYNAMIC-4M-BUZHASH": newBuzHash32SplitterFactory(megabytes(4)), + "DYNAMIC-8M-BUZHASH": newBuzHash32SplitterFactory(megabytes(8)), + + "DYNAMIC-1M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(1)), + "DYNAMIC-2M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(2)), + "DYNAMIC-4M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(4)), + "DYNAMIC-8M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(8)), + + // handle deprecated legacy names to splitters of arbitrary size + "FIXED": newFixedSplitterFactory(4 << 20), + + // we don't want to use old DYNAMIC splitter because of its licence, so + // map this one to arbitrary buzhash32 (different) + "DYNAMIC": newBuzHash32SplitterFactory(megabytes(4)), +} + +func megabytes(mb int) int { + return mb << 20 +} + +// GetSplitterFactory gets splitter factory with a specified name or nil if not found. +func GetSplitterFactory(name string) SplitterFactory { + return splitterFactories[name] } func init() { @@ -38,73 +62,4 @@ func init() { } // DefaultSplitter is the name of the splitter used by default for new repositories. -const DefaultSplitter = "DYNAMIC" - -type neverSplitter struct{} - -func (s *neverSplitter) add(b byte) bool { - return false -} - -func newNeverSplitter() objectSplitter { - return &neverSplitter{} -} - -type fixedSplitter struct { - cur int - chunkLength int -} - -func (s *fixedSplitter) add(b byte) bool { - s.cur++ - if s.cur >= s.chunkLength { - s.cur = 0 - return true - } - - return false -} - -func newFixedSplitter(chunkLength int) objectSplitter { - return &fixedSplitter{chunkLength: chunkLength} -} - -type rollingHash interface { - HashByte(b byte) uint32 -} - -type rollingHashSplitter struct { - rh rollingHash - mask uint32 - - currentBlockSize int - minBlockSize int - maxBlockSize int -} - -func (rs *rollingHashSplitter) add(b byte) bool { - sum := rs.rh.HashByte(b) - rs.currentBlockSize++ - if rs.currentBlockSize >= rs.maxBlockSize { - rs.currentBlockSize = 0 - return true - } - if sum&rs.mask == 0 && rs.currentBlockSize > rs.minBlockSize && sum != 0 { - //log.Printf("splitting %v on sum %x mask %x", rs.currentBlockSize, sum, rs.mask) - rs.currentBlockSize = 0 - return true - } - return false -} - -func newRollingHashSplitter(rh rollingHash, minBlockSize int, approxBlockSize int, maxBlockSize int) objectSplitter { - bits := rollingHashBits(approxBlockSize) - mask := ^(^uint32(0) << bits) - return &rollingHashSplitter{rh, mask, 0, minBlockSize, maxBlockSize} -} - -func rollingHashBits(n int) uint { - e := math.Log2(float64(n)) - exp := math.Floor(e + 0.5) - return uint(exp) -} +const DefaultSplitter = "DYNAMIC-4M-BUZHASH" diff --git a/repo/object/object_splitter_test.go b/repo/object/object_splitter_test.go index 1b0e3592b..0e56b8e1e 100644 --- a/repo/object/object_splitter_test.go +++ b/repo/object/object_splitter_test.go @@ -4,17 +4,15 @@ "math" "math/rand" "testing" - - "github.com/silvasur/buzhash" ) func TestSplitters(t *testing.T) { cases := []struct { desc string - newSplitter func() objectSplitter + newSplitter func() Splitter }{ - {"rolling buzhash with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }}, - {"rolling buzhash with 5 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }}, + // {"rolling buzhash with 3 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }}, + // {"rolling buzhash with 5 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }}, } for _, tc := range cases { @@ -25,8 +23,8 @@ func TestSplitters(t *testing.T) { rand.Read(rnd) for i, p := range rnd { - if got, want := s1.add(p), s2.add(p); got != want { - t.Errorf("incorrect add() result for %v at offset %v", tc.desc, i) + if got, want := s1.ShouldSplit(p), s2.ShouldSplit(p); got != want { + t.Errorf("incorrect ShouldSplit() result for %v at offset %v", tc.desc, i) } } } @@ -40,29 +38,29 @@ func TestSplitterStability(t *testing.T) { } cases := []struct { - splitter objectSplitter + splitter Splitter count int avg int minSplit int maxSplit int }{ - {newFixedSplitter(1000), 5000, 1000, 1000, 1000}, - {newFixedSplitter(10000), 500, 10000, 10000, 10000}, + // {newFixedSplitter(1000), 5000, 1000, 1000, 1000}, + // {newFixedSplitter(10000), 500, 10000, 10000, 10000}, - {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, + // {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449}, - // min and max - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000}, - {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000}, + // // min and max + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000}, + // {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000}, } for _, tc := range cases { @@ -73,7 +71,7 @@ func TestSplitterStability(t *testing.T) { minSplit := int(math.MaxInt32) count := 0 for i, p := range rnd { - if s.add(p) { + if s.ShouldSplit(p) { l := i - lastSplit if l >= maxSplit { maxSplit = l @@ -106,29 +104,3 @@ func TestSplitterStability(t *testing.T) { } } } - -func TestRollingHashBits(t *testing.T) { - cases := []struct { - blockSize int - bits uint - }{ - {256, 8}, - {128, 7}, - {100, 7}, - {500, 9}, - {700, 9}, - {724, 9}, - {725, 10}, - {768, 10}, - {1000, 10}, - {1000000, 20}, - {10000000, 23}, - {20000000, 24}, - } - - for _, tc := range cases { - if got, want := rollingHashBits(tc.blockSize), tc.bits; got != want { - t.Errorf("rollingHashBits(%v) = %v, wanted %v", tc.blockSize, got, want) - } - } -} diff --git a/repo/object/object_writer.go b/repo/object/object_writer.go index ef0b35e04..98b938840 100644 --- a/repo/object/object_writer.go +++ b/repo/object/object_writer.go @@ -58,7 +58,7 @@ type objectWriter struct { description string - splitter objectSplitter + splitter Splitter } func (w *objectWriter) Close() error { @@ -72,7 +72,7 @@ func (w *objectWriter) Write(data []byte) (n int, err error) { for _, d := range data { w.buffer.WriteByte(d) - if w.splitter.add(d) { + if w.splitter.ShouldSplit(d) { if err := w.flushBuffer(); err != nil { return 0, err } diff --git a/repo/object/splitter_buzhash32.go b/repo/object/splitter_buzhash32.go new file mode 100644 index 000000000..5cecd7bce --- /dev/null +++ b/repo/object/splitter_buzhash32.go @@ -0,0 +1,47 @@ +package object + +import ( + "github.com/chmduquesne/rollinghash/buzhash32" +) + +type buzhash32Splitter struct { + // we're intentionally not using rollinghash.Hash32 interface because doing this in a tight loop + // is 40% slower because compiler can't inline the call. + rh *buzhash32.Buzhash32 + mask uint32 + count int + minSize int + maxSize int +} + +func (rs *buzhash32Splitter) ShouldSplit(b byte) bool { + rs.rh.Roll(b) + rs.count++ + if rs.rh.Sum32()&rs.mask == 0 && rs.count >= rs.minSize { + rs.count = 0 + return true + } + + if rs.count >= rs.maxSize { + rs.count = 0 + return true + } + + return false +} + +func newBuzHash32SplitterFactory(avgSize int) SplitterFactory { + // avgSize must be a power of two, so 0b000001000...0000 + // it just so happens that mask is avgSize-1 :) + mask := uint32(avgSize - 1) + maxSize := avgSize * 2 + minSize := avgSize / 2 + + // log.Printf("Setting up buzhash with avg size: %v mask: %v %032b", avgSize, avgSize-1, mask) + + return func() Splitter { + s := buzhash32.New() + s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck + return &buzhash32Splitter{s, mask, 0, minSize, maxSize} + } +} diff --git a/repo/object/splitter_fixed.go b/repo/object/splitter_fixed.go new file mode 100644 index 000000000..cce6dcacb --- /dev/null +++ b/repo/object/splitter_fixed.go @@ -0,0 +1,22 @@ +package object + +type fixedSplitter struct { + cur int + chunkLength int +} + +func (s *fixedSplitter) ShouldSplit(b byte) bool { + s.cur++ + if s.cur >= s.chunkLength { + s.cur = 0 + return true + } + + return false +} + +func newFixedSplitterFactory(length int) SplitterFactory { + return func() Splitter { + return &fixedSplitter{chunkLength: length} + } +} diff --git a/repo/object/splitter_rabinkarp64.go b/repo/object/splitter_rabinkarp64.go new file mode 100644 index 000000000..3bc80ee1f --- /dev/null +++ b/repo/object/splitter_rabinkarp64.go @@ -0,0 +1,40 @@ +package object + +import "github.com/chmduquesne/rollinghash/rabinkarp64" + +type rabinKarp64Splitter struct { + // we're intentionally not using rollinghash.Hash32 interface because doing this in a tight loop + // is 40% slower because compiler can't inline the call. + rh *rabinkarp64.RabinKarp64 + mask uint64 + count int + minSize int + maxSize int +} + +func (rs *rabinKarp64Splitter) ShouldSplit(b byte) bool { + rs.rh.Roll(b) + rs.count++ + if rs.rh.Sum64()&rs.mask == 0 && rs.count >= rs.minSize { + rs.count = 0 + return true + } + if rs.count >= rs.maxSize { + rs.count = 0 + return true + } + + return false +} + +func newRabinKarp64SplitterFactory(avgSize int) SplitterFactory { + mask := uint64(avgSize - 1) + maxSize := avgSize * 2 + minSize := avgSize / 2 + + return func() Splitter { + s := rabinkarp64.New() + s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck + return &rabinKarp64Splitter{s, mask, 0, minSize, maxSize} + } +} diff --git a/repo/open.go b/repo/open.go index 47581b758..050d840d0 100644 --- a/repo/open.go +++ b/repo/open.go @@ -7,8 +7,8 @@ "io/ioutil" "path/filepath" - "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/internal/repologging" + "github.com/kopia/kopia/repo/block" "github.com/kopia/kopia/repo/manifest" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" @@ -107,7 +107,7 @@ func OpenWithConfig(ctx context.Context, st storage.Storage, lc *LocalConfig, pa fo := repoConfig.FormattingOptions if fo.MaxPackSize == 0 { - fo.MaxPackSize = repoConfig.MaxBlockSize + fo.MaxPackSize = 20 << 20 // 20 MB } log.Debugf("initializing block manager") diff --git a/repo/repository_test.go b/repo/repository_test.go index a37383dd0..025da8700 100644 --- a/repo/repository_test.go +++ b/repo/repository_test.go @@ -11,9 +11,9 @@ "runtime/debug" "testing" + "github.com/kopia/kopia/internal/repotesting" "github.com/kopia/kopia/repo" "github.com/kopia/kopia/repo/block" - "github.com/kopia/kopia/internal/repotesting" "github.com/kopia/kopia/repo/object" "github.com/kopia/kopia/repo/storage" ) @@ -263,8 +263,7 @@ func TestFormats(t *testing.T) { n.BlockFormat.Hash = hash n.BlockFormat.Encryption = encryption n.BlockFormat.HMACSecret = []byte("key") - n.ObjectFormat.MaxBlockSize = 10000 - n.ObjectFormat.Splitter = "FIXED" + n.ObjectFormat.Splitter = "FIXED-1M" } } @@ -274,7 +273,6 @@ func TestFormats(t *testing.T) { }{ { format: func(n *repo.NewRepositoryOptions) { - n.ObjectFormat.MaxBlockSize = 10000 }, oids: map[string]object.ID{ "": "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad",