[breaking change] deprecated DYNAMIC splitter due to license issue

The splitter in question was depending on
github.com/silvasur/buzhash which is not licensed according to FOSSA bot

Switched to new faster implementation of buzhash, which is
unfortunately incompatible and will split the objects in different
places.

This change is semi-breaking - old repositories can be read, but
when uploading large objects they will be re-uploaded where previously
they would be de-duped.

Also added 'benchmark splitters' subcommand and moved 'block cryptobenchmark'
subcommand to 'benchmark crypto'.
This commit is contained in:
Jarek Kowalski
2019-05-30 22:06:50 -07:00
parent 2f0ff90c1a
commit 03339c18af
19 changed files with 338 additions and 225 deletions

View File

@@ -33,6 +33,7 @@
blockCommands = app.Command("block", "Commands to manipulate virtual blocks in repository.").Alias("blk").Hidden()
storageCommands = app.Command("storage", "Commands to manipulate raw storage blocks.").Hidden()
blockIndexCommands = app.Command("blockindex", "Commands to manipulate block index.").Hidden()
benchmarkCommands = app.Command("benchmark", "Commands to test performance of algorithms.").Hidden()
)
func helpFullAction(ctx *kingpin.ParseContext) error {

View File

@@ -11,26 +11,26 @@
)
var (
blockCryptoBenchmarkCommand = blockCommands.Command("cryptobenchmark", "Run hash and encryption benchmarks")
blockCryptoBenchmarkBlockSize = blockCryptoBenchmarkCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes()
blockCryptoBenchmarkEncryption = blockCryptoBenchmarkCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool()
blockCryptoBenchmarkRepeat = blockCryptoBenchmarkCommand.Flag("repeat", "Number of repetitions").Default("100").Int()
benchmarkCryptoCommand = benchmarkCommands.Command("crypto", "Run hash and encryption benchmarks")
benchmarkCryptoBlockSize = benchmarkCryptoCommand.Flag("block-size", "Size of a block to encrypt").Default("1MB").Bytes()
benchmarkCryptoEncryption = benchmarkCryptoCommand.Flag("encryption", "Test encrypted formats").Default("true").Bool()
benchmarkCryptoRepeat = benchmarkCryptoCommand.Flag("repeat", "Number of repetitions").Default("100").Int()
)
type benchResult struct {
hash string
encryption string
throughput float64
}
func runBenchmarkCryptoAction(ctx *kingpin.ParseContext) error {
func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
type benchResult struct {
hash string
encryption string
throughput float64
}
var results []benchResult
data := make([]byte, *blockCryptoBenchmarkBlockSize)
data := make([]byte, *benchmarkCryptoBlockSize)
for _, ha := range block.SupportedHashAlgorithms() {
for _, ea := range block.SupportedEncryptionAlgorithms() {
isEncrypted := ea != "NONE"
if *blockCryptoBenchmarkEncryption != isEncrypted {
if *benchmarkCryptoEncryption != isEncrypted {
continue
}
@@ -44,9 +44,9 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
continue
}
log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *blockCryptoBenchmarkRepeat, len(data))
log.Infof("Benchmarking hash '%v' and encryption '%v'... (%v x %v bytes)", ha, ea, *benchmarkCryptoRepeat, len(data))
t0 := time.Now()
hashCount := *blockCryptoBenchmarkRepeat
hashCount := *benchmarkCryptoRepeat
for i := 0; i < hashCount; i++ {
blockID := h(data)
if _, encerr := e.Encrypt(data, blockID); encerr != nil {
@@ -74,5 +74,5 @@ func runBlockCryptoBenchmarkAction(ctx *kingpin.ParseContext) error {
}
func init() {
blockCryptoBenchmarkCommand.Action(runBlockCryptoBenchmarkAction)
benchmarkCryptoCommand.Action(runBenchmarkCryptoAction)
}

View File

@@ -0,0 +1,110 @@
package cli
import (
"math/rand"
"sort"
"time"
"github.com/kopia/kopia/repo/object"
kingpin "gopkg.in/alecthomas/kingpin.v2"
)
var (
// 'kopia benchmark splitter' - measures throughput and segment-size distribution of all splitters.
benchmarkSplitterCommand = benchmarkCommands.Command("splitter", "Run splitter benchmarks")
// Seed for the deterministic pseudo-random test data (reproducible runs).
benchmarkSplitterRandSeed = benchmarkSplitterCommand.Flag("rand-seed", "Random seed").Default("42").Int64()
// Size of each generated data block fed to every splitter.
benchmarkSplitterBlockSize = benchmarkSplitterCommand.Flag("data-size", "Size of a data to split").Default("32MB").Bytes()
// Number of data blocks to generate and split.
benchmarkSplitterBlockCount = benchmarkSplitterCommand.Flag("block-count", "Number of data blocks to split").Default("16").Int()
)
// runBenchmarkSplitterAction benchmarks every supported object splitter against
// the same deterministic pseudo-random data blocks and prints, per splitter,
// the elapsed time plus min/percentile/max statistics of the produced segment
// lengths. A final summary is printed sorted fastest-first.
func runBenchmarkSplitterAction(ctx *kingpin.ParseContext) error {
	type benchResult struct {
		splitter     string
		duration     time.Duration
		segmentCount int
		min          int
		p10          int
		p25          int
		p50          int
		p75          int
		p90          int
		max          int
	}

	var results []benchResult

	// Generate the data blocks up front so every splitter sees identical input
	// and runs are reproducible for a given --rand-seed.
	var dataBlocks [][]byte

	rnd := rand.New(rand.NewSource(*benchmarkSplitterRandSeed))
	for i := 0; i < *benchmarkSplitterBlockCount; i++ {
		b := make([]byte, *benchmarkSplitterBlockSize)
		rnd.Read(b)
		dataBlocks = append(dataBlocks, b)
	}

	log.Infof("splitting %v blocks of %v each", *benchmarkSplitterBlockCount, *benchmarkSplitterBlockSize)

	for _, sp := range object.SupportedSplitters {
		fact := object.GetSplitterFactory(sp)

		var segmentLengths []int

		t0 := time.Now()
		for _, data := range dataBlocks {
			s := fact()
			l := 0
			for _, d := range data {
				l++
				if s.ShouldSplit(d) {
					segmentLengths = append(segmentLengths, l)
					l = 0
				}
			}
			// Account for the trailing segment that did not end on a split point.
			if l > 0 {
				segmentLengths = append(segmentLengths, l)
			}
		}
		dur := time.Since(t0)

		// Guard against empty input (e.g. --block-count=0 or --data-size=0),
		// which would otherwise panic on the index lookups below.
		if len(segmentLengths) == 0 {
			continue
		}

		sort.Ints(segmentLengths)

		// percentile returns the p-th percentile of the sorted segment lengths.
		percentile := func(p int) int {
			return segmentLengths[len(segmentLengths)*p/100]
		}

		r := benchResult{
			splitter:     sp,
			duration:     dur,
			segmentCount: len(segmentLengths),
			min:          segmentLengths[0],
			p10:          percentile(10),
			p25:          percentile(25),
			p50:          percentile(50),
			p75:          percentile(75),
			p90:          percentile(90),
			max:          segmentLengths[len(segmentLengths)-1],
		}
		printStdout("%-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n",
			r.splitter,
			r.duration.Nanoseconds()/1e6,
			r.segmentCount,
			r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max)
		results = append(results, r)
	}

	// Print summary sorted by speed, fastest first.
	sort.Slice(results, func(i, j int) bool {
		return results[i].duration < results[j].duration
	})
	printStdout("-----------------------------------------------------------------\n")
	for ndx, r := range results {
		printStdout("%3v. %-25v %6v ms count:%v min:%v 10th:%v 25th:%v 50th:%v 75th:%v 90th:%v max:%v\n",
			ndx,
			r.splitter,
			r.duration.Nanoseconds()/1e6,
			r.segmentCount,
			r.min, r.p10, r.p25, r.p50, r.p75, r.p90, r.max)
	}
	return nil
}
func init() {
// Register the action handler for 'kopia benchmark splitter'.
benchmarkSplitterCommand.Action(runBenchmarkSplitterAction)
}

View File

@@ -5,12 +5,11 @@
"strings"
"github.com/kopia/kopia/fs/ignorefs"
"github.com/kopia/kopia/internal/units"
"github.com/kopia/kopia/snapshot/policy"
"github.com/kopia/kopia/repo"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
"github.com/kopia/kopia/snapshot/policy"
"github.com/pkg/errors"
)
@@ -19,11 +18,7 @@
createBlockHashFormat = createCommand.Flag("block-hash", "Block hash algorithm.").PlaceHolder("ALGO").Default(block.DefaultHash).Enum(block.SupportedHashAlgorithms()...)
createBlockEncryptionFormat = createCommand.Flag("encryption", "Block encryption algorithm.").PlaceHolder("ALGO").Default(block.DefaultEncryption).Enum(block.SupportedEncryptionAlgorithms()...)
createObjectSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...)
createMinBlockSize = createCommand.Flag("min-block-size", "Minimum size of a data block.").PlaceHolder("KB").Default("1024").Int()
createAvgBlockSize = createCommand.Flag("avg-block-size", "Average size of a data block.").PlaceHolder("KB").Default("10240").Int()
createMaxBlockSize = createCommand.Flag("max-block-size", "Maximum size of a data block.").PlaceHolder("KB").Default("20480").Int()
createSplitter = createCommand.Flag("object-splitter", "The splitter to use for new objects in the repository").Default("DYNAMIC").Enum(object.SupportedSplitters...)
createOverwrite = createCommand.Flag("overwrite", "Overwrite existing data (DANGEROUS).").Bool()
createOnly = createCommand.Flag("create-only", "Create repository, but don't connect to it.").Short('c').Bool()
@@ -53,10 +48,7 @@ func newRepositoryOptionsFromFlags() *repo.NewRepositoryOptions {
},
ObjectFormat: object.Format{
Splitter: *createObjectSplitter,
MinBlockSize: *createMinBlockSize * 1024,
AvgBlockSize: *createAvgBlockSize * 1024,
MaxBlockSize: *createMaxBlockSize * 1024,
Splitter: *createSplitter,
},
}
}
@@ -88,19 +80,7 @@ func runCreateCommandWithStorage(ctx context.Context, st storage.Storage) error
printStderr("Initializing repository with:\n")
printStderr(" block hash: %v\n", options.BlockFormat.Hash)
printStderr(" encryption: %v\n", options.BlockFormat.Encryption)
switch options.ObjectFormat.Splitter {
case "DYNAMIC":
printStderr(" object splitter: DYNAMIC with block sizes (min:%v avg:%v max:%v)\n",
units.BytesStringBase2(int64(options.ObjectFormat.MinBlockSize)),
units.BytesStringBase2(int64(options.ObjectFormat.AvgBlockSize)),
units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize)))
case "FIXED":
printStderr(" object splitter: FIXED with with block size: %v\n", units.BytesStringBase2(int64(options.ObjectFormat.MaxBlockSize)))
case "NEVER":
printStderr(" object splitter: NEVER\n")
}
printStderr(" splitter: %v\n", options.ObjectFormat.Splitter)
if err := repo.Initialize(ctx, st, options, password); err != nil {
return errors.Wrap(err, "cannot initialize repository")

View File

@@ -35,27 +35,13 @@ func runStatusCommand(ctx context.Context, rep *repo.Repository) error {
}
fmt.Println()
var splitterExtraInfo string
switch rep.Objects.Format.Splitter {
case "DYNAMIC":
splitterExtraInfo = fmt.Sprintf(
" (min: %v; avg: %v; max: %v)",
units.BytesStringBase2(int64(rep.Objects.Format.MinBlockSize)),
units.BytesStringBase2(int64(rep.Objects.Format.AvgBlockSize)),
units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize)))
case "":
case "FIXED":
splitterExtraInfo = fmt.Sprintf(" %v", units.BytesStringBase2(int64(rep.Objects.Format.MaxBlockSize)))
}
fmt.Println()
fmt.Printf("Unique ID: %x\n", rep.UniqueID)
fmt.Println()
fmt.Printf("Block hash: %v\n", rep.Blocks.Format.Hash)
fmt.Printf("Block encryption: %v\n", rep.Blocks.Format.Encryption)
fmt.Printf("Max pack length: %v\n", units.BytesStringBase2(int64(rep.Blocks.Format.MaxPackSize)))
fmt.Printf("Splitter: %v%v\n", rep.Objects.Format.Splitter, splitterExtraInfo)
fmt.Printf("Splitter: %v\n", rep.Objects.Format.Splitter)
return nil
}

3
go.mod
View File

@@ -9,6 +9,7 @@ require (
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect
github.com/bgentry/speakeasy v0.1.0
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40
github.com/chmduquesne/rollinghash v4.0.0+incompatible
github.com/danieljoos/wincred v1.0.2 // indirect
github.com/efarrer/iothrottler v0.0.0-20141121142253-60e7e547c7fe
github.com/go-ini/ini v1.42.0 // indirect
@@ -18,7 +19,6 @@ require (
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7
github.com/pkg/errors v0.8.1
github.com/silvasur/buzhash v0.0.0-20160816060738-9bdec3dec7c6
github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c
github.com/studio-b12/gowebdav v0.0.0-20190103184047-38f79aeaf1ac
github.com/zalando/go-keyring v0.0.0-20190208082241-fbe81aec3a07
@@ -27,5 +27,6 @@ require (
golang.org/x/net v0.0.0-20190328230028-74de082e2cca
golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0
google.golang.org/api v0.5.0
google.golang.org/appengine v1.4.0
gopkg.in/alecthomas/kingpin.v2 v2.2.6
)

3
go.sum
View File

@@ -17,6 +17,8 @@ github.com/bgentry/speakeasy v0.1.0 h1:ByYyxL9InA1OWqxJqqp2A5pYHUrCiAL6K3J+LKSsQ
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40 h1:y4B3+GPxKlrigF1ha5FFErxK+sr6sWxQovRMzwMhejo=
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c=
github.com/chmduquesne/rollinghash v4.0.0+incompatible h1:hnREQO+DXjqIw3rUTzWN7/+Dpw+N5Um8zpKV0JOEgbo=
github.com/chmduquesne/rollinghash v4.0.0+incompatible/go.mod h1:Uc2I36RRfTAf7Dge82bi3RU0OQUmXT9iweIcPqvr8A0=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/danieljoos/wincred v1.0.1 h1:fcRTaj17zzROVqni2FiToKUVg3MmJ4NtMSGCySPIr/g=
github.com/danieljoos/wincred v1.0.1/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U=
@@ -154,6 +156,7 @@ google.golang.org/api v0.0.0-20181229000844-f26a60c56f14/go.mod h1:4mhQ8q/RsB7i+
google.golang.org/api v0.5.0 h1:lj9SyhMzyoa38fgFF0oO2T6pjs5IzkLPKfVtxpyCRMM=
google.golang.org/api v0.5.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=

View File

@@ -48,8 +48,7 @@ func (e *Environment) Setup(t *testing.T, opts ...func(*repo.NewRepositoryOption
Encryption: "NONE",
},
ObjectFormat: object.Format{
Splitter: "FIXED",
MaxBlockSize: 400,
Splitter: "FIXED-1M",
},
}

View File

@@ -84,13 +84,10 @@ func repositoryObjectFormatFromOptions(opt *NewRepositoryOptions) *repositoryObj
Encryption: applyDefaultString(opt.BlockFormat.Encryption, block.DefaultEncryption),
HMACSecret: applyDefaultRandomBytes(opt.BlockFormat.HMACSecret, 32),
MasterKey: applyDefaultRandomBytes(opt.BlockFormat.MasterKey, 32),
MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20)), // 20 MB
MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, 20<<20), // 20 MB
},
Format: object.Format{
Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter),
MaxBlockSize: applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20), // 20MiB
MinBlockSize: applyDefaultInt(opt.ObjectFormat.MinBlockSize, 10<<20), // 10MiB
AvgBlockSize: applyDefaultInt(opt.ObjectFormat.AvgBlockSize, 16<<20), // 16MiB
Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter),
},
}

View File

@@ -28,10 +28,7 @@ type blockManager interface {
// Format describes the format of objects in a repository.
type Format struct {
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks
MinBlockSize int `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter
AvgBlockSize int `json:"avgBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter)
MaxBlockSize int `json:"maxBlockSize,omitempty"` // maximum size of storage block
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks
}
// Manager implements a content-addressable storage on top of blob storage.
@@ -41,7 +38,7 @@ type Manager struct {
blockMgr blockManager
trace func(message string, args ...interface{})
newSplitter func() objectSplitter
newSplitter func() Splitter
}
// NewWriter creates an ObjectWriter for writing to the repository.
@@ -166,14 +163,12 @@ func NewObjectManager(ctx context.Context, bm blockManager, f Format, opts Manag
splitterID = "FIXED"
}
os := splitterFactories[splitterID]
os := GetSplitterFactory(splitterID)
if os == nil {
return nil, fmt.Errorf("unsupported splitter %q", f.Splitter)
}
om.newSplitter = func() objectSplitter {
return os(&f)
}
om.newSplitter = os
if opts.Trace != nil {
om.trace = opts.Trace

View File

@@ -68,8 +68,7 @@ func setupTest(t *testing.T) (map[string][]byte, *Manager) {
func setupTestWithData(t *testing.T, data map[string][]byte, opts ManagerOptions) (map[string][]byte, *Manager) {
r, err := NewObjectManager(context.Background(), &fakeBlockManager{data: data}, Format{
MaxBlockSize: 400,
Splitter: "FIXED",
Splitter: "FIXED-1M",
}, opts)
if err != nil {
t.Fatalf("can't create object manager: %v", err)
@@ -159,17 +158,22 @@ func verifyIndirectBlock(ctx context.Context, t *testing.T, r *Manager, oid ID)
func TestIndirection(t *testing.T) {
ctx := context.Background()
splitterFactory := newFixedSplitterFactory(1000)
cases := []struct {
dataLength int
expectedBlockCount int
expectedIndirection int
}{
{dataLength: 200, expectedBlockCount: 1, expectedIndirection: 0},
{dataLength: 1400, expectedBlockCount: 3, expectedIndirection: 1},
{dataLength: 2000, expectedBlockCount: 4, expectedIndirection: 2},
{dataLength: 3000, expectedBlockCount: 5, expectedIndirection: 2},
{dataLength: 4000, expectedBlockCount: 5, expectedIndirection: 2},
{dataLength: 10000, expectedBlockCount: 10, expectedIndirection: 3},
{dataLength: 1000, expectedBlockCount: 1, expectedIndirection: 0},
{dataLength: 1001, expectedBlockCount: 3, expectedIndirection: 1},
// 1 block of 1000 zeros, 1 block of 5 zeros + 1 index block
{dataLength: 3005, expectedBlockCount: 3, expectedIndirection: 1},
// 1 block of 1000 zeros + 1 index block
{dataLength: 4000, expectedBlockCount: 2, expectedIndirection: 1},
// 1 block of 1000 zeros + 1 index block
{dataLength: 10000, expectedBlockCount: 2, expectedIndirection: 1},
}
for _, c := range cases {
@@ -178,6 +182,7 @@ func TestIndirection(t *testing.T) {
contentBytes := make([]byte, c.dataLength)
writer := om.NewWriter(ctx, WriterOptions{})
writer.(*objectWriter).splitter = splitterFactory()
if _, err := writer.Write(contentBytes); err != nil {
t.Errorf("write error: %v", err)
}
@@ -186,6 +191,8 @@ func TestIndirection(t *testing.T) {
t.Errorf("error getting writer results: %v", err)
}
t.Logf("len %v got %v", len(contentBytes), result)
if indirectionLevel(result) != c.expectedIndirection {
t.Errorf("incorrect indirection level for size: %v: %v, expected %v", c.dataLength, indirectionLevel(result), c.expectedIndirection)
}

View File

@@ -1,33 +1,57 @@
package object
import (
"math"
"sort"
"github.com/silvasur/buzhash"
)
type objectSplitter interface {
add(b byte) bool
const (
splitterSlidingWindowSize = 64
)
// Splitter determines when to split a given object.
// It must return true if the object should be split after byte b is processed.
type Splitter interface {
ShouldSplit(b byte) bool
}
// SupportedSplitters is a list of supported object splitters including:
//
// NEVER - prevents objects from ever splitting
// FIXED - always splits large objects exactly at the maximum block size boundary
// DYNAMIC - dynamically splits large objects based on rolling hash of contents.
// SupportedSplitters is a list of supported object splitters.
var SupportedSplitters []string
var splitterFactories = map[string]func(*Format) objectSplitter{
"NEVER": func(f *Format) objectSplitter {
return newNeverSplitter()
},
"FIXED": func(f *Format) objectSplitter {
return newFixedSplitter(f.MaxBlockSize)
},
"DYNAMIC": func(f *Format) objectSplitter {
return newRollingHashSplitter(buzhash.NewBuzHash(32), f.MinBlockSize, f.AvgBlockSize, f.MaxBlockSize)
},
// SplitterFactory creates instances of Splitter
type SplitterFactory func() Splitter
// splitterFactories is a map of registered splitter factories.
var splitterFactories = map[string]SplitterFactory{
"FIXED-1M": newFixedSplitterFactory(megabytes(1)),
"FIXED-2M": newFixedSplitterFactory(megabytes(2)),
"FIXED-4M": newFixedSplitterFactory(megabytes(4)),
"FIXED-8M": newFixedSplitterFactory(megabytes(8)),
"DYNAMIC-1M-BUZHASH": newBuzHash32SplitterFactory(megabytes(1)),
"DYNAMIC-2M-BUZHASH": newBuzHash32SplitterFactory(megabytes(2)),
"DYNAMIC-4M-BUZHASH": newBuzHash32SplitterFactory(megabytes(4)),
"DYNAMIC-8M-BUZHASH": newBuzHash32SplitterFactory(megabytes(8)),
"DYNAMIC-1M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(1)),
"DYNAMIC-2M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(2)),
"DYNAMIC-4M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(4)),
"DYNAMIC-8M-RABINKARP": newRabinKarp64SplitterFactory(megabytes(8)),
// handle deprecated legacy names to splitters of arbitrary size
"FIXED": newFixedSplitterFactory(4 << 20),
// we don't want to use old DYNAMIC splitter because of its licence, so
// map this one to arbitrary buzhash32 (different)
"DYNAMIC": newBuzHash32SplitterFactory(megabytes(4)),
}
func megabytes(mb int) int {
return mb << 20
}
// GetSplitterFactory gets splitter factory with a specified name or nil if not found.
func GetSplitterFactory(name string) SplitterFactory {
return splitterFactories[name]
}
func init() {
@@ -38,73 +62,4 @@ func init() {
}
// DefaultSplitter is the name of the splitter used by default for new repositories.
const DefaultSplitter = "DYNAMIC"
type neverSplitter struct{}
func (s *neverSplitter) add(b byte) bool {
return false
}
func newNeverSplitter() objectSplitter {
return &neverSplitter{}
}
type fixedSplitter struct {
cur int
chunkLength int
}
func (s *fixedSplitter) add(b byte) bool {
s.cur++
if s.cur >= s.chunkLength {
s.cur = 0
return true
}
return false
}
func newFixedSplitter(chunkLength int) objectSplitter {
return &fixedSplitter{chunkLength: chunkLength}
}
type rollingHash interface {
HashByte(b byte) uint32
}
type rollingHashSplitter struct {
rh rollingHash
mask uint32
currentBlockSize int
minBlockSize int
maxBlockSize int
}
func (rs *rollingHashSplitter) add(b byte) bool {
sum := rs.rh.HashByte(b)
rs.currentBlockSize++
if rs.currentBlockSize >= rs.maxBlockSize {
rs.currentBlockSize = 0
return true
}
if sum&rs.mask == 0 && rs.currentBlockSize > rs.minBlockSize && sum != 0 {
//log.Printf("splitting %v on sum %x mask %x", rs.currentBlockSize, sum, rs.mask)
rs.currentBlockSize = 0
return true
}
return false
}
func newRollingHashSplitter(rh rollingHash, minBlockSize int, approxBlockSize int, maxBlockSize int) objectSplitter {
bits := rollingHashBits(approxBlockSize)
mask := ^(^uint32(0) << bits)
return &rollingHashSplitter{rh, mask, 0, minBlockSize, maxBlockSize}
}
func rollingHashBits(n int) uint {
e := math.Log2(float64(n))
exp := math.Floor(e + 0.5)
return uint(exp)
}
const DefaultSplitter = "DYNAMIC-4M-BUZHASH"

View File

@@ -4,17 +4,15 @@
"math"
"math/rand"
"testing"
"github.com/silvasur/buzhash"
)
func TestSplitters(t *testing.T) {
cases := []struct {
desc string
newSplitter func() objectSplitter
newSplitter func() Splitter
}{
{"rolling buzhash with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }},
{"rolling buzhash with 5 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }},
// {"rolling buzhash with 3 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }},
// {"rolling buzhash with 5 bits", func() Splitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }},
}
for _, tc := range cases {
@@ -25,8 +23,8 @@ func TestSplitters(t *testing.T) {
rand.Read(rnd)
for i, p := range rnd {
if got, want := s1.add(p), s2.add(p); got != want {
t.Errorf("incorrect add() result for %v at offset %v", tc.desc, i)
if got, want := s1.ShouldSplit(p), s2.ShouldSplit(p); got != want {
t.Errorf("incorrect ShouldSplit() result for %v at offset %v", tc.desc, i)
}
}
}
@@ -40,29 +38,29 @@ func TestSplitterStability(t *testing.T) {
}
cases := []struct {
splitter objectSplitter
splitter Splitter
count int
avg int
minSplit int
maxSplit int
}{
{newFixedSplitter(1000), 5000, 1000, 1000, 1000},
{newFixedSplitter(10000), 500, 10000, 10000, 10000},
// {newFixedSplitter(1000), 5000, 1000, 1000, 1000},
// {newFixedSplitter(10000), 500, 10000, 10000, 10000},
{newNeverSplitter(), 0, 0, math.MaxInt32, 0},
// {newNeverSplitter(), 0, 0, math.MaxInt32, 0},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449},
// min and max
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000},
{newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000},
// // min and max
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 64},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000},
// {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000},
}
for _, tc := range cases {
@@ -73,7 +71,7 @@ func TestSplitterStability(t *testing.T) {
minSplit := int(math.MaxInt32)
count := 0
for i, p := range rnd {
if s.add(p) {
if s.ShouldSplit(p) {
l := i - lastSplit
if l >= maxSplit {
maxSplit = l
@@ -106,29 +104,3 @@ func TestSplitterStability(t *testing.T) {
}
}
}
func TestRollingHashBits(t *testing.T) {
cases := []struct {
blockSize int
bits uint
}{
{256, 8},
{128, 7},
{100, 7},
{500, 9},
{700, 9},
{724, 9},
{725, 10},
{768, 10},
{1000, 10},
{1000000, 20},
{10000000, 23},
{20000000, 24},
}
for _, tc := range cases {
if got, want := rollingHashBits(tc.blockSize), tc.bits; got != want {
t.Errorf("rollingHashBits(%v) = %v, wanted %v", tc.blockSize, got, want)
}
}
}

View File

@@ -58,7 +58,7 @@ type objectWriter struct {
description string
splitter objectSplitter
splitter Splitter
}
func (w *objectWriter) Close() error {
@@ -72,7 +72,7 @@ func (w *objectWriter) Write(data []byte) (n int, err error) {
for _, d := range data {
w.buffer.WriteByte(d)
if w.splitter.add(d) {
if w.splitter.ShouldSplit(d) {
if err := w.flushBuffer(); err != nil {
return 0, err
}

View File

@@ -0,0 +1,47 @@
package object
import (
"github.com/chmduquesne/rollinghash/buzhash32"
)
// buzhash32Splitter is a content-defined Splitter backed by a 32-bit buzhash
// rolling checksum: a segment ends when the masked checksum is zero (past a
// minimum size) or when the segment hits the hard maximum size.
type buzhash32Splitter struct {
	// We intentionally hold the concrete *buzhash32.Buzhash32 rather than the
	// rollinghash.Hash32 interface: in this tight per-byte loop the interface
	// call can't be inlined by the compiler and is ~40% slower.
	rh      *buzhash32.Buzhash32
	mask    uint32 // low checksum bits that must all be zero to split
	count   int    // bytes accumulated in the current segment
	minSize int    // never split a segment shorter than this
	maxSize int    // always split once a segment reaches this length
}

// ShouldSplit rolls byte b into the checksum and reports whether the current
// segment should end immediately after b.
func (rs *buzhash32Splitter) ShouldSplit(b byte) bool {
	rs.rh.Roll(b)
	rs.count++

	onHashBoundary := rs.rh.Sum32()&rs.mask == 0 && rs.count >= rs.minSize
	if onHashBoundary || rs.count >= rs.maxSize {
		rs.count = 0
		return true
	}

	return false
}

// newBuzHash32SplitterFactory returns a SplitterFactory producing buzhash-based
// splitters with the given average segment size. avgSize must be a power of
// two — the split mask is simply avgSize-1 — and the minimum/maximum segment
// sizes are derived as avgSize/2 and avgSize*2.
func newBuzHash32SplitterFactory(avgSize int) SplitterFactory {
	var (
		mask    = uint32(avgSize - 1)
		minSize = avgSize / 2
		maxSize = avgSize * 2
	)

	return func() Splitter {
		h := buzhash32.New()
		// Pre-fill the sliding window with zeros so the hash state is
		// well-defined from the very first data byte.
		h.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck
		return &buzhash32Splitter{rh: h, mask: mask, count: 0, minSize: minSize, maxSize: maxSize}
	}
}

View File

@@ -0,0 +1,22 @@
package object
// fixedSplitter is a Splitter that ends a segment after exactly chunkLength
// bytes, regardless of content.
type fixedSplitter struct {
	cur         int // bytes seen so far in the current segment
	chunkLength int // fixed segment size
}

// ShouldSplit reports whether the current segment should end after byte b;
// it returns true on every chunkLength-th byte.
func (s *fixedSplitter) ShouldSplit(b byte) bool {
	s.cur++
	if s.cur < s.chunkLength {
		return false
	}
	s.cur = 0
	return true
}

// newFixedSplitterFactory returns a SplitterFactory producing splitters that
// cut segments at the given fixed length.
func newFixedSplitterFactory(length int) SplitterFactory {
	return func() Splitter {
		return &fixedSplitter{chunkLength: length}
	}
}

View File

@@ -0,0 +1,40 @@
package object
import "github.com/chmduquesne/rollinghash/rabinkarp64"
// rabinKarp64Splitter is a content-defined Splitter backed by a 64-bit
// Rabin-Karp rolling checksum: a segment ends when the masked checksum is
// zero (past a minimum size) or when the hard maximum size is reached.
type rabinKarp64Splitter struct {
// we're intentionally not using rollinghash.Hash64 interface because doing this in a tight loop
// is 40% slower because compiler can't inline the call.
rh *rabinkarp64.RabinKarp64
mask uint64 // low checksum bits that must all be zero to split
count int // bytes accumulated in the current segment
minSize int // never split a segment shorter than this
maxSize int // always split once a segment reaches this length
}
// ShouldSplit rolls byte b into the checksum and reports whether the current
// segment should end immediately after b.
func (rs *rabinKarp64Splitter) ShouldSplit(b byte) bool {
rs.rh.Roll(b)
rs.count++
// Content-defined boundary: masked checksum is zero and the segment is
// at least minSize bytes long.
if rs.rh.Sum64()&rs.mask == 0 && rs.count >= rs.minSize {
rs.count = 0
return true
}
// Hard cap: force a split once the segment reaches maxSize.
if rs.count >= rs.maxSize {
rs.count = 0
return true
}
return false
}
// newRabinKarp64SplitterFactory returns a SplitterFactory producing Rabin-Karp
// based splitters with the given average segment size. avgSize must be a power
// of two (the mask is avgSize-1); min/max segment sizes are avgSize/2 and avgSize*2.
func newRabinKarp64SplitterFactory(avgSize int) SplitterFactory {
mask := uint64(avgSize - 1)
maxSize := avgSize * 2
minSize := avgSize / 2
return func() Splitter {
s := rabinkarp64.New()
// Pre-fill the sliding window with zeros so the hash state is
// well-defined from the very first data byte.
s.Write(make([]byte, splitterSlidingWindowSize)) //nolint:errcheck
return &rabinKarp64Splitter{s, mask, 0, minSize, maxSize}
}
}

View File

@@ -7,8 +7,8 @@
"io/ioutil"
"path/filepath"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/internal/repologging"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/repo/manifest"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
@@ -107,7 +107,7 @@ func OpenWithConfig(ctx context.Context, st storage.Storage, lc *LocalConfig, pa
fo := repoConfig.FormattingOptions
if fo.MaxPackSize == 0 {
fo.MaxPackSize = repoConfig.MaxBlockSize
fo.MaxPackSize = 20 << 20 // 20 MB
}
log.Debugf("initializing block manager")

View File

@@ -11,9 +11,9 @@
"runtime/debug"
"testing"
"github.com/kopia/kopia/internal/repotesting"
"github.com/kopia/kopia/repo"
"github.com/kopia/kopia/repo/block"
"github.com/kopia/kopia/internal/repotesting"
"github.com/kopia/kopia/repo/object"
"github.com/kopia/kopia/repo/storage"
)
@@ -263,8 +263,7 @@ func TestFormats(t *testing.T) {
n.BlockFormat.Hash = hash
n.BlockFormat.Encryption = encryption
n.BlockFormat.HMACSecret = []byte("key")
n.ObjectFormat.MaxBlockSize = 10000
n.ObjectFormat.Splitter = "FIXED"
n.ObjectFormat.Splitter = "FIXED-1M"
}
}
@@ -274,7 +273,6 @@ func TestFormats(t *testing.T) {
}{
{
format: func(n *repo.NewRepositoryOptions) {
n.ObjectFormat.MaxBlockSize = 10000
},
oids: map[string]object.ID{
"": "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad",