From 70c7af092cb12c4ca6be21c975a1fb9f0cb90e28 Mon Sep 17 00:00:00 2001 From: Jarek Kowalski Date: Fri, 14 Jul 2017 11:54:30 +0200 Subject: [PATCH] added configurable splitters to repo.Format --- repo/format.go | 7 +++++-- repo/object_splitter.go | 26 +++++++++++++++++++++----- repo/object_splitter_test.go | 31 +++++++++++++++++++------------ repo/repository.go | 17 ++++++++++++++--- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/repo/format.go b/repo/format.go index 8534b0211..6824de211 100644 --- a/repo/format.go +++ b/repo/format.go @@ -18,9 +18,12 @@ type Format struct { ObjectFormat string `json:"objectFormat,omitempty"` // identifier of object format Secret []byte `json:"secret,omitempty"` // HMAC secret used to generate encryption keys MaxInlineContentLength int32 `json:"maxInlineContentLength,omitempty"` // maximum size of object to be considered for inline storage within ObjectID - MaxBlockSize int32 `json:"maxBlockSize,omitempty"` // maximum size of storage block MasterKey []byte `json:"masterKey,omitempty"` // master encryption key (SIV-mode encryption only) - ApproxBlockSize int32 `json:"approxBlockSize,omitempty"` // approximate size of storage block (used with rolling hash) + Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks + + MinBlockSize int32 `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter + ApproxBlockSize int32 `json:"approxBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter) + MaxBlockSize int32 `json:"maxBlockSize,omitempty"` // maximum size of storage block } // Validate checks the validity of a Format and returns an error if invalid. diff --git a/repo/object_splitter.go b/repo/object_splitter.go index 779d27cd5..f0ed1db32 100644 --- a/repo/object_splitter.go +++ b/repo/object_splitter.go @@ -44,17 +44,33 @@ type rollingHashSplitter struct { rh rollinghash.Hash32 mask uint32 allOnes uint32 + + currentBlockSize int32 + minBlockSize int32 + maxBlockSize int32 } func (rs *rollingHashSplitter) add(b byte) bool { rs.rh.Roll(b) - return rs.rh.Sum32()&rs.mask == rs.allOnes + rs.currentBlockSize++ + if rs.currentBlockSize < rs.minBlockSize { + return false + } + if rs.currentBlockSize >= rs.maxBlockSize { + rs.currentBlockSize = 0 + return true + } + if rs.rh.Sum32()&rs.mask == rs.allOnes { + rs.currentBlockSize = 0 + return true + } + return false } -func newRollingHashSplitter(rh rollinghash.Hash32, approxBlockSize int32) objectSplitter { +func newRollingHashSplitter(rh rollinghash.Hash32, minBlockSize int32, approxBlockSize int32, maxBlockSize int32) objectSplitter { bits := rollingHashBits(approxBlockSize) mask := ^(^uint32(0) << bits) - return &rollingHashSplitter{rh, mask, (uint32(0)) ^ mask} + return &rollingHashSplitter{rh, mask, (uint32(0)) ^ mask, 0, minBlockSize, maxBlockSize} } func rollingHashBits(n int32) uint { @@ -71,7 +87,7 @@ func rollingHashBits(n int32) uint { "FIXED": func(f *Format) objectSplitter { return newFixedSplitter(int(f.MaxBlockSize)) }, - "ROLLING": func(f *Format) objectSplitter { - return newRollingHashSplitter(buzhash32.New(), f.MaxBlockSize) + "DYNAMIC": func(f *Format) objectSplitter { + return newRollingHashSplitter(buzhash32.New(), f.MinBlockSize, f.ApproxBlockSize, f.MaxBlockSize) }, } diff --git a/repo/object_splitter_test.go b/repo/object_splitter_test.go index 63b6b5c20..441311b88 100644 --- a/repo/object_splitter_test.go +++ b/repo/object_splitter_test.go @@ -15,8 +15,8 @@ func TestSplitters(t *testing.T) { desc string newSplitter func() objectSplitter }{ - {"rolling buzhash32 with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash32.New(), 3) }}, - {"rolling adler32 with 5 bits", func() objectSplitter { return newRollingHashSplitter(adler32.New(), 5) }}, + {"rolling buzhash32 with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash32.New(), 0, 8, 20) }}, + {"rolling adler32 with 5 bits", func() objectSplitter { return newRollingHashSplitter(adler32.New(), 0, 32, 100) }}, } for _, tc := range cases { @@ -53,17 +53,24 @@ func TestSplitterStability(t *testing.T) { {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, - {newRollingHashSplitter(buzhash32.New(), 32), 156283, 31, 1, 427}, - {newRollingHashSplitter(buzhash32.New(), 1024), 4794, 1042, 1, 10001}, - {newRollingHashSplitter(buzhash32.New(), 2048), 2404, 2079, 1, 19312}, - {newRollingHashSplitter(buzhash32.New(), 32768), 143, 34965, 1, 233567}, - {newRollingHashSplitter(buzhash32.New(), 65536), 72, 69444, 1, 430586}, + {newRollingHashSplitter(buzhash32.New(), 0, 32, math.MaxInt32), 156283, 31, 1, 427}, + {newRollingHashSplitter(buzhash32.New(), 0, 1024, math.MaxInt32), 4794, 1042, 1, 10001}, + {newRollingHashSplitter(buzhash32.New(), 0, 2048, math.MaxInt32), 2404, 2079, 1, 19312}, + {newRollingHashSplitter(buzhash32.New(), 0, 32768, math.MaxInt32), 143, 34965, 1, 233567}, + {newRollingHashSplitter(buzhash32.New(), 0, 65536, math.MaxInt32), 72, 69444, 1, 430586}, - {newRollingHashSplitter(rabinkarp32.New(), 32), 156303, 31, 1, 425}, - {newRollingHashSplitter(rabinkarp32.New(), 1024), 4985, 1003, 1, 9572}, - {newRollingHashSplitter(rabinkarp32.New(), 2048), 2497, 2002, 1, 15173}, - {newRollingHashSplitter(rabinkarp32.New(), 32768), 151, 33112, 790, 164382}, - {newRollingHashSplitter(rabinkarp32.New(), 65536), 76, 65789, 1124, 295680}, + {newRollingHashSplitter(rabinkarp32.New(), 0, 32, math.MaxInt32), 156303, 31, 1, 425}, + {newRollingHashSplitter(rabinkarp32.New(), 0, 1024, math.MaxInt32), 4985, 1003, 1, 9572}, + {newRollingHashSplitter(rabinkarp32.New(), 0, 2048, math.MaxInt32), 2497, 2002, 1, 15173}, + {newRollingHashSplitter(rabinkarp32.New(), 0, 32768, math.MaxInt32), 151, 33112, 790, 164382}, + {newRollingHashSplitter(rabinkarp32.New(), 0, 65536, math.MaxInt32), 76, 65789, 1124, 295680}, + + // min and max + {newRollingHashSplitter(buzhash32.New(), 0, 32, 64), 179920, 27, 1, 64}, + {newRollingHashSplitter(buzhash32.New(), 0, 1024, 10000), 4795, 1042, 1, 10000}, + {newRollingHashSplitter(buzhash32.New(), 0, 2048, 10000), 2432, 2055, 1, 10000}, + {newRollingHashSplitter(buzhash32.New(), 500, 32768, 100000), 147, 34013, 762, 100000}, + {newRollingHashSplitter(buzhash32.New(), 500, 65536, 100000), 90, 55555, 762, 100000}, } for _, tc := range cases { diff --git a/repo/repository.go b/repo/repository.go index dc0d76a99..2124a237f 100644 --- a/repo/repository.go +++ b/repo/repository.go @@ -186,9 +186,20 @@ func New(s blob.Storage, f *Format, options ...RepositoryOption) (*Repository, e r := &Repository{ Storage: s, format: *f, - newSplitter: func() objectSplitter { - return newFixedSplitter(int(f.MaxBlockSize)) - }, + } + + sp := f.Splitter + if sp == "" { + sp = "FIXED" + } + + os := SupportedSplitters[sp] + if os == nil { + return nil, fmt.Errorf("unsupported splitter %q", sp) + } + + r.newSplitter = func() objectSplitter { + return os(f) } var err error