added configurable splitters to repo.Format

This commit is contained in:
Jarek Kowalski
2017-07-14 11:54:30 +02:00
parent e411b5cfb4
commit 70c7af092c
4 changed files with 59 additions and 22 deletions

View File

@@ -18,9 +18,12 @@ type Format struct {
ObjectFormat string `json:"objectFormat,omitempty"` // identifier of object format
Secret []byte `json:"secret,omitempty"` // HMAC secret used to generate encryption keys
MaxInlineContentLength int32 `json:"maxInlineContentLength,omitempty"` // maximum size of object to be considered for inline storage within ObjectID
MaxBlockSize int32 `json:"maxBlockSize,omitempty"` // maximum size of storage block
MasterKey []byte `json:"masterKey,omitempty"` // master encryption key (SIV-mode encryption only)
ApproxBlockSize int32 `json:"approxBlockSize,omitempty"` // approximate size of storage block (used with rolling hash)
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks
MinBlockSize int32 `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter
ApproxBlockSize int32 `json:"approxBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter)
MaxBlockSize int32 `json:"maxBlockSize,omitempty"` // maximum size of storage block
}
// Validate checks the validity of a Format and returns an error if invalid.

View File

@@ -44,17 +44,33 @@ type rollingHashSplitter struct {
rh rollinghash.Hash32
mask uint32
allOnes uint32
currentBlockSize int32
minBlockSize int32
maxBlockSize int32
}
// add rolls byte b into rs.rh and reports whether the splitter wants a block
// boundary at this position (true) or not (false).
//
// NOTE(review): this listing looks like a rendered diff with the +/- markers
// stripped. The bare `return` right after Roll appears to be the pre-change
// body (split purely on the masked hash), and the statements after it appear
// to be the post-change body (hash split bounded by min/max block size). Read
// literally, everything after that first return is unreachable — confirm
// against the real file before relying on either reading.
func (rs *rollingHashSplitter) add(b byte) bool {
rs.rh.Roll(b)
// Split when the masked bits of the current hash equal allOnes.
return rs.rh.Sum32()&rs.mask == rs.allOnes
// Count this byte towards the block currently being accumulated.
rs.currentBlockSize++
// Below the configured minimum: never split, regardless of the hash.
if rs.currentBlockSize < rs.minBlockSize {
return false
}
// At or above the configured maximum: force a split and restart the count.
if rs.currentBlockSize >= rs.maxBlockSize {
rs.currentBlockSize = 0
return true
}
// Between min and max: split only on a hash match, restarting the count.
if rs.rh.Sum32()&rs.mask == rs.allOnes {
rs.currentBlockSize = 0
return true
}
return false
}
// newRollingHashSplitter returns a *rollingHashSplitter (as an objectSplitter)
// that wraps rh and uses a bit mask sized from approxBlockSize.
//
// NOTE(review): two signatures and two return statements are shown back to
// back; this looks like a rendered diff with the +/- markers stripped, where
// the two-argument form is the pre-change version and the four-argument form
// (adding minBlockSize and maxBlockSize) is the post-change version — confirm
// against the real file.
func newRollingHashSplitter(rh rollinghash.Hash32, approxBlockSize int32) objectSplitter {
func newRollingHashSplitter(rh rollinghash.Hash32, minBlockSize int32, approxBlockSize int32, maxBlockSize int32) objectSplitter {
// Number of hash bits to test, as computed by rollingHashBits from approxBlockSize.
bits := rollingHashBits(approxBlockSize)
// mask has the low `bits` bits set and all higher bits clear.
mask := ^(^uint32(0) << bits)
// Positional fields: rh, mask, allOnes (0 ^ mask is just mask).
return &rollingHashSplitter{rh, mask, (uint32(0)) ^ mask}
// Positional fields: rh, mask, allOnes (0 ^ mask is just mask),
// currentBlockSize (starts at 0), minBlockSize, maxBlockSize.
return &rollingHashSplitter{rh, mask, (uint32(0)) ^ mask, 0, minBlockSize, maxBlockSize}
}
func rollingHashBits(n int32) uint {
@@ -71,7 +87,7 @@ func rollingHashBits(n int32) uint {
"FIXED": func(f *Format) objectSplitter {
return newFixedSplitter(int(f.MaxBlockSize))
},
"ROLLING": func(f *Format) objectSplitter {
return newRollingHashSplitter(buzhash32.New(), f.MaxBlockSize)
"DYNAMIC": func(f *Format) objectSplitter {
return newRollingHashSplitter(buzhash32.New(), f.MinBlockSize, f.ApproxBlockSize, f.MaxBlockSize)
},
}

View File

@@ -15,8 +15,8 @@ func TestSplitters(t *testing.T) {
desc string
newSplitter func() objectSplitter
}{
{"rolling buzhash32 with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash32.New(), 3) }},
{"rolling adler32 with 5 bits", func() objectSplitter { return newRollingHashSplitter(adler32.New(), 5) }},
{"rolling buzhash32 with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash32.New(), 0, 8, 20) }},
{"rolling adler32 with 5 bits", func() objectSplitter { return newRollingHashSplitter(adler32.New(), 0, 32, 100) }},
}
for _, tc := range cases {
@@ -53,17 +53,24 @@ func TestSplitterStability(t *testing.T) {
{newNeverSplitter(), 0, 0, math.MaxInt32, 0},
{newRollingHashSplitter(buzhash32.New(), 32), 156283, 31, 1, 427},
{newRollingHashSplitter(buzhash32.New(), 1024), 4794, 1042, 1, 10001},
{newRollingHashSplitter(buzhash32.New(), 2048), 2404, 2079, 1, 19312},
{newRollingHashSplitter(buzhash32.New(), 32768), 143, 34965, 1, 233567},
{newRollingHashSplitter(buzhash32.New(), 65536), 72, 69444, 1, 430586},
{newRollingHashSplitter(buzhash32.New(), 0, 32, math.MaxInt32), 156283, 31, 1, 427},
{newRollingHashSplitter(buzhash32.New(), 0, 1024, math.MaxInt32), 4794, 1042, 1, 10001},
{newRollingHashSplitter(buzhash32.New(), 0, 2048, math.MaxInt32), 2404, 2079, 1, 19312},
{newRollingHashSplitter(buzhash32.New(), 0, 32768, math.MaxInt32), 143, 34965, 1, 233567},
{newRollingHashSplitter(buzhash32.New(), 0, 65536, math.MaxInt32), 72, 69444, 1, 430586},
{newRollingHashSplitter(rabinkarp32.New(), 32), 156303, 31, 1, 425},
{newRollingHashSplitter(rabinkarp32.New(), 1024), 4985, 1003, 1, 9572},
{newRollingHashSplitter(rabinkarp32.New(), 2048), 2497, 2002, 1, 15173},
{newRollingHashSplitter(rabinkarp32.New(), 32768), 151, 33112, 790, 164382},
{newRollingHashSplitter(rabinkarp32.New(), 65536), 76, 65789, 1124, 295680},
{newRollingHashSplitter(rabinkarp32.New(), 0, 32, math.MaxInt32), 156303, 31, 1, 425},
{newRollingHashSplitter(rabinkarp32.New(), 0, 1024, math.MaxInt32), 4985, 1003, 1, 9572},
{newRollingHashSplitter(rabinkarp32.New(), 0, 2048, math.MaxInt32), 2497, 2002, 1, 15173},
{newRollingHashSplitter(rabinkarp32.New(), 0, 32768, math.MaxInt32), 151, 33112, 790, 164382},
{newRollingHashSplitter(rabinkarp32.New(), 0, 65536, math.MaxInt32), 76, 65789, 1124, 295680},
// min and max
{newRollingHashSplitter(buzhash32.New(), 0, 32, 64), 179920, 27, 1, 64},
{newRollingHashSplitter(buzhash32.New(), 0, 1024, 10000), 4795, 1042, 1, 10000},
{newRollingHashSplitter(buzhash32.New(), 0, 2048, 10000), 2432, 2055, 1, 10000},
{newRollingHashSplitter(buzhash32.New(), 500, 32768, 100000), 147, 34013, 762, 100000},
{newRollingHashSplitter(buzhash32.New(), 500, 65536, 100000), 90, 55555, 762, 100000},
}
for _, tc := range cases {

View File

@@ -186,9 +186,20 @@ func New(s blob.Storage, f *Format, options ...RepositoryOption) (*Repository, e
r := &Repository{
Storage: s,
format: *f,
newSplitter: func() objectSplitter {
return newFixedSplitter(int(f.MaxBlockSize))
},
}
sp := f.Splitter
if sp == "" {
sp = "FIXED"
}
os := SupportedSplitters[sp]
if os == nil {
return nil, fmt.Errorf("unsupported splitter %q", sp)
}
r.newSplitter = func() objectSplitter {
return os(f)
}
var err error