refactor: extracted complete blob set functions to separate package (#1175)

* refactor: extracted complete blob set functions to separate package

* completeset: added more functions

Also treat malformed blob IDs as their own sets for backwards compat.
This commit is contained in:
Jarek Kowalski
2021-07-05 17:08:40 -07:00
committed by GitHub
parent a917027544
commit 5642a8a521
5 changed files with 250 additions and 143 deletions

View File

@@ -0,0 +1,72 @@
// Package completeset manages complete set of blob metadata.
package completeset
import (
"strconv"
"strings"
"github.com/kopia/kopia/repo/blob"
)
// FindFirst looks for a first complete set of blobs IDs following a naming convention:
// '<any>-s<set>-c<count>'
// where:
// 'prefix' is arbitrary string not containing a dash ('-')
// 'set' is a random string shared by all indexes in the same set
// 'count' is a number that specifies how many items must be in the set to make it complete.
//
// The algorithm returns IDs of blobs that form the first complete set.
func FindFirst(bms []blob.Metadata) []blob.Metadata {
sets := FindAll(bms)
if len(sets) == 0 {
return nil
}
return sets[0]
}
// ExcludeIncomplete removes from the provided slice any blobs that are part of incomplete sets.
func ExcludeIncomplete(bms []blob.Metadata) []blob.Metadata {
var result []blob.Metadata
for _, set := range FindAll(bms) {
result = append(result, set...)
}
return result
}
// FindAll returns a list of complete sets in the provided slice, grouped by set ID.
// Blobs that are not in any set are also returned.
func FindAll(bms []blob.Metadata) [][]blob.Metadata {
sets := map[string][]blob.Metadata{}
var completeSets [][]blob.Metadata
for _, bm := range bms {
id := bm.BlobID
parts := strings.Split(string(id), "-")
if len(parts) < 3 || !strings.HasPrefix(parts[1], "s") || !strings.HasPrefix(parts[2], "c") {
// treat blobs with malformed names as a single-item sets of their own.
completeSets = append(completeSets, []blob.Metadata{bm})
continue
}
count, err := strconv.Atoi(parts[2][1:])
if err != nil {
// treat blobs with malformed names as a single-item sets of their own.
completeSets = append(completeSets, []blob.Metadata{bm})
continue
}
setID := parts[1]
sets[setID] = append(sets[setID], bm)
if len(sets[setID]) == count {
completeSets = append(completeSets, sets[setID])
}
}
return completeSets
}

View File

@@ -0,0 +1,175 @@
package completeset_test
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/kopia/kopia/internal/completeset"
"github.com/kopia/kopia/repo/blob"
)
func TestFindFirstAndAll(t *testing.T) {
cases := []struct {
input []blob.ID
wantFirst []blob.ID
wantAll [][]blob.ID
wantExcludeIncomplete []blob.ID
}{
{
input: []blob.ID{},
wantFirst: []blob.ID{},
wantAll: [][]blob.ID(nil),
wantExcludeIncomplete: []blob.ID{},
},
// one complete session of size 2
{
input: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
wantFirst: []blob.ID{"a-s0-c2", "b-s0-c2"},
wantAll: [][]blob.ID{
{"a-s0-c2", "b-s0-c2"},
},
wantExcludeIncomplete: []blob.ID{"a-s0-c2", "b-s0-c2"},
},
// one complete session with some malformed name, which by itself forms a complete session.
{
input: []blob.ID{
"a-s0-c2",
"malformed",
"b-s0-c2",
},
wantFirst: []blob.ID{
"malformed",
},
wantAll: [][]blob.ID{
{"malformed"},
{"a-s0-c2", "b-s0-c2"},
},
wantExcludeIncomplete: []blob.ID{
"malformed",
"a-s0-c2", "b-s0-c2",
},
},
// one complete session with some malformed blob ID
{
input: []blob.ID{
"a-s0-c2",
"malformed-s0-x2",
"b-s0-c2",
},
wantFirst: []blob.ID{
"malformed-s0-x2",
},
wantAll: [][]blob.ID{
{"malformed-s0-x2"},
{"a-s0-c2", "b-s0-c2"},
},
wantExcludeIncomplete: []blob.ID{"malformed-s0-x2", "a-s0-c2", "b-s0-c2"},
},
// one complete session with some malformed count
{
input: []blob.ID{
"a-s0-c2",
"malformed-s0-cNAN",
"b-s0-c2",
},
wantFirst: []blob.ID{
"malformed-s0-cNAN",
},
wantAll: [][]blob.ID{
{"malformed-s0-cNAN"},
{"a-s0-c2", "b-s0-c2"},
},
wantExcludeIncomplete: []blob.ID{
"malformed-s0-cNAN",
"a-s0-c2", "b-s0-c2",
},
},
// two complete sessions, we pick 's0' as it's the first one to become complete.
{
input: []blob.ID{
"foo-s0-c2",
"aaa-s1-c2",
"bar-s0-c2",
"bbb-s1-c2",
},
wantFirst: []blob.ID{
"foo-s0-c2", "bar-s0-c2",
},
wantAll: [][]blob.ID{
{"foo-s0-c2", "bar-s0-c2"},
{"aaa-s1-c2", "bbb-s1-c2"},
},
wantExcludeIncomplete: []blob.ID{
"foo-s0-c2", "bar-s0-c2",
"aaa-s1-c2", "bbb-s1-c2",
},
},
// two incomplete sessions
{
input: []blob.ID{
"foo-s0-c3",
"aaa-s1-c3",
"bar-s0-c3",
"bbb-s1-c3",
},
wantFirst: []blob.ID{},
wantAll: nil,
wantExcludeIncomplete: []blob.ID{},
},
// two complete, two incomplete sessions
{
input: []blob.ID{
"foo-s0-c2",
"aaa-s1-c3",
"bar-s0-c2",
"bbb-s1-c3",
"foo-s2-c2",
"aaa-s3-c3",
"bar-s2-c2",
"bbb-s3-c3",
},
wantFirst: []blob.ID{
"foo-s0-c2",
"bar-s0-c2",
},
wantAll: [][]blob.ID{
{"foo-s0-c2", "bar-s0-c2"},
{"foo-s2-c2", "bar-s2-c2"},
},
wantExcludeIncomplete: []blob.ID{
"foo-s0-c2", "bar-s0-c2", "foo-s2-c2", "bar-s2-c2",
},
},
}
for _, tc := range cases {
require.Equal(t, tc.wantFirst, blob.IDsFromMetadata(completeset.FindFirst(dummyMetadataForIDs(tc.input))), "invalid result for FindFirst(%v)", tc.input)
require.Equal(t, tc.wantAll, idsFromMetadataSets(completeset.FindAll(dummyMetadataForIDs(tc.input))), "invalid result for FindAll(%v)", tc.input)
require.Equal(t, tc.wantExcludeIncomplete, blob.IDsFromMetadata(completeset.ExcludeIncomplete(dummyMetadataForIDs(tc.input))), "invalid result for ExcludeIncomplete(%v)", tc.input)
}
}
func idsFromMetadataSets(sets [][]blob.Metadata) [][]blob.ID {
var result [][]blob.ID
for _, s := range sets {
result = append(result, blob.IDsFromMetadata(s))
}
return result
}
func dummyMetadataForIDs(ids []blob.ID) []blob.Metadata {
var result []blob.Metadata
for _, id := range ids {
result = append(result, blob.Metadata{BlobID: id})
}
return result
}

View File

@@ -1,45 +0,0 @@
package epoch
import (
"strconv"
"strings"
"github.com/kopia/kopia/repo/blob"
)
// findCompleteSetOfBlobs looks for a complete set of blobs IDs following a naming convention:
// '<any>-s<set>-c<count>'
// where:
// 'prefix' is arbitrary string not containing a dash ('-')
// 'set' is a random string shared by all indexes in the same set
// 'count' is a number that specifies how many items must be in the set to make it complete.
//
// The algorithm returns IDs of blobs that form the first complete set.
func findCompleteSetOfBlobs(bms []blob.Metadata) []blob.Metadata {
sets := map[string][]blob.Metadata{}
for _, bm := range bms {
id := bm.BlobID
parts := strings.Split(string(id), "-")
if len(parts) < 3 || !strings.HasPrefix(parts[1], "s") || !strings.HasPrefix(parts[2], "c") {
// malformed ID, ignore
continue
}
count, err := strconv.Atoi(parts[2][1:])
if err != nil {
// malformed ID, ignore
continue
}
setID := parts[1]
sets[setID] = append(sets[setID], bm)
if len(sets[setID]) == count {
return sets[setID]
}
}
return nil
}

View File

@@ -1,96 +0,0 @@
package epoch
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/kopia/kopia/repo/blob"
)
func TestFindCompleteSetOfBlobs(t *testing.T) {
cases := []struct {
input []blob.ID
want []blob.ID
}{
{
input: []blob.ID{},
want: []blob.ID{},
},
// one complete session of size 2
{
input: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
want: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
},
// one complete session with some malformed name
{
input: []blob.ID{
"a-s0-c2",
"malformed",
"b-s0-c2",
},
want: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
},
// one complete session with some malformed blob ID
{
input: []blob.ID{
"a-s0-c2",
"malformed-s0-x2",
"b-s0-c2",
},
want: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
},
// one complete session with some malformed count
{
input: []blob.ID{
"a-s0-c2",
"malformed-s0-cNAN",
"b-s0-c2",
},
want: []blob.ID{
"a-s0-c2",
"b-s0-c2",
},
},
// two complete sessions, we pick 's0' as it's the first one to become complete.
{
input: []blob.ID{
"foo-s0-c2",
"aaa-s1-c2",
"bar-s0-c2",
"bbb-s1-c2",
},
want: []blob.ID{
"foo-s0-c2",
"bar-s0-c2",
},
},
}
for _, tc := range cases {
require.Equal(t, tc.want, blob.IDsFromMetadata(findCompleteSetOfBlobs(dummyMetadataForIDs(tc.input))), "invalid result for %v", tc.input)
}
}
func dummyMetadataForIDs(ids []blob.ID) []blob.Metadata {
var result []blob.Metadata
for _, id := range ids {
result = append(result, blob.Metadata{BlobID: id})
}
return result
}

View File

@@ -14,6 +14,7 @@
"golang.org/x/sync/errgroup"
"github.com/kopia/kopia/internal/clock"
"github.com/kopia/kopia/internal/completeset"
"github.com/kopia/kopia/internal/gather"
"github.com/kopia/kopia/repo/blob"
"github.com/kopia/kopia/repo/logging"
@@ -286,7 +287,7 @@ func (e *Manager) loadRangeCheckpoints(ctx context.Context, cs *CurrentSnapshot)
for epoch1, m := range groupByEpochRanges(blobs) {
for epoch2, bms := range m {
if comp := findCompleteSetOfBlobs(bms); comp != nil {
if comp := completeset.FindFirst(bms); comp != nil {
erm := &RangeMetadata{
MinEpoch: epoch1,
MaxEpoch: epoch2,
@@ -310,7 +311,7 @@ func (e *Manager) loadSingleEpochCompactions(ctx context.Context, cs *CurrentSna
}
for epoch, bms := range groupByEpochNumber(blobs) {
if comp := findCompleteSetOfBlobs(bms); comp != nil {
if comp := completeset.FindFirst(bms); comp != nil {
cs.SingleEpochCompactionSets[epoch] = comp
}
}