refactor: extracted complete blob set functions to separate package (#1175)

* refactor: extracted complete blob set functions to separate package * completeset: added more functions Also treat malformed blob IDs as their own sets for backwards compat.
2026-05-19 04:04:56 -04:00 · 2021-07-05 17:08:40 -07:00
parent a917027544
commit 5642a8a521
5 changed files with 250 additions and 143 deletions
--- a/internal/completeset/complete_set.go
+++ b/internal/completeset/complete_set.go
@@ -0,0 +1,72 @@
+// Package completeset manages complete set of blob metadata.
+package completeset
+
+import (
+	"strconv"
+	"strings"
+
+	"github.com/kopia/kopia/repo/blob"
+)
+
+// FindFirst looks for a first complete set of blobs IDs following a naming convention:
+//    '<any>-s<set>-c<count>'
+// where:
+//   'prefix' is arbitrary string not containing a dash ('-')
+//   'set' is a random string shared by all indexes in the same set
+//   'count' is a number that specifies how many items must be in the set to make it complete.
+//
+// The algorithm returns IDs of blobs that form the first complete set.
+func FindFirst(bms []blob.Metadata) []blob.Metadata {
+	sets := FindAll(bms)
+	if len(sets) == 0 {
+		return nil
+	}
+
+	return sets[0]
+}
+
+// ExcludeIncomplete removes from the provided slice any blobs that are part of incomplete sets.
+func ExcludeIncomplete(bms []blob.Metadata) []blob.Metadata {
+	var result []blob.Metadata
+
+	for _, set := range FindAll(bms) {
+		result = append(result, set...)
+	}
+
+	return result
+}
+
+// FindAll returns a list of complete sets in the provided slice, grouped by set ID.
+// Blobs that are not in any set are also returned.
+func FindAll(bms []blob.Metadata) [][]blob.Metadata {
+	sets := map[string][]blob.Metadata{}
+
+	var completeSets [][]blob.Metadata
+
+	for _, bm := range bms {
+		id := bm.BlobID
+		parts := strings.Split(string(id), "-")
+
+		if len(parts) < 3 || !strings.HasPrefix(parts[1], "s") || !strings.HasPrefix(parts[2], "c") {
+			// treat blobs with malformed names as a single-item sets of their own.
+			completeSets = append(completeSets, []blob.Metadata{bm})
+			continue
+		}
+
+		count, err := strconv.Atoi(parts[2][1:])
+		if err != nil {
+			// treat blobs with malformed names as a single-item sets of their own.
+			completeSets = append(completeSets, []blob.Metadata{bm})
+			continue
+		}
+
+		setID := parts[1]
+		sets[setID] = append(sets[setID], bm)
+
+		if len(sets[setID]) == count {
+			completeSets = append(completeSets, sets[setID])
+		}
+	}
+
+	return completeSets
+}
--- a/internal/completeset/complete_set_test.go
+++ b/internal/completeset/complete_set_test.go
@@ -0,0 +1,175 @@
+package completeset_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/kopia/kopia/internal/completeset"
+	"github.com/kopia/kopia/repo/blob"
+)
+
+func TestFindFirstAndAll(t *testing.T) {
+	cases := []struct {
+		input                 []blob.ID
+		wantFirst             []blob.ID
+		wantAll               [][]blob.ID
+		wantExcludeIncomplete []blob.ID
+	}{
+		{
+			input:                 []blob.ID{},
+			wantFirst:             []blob.ID{},
+			wantAll:               [][]blob.ID(nil),
+			wantExcludeIncomplete: []blob.ID{},
+		},
+
+		// one complete session of size 2
+		{
+			input: []blob.ID{
+				"a-s0-c2",
+				"b-s0-c2",
+			},
+			wantFirst: []blob.ID{"a-s0-c2", "b-s0-c2"},
+			wantAll: [][]blob.ID{
+				{"a-s0-c2", "b-s0-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{"a-s0-c2", "b-s0-c2"},
+		},
+		// one complete session with some malformed name, which by itself forms a complete session.
+		{
+			input: []blob.ID{
+				"a-s0-c2",
+				"malformed",
+				"b-s0-c2",
+			},
+			wantFirst: []blob.ID{
+				"malformed",
+			},
+			wantAll: [][]blob.ID{
+				{"malformed"},
+				{"a-s0-c2", "b-s0-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{
+				"malformed",
+				"a-s0-c2", "b-s0-c2",
+			},
+		},
+		// one complete session with some malformed blob ID
+		{
+			input: []blob.ID{
+				"a-s0-c2",
+				"malformed-s0-x2",
+				"b-s0-c2",
+			},
+			wantFirst: []blob.ID{
+				"malformed-s0-x2",
+			},
+			wantAll: [][]blob.ID{
+				{"malformed-s0-x2"},
+				{"a-s0-c2", "b-s0-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{"malformed-s0-x2", "a-s0-c2", "b-s0-c2"},
+		},
+		// one complete session with some malformed count
+		{
+			input: []blob.ID{
+				"a-s0-c2",
+				"malformed-s0-cNAN",
+				"b-s0-c2",
+			},
+			wantFirst: []blob.ID{
+				"malformed-s0-cNAN",
+			},
+			wantAll: [][]blob.ID{
+				{"malformed-s0-cNAN"},
+				{"a-s0-c2", "b-s0-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{
+				"malformed-s0-cNAN",
+				"a-s0-c2", "b-s0-c2",
+			},
+		},
+		// two complete sessions, we pick 's0' as it's the first one to become complete.
+		{
+			input: []blob.ID{
+				"foo-s0-c2",
+				"aaa-s1-c2",
+				"bar-s0-c2",
+				"bbb-s1-c2",
+			},
+			wantFirst: []blob.ID{
+				"foo-s0-c2", "bar-s0-c2",
+			},
+			wantAll: [][]blob.ID{
+				{"foo-s0-c2", "bar-s0-c2"},
+				{"aaa-s1-c2", "bbb-s1-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{
+				"foo-s0-c2", "bar-s0-c2",
+				"aaa-s1-c2", "bbb-s1-c2",
+			},
+		},
+		// two incomplete sessions
+		{
+			input: []blob.ID{
+				"foo-s0-c3",
+				"aaa-s1-c3",
+				"bar-s0-c3",
+				"bbb-s1-c3",
+			},
+			wantFirst:             []blob.ID{},
+			wantAll:               nil,
+			wantExcludeIncomplete: []blob.ID{},
+		},
+		// two complete, two incomplete sessions
+		{
+			input: []blob.ID{
+				"foo-s0-c2",
+				"aaa-s1-c3",
+				"bar-s0-c2",
+				"bbb-s1-c3",
+				"foo-s2-c2",
+				"aaa-s3-c3",
+				"bar-s2-c2",
+				"bbb-s3-c3",
+			},
+			wantFirst: []blob.ID{
+				"foo-s0-c2",
+				"bar-s0-c2",
+			},
+			wantAll: [][]blob.ID{
+				{"foo-s0-c2", "bar-s0-c2"},
+				{"foo-s2-c2", "bar-s2-c2"},
+			},
+			wantExcludeIncomplete: []blob.ID{
+				"foo-s0-c2", "bar-s0-c2", "foo-s2-c2", "bar-s2-c2",
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		require.Equal(t, tc.wantFirst, blob.IDsFromMetadata(completeset.FindFirst(dummyMetadataForIDs(tc.input))), "invalid result for FindFirst(%v)", tc.input)
+		require.Equal(t, tc.wantAll, idsFromMetadataSets(completeset.FindAll(dummyMetadataForIDs(tc.input))), "invalid result for FindAll(%v)", tc.input)
+		require.Equal(t, tc.wantExcludeIncomplete, blob.IDsFromMetadata(completeset.ExcludeIncomplete(dummyMetadataForIDs(tc.input))), "invalid result for ExcludeIncomplete(%v)", tc.input)
+	}
+}
+
+func idsFromMetadataSets(sets [][]blob.Metadata) [][]blob.ID {
+	var result [][]blob.ID
+
+	for _, s := range sets {
+		result = append(result, blob.IDsFromMetadata(s))
+	}
+
+	return result
+}
+
+func dummyMetadataForIDs(ids []blob.ID) []blob.Metadata {
+	var result []blob.Metadata
+
+	for _, id := range ids {
+		result = append(result, blob.Metadata{BlobID: id})
+	}
+
+	return result
+}
--- a/internal/epoch/complete_set.go
+++ b/internal/epoch/complete_set.go
@@ -1,45 +0,0 @@
-package epoch
-
-import (
-	"strconv"
-	"strings"
-
-	"github.com/kopia/kopia/repo/blob"
-)
-
-// findCompleteSetOfBlobs looks for a complete set of blobs IDs following a naming convention:
-//    '<any>-s<set>-c<count>'
-// where:
-//   'prefix' is arbitrary string not containing a dash ('-')
-//   'set' is a random string shared by all indexes in the same set
-//   'count' is a number that specifies how many items must be in the set to make it complete.
-//
-// The algorithm returns IDs of blobs that form the first complete set.
-func findCompleteSetOfBlobs(bms []blob.Metadata) []blob.Metadata {
-	sets := map[string][]blob.Metadata{}
-
-	for _, bm := range bms {
-		id := bm.BlobID
-		parts := strings.Split(string(id), "-")
-
-		if len(parts) < 3 || !strings.HasPrefix(parts[1], "s") || !strings.HasPrefix(parts[2], "c") {
-			// malformed ID, ignore
-			continue
-		}
-
-		count, err := strconv.Atoi(parts[2][1:])
-		if err != nil {
-			// malformed ID, ignore
-			continue
-		}
-
-		setID := parts[1]
-		sets[setID] = append(sets[setID], bm)
-
-		if len(sets[setID]) == count {
-			return sets[setID]
-		}
-	}
-
-	return nil
-}
--- a/internal/epoch/complete_set_test.go
+++ b/internal/epoch/complete_set_test.go
@@ -1,96 +0,0 @@
-package epoch
-
-import (
-	"testing"
-
-	"github.com/stretchr/testify/require"
-
-	"github.com/kopia/kopia/repo/blob"
-)
-
-func TestFindCompleteSetOfBlobs(t *testing.T) {
-	cases := []struct {
-		input []blob.ID
-		want  []blob.ID
-	}{
-		{
-			input: []blob.ID{},
-			want:  []blob.ID{},
-		},
-
-		// one complete session of size 2
-		{
-			input: []blob.ID{
-				"a-s0-c2",
-				"b-s0-c2",
-			},
-			want: []blob.ID{
-				"a-s0-c2",
-				"b-s0-c2",
-			},
-		},
-		// one complete session with some malformed name
-		{
-			input: []blob.ID{
-				"a-s0-c2",
-				"malformed",
-				"b-s0-c2",
-			},
-			want: []blob.ID{
-				"a-s0-c2",
-				"b-s0-c2",
-			},
-		},
-		// one complete session with some malformed blob ID
-		{
-			input: []blob.ID{
-				"a-s0-c2",
-				"malformed-s0-x2",
-				"b-s0-c2",
-			},
-			want: []blob.ID{
-				"a-s0-c2",
-				"b-s0-c2",
-			},
-		},
-		// one complete session with some malformed count
-		{
-			input: []blob.ID{
-				"a-s0-c2",
-				"malformed-s0-cNAN",
-				"b-s0-c2",
-			},
-			want: []blob.ID{
-				"a-s0-c2",
-				"b-s0-c2",
-			},
-		},
-		// two complete sessions, we pick 's0' as it's the first one to become complete.
-		{
-			input: []blob.ID{
-				"foo-s0-c2",
-				"aaa-s1-c2",
-				"bar-s0-c2",
-				"bbb-s1-c2",
-			},
-			want: []blob.ID{
-				"foo-s0-c2",
-				"bar-s0-c2",
-			},
-		},
-	}
-
-	for _, tc := range cases {
-		require.Equal(t, tc.want, blob.IDsFromMetadata(findCompleteSetOfBlobs(dummyMetadataForIDs(tc.input))), "invalid result for %v", tc.input)
-	}
-}
-
-func dummyMetadataForIDs(ids []blob.ID) []blob.Metadata {
-	var result []blob.Metadata
-
-	for _, id := range ids {
-		result = append(result, blob.Metadata{BlobID: id})
-	}
-
-	return result
-}
--- a/internal/epoch/epoch_manager.go
+++ b/internal/epoch/epoch_manager.go
@@ -14,6 +14,7 @@
 	"golang.org/x/sync/errgroup"

 	"github.com/kopia/kopia/internal/clock"
+	"github.com/kopia/kopia/internal/completeset"
 	"github.com/kopia/kopia/internal/gather"
 	"github.com/kopia/kopia/repo/blob"
 	"github.com/kopia/kopia/repo/logging"
@@ -286,7 +287,7 @@ func (e *Manager) loadRangeCheckpoints(ctx context.Context, cs *CurrentSnapshot)

 	for epoch1, m := range groupByEpochRanges(blobs) {
 		for epoch2, bms := range m {
-			if comp := findCompleteSetOfBlobs(bms); comp != nil {
+			if comp := completeset.FindFirst(bms); comp != nil {
 				erm := &RangeMetadata{
 					MinEpoch: epoch1,
 					MaxEpoch: epoch2,
@@ -310,7 +311,7 @@ func (e *Manager) loadSingleEpochCompactions(ctx context.Context, cs *CurrentSna
 	}

 	for epoch, bms := range groupByEpochNumber(blobs) {
-		if comp := findCompleteSetOfBlobs(bms); comp != nil {
+		if comp := completeset.FindFirst(bms); comp != nil {
 			cs.SingleEpochCompactionSets[epoch] = comp
 		}
 	}