// Files
// kopia/repo/content/builder.go
//
// 230 lines
// 5.1 KiB
// Go
package content
import (
"crypto/rand"
"hash/fnv"
"io"
"runtime"
"sort"
"sync"
"github.com/pkg/errors"
"github.com/kopia/kopia/internal/gather"
)
const randomSuffixSize = 32 // number of random bytes to append at the end to make the index blob unique

// packIndexBuilder prepares and writes content index.
// It maps each content ID to the most recent Info seen for that content.
// A nil builder is read-only; construct with make(packIndexBuilder) before calling Add.
type packIndexBuilder map[ID]Info
// clone returns a copy of the builder; mutating the copy does not affect
// the original. Cloning a nil builder yields nil.
func (b packIndexBuilder) clone() packIndexBuilder {
	if b == nil {
		return nil
	}

	dup := make(packIndexBuilder, len(b))
	for id, info := range b {
		dup[id] = info
	}

	return dup
}
// Add adds a new entry to the builder or conditionally replaces it if the timestamp is greater.
func (b packIndexBuilder) Add(i Info) {
	id := i.GetContentID()
	if existing := b[id]; contentInfoGreaterThan(i, existing) {
		b[id] = i
	}
}
// base36Value stores a base-36 reverse lookup such that ASCII character corresponds to its
// base-36 value ('0'=0..'9'=9, 'a'=10, 'b'=11, .., 'z'=35).
// Uppercase letters map to the same values as their lowercase counterparts
// (populated by init); all other bytes remain zero.
var base36Value [256]byte
// init fills base36Value for digits and letters; uppercase and lowercase
// letters share the same value. All other entries stay at their zero value.
func init() {
	for d := byte('0'); d <= '9'; d++ {
		base36Value[d] = d - '0'
	}

	for c := byte('a'); c <= 'z'; c++ {
		v := c - 'a' + 10 //nolint:gomnd
		base36Value[c] = v
		base36Value[c-'a'+'A'] = v
	}
}
// sortedContents returns the list of []Info sorted lexicographically by
// content ID using bucket sort. The bucketing exploits the content ID format:
// an optional single-character alphanumeric prefix (0-9a-z) followed by
// hexadecimal digits (0-9a-f), so the first two characters select one of
// 36*16 buckets that are already in lexicographic order relative to each other.
func (b packIndexBuilder) sortedContents() []Info {
	var buckets [36 * 16][]Info

	// Phase 1: distribute entries into buckets keyed by the first character
	// [0-9a-z] and second character [0-9a-f] of the content ID.
	for cid, info := range b {
		idx := int(base36Value[cid[0]])<<4 + int(base36Value[cid[1]]) //nolint:gomnd
		buckets[idx] = append(buckets[idx], info)
	}

	// Phase 2: sort each non-empty bucket in parallel; this is much faster
	// than sorting one giant list. Each worker owns the disjoint set of
	// buckets whose index is congruent to its number modulo numWorkers.
	numWorkers := runtime.NumCPU()

	var wg sync.WaitGroup

	for w := 0; w < numWorkers; w++ {
		w := w // capture per-iteration value (pre-Go 1.22 loop semantics)

		wg.Add(1)

		go func() {
			defer wg.Done()

			for idx := w; idx < len(buckets); idx += numWorkers {
				items := buckets[idx]
				sort.Slice(items, func(i, j int) bool {
					return items[i].GetContentID() < items[j].GetContentID()
				})
			}
		}()
	}

	wg.Wait()

	// Phase 3: concatenate the sorted buckets in bucket order.
	result := make([]Info, 0, len(b))
	for _, items := range buckets {
		result = append(result, items...)
	}

	return result
}
// Build writes the pack index to the provided output, then appends
// randomSuffixSize random bytes so every written index blob is unique
// even when the contents are identical.
func (b packIndexBuilder) Build(output io.Writer, version int) error {
	if err := b.BuildStable(output, version); err != nil {
		return err
	}

	var suffix [randomSuffixSize]byte

	if _, err := rand.Read(suffix[:]); err != nil {
		return errors.Wrap(err, "error getting random bytes for suffix")
	}

	if _, err := output.Write(suffix[:]); err != nil {
		return errors.Wrap(err, "error writing extra random suffix to ensure indexes are always globally unique")
	}

	return nil
}
// BuildStable writes the pack index to the provided output, dispatching on
// the requested index format version. Unlike Build, no random suffix is
// appended by this method itself.
func (b packIndexBuilder) BuildStable(output io.Writer, version int) error {
	if version == v1IndexVersion {
		return b.buildV1(output)
	}

	if version == v2IndexVersion {
		return b.buildV2(output)
	}

	return errors.Errorf("unsupported index version: %v", version)
}
func (b packIndexBuilder) shard(maxShardSize int) []packIndexBuilder {
numShards := (len(b) + maxShardSize - 1) / maxShardSize
if numShards <= 1 {
if len(b) == 0 {
return []packIndexBuilder{}
}
return []packIndexBuilder{b}
}
result := make([]packIndexBuilder, numShards)
for i := range result {
result[i] = make(packIndexBuilder)
}
for k, v := range b {
h := fnv.New32a()
io.WriteString(h, string(k)) // nolint:errcheck
shard := h.Sum32() % uint32(numShards)
result[shard][k] = v
}
var nonEmpty []packIndexBuilder
for _, r := range result {
if len(r) > 0 {
nonEmpty = append(nonEmpty, r)
}
}
return nonEmpty
}
// buildShards serializes the builder into one or more index shards, each
// holding at most shardSize entries. It returns the serialized shards and a
// cleanup function that releases the underlying write buffers; the caller
// must invoke the cleanup function when done with the returned bytes.
// When stable is false, randomSuffixSize random bytes are appended to each
// shard so emitted blobs are globally unique.
func (b packIndexBuilder) buildShards(indexVersion int, stable bool, shardSize int) ([]gather.Bytes, func(), error) {
	if shardSize == 0 {
		return nil, nil, errors.Errorf("invalid shard size")
	}

	var (
		shardedBuilders = b.shard(shardSize)
		dataShardsBuf   []*gather.WriteBuffer
		dataShards      []gather.Bytes
		// use the shared constant rather than a hard-coded 32 so the suffix
		// length stays in sync with Build.
		randomSuffix [randomSuffixSize]byte
	)

	// closeShards releases every buffer allocated so far; used on both the
	// error paths and as the cleanup function handed back to the caller.
	closeShards := func() {
		for _, ds := range dataShardsBuf {
			ds.Close()
		}
	}

	for _, s := range shardedBuilders {
		buf := gather.NewWriteBuffer()
		dataShardsBuf = append(dataShardsBuf, buf)

		if err := s.BuildStable(buf, indexVersion); err != nil {
			closeShards()
			return nil, nil, errors.Wrap(err, "error building index shard")
		}

		if !stable {
			if _, err := rand.Read(randomSuffix[:]); err != nil {
				closeShards()
				return nil, nil, errors.Wrap(err, "error getting random bytes for suffix")
			}

			if _, err := buf.Write(randomSuffix[:]); err != nil {
				closeShards()
				return nil, nil, errors.Wrap(err, "error writing extra random suffix to ensure indexes are always globally unique")
			}
		}

		dataShards = append(dataShards, buf.Bytes())
	}

	return dataShards, closeShards, nil
}