// Mirror of https://github.com/kopia/kopia.git (synced 2026-03-27).
// 230 lines, 5.1 KiB, Go.
package content

import (
	"crypto/rand"
	"hash/fnv"
	"io"
	"runtime"
	"sort"
	"sync"

	"github.com/pkg/errors"

	"github.com/kopia/kopia/internal/gather"
)
|
|
|
|
const randomSuffixSize = 32 // number of random bytes to append at the end to make the index blob unique
|
|
|
|
// packIndexBuilder prepares and writes content index.
|
|
type packIndexBuilder map[ID]Info
|
|
|
|
// clone returns a deep clone of packIndexBuilder.
|
|
func (b packIndexBuilder) clone() packIndexBuilder {
|
|
if b == nil {
|
|
return nil
|
|
}
|
|
|
|
r := packIndexBuilder{}
|
|
|
|
for k, v := range b {
|
|
r[k] = v
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
// Add adds a new entry to the builder or conditionally replaces it if the timestamp is greater.
|
|
func (b packIndexBuilder) Add(i Info) {
|
|
cid := i.GetContentID()
|
|
|
|
if contentInfoGreaterThan(i, b[cid]) {
|
|
b[cid] = i
|
|
}
|
|
}
|
|
|
|
// base36Value stores a base-36 reverse lookup such that ASCII character corresponds to its
|
|
// base-36 value ('0'=0..'9'=9, 'a'=10, 'b'=11, .., 'z'=35).
|
|
var base36Value [256]byte
|
|
|
|
func init() {
|
|
for i := 0; i < 10; i++ {
|
|
base36Value['0'+i] = byte(i)
|
|
}
|
|
|
|
for i := 0; i < 26; i++ {
|
|
base36Value['a'+i] = byte(i + 10) //nolint:gomnd
|
|
base36Value['A'+i] = byte(i + 10) //nolint:gomnd
|
|
}
|
|
}
|
|
|
|
// sortedContents returns the list of []Info sorted lexicographically using bucket sort
|
|
// sorting is optimized based on the format of content IDs (optional single-character
|
|
// alphanumeric prefix (0-9a-z), followed by hexadecimal digits (0-9a-f).
|
|
func (b packIndexBuilder) sortedContents() []Info {
|
|
var buckets [36 * 16][]Info
|
|
|
|
// phase 1 - bucketize into 576 (36 *16) separate lists
|
|
// by first [0-9a-z] and second character [0-9a-f].
|
|
for cid, v := range b {
|
|
first := int(base36Value[cid[0]])
|
|
second := int(base36Value[cid[1]])
|
|
|
|
buck := first<<4 + second //nolint:gomnd
|
|
|
|
buckets[buck] = append(buckets[buck], v)
|
|
}
|
|
|
|
// phase 2 - sort each non-empty bucket in parallel using goroutines
|
|
// this is much faster than sorting one giant list.
|
|
var wg sync.WaitGroup
|
|
|
|
numWorkers := runtime.NumCPU()
|
|
for worker := 0; worker < numWorkers; worker++ {
|
|
worker := worker
|
|
|
|
wg.Add(1)
|
|
|
|
go func() {
|
|
defer wg.Done()
|
|
|
|
for i := range buckets {
|
|
if i%numWorkers == worker {
|
|
buck := buckets[i]
|
|
|
|
sort.Slice(buck, func(i, j int) bool {
|
|
return buck[i].GetContentID() < buck[j].GetContentID()
|
|
})
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
// Phase 3 - merge results from all buckets.
|
|
result := make([]Info, 0, len(b))
|
|
|
|
for i := 0; i < len(buckets); i++ {
|
|
result = append(result, buckets[i]...)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// Build writes the pack index to the provided output.
|
|
func (b packIndexBuilder) Build(output io.Writer, version int) error {
|
|
if err := b.BuildStable(output, version); err != nil {
|
|
return err
|
|
}
|
|
|
|
randomSuffix := make([]byte, randomSuffixSize)
|
|
|
|
if _, err := rand.Read(randomSuffix); err != nil {
|
|
return errors.Wrap(err, "error getting random bytes for suffix")
|
|
}
|
|
|
|
if _, err := output.Write(randomSuffix); err != nil {
|
|
return errors.Wrap(err, "error writing extra random suffix to ensure indexes are always globally unique")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// BuildStable writes the pack index to the provided output.
|
|
func (b packIndexBuilder) BuildStable(output io.Writer, version int) error {
|
|
switch version {
|
|
case v1IndexVersion:
|
|
return b.buildV1(output)
|
|
|
|
case v2IndexVersion:
|
|
return b.buildV2(output)
|
|
|
|
default:
|
|
return errors.Errorf("unsupported index version: %v", version)
|
|
}
|
|
}
|
|
|
|
func (b packIndexBuilder) shard(maxShardSize int) []packIndexBuilder {
|
|
numShards := (len(b) + maxShardSize - 1) / maxShardSize
|
|
if numShards <= 1 {
|
|
if len(b) == 0 {
|
|
return []packIndexBuilder{}
|
|
}
|
|
|
|
return []packIndexBuilder{b}
|
|
}
|
|
|
|
result := make([]packIndexBuilder, numShards)
|
|
for i := range result {
|
|
result[i] = make(packIndexBuilder)
|
|
}
|
|
|
|
for k, v := range b {
|
|
h := fnv.New32a()
|
|
io.WriteString(h, string(k)) // nolint:errcheck
|
|
|
|
shard := h.Sum32() % uint32(numShards)
|
|
|
|
result[shard][k] = v
|
|
}
|
|
|
|
var nonEmpty []packIndexBuilder
|
|
|
|
for _, r := range result {
|
|
if len(r) > 0 {
|
|
nonEmpty = append(nonEmpty, r)
|
|
}
|
|
}
|
|
|
|
return nonEmpty
|
|
}
|
|
|
|
func (b packIndexBuilder) buildShards(indexVersion int, stable bool, shardSize int) ([]gather.Bytes, func(), error) {
|
|
if shardSize == 0 {
|
|
return nil, nil, errors.Errorf("invalid shard size")
|
|
}
|
|
|
|
var (
|
|
shardedBuilders = b.shard(shardSize)
|
|
dataShardsBuf []*gather.WriteBuffer
|
|
dataShards []gather.Bytes
|
|
randomSuffix [32]byte
|
|
)
|
|
|
|
closeShards := func() {
|
|
for _, ds := range dataShardsBuf {
|
|
ds.Close()
|
|
}
|
|
}
|
|
|
|
for _, s := range shardedBuilders {
|
|
buf := gather.NewWriteBuffer()
|
|
|
|
dataShardsBuf = append(dataShardsBuf, buf)
|
|
|
|
if err := s.BuildStable(buf, indexVersion); err != nil {
|
|
closeShards()
|
|
|
|
return nil, nil, errors.Wrap(err, "error building index shard")
|
|
}
|
|
|
|
if !stable {
|
|
if _, err := rand.Read(randomSuffix[:]); err != nil {
|
|
closeShards()
|
|
|
|
return nil, nil, errors.Wrap(err, "error getting random bytes for suffix")
|
|
}
|
|
|
|
if _, err := buf.Write(randomSuffix[:]); err != nil {
|
|
closeShards()
|
|
|
|
return nil, nil, errors.Wrap(err, "error writing extra random suffix to ensure indexes are always globally unique")
|
|
}
|
|
}
|
|
|
|
dataShards = append(dataShards, buf.Bytes())
|
|
}
|
|
|
|
return dataShards, closeShards, nil
|
|
}
|