diff --git a/block/block_cache.go b/block/block_cache.go new file mode 100644 index 000000000..dd2609081 --- /dev/null +++ b/block/block_cache.go @@ -0,0 +1,220 @@ +package block + +import ( + "container/heap" + "context" + "fmt" + "os" + "path/filepath" + "sync" + "time" + + "github.com/kopia/repo/storage" + "github.com/kopia/repo/storage/filesystem" +) + +const ( + defaultSweepFrequency = 1 * time.Minute + defaultTouchThreshold = 10 * time.Minute +) + +type blockCache struct { + st storage.Storage + cacheStorage storage.Storage + maxSizeBytes int64 + hmacSecret []byte + sweepFrequency time.Duration + touchThreshold time.Duration + + mu sync.Mutex + lastTotalSizeBytes int64 + + closed chan struct{} +} + +type blockToucher interface { + TouchBlock(ctx context.Context, blockID string, threshold time.Duration) error +} + +func adjustCacheKey(cacheKey string) string { + // block IDs with odd length have a single-byte prefix. + // move the prefix to the end of cache key to make sure the top level shard is spread 256 ways. + if len(cacheKey)%2 == 1 { + return cacheKey[1:] + cacheKey[0:1] + } + + return cacheKey +} + +func (c *blockCache) getContentBlock(ctx context.Context, cacheKey string, physicalBlockID string, offset, length int64) ([]byte, error) { + cacheKey = adjustCacheKey(cacheKey) + + useCache := shouldUseBlockCache(ctx) && c.cacheStorage != nil + if useCache { + if b := c.readAndVerifyCacheBlock(ctx, cacheKey); b != nil { + return b, nil + } + } + + b, err := c.st.GetBlock(ctx, physicalBlockID, offset, length) + if err == storage.ErrBlockNotFound { + // not found in underlying storage + return nil, err + } + + if err == nil && useCache { + if puterr := c.cacheStorage.PutBlock(ctx, cacheKey, appendHMAC(b, c.hmacSecret)); puterr != nil { + log.Warningf("unable to write cache item %v: %v", cacheKey, puterr) + } + } + + return b, err +} + +func (c *blockCache) readAndVerifyCacheBlock(ctx context.Context, cacheKey string) []byte { + b, err := c.cacheStorage.GetBlock(ctx, cacheKey, 0, -1) + if err == nil { + b, err = verifyAndStripHMAC(b, c.hmacSecret) + if err == nil { + if t, ok := c.cacheStorage.(blockToucher); ok { + t.TouchBlock(ctx, cacheKey, c.touchThreshold) //nolint:errcheck + } + + // retrieved from cache and HMAC valid + return b + } + + // ignore malformed blocks + log.Warningf("malformed block %v: %v", cacheKey, err) + return nil + } + + if err != storage.ErrBlockNotFound { + log.Warningf("unable to read cache %v: %v", cacheKey, err) + } + return nil +} + +func (c *blockCache) close() { + close(c.closed) +} + +func (c *blockCache) sweepDirectoryPeriodically(ctx context.Context) { + for { + select { + case <-c.closed: + return + + case <-time.After(c.sweepFrequency): + err := c.sweepDirectory(ctx) + if err != nil { + log.Warningf("blockCache sweep failed: %v", err) + } + } + } +} + +// A blockMetadataHeap implements heap.Interface and holds storage.BlockMetadata. 
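+// The heap is ordered by block timestamp (oldest first), so sweepDirectory can evict the
+// entries with the oldest timestamps once the cache grows beyond maxSizeBytes.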
+type blockMetadataHeap []storage.BlockMetadata + +func (h blockMetadataHeap) Len() int { return len(h) } + +func (h blockMetadataHeap) Less(i, j int) bool { + return h[i].Timestamp.Before(h[j].Timestamp) +} + +func (h blockMetadataHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} + +func (h *blockMetadataHeap) Push(x interface{}) { + *h = append(*h, x.(storage.BlockMetadata)) +} + +func (h *blockMetadataHeap) Pop() interface{} { + old := *h + n := len(old) + item := old[n-1] + *h = old[0 : n-1] + return item +} + +func (c *blockCache) sweepDirectory(ctx context.Context) (err error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.cacheStorage == nil { + return nil + } + + t0 := time.Now() + + var h blockMetadataHeap + var totalRetainedSize int64 + + err = c.cacheStorage.ListBlocks(ctx, "", func(it storage.BlockMetadata) error { + heap.Push(&h, it) + totalRetainedSize += it.Length + + if totalRetainedSize > c.maxSizeBytes { + oldest := heap.Pop(&h).(storage.BlockMetadata) + if delerr := c.cacheStorage.DeleteBlock(ctx, oldest.BlockID); delerr != nil { + log.Warningf("unable to remove %v: %v", oldest.BlockID, delerr) + } else { + totalRetainedSize -= oldest.Length + } + } + return nil + }) + if err != nil { + return fmt.Errorf("error listing cache: %v", err) + } + + log.Debugf("finished sweeping directory in %v and retained %v/%v bytes (%v %%)", time.Since(t0), totalRetainedSize, c.maxSizeBytes, 100*totalRetainedSize/c.maxSizeBytes) + c.lastTotalSizeBytes = totalRetainedSize + return nil +} + +func newBlockCache(ctx context.Context, st storage.Storage, caching CachingOptions) (*blockCache, error) { + var cacheStorage storage.Storage + var err error + + if caching.MaxCacheSizeBytes > 0 && caching.CacheDirectory != "" { + blockCacheDir := filepath.Join(caching.CacheDirectory, "blocks") + + if _, err = os.Stat(blockCacheDir); os.IsNotExist(err) { + if err = os.MkdirAll(blockCacheDir, 0700); err != nil { + return nil, err + } + } + + cacheStorage, err = filesystem.New(context.Background(), &filesystem.Options{ + Path: blockCacheDir, + DirectoryShards: []int{2}, + }) + if err != nil { + return nil, err + } + } + + return newBlockCacheWithCacheStorage(ctx, st, cacheStorage, caching, defaultTouchThreshold, defaultSweepFrequency) +} + +func newBlockCacheWithCacheStorage(ctx context.Context, st, cacheStorage storage.Storage, caching CachingOptions, touchThreshold time.Duration, sweepFrequency time.Duration) (*blockCache, error) { + c := &blockCache{ + st: st, + cacheStorage: cacheStorage, + maxSizeBytes: caching.MaxCacheSizeBytes, + hmacSecret: append([]byte(nil), caching.HMACSecret...), + closed: make(chan struct{}), + touchThreshold: touchThreshold, + sweepFrequency: sweepFrequency, + } + + if err := c.sweepDirectory(ctx); err != nil { + return nil, err + } + go c.sweepDirectoryPeriodically(ctx) + + return c, nil +} diff --git a/block/block_cache_test.go b/block/block_cache_test.go new file mode 100644 index 000000000..485b24e4c --- /dev/null +++ b/block/block_cache_test.go @@ -0,0 +1,298 @@ +package block + +import ( + "bytes" + "context" + "errors" + "fmt" + "io/ioutil" + "os" + "reflect" + "sort" + "strings" + "testing" + "time" + + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" +) + +func newUnderlyingStorageForBlockCacheTesting(t *testing.T) storage.Storage { + ctx := context.Background() + data := map[string][]byte{} + st := storagetesting.NewMapStorage(data, nil, nil) + assertNoError(t, st.PutBlock(ctx, "block-1", []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10})) + 
assertNoError(t, st.PutBlock(ctx, "block-4k", bytes.Repeat([]byte{1, 2, 3, 4}, 1000))) // 4000 bytes + return st +} + +func TestCacheExpiration(t *testing.T) { + cacheData := map[string][]byte{} + cacheStorage := storagetesting.NewMapStorage(cacheData, nil, nil) + + underlyingStorage := newUnderlyingStorageForBlockCacheTesting(t) + + cache, err := newBlockCacheWithCacheStorage(context.Background(), underlyingStorage, cacheStorage, CachingOptions{ + MaxCacheSizeBytes: 10000, + }, 0, 500*time.Millisecond) + if err != nil { + t.Fatalf("err: %v", err) + } + defer cache.close() + + ctx := context.Background() + _, err = cache.getContentBlock(ctx, "00000a", "block-4k", 0, -1) // 4k + assertNoError(t, err) + _, err = cache.getContentBlock(ctx, "00000b", "block-4k", 0, -1) // 4k + assertNoError(t, err) + _, err = cache.getContentBlock(ctx, "00000c", "block-4k", 0, -1) // 4k + assertNoError(t, err) + _, err = cache.getContentBlock(ctx, "00000d", "block-4k", 0, -1) // 4k + assertNoError(t, err) + + // wait for a sweep + time.Sleep(2 * time.Second) + + // 00000a and 00000b will be removed from cache because it's the oldest. + // to verify, let's remove block-4k from the underlying storage and make sure we can still read + // 00000c and 00000d from the cache but not 00000a nor 00000b + assertNoError(t, underlyingStorage.DeleteBlock(ctx, "block-4k")) + + cases := []struct { + block string + expectedError error + }{ + {"00000a", storage.ErrBlockNotFound}, + {"00000b", storage.ErrBlockNotFound}, + {"00000c", nil}, + {"00000d", nil}, + } + + for _, tc := range cases { + _, got := cache.getContentBlock(ctx, tc.block, "block-4k", 0, -1) + if want := tc.expectedError; got != want { + t.Errorf("unexpected error when getting block %v: %v wanted %v", tc.block, got, want) + } else { + t.Logf("got correct error %v when reading block %v", tc.expectedError, tc.block) + } + } +} + +func TestDiskBlockCache(t *testing.T) { + ctx := context.Background() + + tmpDir, err := ioutil.TempDir("", "kopia") + if err != nil { + t.Fatalf("error getting temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + cache, err := newBlockCache(ctx, newUnderlyingStorageForBlockCacheTesting(t), CachingOptions{ + MaxCacheSizeBytes: 10000, + CacheDirectory: tmpDir, + }) + + if err != nil { + t.Fatalf("err: %v", err) + } + defer cache.close() + verifyBlockCache(t, cache) +} + +func verifyBlockCache(t *testing.T, cache *blockCache) { + ctx := context.Background() + + t.Run("GetContentBlock", func(t *testing.T) { + cases := []struct { + cacheKey string + physicalBlockID string + offset int64 + length int64 + + expected []byte + err error + }{ + {"xf0f0f1", "block-1", 1, 5, []byte{2, 3, 4, 5, 6}, nil}, + {"xf0f0f2", "block-1", 0, -1, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, nil}, + {"xf0f0f1", "block-1", 1, 5, []byte{2, 3, 4, 5, 6}, nil}, + {"xf0f0f2", "block-1", 0, -1, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, nil}, + {"xf0f0f3", "no-such-block", 0, -1, nil, storage.ErrBlockNotFound}, + {"xf0f0f4", "no-such-block", 10, 5, nil, storage.ErrBlockNotFound}, + {"f0f0f5", "block-1", 7, 3, []byte{8, 9, 10}, nil}, + {"xf0f0f6", "block-1", 11, 10, nil, fmt.Errorf("invalid offset")}, + {"xf0f0f6", "block-1", -1, 5, nil, fmt.Errorf("invalid offset")}, + } + + for _, tc := range cases { + v, err := cache.getContentBlock(ctx, tc.cacheKey, tc.physicalBlockID, tc.offset, tc.length) + if !reflect.DeepEqual(err, tc.err) { + t.Errorf("unexpected error for %v: %+v, wanted %+v", tc.cacheKey, err, tc.err) + } + if !reflect.DeepEqual(v, tc.expected) { + 
t.Errorf("unexpected data for %v: %x, wanted %x", tc.cacheKey, v, tc.expected) + } + } + + verifyStorageBlockList(t, cache.cacheStorage, "f0f0f1x", "f0f0f2x", "f0f0f5") + }) + + t.Run("DataCorruption", func(t *testing.T) { + cacheKey := "f0f0f1x" + d, err := cache.cacheStorage.GetBlock(ctx, cacheKey, 0, -1) + if err != nil { + t.Fatalf("unable to retrieve data from cache: %v", err) + } + + // corrupt the data and write back + d[0] ^= 1 + + if err := cache.cacheStorage.PutBlock(ctx, cacheKey, d); err != nil { + t.Fatalf("unable to write corrupted block: %v", err) + } + + v, err := cache.getContentBlock(ctx, "xf0f0f1", "block-1", 1, 5) + if err != nil { + t.Fatalf("error in getContentBlock: %v", err) + } + if got, want := v, []byte{2, 3, 4, 5, 6}; !reflect.DeepEqual(v, want) { + t.Errorf("invalid result when reading corrupted data: %v, wanted %v", got, want) + } + }) +} + +func TestCacheFailureToOpen(t *testing.T) { + someError := errors.New("some error") + + cacheData := map[string][]byte{} + cacheStorage := storagetesting.NewMapStorage(cacheData, nil, nil) + underlyingStorage := newUnderlyingStorageForBlockCacheTesting(t) + faultyCache := &storagetesting.FaultyStorage{ + Base: cacheStorage, + Faults: map[string][]*storagetesting.Fault{ + "ListBlocks": { + {Err: someError}, + }, + }, + } + + // Will fail because of ListBlocks failure. + _, err := newBlockCacheWithCacheStorage(context.Background(), underlyingStorage, faultyCache, CachingOptions{ + MaxCacheSizeBytes: 10000, + }, 0, 5*time.Hour) + if err == nil || !strings.Contains(err.Error(), someError.Error()) { + t.Errorf("invalid error %v, wanted: %v", err, someError) + } + + // ListBlocks fails only once, next time it succeeds. + cache, err := newBlockCacheWithCacheStorage(context.Background(), underlyingStorage, faultyCache, CachingOptions{ + MaxCacheSizeBytes: 10000, + }, 0, 100*time.Millisecond) + if err != nil { + t.Fatalf("err: %v", err) + } + + defer cache.close() +} + +func TestCacheFailureToWrite(t *testing.T) { + someError := errors.New("some error") + + cacheData := map[string][]byte{} + cacheStorage := storagetesting.NewMapStorage(cacheData, nil, nil) + underlyingStorage := newUnderlyingStorageForBlockCacheTesting(t) + faultyCache := &storagetesting.FaultyStorage{ + Base: cacheStorage, + } + + cache, err := newBlockCacheWithCacheStorage(context.Background(), underlyingStorage, faultyCache, CachingOptions{ + MaxCacheSizeBytes: 10000, + }, 0, 5*time.Hour) + if err != nil { + t.Fatalf("err: %v", err) + } + + defer cache.close() + + ctx := context.Background() + faultyCache.Faults = map[string][]*storagetesting.Fault{ + "PutBlock": { + {Err: someError}, + }, + } + + v, err := cache.getContentBlock(ctx, "aa", "block-1", 0, 3) + if err != nil { + t.Errorf("write failure wasn't ignored: %v", err) + } + + if got, want := v, []byte{1, 2, 3}; !reflect.DeepEqual(got, want) { + t.Errorf("unexpected value retrieved from cache: %v, want: %v", got, want) + } + + all, err := storage.ListAllBlocks(ctx, cacheStorage, "") + if err != nil { + t.Errorf("error listing cache: %v", err) + } + if len(all) != 0 { + t.Errorf("invalid test - cache was written") + } +} + +func TestCacheFailureToRead(t *testing.T) { + someError := errors.New("some error") + + cacheData := map[string][]byte{} + cacheStorage := storagetesting.NewMapStorage(cacheData, nil, nil) + underlyingStorage := newUnderlyingStorageForBlockCacheTesting(t) + faultyCache := &storagetesting.FaultyStorage{ + Base: cacheStorage, + } + + cache, err := 
newBlockCacheWithCacheStorage(context.Background(), underlyingStorage, faultyCache, CachingOptions{ + MaxCacheSizeBytes: 10000, + }, 0, 5*time.Hour) + if err != nil { + t.Fatalf("err: %v", err) + } + + defer cache.close() + + ctx := context.Background() + faultyCache.Faults = map[string][]*storagetesting.Fault{ + "GetBlock": { + {Err: someError, Repeat: 100}, + }, + } + + for i := 0; i < 2; i++ { + v, err := cache.getContentBlock(ctx, "aa", "block-1", 0, 3) + if err != nil { + t.Errorf("read failure wasn't ignored: %v", err) + } + + if got, want := v, []byte{1, 2, 3}; !reflect.DeepEqual(got, want) { + t.Errorf("unexpected value retrieved from cache: %v, want: %v", got, want) + } + } +} + +func verifyStorageBlockList(t *testing.T, st storage.Storage, expectedBlocks ...string) { + t.Helper() + var foundBlocks []string + assertNoError(t, st.ListBlocks(context.Background(), "", func(bm storage.BlockMetadata) error { + foundBlocks = append(foundBlocks, bm.BlockID) + return nil + })) + + sort.Strings(foundBlocks) + if !reflect.DeepEqual(foundBlocks, expectedBlocks) { + t.Errorf("unexpected block list: %v, wanted %v", foundBlocks, expectedBlocks) + } +} + +func assertNoError(t *testing.T, err error) { + t.Helper() + if err != nil { + t.Errorf("err: %v", err) + } +} diff --git a/block/block_formatter.go b/block/block_formatter.go new file mode 100644 index 000000000..27bd89b1c --- /dev/null +++ b/block/block_formatter.go @@ -0,0 +1,217 @@ +package block + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/hmac" //nolint:gas + "crypto/sha256" + "fmt" + "hash" + "sort" + + "golang.org/x/crypto/blake2b" + "golang.org/x/crypto/blake2s" + "golang.org/x/crypto/salsa20" + "golang.org/x/crypto/sha3" +) + +// HashFunc computes hash of block of data using a cryptographic hash function, possibly with HMAC and/or truncation. +type HashFunc func(data []byte) []byte + +// HashFuncFactory returns a hash function for given formatting options. +type HashFuncFactory func(o FormattingOptions) (HashFunc, error) + +// Encryptor performs encryption and decryption of blocks of data. +type Encryptor interface { + // Encrypt returns encrypted bytes corresponding to the given plaintext. Must not clobber the input slice. + Encrypt(plainText []byte, blockID []byte) ([]byte, error) + + // Decrypt returns unencrypted bytes corresponding to the given ciphertext. Must not clobber the input slice. + Decrypt(cipherText []byte, blockID []byte) ([]byte, error) +} + +// EncryptorFactory creates new Encryptor for given FormattingOptions +type EncryptorFactory func(o FormattingOptions) (Encryptor, error) + +var hashFunctions = map[string]HashFuncFactory{} +var encryptors = map[string]EncryptorFactory{} + +// nullEncryptor implements non-encrypted format. +type nullEncryptor struct { +} + +func (fi nullEncryptor) Encrypt(plainText []byte, blockID []byte) ([]byte, error) { + return cloneBytes(plainText), nil +} + +func (fi nullEncryptor) Decrypt(cipherText []byte, blockID []byte) ([]byte, error) { + return cloneBytes(cipherText), nil +} + +// ctrEncryptor implements encrypted format which uses CTR mode of a block cipher with nonce==IV. 
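+// Callers pass the block ID (a hash of the content) as the IV, so encryption is
+// deterministic: identical plaintext encrypts to identical ciphertext.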
+type ctrEncryptor struct { + createCipher func() (cipher.Block, error) +} + +func (fi ctrEncryptor) Encrypt(plainText []byte, blockID []byte) ([]byte, error) { + return symmetricEncrypt(fi.createCipher, blockID, plainText) +} + +func (fi ctrEncryptor) Decrypt(cipherText []byte, blockID []byte) ([]byte, error) { + return symmetricEncrypt(fi.createCipher, blockID, cipherText) +} + +func symmetricEncrypt(createCipher func() (cipher.Block, error), iv []byte, b []byte) ([]byte, error) { + blockCipher, err := createCipher() + if err != nil { + return nil, err + } + + ctr := cipher.NewCTR(blockCipher, iv[0:blockCipher.BlockSize()]) + result := make([]byte, len(b)) + ctr.XORKeyStream(result, b) + return result, nil +} + +type salsaEncryptor struct { + nonceSize int + key *[32]byte +} + +func (s salsaEncryptor) Decrypt(input []byte, blockID []byte) ([]byte, error) { + return s.encryptDecrypt(input, blockID) +} + +func (s salsaEncryptor) Encrypt(input []byte, blockID []byte) ([]byte, error) { + return s.encryptDecrypt(input, blockID) +} + +func (s salsaEncryptor) encryptDecrypt(input []byte, blockID []byte) ([]byte, error) { + if len(blockID) < s.nonceSize { + return nil, fmt.Errorf("hash too short, expected >=%v bytes, got %v", s.nonceSize, len(blockID)) + } + result := make([]byte, len(input)) + nonce := blockID[0:s.nonceSize] + salsa20.XORKeyStream(result, input, nonce, s.key) + return result, nil +} + +// truncatedHMACHashFuncFactory returns a HashFuncFactory that computes HMAC(hash, secret) of a given block of bytes +// and truncates results to the given size. +func truncatedHMACHashFuncFactory(hf func() hash.Hash, truncate int) HashFuncFactory { + return func(o FormattingOptions) (HashFunc, error) { + return func(b []byte) []byte { + h := hmac.New(hf, o.HMACSecret) + h.Write(b) // nolint:errcheck + return h.Sum(nil)[0:truncate] + }, nil + } +} + +// truncatedKeyedHashFuncFactory returns a HashFuncFactory that computes keyed hash of a given block of bytes +// and truncates results to the given size. +func truncatedKeyedHashFuncFactory(hf func(key []byte) (hash.Hash, error), truncate int) HashFuncFactory { + return func(o FormattingOptions) (HashFunc, error) { + if _, err := hf(o.HMACSecret); err != nil { + return nil, err + } + + return func(b []byte) []byte { + h, _ := hf(o.HMACSecret) + h.Write(b) // nolint:errcheck + return h.Sum(nil)[0:truncate] + }, nil + } +} + +// newCTREncryptorFactory returns new EncryptorFactory that uses CTR with symmetric encryption (such as AES) and a given key size. +func newCTREncryptorFactory(keySize int, createCipherWithKey func(key []byte) (cipher.Block, error)) EncryptorFactory { + return func(o FormattingOptions) (Encryptor, error) { + key, err := adjustKey(o.MasterKey, keySize) + if err != nil { + return nil, fmt.Errorf("unable to get encryption key: %v", err) + } + + return ctrEncryptor{ + createCipher: func() (cipher.Block, error) { + return createCipherWithKey(key) + }, + }, nil + } +} + +// RegisterHash registers a hash function with a given name. 
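+// Registered hash functions are looked up by FormattingOptions.Hash when a Manager is created.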
+func RegisterHash(name string, newHashFunc HashFuncFactory) { + hashFunctions[name] = newHashFunc +} + +func SupportedHashAlgorithms() []string { + var result []string + for k := range hashFunctions { + result = append(result, k) + } + sort.Strings(result) + return result +} + +func SupportedEncryptionAlgorithms() []string { + var result []string + for k := range encryptors { + result = append(result, k) + } + sort.Strings(result) + return result +} + +// RegisterEncryption registers new encryption algorithm. +func RegisterEncryption(name string, newEncryptor EncryptorFactory) { + encryptors[name] = newEncryptor +} + +// DefaultHash is the name of the default hash algorithm. +const DefaultHash = "BLAKE2B-256-128" + +// DefaultEncryption is the name of the default encryption algorithm. +const DefaultEncryption = "SALSA20" + +func init() { + RegisterHash("HMAC-SHA256", truncatedHMACHashFuncFactory(sha256.New, 32)) + RegisterHash("HMAC-SHA256-128", truncatedHMACHashFuncFactory(sha256.New, 16)) + RegisterHash("HMAC-SHA224", truncatedHMACHashFuncFactory(sha256.New224, 28)) + RegisterHash("HMAC-SHA3-224", truncatedHMACHashFuncFactory(sha3.New224, 28)) + RegisterHash("HMAC-SHA3-256", truncatedHMACHashFuncFactory(sha3.New256, 32)) + + RegisterHash("BLAKE2S-128", truncatedKeyedHashFuncFactory(blake2s.New128, 16)) + RegisterHash("BLAKE2S-256", truncatedKeyedHashFuncFactory(blake2s.New256, 32)) + RegisterHash("BLAKE2B-256-128", truncatedKeyedHashFuncFactory(blake2b.New256, 16)) + RegisterHash("BLAKE2B-256", truncatedKeyedHashFuncFactory(blake2b.New256, 32)) + + RegisterEncryption("NONE", func(f FormattingOptions) (Encryptor, error) { + return nullEncryptor{}, nil + }) + RegisterEncryption("AES-128-CTR", newCTREncryptorFactory(16, aes.NewCipher)) + RegisterEncryption("AES-192-CTR", newCTREncryptorFactory(24, aes.NewCipher)) + RegisterEncryption("AES-256-CTR", newCTREncryptorFactory(32, aes.NewCipher)) + RegisterEncryption("SALSA20", func(f FormattingOptions) (Encryptor, error) { + var k [32]byte + copy(k[:], f.MasterKey[0:32]) + return salsaEncryptor{8, &k}, nil + }) + RegisterEncryption("XSALSA20", func(f FormattingOptions) (Encryptor, error) { + var k [32]byte + copy(k[:], f.MasterKey[0:32]) + return salsaEncryptor{24, &k}, nil + }) +} + +func adjustKey(masterKey []byte, desiredKeySize int) ([]byte, error) { + if len(masterKey) == desiredKeySize { + return masterKey, nil + } + + if desiredKeySize < len(masterKey) { + return masterKey[0:desiredKeySize], nil + } + + return nil, fmt.Errorf("required key too long %v, but only have %v", desiredKeySize, len(masterKey)) +} diff --git a/block/block_formatter_test.go b/block/block_formatter_test.go new file mode 100644 index 000000000..5d4b2c83c --- /dev/null +++ b/block/block_formatter_test.go @@ -0,0 +1,62 @@ +package block + +import ( + "bytes" + "crypto/sha1" + "math/rand" + "testing" +) + +// combinations of hash and encryption that are not compatible. 
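+// XSALSA20 derives its 24-byte nonce from the block hash, so any hash truncated below
+// 24 bytes is rejected with the error recorded here.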
+var incompatibleAlgorithms = map[string]string{ + "BLAKE2B-256-128/XSALSA20": "invalid encryptor: hash too short, expected >=24 bytes, got 16", + "BLAKE2S-128/XSALSA20": "invalid encryptor: hash too short, expected >=24 bytes, got 16", + "HMAC-RIPEMD-160/XSALSA20": "invalid encryptor: hash too short, expected >=24 bytes, got 20", + "HMAC-SHA256-128/XSALSA20": "invalid encryptor: hash too short, expected >=24 bytes, got 16", +} + +func TestFormatters(t *testing.T) { + secret := []byte("secret") + + data := make([]byte, 100) + rand.Read(data) + h0 := sha1.Sum(data) + + for _, hashAlgo := range SupportedHashAlgorithms() { + for _, encryptionAlgo := range SupportedEncryptionAlgorithms() { + h, e, err := CreateHashAndEncryptor(FormattingOptions{ + HMACSecret: secret, + MasterKey: make([]byte, 32), + Hash: hashAlgo, + Encryption: encryptionAlgo, + }) + + if err != nil { + key := hashAlgo + "/" + encryptionAlgo + errmsg := incompatibleAlgorithms[key] + if err.Error() == errmsg { + continue + } + t.Errorf("Algorithm %v not marked as incompatible and failed with %v", key, err) + continue + } + + blockID := h(data) + cipherText, err := e.Encrypt(data, blockID) + if err != nil || cipherText == nil { + t.Errorf("invalid response from Encrypt: %v %v", cipherText, err) + } + + plainText, err := e.Decrypt(cipherText, blockID) + if err != nil || plainText == nil { + t.Errorf("invalid response from Decrypt: %v %v", plainText, err) + } + + h1 := sha1.Sum(plainText) + + if !bytes.Equal(h0[:], h1[:]) { + t.Errorf("Encrypt()/Decrypt() does not round-trip: %x %x", h0, h1) + } + } + } +} diff --git a/block/block_formatting_options.go b/block/block_formatting_options.go new file mode 100644 index 000000000..33520eaf0 --- /dev/null +++ b/block/block_formatting_options.go @@ -0,0 +1,11 @@ +package block + +// FormattingOptions describes the rules for formatting blocks in repository. +type FormattingOptions struct { + Version int `json:"version,omitempty"` // version number, must be "1" + Hash string `json:"hash,omitempty"` // identifier of the hash algorithm used + Encryption string `json:"encryption,omitempty"` // identifier of the encryption algorithm used + HMACSecret []byte `json:"secret,omitempty"` // HMAC secret used to generate encryption keys + MasterKey []byte `json:"masterKey,omitempty"` // master encryption key (SIV-mode encryption only) + MaxPackSize int `json:"maxPackSize,omitempty"` // maximum size of a pack object +} diff --git a/block/block_index_recovery.go b/block/block_index_recovery.go new file mode 100644 index 000000000..c15f5b2d1 --- /dev/null +++ b/block/block_index_recovery.go @@ -0,0 +1,226 @@ +package block + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "hash/crc32" + "reflect" +) + +// RecoverIndexFromPackFile attempts to recover index block entries from a given pack file. +// Pack file length may be provided (if known) to reduce the number of bytes that are read from the storage. 
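+// When commit is true the recovered entries are also added to the pending pack index builder
+// (so a subsequent Flush persists them); otherwise they are only returned to the caller.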
+func (bm *Manager) RecoverIndexFromPackFile(ctx context.Context, packFile string, packFileLength int64, commit bool) ([]Info, error) { + localIndexBytes, err := bm.readPackFileLocalIndex(ctx, packFile, packFileLength) + if err != nil { + return nil, err + } + + ndx, err := openPackIndex(bytes.NewReader(localIndexBytes)) + if err != nil { + return nil, fmt.Errorf("unable to open index in file %v", packFile) + } + defer ndx.Close() //nolint:errcheck + + var recovered []Info + + err = ndx.Iterate("", func(i Info) error { + recovered = append(recovered, i) + if commit { + bm.packIndexBuilder.Add(i) + } + return nil + }) + + return recovered, err +} + +type packBlockPostamble struct { + localIndexIV []byte + localIndexOffset uint32 + localIndexLength uint32 +} + +func (p *packBlockPostamble) toBytes() ([]byte, error) { + // 4 varints + IV + 4 bytes of checksum + 1 byte of postamble length + n := 0 + buf := make([]byte, 4*binary.MaxVarintLen64+len(p.localIndexIV)+4+1) + + n += binary.PutUvarint(buf[n:], uint64(1)) // version flag + n += binary.PutUvarint(buf[n:], uint64(len(p.localIndexIV))) // length of local index IV + copy(buf[n:], p.localIndexIV) + n += len(p.localIndexIV) + n += binary.PutUvarint(buf[n:], uint64(p.localIndexOffset)) + n += binary.PutUvarint(buf[n:], uint64(p.localIndexLength)) + + checksum := crc32.ChecksumIEEE(buf[0:n]) + binary.BigEndian.PutUint32(buf[n:], checksum) + n += 4 + if n > 255 { + return nil, fmt.Errorf("postamble too long: %v", n) + } + + buf[n] = byte(n) + return buf[0 : n+1], nil +} + +// findPostamble detects if a given block of bytes contains a possibly valid postamble, and returns it if so +// NOTE, even if this function returns a postamble, it should not be trusted to be correct, since it's not +// cryptographically signed. this is to facilitate data recovery. 
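+// The layout produced by toBytes is: varint format flag, varint IV length, IV bytes,
+// varint local index offset, varint local index length, 4-byte big-endian CRC32 (IEEE)
+// of the preceding bytes, followed by a single byte holding the postamble length.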
+func findPostamble(b []byte) *packBlockPostamble { + if len(b) == 0 { + // no postamble + return nil + } + + // length of postamble is the last byte + postambleLength := int(b[len(b)-1]) + if postambleLength < 5 { + // too short, must be at least 5 bytes (checksum + own length) + return nil + } + postambleStart := len(b) - 1 - postambleLength + postambleEnd := len(b) - 1 + if postambleStart < 0 { + // invalid last byte + return nil + } + + postambleBytes := b[postambleStart:postambleEnd] + payload, checksumBytes := postambleBytes[0:len(postambleBytes)-4], postambleBytes[len(postambleBytes)-4:] + checksum := binary.BigEndian.Uint32(checksumBytes) + validChecksum := crc32.ChecksumIEEE(payload) + + if checksum != validChecksum { + // invalid checksum, not a valid postamble + return nil + } + + return decodePostamble(payload) +} + +func decodePostamble(payload []byte) *packBlockPostamble { + flags, n := binary.Uvarint(payload) + if n <= 0 { + // invalid flags + return nil + } + if flags != 1 { + // unsupported flag + return nil + } + payload = payload[n:] + + ivLength, n := binary.Uvarint(payload) + if n <= 0 { + // invalid flags + return nil + } + payload = payload[n:] + if ivLength > uint64(len(payload)) { + // invalid IV length + return nil + } + + iv := payload[0:ivLength] + payload = payload[ivLength:] + + off, n := binary.Uvarint(payload) + if n <= 0 { + // invalid offset + return nil + } + payload = payload[n:] + + length, n := binary.Uvarint(payload) + if n <= 0 { + // invalid offset + return nil + } + + return &packBlockPostamble{ + localIndexIV: iv, + localIndexLength: uint32(length), + localIndexOffset: uint32(off), + } +} + +func (bm *Manager) buildLocalIndex(pending packIndexBuilder) ([]byte, error) { + var buf bytes.Buffer + if err := pending.Build(&buf); err != nil { + return nil, fmt.Errorf("unable to build local index: %v", err) + } + + return buf.Bytes(), nil +} + +// appendPackFileIndexRecoveryData appends data designed to help with recovery of pack index in case it gets damaged or lost. +func (bm *Manager) appendPackFileIndexRecoveryData(blockData []byte, pending packIndexBuilder) ([]byte, error) { + // build, encrypt and append local index + localIndexOffset := len(blockData) + localIndex, err := bm.buildLocalIndex(pending) + if err != nil { + return nil, err + } + + localIndexIV := bm.hashData(localIndex) + encryptedLocalIndex, err := bm.encryptor.Encrypt(localIndex, localIndexIV) + if err != nil { + return nil, err + } + + postamble := packBlockPostamble{ + localIndexIV: localIndexIV, + localIndexOffset: uint32(localIndexOffset), + localIndexLength: uint32(len(localIndex)), + } + + blockData = append(blockData, encryptedLocalIndex...) + postambleBytes, err := postamble.toBytes() + if err != nil { + return nil, err + } + + blockData = append(blockData, postambleBytes...) 
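+	// Sanity check: the postamble appended above must decode back to identical values
+	// (verified below), otherwise index recovery from this pack file would be impossible.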
+ + pa2 := findPostamble(blockData) + if pa2 == nil { + log.Fatalf("invalid postamble written, that could not be immediately decoded, it's a bug") + } + + if !reflect.DeepEqual(postamble, *pa2) { + log.Fatalf("postamble did not round-trip: %v %v", postamble, *pa2) + } + + return blockData, nil +} + +func (bm *Manager) readPackFileLocalIndex(ctx context.Context, packFile string, packFileLength int64) ([]byte, error) { + payload, err := bm.st.GetBlock(ctx, packFile, 0, -1) + if err != nil { + return nil, err + } + + postamble := findPostamble(payload) + if postamble == nil { + return nil, fmt.Errorf("unable to find valid postamble in file %v", packFile) + } + + if uint64(postamble.localIndexOffset+postamble.localIndexLength) > uint64(len(payload)) { + // invalid offset/length + return nil, fmt.Errorf("unable to find valid local index in file %v", packFile) + } + + encryptedLocalIndexBytes := payload[postamble.localIndexOffset : postamble.localIndexOffset+postamble.localIndexLength] + if encryptedLocalIndexBytes == nil { + return nil, fmt.Errorf("unable to find valid local index in file %v", packFile) + } + + localIndexBytes, err := bm.decryptAndVerify(encryptedLocalIndexBytes, postamble.localIndexIV) + if err != nil { + return nil, fmt.Errorf("unable to decrypt local index: %v", err) + } + + return localIndexBytes, nil +} diff --git a/block/block_index_recovery_test.go b/block/block_index_recovery_test.go new file mode 100644 index 000000000..287d81af1 --- /dev/null +++ b/block/block_index_recovery_test.go @@ -0,0 +1,90 @@ +package block + +import ( + "context" + "testing" + "time" + + "github.com/kopia/repo/storage" +) + +func TestBlockIndexRecovery(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + block1 := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + block2 := writeBlockAndVerify(ctx, t, bm, seededRandomData(11, 100)) + block3 := writeBlockAndVerify(ctx, t, bm, seededRandomData(12, 100)) + + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + + // delete all index blocks + assertNoError(t, bm.st.ListBlocks(ctx, newIndexBlockPrefix, func(bi storage.BlockMetadata) error { + log.Debugf("deleting %v", bi.BlockID) + return bm.st.DeleteBlock(ctx, bi.BlockID) + })) + + // now with index blocks gone, all blocks appear to not be found + bm = newTestBlockManager(data, keyTime, nil) + verifyBlockNotFound(ctx, t, bm, block1) + verifyBlockNotFound(ctx, t, bm, block2) + verifyBlockNotFound(ctx, t, bm, block3) + + totalRecovered := 0 + + // pass 1 - just list blocks to recover, but don't commit + err := bm.st.ListBlocks(ctx, PackBlockPrefix, func(bi storage.BlockMetadata) error { + infos, err := bm.RecoverIndexFromPackFile(ctx, bi.BlockID, bi.Length, false) + if err != nil { + return err + } + totalRecovered += len(infos) + log.Debugf("recovered %v blocks", len(infos)) + return nil + }) + if err != nil { + t.Errorf("error recovering: %v", err) + } + + if got, want := totalRecovered, 3; got != want { + t.Errorf("invalid # of blocks recovered: %v, want %v", got, want) + } + + // blocks are stil not found + verifyBlockNotFound(ctx, t, bm, block1) + verifyBlockNotFound(ctx, t, bm, block2) + verifyBlockNotFound(ctx, t, bm, block3) + + // pass 2 now pass commit=true to add recovered blocks to index + totalRecovered = 0 + + err = bm.st.ListBlocks(ctx, PackBlockPrefix, func(bi storage.BlockMetadata) error { + infos, err := bm.RecoverIndexFromPackFile(ctx, 
bi.BlockID, bi.Length, true) + if err != nil { + return err + } + totalRecovered += len(infos) + log.Debugf("recovered %v blocks", len(infos)) + return nil + }) + if err != nil { + t.Errorf("error recovering: %v", err) + } + + if got, want := totalRecovered, 3; got != want { + t.Errorf("invalid # of blocks recovered: %v, want %v", got, want) + } + + verifyBlock(ctx, t, bm, block1, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm, block2, seededRandomData(11, 100)) + verifyBlock(ctx, t, bm, block3, seededRandomData(12, 100)) + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + verifyBlock(ctx, t, bm, block1, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm, block2, seededRandomData(11, 100)) + verifyBlock(ctx, t, bm, block3, seededRandomData(12, 100)) +} diff --git a/block/block_manager.go b/block/block_manager.go new file mode 100644 index 000000000..1290959ae --- /dev/null +++ b/block/block_manager.go @@ -0,0 +1,1039 @@ +// Package block implements repository support content-addressable storage blocks. +package block + +import ( + "bytes" + "context" + "crypto/aes" + cryptorand "crypto/rand" + "encoding/hex" + "fmt" + "io" + "math" + "math/rand" + "os" + "reflect" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/kopia/repo/internal/repologging" + "github.com/kopia/repo/storage" +) + +var ( + log = repologging.Logger("kopia/block") + formatLog = repologging.Logger("kopia/block/format") +) + +// PackBlockPrefix is the prefix for all pack storage blocks. +const PackBlockPrefix = "p" + +const ( + parallelFetches = 5 // number of parallel reads goroutines + flushPackIndexTimeout = 10 * time.Minute // time after which all pending indexes are flushes + newIndexBlockPrefix = "n" + defaultMinPreambleLength = 32 + defaultMaxPreambleLength = 32 + defaultPaddingUnit = 4096 + + currentWriteVersion = 1 + minSupportedReadVersion = 0 + maxSupportedReadVersion = currentWriteVersion + + indexLoadAttempts = 10 +) + +// IndexInfo is an information about a single index block managed by Manager. +type IndexInfo struct { + FileName string + Length int64 + Timestamp time.Time +} + +// Manager manages storage blocks at a low level with encryption, deduplication and packaging. +type Manager struct { + Format FormattingOptions + + stats Stats + blockCache *blockCache + listCache *listCache + st storage.Storage + + mu sync.Mutex + locked bool + checkInvariantsOnUnlock bool + + currentPackItems map[string]Info // blocks that are in the pack block currently being built (all inline) + currentPackDataLength int // total length of all items in the current pack block + packIndexBuilder packIndexBuilder // blocks that are in index currently being built (current pack and all packs saved but not committed) + committedBlocks *committedBlockIndex + + disableIndexFlushCount int + flushPackIndexesAfter time.Time // time when those indexes should be flushed + + closed chan struct{} + + writeFormatVersion int32 // format version to write + + maxPackSize int + hasher HashFunc + encryptor Encryptor + + minPreambleLength int + maxPreambleLength int + paddingUnit int + timeNow func() time.Time + + repositoryFormatBytes []byte +} + +// DeleteBlock marks the given blockID as deleted. +// +// NOTE: To avoid race conditions only blocks that cannot be possibly re-created +// should ever be deleted. That means that contents of such blocks should include some element +// of randomness or a contemporaneous timestamp that will never reappear. 
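+// Deletion is recorded as a new index entry with Deleted=true; the underlying pack data
+// is not removed by this call.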
+func (bm *Manager) DeleteBlock(blockID string) error { + bm.lock() + defer bm.unlock() + + log.Debugf("DeleteBlock(%q)", blockID) + + // We have this block in current pack index and it's already deleted there. + if bi, ok := bm.packIndexBuilder[blockID]; ok { + if !bi.Deleted { + if bi.PackFile == "" { + // added and never committed, just forget about it. + delete(bm.packIndexBuilder, blockID) + delete(bm.currentPackItems, blockID) + return nil + } + + // added and committed. + bi2 := *bi + bi2.Deleted = true + bi2.TimestampSeconds = bm.timeNow().Unix() + bm.setPendingBlock(bi2) + } + return nil + } + + // We have this block in current pack index and it's already deleted there. + bi, err := bm.committedBlocks.getBlock(blockID) + if err != nil { + return err + } + + if bi.Deleted { + // already deleted + return nil + } + + // object present but not deleted, mark for deletion and add to pending + bi2 := bi + bi2.Deleted = true + bi2.TimestampSeconds = bm.timeNow().Unix() + bm.setPendingBlock(bi2) + return nil +} + +func (bm *Manager) setPendingBlock(i Info) { + bm.packIndexBuilder.Add(i) + bm.currentPackItems[i.BlockID] = i +} + +func (bm *Manager) addToPackLocked(ctx context.Context, blockID string, data []byte, isDeleted bool) error { + bm.assertLocked() + + data = cloneBytes(data) + bm.currentPackDataLength += len(data) + bm.setPendingBlock(Info{ + Deleted: isDeleted, + BlockID: blockID, + Payload: data, + Length: uint32(len(data)), + TimestampSeconds: bm.timeNow().Unix(), + }) + + if bm.currentPackDataLength >= bm.maxPackSize { + if err := bm.finishPackAndMaybeFlushIndexesLocked(ctx); err != nil { + return err + } + } + + return nil +} + +func (bm *Manager) finishPackAndMaybeFlushIndexesLocked(ctx context.Context) error { + bm.assertLocked() + if err := bm.finishPackLocked(ctx); err != nil { + return err + } + + if bm.timeNow().After(bm.flushPackIndexesAfter) { + if err := bm.flushPackIndexesLocked(ctx); err != nil { + return err + } + } + + return nil +} + +// Stats returns statistics about block manager operations. +func (bm *Manager) Stats() Stats { + return bm.stats +} + +// ResetStats resets statistics to zero values. +func (bm *Manager) ResetStats() { + bm.stats = Stats{} +} + +// DisableIndexFlush increments the counter preventing automatic index flushes. +func (bm *Manager) DisableIndexFlush() { + bm.lock() + defer bm.unlock() + log.Debugf("DisableIndexFlush()") + bm.disableIndexFlushCount++ +} + +// EnableIndexFlush decrements the counter preventing automatic index flushes. +// The flushes will be reenabled when the index drops to zero. 
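+// Each call balances one earlier DisableIndexFlush call; flushes resume once the internal
+// counter returns to zero.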
+func (bm *Manager) EnableIndexFlush() { + bm.lock() + defer bm.unlock() + log.Debugf("EnableIndexFlush()") + bm.disableIndexFlushCount-- +} + +func (bm *Manager) verifyInvariantsLocked() { + bm.assertLocked() + + bm.verifyCurrentPackItemsLocked() + bm.verifyPackIndexBuilderLocked() +} + +func (bm *Manager) verifyCurrentPackItemsLocked() { + for k, cpi := range bm.currentPackItems { + bm.assertInvariant(cpi.BlockID == k, "block ID entry has invalid key: %v %v", cpi.BlockID, k) + bm.assertInvariant(cpi.Deleted || cpi.PackFile == "", "block ID entry has unexpected pack block ID %v: %v", cpi.BlockID, cpi.PackFile) + bm.assertInvariant(cpi.TimestampSeconds != 0, "block has no timestamp: %v", cpi.BlockID) + bi, ok := bm.packIndexBuilder[k] + bm.assertInvariant(ok, "block ID entry not present in pack index builder: %v", cpi.BlockID) + bm.assertInvariant(reflect.DeepEqual(*bi, cpi), "current pack index does not match pack index builder: %v", cpi, *bi) + } +} + +func (bm *Manager) verifyPackIndexBuilderLocked() { + for k, cpi := range bm.packIndexBuilder { + bm.assertInvariant(cpi.BlockID == k, "block ID entry has invalid key: %v %v", cpi.BlockID, k) + if _, ok := bm.currentPackItems[cpi.BlockID]; ok { + // ignore blocks also in currentPackItems + continue + } + if cpi.Deleted { + bm.assertInvariant(cpi.PackFile == "", "block can't be both deleted and have a pack block: %v", cpi.BlockID) + } else { + bm.assertInvariant(cpi.PackFile != "", "block that's not deleted must have a pack block: %+v", cpi) + bm.assertInvariant(cpi.FormatVersion == byte(bm.writeFormatVersion), "block that's not deleted must have a valid format version: %+v", cpi) + } + bm.assertInvariant(cpi.TimestampSeconds != 0, "block has no timestamp: %v", cpi.BlockID) + } +} + +func (bm *Manager) assertInvariant(ok bool, errorMsg string, arg ...interface{}) { + if ok { + return + } + + if len(arg) > 0 { + errorMsg = fmt.Sprintf(errorMsg, arg...) + } + + panic(errorMsg) +} + +func (bm *Manager) startPackIndexLocked() { + bm.currentPackItems = make(map[string]Info) + bm.currentPackDataLength = 0 +} + +func (bm *Manager) flushPackIndexesLocked(ctx context.Context) error { + bm.assertLocked() + + if bm.disableIndexFlushCount > 0 { + log.Debugf("not flushing index because flushes are currently disabled") + return nil + } + + if len(bm.packIndexBuilder) > 0 { + var buf bytes.Buffer + + if err := bm.packIndexBuilder.Build(&buf); err != nil { + return fmt.Errorf("unable to build pack index: %v", err) + } + + data := buf.Bytes() + dataCopy := append([]byte(nil), data...) 
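+		// dataCopy preserves the plaintext index bytes for committedBlocks.addBlock below;
+		// the original buffer is encrypted and uploaded by writePackIndexesNew.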
+ + indexBlockID, err := bm.writePackIndexesNew(ctx, data) + if err != nil { + return err + } + + if err := bm.committedBlocks.addBlock(indexBlockID, dataCopy, true); err != nil { + return fmt.Errorf("unable to add committed block: %v", err) + } + bm.packIndexBuilder = make(packIndexBuilder) + } + + bm.flushPackIndexesAfter = bm.timeNow().Add(flushPackIndexTimeout) + return nil +} + +func (bm *Manager) writePackIndexesNew(ctx context.Context, data []byte) (string, error) { + return bm.encryptAndWriteBlockNotLocked(ctx, data, newIndexBlockPrefix) +} + +func (bm *Manager) finishPackLocked(ctx context.Context) error { + if len(bm.currentPackItems) == 0 { + log.Debugf("no current pack entries") + return nil + } + + if err := bm.writePackBlockLocked(ctx); err != nil { + return fmt.Errorf("error writing pack block: %v", err) + } + + bm.startPackIndexLocked() + return nil +} + +func (bm *Manager) writePackBlockLocked(ctx context.Context) error { + bm.assertLocked() + + blockID := make([]byte, 16) + if _, err := cryptorand.Read(blockID); err != nil { + return fmt.Errorf("unable to read crypto bytes: %v", err) + } + + packFile := fmt.Sprintf("%v%x", PackBlockPrefix, blockID) + + blockData, packFileIndex, err := bm.preparePackDataBlock(packFile) + if err != nil { + return fmt.Errorf("error preparing data block: %v", err) + } + + if len(blockData) > 0 { + if err := bm.writePackFileNotLocked(ctx, packFile, blockData); err != nil { + return fmt.Errorf("can't save pack data block: %v", err) + } + } + + formatLog.Debugf("wrote pack file: %v (%v bytes)", packFile, len(blockData)) + for _, info := range packFileIndex { + bm.packIndexBuilder.Add(*info) + } + + return nil +} + +func (bm *Manager) preparePackDataBlock(packFile string) ([]byte, packIndexBuilder, error) { + formatLog.Debugf("preparing block data with %v items", len(bm.currentPackItems)) + + blockData, err := appendRandomBytes(append([]byte(nil), bm.repositoryFormatBytes...), rand.Intn(bm.maxPreambleLength-bm.minPreambleLength+1)+bm.minPreambleLength) + if err != nil { + return nil, nil, fmt.Errorf("unable to prepare block preamble: %v", err) + } + + packFileIndex := packIndexBuilder{} + for blockID, info := range bm.currentPackItems { + if info.Payload == nil { + continue + } + + var encrypted []byte + encrypted, err = bm.maybeEncryptBlockDataForPacking(info.Payload, info.BlockID) + if err != nil { + return nil, nil, fmt.Errorf("unable to encrypt %q: %v", blockID, err) + } + + formatLog.Debugf("adding %v length=%v deleted=%v", blockID, len(info.Payload), info.Deleted) + + packFileIndex.Add(Info{ + BlockID: blockID, + Deleted: info.Deleted, + FormatVersion: byte(bm.writeFormatVersion), + PackFile: packFile, + PackOffset: uint32(len(blockData)), + Length: uint32(len(info.Payload)), + TimestampSeconds: info.TimestampSeconds, + }) + + blockData = append(blockData, encrypted...) 
+ } + + if len(packFileIndex) == 0 { + return nil, nil, nil + } + + if bm.paddingUnit > 0 { + if missing := bm.paddingUnit - (len(blockData) % bm.paddingUnit); missing > 0 { + blockData, err = appendRandomBytes(blockData, missing) + if err != nil { + return nil, nil, fmt.Errorf("unable to prepare block postamble: %v", err) + } + } + } + + origBlockLength := len(blockData) + blockData, err = bm.appendPackFileIndexRecoveryData(blockData, packFileIndex) + + formatLog.Debugf("finished block %v bytes (%v bytes index)", len(blockData), len(blockData)-origBlockLength) + return blockData, packFileIndex, err +} + +func (bm *Manager) maybeEncryptBlockDataForPacking(data []byte, blockID string) ([]byte, error) { + if bm.writeFormatVersion == 0 { + // in v0 the entire block is encrypted together later on + return data, nil + } + iv, err := getPackedBlockIV(blockID) + if err != nil { + return nil, fmt.Errorf("unable to get packed block IV for %q: %v", blockID, err) + } + return bm.encryptor.Encrypt(data, iv) +} + +func appendRandomBytes(b []byte, count int) ([]byte, error) { + rnd := make([]byte, count) + if _, err := io.ReadFull(cryptorand.Reader, rnd); err != nil { + return nil, err + } + + return append(b, rnd...), nil +} + +// IndexBlocks returns the list of active index blocks. +func (bm *Manager) IndexBlocks(ctx context.Context) ([]IndexInfo, error) { + return bm.listCache.listIndexBlocks(ctx) +} + +func (bm *Manager) loadPackIndexesUnlocked(ctx context.Context) ([]IndexInfo, bool, error) { + nextSleepTime := 100 * time.Millisecond + + for i := 0; i < indexLoadAttempts; i++ { + if err := ctx.Err(); err != nil { + return nil, false, err + } + + if i > 0 { + bm.listCache.deleteListCache(ctx) + log.Debugf("encountered NOT_FOUND when loading, sleeping %v before retrying #%v", nextSleepTime, i) + time.Sleep(nextSleepTime) + nextSleepTime *= 2 + } + + blocks, err := bm.listCache.listIndexBlocks(ctx) + if err != nil { + return nil, false, err + } + + err = bm.tryLoadPackIndexBlocksUnlocked(ctx, blocks) + if err == nil { + var blockIDs []string + for _, b := range blocks { + blockIDs = append(blockIDs, b.FileName) + } + var updated bool + updated, err = bm.committedBlocks.use(blockIDs) + if err != nil { + return nil, false, err + } + return blocks, updated, nil + } + if err != storage.ErrBlockNotFound { + return nil, false, err + } + } + + return nil, false, fmt.Errorf("unable to load pack indexes despite %v retries", indexLoadAttempts) +} + +func (bm *Manager) tryLoadPackIndexBlocksUnlocked(ctx context.Context, blocks []IndexInfo) error { + ch, unprocessedIndexesSize, err := bm.unprocessedIndexBlocksUnlocked(blocks) + if err != nil { + return err + } + if len(ch) == 0 { + return nil + } + + log.Infof("downloading %v new index blocks (%v bytes)...", len(ch), unprocessedIndexesSize) + var wg sync.WaitGroup + + errors := make(chan error, parallelFetches) + + for i := 0; i < parallelFetches; i++ { + wg.Add(1) + go func() { + defer wg.Done() + + for indexBlockID := range ch { + data, err := bm.getPhysicalBlockInternal(ctx, indexBlockID) + if err != nil { + errors <- err + return + } + + if err := bm.committedBlocks.addBlock(indexBlockID, data, false); err != nil { + errors <- fmt.Errorf("unable to add to committed block cache: %v", err) + return + } + } + }() + } + + wg.Wait() + close(errors) + + // Propagate async errors, if any. 
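+	// Only the first error is returned; any remaining errors in the channel are discarded.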
+ for err := range errors { + return err + } + log.Infof("Index blocks downloaded.") + + return nil +} + +// unprocessedIndexBlocksUnlocked returns a closed channel filled with block IDs that are not in committedBlocks cache. +func (bm *Manager) unprocessedIndexBlocksUnlocked(blocks []IndexInfo) (<-chan string, int64, error) { + var totalSize int64 + ch := make(chan string, len(blocks)) + for _, block := range blocks { + has, err := bm.committedBlocks.cache.hasIndexBlockID(block.FileName) + if err != nil { + return nil, 0, err + } + if has { + log.Debugf("index block %q already in cache, skipping", block.FileName) + continue + } + ch <- block.FileName + totalSize += block.Length + } + close(ch) + return ch, totalSize, nil +} + +// Close closes the block manager. +func (bm *Manager) Close() { + bm.blockCache.close() + close(bm.closed) +} + +// ListBlocks returns IDs of blocks matching given prefix. +func (bm *Manager) ListBlocks(prefix string) ([]string, error) { + bm.lock() + defer bm.unlock() + + var result []string + + appendToResult := func(i Info) error { + if i.Deleted || !strings.HasPrefix(i.BlockID, prefix) { + return nil + } + if bi, ok := bm.packIndexBuilder[i.BlockID]; ok && bi.Deleted { + return nil + } + result = append(result, i.BlockID) + return nil + } + + for _, bi := range bm.packIndexBuilder { + _ = appendToResult(*bi) + } + + _ = bm.committedBlocks.listBlocks(prefix, appendToResult) + return result, nil +} + +// ListBlockInfos returns the metadata about blocks with a given prefix and kind. +func (bm *Manager) ListBlockInfos(prefix string, includeDeleted bool) ([]Info, error) { + bm.lock() + defer bm.unlock() + + var result []Info + + appendToResult := func(i Info) error { + if (i.Deleted && !includeDeleted) || !strings.HasPrefix(i.BlockID, prefix) { + return nil + } + if bi, ok := bm.packIndexBuilder[i.BlockID]; ok && bi.Deleted { + return nil + } + result = append(result, i) + return nil + } + + for _, bi := range bm.packIndexBuilder { + _ = appendToResult(*bi) + } + + _ = bm.committedBlocks.listBlocks(prefix, appendToResult) + + return result, nil +} + +// Flush completes writing any pending packs and writes pack indexes to the underlyign storage. +func (bm *Manager) Flush(ctx context.Context) error { + bm.lock() + defer bm.unlock() + + if err := bm.finishPackLocked(ctx); err != nil { + return fmt.Errorf("error writing pending block: %v", err) + } + + if err := bm.flushPackIndexesLocked(ctx); err != nil { + return fmt.Errorf("error flushing indexes: %v", err) + } + + return nil +} + +// RewriteBlock causes reads and re-writes a given block using the most recent format. +func (bm *Manager) RewriteBlock(ctx context.Context, blockID string) error { + bi, err := bm.getBlockInfo(blockID) + if err != nil { + return err + } + + data, err := bm.getBlockContentsUnlocked(ctx, bi) + if err != nil { + return err + } + + bm.lock() + defer bm.unlock() + return bm.addToPackLocked(ctx, blockID, data, bi.Deleted) +} + +// WriteBlock saves a given block of data to a pack group with a provided name and returns a blockID +// that's based on the contents of data written. 
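+// Writing the same data with the same prefix therefore yields the same blockID and is
+// deduplicated against blocks already tracked by the index.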
+func (bm *Manager) WriteBlock(ctx context.Context, data []byte, prefix string) (string, error) { + if err := validatePrefix(prefix); err != nil { + return "", err + } + blockID := prefix + hex.EncodeToString(bm.hashData(data)) + + // block already tracked + if bi, err := bm.getBlockInfo(blockID); err == nil { + if !bi.Deleted { + return blockID, nil + } + } + + log.Debugf("WriteBlock(%q) - new", blockID) + bm.lock() + defer bm.unlock() + err := bm.addToPackLocked(ctx, blockID, data, false) + return blockID, err +} + +func validatePrefix(prefix string) error { + switch len(prefix) { + case 0: + return nil + case 1: + if prefix[0] >= 'g' && prefix[0] <= 'z' { + return nil + } + } + + return fmt.Errorf("invalid prefix, must be a empty or single letter between 'g' and 'z'") +} + +func (bm *Manager) writePackFileNotLocked(ctx context.Context, packFile string, data []byte) error { + atomic.AddInt32(&bm.stats.WrittenBlocks, 1) + atomic.AddInt64(&bm.stats.WrittenBytes, int64(len(data))) + bm.listCache.deleteListCache(ctx) + return bm.st.PutBlock(ctx, packFile, data) +} + +func (bm *Manager) encryptAndWriteBlockNotLocked(ctx context.Context, data []byte, prefix string) (string, error) { + hash := bm.hashData(data) + physicalBlockID := prefix + hex.EncodeToString(hash) + + // Encrypt the block in-place. + atomic.AddInt64(&bm.stats.EncryptedBytes, int64(len(data))) + data2, err := bm.encryptor.Encrypt(data, hash) + if err != nil { + return "", err + } + + atomic.AddInt32(&bm.stats.WrittenBlocks, 1) + atomic.AddInt64(&bm.stats.WrittenBytes, int64(len(data))) + bm.listCache.deleteListCache(ctx) + if err := bm.st.PutBlock(ctx, physicalBlockID, data2); err != nil { + return "", err + } + + return physicalBlockID, nil +} + +func (bm *Manager) hashData(data []byte) []byte { + // Hash the block and compute encryption key. + blockID := bm.hasher(data) + atomic.AddInt32(&bm.stats.HashedBlocks, 1) + atomic.AddInt64(&bm.stats.HashedBytes, int64(len(data))) + return blockID +} + +func cloneBytes(b []byte) []byte { + return append([]byte{}, b...) +} + +// GetBlock gets the contents of a given block. If the block is not found returns blob.ErrBlockNotFound. +func (bm *Manager) GetBlock(ctx context.Context, blockID string) ([]byte, error) { + bi, err := bm.getBlockInfo(blockID) + if err != nil { + return nil, err + } + + if bi.Deleted { + return nil, storage.ErrBlockNotFound + } + + return bm.getBlockContentsUnlocked(ctx, bi) +} + +func (bm *Manager) getBlockInfo(blockID string) (Info, error) { + bm.lock() + defer bm.unlock() + + // check added blocks, not written to any packs. + if bi, ok := bm.currentPackItems[blockID]; ok { + return bi, nil + } + + // added blocks, written to packs but not yet added to indexes + if bi, ok := bm.packIndexBuilder[blockID]; ok { + return *bi, nil + } + + // read from committed block index + return bm.committedBlocks.getBlock(blockID) +} + +// BlockInfo returns information about a single block. +func (bm *Manager) BlockInfo(ctx context.Context, blockID string) (Info, error) { + bi, err := bm.getBlockInfo(blockID) + if err != nil { + log.Debugf("BlockInfo(%q) - error %v", err) + return Info{}, err + } + + if bi.Deleted { + log.Debugf("BlockInfo(%q) - deleted", blockID) + } else { + log.Debugf("BlockInfo(%q) - exists in %v", blockID, bi.PackFile) + } + + return bi, err +} + +// FindUnreferencedStorageFiles returns the list of unreferenced storage blocks. 
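+// A pack file is reported as unreferenced when no index entry, including deleted ones,
+// points at it.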
+func (bm *Manager) FindUnreferencedStorageFiles(ctx context.Context) ([]storage.BlockMetadata, error) { + infos, err := bm.ListBlockInfos("", true) + if err != nil { + return nil, fmt.Errorf("unable to list index blocks: %v", err) + } + + usedPackBlocks := findPackBlocksInUse(infos) + + var unused []storage.BlockMetadata + err = bm.st.ListBlocks(ctx, PackBlockPrefix, func(bi storage.BlockMetadata) error { + u := usedPackBlocks[bi.BlockID] + if u > 0 { + log.Debugf("pack %v, in use by %v blocks", bi.BlockID, u) + return nil + } + + unused = append(unused, bi) + return nil + }) + if err != nil { + return nil, fmt.Errorf("error listing storage blocks: %v", err) + } + + return unused, nil +} + +func findPackBlocksInUse(infos []Info) map[string]int { + packUsage := map[string]int{} + + for _, bi := range infos { + packUsage[bi.PackFile]++ + } + + return packUsage +} + +func (bm *Manager) getBlockContentsUnlocked(ctx context.Context, bi Info) ([]byte, error) { + if bi.Payload != nil { + return cloneBytes(bi.Payload), nil + } + + payload, err := bm.blockCache.getContentBlock(ctx, bi.BlockID, bi.PackFile, int64(bi.PackOffset), int64(bi.Length)) + if err != nil { + return nil, err + } + + atomic.AddInt32(&bm.stats.ReadBlocks, 1) + atomic.AddInt64(&bm.stats.ReadBytes, int64(len(payload))) + + iv, err := getPackedBlockIV(bi.BlockID) + if err != nil { + return nil, err + } + + decrypted, err := bm.decryptAndVerify(payload, iv) + if err != nil { + return nil, fmt.Errorf("invalid checksum at %v offset %v length %v: %v", bi.PackFile, bi.PackOffset, len(payload), err) + } + + return decrypted, nil +} + +func (bm *Manager) decryptAndVerify(encrypted []byte, iv []byte) ([]byte, error) { + decrypted, err := bm.encryptor.Decrypt(encrypted, iv) + if err != nil { + return nil, err + } + + atomic.AddInt64(&bm.stats.DecryptedBytes, int64(len(decrypted))) + + // Since the encryption key is a function of data, we must be able to generate exactly the same key + // after decrypting the content. This serves as a checksum. + return decrypted, bm.verifyChecksum(decrypted, iv) +} + +func (bm *Manager) getPhysicalBlockInternal(ctx context.Context, blockID string) ([]byte, error) { + payload, err := bm.blockCache.getContentBlock(ctx, blockID, blockID, 0, -1) + if err != nil { + return nil, err + } + + iv, err := getPhysicalBlockIV(blockID) + if err != nil { + return nil, err + } + + atomic.AddInt32(&bm.stats.ReadBlocks, 1) + atomic.AddInt64(&bm.stats.ReadBytes, int64(len(payload))) + + payload, err = bm.encryptor.Decrypt(payload, iv) + atomic.AddInt64(&bm.stats.DecryptedBytes, int64(len(payload))) + if err != nil { + return nil, err + } + + // Since the encryption key is a function of data, we must be able to generate exactly the same key + // after decrypting the content. This serves as a checksum. 
+ if err := bm.verifyChecksum(payload, iv); err != nil { + return nil, err + } + + return payload, nil +} + +func getPackedBlockIV(blockID string) ([]byte, error) { + return hex.DecodeString(blockID[len(blockID)-(aes.BlockSize*2):]) +} + +func getPhysicalBlockIV(s string) ([]byte, error) { + if p := strings.Index(s, "-"); p >= 0 { + s = s[0:p] + } + return hex.DecodeString(s[len(s)-(aes.BlockSize*2):]) +} + +func (bm *Manager) verifyChecksum(data []byte, blockID []byte) error { + expected := bm.hasher(data) + expected = expected[len(expected)-aes.BlockSize:] + if !bytes.HasSuffix(blockID, expected) { + atomic.AddInt32(&bm.stats.InvalidBlocks, 1) + return fmt.Errorf("invalid checksum for blob %x, expected %x", blockID, expected) + } + + atomic.AddInt32(&bm.stats.ValidBlocks, 1) + return nil +} + +func (bm *Manager) lock() { + bm.mu.Lock() + bm.locked = true +} + +func (bm *Manager) unlock() { + if bm.checkInvariantsOnUnlock { + bm.verifyInvariantsLocked() + } + + bm.locked = false + bm.mu.Unlock() +} + +func (bm *Manager) assertLocked() { + if !bm.locked { + panic("must be locked") + } +} + +// Refresh reloads the committed block indexes. +func (bm *Manager) Refresh(ctx context.Context) (bool, error) { + bm.mu.Lock() + defer bm.mu.Unlock() + + log.Debugf("Refresh started") + t0 := time.Now() + _, updated, err := bm.loadPackIndexesUnlocked(ctx) + log.Debugf("Refresh completed in %v and updated=%v", time.Since(t0), updated) + return updated, err +} + +type cachedList struct { + Timestamp time.Time `json:"timestamp"` + Blocks []IndexInfo `json:"blocks"` +} + +// listIndexBlocksFromStorage returns the list of index blocks in the given storage. +// The list of blocks is not guaranteed to be sorted. +func listIndexBlocksFromStorage(ctx context.Context, st storage.Storage) ([]IndexInfo, error) { + snapshot, err := storage.ListAllBlocksConsistent(ctx, st, newIndexBlockPrefix, math.MaxInt32) + if err != nil { + return nil, err + } + + var results []IndexInfo + for _, it := range snapshot { + ii := IndexInfo{ + FileName: it.BlockID, + Timestamp: it.Timestamp, + Length: it.Length, + } + results = append(results, ii) + } + + return results, err +} + +// NewManager creates new block manager with given packing options and a formatter. 
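+// A hedged construction sketch (st is assumed to be an open storage.Storage; secret and
+// repoFormatBytes are caller-provided placeholders, and the option values shown are
+// illustrative rather than defaults introduced by this change):
+//
+//	bm, err := block.NewManager(ctx, st, block.FormattingOptions{
+//		Version:     1,
+//		Hash:        "HMAC-SHA256",
+//		Encryption:  "NONE",
+//		MaxPackSize: 1 << 20,
+//		HMACSecret:  secret,
+//	}, block.CachingOptions{}, repoFormatBytes)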
+func NewManager(ctx context.Context, st storage.Storage, f FormattingOptions, caching CachingOptions, repositoryFormatBytes []byte) (*Manager, error) { + return newManagerWithOptions(ctx, st, f, caching, time.Now, repositoryFormatBytes) +} + +func newManagerWithOptions(ctx context.Context, st storage.Storage, f FormattingOptions, caching CachingOptions, timeNow func() time.Time, repositoryFormatBytes []byte) (*Manager, error) { + if f.Version < minSupportedReadVersion || f.Version > currentWriteVersion { + return nil, fmt.Errorf("can't handle repositories created using version %v (min supported %v, max supported %v)", f.Version, minSupportedReadVersion, maxSupportedReadVersion) + } + + hasher, encryptor, err := CreateHashAndEncryptor(f) + if err != nil { + return nil, err + } + + blockCache, err := newBlockCache(ctx, st, caching) + if err != nil { + return nil, fmt.Errorf("unable to initialize block cache: %v", err) + } + + listCache, err := newListCache(ctx, st, caching) + if err != nil { + return nil, fmt.Errorf("unable to initialize list cache: %v", err) + } + + blockIndex, err := newCommittedBlockIndex(caching) + if err != nil { + return nil, fmt.Errorf("unable to initialize committed block index: %v", err) + } + + m := &Manager{ + Format: f, + timeNow: timeNow, + flushPackIndexesAfter: timeNow().Add(flushPackIndexTimeout), + maxPackSize: f.MaxPackSize, + encryptor: encryptor, + hasher: hasher, + currentPackItems: make(map[string]Info), + packIndexBuilder: make(packIndexBuilder), + committedBlocks: blockIndex, + minPreambleLength: defaultMinPreambleLength, + maxPreambleLength: defaultMaxPreambleLength, + paddingUnit: defaultPaddingUnit, + blockCache: blockCache, + listCache: listCache, + st: st, + repositoryFormatBytes: repositoryFormatBytes, + + writeFormatVersion: int32(f.Version), + closed: make(chan struct{}), + checkInvariantsOnUnlock: os.Getenv("KOPIA_VERIFY_INVARIANTS") != "", + } + + m.startPackIndexLocked() + + if err := m.CompactIndexes(ctx, autoCompactionOptions); err != nil { + return nil, fmt.Errorf("error initializing block manager: %v", err) + } + + return m, nil +} + +func CreateHashAndEncryptor(f FormattingOptions) (HashFunc, Encryptor, error) { + h, err := createHashFunc(f) + if err != nil { + return nil, nil, fmt.Errorf("unable to create hash: %v", err) + } + + e, err := createEncryptor(f) + if err != nil { + return nil, nil, fmt.Errorf("unable to create encryptor: %v", err) + } + + blockID := h(nil) + _, err = e.Encrypt(nil, blockID) + if err != nil { + return nil, nil, fmt.Errorf("invalid encryptor: %v", err) + } + + return h, e, nil +} + +func createHashFunc(f FormattingOptions) (HashFunc, error) { + h := hashFunctions[f.Hash] + if h == nil { + return nil, fmt.Errorf("unknown hash function %v", f.Hash) + } + + hashFunc, err := h(f) + if err != nil { + return nil, fmt.Errorf("unable to initialize hash: %v", err) + } + + if hashFunc == nil { + return nil, fmt.Errorf("nil hash function returned for %v", f.Hash) + } + + return hashFunc, nil +} + +func createEncryptor(f FormattingOptions) (Encryptor, error) { + e := encryptors[f.Encryption] + if e == nil { + return nil, fmt.Errorf("unknown encryption algorithm: %v", f.Encryption) + } + + return e(f) +} diff --git a/block/block_manager_compaction.go b/block/block_manager_compaction.go new file mode 100644 index 000000000..33e6c41a5 --- /dev/null +++ b/block/block_manager_compaction.go @@ -0,0 +1,148 @@ +package block + +import ( + "bytes" + "context" + "fmt" + "time" + + "github.com/pkg/errors" +) + +var 
autoCompactionOptions = CompactOptions{ + MinSmallBlocks: 4 * parallelFetches, + MaxSmallBlocks: 64, +} + +// CompactOptions provides options for compaction +type CompactOptions struct { + MinSmallBlocks int + MaxSmallBlocks int + AllBlocks bool + SkipDeletedOlderThan time.Duration +} + +// CompactIndexes performs compaction of index blocks ensuring that # of small blocks is between minSmallBlockCount and maxSmallBlockCount +func (bm *Manager) CompactIndexes(ctx context.Context, opt CompactOptions) error { + log.Debugf("CompactIndexes(%+v)", opt) + if opt.MaxSmallBlocks < opt.MinSmallBlocks { + return fmt.Errorf("invalid block counts") + } + + indexBlocks, _, err := bm.loadPackIndexesUnlocked(ctx) + if err != nil { + return errors.Wrap(err, "error loading indexes") + } + + blocksToCompact := bm.getBlocksToCompact(indexBlocks, opt) + + if err := bm.compactAndDeleteIndexBlocks(ctx, blocksToCompact, opt); err != nil { + log.Warningf("error performing quick compaction: %v", err) + } + + return nil +} + +func (bm *Manager) getBlocksToCompact(indexBlocks []IndexInfo, opt CompactOptions) []IndexInfo { + var nonCompactedBlocks []IndexInfo + var totalSizeNonCompactedBlocks int64 + + var verySmallBlocks []IndexInfo + var totalSizeVerySmallBlocks int64 + + var mediumSizedBlocks []IndexInfo + var totalSizeMediumSizedBlocks int64 + + for _, b := range indexBlocks { + if b.Length > int64(bm.maxPackSize) && !opt.AllBlocks { + continue + } + + nonCompactedBlocks = append(nonCompactedBlocks, b) + if b.Length < int64(bm.maxPackSize/20) { + verySmallBlocks = append(verySmallBlocks, b) + totalSizeVerySmallBlocks += b.Length + } else { + mediumSizedBlocks = append(mediumSizedBlocks, b) + totalSizeMediumSizedBlocks += b.Length + } + totalSizeNonCompactedBlocks += b.Length + } + + if len(nonCompactedBlocks) < opt.MinSmallBlocks { + // current count is below min allowed - nothing to do + formatLog.Debugf("no small blocks to compact") + return nil + } + + if len(verySmallBlocks) > len(nonCompactedBlocks)/2 && len(mediumSizedBlocks)+1 < opt.MinSmallBlocks { + formatLog.Debugf("compacting %v very small blocks", len(verySmallBlocks)) + return verySmallBlocks + } + + formatLog.Debugf("compacting all %v non-compacted blocks", len(nonCompactedBlocks)) + return nonCompactedBlocks +} + +func (bm *Manager) compactAndDeleteIndexBlocks(ctx context.Context, indexBlocks []IndexInfo, opt CompactOptions) error { + if len(indexBlocks) <= 1 { + return nil + } + formatLog.Debugf("compacting %v blocks", len(indexBlocks)) + t0 := time.Now() + + bld := make(packIndexBuilder) + for _, indexBlock := range indexBlocks { + if err := bm.addIndexBlocksToBuilder(ctx, bld, indexBlock, opt); err != nil { + return err + } + } + + var buf bytes.Buffer + if err := bld.Build(&buf); err != nil { + return errors.Wrap(err, "unable to build an index") + } + + compactedIndexBlock, err := bm.writePackIndexesNew(ctx, buf.Bytes()) + if err != nil { + return errors.Wrap(err, "unable to write compacted indexes") + } + + formatLog.Debugf("wrote compacted index (%v bytes) in %v", compactedIndexBlock, time.Since(t0)) + + for _, indexBlock := range indexBlocks { + if indexBlock.FileName == compactedIndexBlock { + continue + } + + bm.listCache.deleteListCache(ctx) + if err := bm.st.DeleteBlock(ctx, indexBlock.FileName); err != nil { + log.Warningf("unable to delete compacted block %q: %v", indexBlock.FileName, err) + } + } + + return nil +} + +func (bm *Manager) addIndexBlocksToBuilder(ctx context.Context, bld packIndexBuilder, indexBlock IndexInfo, opt 
CompactOptions) error { + data, err := bm.getPhysicalBlockInternal(ctx, indexBlock.FileName) + if err != nil { + return err + } + + index, err := openPackIndex(bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("unable to open index block %q: %v", indexBlock, err) + } + + _ = index.Iterate("", func(i Info) error { + if i.Deleted && opt.SkipDeletedOlderThan > 0 && time.Since(i.Timestamp()) > opt.SkipDeletedOlderThan { + log.Debugf("skipping block %v deleted at %v", i.BlockID, i.Timestamp()) + return nil + } + bld.Add(i) + return nil + }) + + return nil +} diff --git a/block/block_manager_test.go b/block/block_manager_test.go new file mode 100644 index 000000000..354786528 --- /dev/null +++ b/block/block_manager_test.go @@ -0,0 +1,909 @@ +package block + +import ( + "bytes" + "context" + "crypto/hmac" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "math/rand" + "reflect" + "strings" + "sync" + "testing" + "time" + + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" + logging "github.com/op/go-logging" +) + +const ( + maxPackSize = 2000 +) + +var fakeTime = time.Date(2017, 1, 1, 0, 0, 0, 0, time.UTC) +var hmacSecret = []byte{1, 2, 3} + +func init() { + logging.SetLevel(logging.DEBUG, "") +} + +func TestBlockManagerEmptyFlush(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + bm.Flush(ctx) + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func TestBlockZeroBytes1(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + blockID := writeBlockAndVerify(ctx, t, bm, []byte{}) + bm.Flush(ctx) + if got, want := len(data), 2; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + dumpBlockManagerData(t, data) + bm = newTestBlockManager(data, keyTime, nil) + verifyBlock(ctx, t, bm, blockID, []byte{}) +} + +func TestBlockZeroBytes2(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 10)) + writeBlockAndVerify(ctx, t, bm, []byte{}) + bm.Flush(ctx) + if got, want := len(data), 2; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + dumpBlockManagerData(t, data) + } +} + +func TestBlockManagerSmallBlockWrites(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + + for i := 0; i < 100; i++ { + writeBlockAndVerify(ctx, t, bm, seededRandomData(i, 10)) + } + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + bm.Flush(ctx) + if got, want := len(data), 2; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func TestBlockManagerDedupesPendingBlocks(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + + for i := 0; i < 100; i++ { + writeBlockAndVerify(ctx, t, bm, seededRandomData(0, 999)) + } + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + bm.Flush(ctx) + if got, want 
:= len(data), 2; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func TestBlockManagerDedupesPendingAndUncommittedBlocks(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + + // no writes here, all data fits in a single pack. + writeBlockAndVerify(ctx, t, bm, seededRandomData(0, 950)) + writeBlockAndVerify(ctx, t, bm, seededRandomData(1, 950)) + writeBlockAndVerify(ctx, t, bm, seededRandomData(2, 10)) + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + + // no writes here + writeBlockAndVerify(ctx, t, bm, seededRandomData(0, 950)) + writeBlockAndVerify(ctx, t, bm, seededRandomData(1, 950)) + writeBlockAndVerify(ctx, t, bm, seededRandomData(2, 10)) + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + bm.Flush(ctx) + + // this flushes the pack block + index block + if got, want := len(data), 2; got != want { + dumpBlockManagerData(t, data) + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func TestBlockManagerEmpty(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + + noSuchBlockID := string(hashValue([]byte("foo"))) + + b, err := bm.GetBlock(ctx, noSuchBlockID) + if err != storage.ErrBlockNotFound { + t.Errorf("unexpected error when getting non-existent block: %v, %v", b, err) + } + + bi, err := bm.BlockInfo(ctx, noSuchBlockID) + if err != storage.ErrBlockNotFound { + t.Errorf("unexpected error when getting non-existent block info: %v, %v", bi, err) + } + + if got, want := len(data), 0; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func verifyActiveIndexBlockCount(ctx context.Context, t *testing.T, bm *Manager, expected int) { + t.Helper() + + blks, err := bm.IndexBlocks(ctx) + if err != nil { + t.Errorf("error listing active index blocks: %v", err) + return + } + + if got, want := len(blks), expected; got != want { + t.Errorf("unexpected number of active index blocks %v, expected %v (%v)", got, want, blks) + } +} +func TestBlockManagerInternalFlush(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + + for i := 0; i < 100; i++ { + b := make([]byte, 25) + rand.Read(b) + writeBlockAndVerify(ctx, t, bm, b) + } + + // 1 data block written, but no index yet. + if got, want := len(data), 1; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + + // do it again - should be 2 blocks + 1000 bytes pending. + for i := 0; i < 100; i++ { + b := make([]byte, 25) + rand.Read(b) + writeBlockAndVerify(ctx, t, bm, b) + } + + // 2 data blocks written, but no index yet. + if got, want := len(data), 2; got != want { + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } + + bm.Flush(ctx) + + // third block gets written, followed by index. 
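+	// (two pack blocks written by the loops above, plus the flushed pending pack and one index block)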
+ if got, want := len(data), 4; got != want { + dumpBlockManagerData(t, data) + t.Errorf("unexpected number of blocks: %v, wanted %v", got, want) + } +} + +func TestBlockManagerWriteMultiple(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + timeFunc := fakeTimeNowWithAutoAdvance(fakeTime, 1*time.Second) + bm := newTestBlockManager(data, keyTime, timeFunc) + + var blockIDs []string + + for i := 0; i < 5000; i++ { + //t.Logf("i=%v", i) + b := seededRandomData(i, i%113) + blkID, err := bm.WriteBlock(ctx, b, "") + if err != nil { + t.Errorf("err: %v", err) + } + + blockIDs = append(blockIDs, blkID) + + if i%17 == 0 { + //t.Logf("flushing %v", i) + if err := bm.Flush(ctx); err != nil { + t.Fatalf("error flushing: %v", err) + } + //dumpBlockManagerData(t, data) + } + + if i%41 == 0 { + //t.Logf("opening new manager: %v", i) + if err := bm.Flush(ctx); err != nil { + t.Fatalf("error flushing: %v", err) + } + //t.Logf("data block count: %v", len(data)) + //dumpBlockManagerData(t, data) + bm = newTestBlockManager(data, keyTime, timeFunc) + } + + pos := rand.Intn(len(blockIDs)) + if _, err := bm.GetBlock(ctx, blockIDs[pos]); err != nil { + dumpBlockManagerData(t, data) + t.Fatalf("can't read block %q: %v", blockIDs[pos], err) + continue + } + } +} + +// This is regression test for a bug where we would corrupt data when encryption +// was done in place and clobbered pending data in memory. +func TestBlockManagerFailedToWritePack(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + st := storagetesting.NewMapStorage(data, keyTime, nil) + faulty := &storagetesting.FaultyStorage{ + Base: st, + } + st = faulty + + bm, err := newManagerWithOptions(context.Background(), st, FormattingOptions{ + Version: 1, + Hash: "HMAC-SHA256-128", + Encryption: "AES-256-CTR", + MaxPackSize: maxPackSize, + HMACSecret: []byte("foo"), + MasterKey: []byte("0123456789abcdef0123456789abcdef"), + }, CachingOptions{}, fakeTimeNowFrozen(fakeTime), nil) + if err != nil { + t.Fatalf("can't create bm: %v", err) + } + logging.SetLevel(logging.DEBUG, "faulty-storage") + + faulty.Faults = map[string][]*storagetesting.Fault{ + "PutBlock": { + {Err: errors.New("booboo")}, + }, + } + + b1, err := bm.WriteBlock(ctx, seededRandomData(1, 10), "") + if err != nil { + t.Fatalf("can't create block: %v", err) + } + + if err := bm.Flush(ctx); err != nil { + t.Logf("expected flush error: %v", err) + } + + verifyBlock(ctx, t, bm, b1, seededRandomData(1, 10)) +} + +func TestBlockManagerConcurrency(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + preexistingBlock := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + bm.Flush(ctx) + + dumpBlockManagerData(t, data) + bm1 := newTestBlockManager(data, keyTime, nil) + bm2 := newTestBlockManager(data, keyTime, nil) + bm3 := newTestBlockManager(data, keyTime, fakeTimeNowWithAutoAdvance(fakeTime.Add(1), 1*time.Second)) + + // all bm* can see pre-existing block + verifyBlock(ctx, t, bm1, preexistingBlock, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm2, preexistingBlock, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm3, preexistingBlock, seededRandomData(10, 100)) + + // write the same block in all managers. 
+ sharedBlock := writeBlockAndVerify(ctx, t, bm1, seededRandomData(20, 100)) + writeBlockAndVerify(ctx, t, bm2, seededRandomData(20, 100)) + writeBlockAndVerify(ctx, t, bm3, seededRandomData(20, 100)) + + // write unique block per manager. + bm1block := writeBlockAndVerify(ctx, t, bm1, seededRandomData(31, 100)) + bm2block := writeBlockAndVerify(ctx, t, bm2, seededRandomData(32, 100)) + bm3block := writeBlockAndVerify(ctx, t, bm3, seededRandomData(33, 100)) + + // make sure they can't see each other's unflushed blocks. + verifyBlockNotFound(ctx, t, bm1, bm2block) + verifyBlockNotFound(ctx, t, bm1, bm3block) + verifyBlockNotFound(ctx, t, bm2, bm1block) + verifyBlockNotFound(ctx, t, bm2, bm3block) + verifyBlockNotFound(ctx, t, bm3, bm1block) + verifyBlockNotFound(ctx, t, bm3, bm2block) + + // now flush all writers, they still can't see each others' data. + bm1.Flush(ctx) + bm2.Flush(ctx) + bm3.Flush(ctx) + verifyBlockNotFound(ctx, t, bm1, bm2block) + verifyBlockNotFound(ctx, t, bm1, bm3block) + verifyBlockNotFound(ctx, t, bm2, bm1block) + verifyBlockNotFound(ctx, t, bm2, bm3block) + verifyBlockNotFound(ctx, t, bm3, bm1block) + verifyBlockNotFound(ctx, t, bm3, bm2block) + + // new block manager at this point can see all data. + bm4 := newTestBlockManager(data, keyTime, nil) + verifyBlock(ctx, t, bm4, preexistingBlock, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm4, sharedBlock, seededRandomData(20, 100)) + verifyBlock(ctx, t, bm4, bm1block, seededRandomData(31, 100)) + verifyBlock(ctx, t, bm4, bm2block, seededRandomData(32, 100)) + verifyBlock(ctx, t, bm4, bm3block, seededRandomData(33, 100)) + + if got, want := getIndexCount(data), 4; got != want { + t.Errorf("unexpected index count before compaction: %v, wanted %v", got, want) + } + + if err := bm4.CompactIndexes(ctx, CompactOptions{ + MinSmallBlocks: 1, + MaxSmallBlocks: 1, + }); err != nil { + t.Errorf("compaction error: %v", err) + } + if got, want := getIndexCount(data), 1; got != want { + t.Errorf("unexpected index count after compaction: %v, wanted %v", got, want) + } + + // new block manager at this point can see all data. 
+ bm5 := newTestBlockManager(data, keyTime, nil) + verifyBlock(ctx, t, bm5, preexistingBlock, seededRandomData(10, 100)) + verifyBlock(ctx, t, bm5, sharedBlock, seededRandomData(20, 100)) + verifyBlock(ctx, t, bm5, bm1block, seededRandomData(31, 100)) + verifyBlock(ctx, t, bm5, bm2block, seededRandomData(32, 100)) + verifyBlock(ctx, t, bm5, bm3block, seededRandomData(33, 100)) + if err := bm5.CompactIndexes(ctx, CompactOptions{ + MinSmallBlocks: 1, + MaxSmallBlocks: 1, + }); err != nil { + t.Errorf("compaction error: %v", err) + } +} + +func TestDeleteBlock(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + block1 := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + bm.Flush(ctx) + block2 := writeBlockAndVerify(ctx, t, bm, seededRandomData(11, 100)) + if err := bm.DeleteBlock(block1); err != nil { + t.Errorf("unable to delete block: %v", block1) + } + if err := bm.DeleteBlock(block2); err != nil { + t.Errorf("unable to delete block: %v", block1) + } + verifyBlockNotFound(ctx, t, bm, block1) + verifyBlockNotFound(ctx, t, bm, block2) + bm.Flush(ctx) + log.Debugf("-----------") + bm = newTestBlockManager(data, keyTime, nil) + //dumpBlockManagerData(t, data) + verifyBlockNotFound(ctx, t, bm, block1) + verifyBlockNotFound(ctx, t, bm, block2) +} + +func TestRewriteNonDeleted(t *testing.T) { + const stepBehaviors = 3 + + // perform a sequence WriteBlock() RewriteBlock() GetBlock() + // where actionX can be (0=flush and reopen, 1=flush, 2=nothing) + for action1 := 0; action1 < stepBehaviors; action1++ { + for action2 := 0; action2 < stepBehaviors; action2++ { + t.Run(fmt.Sprintf("case-%v-%v", action1, action2), func(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + fakeNow := fakeTimeNowWithAutoAdvance(fakeTime, 1*time.Second) + bm := newTestBlockManager(data, keyTime, fakeNow) + + applyStep := func(action int) { + switch action { + case 0: + t.Logf("flushing and reopening") + bm.Flush(ctx) + bm = newTestBlockManager(data, keyTime, fakeNow) + case 1: + t.Logf("flushing") + bm.Flush(ctx) + case 2: + t.Logf("doing nothing") + } + } + + block1 := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + applyStep(action1) + assertNoError(t, bm.RewriteBlock(ctx, block1)) + applyStep(action2) + verifyBlock(ctx, t, bm, block1, seededRandomData(10, 100)) + dumpBlockManagerData(t, data) + }) + } + } +} + +func TestDisableFlush(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + bm.DisableIndexFlush() + bm.DisableIndexFlush() + for i := 0; i < 500; i++ { + writeBlockAndVerify(ctx, t, bm, seededRandomData(i, 100)) + } + bm.Flush(ctx) // flush will not have effect + bm.EnableIndexFlush() + bm.Flush(ctx) // flush will not have effect + bm.EnableIndexFlush() + + verifyActiveIndexBlockCount(ctx, t, bm, 0) + bm.EnableIndexFlush() + verifyActiveIndexBlockCount(ctx, t, bm, 0) + bm.Flush(ctx) // flush will happen now + verifyActiveIndexBlockCount(ctx, t, bm, 1) +} + +func TestRewriteDeleted(t *testing.T) { + const stepBehaviors = 3 + + // perform a sequence WriteBlock() Delete() RewriteBlock() GetBlock() + // where actionX can be (0=flush and reopen, 1=flush, 2=nothing) + for action1 := 0; action1 < stepBehaviors; action1++ { + for action2 := 0; action2 < stepBehaviors; action2++ { + for action3 := 0; action3 < 
stepBehaviors; action3++ { + t.Run(fmt.Sprintf("case-%v-%v-%v", action1, action2, action3), func(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + fakeNow := fakeTimeNowWithAutoAdvance(fakeTime, 1*time.Second) + bm := newTestBlockManager(data, keyTime, fakeNow) + + applyStep := func(action int) { + switch action { + case 0: + t.Logf("flushing and reopening") + bm.Flush(ctx) + bm = newTestBlockManager(data, keyTime, fakeNow) + case 1: + t.Logf("flushing") + bm.Flush(ctx) + case 2: + t.Logf("doing nothing") + } + } + + block1 := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + applyStep(action1) + assertNoError(t, bm.DeleteBlock(block1)) + applyStep(action2) + if got, want := bm.RewriteBlock(ctx, block1), storage.ErrBlockNotFound; got != want && got != nil { + t.Errorf("unexpected error %v, wanted %v", got, want) + } + applyStep(action3) + verifyBlockNotFound(ctx, t, bm, block1) + dumpBlockManagerData(t, data) + }) + } + } + } +} + +func TestDeleteAndRecreate(t *testing.T) { + ctx := context.Background() + // simulate race between delete/recreate and delete + // delete happens at t0+10, recreate at t0+20 and second delete time is parameterized. + // depending on it, the second delete results will be visible. + cases := []struct { + desc string + deletionTime time.Time + isVisible bool + }{ + {"deleted before delete and-recreate", fakeTime.Add(5 * time.Second), true}, + //{"deleted after delete and recreate", fakeTime.Add(25 * time.Second), false}, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + // write a block + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, fakeTimeNowFrozen(fakeTime)) + block1 := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + bm.Flush(ctx) + + // delete but at given timestamp but don't commit yet. 
+ bm0 := newTestBlockManager(data, keyTime, fakeTimeNowWithAutoAdvance(tc.deletionTime, 1*time.Second)) + assertNoError(t, bm0.DeleteBlock(block1)) + + // delete it at t0+10 + bm1 := newTestBlockManager(data, keyTime, fakeTimeNowWithAutoAdvance(fakeTime.Add(10*time.Second), 1*time.Second)) + verifyBlock(ctx, t, bm1, block1, seededRandomData(10, 100)) + assertNoError(t, bm1.DeleteBlock(block1)) + bm1.Flush(ctx) + + // recreate at t0+20 + bm2 := newTestBlockManager(data, keyTime, fakeTimeNowWithAutoAdvance(fakeTime.Add(20*time.Second), 1*time.Second)) + block2 := writeBlockAndVerify(ctx, t, bm2, seededRandomData(10, 100)) + bm2.Flush(ctx) + + // commit deletion from bm0 (t0+5) + bm0.Flush(ctx) + + //dumpBlockManagerData(t, data) + + if block1 != block2 { + t.Errorf("got invalid block %v, expected %v", block2, block1) + } + + bm3 := newTestBlockManager(data, keyTime, nil) + dumpBlockManagerData(t, data) + if tc.isVisible { + verifyBlock(ctx, t, bm3, block1, seededRandomData(10, 100)) + } else { + verifyBlockNotFound(ctx, t, bm3, block1) + } + }) + } +} + +func TestFindUnreferencedStorageFiles(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) + blockID := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) + if err := bm.DeleteBlock(blockID); err != nil { + t.Errorf("error deleting block: %v", blockID) + } + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + + // block still present in first pack + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) + + assertNoError(t, bm.RewriteBlock(ctx, blockID)) + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + verifyUnreferencedStorageFilesCount(ctx, t, bm, 1) + assertNoError(t, bm.RewriteBlock(ctx, blockID)) + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + verifyUnreferencedStorageFilesCount(ctx, t, bm, 2) +} + +func TestFindUnreferencedStorageFiles2(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, nil) + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) + blockID := writeBlockAndVerify(ctx, t, bm, seededRandomData(10, 100)) + writeBlockAndVerify(ctx, t, bm, seededRandomData(11, 100)) + dumpBlocks(t, bm, "after writing") + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + dumpBlocks(t, bm, "after flush") + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) + if err := bm.DeleteBlock(blockID); err != nil { + t.Errorf("error deleting block: %v", blockID) + } + dumpBlocks(t, bm, "after delete") + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + dumpBlocks(t, bm, "after flush") + // block present in first pack, original pack is still referenced + verifyUnreferencedStorageFilesCount(ctx, t, bm, 0) +} + +func dumpBlocks(t *testing.T, bm *Manager, caption string) { + t.Helper() + infos, err := bm.ListBlockInfos("", true) + if err != nil { + t.Errorf("error listing blocks: %v", err) + return + } + + log.Infof("**** dumping %v blocks %v", len(infos), caption) + for i, bi := range infos { + log.Debugf(" bi[%v]=%#v", i, bi) + } + log.Infof("finished dumping %v blocks", len(infos)) +} + +func 
verifyUnreferencedStorageFilesCount(ctx context.Context, t *testing.T, bm *Manager, want int) { + t.Helper() + unref, err := bm.FindUnreferencedStorageFiles(ctx) + if err != nil { + t.Errorf("error in FindUnreferencedStorageFiles: %v", err) + } + + log.Infof("got %v expecting %v", unref, want) + if got := len(unref); got != want { + t.Errorf("invalid number of unreferenced blocks: %v, wanted %v", got, want) + } +} + +func TestBlockWriteAliasing(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, fakeTimeNowFrozen(fakeTime)) + + blockData := []byte{100, 0, 0} + id1 := writeBlockAndVerify(ctx, t, bm, blockData) + blockData[0] = 101 + id2 := writeBlockAndVerify(ctx, t, bm, blockData) + bm.Flush(ctx) + blockData[0] = 102 + id3 := writeBlockAndVerify(ctx, t, bm, blockData) + blockData[0] = 103 + id4 := writeBlockAndVerify(ctx, t, bm, blockData) + verifyBlock(ctx, t, bm, id1, []byte{100, 0, 0}) + verifyBlock(ctx, t, bm, id2, []byte{101, 0, 0}) + verifyBlock(ctx, t, bm, id3, []byte{102, 0, 0}) + verifyBlock(ctx, t, bm, id4, []byte{103, 0, 0}) +} + +func TestBlockReadAliasing(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + keyTime := map[string]time.Time{} + bm := newTestBlockManager(data, keyTime, fakeTimeNowFrozen(fakeTime)) + + blockData := []byte{100, 0, 0} + id1 := writeBlockAndVerify(ctx, t, bm, blockData) + blockData2, err := bm.GetBlock(ctx, id1) + if err != nil { + t.Fatalf("can't get block data: %v", err) + } + + blockData2[0]++ + verifyBlock(ctx, t, bm, id1, blockData) + bm.Flush(ctx) + verifyBlock(ctx, t, bm, id1, blockData) +} + +func TestVersionCompatibility(t *testing.T) { + for writeVer := minSupportedReadVersion; writeVer <= currentWriteVersion; writeVer++ { + t.Run(fmt.Sprintf("version-%v", writeVer), func(t *testing.T) { + verifyVersionCompat(t, writeVer) + }) + } +} + +func verifyVersionCompat(t *testing.T, writeVersion int) { + ctx := context.Background() + + // create block manager that writes 'writeVersion' and reads all versions >= minSupportedReadVersion + data := map[string][]byte{} + keyTime := map[string]time.Time{} + mgr := newTestBlockManager(data, keyTime, nil) + mgr.writeFormatVersion = int32(writeVersion) + + dataSet := map[string][]byte{} + + for i := 0; i < 3000000; i = (i + 1) * 2 { + data := make([]byte, i) + rand.Read(data) + + cid, err := mgr.WriteBlock(ctx, data, "") + if err != nil { + t.Fatalf("unable to write %v bytes: %v", len(data), err) + } + dataSet[cid] = data + } + verifyBlockManagerDataSet(ctx, t, mgr, dataSet) + + // delete random 3 items (map iteration order is random) + cnt := 0 + for blockID := range dataSet { + t.Logf("deleting %v", blockID) + assertNoError(t, mgr.DeleteBlock(blockID)) + delete(dataSet, blockID) + cnt++ + if cnt >= 3 { + break + } + } + if err := mgr.Flush(ctx); err != nil { + t.Fatalf("failed to flush: %v", err) + } + + // create new manager that reads and writes using new version. 
+ mgr = newTestBlockManager(data, keyTime, nil) + + // make sure we can read everything + verifyBlockManagerDataSet(ctx, t, mgr, dataSet) + + if err := mgr.CompactIndexes(ctx, CompactOptions{ + MinSmallBlocks: 1, + MaxSmallBlocks: 1, + }); err != nil { + t.Fatalf("unable to compact indexes: %v", err) + } + if err := mgr.Flush(ctx); err != nil { + t.Fatalf("failed to flush: %v", err) + } + verifyBlockManagerDataSet(ctx, t, mgr, dataSet) + + // now open one more manager + mgr = newTestBlockManager(data, keyTime, nil) + verifyBlockManagerDataSet(ctx, t, mgr, dataSet) +} + +func verifyBlockManagerDataSet(ctx context.Context, t *testing.T, mgr *Manager, dataSet map[string][]byte) { + for blockID, originalPayload := range dataSet { + v, err := mgr.GetBlock(ctx, blockID) + if err != nil { + t.Errorf("unable to read block %q: %v", blockID, err) + continue + } + + if !reflect.DeepEqual(v, originalPayload) { + t.Errorf("payload for %q does not match original: %v", v, originalPayload) + } + } +} + +func newTestBlockManager(data map[string][]byte, keyTime map[string]time.Time, timeFunc func() time.Time) *Manager { + //st = logging.NewWrapper(st) + if timeFunc == nil { + timeFunc = fakeTimeNowWithAutoAdvance(fakeTime, 1*time.Second) + } + st := storagetesting.NewMapStorage(data, keyTime, timeFunc) + bm, err := newManagerWithOptions(context.Background(), st, FormattingOptions{ + Hash: "HMAC-SHA256", + Encryption: "NONE", + HMACSecret: hmacSecret, + MaxPackSize: maxPackSize, + }, CachingOptions{}, timeFunc, nil) + if err != nil { + panic("can't create block manager: " + err.Error()) + } + bm.checkInvariantsOnUnlock = true + return bm +} + +func getIndexCount(d map[string][]byte) int { + var cnt int + + for k := range d { + if strings.HasPrefix(k, newIndexBlockPrefix) { + cnt++ + } + } + + return cnt +} + +func fakeTimeNowFrozen(t time.Time) func() time.Time { + return fakeTimeNowWithAutoAdvance(t, 0) +} + +func fakeTimeNowWithAutoAdvance(t time.Time, dt time.Duration) func() time.Time { + var mu sync.Mutex + return func() time.Time { + mu.Lock() + defer mu.Unlock() + ret := t + t = t.Add(dt) + return ret + } +} + +func verifyBlockNotFound(ctx context.Context, t *testing.T, bm *Manager, blockID string) { + t.Helper() + + b, err := bm.GetBlock(ctx, blockID) + if err != storage.ErrBlockNotFound { + t.Errorf("unexpected response from GetBlock(%q), got %v,%v, expected %v", blockID, b, err, storage.ErrBlockNotFound) + } +} + +func verifyBlock(ctx context.Context, t *testing.T, bm *Manager, blockID string, b []byte) { + t.Helper() + + b2, err := bm.GetBlock(ctx, blockID) + if err != nil { + t.Errorf("unable to read block %q: %v", blockID, err) + return + } + + if got, want := b2, b; !reflect.DeepEqual(got, want) { + t.Errorf("block %q data mismatch: got %x (nil:%v), wanted %x (nil:%v)", blockID, got, got == nil, want, want == nil) + } + + bi, err := bm.BlockInfo(ctx, blockID) + if err != nil { + t.Errorf("error getting block info %q: %v", blockID, err) + } + + if got, want := bi.Length, uint32(len(b)); got != want { + t.Errorf("invalid block size for %q: %v, wanted %v", blockID, got, want) + } + +} +func writeBlockAndVerify(ctx context.Context, t *testing.T, bm *Manager, b []byte) string { + t.Helper() + + blockID, err := bm.WriteBlock(ctx, b, "") + if err != nil { + t.Errorf("err: %v", err) + } + + if got, want := blockID, string(hashValue(b)); got != want { + t.Errorf("invalid block ID for %x, got %v, want %v", b, got, want) + } + + verifyBlock(ctx, t, bm, blockID, b) + + return blockID +} + +func 
seededRandomData(seed int, length int) []byte { + b := make([]byte, length) + rnd := rand.New(rand.NewSource(int64(seed))) + rnd.Read(b) + return b +} + +func hashValue(b []byte) string { + h := hmac.New(sha256.New, hmacSecret) + h.Write(b) //nolint:errcheck + return hex.EncodeToString(h.Sum(nil)) +} + +func dumpBlockManagerData(t *testing.T, data map[string][]byte) { + t.Helper() + for k, v := range data { + if k[0] == 'n' { + ndx, err := openPackIndex(bytes.NewReader(v)) + if err == nil { + t.Logf("index %v (%v bytes)", k, len(v)) + assertNoError(t, ndx.Iterate("", func(i Info) error { + t.Logf(" %+v\n", i) + return nil + })) + + } + } else { + t.Logf("data %v (%v bytes)\n", k, len(v)) + } + } +} diff --git a/block/builder.go b/block/builder.go new file mode 100644 index 000000000..276d8e7c5 --- /dev/null +++ b/block/builder.go @@ -0,0 +1,147 @@ +package block + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "sort" +) + +// packIndexBuilder prepares and writes block index for writing. +type packIndexBuilder map[string]*Info + +// Add adds a new entry to the builder or conditionally replaces it if the timestamp is greater. +func (b packIndexBuilder) Add(i Info) { + old, ok := b[i.BlockID] + if !ok || i.TimestampSeconds >= old.TimestampSeconds { + b[i.BlockID] = &i + } +} + +func (b packIndexBuilder) sortedBlocks() []*Info { + var allBlocks []*Info + + for _, v := range b { + allBlocks = append(allBlocks, v) + } + + sort.Slice(allBlocks, func(i, j int) bool { + return allBlocks[i].BlockID < allBlocks[j].BlockID + }) + + return allBlocks +} + +type indexLayout struct { + packFileOffsets map[string]uint32 + entryCount int + keyLength int + entryLength int + extraDataOffset uint32 +} + +// Build writes the pack index to the provided output. +func (b packIndexBuilder) Build(output io.Writer) error { + allBlocks := b.sortedBlocks() + layout := &indexLayout{ + packFileOffsets: map[string]uint32{}, + keyLength: -1, + entryLength: 20, + entryCount: len(allBlocks), + } + + w := bufio.NewWriter(output) + + // prepare extra data to be appended at the end of an index. + extraData := prepareExtraData(allBlocks, layout) + + // write header + header := make([]byte, 8) + header[0] = 1 // version + header[1] = byte(layout.keyLength) + binary.BigEndian.PutUint16(header[2:4], uint16(layout.entryLength)) + binary.BigEndian.PutUint32(header[4:8], uint32(layout.entryCount)) + if _, err := w.Write(header); err != nil { + return fmt.Errorf("unable to write header: %v", err) + } + + // write all sorted blocks. + entry := make([]byte, layout.entryLength) + for _, it := range allBlocks { + if err := writeEntry(w, it, layout, entry); err != nil { + return fmt.Errorf("unable to write entry: %v", err) + } + } + + if _, err := w.Write(extraData); err != nil { + return fmt.Errorf("error writing extra data: %v", err) + } + + return w.Flush() +} + +func prepareExtraData(allBlocks []*Info, layout *indexLayout) []byte { + var extraData []byte + + for i, it := range allBlocks { + if i == 0 { + layout.keyLength = len(contentIDToBytes(it.BlockID)) + } + if it.PackFile != "" { + if _, ok := layout.packFileOffsets[it.PackFile]; !ok { + layout.packFileOffsets[it.PackFile] = uint32(len(extraData)) + extraData = append(extraData, []byte(it.PackFile)...) 
+ } + } + if len(it.Payload) > 0 { + panic("storing payloads in indexes is not supported") + } + } + layout.extraDataOffset = uint32(8 + layout.entryCount*(layout.keyLength+layout.entryLength)) + return extraData +} + +func writeEntry(w io.Writer, it *Info, layout *indexLayout, entry []byte) error { + k := contentIDToBytes(it.BlockID) + if len(k) != layout.keyLength { + return fmt.Errorf("inconsistent key length: %v vs %v", len(k), layout.keyLength) + } + + if err := formatEntry(entry, it, layout); err != nil { + return fmt.Errorf("unable to format entry: %v", err) + } + + if _, err := w.Write(k); err != nil { + return fmt.Errorf("error writing entry key: %v", err) + } + if _, err := w.Write(entry); err != nil { + return fmt.Errorf("error writing entry: %v", err) + } + + return nil +} + +func formatEntry(entry []byte, it *Info, layout *indexLayout) error { + entryTimestampAndFlags := entry[0:8] + entryPackFileOffset := entry[8:12] + entryPackedOffset := entry[12:16] + entryPackedLength := entry[16:20] + timestampAndFlags := uint64(it.TimestampSeconds) << 16 + + if len(it.PackFile) == 0 { + return fmt.Errorf("empty pack block ID for %v", it.BlockID) + } + + binary.BigEndian.PutUint32(entryPackFileOffset, layout.extraDataOffset+layout.packFileOffsets[it.PackFile]) + if it.Deleted { + binary.BigEndian.PutUint32(entryPackedOffset, it.PackOffset|0x80000000) + } else { + binary.BigEndian.PutUint32(entryPackedOffset, it.PackOffset) + } + binary.BigEndian.PutUint32(entryPackedLength, it.Length) + timestampAndFlags |= uint64(it.FormatVersion) << 8 + timestampAndFlags |= uint64(len(it.PackFile)) + binary.BigEndian.PutUint64(entryTimestampAndFlags, timestampAndFlags) + return nil +} diff --git a/block/cache_hmac.go b/block/cache_hmac.go new file mode 100644 index 000000000..73fb09908 --- /dev/null +++ b/block/cache_hmac.go @@ -0,0 +1,33 @@ +package block + +import "crypto/hmac" +import "crypto/sha256" +import "errors" + +func appendHMAC(data []byte, secret []byte) []byte { + h := hmac.New(sha256.New, secret) + h.Write(data) // nolint:errcheck + return h.Sum(data) +} + +func verifyAndStripHMAC(b []byte, secret []byte) ([]byte, error) { + if len(b) < sha256.Size { + return nil, errors.New("invalid data - too short") + } + + p := len(b) - sha256.Size + data := b[0:p] + signature := b[p:] + + h := hmac.New(sha256.New, secret) + h.Write(data) // nolint:errcheck + validSignature := h.Sum(nil) + if len(signature) != len(validSignature) { + return nil, errors.New("invalid signature length") + } + if hmac.Equal(validSignature, signature) { + return data, nil + } + + return nil, errors.New("invalid data - corrupted") +} diff --git a/block/caching_options.go b/block/caching_options.go new file mode 100644 index 000000000..bd4b92bf1 --- /dev/null +++ b/block/caching_options.go @@ -0,0 +1,10 @@ +package block + +// CachingOptions specifies configuration of local cache. 
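+// A hedged configuration sketch (the directory and sizes are placeholders):
+//
+//	caching := block.CachingOptions{
+//		CacheDirectory:          "/tmp/kopia-cache",
+//		MaxCacheSizeBytes:       500 << 20, // enforced by the periodic block cache sweep
+//		MaxListCacheDurationSec: 30,
+//	}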
+type CachingOptions struct { + CacheDirectory string `json:"cacheDirectory,omitempty"` + MaxCacheSizeBytes int64 `json:"maxCacheSize,omitempty"` + MaxListCacheDurationSec int `json:"maxListCacheDuration,omitempty"` + IgnoreListCache bool `json:"-"` + HMACSecret []byte `json:"-"` +} diff --git a/block/committed_block_index.go b/block/committed_block_index.go new file mode 100644 index 000000000..2a4a81efc --- /dev/null +++ b/block/committed_block_index.go @@ -0,0 +1,138 @@ +package block + +import ( + "fmt" + "path/filepath" + "sync" + + "github.com/kopia/repo/storage" +) + +type committedBlockIndex struct { + cache committedBlockIndexCache + + mu sync.Mutex + inUse map[string]packIndex + merged mergedIndex +} + +type committedBlockIndexCache interface { + hasIndexBlockID(indexBlockID string) (bool, error) + addBlockToCache(indexBlockID string, data []byte) error + openIndex(indexBlockID string) (packIndex, error) + expireUnused(used []string) error +} + +func (b *committedBlockIndex) getBlock(blockID string) (Info, error) { + b.mu.Lock() + defer b.mu.Unlock() + + info, err := b.merged.GetInfo(blockID) + if info != nil { + return *info, nil + } + if err == nil { + return Info{}, storage.ErrBlockNotFound + } + return Info{}, err +} + +func (b *committedBlockIndex) addBlock(indexBlockID string, data []byte, use bool) error { + if err := b.cache.addBlockToCache(indexBlockID, data); err != nil { + return err + } + + if !use { + return nil + } + + b.mu.Lock() + defer b.mu.Unlock() + + if b.inUse[indexBlockID] != nil { + return nil + } + + ndx, err := b.cache.openIndex(indexBlockID) + if err != nil { + return fmt.Errorf("unable to open pack index %q: %v", indexBlockID, err) + } + b.inUse[indexBlockID] = ndx + b.merged = append(b.merged, ndx) + return nil +} + +func (b *committedBlockIndex) listBlocks(prefix string, cb func(i Info) error) error { + b.mu.Lock() + m := append(mergedIndex(nil), b.merged...) 
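+	// Take a snapshot of the merged index while holding the lock, then release it so the
+	// Iterate callback below runs without the mutex held.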
+ b.mu.Unlock() + + return m.Iterate(prefix, cb) +} + +func (b *committedBlockIndex) packFilesChanged(packFiles []string) bool { + if len(packFiles) != len(b.inUse) { + return true + } + + for _, packFile := range packFiles { + if b.inUse[packFile] == nil { + return true + } + } + + return false +} + +func (b *committedBlockIndex) use(packFiles []string) (bool, error) { + b.mu.Lock() + defer b.mu.Unlock() + + if !b.packFilesChanged(packFiles) { + return false, nil + } + log.Debugf("set of index files has changed (had %v, now %v)", len(b.inUse), len(packFiles)) + + var newMerged mergedIndex + newInUse := map[string]packIndex{} + defer func() { + newMerged.Close() //nolint:errcheck + }() + + for _, e := range packFiles { + ndx, err := b.cache.openIndex(e) + if err != nil { + return false, fmt.Errorf("unable to open pack index %q: %v", e, err) + } + + newMerged = append(newMerged, ndx) + newInUse[e] = ndx + } + b.merged = newMerged + b.inUse = newInUse + + if err := b.cache.expireUnused(packFiles); err != nil { + log.Warningf("unable to expire unused block index files: %v", err) + } + newMerged = nil + + return true, nil +} + +func newCommittedBlockIndex(caching CachingOptions) (*committedBlockIndex, error) { + var cache committedBlockIndexCache + + if caching.CacheDirectory != "" { + dirname := filepath.Join(caching.CacheDirectory, "indexes") + cache = &diskCommittedBlockIndexCache{dirname} + } else { + cache = &memoryCommittedBlockIndexCache{ + blocks: map[string]packIndex{}, + } + } + + return &committedBlockIndex{ + cache: cache, + inUse: map[string]packIndex{}, + }, nil +} diff --git a/block/committed_block_index_disk_cache.go b/block/committed_block_index_disk_cache.go new file mode 100644 index 000000000..9e0a1f4c9 --- /dev/null +++ b/block/committed_block_index_disk_cache.go @@ -0,0 +1,134 @@ +package block + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "time" + + "golang.org/x/exp/mmap" +) + +const ( + simpleIndexSuffix = ".sndx" + unusedCommittedBlockIndexCleanupTime = 1 * time.Hour // delete unused committed index blocks after 1 hour +) + +type diskCommittedBlockIndexCache struct { + dirname string +} + +func (c *diskCommittedBlockIndexCache) indexBlockPath(indexBlockID string) string { + return filepath.Join(c.dirname, indexBlockID+simpleIndexSuffix) +} + +func (c *diskCommittedBlockIndexCache) openIndex(indexBlockID string) (packIndex, error) { + fullpath := c.indexBlockPath(indexBlockID) + + f, err := mmap.Open(fullpath) + if err != nil { + return nil, err + } + + return openPackIndex(f) +} + +func (c *diskCommittedBlockIndexCache) hasIndexBlockID(indexBlockID string) (bool, error) { + _, err := os.Stat(c.indexBlockPath(indexBlockID)) + if err == nil { + return true, nil + } + if os.IsNotExist(err) { + return false, nil + } + + return false, err +} + +func (c *diskCommittedBlockIndexCache) addBlockToCache(indexBlockID string, data []byte) error { + exists, err := c.hasIndexBlockID(indexBlockID) + if err != nil { + return err + } + + if exists { + return nil + } + + tmpFile, err := writeTempFileAtomic(c.dirname, data) + if err != nil { + return err + } + + // rename() is atomic, so one process will succeed, but the other will fail + if err := os.Rename(tmpFile, c.indexBlockPath(indexBlockID)); err != nil { + // verify that the block exists + exists, err := c.hasIndexBlockID(indexBlockID) + if err != nil { + return err + } + if !exists { + return fmt.Errorf("unsuccessful index write of block %q", indexBlockID) + } + } + + return nil +} + +func 
writeTempFileAtomic(dirname string, data []byte) (string, error) { + // write to a temp file to avoid race where two processes are writing at the same time. + tf, err := ioutil.TempFile(dirname, "tmp") + if err != nil { + if os.IsNotExist(err) { + os.MkdirAll(dirname, 0700) //nolint:errcheck + tf, err = ioutil.TempFile(dirname, "tmp") + } + } + if err != nil { + return "", fmt.Errorf("can't create tmp file: %v", err) + } + + if _, err := tf.Write(data); err != nil { + return "", fmt.Errorf("can't write to temp file: %v", err) + } + if err := tf.Close(); err != nil { + return "", fmt.Errorf("can't close tmp file") + } + + return tf.Name(), nil +} + +func (c *diskCommittedBlockIndexCache) expireUnused(used []string) error { + entries, err := ioutil.ReadDir(c.dirname) + if err != nil { + return fmt.Errorf("can't list cache: %v", err) + } + + remaining := map[string]os.FileInfo{} + + for _, ent := range entries { + if strings.HasSuffix(ent.Name(), simpleIndexSuffix) { + n := strings.TrimSuffix(ent.Name(), simpleIndexSuffix) + remaining[n] = ent + } + } + + for _, u := range used { + delete(remaining, u) + } + + for _, rem := range remaining { + if time.Since(rem.ModTime()) > unusedCommittedBlockIndexCleanupTime { + log.Debugf("removing unused %v %v", rem.Name(), rem.ModTime()) + if err := os.Remove(filepath.Join(c.dirname, rem.Name())); err != nil { + log.Warningf("unable to remove unused index file: %v", err) + } + } else { + log.Debugf("keeping unused %v because it's too new %v", rem.Name(), rem.ModTime()) + } + } + + return nil +} diff --git a/block/committed_block_index_mem_cache.go b/block/committed_block_index_mem_cache.go new file mode 100644 index 000000000..03fe7817f --- /dev/null +++ b/block/committed_block_index_mem_cache.go @@ -0,0 +1,48 @@ +package block + +import ( + "bytes" + "fmt" + "sync" +) + +type memoryCommittedBlockIndexCache struct { + mu sync.Mutex + blocks map[string]packIndex +} + +func (m *memoryCommittedBlockIndexCache) hasIndexBlockID(indexBlockID string) (bool, error) { + m.mu.Lock() + defer m.mu.Unlock() + + return m.blocks[indexBlockID] != nil, nil +} + +func (m *memoryCommittedBlockIndexCache) addBlockToCache(indexBlockID string, data []byte) error { + m.mu.Lock() + defer m.mu.Unlock() + + ndx, err := openPackIndex(bytes.NewReader(data)) + if err != nil { + return err + } + + m.blocks[indexBlockID] = ndx + return nil +} + +func (m *memoryCommittedBlockIndexCache) openIndex(indexBlockID string) (packIndex, error) { + m.mu.Lock() + defer m.mu.Unlock() + + v := m.blocks[indexBlockID] + if v == nil { + return nil, fmt.Errorf("block not found in cache: %v", indexBlockID) + } + + return v, nil +} + +func (m *memoryCommittedBlockIndexCache) expireUnused(used []string) error { + return nil +} diff --git a/block/content_id_to_bytes.go b/block/content_id_to_bytes.go new file mode 100644 index 000000000..136219d06 --- /dev/null +++ b/block/content_id_to_bytes.go @@ -0,0 +1,38 @@ +package block + +import ( + "encoding/hex" +) + +func bytesToContentID(b []byte) string { + if len(b) == 0 { + return "" + } + if b[0] == 0xff { + return string(b[1:]) + } + prefix := "" + if b[0] != 0 { + prefix = string(b[0:1]) + } + + return prefix + hex.EncodeToString(b[1:]) +} + +func contentIDToBytes(c string) []byte { + var prefix []byte + var skip int + if len(c)%2 == 1 { + prefix = []byte(c[0:1]) + skip = 1 + } else { + prefix = []byte{0} + } + + b, err := hex.DecodeString(c[skip:]) + if err != nil { + return append([]byte{0xff}, []byte(c)...) + } + + return append(prefix, b...) 
+} diff --git a/block/context.go b/block/context.go new file mode 100644 index 000000000..b7f22abd2 --- /dev/null +++ b/block/context.go @@ -0,0 +1,34 @@ +package block + +import "context" + +type contextKey string + +var useBlockCacheContextKey contextKey = "use-block-cache" +var useListCacheContextKey contextKey = "use-list-cache" + +// UsingBlockCache returns a derived context that causes block manager to use cache. +func UsingBlockCache(ctx context.Context, enabled bool) context.Context { + return context.WithValue(ctx, useBlockCacheContextKey, enabled) +} + +// UsingListCache returns a derived context that causes block manager to use cache. +func UsingListCache(ctx context.Context, enabled bool) context.Context { + return context.WithValue(ctx, useListCacheContextKey, enabled) +} + +func shouldUseBlockCache(ctx context.Context) bool { + if enabled, ok := ctx.Value(useBlockCacheContextKey).(bool); ok { + return enabled + } + + return true +} + +func shouldUseListCache(ctx context.Context) bool { + if enabled, ok := ctx.Value(useListCacheContextKey).(bool); ok { + return enabled + } + + return true +} diff --git a/block/format.go b/block/format.go new file mode 100644 index 000000000..5fc65271a --- /dev/null +++ b/block/format.go @@ -0,0 +1,74 @@ +package block + +import ( + "encoding/binary" + "fmt" +) + +// Format describes a format of a single pack index. The actual structure is not used, +// it's purely for documentation purposes. +// The struct is byte-aligned. +type Format struct { + Version byte // format version number must be 0x01 + KeySize byte // size of each key in bytes + EntrySize uint16 // size of each entry in bytes, big-endian + EntryCount uint32 // number of sorted (key,value) entries that follow + + Entries []struct { + Key []byte // key bytes (KeySize) + Entry entry + } + + ExtraData []byte // extra data +} + +type entry struct { + // big endian: + // 48 most significant bits - 48-bit timestamp in seconds since 1970/01/01 UTC + // 8 bits - format version (currently == 1) + // 8 least significant bits - length of pack block ID + timestampAndFlags uint64 // + packFileOffset uint32 // 4 bytes, big endian, offset within index file where pack block ID begins + packedOffset uint32 // 4 bytes, big endian, offset within pack file where the contents begin + packedLength uint32 // 4 bytes, big endian, content length +} + +func (e *entry) parse(b []byte) error { + if len(b) < 20 { + return fmt.Errorf("invalid entry length: %v", len(b)) + } + + e.timestampAndFlags = binary.BigEndian.Uint64(b[0:8]) + e.packFileOffset = binary.BigEndian.Uint32(b[8:12]) + e.packedOffset = binary.BigEndian.Uint32(b[12:16]) + e.packedLength = binary.BigEndian.Uint32(b[16:20]) + return nil +} + +func (e *entry) IsDeleted() bool { + return e.packedOffset&0x80000000 != 0 +} + +func (e *entry) TimestampSeconds() int64 { + return int64(e.timestampAndFlags >> 16) +} + +func (e *entry) PackedFormatVersion() byte { + return byte(e.timestampAndFlags >> 8) +} + +func (e *entry) PackFileLength() byte { + return byte(e.timestampAndFlags) +} + +func (e *entry) PackFileOffset() uint32 { + return e.packFileOffset +} + +func (e *entry) PackedOffset() uint32 { + return e.packedOffset & 0x7fffffff +} + +func (e *entry) PackedLength() uint32 { + return e.packedLength +} diff --git a/block/index.go b/block/index.go new file mode 100644 index 000000000..89d49e61a --- /dev/null +++ b/block/index.go @@ -0,0 +1,198 @@ +package block + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "sort" + "strings" + + 
"github.com/pkg/errors" +) + +// packIndex is a read-only index of packed blocks. +type packIndex interface { + io.Closer + + GetInfo(blockID string) (*Info, error) + Iterate(prefix string, cb func(Info) error) error +} + +type index struct { + hdr headerInfo + readerAt io.ReaderAt +} + +type headerInfo struct { + keySize int + valueSize int + entryCount int +} + +func readHeader(readerAt io.ReaderAt) (headerInfo, error) { + var header [8]byte + + if n, err := readerAt.ReadAt(header[:], 0); err != nil || n != 8 { + return headerInfo{}, errors.Wrap(err, "invalid header") + } + + if header[0] != 1 { + return headerInfo{}, fmt.Errorf("invalid header format: %v", header[0]) + } + + hi := headerInfo{ + keySize: int(header[1]), + valueSize: int(binary.BigEndian.Uint16(header[2:4])), + entryCount: int(binary.BigEndian.Uint32(header[4:8])), + } + + if hi.keySize <= 1 || hi.valueSize < 0 || hi.entryCount < 0 { + return headerInfo{}, fmt.Errorf("invalid header") + } + + return hi, nil +} + +// Iterate invokes the provided callback function for all blocks in the index, sorted alphabetically. +// The iteration ends when the callback returns an error, which is propagated to the caller or when +// all blocks have been visited. +func (b *index) Iterate(prefix string, cb func(Info) error) error { + startPos, err := b.findEntryPosition(prefix) + if err != nil { + return errors.Wrap(err, "could not find starting position") + } + stride := b.hdr.keySize + b.hdr.valueSize + entry := make([]byte, stride) + for i := startPos; i < b.hdr.entryCount; i++ { + n, err := b.readerAt.ReadAt(entry, int64(8+stride*i)) + if err != nil || n != len(entry) { + return errors.Wrap(err, "unable to read from index") + } + + key := entry[0:b.hdr.keySize] + value := entry[b.hdr.keySize:] + + i, err := b.entryToInfo(bytesToContentID(key), value) + if err != nil { + return errors.Wrap(err, "invalid index data") + } + if !strings.HasPrefix(i.BlockID, prefix) { + break + } + if err := cb(i); err != nil { + return err + } + } + return nil +} + +func (b *index) findEntryPosition(blockID string) (int, error) { + stride := b.hdr.keySize + b.hdr.valueSize + entryBuf := make([]byte, stride) + var readErr error + pos := sort.Search(b.hdr.entryCount, func(p int) bool { + if readErr != nil { + return false + } + _, err := b.readerAt.ReadAt(entryBuf, int64(8+stride*p)) + if err != nil { + readErr = err + return false + } + + return bytesToContentID(entryBuf[0:b.hdr.keySize]) >= blockID + }) + + return pos, readErr +} + +func (b *index) findEntry(blockID string) ([]byte, error) { + key := contentIDToBytes(blockID) + if len(key) != b.hdr.keySize { + return nil, fmt.Errorf("invalid block ID: %q", blockID) + } + stride := b.hdr.keySize + b.hdr.valueSize + + position, err := b.findEntryPosition(blockID) + if err != nil { + return nil, err + } + if position >= b.hdr.entryCount { + return nil, nil + } + + entryBuf := make([]byte, stride) + if _, err := b.readerAt.ReadAt(entryBuf, int64(8+stride*position)); err != nil { + return nil, err + } + + if bytes.Equal(entryBuf[0:len(key)], key) { + return entryBuf[len(key):], nil + } + + return nil, nil +} + +// GetInfo returns information about a given block. If a block is not found, nil is returned. 
+func (b *index) GetInfo(blockID string) (*Info, error) { + e, err := b.findEntry(blockID) + if err != nil { + return nil, err + } + + if e == nil { + return nil, nil + } + + i, err := b.entryToInfo(blockID, e) + if err != nil { + return nil, err + } + return &i, err +} + +func (b *index) entryToInfo(blockID string, entryData []byte) (Info, error) { + if len(entryData) < 20 { + return Info{}, fmt.Errorf("invalid entry length: %v", len(entryData)) + } + + var e entry + if err := e.parse(entryData); err != nil { + return Info{}, err + } + + packFile := make([]byte, e.PackFileLength()) + n, err := b.readerAt.ReadAt(packFile, int64(e.PackFileOffset())) + if err != nil || n != int(e.PackFileLength()) { + return Info{}, errors.Wrap(err, "can't read pack block ID") + } + + return Info{ + BlockID: blockID, + Deleted: e.IsDeleted(), + TimestampSeconds: e.TimestampSeconds(), + FormatVersion: e.PackedFormatVersion(), + PackOffset: e.PackedOffset(), + Length: e.PackedLength(), + PackFile: string(packFile), + }, nil +} + +// Close closes the index and the underlying reader. +func (b *index) Close() error { + if closer, ok := b.readerAt.(io.Closer); ok { + return closer.Close() + } + + return nil +} + +// openPackIndex reads an Index from a given reader. The caller must call Close() when the index is no longer used. +func openPackIndex(readerAt io.ReaderAt) (packIndex, error) { + h, err := readHeader(readerAt) + if err != nil { + return nil, errors.Wrap(err, "invalid header") + } + return &index{hdr: h, readerAt: readerAt}, nil +} diff --git a/block/info.go b/block/info.go new file mode 100644 index 000000000..388471231 --- /dev/null +++ b/block/info.go @@ -0,0 +1,22 @@ +package block + +import ( + "time" +) + +// Info is an information about a single block managed by Manager. +type Info struct { + BlockID string `json:"blockID"` + Length uint32 `json:"length"` + TimestampSeconds int64 `json:"time"` + PackFile string `json:"packFile,omitempty"` + PackOffset uint32 `json:"packOffset,omitempty"` + Deleted bool `json:"deleted"` + Payload []byte `json:"payload"` // set for payloads stored inline + FormatVersion byte `json:"formatVersion"` +} + +// Timestamp returns the time when a block was created or deleted. 
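+// The resolution is one second, since only TimestampSeconds is persisted in the pack index.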
+func (i Info) Timestamp() time.Time { + return time.Unix(i.TimestampSeconds, 0) +} diff --git a/block/list_cache.go b/block/list_cache.go new file mode 100644 index 000000000..f5a44cdcf --- /dev/null +++ b/block/list_cache.go @@ -0,0 +1,123 @@ +package block + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "time" + + "github.com/kopia/repo/storage" +) + +type listCache struct { + st storage.Storage + cacheFile string + listCacheDuration time.Duration + hmacSecret []byte +} + +func (c *listCache) listIndexBlocks(ctx context.Context) ([]IndexInfo, error) { + if c.cacheFile != "" { + ci, err := c.readBlocksFromCache(ctx) + if err == nil { + expirationTime := ci.Timestamp.Add(c.listCacheDuration) + if time.Now().Before(expirationTime) { + log.Debugf("retrieved list of index blocks from cache") + return ci.Blocks, nil + } + } else if err != storage.ErrBlockNotFound { + log.Warningf("unable to open cache file: %v", err) + } + } + + blocks, err := listIndexBlocksFromStorage(ctx, c.st) + if err == nil { + c.saveListToCache(ctx, &cachedList{ + Blocks: blocks, + Timestamp: time.Now(), + }) + } + log.Debugf("found %v index blocks from source", len(blocks)) + + return blocks, err +} + +func (c *listCache) saveListToCache(ctx context.Context, ci *cachedList) { + if c.cacheFile == "" { + return + } + log.Debugf("saving index blocks to cache: %v", len(ci.Blocks)) + if data, err := json.Marshal(ci); err == nil { + mySuffix := fmt.Sprintf(".tmp-%v-%v", os.Getpid(), time.Now().UnixNano()) + if err := ioutil.WriteFile(c.cacheFile+mySuffix, appendHMAC(data, c.hmacSecret), 0600); err != nil { + log.Warningf("unable to write list cache: %v", err) + } + os.Rename(c.cacheFile+mySuffix, c.cacheFile) //nolint:errcheck + os.Remove(c.cacheFile + mySuffix) //nolint:errcheck + } +} + +func (c *listCache) deleteListCache(ctx context.Context) { + if c.cacheFile != "" { + os.Remove(c.cacheFile) //nolint:errcheck + } +} + +func (c *listCache) readBlocksFromCache(ctx context.Context) (*cachedList, error) { + if !shouldUseListCache(ctx) { + return nil, storage.ErrBlockNotFound + } + + ci := &cachedList{} + + data, err := ioutil.ReadFile(c.cacheFile) + if err != nil { + if os.IsNotExist(err) { + return nil, storage.ErrBlockNotFound + } + + return nil, err + } + + data, err = verifyAndStripHMAC(data, c.hmacSecret) + if err != nil { + return nil, fmt.Errorf("invalid file %v: %v", c.cacheFile, err) + } + + if err := json.Unmarshal(data, &ci); err != nil { + return nil, fmt.Errorf("can't unmarshal cached list results: %v", err) + } + + return ci, nil + +} + +func newListCache(ctx context.Context, st storage.Storage, caching CachingOptions) (*listCache, error) { + var listCacheFile string + + if caching.CacheDirectory != "" { + listCacheFile = filepath.Join(caching.CacheDirectory, "list") + + if _, err := os.Stat(caching.CacheDirectory); os.IsNotExist(err) { + if err := os.MkdirAll(caching.CacheDirectory, 0700); err != nil { + return nil, err + } + } + } + + c := &listCache{ + st: st, + cacheFile: listCacheFile, + hmacSecret: caching.HMACSecret, + listCacheDuration: time.Duration(caching.MaxListCacheDurationSec) * time.Second, + } + + if caching.IgnoreListCache { + c.deleteListCache(ctx) + } + + return c, nil +} diff --git a/block/merged.go b/block/merged.go new file mode 100644 index 000000000..20140604c --- /dev/null +++ b/block/merged.go @@ -0,0 +1,132 @@ +package block + +import ( + "container/heap" + "errors" +) + +// mergedIndex is an implementation of Index that transparently 
merges returns from underlying Indexes. +type mergedIndex []packIndex + +// Close closes all underlying indexes. +func (m mergedIndex) Close() error { + for _, ndx := range m { + if err := ndx.Close(); err != nil { + return err + } + } + + return nil +} + +// GetInfo returns information about a single block. If a block is not found, returns (nil,nil) +func (m mergedIndex) GetInfo(contentID string) (*Info, error) { + var best *Info + for _, ndx := range m { + i, err := ndx.GetInfo(contentID) + if err != nil { + return nil, err + } + if i != nil { + if best == nil || i.TimestampSeconds > best.TimestampSeconds || (i.TimestampSeconds == best.TimestampSeconds && !i.Deleted) { + best = i + } + } + } + return best, nil +} + +type nextInfo struct { + it Info + ch <-chan Info +} + +type nextInfoHeap []*nextInfo + +func (h nextInfoHeap) Len() int { return len(h) } +func (h nextInfoHeap) Less(i, j int) bool { + if a, b := h[i].it.BlockID, h[j].it.BlockID; a != b { + return a < b + } + + if a, b := h[i].it.TimestampSeconds, h[j].it.TimestampSeconds; a != b { + return a < b + } + + return !h[i].it.Deleted +} + +func (h nextInfoHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h *nextInfoHeap) Push(x interface{}) { + *h = append(*h, x.(*nextInfo)) +} +func (h *nextInfoHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +func iterateChan(prefix string, ndx packIndex, done chan bool) <-chan Info { + ch := make(chan Info) + go func() { + defer close(ch) + + _ = ndx.Iterate(prefix, func(i Info) error { + select { + case <-done: + return errors.New("end of iteration") + case ch <- i: + return nil + } + }) + }() + return ch +} + +// Iterate invokes the provided callback for all unique block IDs in the underlying sources until either +// all blocks have been visited or until an error is returned by the callback. 
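+//
+// A minimal usage sketch (names are illustrative):
+//
+//	merged := mergedIndex{ndx1, ndx2}
+//	if err := merged.Iterate("", func(i Info) error {
+//		// i is the newest entry for each unique block ID; non-deleted entries win ties
+//		return nil
+//	}); err != nil {
+//		return err
+//	}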
+func (m mergedIndex) Iterate(prefix string, cb func(i Info) error) error { + var minHeap nextInfoHeap + done := make(chan bool) + defer close(done) + + for _, ndx := range m { + ch := iterateChan(prefix, ndx, done) + it, ok := <-ch + if ok { + heap.Push(&minHeap, &nextInfo{it, ch}) + } + } + + var pendingItem Info + + for len(minHeap) > 0 { + min := heap.Pop(&minHeap).(*nextInfo) + if pendingItem.BlockID != min.it.BlockID { + if pendingItem.BlockID != "" { + if err := cb(pendingItem); err != nil { + return err + } + } + + pendingItem = min.it + } else if min.it.TimestampSeconds > pendingItem.TimestampSeconds { + pendingItem = min.it + } + + it, ok := <-min.ch + if ok { + heap.Push(&minHeap, &nextInfo{it, min.ch}) + } + } + + if pendingItem.BlockID != "" { + return cb(pendingItem) + } + + return nil +} + +var _ packIndex = (*mergedIndex)(nil) diff --git a/block/merged_test.go b/block/merged_test.go new file mode 100644 index 000000000..cea9c0ac4 --- /dev/null +++ b/block/merged_test.go @@ -0,0 +1,93 @@ +package block + +import ( + "bytes" + "reflect" + "testing" + + "github.com/pkg/errors" +) + +func TestMerged(t *testing.T) { + i1, err := indexWithItems( + Info{BlockID: "aabbcc", TimestampSeconds: 1, PackFile: "xx", PackOffset: 11}, + Info{BlockID: "ddeeff", TimestampSeconds: 1, PackFile: "xx", PackOffset: 111}, + Info{BlockID: "z010203", TimestampSeconds: 1, PackFile: "xx", PackOffset: 111}, + Info{BlockID: "de1e1e", TimestampSeconds: 4, PackFile: "xx", PackOffset: 111}, + ) + if err != nil { + t.Fatalf("can't create index: %v", err) + } + i2, err := indexWithItems( + Info{BlockID: "aabbcc", TimestampSeconds: 3, PackFile: "yy", PackOffset: 33}, + Info{BlockID: "xaabbcc", TimestampSeconds: 1, PackFile: "xx", PackOffset: 111}, + Info{BlockID: "de1e1e", TimestampSeconds: 4, PackFile: "xx", PackOffset: 222, Deleted: true}, + ) + if err != nil { + t.Fatalf("can't create index: %v", err) + } + i3, err := indexWithItems( + Info{BlockID: "aabbcc", TimestampSeconds: 2, PackFile: "zz", PackOffset: 22}, + Info{BlockID: "ddeeff", TimestampSeconds: 1, PackFile: "zz", PackOffset: 222}, + Info{BlockID: "k010203", TimestampSeconds: 1, PackFile: "xx", PackOffset: 111}, + Info{BlockID: "k020304", TimestampSeconds: 1, PackFile: "xx", PackOffset: 111}, + ) + if err != nil { + t.Fatalf("can't create index: %v", err) + } + + m := mergedIndex{i1, i2, i3} + i, err := m.GetInfo("aabbcc") + if err != nil || i == nil { + t.Fatalf("unable to get info: %v", err) + } + if got, want := i.PackOffset, uint32(33); got != want { + t.Errorf("invalid pack offset %v, wanted %v", got, want) + } + + var inOrder []string + assertNoError(t, m.Iterate("", func(i Info) error { + inOrder = append(inOrder, i.BlockID) + if i.BlockID == "de1e1e" { + if i.Deleted { + t.Errorf("iteration preferred deleted block over non-deleted") + } + } + return nil + })) + + if i, err := m.GetInfo("de1e1e"); err != nil { + t.Errorf("error getting deleted block info: %v", err) + } else if i.Deleted { + t.Errorf("GetInfo preferred deleted block over non-deleted") + } + + expectedInOrder := []string{ + "aabbcc", + "ddeeff", + "de1e1e", + "k010203", + "k020304", + "xaabbcc", + "z010203", + } + if !reflect.DeepEqual(inOrder, expectedInOrder) { + t.Errorf("unexpected items in order: %v, wanted %v", inOrder, expectedInOrder) + } + + if err := m.Close(); err != nil { + t.Errorf("unexpected error in Close(): %v", err) + } +} + +func indexWithItems(items ...Info) (packIndex, error) { + b := make(packIndexBuilder) + for _, it := range items { + b.Add(it) + } + 
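+	// Serialize the builder into the on-disk index format so the test exercises
+	// the same openPackIndex path as production code.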
var buf bytes.Buffer + if err := b.Build(&buf); err != nil { + return nil, errors.Wrap(err, "build error") + } + return openPackIndex(bytes.NewReader(buf.Bytes())) +} diff --git a/block/packindex_internal_test.go b/block/packindex_internal_test.go new file mode 100644 index 000000000..305619855 --- /dev/null +++ b/block/packindex_internal_test.go @@ -0,0 +1,26 @@ +package block + +import "testing" + +func TestRoundTrip(t *testing.T) { + cases := []string{ + "", + "x", + "aa", + "xaa", + "xaaa", + "a1x", + } + + for _, tc := range cases { + b := contentIDToBytes(tc) + got := bytesToContentID(b) + if got != tc { + t.Errorf("%q did not round trip, got %q, wanted %q", tc, got, tc) + } + } + + if got, want := bytesToContentID(nil), ""; got != want { + t.Errorf("unexpected content id %v, want %v", got, want) + } +} diff --git a/block/packindex_test.go b/block/packindex_test.go new file mode 100644 index 000000000..c771ec5a7 --- /dev/null +++ b/block/packindex_test.go @@ -0,0 +1,235 @@ +package block + +import ( + "bytes" + "crypto/sha1" + "encoding/hex" + "fmt" + "math/rand" + "reflect" + "strings" + "testing" +) + +func TestPackIndex(t *testing.T) { + blockNumber := 0 + + deterministicBlockID := func(prefix string, id int) string { + h := sha1.New() + fmt.Fprintf(h, "%v%v", prefix, id) + blockNumber++ + + prefix2 := "" + if id%2 == 0 { + prefix2 = "x" + } + if id%7 == 0 { + prefix2 = "y" + } + if id%5 == 0 { + prefix2 = "m" + } + return string(fmt.Sprintf("%v%x", prefix2, h.Sum(nil))) + } + deterministicPackFile := func(id int) string { + h := sha1.New() + fmt.Fprintf(h, "%v", id) + blockNumber++ + return string(fmt.Sprintf("%x", h.Sum(nil))) + } + + deterministicPackedOffset := func(id int) uint32 { + s := rand.NewSource(int64(id + 1)) + rnd := rand.New(s) + return uint32(rnd.Int31()) + } + deterministicPackedLength := func(id int) uint32 { + s := rand.NewSource(int64(id + 2)) + rnd := rand.New(s) + return uint32(rnd.Int31()) + } + deterministicFormatVersion := func(id int) byte { + return byte(id % 100) + } + + randomUnixTime := func() int64 { + return int64(rand.Int31()) + } + + var infos []Info + + // deleted blocks with all information + for i := 0; i < 100; i++ { + infos = append(infos, Info{ + TimestampSeconds: randomUnixTime(), + Deleted: true, + BlockID: deterministicBlockID("deleted-packed", i), + PackFile: deterministicPackFile(i), + PackOffset: deterministicPackedOffset(i), + Length: deterministicPackedLength(i), + FormatVersion: deterministicFormatVersion(i), + }) + } + // non-deleted block + for i := 0; i < 100; i++ { + infos = append(infos, Info{ + TimestampSeconds: randomUnixTime(), + BlockID: deterministicBlockID("packed", i), + PackFile: deterministicPackFile(i), + PackOffset: deterministicPackedOffset(i), + Length: deterministicPackedLength(i), + FormatVersion: deterministicFormatVersion(i), + }) + } + + infoMap := map[string]Info{} + b1 := make(packIndexBuilder) + b2 := make(packIndexBuilder) + b3 := make(packIndexBuilder) + + for _, info := range infos { + infoMap[info.BlockID] = info + b1.Add(info) + b2.Add(info) + b3.Add(info) + } + + var buf1 bytes.Buffer + var buf2 bytes.Buffer + var buf3 bytes.Buffer + if err := b1.Build(&buf1); err != nil { + t.Errorf("unable to build: %v", err) + } + if err := b1.Build(&buf2); err != nil { + t.Errorf("unable to build: %v", err) + } + if err := b1.Build(&buf3); err != nil { + t.Errorf("unable to build: %v", err) + } + data1 := buf1.Bytes() + data2 := buf2.Bytes() + data3 := buf3.Bytes() + + if !reflect.DeepEqual(data1, data2) { + 
t.Errorf("builder output not stable: %x vs %x", hex.Dump(data1), hex.Dump(data2)) + } + if !reflect.DeepEqual(data2, data3) { + t.Errorf("builder output not stable: %x vs %x", hex.Dump(data2), hex.Dump(data3)) + } + + t.Run("FuzzTest", func(t *testing.T) { + fuzzTestIndexOpen(t, data1) + }) + + ndx, err := openPackIndex(bytes.NewReader(data1)) + if err != nil { + t.Fatalf("can't open index: %v", err) + } + defer ndx.Close() + + for _, info := range infos { + info2, err := ndx.GetInfo(info.BlockID) + if err != nil { + t.Errorf("unable to find %v", info.BlockID) + continue + } + if !reflect.DeepEqual(info, *info2) { + t.Errorf("invalid value retrieved: %+v, wanted %+v", info2, info) + } + } + + cnt := 0 + assertNoError(t, ndx.Iterate("", func(info2 Info) error { + info := infoMap[info2.BlockID] + if !reflect.DeepEqual(info, info2) { + t.Errorf("invalid value retrieved: %+v, wanted %+v", info2, info) + } + cnt++ + return nil + })) + if cnt != len(infoMap) { + t.Errorf("invalid number of iterations: %v, wanted %v", cnt, len(infoMap)) + } + + prefixes := []string{"a", "b", "f", "0", "3", "aa", "aaa", "aab", "fff", "m", "x", "y", "m0", "ma"} + + for i := 0; i < 100; i++ { + blockID := deterministicBlockID("no-such-block", i) + v, err := ndx.GetInfo(blockID) + if err != nil { + t.Errorf("unable to get block %v: %v", blockID, err) + } + if v != nil { + t.Errorf("unexpected result when getting block %v: %v", blockID, v) + } + } + + for _, prefix := range prefixes { + cnt2 := 0 + assertNoError(t, ndx.Iterate(string(prefix), func(info2 Info) error { + cnt2++ + if !strings.HasPrefix(string(info2.BlockID), string(prefix)) { + t.Errorf("unexpected item %v when iterating prefix %v", info2.BlockID, prefix) + } + return nil + })) + t.Logf("found %v elements with prefix %q", cnt2, prefix) + } +} + +func fuzzTestIndexOpen(t *testing.T, originalData []byte) { + // use consistent random + rnd := rand.New(rand.NewSource(12345)) + + fuzzTest(rnd, originalData, 50000, func(d []byte) { + ndx, err := openPackIndex(bytes.NewReader(d)) + if err != nil { + return + } + defer ndx.Close() + cnt := 0 + _ = ndx.Iterate("", func(cb Info) error { + if cnt < 10 { + _, _ = ndx.GetInfo(cb.BlockID) + } + cnt++ + return nil + }) + }) +} + +func fuzzTest(rnd *rand.Rand, originalData []byte, rounds int, callback func(d []byte)) { + for round := 0; round < rounds; round++ { + data := append([]byte(nil), originalData...) + + // mutate small number of bytes + bytesToMutate := rnd.Intn(3) + for i := 0; i < bytesToMutate; i++ { + pos := rnd.Intn(len(data)) + data[pos] = byte(rnd.Int()) + } + + sectionsToInsert := rnd.Intn(3) + for i := 0; i < sectionsToInsert; i++ { + pos := rnd.Intn(len(data)) + insertedLength := rnd.Intn(20) + insertedData := make([]byte, insertedLength) + rnd.Read(insertedData) + + data = append(append(append([]byte(nil), data[0:pos]...), insertedData...), data[pos:]...) + } + + sectionsToDelete := rnd.Intn(3) + for i := 0; i < sectionsToDelete; i++ { + pos := rnd.Intn(len(data)) + deletedLength := rnd.Intn(10) + if pos+deletedLength > len(data) { + continue + } + + data = append(append([]byte(nil), data[0:pos]...), data[pos+deletedLength:]...) + } + + callback(data) + } +} diff --git a/block/stats.go b/block/stats.go new file mode 100644 index 000000000..b1483506f --- /dev/null +++ b/block/stats.go @@ -0,0 +1,25 @@ +package block + +// Stats exposes statistics about block operation. 
+type Stats struct { + // Keep int64 fields first to ensure they get aligned to at least 64-bit boundaries + // which is required for atomic access on ARM and x86-32. + ReadBytes int64 `json:"readBytes,omitempty"` + WrittenBytes int64 `json:"writtenBytes,omitempty"` + DecryptedBytes int64 `json:"decryptedBytes,omitempty"` + EncryptedBytes int64 `json:"encryptedBytes,omitempty"` + HashedBytes int64 `json:"hashedBytes,omitempty"` + + ReadBlocks int32 `json:"readBlocks,omitempty"` + WrittenBlocks int32 `json:"writtenBlocks,omitempty"` + CheckedBlocks int32 `json:"checkedBlocks,omitempty"` + HashedBlocks int32 `json:"hashedBlocks,omitempty"` + InvalidBlocks int32 `json:"invalidBlocks,omitempty"` + PresentBlocks int32 `json:"presentBlocks,omitempty"` + ValidBlocks int32 `json:"validBlocks,omitempty"` +} + +// Reset clears all repository statistics. +func (s *Stats) Reset() { + *s = Stats{} +} diff --git a/connect.go b/connect.go new file mode 100644 index 000000000..d6675cf29 --- /dev/null +++ b/connect.go @@ -0,0 +1,111 @@ +package repo + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "io/ioutil" + "os" + "path/filepath" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/storage" + "github.com/pkg/errors" +) + +// ConnectOptions specifies options when persisting configuration to connect to a repository. +type ConnectOptions struct { + block.CachingOptions +} + +// Connect connects to the repository in the specified storage and persists the configuration and credentials in the file provided. +func Connect(ctx context.Context, configFile string, st storage.Storage, password string, opt ConnectOptions) error { + formatBytes, err := st.GetBlock(ctx, FormatBlockID, 0, -1) + if err != nil { + return errors.Wrap(err, "unable to read format block") + } + + f, err := parseFormatBlock(formatBytes) + if err != nil { + return err + } + + var lc LocalConfig + lc.Storage = st.ConnectionInfo() + + if err = setupCaching(configFile, &lc, opt.CachingOptions, f.UniqueID); err != nil { + return errors.Wrap(err, "unable to set up caching") + } + + d, err := json.MarshalIndent(&lc, "", " ") + if err != nil { + return err + } + + if err = os.MkdirAll(filepath.Dir(configFile), 0700); err != nil { + return errors.Wrap(err, "unable to create config directory") + } + + if err = ioutil.WriteFile(configFile, d, 0600); err != nil { + return errors.Wrap(err, "unable to write config file") + } + + // now verify that the repository can be opened with the provided config file. 
+	r, err := Open(ctx, configFile, password, nil)
+	if err != nil {
+		return err
+	}
+
+	return r.Close(ctx)
+}
+
+func setupCaching(configPath string, lc *LocalConfig, opt block.CachingOptions, uniqueID []byte) error {
+	if opt.MaxCacheSizeBytes == 0 {
+		lc.Caching = block.CachingOptions{}
+		return nil
+	}
+
+	if opt.CacheDirectory == "" {
+		cacheDir, err := os.UserCacheDir()
+		if err != nil {
+			return errors.Wrap(err, "unable to determine cache directory")
+		}
+
+		h := sha256.New()
+		h.Write(uniqueID) //nolint:errcheck
+		h.Write([]byte(configPath)) //nolint:errcheck
+		lc.Caching.CacheDirectory = filepath.Join(cacheDir, "kopia", hex.EncodeToString(h.Sum(nil))[0:16])
+	} else {
+		absCacheDir, err := filepath.Abs(opt.CacheDirectory)
+		if err != nil {
+			return err
+		}
+
+		lc.Caching.CacheDirectory = absCacheDir
+	}
+	lc.Caching.MaxCacheSizeBytes = opt.MaxCacheSizeBytes
+	lc.Caching.MaxListCacheDurationSec = opt.MaxListCacheDurationSec
+
+	log.Debugf("Creating cache directory '%v' with max size %v", lc.Caching.CacheDirectory, lc.Caching.MaxCacheSizeBytes)
+	if err := os.MkdirAll(lc.Caching.CacheDirectory, 0700); err != nil {
+		log.Warningf("unable to create cache directory: %v", err)
+	}
+	return nil
+}
+
+// Disconnect removes the specified configuration file and any local cache directories.
+func Disconnect(configFile string) error {
+	cfg, err := loadConfigFromFile(configFile)
+	if err != nil {
+		return err
+	}
+
+	if cfg.Caching.CacheDirectory != "" {
+		if err = os.RemoveAll(cfg.Caching.CacheDirectory); err != nil {
+			log.Warningf("unable to remove cache directory: %v", err)
+		}
+	}
+
+	return os.Remove(configFile)
+}
diff --git a/crypto_key_derivation.go b/crypto_key_derivation.go
new file mode 100644
index 000000000..704e263a5
--- /dev/null
+++ b/crypto_key_derivation.go
@@ -0,0 +1,33 @@
+package repo
+
+import (
+	"crypto/sha256"
+	"fmt"
+	"io"
+
+	"golang.org/x/crypto/hkdf"
+	"golang.org/x/crypto/scrypt"
+)
+
+// defaultKeyDerivationAlgorithm is the key derivation algorithm for new configurations.
+const defaultKeyDerivationAlgorithm = "scrypt-65536-8-1"
+
+func (f formatBlock) deriveMasterKeyFromPassword(password string) ([]byte, error) {
+	const masterKeySize = 32
+
+	switch f.KeyDerivationAlgorithm {
+	case "scrypt-65536-8-1":
+		return scrypt.Key([]byte(password), f.UniqueID, 65536, 8, 1, masterKeySize)
+
+	default:
+		return nil, fmt.Errorf("unsupported key algorithm: %v", f.KeyDerivationAlgorithm)
+	}
+}
+
+// deriveKeyFromMasterKey computes a key for a specific purpose and length using HKDF based on the master key.
+func deriveKeyFromMasterKey(masterKey, uniqueID, purpose []byte, length int) []byte {
+	key := make([]byte, length)
+	k := hkdf.New(sha256.New, masterKey, uniqueID, purpose)
+	io.ReadFull(k, key) //nolint:errcheck
+	return key
+}
diff --git a/doc.go b/doc.go
new file mode 100644
index 000000000..f5f82361d
--- /dev/null
+++ b/doc.go
@@ -0,0 +1,2 @@
+// Package repo implements content-addressable Repository on top of BLOB storage.
+package repo
diff --git a/examples/upload_download/main.go b/examples/upload_download/main.go
new file mode 100644
index 000000000..2dedc41bf
--- /dev/null
+++ b/examples/upload_download/main.go
@@ -0,0 +1,40 @@
+//+build !test
+
+// Command repository_api demonstrates the use of Kopia's Repository API.
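+//
+// It creates (or connects to) a filesystem-backed repository under /tmp,
+// uploads objects of growing sizes, reads them back and lists the resulting
+// storage blocks.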
+package main + +import ( + "context" + "log" + "os" + + "github.com/kopia/repo" +) + +func main() { + ctx := context.Background() + + if err := setupRepositoryAndConnect(ctx, masterPassword); err != nil { + log.Printf("unable to set up repository: %v", err) + os.Exit(1) + } + + r, err := repo.Open(ctx, configFile, masterPassword, nil) + if err != nil { + log.Printf("unable to open repository: %v", err) + os.Exit(1) + } + defer r.Close(ctx) //nolint:errcheck + + uploadAndDownloadObjects(ctx, r) + + // Now list blocks found in the repository. + blks, err := r.Blocks.ListBlocks("") + if err != nil { + log.Printf("err: %v", err) + } + + for _, b := range blks { + log.Printf("found block %v", b) + } +} diff --git a/examples/upload_download/setup_repository.go b/examples/upload_download/setup_repository.go new file mode 100644 index 000000000..8655074db --- /dev/null +++ b/examples/upload_download/setup_repository.go @@ -0,0 +1,56 @@ +//+build !test + +package main + +import ( + "context" + "fmt" + "os" + + "github.com/kopia/repo" + "github.com/kopia/repo/block" + "github.com/kopia/repo/storage/filesystem" + "github.com/kopia/repo/storage/logging" +) + +const ( + masterPassword = "my-password$!@#!@" + storageDir = "/tmp/kopia-example/storage" + configFile = "/tmp/kopia-example/config" + cacheDirectory = "/tmp/kopia-example/cache" +) + +func setupRepositoryAndConnect(ctx context.Context, password string) error { + if err := os.MkdirAll(storageDir, 0700); err != nil { + return fmt.Errorf("unable to create directory: %v", err) + } + st, err := filesystem.New(ctx, &filesystem.Options{ + Path: storageDir, + }) + if err != nil { + return fmt.Errorf("unable to connect to storage: %v", err) + } + + // set up logging so we can see what's going on + st = logging.NewWrapper(st) + + // see if we already have the config file, if not connect. + if _, err := os.Stat(configFile); os.IsNotExist(err) { + // initialize repository + if err := repo.Initialize(ctx, st, &repo.NewRepositoryOptions{}, password); err != nil { + return fmt.Errorf("unable to initialize repository: %v", err) + } + + // now establish connection to repository and create configuration file. 
+ if err := repo.Connect(ctx, configFile, st, password, repo.ConnectOptions{ + CachingOptions: block.CachingOptions{ + CacheDirectory: cacheDirectory, + MaxCacheSizeBytes: 100000000, + }, + }); err != nil { + return fmt.Errorf("unable to connect to repository: %v", err) + } + } + + return nil +} diff --git a/examples/upload_download/upload_download_objects.go b/examples/upload_download/upload_download_objects.go new file mode 100644 index 000000000..bfddf52a6 --- /dev/null +++ b/examples/upload_download/upload_download_objects.go @@ -0,0 +1,67 @@ +//+build !test + +package main + +import ( + "context" + "crypto/rand" + "io/ioutil" + "log" + "os" + + "github.com/kopia/repo" + "github.com/kopia/repo/object" +) + +func uploadRandomObject(ctx context.Context, r *repo.Repository, length int) (object.ID, error) { + w := r.Objects.NewWriter(ctx, object.WriterOptions{}) + defer w.Close() //nolint:errcheck + + buf := make([]byte, 256*1024) + for length > 0 { + todo := length + if todo > len(buf) { + todo = len(buf) + } + rand.Read(buf[0:todo]) //nolint:errcheck + if _, err := w.Write(buf[0:todo]); err != nil { + return "", err + } + length -= todo + } + return w.Result() +} + +func downloadObject(ctx context.Context, r *repo.Repository, oid object.ID) ([]byte, error) { + rd, err := r.Objects.Open(ctx, oid) + if err != nil { + return nil, err + } + defer rd.Close() //nolint:errcheck + + return ioutil.ReadAll(rd) +} + +func uploadAndDownloadObjects(ctx context.Context, r *repo.Repository) { + var oids []object.ID + + for size := 100; size < 100000000; size *= 2 { + log.Printf("uploading file with %v bytes", size) + oid, err := uploadRandomObject(ctx, r, size) + if err != nil { + log.Printf("unable to upload: %v", err) + os.Exit(1) + } + log.Printf("uploaded %v bytes as %v", size, oid) + oids = append(oids, oid) + } + + for _, oid := range oids { + log.Printf("downloading %q", oid) + b, err := downloadObject(ctx, r, oid) + if err != nil { + log.Printf("unable to read object: %v", err) + } + log.Printf("downloaded %v", len(b)) + } +} diff --git a/format_block.go b/format_block.go new file mode 100644 index 000000000..fffda5070 --- /dev/null +++ b/format_block.go @@ -0,0 +1,263 @@ +package repo + +import ( + "bytes" + "context" + "crypto/aes" + "crypto/cipher" + "crypto/hmac" + "crypto/rand" + "crypto/sha256" + "encoding/json" + "fmt" + "io" + + "github.com/kopia/repo/storage" + "github.com/pkg/errors" +) + +const defaultFormatEncryption = "AES256_GCM" + +const ( + maxChecksummedFormatBytesLength = 65000 + formatBlockChecksumSize = sha256.Size +) + +// formatBlockChecksumSecret is a HMAC secret used for checksumming the format block. +// It's not really a secret, but will provide positive identification of blocks that +// are repository format blocks. +var formatBlockChecksumSecret = []byte("kopia-repository") + +// FormatBlockID is the identifier of a storage block that describes repository format. 
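+// The block is stored as plain JSON (see writeFormatBlock) so that clients can
+// read the key-derivation parameters needed to decrypt EncryptedFormatBytes.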
+const FormatBlockID = "kopia.repository" + +var ( + purposeAESKey = []byte("AES") + purposeAuthData = []byte("CHECKSUM") + + errFormatBlockNotFound = errors.New("format block not found") +) + +type formatBlock struct { + Tool string `json:"tool"` + BuildVersion string `json:"buildVersion"` + BuildInfo string `json:"buildInfo"` + + UniqueID []byte `json:"uniqueID"` + KeyDerivationAlgorithm string `json:"keyAlgo"` + + Version string `json:"version"` + EncryptionAlgorithm string `json:"encryption"` + EncryptedFormatBytes []byte `json:"encryptedBlockFormat,omitempty"` + UnencryptedFormat *repositoryObjectFormat `json:"blockFormat,omitempty"` +} + +// encryptedRepositoryConfig contains the configuration of repository that's persisted in encrypted format. +type encryptedRepositoryConfig struct { + Format repositoryObjectFormat `json:"format"` +} + +func parseFormatBlock(b []byte) (*formatBlock, error) { + f := &formatBlock{} + + if err := json.Unmarshal(b, &f); err != nil { + return nil, errors.Wrap(err, "invalid format block") + } + + return f, nil +} + +// RecoverFormatBlock attempts to recover format block replica from the specified file. +// The format block can be either the prefix or a suffix of the given file. +// optionally the length can be provided (if known) to speed up recovery. +func RecoverFormatBlock(ctx context.Context, st storage.Storage, filename string, optionalLength int64) ([]byte, error) { + if optionalLength > 0 { + return recoverFormatBlockWithLength(ctx, st, filename, optionalLength) + } + + var foundMetadata storage.BlockMetadata + + if err := st.ListBlocks(ctx, filename, func(bm storage.BlockMetadata) error { + if foundMetadata.BlockID != "" { + return fmt.Errorf("found multiple blocks with a given prefix: %v", filename) + } + foundMetadata = bm + return nil + }); err != nil { + return nil, errors.Wrap(err, "error") + } + + if foundMetadata.BlockID == "" { + return nil, storage.ErrBlockNotFound + } + + return recoverFormatBlockWithLength(ctx, st, foundMetadata.BlockID, foundMetadata.Length) +} + +func recoverFormatBlockWithLength(ctx context.Context, st storage.Storage, filename string, length int64) ([]byte, error) { + chunkLength := int64(65536) + if chunkLength > length { + chunkLength = length + } + + if chunkLength > 4 { + + // try prefix + prefixChunk, err := st.GetBlock(ctx, filename, 0, chunkLength) + if err != nil { + return nil, err + } + if l := int(prefixChunk[0]) + int(prefixChunk[1])<<8; l <= maxChecksummedFormatBytesLength && l+2 < len(prefixChunk) { + if b, ok := verifyFormatBlockChecksum(prefixChunk[2 : 2+l]); ok { + return b, nil + } + } + + // try the suffix + suffixChunk, err := st.GetBlock(ctx, filename, length-chunkLength, chunkLength) + if err != nil { + return nil, err + } + if l := int(suffixChunk[len(suffixChunk)-2]) + int(suffixChunk[len(suffixChunk)-1])<<8; l <= maxChecksummedFormatBytesLength && l+2 < len(suffixChunk) { + if b, ok := verifyFormatBlockChecksum(suffixChunk[len(suffixChunk)-2-l : len(suffixChunk)-2]); ok { + return b, nil + } + } + } + + return nil, errFormatBlockNotFound +} + +func verifyFormatBlockChecksum(b []byte) ([]byte, bool) { + if len(b) < formatBlockChecksumSize { + return nil, false + } + + data, checksum := b[0:len(b)-formatBlockChecksumSize], b[len(b)-formatBlockChecksumSize:] + h := hmac.New(sha256.New, formatBlockChecksumSecret) + h.Write(data) //nolint:errcheck + actualChecksum := h.Sum(nil) + if !hmac.Equal(actualChecksum, checksum) { + return nil, false + } + + return data, true +} + +func 
writeFormatBlock(ctx context.Context, st storage.Storage, f *formatBlock) error { + var buf bytes.Buffer + e := json.NewEncoder(&buf) + e.SetIndent("", " ") + if err := e.Encode(f); err != nil { + return errors.Wrap(err, "unable to marshal format block") + } + + if err := st.PutBlock(ctx, FormatBlockID, buf.Bytes()); err != nil { + return errors.Wrap(err, "unable to write format block") + } + + return nil +} + +func (f *formatBlock) decryptFormatBytes(masterKey []byte) (*repositoryObjectFormat, error) { + switch f.EncryptionAlgorithm { + case "NONE": // do nothing + return f.UnencryptedFormat, nil + + case "AES256_GCM": + aead, authData, err := initCrypto(masterKey, f.UniqueID) + if err != nil { + return nil, errors.Wrap(err, "cannot initialize cipher") + } + + content := append([]byte(nil), f.EncryptedFormatBytes...) + if len(content) < aead.NonceSize() { + return nil, fmt.Errorf("invalid encrypted payload, too short") + } + nonce := content[0:aead.NonceSize()] + payload := content[aead.NonceSize():] + + plainText, err := aead.Open(payload[:0], nonce, payload, authData) + if err != nil { + return nil, fmt.Errorf("unable to decrypt repository format, invalid credentials?") + } + + var erc encryptedRepositoryConfig + if err := json.Unmarshal(plainText, &erc); err != nil { + return nil, errors.Wrap(err, "invalid repository format") + } + + return &erc.Format, nil + + default: + return nil, fmt.Errorf("unknown encryption algorithm: '%v'", f.EncryptionAlgorithm) + } +} + +func initCrypto(masterKey, repositoryID []byte) (cipher.AEAD, []byte, error) { + aesKey := deriveKeyFromMasterKey(masterKey, repositoryID, purposeAESKey, 32) + authData := deriveKeyFromMasterKey(masterKey, repositoryID, purposeAuthData, 32) + + blk, err := aes.NewCipher(aesKey) + if err != nil { + return nil, nil, errors.Wrap(err, "cannot create cipher") + } + aead, err := cipher.NewGCM(blk) + if err != nil { + return nil, nil, errors.Wrap(err, "cannot create cipher") + } + + return aead, authData, nil +} + +func encryptFormatBytes(f *formatBlock, format *repositoryObjectFormat, masterKey, repositoryID []byte) error { + switch f.EncryptionAlgorithm { + case "NONE": + f.UnencryptedFormat = format + return nil + + case "AES256_GCM": + content, err := json.Marshal(&encryptedRepositoryConfig{Format: *format}) + if err != nil { + return errors.Wrap(err, "can't marshal format to JSON") + } + aead, authData, err := initCrypto(masterKey, repositoryID) + if err != nil { + return errors.Wrap(err, "unable to initialize crypto") + } + nonceLength := aead.NonceSize() + noncePlusContentLength := nonceLength + len(content) + cipherText := make([]byte, noncePlusContentLength+aead.Overhead()) + + // Store nonce at the beginning of ciphertext. 
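+		// Resulting layout: nonce || ciphertext+tag (the output of Seal), in a single buffer.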
+ nonce := cipherText[0:nonceLength] + if _, err := io.ReadFull(rand.Reader, nonce); err != nil { + return err + } + + b := aead.Seal(cipherText[nonceLength:nonceLength], nonce, content, authData) + content = nonce[0 : nonceLength+len(b)] + f.EncryptedFormatBytes = content + return nil + + default: + return fmt.Errorf("unknown encryption algorithm: '%v'", f.EncryptionAlgorithm) + } +} + +func addFormatBlockChecksumAndLength(fb []byte) ([]byte, error) { + h := hmac.New(sha256.New, formatBlockChecksumSecret) + h.Write(fb) //nolint:errcheck + checksummedFormatBytes := h.Sum(fb) + + l := len(checksummedFormatBytes) + if l > maxChecksummedFormatBytesLength { + return nil, fmt.Errorf("format block too big: %v", l) + } + + // return + result := append([]byte(nil), byte(l), byte(l>>8)) + result = append(result, checksummedFormatBytes...) + result = append(result, byte(l), byte(l>>8)) + return result, nil +} diff --git a/format_block_test.go b/format_block_test.go new file mode 100644 index 000000000..e1bd302aa --- /dev/null +++ b/format_block_test.go @@ -0,0 +1,79 @@ +package repo + +import ( + "context" + "crypto/sha256" + "reflect" + "testing" + + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" +) + +func TestFormatBlockRecovery(t *testing.T) { + data := map[string][]byte{} + st := storagetesting.NewMapStorage(data, nil, nil) + ctx := context.Background() + + someDataBlock := []byte("aadsdasdas") + checksummed, err := addFormatBlockChecksumAndLength(someDataBlock) + if err != nil { + t.Errorf("error appending checksum: %v", err) + } + if got, want := len(checksummed), 2+2+sha256.Size+len(someDataBlock); got != want { + t.Errorf("unexpected checksummed length: %v, want %v", got, want) + } + + assertNoError(t, st.PutBlock(ctx, "some-block-by-itself", checksummed)) + assertNoError(t, st.PutBlock(ctx, "some-block-suffix", append(append([]byte(nil), 1, 2, 3), checksummed...))) + assertNoError(t, st.PutBlock(ctx, "some-block-prefix", append(append([]byte(nil), checksummed...), 1, 2, 3))) + + // mess up checksum + checksummed[len(checksummed)-3] ^= 1 + assertNoError(t, st.PutBlock(ctx, "bad-checksum", checksummed)) + assertNoError(t, st.PutBlock(ctx, "zero-len", []byte{})) + assertNoError(t, st.PutBlock(ctx, "one-len", []byte{1})) + assertNoError(t, st.PutBlock(ctx, "two-len", []byte{1, 2})) + assertNoError(t, st.PutBlock(ctx, "three-len", []byte{1, 2, 3})) + assertNoError(t, st.PutBlock(ctx, "four-len", []byte{1, 2, 3, 4})) + assertNoError(t, st.PutBlock(ctx, "five-len", []byte{1, 2, 3, 4, 5})) + + cases := []struct { + block string + err error + }{ + {"some-block-by-itself", nil}, + {"some-block-suffix", nil}, + {"some-block-prefix", nil}, + {"bad-checksum", errFormatBlockNotFound}, + {"no-such-block", storage.ErrBlockNotFound}, + {"zero-len", errFormatBlockNotFound}, + {"one-len", errFormatBlockNotFound}, + {"two-len", errFormatBlockNotFound}, + {"three-len", errFormatBlockNotFound}, + {"four-len", errFormatBlockNotFound}, + {"five-len", errFormatBlockNotFound}, + } + + for _, tc := range cases { + t.Run(tc.block, func(t *testing.T) { + v, err := RecoverFormatBlock(ctx, st, tc.block, -1) + if tc.err == nil { + if !reflect.DeepEqual(v, someDataBlock) || err != nil { + t.Errorf("unexpected result or error: v=%v err=%v, expected success", v, err) + } + } else { + if v != nil || err != tc.err { + t.Errorf("unexpected result or error: v=%v err=%v, expected %v", v, err, tc.err) + } + } + }) + } +} + +func assertNoError(t *testing.T, err error) { + t.Helper() + if err != 
nil { + t.Errorf("err: %v", err) + } +} diff --git a/initialize.go b/initialize.go new file mode 100644 index 000000000..a654168c9 --- /dev/null +++ b/initialize.go @@ -0,0 +1,132 @@ +package repo + +import ( + "context" + "crypto/rand" + "fmt" + "io" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/object" + "github.com/kopia/repo/storage" + "github.com/pkg/errors" +) + +// BuildInfo is the build information of Kopia. +var ( + BuildInfo = "unknown" + BuildVersion = "v0-unofficial" +) + +// NewRepositoryOptions specifies options that apply to newly created repositories. +// All fields are optional, when not provided, reasonable defaults will be used. +type NewRepositoryOptions struct { + UniqueID []byte // force the use of particular unique ID + BlockFormat block.FormattingOptions + DisableHMAC bool + ObjectFormat object.Format // object format +} + +// Initialize creates initial repository data structures in the specified storage with given credentials. +func Initialize(ctx context.Context, st storage.Storage, opt *NewRepositoryOptions, password string) error { + if opt == nil { + opt = &NewRepositoryOptions{} + } + + // get the block - expect ErrBlockNotFound + _, err := st.GetBlock(ctx, FormatBlockID, 0, -1) + if err == nil { + return fmt.Errorf("repository already initialized") + } + if err != storage.ErrBlockNotFound { + return err + } + + format := formatBlockFromOptions(opt) + masterKey, err := format.deriveMasterKeyFromPassword(password) + if err != nil { + return errors.Wrap(err, "unable to derive master key") + } + + if err := encryptFormatBytes(format, repositoryObjectFormatFromOptions(opt), masterKey, format.UniqueID); err != nil { + return errors.Wrap(err, "unable to encrypt format bytes") + } + + if err := writeFormatBlock(ctx, st, format); err != nil { + return errors.Wrap(err, "unable to write format block") + } + + return nil +} + +func formatBlockFromOptions(opt *NewRepositoryOptions) *formatBlock { + f := &formatBlock{ + Tool: "https://github.com/kopia/kopia", + BuildInfo: BuildInfo, + KeyDerivationAlgorithm: defaultKeyDerivationAlgorithm, + UniqueID: applyDefaultRandomBytes(opt.UniqueID, 32), + Version: "1", + EncryptionAlgorithm: defaultFormatEncryption, + } + + if opt.BlockFormat.Encryption == "NONE" { + f.EncryptionAlgorithm = "NONE" + } + + return f +} + +func repositoryObjectFormatFromOptions(opt *NewRepositoryOptions) *repositoryObjectFormat { + f := &repositoryObjectFormat{ + FormattingOptions: block.FormattingOptions{ + Version: 1, + Hash: applyDefaultString(opt.BlockFormat.Hash, block.DefaultHash), + Encryption: applyDefaultString(opt.BlockFormat.Encryption, block.DefaultEncryption), + HMACSecret: applyDefaultRandomBytes(opt.BlockFormat.HMACSecret, 32), + MasterKey: applyDefaultRandomBytes(opt.BlockFormat.MasterKey, 32), + MaxPackSize: applyDefaultInt(opt.BlockFormat.MaxPackSize, applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20)), // 20 MB + }, + Format: object.Format{ + Splitter: applyDefaultString(opt.ObjectFormat.Splitter, object.DefaultSplitter), + MaxBlockSize: applyDefaultInt(opt.ObjectFormat.MaxBlockSize, 20<<20), // 20MiB + MinBlockSize: applyDefaultInt(opt.ObjectFormat.MinBlockSize, 10<<20), // 10MiB + AvgBlockSize: applyDefaultInt(opt.ObjectFormat.AvgBlockSize, 16<<20), // 16MiB + }, + } + + if opt.DisableHMAC { + f.HMACSecret = nil + } + + return f +} + +func randomBytes(n int) []byte { + b := make([]byte, n) + io.ReadFull(rand.Reader, b) //nolint:errcheck + return b +} + +func applyDefaultInt(v, def int) int { + if v == 0 { + return 
def + } + + return v +} + +func applyDefaultString(v, def string) string { + if v == "" { + return def + } + + return v +} + +func applyDefaultRandomBytes(b []byte, n int) []byte { + if b == nil { + return randomBytes(n) + } + + return b +} diff --git a/internal/repologging/logging.go b/internal/repologging/logging.go new file mode 100644 index 000000000..d72eb0110 --- /dev/null +++ b/internal/repologging/logging.go @@ -0,0 +1,9 @@ +// Package repologging provides loggers. +package repologging + +import "github.com/op/go-logging" + +// Logger returns an instance of a logger used throughout repository codebase. +func Logger(module string) *logging.Logger { + return logging.MustGetLogger(module) +} diff --git a/internal/repotesting/repotesting.go b/internal/repotesting/repotesting.go index da41a7814..816730911 100644 --- a/internal/repotesting/repotesting.go +++ b/internal/repotesting/repotesting.go @@ -23,6 +23,7 @@ type Environment struct { configDir string storageDir string + connected bool } // Setup sets up a test environment. @@ -75,6 +76,8 @@ func (e *Environment) Setup(t *testing.T, opts ...func(*repo.NewRepositoryOption t.Fatalf("can't connect: %v", err) } + e.connected = true + e.Repository, err = repo.Open(ctx, e.configFile(), masterPassword, &repo.Options{}) if err != nil { t.Fatalf("can't open: %v", err) @@ -88,8 +91,13 @@ func (e *Environment) Close(t *testing.T) { if err := e.Repository.Close(context.Background()); err != nil { t.Fatalf("unable to close: %v", err) } - - if err := os.RemoveAll(e.configDir); err != nil { + if e.connected { + if err := repo.Disconnect(e.configFile()); err != nil { + t.Errorf("error disconnecting: %v", err) + } + } + if err := os.Remove(e.configDir); err != nil { + // should be empty, assuming Disconnect was successful t.Errorf("error removing config directory: %v", err) } if err := os.RemoveAll(e.storageDir); err != nil { diff --git a/internal/retry/retry.go b/internal/retry/retry.go new file mode 100644 index 000000000..24e7036b2 --- /dev/null +++ b/internal/retry/retry.go @@ -0,0 +1,44 @@ +// Package retry implements exponential retry policy. +package retry + +import ( + "fmt" + "time" + + "github.com/kopia/repo/internal/repologging" +) + +var log = repologging.Logger("repo/retry") + +var ( + maxAttempts = 10 + retryInitialSleepAmount = 1 * time.Second + retryMaxSleepAmount = 32 * time.Second +) + +// AttemptFunc performs an attempt and returns a value (optional, may be nil) and an error. +type AttemptFunc func() (interface{}, error) + +// IsRetriableFunc is a function that determines whether an error is retriable. +type IsRetriableFunc func(err error) bool + +// WithExponentialBackoff runs the provided attempt until it succeeds, retrying on all errors that are +// deemed retriable by the provided function. The delay between retries grows exponentially up to +// a certain limit. 
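+//
+// Example use (a sketch; isRetriableStorageError is an assumed caller-supplied predicate):
+//
+//	v, err := retry.WithExponentialBackoff("read block", func() (interface{}, error) {
+//		return st.GetBlock(ctx, blockID, 0, -1)
+//	}, isRetriableStorageError)
+//	if err == nil {
+//		data := v.([]byte)
+//		_ = data
+//	}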
+func WithExponentialBackoff(desc string, attempt AttemptFunc, isRetriableError IsRetriableFunc) (interface{}, error) { + sleepAmount := retryInitialSleepAmount + for i := 0; i < maxAttempts; i++ { + v, err := attempt() + if !isRetriableError(err) { + return v, err + } + log.Debugf("got error %v when %v (#%v), sleeping for %v before retrying", err, desc, i, sleepAmount) + time.Sleep(sleepAmount) + sleepAmount *= 2 + if sleepAmount > retryMaxSleepAmount { + sleepAmount = retryMaxSleepAmount + } + } + + return nil, fmt.Errorf("unable to complete %v despite %v retries", desc, maxAttempts) +} diff --git a/internal/retry/retry_test.go b/internal/retry/retry_test.go new file mode 100644 index 000000000..18b2e7e5d --- /dev/null +++ b/internal/retry/retry_test.go @@ -0,0 +1,59 @@ +package retry + +import ( + "errors" + "fmt" + "reflect" + "testing" + "time" +) + +var ( + errRetriable = errors.New("retriable") +) + +func isRetriable(e error) bool { + return e == errRetriable +} + +func TestRetry(t *testing.T) { + retryInitialSleepAmount = 10 * time.Millisecond + retryMaxSleepAmount = 20 * time.Millisecond + maxAttempts = 3 + + cnt := 0 + + cases := []struct { + desc string + f func() (interface{}, error) + want interface{} + wantError error + }{ + {"success-nil", func() (interface{}, error) { return nil, nil }, nil, nil}, + {"success", func() (interface{}, error) { return 3, nil }, 3, nil}, + {"retriable-succeeds", func() (interface{}, error) { + cnt++ + if cnt < 2 { + return nil, errRetriable + } + return 4, nil + }, 4, nil}, + {"retriable-never-succeeds", func() (interface{}, error) { return nil, errRetriable }, nil, fmt.Errorf("unable to complete retriable-never-succeeds despite 3 retries")}, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + tc := tc + t.Parallel() + + got, err := WithExponentialBackoff(tc.desc, tc.f, isRetriable) + if !reflect.DeepEqual(err, tc.wantError) { + t.Errorf("invalid error %q, wanted %q", err, tc.wantError) + } + + if got != tc.want { + t.Errorf("invalid value %v, wanted %v", got, tc.want) + } + }) + } +} diff --git a/internal/storagetesting/asserts.go b/internal/storagetesting/asserts.go new file mode 100644 index 000000000..a6a61e498 --- /dev/null +++ b/internal/storagetesting/asserts.go @@ -0,0 +1,110 @@ +package storagetesting + +import ( + "bytes" + "context" + "reflect" + "sort" + "testing" + + "github.com/kopia/repo/storage" +) + +// AssertGetBlock asserts that the specified storage block has correct content. 
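+// Besides a full read it also exercises zero-length, first-half, second-half and
+// invalid offset/length reads.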
+func AssertGetBlock(ctx context.Context, t *testing.T, s storage.Storage, block string, expected []byte) { + t.Helper() + + b, err := s.GetBlock(ctx, block, 0, -1) + if err != nil { + t.Errorf("GetBlock(%v) returned error %v, expected data: %v", block, err, expected) + return + } + + if !bytes.Equal(b, expected) { + t.Errorf("GetBlock(%v) returned %x, but expected %x", block, b, expected) + } + + half := int64(len(expected) / 2) + if half == 0 { + return + } + + b, err = s.GetBlock(ctx, block, 0, 0) + if err != nil { + t.Errorf("GetBlock(%v) returned error %v, expected data: %v", block, err, expected) + return + } + + if len(b) != 0 { + t.Errorf("GetBlock(%v) returned non-zero length: %v", block, len(b)) + return + } + + b, err = s.GetBlock(ctx, block, 0, half) + if err != nil { + t.Errorf("GetBlock(%v) returned error %v, expected data: %v", block, err, expected) + return + } + + if !bytes.Equal(b, expected[0:half]) { + t.Errorf("GetBlock(%v) returned %x, but expected %x", block, b, expected[0:half]) + } + + b, err = s.GetBlock(ctx, block, half, int64(len(expected))-half) + if err != nil { + t.Errorf("GetBlock(%v) returned error %v, expected data: %v", block, err, expected) + return + } + + if !bytes.Equal(b, expected[len(expected)-int(half):]) { + t.Errorf("GetBlock(%v) returned %x, but expected %x", block, b, expected[len(expected)-int(half):]) + } + + AssertInvalidOffsetLength(ctx, t, s, block, -3, 1) + AssertInvalidOffsetLength(ctx, t, s, block, int64(len(expected)), 3) + AssertInvalidOffsetLength(ctx, t, s, block, int64(len(expected)-1), 3) + AssertInvalidOffsetLength(ctx, t, s, block, int64(len(expected)+1), 3) +} + +// AssertInvalidOffsetLength verifies that the given combination of (offset,length) fails on GetBlock() +func AssertInvalidOffsetLength(ctx context.Context, t *testing.T, s storage.Storage, block string, offset, length int64) { + if _, err := s.GetBlock(ctx, block, offset, length); err == nil { + t.Errorf("GetBlock(%v,%v,%v) did not return error for invalid offset/length", block, offset, length) + } +} + +// AssertGetBlockNotFound asserts that GetBlock() for specified storage block returns ErrBlockNotFound. +func AssertGetBlockNotFound(ctx context.Context, t *testing.T, s storage.Storage, block string) { + t.Helper() + + b, err := s.GetBlock(ctx, block, 0, -1) + if err != storage.ErrBlockNotFound || b != nil { + t.Errorf("GetBlock(%v) returned %v, %v but expected ErrBlockNotFound", block, b, err) + } +} + +// AssertListResults asserts that the list results with given prefix return the specified list of names in order. +func AssertListResults(ctx context.Context, t *testing.T, s storage.Storage, prefix string, want ...string) { + t.Helper() + var names []string + + if err := s.ListBlocks(ctx, prefix, func(e storage.BlockMetadata) error { + names = append(names, e.BlockID) + return nil + }); err != nil { + t.Fatalf("err: %v", err) + } + + names = sorted(names) + want = sorted(want) + + if !reflect.DeepEqual(names, want) { + t.Errorf("ListBlocks(%v) returned %v, but wanted %v", prefix, names, want) + } +} + +func sorted(s []string) []string { + x := append([]string(nil), s...) + sort.Strings(x) + return x +} diff --git a/internal/storagetesting/doc.go b/internal/storagetesting/doc.go new file mode 100644 index 000000000..0c2582e88 --- /dev/null +++ b/internal/storagetesting/doc.go @@ -0,0 +1,2 @@ +// Package storagetesting is used for testing Storage implementations. 
+package storagetesting diff --git a/internal/storagetesting/faulty.go b/internal/storagetesting/faulty.go new file mode 100644 index 000000000..aa5970bc7 --- /dev/null +++ b/internal/storagetesting/faulty.go @@ -0,0 +1,115 @@ +package storagetesting + +import ( + "context" + "sync" + "time" + + "github.com/kopia/repo/internal/repologging" + "github.com/kopia/repo/storage" +) + +var log = repologging.Logger("faulty-storage") + +// Fault describes the behavior of a single fault. +type Fault struct { + Repeat int // how many times to repeat this fault + Sleep time.Duration // sleep before returning + ErrCallback func() error + WaitFor chan struct{} // waits until the given channel is closed before returning + Err error // error to return (can be nil in combination with Sleep and WaitFor) +} + +// FaultyStorage implements fault injection for Storage. +type FaultyStorage struct { + Base storage.Storage + Faults map[string][]*Fault + + mu sync.Mutex +} + +// GetBlock implements storage.Storage +func (s *FaultyStorage) GetBlock(ctx context.Context, id string, offset, length int64) ([]byte, error) { + if err := s.getNextFault("GetBlock", id, offset, length); err != nil { + return nil, err + } + return s.Base.GetBlock(ctx, id, offset, length) +} + +// PutBlock implements storage.Storage +func (s *FaultyStorage) PutBlock(ctx context.Context, id string, data []byte) error { + if err := s.getNextFault("PutBlock", id, len(data)); err != nil { + return err + } + return s.Base.PutBlock(ctx, id, data) +} + +// DeleteBlock implements storage.Storage +func (s *FaultyStorage) DeleteBlock(ctx context.Context, id string) error { + if err := s.getNextFault("DeleteBlock", id); err != nil { + return err + } + return s.Base.DeleteBlock(ctx, id) +} + +// ListBlocks implements storage.Storage +func (s *FaultyStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + if err := s.getNextFault("ListBlocks", prefix); err != nil { + return err + } + + return s.Base.ListBlocks(ctx, prefix, func(bm storage.BlockMetadata) error { + if err := s.getNextFault("ListBlocksItem", prefix); err != nil { + return err + } + return callback(bm) + }) +} + +// Close implements storage.Storage +func (s *FaultyStorage) Close(ctx context.Context) error { + if err := s.getNextFault("Close"); err != nil { + return err + } + return s.Base.Close(ctx) +} + +// ConnectionInfo implements storage.Storage +func (s *FaultyStorage) ConnectionInfo() storage.ConnectionInfo { + return s.Base.ConnectionInfo() +} + +func (s *FaultyStorage) getNextFault(method string, args ...interface{}) error { + s.mu.Lock() + faults := s.Faults[method] + if len(faults) == 0 { + s.mu.Unlock() + log.Debugf("no faults for %v %v", method, args) + return nil + } + + f := faults[0] + if f.Repeat > 0 { + f.Repeat-- + log.Debugf("will repeat %v more times the fault for %v %v", f.Repeat, method, args) + } else { + s.Faults[method] = faults[1:] + } + s.mu.Unlock() + if f.WaitFor != nil { + log.Debugf("waiting for channel to be closed in %v %v", method, args) + <-f.WaitFor + } + if f.Sleep > 0 { + log.Debugf("sleeping for %v in %v %v", f.Sleep, method, args) + } + if f.ErrCallback != nil { + err := f.ErrCallback() + log.Debugf("returning %v for %v %v", err, method, args) + return err + } + log.Debugf("returning %v for %v %v", f.Err, method, args) + return f.Err +} + +var _ storage.Storage = (*FaultyStorage)(nil) diff --git a/internal/storagetesting/map.go b/internal/storagetesting/map.go new file mode 100644 index 
000000000..ed01aa415 --- /dev/null +++ b/internal/storagetesting/map.go @@ -0,0 +1,133 @@ +package storagetesting + +import ( + "context" + "errors" + "sort" + "strings" + "sync" + "time" + + "github.com/kopia/repo/storage" +) + +type mapStorage struct { + data map[string][]byte + keyTime map[string]time.Time + timeNow func() time.Time + mutex sync.RWMutex +} + +func (s *mapStorage) GetBlock(ctx context.Context, id string, offset, length int64) ([]byte, error) { + s.mutex.RLock() + defer s.mutex.RUnlock() + + data, ok := s.data[id] + if ok { + data = append([]byte(nil), data...) + if length < 0 { + return data, nil + } + + if int(offset) > len(data) || offset < 0 { + return nil, errors.New("invalid offset") + } + + data = data[offset:] + if int(length) > len(data) { + return nil, errors.New("invalid length") + } + return data[0:length], nil + } + + return nil, storage.ErrBlockNotFound +} + +func (s *mapStorage) PutBlock(ctx context.Context, id string, data []byte) error { + s.mutex.Lock() + defer s.mutex.Unlock() + + if _, ok := s.data[id]; ok { + return nil + } + + s.keyTime[id] = s.timeNow() + s.data[id] = append([]byte{}, data...) + return nil +} + +func (s *mapStorage) DeleteBlock(ctx context.Context, id string) error { + s.mutex.Lock() + defer s.mutex.Unlock() + + delete(s.data, id) + delete(s.keyTime, id) + return nil +} + +func (s *mapStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + s.mutex.RLock() + + keys := []string{} + for k := range s.data { + if strings.HasPrefix(k, prefix) { + keys = append(keys, k) + } + } + s.mutex.RUnlock() + + sort.Strings(keys) + + for _, k := range keys { + s.mutex.RLock() + v, ok := s.data[k] + ts := s.keyTime[k] + s.mutex.RUnlock() + if !ok { + continue + } + if err := callback(storage.BlockMetadata{ + BlockID: k, + Length: int64(len(v)), + Timestamp: ts, + }); err != nil { + return err + } + } + return nil +} + +func (s *mapStorage) Close(ctx context.Context) error { + return nil +} + +func (s *mapStorage) TouchBlock(ctx context.Context, blockID string, threshold time.Duration) error { + s.mutex.Lock() + defer s.mutex.Unlock() + + if v, ok := s.keyTime[blockID]; ok { + n := s.timeNow() + if n.Sub(v) >= threshold { + s.keyTime[blockID] = n + } + } + + return nil +} + +func (s *mapStorage) ConnectionInfo() storage.ConnectionInfo { + // unsupported + return storage.ConnectionInfo{} +} + +// NewMapStorage returns an implementation of Storage backed by the contents of given map. +// Used primarily for testing. 
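+//
+// Typical use in a test (sketch; ctx is any context.Context):
+//
+//	data := map[string][]byte{}
+//	st := NewMapStorage(data, nil, nil)
+//	_ = st.PutBlock(ctx, "some-block", []byte{1, 2, 3})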
+func NewMapStorage(data map[string][]byte, keyTime map[string]time.Time, timeNow func() time.Time) storage.Storage { + if keyTime == nil { + keyTime = make(map[string]time.Time) + } + if timeNow == nil { + timeNow = time.Now + } + return &mapStorage{data: data, keyTime: keyTime, timeNow: timeNow} +} diff --git a/internal/storagetesting/map_test.go b/internal/storagetesting/map_test.go new file mode 100644 index 000000000..238276cbd --- /dev/null +++ b/internal/storagetesting/map_test.go @@ -0,0 +1,15 @@ +package storagetesting + +import ( + "context" + "testing" +) + +func TestMapStorage(t *testing.T) { + data := map[string][]byte{} + r := NewMapStorage(data, nil, nil) + if r == nil { + t.Errorf("unexpected result: %v", r) + } + VerifyStorage(context.Background(), t, r) +} diff --git a/internal/storagetesting/verify.go b/internal/storagetesting/verify.go new file mode 100644 index 000000000..3e384a001 --- /dev/null +++ b/internal/storagetesting/verify.go @@ -0,0 +1,84 @@ +package storagetesting + +import ( + "bytes" + "context" + "reflect" + "testing" + + "github.com/kopia/repo/storage" +) + +// VerifyStorage verifies the behavior of the specified storage. +func VerifyStorage(ctx context.Context, t *testing.T, r storage.Storage) { + blocks := []struct { + blk string + contents []byte + }{ + {blk: string("abcdbbf4f0507d054ed5a80a5b65086f602b"), contents: []byte{}}, + {blk: string("zxce0e35630770c54668a8cfb4e414c6bf8f"), contents: []byte{1}}, + {blk: string("abff4585856ebf0748fd989e1dd623a8963d"), contents: bytes.Repeat([]byte{1}, 1000)}, + {blk: string("abgc3dca496d510f492c858a2df1eb824e62"), contents: bytes.Repeat([]byte{1}, 10000)}, + {blk: string("kopia.repository"), contents: bytes.Repeat([]byte{2}, 100)}, + } + + // First verify that blocks don't exist. + for _, b := range blocks { + AssertGetBlockNotFound(ctx, t, r, b.blk) + } + + ctx2 := storage.WithUploadProgressCallback(ctx, func(desc string, completed, total int64) { + log.Infof("progress %v: %v/%v", desc, completed, total) + }) + + // Now add blocks. + for _, b := range blocks { + if err := r.PutBlock(ctx2, b.blk, b.contents); err != nil { + t.Errorf("can't put block: %v", err) + } + + AssertGetBlock(ctx, t, r, b.blk, b.contents) + } + + AssertListResults(ctx, t, r, "", blocks[0].blk, blocks[1].blk, blocks[2].blk, blocks[3].blk, blocks[4].blk) + AssertListResults(ctx, t, r, "ab", blocks[0].blk, blocks[2].blk, blocks[3].blk) + + // Overwrite blocks. 
+ for _, b := range blocks { + if err := r.PutBlock(ctx, b.blk, b.contents); err != nil { + t.Errorf("can't put block: %v", err) + } + + AssertGetBlock(ctx, t, r, b.blk, b.contents) + } + + if err := r.DeleteBlock(ctx, blocks[0].blk); err != nil { + t.Errorf("unable to delete block: %v", err) + } + if err := r.DeleteBlock(ctx, blocks[0].blk); err != nil { + t.Errorf("invalid error when deleting deleted block: %v", err) + } + AssertListResults(ctx, t, r, "ab", blocks[2].blk, blocks[3].blk) + AssertListResults(ctx, t, r, "", blocks[1].blk, blocks[2].blk, blocks[3].blk, blocks[4].blk) +} + +// AssertConnectionInfoRoundTrips verifies that the ConnectionInfo returned by a given storage can be used to create +// equivalent storage +func AssertConnectionInfoRoundTrips(ctx context.Context, t *testing.T, s storage.Storage) { + t.Helper() + + ci := s.ConnectionInfo() + s2, err := storage.NewStorage(ctx, ci) + if err != nil { + t.Fatalf("err: %v", err) + } + + ci2 := s2.ConnectionInfo() + if !reflect.DeepEqual(ci, ci2) { + t.Errorf("connection info does not round-trip: %v vs %v", ci, ci2) + } + + if err := s2.Close(ctx); err != nil { + t.Errorf("unable to close storage: %v", err) + } +} diff --git a/internal/throttle/round_tripper.go b/internal/throttle/round_tripper.go new file mode 100644 index 000000000..c59a2deb3 --- /dev/null +++ b/internal/throttle/round_tripper.go @@ -0,0 +1,44 @@ +package throttle + +import ( + "io" + "net/http" +) + +type throttlerPool interface { + AddReader(io.ReadCloser) (io.ReadCloser, error) +} + +type throttlingRoundTripper struct { + base http.RoundTripper + downloadPool throttlerPool + uploadPool throttlerPool +} + +func (rt *throttlingRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + if req.Body != nil && rt.uploadPool != nil { + var err error + req.Body, err = rt.uploadPool.AddReader(req.Body) + if err != nil { + return nil, err + } + } + resp, err := rt.base.RoundTrip(req) + if resp != nil && resp.Body != nil && rt.downloadPool != nil { + resp.Body, err = rt.downloadPool.AddReader(resp.Body) + } + return resp, err +} + +// NewRoundTripper returns http.RoundTripper that throttles upload and downloads. 
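+//
+// Illustrative sketch only (downloadPool and uploadPool are assumed to be
+// implementations of the throttlerPool interface above):
+//
+//   client := &http.Client{
+//       Transport: NewRoundTripper(http.DefaultTransport, downloadPool, uploadPool),
+//   }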
+func NewRoundTripper(base http.RoundTripper, downloadPool throttlerPool, uploadPool throttlerPool) http.RoundTripper { + if base == nil { + base = http.DefaultTransport + } + + return &throttlingRoundTripper{ + base: base, + downloadPool: downloadPool, + uploadPool: uploadPool, + } +} diff --git a/internal/throttle/round_tripper_test.go b/internal/throttle/round_tripper_test.go new file mode 100644 index 000000000..50812f3b4 --- /dev/null +++ b/internal/throttle/round_tripper_test.go @@ -0,0 +1,103 @@ +package throttle + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "net/http" + "testing" +) + +type baseRoundTripper struct { + responses map[*http.Request]*http.Response +} + +func (rt *baseRoundTripper) add(req *http.Request, resp *http.Response) (*http.Request, *http.Response) { + rt.responses[req] = resp + return req, resp +} + +func (rt *baseRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + resp := rt.responses[req] + if resp != nil { + return resp, nil + } + + return nil, fmt.Errorf("error occurred") +} + +type fakePool struct { + readers []io.ReadCloser +} + +func (fp *fakePool) reset() { + fp.readers = nil +} + +func (fp *fakePool) AddReader(r io.ReadCloser) (io.ReadCloser, error) { + fp.readers = append(fp.readers, r) + return r, nil +} + +func TestRoundTripper(t *testing.T) { + downloadBody := ioutil.NopCloser(bytes.NewReader([]byte("data1"))) + uploadBody := ioutil.NopCloser(bytes.NewReader([]byte("data1"))) + + base := &baseRoundTripper{ + responses: make(map[*http.Request]*http.Response), + } + downloadPool := &fakePool{} + uploadPool := &fakePool{} + rt := NewRoundTripper(base, downloadPool, uploadPool) + + // Empty request (no request, no response) + uploadPool.reset() + downloadPool.reset() + req1, resp1 := base.add(&http.Request{}, &http.Response{}) + resp, err := rt.RoundTrip(req1) + if resp != resp1 || err != nil { + t.Errorf("invalid response or error: %v", err) + } + if len(downloadPool.readers) != 0 || len(uploadPool.readers) != 0 { + t.Errorf("invalid pool contents: %v %v", downloadPool.readers, uploadPool.readers) + } + + // Upload request + uploadPool.reset() + downloadPool.reset() + req2, resp2 := base.add(&http.Request{ + Body: uploadBody, + }, &http.Response{}) + resp, err = rt.RoundTrip(req2) + if resp != resp2 || err != nil { + t.Errorf("invalid response or error: %v", err) + } + if len(downloadPool.readers) != 0 || len(uploadPool.readers) != 1 { + t.Errorf("invalid pool contents: %v %v", downloadPool.readers, uploadPool.readers) + } + + // Download request + uploadPool.reset() + downloadPool.reset() + req3, resp3 := base.add(&http.Request{}, &http.Response{Body: downloadBody}) + resp, err = rt.RoundTrip(req3) + if resp != resp3 || err != nil { + t.Errorf("invalid response or error: %v", err) + } + if len(downloadPool.readers) != 1 || len(uploadPool.readers) != 0 { + t.Errorf("invalid pool contents: %v %v", downloadPool.readers, uploadPool.readers) + } + + // Upload/Download request + uploadPool.reset() + downloadPool.reset() + req4, resp4 := base.add(&http.Request{Body: uploadBody}, &http.Response{Body: downloadBody}) + resp, err = rt.RoundTrip(req4) + if resp != resp4 || err != nil { + t.Errorf("invalid response or error: %v", err) + } + if len(downloadPool.readers) != 1 || len(uploadPool.readers) != 1 { + t.Errorf("invalid pool contents: %v %v", downloadPool.readers, uploadPool.readers) + } +} diff --git a/local_config.go b/local_config.go new file mode 100644 index 000000000..ba11a6096 --- /dev/null +++ b/local_config.go @@ -0,0 
+1,56 @@
+package repo
+
+import (
+ "encoding/json"
+ "io"
+ "os"
+
+ "github.com/kopia/repo/block"
+ "github.com/kopia/repo/object"
+ "github.com/kopia/repo/storage"
+)
+
+// LocalConfig is a configuration of Kopia stored in a configuration file.
+type LocalConfig struct {
+ Storage storage.ConnectionInfo `json:"storage"`
+ Caching block.CachingOptions `json:"caching"`
+}
+
+// repositoryObjectFormat describes the format of objects in a repository.
+type repositoryObjectFormat struct {
+ block.FormattingOptions
+ object.Format
+}
+
+// Load reads local configuration from the specified reader.
+func (lc *LocalConfig) Load(r io.Reader) error {
+ *lc = LocalConfig{}
+ return json.NewDecoder(r).Decode(lc)
+}
+
+// Save writes the configuration to the specified writer.
+func (lc *LocalConfig) Save(w io.Writer) error {
+ b, err := json.MarshalIndent(lc, "", " ")
+ if err != nil {
+ return err
+ }
+ _, err = w.Write(b)
+ return err
+}
+
+// loadConfigFromFile reads the local configuration from the specified file.
+func loadConfigFromFile(fileName string) (*LocalConfig, error) {
+ f, err := os.Open(fileName)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close() //nolint:errcheck
+
+ var lc LocalConfig
+
+ if err := lc.Load(f); err != nil {
+ return nil, err
+ }
+
+ return &lc, nil
+}
diff --git a/manifest/manifest_entry.go b/manifest/manifest_entry.go new file mode 100644 index 000000000..cc2ead40f --- /dev/null +++ b/manifest/manifest_entry.go @@ -0,0 +1,12 @@
+package manifest
+
+import "time"
+
+// EntryMetadata contains metadata about a manifest item. Each manifest item has one or more labels,
+// including the required "type" label.
+type EntryMetadata struct {
+ ID string
+ Length int
+ Labels map[string]string
+ ModTime time.Time
+}
diff --git a/manifest/manifest_manager.go b/manifest/manifest_manager.go new file mode 100644 index 000000000..24454c3e1 --- /dev/null +++ b/manifest/manifest_manager.go @@ -0,0 +1,516 @@
+// Package manifest implements support for managing JSON-based manifests in the repository.
+package manifest
+
+import (
+ "bytes"
+ "compress/gzip"
+ "context"
+ "crypto/rand"
+ "encoding/hex"
+ "encoding/json"
+ "fmt"
+ "sort"
+ "sync"
+ "time"
+
+ "github.com/kopia/repo/internal/repologging"
+ "github.com/kopia/repo/storage"
+ "github.com/pkg/errors"
+)
+
+var log = repologging.Logger("kopia/manifest")
+
+// ErrNotFound is returned when the metadata item is not found.
+var ErrNotFound = errors.New("not found")
+
+const manifestBlockPrefix = "m"
+const autoCompactionBlockCount = 16
+
+type blockManager interface {
+ GetBlock(ctx context.Context, blockID string) ([]byte, error)
+ WriteBlock(ctx context.Context, data []byte, prefix string) (string, error)
+ DeleteBlock(blockID string) error
+ ListBlocks(prefix string) ([]string, error)
+ DisableIndexFlush()
+ EnableIndexFlush()
+ Flush(ctx context.Context) error
+}
+
+// Manager organizes JSON manifests of various kinds, including snapshot manifests.
+type Manager struct {
+ mu sync.Mutex
+ b blockManager
+
+ initialized bool
+ pendingEntries map[string]*manifestEntry
+
+ committedEntries map[string]*manifestEntry
+ committedBlockIDs map[string]bool
+}
+
+// Put serializes the provided payload to JSON and persists it. Returns a unique handle that represents the object.
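+//
+// Example (labels and payload below are illustrative only):
+//
+//   id, err := mgr.Put(ctx, map[string]string{"type": "snapshot", "hostname": "laptop"}, &mySnapshot)
+//
+// The "type" label is mandatory; Put fails when it is missing.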
+func (m *Manager) Put(ctx context.Context, labels map[string]string, payload interface{}) (string, error) { + if labels["type"] == "" { + return "", fmt.Errorf("'type' label is required") + } + + if err := m.ensureInitialized(ctx); err != nil { + return "", err + } + m.mu.Lock() + defer m.mu.Unlock() + + random := make([]byte, 16) + if _, err := rand.Read(random); err != nil { + return "", errors.Wrap(err, "can't initialize randomness") + } + + b, err := json.Marshal(payload) + if err != nil { + return "", errors.Wrap(err, "marshal error") + } + + e := &manifestEntry{ + ID: hex.EncodeToString(random), + ModTime: time.Now().UTC(), + Labels: copyLabels(labels), + Content: b, + } + + m.pendingEntries[e.ID] = e + + return e.ID, nil +} + +// GetMetadata returns metadata about provided manifest item or ErrNotFound if the item can't be found. +func (m *Manager) GetMetadata(ctx context.Context, id string) (*EntryMetadata, error) { + if err := m.ensureInitialized(ctx); err != nil { + return nil, err + } + + m.mu.Lock() + defer m.mu.Unlock() + + e := m.pendingEntries[id] + if e == nil { + e = m.committedEntries[id] + } + + if e == nil || e.Deleted { + return nil, ErrNotFound + } + + return &EntryMetadata{ + ID: id, + ModTime: e.ModTime, + Length: len(e.Content), + Labels: copyLabels(e.Labels), + }, nil +} + +// Get retrieves the contents of the provided manifest item by deserializing it as JSON to provided object. +// If the manifest is not found, returns ErrNotFound. +func (m *Manager) Get(ctx context.Context, id string, data interface{}) error { + if err := m.ensureInitialized(ctx); err != nil { + return err + } + + b, err := m.GetRaw(ctx, id) + if err != nil { + return err + } + + if err := json.Unmarshal(b, data); err != nil { + return fmt.Errorf("unable to unmashal %q: %v", id, err) + } + + return nil +} + +// GetRaw returns raw contents of the provided manifest (JSON bytes) or ErrNotFound if not found. +func (m *Manager) GetRaw(ctx context.Context, id string) ([]byte, error) { + if err := m.ensureInitialized(ctx); err != nil { + return nil, err + } + + m.mu.Lock() + defer m.mu.Unlock() + + e := m.pendingEntries[id] + if e == nil { + e = m.committedEntries[id] + } + if e == nil || e.Deleted { + return nil, ErrNotFound + } + + return e.Content, nil +} + +// Find returns the list of EntryMetadata for manifest entries matching all provided labels. +func (m *Manager) Find(ctx context.Context, labels map[string]string) ([]*EntryMetadata, error) { + if err := m.ensureInitialized(ctx); err != nil { + return nil, err + } + + m.mu.Lock() + defer m.mu.Unlock() + + var matches []*EntryMetadata + for _, e := range m.pendingEntries { + if matchesLabels(e.Labels, labels) { + matches = append(matches, cloneEntryMetadata(e)) + } + } + for _, e := range m.committedEntries { + if m.pendingEntries[e.ID] != nil { + // ignore committed that are also in pending + continue + } + + if matchesLabels(e.Labels, labels) { + matches = append(matches, cloneEntryMetadata(e)) + } + } + + sort.Slice(matches, func(i, j int) bool { + return matches[i].ModTime.Before(matches[j].ModTime) + }) + return matches, nil +} + +func cloneEntryMetadata(e *manifestEntry) *EntryMetadata { + return &EntryMetadata{ + ID: e.ID, + Labels: copyLabels(e.Labels), + Length: len(e.Content), + ModTime: e.ModTime, + } +} + +// matchesLabels returns true when all entries in 'b' are found in the 'a'. 
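+// For example, matchesLabels(map[string]string{"type": "item", "color": "red"},
+// map[string]string{"color": "red"}) is true, while reversing the arguments yields
+// false, because every key/value pair of 'b' must also be present in 'a'.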
+func matchesLabels(a, b map[string]string) bool { + for k, v := range b { + if a[k] != v { + return false + } + } + + return true +} + +// Flush persists changes to manifest manager. +func (m *Manager) Flush(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + _, err := m.flushPendingEntriesLocked(ctx) + return err +} + +func (m *Manager) flushPendingEntriesLocked(ctx context.Context) (string, error) { + if len(m.pendingEntries) == 0 { + return "", nil + } + + man := manifest{} + + for _, e := range m.pendingEntries { + man.Entries = append(man.Entries, e) + } + + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + mustSucceed(json.NewEncoder(gz).Encode(man)) + mustSucceed(gz.Flush()) + mustSucceed(gz.Close()) + + blockID, err := m.b.WriteBlock(ctx, buf.Bytes(), manifestBlockPrefix) + if err != nil { + return "", err + } + + for _, e := range m.pendingEntries { + m.committedEntries[e.ID] = e + delete(m.pendingEntries, e.ID) + } + + m.committedBlockIDs[blockID] = true + + return blockID, nil +} + +func mustSucceed(e error) { + if e != nil { + panic("unexpected failure: " + e.Error()) + } +} + +// Delete marks the specified manifest ID for deletion. +func (m *Manager) Delete(ctx context.Context, id string) error { + if err := m.ensureInitialized(ctx); err != nil { + return err + } + + if m.pendingEntries[id] == nil && m.committedEntries[id] == nil { + return nil + } + + m.pendingEntries[id] = &manifestEntry{ + ID: id, + ModTime: time.Now().UTC(), + Deleted: true, + } + return nil +} + +// Refresh updates the committed blocks from the underlying storage. +func (m *Manager) Refresh(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + return m.loadCommittedBlocksLocked(ctx) +} + +func (m *Manager) loadCommittedBlocksLocked(ctx context.Context) error { + log.Debugf("listing manifest blocks") + for { + blocks, err := m.b.ListBlocks(manifestBlockPrefix) + if err != nil { + return errors.Wrap(err, "unable to list manifest blocks") + } + + m.committedEntries = map[string]*manifestEntry{} + m.committedBlockIDs = map[string]bool{} + + log.Debugf("found %v manifest blocks", len(blocks)) + err = m.loadManifestBlocks(ctx, blocks) + if err == nil { + // success + break + } + if err == storage.ErrBlockNotFound { + // try again, lost a race with another manifest manager which just did compaction + continue + } + return errors.Wrap(err, "unable to load manifest blocks") + } + + if err := m.maybeCompactLocked(ctx); err != nil { + return fmt.Errorf("error auto-compacting blocks") + } + + return nil +} + +func (m *Manager) loadManifestBlocks(ctx context.Context, blockIDs []string) error { + t0 := time.Now() + + for _, b := range blockIDs { + m.committedBlockIDs[b] = true + } + + manifests, err := m.loadBlocksInParallel(ctx, blockIDs) + if err != nil { + return err + } + + for _, man := range manifests { + for _, e := range man.Entries { + m.mergeEntry(e) + } + } + + // after merging, remove blocks marked as deleted. 
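+ // (an entry marked Deleted supersedes an older live entry here, because mergeEntry
+ // keeps whichever entry has the newer ModTime)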
+ for k, e := range m.committedEntries { + if e.Deleted { + delete(m.committedEntries, k) + } + } + + log.Debugf("finished loading manifest blocks in %v.", time.Since(t0)) + + return nil +} + +func (m *Manager) loadBlocksInParallel(ctx context.Context, blockIDs []string) ([]manifest, error) { + errors := make(chan error, len(blockIDs)) + manifests := make(chan manifest, len(blockIDs)) + ch := make(chan string, len(blockIDs)) + var wg sync.WaitGroup + + for i := 0; i < 8; i++ { + wg.Add(1) + go func(workerID int) { + defer wg.Done() + + for blk := range ch { + t1 := time.Now() + man, err := m.loadManifestBlock(ctx, blk) + + if err != nil { + errors <- err + log.Debugf("block %v failed to be loaded by worker %v in %v: %v.", blk, workerID, time.Since(t1), err) + } else { + log.Debugf("block %v loaded by worker %v in %v.", blk, workerID, time.Since(t1)) + manifests <- man + } + } + }(i) + } + + // feed block IDs for goroutines + for _, b := range blockIDs { + ch <- b + } + close(ch) + + // wait for workers to complete + wg.Wait() + close(errors) + close(manifests) + + // if there was any error, forward it + if err := <-errors; err != nil { + return nil, err + } + + var man []manifest + for m := range manifests { + man = append(man, m) + } + + return man, nil +} + +func (m *Manager) loadManifestBlock(ctx context.Context, blockID string) (manifest, error) { + man := manifest{} + blk, err := m.b.GetBlock(ctx, blockID) + if err != nil { + // do not wrap the error here, we want to propagate original ErrBlockNotFound + // which causes a retry if we lose list/delete race. + return man, err + } + + gz, err := gzip.NewReader(bytes.NewReader(blk)) + if err != nil { + return man, fmt.Errorf("unable to unpack block %q: %v", blockID, err) + } + + if err := json.NewDecoder(gz).Decode(&man); err != nil { + return man, fmt.Errorf("unable to parse block %q: %v", blockID, err) + } + + return man, nil +} + +// Compact performs compaction of manifest blocks. +func (m *Manager) Compact(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + return m.compactLocked(ctx) +} + +func (m *Manager) maybeCompactLocked(ctx context.Context) error { + if len(m.committedBlockIDs) < autoCompactionBlockCount { + return nil + } + + log.Debugf("performing automatic compaction of %v blocks", len(m.committedBlockIDs)) + if err := m.compactLocked(ctx); err != nil { + return errors.Wrap(err, "unable to compact manifest blocks") + } + + if err := m.b.Flush(ctx); err != nil { + return errors.Wrap(err, "unable to flush blocks after auto-compaction") + } + + return nil +} + +func (m *Manager) compactLocked(ctx context.Context) error { + log.Debugf("compactLocked: pendingEntries=%v blockIDs=%v", len(m.pendingEntries), len(m.committedBlockIDs)) + + if len(m.committedBlockIDs) == 1 && len(m.pendingEntries) == 0 { + return nil + } + + // compaction needs to be atomic (deletes and rewrite should show up in one index block or not show up at all) + // that's why we want to prevent index flushes while we're d. + m.b.DisableIndexFlush() + defer m.b.EnableIndexFlush() + + for _, e := range m.committedEntries { + m.pendingEntries[e.ID] = e + } + + blockID, err := m.flushPendingEntriesLocked(ctx) + if err != nil { + return err + } + + // add the newly-created block to the list, could be duplicate + for b := range m.committedBlockIDs { + if b == blockID { + // do not delete block that was just written. 
+ continue + } + + if err := m.b.DeleteBlock(b); err != nil { + return fmt.Errorf("unable to delete block %q: %v", b, err) + } + + delete(m.committedBlockIDs, b) + } + + return nil +} + +func (m *Manager) mergeEntry(e *manifestEntry) { + prev := m.committedEntries[e.ID] + if prev == nil { + m.committedEntries[e.ID] = e + return + } + + if e.ModTime.After(prev.ModTime) { + m.committedEntries[e.ID] = e + } +} + +func (m *Manager) ensureInitialized(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + if m.initialized { + return nil + } + + if err := m.loadCommittedBlocksLocked(ctx); err != nil { + return err + } + + m.initialized = true + return nil +} + +func copyLabels(m map[string]string) map[string]string { + r := map[string]string{} + for k, v := range m { + r[k] = v + } + return r +} + +// NewManager returns new manifest manager for the provided block manager. +func NewManager(ctx context.Context, b blockManager) (*Manager, error) { + m := &Manager{ + b: b, + pendingEntries: map[string]*manifestEntry{}, + committedEntries: map[string]*manifestEntry{}, + committedBlockIDs: map[string]bool{}, + } + + return m, nil +} diff --git a/manifest/manifest_manager_test.go b/manifest/manifest_manager_test.go new file mode 100644 index 000000000..d2a74ac18 --- /dev/null +++ b/manifest/manifest_manager_test.go @@ -0,0 +1,321 @@ +package manifest + +import ( + "context" + "reflect" + "sort" + "strings" + "testing" + "time" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/internal/storagetesting" + "github.com/pkg/errors" +) + +func TestManifest(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + mgr, setupErr := newManagerForTesting(ctx, t, data) + if setupErr != nil { + t.Fatalf("unable to open block manager: %v", setupErr) + } + + item1 := map[string]int{"foo": 1, "bar": 2} + item2 := map[string]int{"foo": 2, "bar": 3} + item3 := map[string]int{"foo": 3, "bar": 4} + + labels1 := map[string]string{"type": "item", "color": "red"} + labels2 := map[string]string{"type": "item", "color": "blue", "shape": "square"} + labels3 := map[string]string{"type": "item", "shape": "square", "color": "red"} + + id1 := addAndVerify(ctx, t, mgr, labels1, item1) + id2 := addAndVerify(ctx, t, mgr, labels2, item2) + id3 := addAndVerify(ctx, t, mgr, labels3, item3) + + cases := []struct { + criteria map[string]string + expected []string + }{ + {map[string]string{"color": "red"}, []string{id1, id3}}, + {map[string]string{"color": "blue"}, []string{id2}}, + {map[string]string{"color": "green"}, nil}, + {map[string]string{"color": "red", "shape": "square"}, []string{id3}}, + {map[string]string{"color": "blue", "shape": "square"}, []string{id2}}, + {map[string]string{"color": "red", "shape": "circle"}, nil}, + } + + // verify before flush + for _, tc := range cases { + verifyMatches(ctx, t, mgr, tc.criteria, tc.expected) + } + verifyItem(ctx, t, mgr, id1, labels1, item1) + verifyItem(ctx, t, mgr, id2, labels2, item2) + verifyItem(ctx, t, mgr, id3, labels3, item3) + + if err := mgr.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + if err := mgr.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + + // verify after flush + for _, tc := range cases { + verifyMatches(ctx, t, mgr, tc.criteria, tc.expected) + } + verifyItem(ctx, t, mgr, id1, labels1, item1) + verifyItem(ctx, t, mgr, id2, labels2, item2) + verifyItem(ctx, t, mgr, id3, labels3, item3) + + // flush underlying block manager and verify in new manifest manager. 
+ mgr.b.Flush(ctx) + mgr2, setupErr := newManagerForTesting(ctx, t, data) + if setupErr != nil { + t.Fatalf("can't open block manager: %v", setupErr) + } + for _, tc := range cases { + verifyMatches(ctx, t, mgr2, tc.criteria, tc.expected) + } + verifyItem(ctx, t, mgr2, id1, labels1, item1) + verifyItem(ctx, t, mgr2, id2, labels2, item2) + verifyItem(ctx, t, mgr2, id3, labels3, item3) + if err := mgr2.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + } + + // delete from one + time.Sleep(1 * time.Second) + if err := mgr.Delete(ctx, id3); err != nil { + t.Errorf("delete error: %v", err) + } + verifyItemNotFound(ctx, t, mgr, id3) + mgr.Flush(ctx) + verifyItemNotFound(ctx, t, mgr, id3) + + // still found in another + verifyItem(ctx, t, mgr2, id3, labels3, item3) + if err := mgr2.loadCommittedBlocksLocked(ctx); err != nil { + t.Errorf("unable to load: %v", err) + } + + if err := mgr.Compact(ctx); err != nil { + t.Errorf("can't compact: %v", err) + } + + blks, err := mgr.b.ListBlocks(manifestBlockPrefix) + if err != nil { + t.Errorf("unable to list manifest blocks: %v", err) + } + if got, want := len(blks), 1; got != want { + t.Errorf("unexpected number of blocks: %v, want %v", got, want) + } + + mgr.b.Flush(ctx) + + mgr3, err := newManagerForTesting(ctx, t, data) + if err != nil { + t.Fatalf("can't open manager: %v", err) + } + + verifyItem(ctx, t, mgr3, id1, labels1, item1) + verifyItem(ctx, t, mgr3, id2, labels2, item2) + verifyItemNotFound(ctx, t, mgr3, id3) +} + +func TestManifestInitCorruptedBlock(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + st := storagetesting.NewMapStorage(data, nil, nil) + + f := block.FormattingOptions{ + Hash: "HMAC-SHA256-128", + Encryption: "NONE", + MaxPackSize: 100000, + } + + // write some data to storage + bm, err := block.NewManager(ctx, st, f, block.CachingOptions{}, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + mgr, err := NewManager(ctx, bm) + if err != nil { + t.Fatalf("err: %v", err) + } + + mgr.Put(ctx, map[string]string{"type": "foo"}, map[string]string{"some": "value"}) //nolint:errcheck + mgr.Flush(ctx) + bm.Flush(ctx) + + // corrupt data at the storage level. + for k, v := range data { + if strings.HasPrefix(k, "p") { + for i := 0; i < len(v); i++ { + v[i] ^= 1 + } + } + } + + // make a new block manager based on corrupted data. 
+ bm, err = block.NewManager(ctx, st, f, block.CachingOptions{}, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + mgr, err = NewManager(ctx, bm) + if err != nil { + t.Fatalf("err: %v", err) + } + + cases := []struct { + desc string + f func() error + }{ + {"GetRaw", func() error { _, err := mgr.GetRaw(ctx, "anything"); return err }}, + {"GetMetadata", func() error { _, err := mgr.GetMetadata(ctx, "anything"); return err }}, + {"Get", func() error { return mgr.Get(ctx, "anything", nil) }}, + {"Delete", func() error { return mgr.Delete(ctx, "anything") }}, + {"Find", func() error { _, err := mgr.Find(ctx, nil); return err }}, + {"Put", func() error { + _, err := mgr.Put(ctx, map[string]string{ + "type": "foo", + }, map[string]string{ + "some": "value", + }) + return err + }}, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + err := tc.f() + if err == nil || !strings.Contains(err.Error(), "invalid checksum") { + t.Errorf("invalid error when initializing malformed manifest manager: %v", err) + } + }) + } +} + +func addAndVerify(ctx context.Context, t *testing.T, mgr *Manager, labels map[string]string, data map[string]int) string { + t.Helper() + id, err := mgr.Put(ctx, labels, data) + if err != nil { + t.Errorf("unable to add %v (%v): %v", labels, data, err) + return "" + } + + verifyItem(ctx, t, mgr, id, labels, data) + return id +} + +func verifyItem(ctx context.Context, t *testing.T, mgr *Manager, id string, labels map[string]string, data map[string]int) { + t.Helper() + + l, err := mgr.GetMetadata(ctx, id) + if err != nil { + t.Errorf("unable to retrieve %q: %v", id, err) + return + } + + if !reflect.DeepEqual(l.Labels, labels) { + t.Errorf("invalid labels retrieved %v, wanted %v", l.Labels, labels) + } + + var d2 map[string]int + if err := mgr.Get(ctx, id, &d2); err != nil { + t.Errorf("Get failed: %v", err) + } + + if !reflect.DeepEqual(d2, data) { + t.Errorf("invalid data retrieved %v, wanted %v", d2, data) + } +} + +func verifyItemNotFound(ctx context.Context, t *testing.T, mgr *Manager, id string) { + t.Helper() + + _, err := mgr.GetMetadata(ctx, id) + if got, want := err, ErrNotFound; got != want { + t.Errorf("invalid error when getting %q %v, expected %v", id, err, ErrNotFound) + return + } +} + +func verifyMatches(ctx context.Context, t *testing.T, mgr *Manager, labels map[string]string, expected []string) { + t.Helper() + + var matches []string + items, err := mgr.Find(ctx, labels) + if err != nil { + t.Errorf("error in Find(): %v", err) + return + } + for _, m := range items { + matches = append(matches, m.ID) + } + sort.Strings(matches) + sort.Strings(expected) + + if !reflect.DeepEqual(matches, expected) { + t.Errorf("invalid matches for %v: %v, expected %v", labels, matches, expected) + } +} + +func newManagerForTesting(ctx context.Context, t *testing.T, data map[string][]byte) (*Manager, error) { + st := storagetesting.NewMapStorage(data, nil, nil) + + bm, err := block.NewManager(ctx, st, block.FormattingOptions{ + Hash: "HMAC-SHA256-128", + Encryption: "NONE", + MaxPackSize: 100000, + }, block.CachingOptions{}, nil) + if err != nil { + return nil, errors.Wrap(err, "can't create block manager") + } + + return NewManager(ctx, bm) +} + +func TestManifestInvalidPut(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + mgr, setupErr := newManagerForTesting(ctx, t, data) + if setupErr != nil { + t.Fatalf("unable to open block manager: %v", setupErr) + } + + cases := []struct { + labels map[string]string + payload 
interface{} + expectedError string + }{ + {map[string]string{"": ""}, "xxx", "'type' label is required"}, + {map[string]string{"type": "blah"}, complex128(1), "marshal error"}, + } + + for i, tc := range cases { + _, err := mgr.Put(ctx, tc.labels, tc.payload) + if err == nil || !strings.Contains(err.Error(), tc.expectedError) { + t.Errorf("invalid error when putting case %v: %v, expected %v", i, err, tc.expectedError) + } + } +} + +func TestManifestAutoCompaction(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + + for i := 0; i < 100; i++ { + mgr, setupErr := newManagerForTesting(ctx, t, data) + if setupErr != nil { + t.Fatalf("unable to open block manager: %v", setupErr) + } + + item1 := map[string]int{"foo": 1, "bar": 2} + labels1 := map[string]string{"type": "item", "color": "red"} + addAndVerify(ctx, t, mgr, labels1, item1) + mgr.Flush(ctx) + } +} diff --git a/manifest/serialized.go b/manifest/serialized.go new file mode 100644 index 000000000..34be024c9 --- /dev/null +++ b/manifest/serialized.go @@ -0,0 +1,18 @@ +package manifest + +import ( + "encoding/json" + "time" +) + +type manifest struct { + Entries []*manifestEntry `json:"entries"` +} + +type manifestEntry struct { + ID string `json:"id"` + Labels map[string]string `json:"labels"` + ModTime time.Time `json:"modified"` + Deleted bool `json:"deleted,omitempty"` + Content json.RawMessage `json:"data"` +} diff --git a/object/indirect.go b/object/indirect.go new file mode 100644 index 000000000..7fdadfd4e --- /dev/null +++ b/object/indirect.go @@ -0,0 +1,8 @@ +package object + +// indirectObjectEntry represents an entry in indirect object stream. +type indirectObjectEntry struct { + Start int64 `json:"s,omitempty"` + Length int64 `json:"l,omitempty"` + Object ID `json:"o,omitempty"` +} diff --git a/object/object_manager.go b/object/object_manager.go new file mode 100644 index 000000000..e4621ce40 --- /dev/null +++ b/object/object_manager.go @@ -0,0 +1,245 @@ +// Package object implements repository support for content-addressable objects of arbitrary size. +package object + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + + "github.com/kopia/repo/block" + "github.com/pkg/errors" +) + +// Reader allows reading, seeking, getting the length of and closing of a repository object. +type Reader interface { + io.Reader + io.Seeker + io.Closer + Length() int64 +} + +type blockManager interface { + BlockInfo(ctx context.Context, blockID string) (block.Info, error) + GetBlock(ctx context.Context, blockID string) ([]byte, error) + WriteBlock(ctx context.Context, data []byte, prefix string) (string, error) +} + +// Format describes the format of objects in a repository. +type Format struct { + Splitter string `json:"splitter,omitempty"` // splitter used to break objects into storage blocks + MinBlockSize int `json:"minBlockSize,omitempty"` // minimum block size used with dynamic splitter + AvgBlockSize int `json:"avgBlockSize,omitempty"` // approximate size of storage block (used with dynamic splitter) + MaxBlockSize int `json:"maxBlockSize,omitempty"` // maximum size of storage block +} + +// Manager implements a content-addressable storage on top of blob storage. +type Manager struct { + Format Format + + blockMgr blockManager + trace func(message string, args ...interface{}) + + newSplitter func() objectSplitter +} + +// NewWriter creates an ObjectWriter for writing to the repository. 
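+//
+// A typical write path looks like this sketch (error handling elided, names
+// illustrative only):
+//
+//   w := om.NewWriter(ctx, WriterOptions{Description: "user data"})
+//   w.Write(payload)
+//   oid, err := w.Result()
+//   w.Close()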
+func (om *Manager) NewWriter(ctx context.Context, opt WriterOptions) Writer { + return &objectWriter{ + ctx: ctx, + repo: om, + splitter: om.newSplitter(), + description: opt.Description, + prefix: opt.Prefix, + } +} + +// Open creates new ObjectReader for reading given object from a repository. +func (om *Manager) Open(ctx context.Context, objectID ID) (Reader, error) { + // log.Printf("Repository::Open %v", objectID.String()) + // defer log.Printf("finished Repository::Open() %v", objectID.String()) + + if indexObjectID, ok := objectID.IndexObjectID(); ok { + rd, err := om.Open(ctx, indexObjectID) + if err != nil { + return nil, err + } + defer rd.Close() //nolint:errcheck + + seekTable, err := om.flattenListChunk(rd) + if err != nil { + return nil, err + } + + totalLength := seekTable[len(seekTable)-1].endOffset() + + return &objectReader{ + ctx: ctx, + repo: om, + seekTable: seekTable, + totalLength: totalLength, + }, nil + } + + return om.newRawReader(ctx, objectID) +} + +// VerifyObject ensures that all objects backing ObjectID are present in the repository +// and returns the total length of the object and storage blocks of which it is composed. +func (om *Manager) VerifyObject(ctx context.Context, oid ID) (int64, []string, error) { + blocks := &blockTracker{} + l, err := om.verifyObjectInternal(ctx, oid, blocks) + if err != nil { + return 0, nil, err + } + + return l, blocks.blockIDs(), nil +} + +func (om *Manager) verifyIndirectObjectInternal(ctx context.Context, indexObjectID ID, blocks *blockTracker) (int64, error) { + if _, err := om.verifyObjectInternal(ctx, indexObjectID, blocks); err != nil { + return 0, errors.Wrap(err, "unable to read index") + } + rd, err := om.Open(ctx, indexObjectID) + if err != nil { + return 0, err + } + defer rd.Close() //nolint:errcheck + + seekTable, err := om.flattenListChunk(rd) + if err != nil { + return 0, err + } + + for i, m := range seekTable { + l, err := om.verifyObjectInternal(ctx, m.Object, blocks) + if err != nil { + return 0, err + } + + if l != m.Length { + return 0, fmt.Errorf("unexpected length of part %#v of indirect object %q: %v %v, expected %v", i, indexObjectID, m.Object, l, m.Length) + } + } + + totalLength := seekTable[len(seekTable)-1].endOffset() + return totalLength, nil +} + +func (om *Manager) verifyObjectInternal(ctx context.Context, oid ID, blocks *blockTracker) (int64, error) { + if indexObjectID, ok := oid.IndexObjectID(); ok { + return om.verifyIndirectObjectInternal(ctx, indexObjectID, blocks) + } + + if blockID, ok := oid.BlockID(); ok { + p, err := om.blockMgr.BlockInfo(ctx, blockID) + if err != nil { + return 0, err + } + blocks.addBlock(blockID) + return int64(p.Length), nil + } + + return 0, fmt.Errorf("unrecognized object type: %v", oid) + +} + +func nullTrace(message string, args ...interface{}) { +} + +// ManagerOptions specifies object manager options. +type ManagerOptions struct { + Trace func(message string, args ...interface{}) +} + +// NewObjectManager creates an ObjectManager with the specified block manager and format. 
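+//
+// For illustration, assuming bm is any implementation of the blockManager
+// interface above, a manager using the FIXED splitter could be created with:
+//
+//   om, err := NewObjectManager(ctx, bm, Format{Splitter: "FIXED", MaxBlockSize: 1 << 20}, ManagerOptions{})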
+func NewObjectManager(ctx context.Context, bm blockManager, f Format, opts ManagerOptions) (*Manager, error) { + om := &Manager{ + blockMgr: bm, + Format: f, + trace: nullTrace, + } + + splitterID := f.Splitter + if splitterID == "" { + splitterID = "FIXED" + } + + os := splitterFactories[splitterID] + if os == nil { + return nil, fmt.Errorf("unsupported splitter %q", f.Splitter) + } + + om.newSplitter = func() objectSplitter { + return os(&f) + } + + if opts.Trace != nil { + om.trace = opts.Trace + } else { + om.trace = nullTrace + } + + return om, nil +} + +/* + +{"stream":"kopia:indirect","entries":[ +{"l":1698099,"o":"D13ea27f9ad891ad4a2edfa983906863d"}, +{"s":1698099,"l":1302081,"o":"De8ca8327cd3af5f4edbd5ed1009c525e"}, +{"s":3000180,"l":4352499,"o":"D6b6eb48ca5361d06d72fe193813e42e1"}, +{"s":7352679,"l":1170821,"o":"Dd14653f76b63802ed48be64a0e67fea9"}, + +{"s":91094118,"l":1645153,"o":"Daa55df764d881a1daadb5ea9de17abbb"} +]} +*/ + +type indirectObject struct { + StreamID string `json:"stream"` + Entries []indirectObjectEntry `json:"entries"` +} + +func (om *Manager) flattenListChunk(rawReader io.Reader) ([]indirectObjectEntry, error) { + var ind indirectObject + + if err := json.NewDecoder(rawReader).Decode(&ind); err != nil { + return nil, errors.Wrap(err, "invalid indirect object") + } + + return ind.Entries, nil +} + +func (om *Manager) newRawReader(ctx context.Context, objectID ID) (Reader, error) { + if blockID, ok := objectID.BlockID(); ok { + payload, err := om.blockMgr.GetBlock(ctx, blockID) + if err != nil { + return nil, err + } + + return newObjectReaderWithData(payload), nil + } + + return nil, fmt.Errorf("unsupported object ID: %v", objectID) +} + +type readerWithData struct { + io.ReadSeeker + length int64 +} + +func (rwd *readerWithData) Close() error { + return nil +} + +func (rwd *readerWithData) Length() int64 { + return rwd.length +} + +func newObjectReaderWithData(data []byte) Reader { + return &readerWithData{ + ReadSeeker: bytes.NewReader(data), + length: int64(len(data)), + } +} diff --git a/object/object_manager_test.go b/object/object_manager_test.go new file mode 100644 index 000000000..4fb177811 --- /dev/null +++ b/object/object_manager_test.go @@ -0,0 +1,344 @@ +package object + +import ( + "bytes" + "context" + cryptorand "crypto/rand" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "reflect" + "runtime/debug" + "sync" + "testing" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/storage" +) + +type fakeBlockManager struct { + mu sync.Mutex + data map[string][]byte +} + +func (f *fakeBlockManager) GetBlock(ctx context.Context, blockID string) ([]byte, error) { + f.mu.Lock() + defer f.mu.Unlock() + + if d, ok := f.data[blockID]; ok { + return append([]byte(nil), d...), nil + } + + return nil, storage.ErrBlockNotFound +} + +func (f *fakeBlockManager) WriteBlock(ctx context.Context, data []byte, prefix string) (string, error) { + h := sha256.New() + h.Write(data) //nolint:errcheck + blockID := prefix + string(hex.EncodeToString(h.Sum(nil))) + + f.mu.Lock() + defer f.mu.Unlock() + + f.data[blockID] = append([]byte(nil), data...) 
+ return blockID, nil +} + +func (f *fakeBlockManager) BlockInfo(ctx context.Context, blockID string) (block.Info, error) { + f.mu.Lock() + defer f.mu.Unlock() + + if d, ok := f.data[blockID]; ok { + return block.Info{BlockID: blockID, Length: uint32(len(d))}, nil + } + + return block.Info{}, storage.ErrBlockNotFound +} + +func (f *fakeBlockManager) Flush(ctx context.Context) error { + return nil +} + +func setupTest(t *testing.T) (map[string][]byte, *Manager) { + return setupTestWithData(t, map[string][]byte{}, ManagerOptions{}) +} + +func setupTestWithData(t *testing.T, data map[string][]byte, opts ManagerOptions) (map[string][]byte, *Manager) { + r, err := NewObjectManager(context.Background(), &fakeBlockManager{data: data}, Format{ + MaxBlockSize: 400, + Splitter: "FIXED", + }, opts) + if err != nil { + t.Fatalf("can't create object manager: %v", err) + } + + return data, r +} + +func TestWriters(t *testing.T) { + ctx := context.Background() + cases := []struct { + data []byte + objectID ID + }{ + { + []byte("the quick brown fox jumps over the lazy dog"), + "05c6e08f1d9fdafa03147fcb8f82f124c76d2f70e3d989dc8aadb5e7d7450bec", + }, + {make([]byte, 100), "cd00e292c5970d3c5e2f0ffa5171e555bc46bfc4faddfb4a418b6840b86e79a3"}, // 100 zero bytes + } + + for _, c := range cases { + data, om := setupTest(t) + + writer := om.NewWriter(ctx, WriterOptions{}) + + if _, err := writer.Write(c.data); err != nil { + t.Errorf("write error: %v", err) + } + + result, err := writer.Result() + if err != nil { + t.Errorf("error getting writer results for %v, expected: %v", c.data, c.objectID.String()) + continue + } + + if !objectIDsEqual(result, c.objectID) { + t.Errorf("incorrect result for %v, expected: %v got: %v", c.data, c.objectID.String(), result.String()) + } + + if _, ok := c.objectID.BlockID(); !ok { + if len(data) != 0 { + t.Errorf("unexpected data written to the storage: %v", data) + } + } else { + if len(data) != 1 { + // 1 data block + t.Errorf("unexpected data written to the storage: %v", data) + } + } + } +} + +func objectIDsEqual(o1 ID, o2 ID) bool { + return reflect.DeepEqual(o1, o2) +} + +func TestWriterCompleteChunkInTwoWrites(t *testing.T) { + ctx := context.Background() + _, om := setupTest(t) + + bytes := make([]byte, 100) + writer := om.NewWriter(ctx, WriterOptions{}) + writer.Write(bytes[0:50]) //nolint:errcheck + writer.Write(bytes[0:50]) //nolint:errcheck + result, err := writer.Result() + if !objectIDsEqual(result, "cd00e292c5970d3c5e2f0ffa5171e555bc46bfc4faddfb4a418b6840b86e79a3") { + t.Errorf("unexpected result: %v err: %v", result, err) + } +} + +func verifyIndirectBlock(ctx context.Context, t *testing.T, r *Manager, oid ID) { + for indexBlockID, isIndirect := oid.IndexObjectID(); isIndirect; indexBlockID, isIndirect = indexBlockID.IndexObjectID() { + rd, err := r.Open(ctx, indexBlockID) + if err != nil { + t.Errorf("unable to open %v: %v", oid.String(), err) + return + } + defer rd.Close() + + var ind indirectObject + if err := json.NewDecoder(rd).Decode(&ind); err != nil { + t.Errorf("cannot parse indirect stream: %v", err) + } + } +} + +func TestIndirection(t *testing.T) { + ctx := context.Background() + cases := []struct { + dataLength int + expectedBlockCount int + expectedIndirection int + }{ + {dataLength: 200, expectedBlockCount: 1, expectedIndirection: 0}, + {dataLength: 1400, expectedBlockCount: 3, expectedIndirection: 1}, + {dataLength: 2000, expectedBlockCount: 4, expectedIndirection: 2}, + {dataLength: 3000, expectedBlockCount: 5, expectedIndirection: 2}, + 
{dataLength: 4000, expectedBlockCount: 5, expectedIndirection: 2}, + {dataLength: 10000, expectedBlockCount: 10, expectedIndirection: 3}, + } + + for _, c := range cases { + data, om := setupTest(t) + + contentBytes := make([]byte, c.dataLength) + + writer := om.NewWriter(ctx, WriterOptions{}) + if _, err := writer.Write(contentBytes); err != nil { + t.Errorf("write error: %v", err) + } + result, err := writer.Result() + if err != nil { + t.Errorf("error getting writer results: %v", err) + } + + if indirectionLevel(result) != c.expectedIndirection { + t.Errorf("incorrect indirection level for size: %v: %v, expected %v", c.dataLength, indirectionLevel(result), c.expectedIndirection) + } + + if got, want := len(data), c.expectedBlockCount; got != want { + t.Errorf("unexpected block count for %v: %v, expected %v", c.dataLength, got, want) + } + + l, b, err := om.VerifyObject(ctx, result) + if err != nil { + t.Errorf("error verifying %q: %v", result, err) + } + + if got, want := int(l), len(contentBytes); got != want { + t.Errorf("got invalid byte count for %q: %v, wanted %v", result, got, want) + } + + if got, want := len(b), c.expectedBlockCount; got != want { + t.Errorf("invalid block count for %v, got %v, wanted %v", result, got, want) + } + + verifyIndirectBlock(ctx, t, om, result) + } +} + +func indirectionLevel(oid ID) int { + indexObjectID, ok := oid.IndexObjectID() + if !ok { + return 0 + } + + return 1 + indirectionLevel(indexObjectID) +} + +func TestHMAC(t *testing.T) { + ctx := context.Background() + content := bytes.Repeat([]byte{0xcd}, 50) + + _, om := setupTest(t) + + w := om.NewWriter(ctx, WriterOptions{}) + w.Write(content) //nolint:errcheck + result, err := w.Result() + if result.String() != "cad29ff89951a3c085c86cb7ed22b82b51f7bdfda24f932c7f9601f51d5975ba" { + t.Errorf("unexpected result: %v err: %v", result.String(), err) + } +} + +func TestReader(t *testing.T) { + ctx := context.Background() + data, om := setupTest(t) + + storedPayload := []byte("foo\nbar") + data["a76999788386641a3ec798554f1fe7e6"] = storedPayload + + cases := []struct { + text string + payload []byte + }{ + {"a76999788386641a3ec798554f1fe7e6", storedPayload}, + } + + for _, c := range cases { + objectID, err := ParseID(c.text) + if err != nil { + t.Errorf("cannot parse object ID: %v", err) + continue + } + + reader, err := om.Open(ctx, objectID) + if err != nil { + t.Errorf("cannot create reader for %v: %v", objectID, err) + continue + } + + d, err := ioutil.ReadAll(reader) + if err != nil { + t.Errorf("cannot read all data for %v: %v", objectID, err) + continue + } + if !bytes.Equal(d, c.payload) { + t.Errorf("incorrect payload for %v: expected: %v got: %v", objectID, c.payload, d) + continue + } + } +} + +func TestReaderStoredBlockNotFound(t *testing.T) { + ctx := context.Background() + _, om := setupTest(t) + + objectID, err := ParseID("deadbeef") + if err != nil { + t.Errorf("cannot parse object ID: %v", err) + } + reader, err := om.Open(ctx, objectID) + if err != storage.ErrBlockNotFound || reader != nil { + t.Errorf("unexpected result: reader: %v err: %v", reader, err) + } +} + +func TestEndToEndReadAndSeek(t *testing.T) { + ctx := context.Background() + _, om := setupTest(t) + + for _, size := range []int{1, 199, 200, 201, 9999, 512434} { + // Create some random data sample of the specified size. 
+ randomData := make([]byte, size) + cryptorand.Read(randomData) //nolint:errcheck + + writer := om.NewWriter(ctx, WriterOptions{}) + if _, err := writer.Write(randomData); err != nil { + t.Errorf("write error: %v", err) + } + objectID, err := writer.Result() + writer.Close() + if err != nil { + t.Errorf("cannot get writer result for %v: %v", size, err) + continue + } + + verify(ctx, t, om, objectID, randomData, fmt.Sprintf("%v %v", objectID, size)) + } +} + +func verify(ctx context.Context, t *testing.T, om *Manager, objectID ID, expectedData []byte, testCaseID string) { + t.Helper() + reader, err := om.Open(ctx, objectID) + if err != nil { + t.Errorf("cannot get reader for %v (%v): %v %v", testCaseID, objectID, err, string(debug.Stack())) + return + } + + for i := 0; i < 20; i++ { + sampleSize := int(rand.Int31n(300)) + seekOffset := int(rand.Int31n(int32(len(expectedData)))) + if seekOffset+sampleSize > len(expectedData) { + sampleSize = len(expectedData) - seekOffset + } + if sampleSize > 0 { + got := make([]byte, sampleSize) + if offset, err := reader.Seek(int64(seekOffset), 0); err != nil || offset != int64(seekOffset) { + t.Errorf("seek error: %v offset=%v expected:%v", err, offset, seekOffset) + } + if n, err := reader.Read(got); err != nil || n != sampleSize { + t.Errorf("invalid data: n=%v, expected=%v, err:%v", n, sampleSize, err) + } + + expected := expectedData[seekOffset : seekOffset+sampleSize] + + if !bytes.Equal(expected, got) { + t.Errorf("incorrect data read for %v: expected: %x, got: %x", testCaseID, expected, got) + } + } + } +} diff --git a/object/object_reader.go b/object/object_reader.go new file mode 100644 index 000000000..5097d5fff --- /dev/null +++ b/object/object_reader.go @@ -0,0 +1,159 @@ +package object + +import ( + "context" + "fmt" + "io" +) + +func (i *indirectObjectEntry) endOffset() int64 { + return i.Start + i.Length +} + +type objectReader struct { + ctx context.Context + repo *Manager + + seekTable []indirectObjectEntry + + currentPosition int64 // Overall position in the objectReader + totalLength int64 // Overall length + + currentChunkIndex int // Index of current chunk in the seek table + currentChunkData []byte // Current chunk data + currentChunkPosition int // Read position in the current chunk +} + +func (r *objectReader) Read(buffer []byte) (int, error) { + readBytes := 0 + remaining := len(buffer) + + for remaining > 0 { + if r.currentChunkData != nil { + toCopy := len(r.currentChunkData) - r.currentChunkPosition + if toCopy == 0 { + // EOF on curren chunk + r.closeCurrentChunk() + r.currentChunkIndex++ + continue + } + + if toCopy > remaining { + toCopy = remaining + } + + copy(buffer[readBytes:], + r.currentChunkData[r.currentChunkPosition:r.currentChunkPosition+toCopy]) + r.currentChunkPosition += toCopy + r.currentPosition += int64(toCopy) + readBytes += toCopy + remaining -= toCopy + } else if r.currentChunkIndex < len(r.seekTable) { + err := r.openCurrentChunk() + if err != nil { + return 0, err + } + } else { + break + } + } + + if readBytes == 0 { + return readBytes, io.EOF + } + + return readBytes, nil +} + +func (r *objectReader) openCurrentChunk() error { + st := r.seekTable[r.currentChunkIndex] + blockData, err := r.repo.Open(r.ctx, st.Object) + if err != nil { + return err + } + defer blockData.Close() //nolint:errcheck + + b := make([]byte, st.Length) + if _, err := io.ReadFull(blockData, b); err != nil { + return err + } + + r.currentChunkData = b + r.currentChunkPosition = 0 + return nil +} + +func (r *objectReader) 
closeCurrentChunk() { + r.currentChunkData = nil +} + +func (r *objectReader) findChunkIndexForOffset(offset int64) (int, error) { + left := 0 + right := len(r.seekTable) - 1 + for left <= right { + middle := (left + right) / 2 + + if offset < r.seekTable[middle].Start { + right = middle - 1 + continue + } + + if offset >= r.seekTable[middle].endOffset() { + left = middle + 1 + continue + } + + return middle, nil + } + + return 0, fmt.Errorf("can't find chunk for offset %v", offset) +} + +func (r *objectReader) Seek(offset int64, whence int) (int64, error) { + if whence == 1 { + return r.Seek(r.currentPosition+offset, 0) + } + + if whence == 2 { + return r.Seek(r.totalLength+offset, 0) + } + + if offset < 0 { + return -1, fmt.Errorf("invalid seek %v %v", offset, whence) + } + + if offset > r.totalLength { + offset = r.totalLength + } + + index, err := r.findChunkIndexForOffset(offset) + if err != nil { + return -1, fmt.Errorf("invalid seek %v %v: %v", offset, whence, err) + } + + chunkStartOffset := r.seekTable[index].Start + + if index != r.currentChunkIndex { + r.closeCurrentChunk() + r.currentChunkIndex = index + } + + if r.currentChunkData == nil { + if err := r.openCurrentChunk(); err != nil { + return 0, err + } + } + + r.currentChunkPosition = int(offset - chunkStartOffset) + r.currentPosition = offset + + return r.currentPosition, nil +} + +func (r *objectReader) Close() error { + return nil +} + +func (r *objectReader) Length() int64 { + return r.totalLength +} diff --git a/object/object_splitter.go b/object/object_splitter.go new file mode 100644 index 000000000..085274687 --- /dev/null +++ b/object/object_splitter.go @@ -0,0 +1,110 @@ +package object + +import ( + "math" + "sort" + + "github.com/silvasur/buzhash" +) + +type objectSplitter interface { + add(b byte) bool +} + +// SupportedSplitters is a list of supported object splitters including: +// +// NEVER - prevents objects from ever splitting +// FIXED - always splits large objects exactly at the maximum block size boundary +// DYNAMIC - dynamically splits large objects based on rolling hash of contents. +var SupportedSplitters []string + +var splitterFactories = map[string]func(*Format) objectSplitter{ + "NEVER": func(f *Format) objectSplitter { + return newNeverSplitter() + }, + "FIXED": func(f *Format) objectSplitter { + return newFixedSplitter(f.MaxBlockSize) + }, + "DYNAMIC": func(f *Format) objectSplitter { + return newRollingHashSplitter(buzhash.NewBuzHash(32), f.MinBlockSize, f.AvgBlockSize, f.MaxBlockSize) + }, +} + +func init() { + for k := range splitterFactories { + SupportedSplitters = append(SupportedSplitters, k) + } + sort.Strings(SupportedSplitters) +} + +// DefaultSplitter is the name of the splitter used by default for new repositories. 
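+// The DYNAMIC splitter (rollingHashSplitter below) cuts a block when a rolling
+// buzhash of the content matches a bit mask derived from AvgBlockSize, subject to
+// the MinBlockSize and MaxBlockSize bounds, so block boundaries depend on content
+// rather than on fixed offsets.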
+const DefaultSplitter = "DYNAMIC" + +type neverSplitter struct{} + +func (s *neverSplitter) add(b byte) bool { + return false +} + +func newNeverSplitter() objectSplitter { + return &neverSplitter{} +} + +type fixedSplitter struct { + cur int + chunkLength int +} + +func (s *fixedSplitter) add(b byte) bool { + s.cur++ + if s.cur >= s.chunkLength { + s.cur = 0 + return true + } + + return false +} + +func newFixedSplitter(chunkLength int) objectSplitter { + return &fixedSplitter{chunkLength: chunkLength} +} + +type rollingHash interface { + HashByte(b byte) uint32 +} + +type rollingHashSplitter struct { + rh rollingHash + mask uint32 + + currentBlockSize int + minBlockSize int + maxBlockSize int +} + +func (rs *rollingHashSplitter) add(b byte) bool { + sum := rs.rh.HashByte(b) + rs.currentBlockSize++ + if rs.currentBlockSize >= rs.maxBlockSize { + rs.currentBlockSize = 0 + return true + } + if sum&rs.mask == 0 && rs.currentBlockSize > rs.minBlockSize && sum != 0 { + //log.Printf("splitting %v on sum %x mask %x", rs.currentBlockSize, sum, rs.mask) + rs.currentBlockSize = 0 + return true + } + return false +} + +func newRollingHashSplitter(rh rollingHash, minBlockSize int, approxBlockSize int, maxBlockSize int) objectSplitter { + bits := rollingHashBits(approxBlockSize) + mask := ^(^uint32(0) << bits) + return &rollingHashSplitter{rh, mask, 0, minBlockSize, maxBlockSize} +} + +func rollingHashBits(n int) uint { + e := math.Log2(float64(n)) + exp := math.Floor(e + 0.5) + return uint(exp) +} diff --git a/object/object_splitter_test.go b/object/object_splitter_test.go new file mode 100644 index 000000000..1b0e3592b --- /dev/null +++ b/object/object_splitter_test.go @@ -0,0 +1,134 @@ +package object + +import ( + "math" + "math/rand" + "testing" + + "github.com/silvasur/buzhash" +) + +func TestSplitters(t *testing.T) { + cases := []struct { + desc string + newSplitter func() objectSplitter + }{ + {"rolling buzhash with 3 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 8, 20) }}, + {"rolling buzhash with 5 bits", func() objectSplitter { return newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 20) }}, + } + + for _, tc := range cases { + s1 := tc.newSplitter() + s2 := tc.newSplitter() + + rnd := make([]byte, 50000000) + rand.Read(rnd) + + for i, p := range rnd { + if got, want := s1.add(p), s2.add(p); got != want { + t.Errorf("incorrect add() result for %v at offset %v", tc.desc, i) + } + } + } +} + +func TestSplitterStability(t *testing.T) { + r := rand.New(rand.NewSource(5)) + rnd := make([]byte, 5000000) + if n, err := r.Read(rnd); n != len(rnd) || err != nil { + t.Fatalf("can't initialize random data: %v", err) + } + + cases := []struct { + splitter objectSplitter + count int + avg int + minSplit int + maxSplit int + }{ + {newFixedSplitter(1000), 5000, 1000, 1000, 1000}, + {newFixedSplitter(10000), 500, 10000, 10000, 10000}, + + {newNeverSplitter(), 0, 0, math.MaxInt32, 0}, + + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, math.MaxInt32), 156262, 31, 1, 404}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, math.MaxInt32), 4933, 1013, 1, 8372}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, math.MaxInt32), 2476, 2019, 1, 19454}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32768, math.MaxInt32), 185, 27027, 1, 177510}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 65536, math.MaxInt32), 99, 50505, 418, 230449}, + + // min and max + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 32, 64), 179921, 27, 1, 
64}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 1024, 10000), 4933, 1013, 1, 8372}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 0, 2048, 10000), 2490, 2008, 1, 10000}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 32768, 100000), 183, 27322, 522, 100000}, + {newRollingHashSplitter(buzhash.NewBuzHash(32), 500, 65536, 100000), 113, 44247, 522, 100000}, + } + + for _, tc := range cases { + s := tc.splitter + + lastSplit := -1 + maxSplit := 0 + minSplit := int(math.MaxInt32) + count := 0 + for i, p := range rnd { + if s.add(p) { + l := i - lastSplit + if l >= maxSplit { + maxSplit = l + } + if l < minSplit { + minSplit = l + } + count++ + lastSplit = i + } + } + + var avg int + if count > 0 { + avg = len(rnd) / count + } + + if got, want := avg, tc.avg; got != want { + t.Errorf("invalid split average size %v, wanted %v", got, want) + } + + if got, want := count, tc.count; got != want { + t.Errorf("invalid split count %v, wanted %v", got, want) + } + if got, want := minSplit, tc.minSplit; got != want { + t.Errorf("min split %v, wanted %v", got, want) + } + if got, want := maxSplit, tc.maxSplit; got != want { + t.Errorf("max split %v, wanted %v", got, want) + } + } +} + +func TestRollingHashBits(t *testing.T) { + cases := []struct { + blockSize int + bits uint + }{ + {256, 8}, + {128, 7}, + {100, 7}, + {500, 9}, + {700, 9}, + {724, 9}, + {725, 10}, + {768, 10}, + {1000, 10}, + {1000000, 20}, + {10000000, 23}, + {20000000, 24}, + } + + for _, tc := range cases { + if got, want := rollingHashBits(tc.blockSize), tc.bits; got != want { + t.Errorf("rollingHashBits(%v) = %v, wanted %v", tc.blockSize, got, want) + } + } +} diff --git a/object/object_writer.go b/object/object_writer.go new file mode 100644 index 000000000..ef0b35e04 --- /dev/null +++ b/object/object_writer.go @@ -0,0 +1,145 @@ +package object + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "sync" + + "github.com/pkg/errors" +) + +// Writer allows writing content to the storage and supports automatic deduplication and encryption +// of written data. 
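+//
+// A hypothetical use (illustrative sketch only; 'om' and 'data' are assumed
+// names): the object manager hands out a Writer, the caller streams bytes into
+// it, and Result() returns the final object ID.
+//
+//	w := om.NewWriter(ctx, WriterOptions{Description: "example data"})
+//	defer w.Close() //nolint:errcheck
+//	if _, err := w.Write(data); err != nil {
+//		return "", err
+//	}
+//	oid, err := w.Result()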
+type Writer interface { + io.WriteCloser + + Result() (ID, error) +} + +type blockTracker struct { + mu sync.Mutex + blocks map[string]bool +} + +func (t *blockTracker) addBlock(blockID string) { + t.mu.Lock() + defer t.mu.Unlock() + + if t.blocks == nil { + t.blocks = make(map[string]bool) + } + t.blocks[blockID] = true +} + +func (t *blockTracker) blockIDs() []string { + t.mu.Lock() + defer t.mu.Unlock() + + result := make([]string, 0, len(t.blocks)) + for k := range t.blocks { + result = append(result, k) + } + return result +} + +type objectWriter struct { + ctx context.Context + repo *Manager + + prefix string + buffer bytes.Buffer + totalLength int64 + + currentPosition int64 + blockIndex []indirectObjectEntry + + description string + + splitter objectSplitter +} + +func (w *objectWriter) Close() error { + return nil +} + +func (w *objectWriter) Write(data []byte) (n int, err error) { + dataLen := len(data) + w.totalLength += int64(dataLen) + + for _, d := range data { + w.buffer.WriteByte(d) + + if w.splitter.add(d) { + if err := w.flushBuffer(); err != nil { + return 0, err + } + } + } + + return dataLen, nil +} + +func (w *objectWriter) flushBuffer() error { + length := w.buffer.Len() + chunkID := len(w.blockIndex) + w.blockIndex = append(w.blockIndex, indirectObjectEntry{}) + w.blockIndex[chunkID].Start = w.currentPosition + w.blockIndex[chunkID].Length = int64(length) + w.currentPosition += int64(length) + + var b2 bytes.Buffer + w.buffer.WriteTo(&b2) //nolint:errcheck + w.buffer.Reset() + + blockID, err := w.repo.blockMgr.WriteBlock(w.ctx, b2.Bytes(), w.prefix) + w.repo.trace("OBJECT_WRITER(%q) stored %v (%v bytes)", w.description, blockID, length) + if err != nil { + return fmt.Errorf("error when flushing chunk %d of %s: %v", chunkID, w.description, err) + } + + w.blockIndex[chunkID].Object = DirectObjectID(blockID) + return nil +} + +func (w *objectWriter) Result() (ID, error) { + if w.buffer.Len() > 0 || len(w.blockIndex) == 0 { + if err := w.flushBuffer(); err != nil { + return "", err + } + } + + if len(w.blockIndex) == 1 { + return w.blockIndex[0].Object, nil + } + + iw := &objectWriter{ + ctx: w.ctx, + repo: w.repo, + description: "LIST(" + w.description + ")", + splitter: w.repo.newSplitter(), + prefix: w.prefix, + } + + ind := indirectObject{ + StreamID: "kopia:indirect", + Entries: w.blockIndex, + } + + if err := json.NewEncoder(iw).Encode(ind); err != nil { + return "", errors.Wrap(err, "unable to write indirect block index") + } + oid, err := iw.Result() + if err != nil { + return "", err + } + return IndirectObjectID(oid), nil +} + +// WriterOptions can be passed to Repository.NewWriter() +type WriterOptions struct { + Description string + Prefix string // empty string or a single-character ('g'..'z') +} diff --git a/object/objectid.go b/object/objectid.go new file mode 100644 index 000000000..1f9b7b150 --- /dev/null +++ b/object/objectid.go @@ -0,0 +1,94 @@ +package object + +import ( + "encoding/hex" + "fmt" + "strings" +) + +// ID is an identifier of a repository object. Repository objects can be stored. +// +// 1. In a single content block, this is the most common case for small objects. +// 2. In a series of content blocks with an indirect block pointing at them (multiple indirections are allowed). +// This is used for larger files. Object IDs using indirect blocks start with "I" +type ID string + +// HasObjectID exposes the identifier of an object. 
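+//
+// A type that records the ID of its backing object can satisfy this interface
+// trivially; for example (hypothetical type, shown only for illustration):
+//
+//	type storedEntry struct{ oid ID }
+//
+//	func (e storedEntry) ObjectID() ID { return e.oid }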
+type HasObjectID interface { + ObjectID() ID +} + +// String returns string representation of ObjectID that is suitable for displaying in the UI. +func (i ID) String() string { + return strings.Replace(string(i), "D", "", -1) +} + +// IndexObjectID returns the object ID of the underlying index object. +func (i ID) IndexObjectID() (ID, bool) { + if strings.HasPrefix(string(i), "I") { + return i[1:], true + } + + return "", false +} + +// BlockID returns the block ID of the underlying content storage block. +func (i ID) BlockID() (string, bool) { + if strings.HasPrefix(string(i), "D") { + return string(i[1:]), true + } + if strings.HasPrefix(string(i), "I") { + return "", false + } + + return string(i), true +} + +// Validate checks the ID format for validity and reports any errors. +func (i ID) Validate() error { + if indexObjectID, ok := i.IndexObjectID(); ok { + if err := indexObjectID.Validate(); err != nil { + return fmt.Errorf("invalid indirect object ID %v: %v", i, err) + } + + return nil + } + + if blockID, ok := i.BlockID(); ok { + if len(blockID) < 2 { + return fmt.Errorf("missing block ID") + } + + // odd length - firstcharacter must be a single character between 'g' and 'z' + if len(blockID)%2 == 1 { + if blockID[0] < 'g' || blockID[0] > 'z' { + return fmt.Errorf("invalid block ID prefix: %v", blockID) + } + blockID = blockID[1:] + } + + if _, err := hex.DecodeString(blockID); err != nil { + return fmt.Errorf("invalid blockID suffix, must be base-16 encoded: %v", blockID) + } + + return nil + } + + return fmt.Errorf("invalid object ID: %v", i) +} + +// DirectObjectID returns direct object ID based on the provided block ID. +func DirectObjectID(blockID string) ID { + return ID(blockID) +} + +// IndirectObjectID returns indirect object ID based on the underlying index object ID. 
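+//
+// For example, IndirectObjectID(DirectObjectID("deadbeef")) yields the ID
+// "Ideadbeef"; applying it again yields "IIdeadbeef" (an index of an index).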
+func IndirectObjectID(indexObjectID ID) ID { + return "I" + indexObjectID +} + +// ParseID converts the specified string into object ID +func ParseID(s string) (ID, error) { + i := ID(s) + return i, i.Validate() +} diff --git a/object/objectid_test.go b/object/objectid_test.go new file mode 100644 index 000000000..022bebf4a --- /dev/null +++ b/object/objectid_test.go @@ -0,0 +1,46 @@ +package object + +import ( + "testing" +) + +func TestParseObjectID(t *testing.T) { + cases := []struct { + text string + isValid bool + }{ + {"Df0f0", true}, + {"IDf0f0", true}, + {"IDf0f0", true}, + {"IIDf0f0", true}, + {"Dxf0f0", true}, + {"IDxf0f0", true}, + {"IDxf0f0", true}, + {"IIDxf0f0", true}, + {"Dxf0f", false}, + {"IDxf0f", false}, + {"Da", false}, + {"Daf0f0", false}, + {"", false}, + {"B!$@#$!@#$", false}, + {"X", false}, + {"I.", false}, + {"I.x", false}, + {"I.af", false}, + {"Ix.ag", false}, + {"Iab.", false}, + {"I1", false}, + {"I1,", false}, + {"I-1,X", false}, + {"Xsomething", false}, + } + + for _, tc := range cases { + _, err := ParseID(tc.text) + if err != nil && tc.isValid { + t.Errorf("error parsing %q: %v", tc.text, err) + } else if err == nil && !tc.isValid { + t.Errorf("unexpected success parsing %v", tc.text) + } + } +} diff --git a/open.go b/open.go new file mode 100644 index 000000000..7c2734695 --- /dev/null +++ b/open.go @@ -0,0 +1,209 @@ +package repo + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "path/filepath" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/internal/repologging" + "github.com/kopia/repo/manifest" + "github.com/kopia/repo/object" + "github.com/kopia/repo/storage" + "github.com/kopia/repo/storage/logging" + "github.com/pkg/errors" +) + +var ( + log = repologging.Logger("kopia/repo") +) + +// Options provides configuration parameters for connection to a repository. +type Options struct { + TraceStorage func(f string, args ...interface{}) // Logs all storage access using provided Printf-style function + ObjectManagerOptions object.ManagerOptions +} + +// Open opens a Repository specified in the configuration file. +func Open(ctx context.Context, configFile string, password string, options *Options) (rep *Repository, err error) { + log.Debugf("opening repository from %v", configFile) + defer func() { + if err == nil { + log.Debugf("opened repository") + } else { + log.Errorf("failed to open repository: %v", err) + } + }() + + if options == nil { + options = &Options{} + } + + configFile, err = filepath.Abs(configFile) + if err != nil { + return nil, err + } + + log.Debugf("loading config from file: %v", configFile) + lc, err := loadConfigFromFile(configFile) + if err != nil { + return nil, err + } + + log.Debugf("opening storage: %v", lc.Storage.Type) + + st, err := storage.NewStorage(ctx, lc.Storage) + if err != nil { + return nil, errors.Wrap(err, "cannot open storage") + } + + if options.TraceStorage != nil { + st = logging.NewWrapper(st, logging.Prefix("[STORAGE] "), logging.Output(options.TraceStorage)) + } + + r, err := OpenWithConfig(ctx, st, lc, password, options, lc.Caching) + if err != nil { + st.Close(ctx) //nolint:errcheck + return nil, err + } + + r.ConfigFile = configFile + + return r, nil +} + +// OpenWithConfig opens the repository with a given configuration, avoiding the need for a config file. 
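+//
+// A hypothetical caller that already holds an open storage.Storage and a
+// LocalConfig (variable names below are illustrative) can skip the config file
+// entirely:
+//
+//	rep, err := OpenWithConfig(ctx, st, lc, password, &Options{}, lc.Caching)
+//	if err != nil {
+//		return err
+//	}
+//	defer rep.Close(ctx) //nolint:errcheck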
+func OpenWithConfig(ctx context.Context, st storage.Storage, lc *LocalConfig, password string, options *Options, caching block.CachingOptions) (*Repository, error) { + log.Debugf("reading encrypted format block") + // Read cache block, potentially from cache. + fb, err := readAndCacheFormatBlockBytes(ctx, st, caching.CacheDirectory) + if err != nil { + return nil, errors.Wrap(err, "unable to read format block") + } + + f, err := parseFormatBlock(fb) + if err != nil { + return nil, errors.Wrap(err, "can't parse format block") + } + + fb, err = addFormatBlockChecksumAndLength(fb) + if err != nil { + return nil, fmt.Errorf("unable to add checksum") + } + + masterKey, err := f.deriveMasterKeyFromPassword(password) + if err != nil { + return nil, err + } + + repoConfig, err := f.decryptFormatBytes(masterKey) + if err != nil { + return nil, errors.Wrap(err, "unable to decrypt repository config") + } + + caching.HMACSecret = deriveKeyFromMasterKey(masterKey, f.UniqueID, []byte("local-cache-integrity"), 16) + + fo := repoConfig.FormattingOptions + if fo.MaxPackSize == 0 { + fo.MaxPackSize = repoConfig.MaxBlockSize + } + + log.Debugf("initializing block manager") + bm, err := block.NewManager(ctx, st, fo, caching, fb) + if err != nil { + return nil, errors.Wrap(err, "unable to open block manager") + } + + log.Debugf("initializing object manager") + om, err := object.NewObjectManager(ctx, bm, repoConfig.Format, options.ObjectManagerOptions) + if err != nil { + return nil, errors.Wrap(err, "unable to open object manager") + } + + log.Debugf("initializing manifest manager") + manifests, err := manifest.NewManager(ctx, bm) + if err != nil { + return nil, errors.Wrap(err, "unable to open manifests") + } + + return &Repository{ + Blocks: bm, + Objects: om, + Storage: st, + Manifests: manifests, + CacheDirectory: caching.CacheDirectory, + UniqueID: f.UniqueID, + + formatBlock: f, + masterKey: masterKey, + }, nil +} + +// SetCachingConfig changes caching configuration for a given repository config file. +func SetCachingConfig(ctx context.Context, configFile string, opt block.CachingOptions) error { + configFile, err := filepath.Abs(configFile) + if err != nil { + return err + } + + lc, err := loadConfigFromFile(configFile) + if err != nil { + return err + } + + st, err := storage.NewStorage(ctx, lc.Storage) + if err != nil { + return errors.Wrap(err, "cannot open storage") + } + + fb, err := readAndCacheFormatBlockBytes(ctx, st, "") + if err != nil { + return errors.Wrap(err, "can't read format block") + } + + f, err := parseFormatBlock(fb) + if err != nil { + return errors.Wrap(err, "can't parse format block") + } + + if err = setupCaching(configFile, lc, opt, f.UniqueID); err != nil { + return errors.Wrap(err, "unable to set up caching") + } + + d, err := json.MarshalIndent(&lc, "", " ") + if err != nil { + return err + } + + if err := ioutil.WriteFile(configFile, d, 0600); err != nil { + return nil + } + + return nil +} + +func readAndCacheFormatBlockBytes(ctx context.Context, st storage.Storage, cacheDirectory string) ([]byte, error) { + cachedFile := filepath.Join(cacheDirectory, "kopia.repository") + if cacheDirectory != "" { + b, err := ioutil.ReadFile(cachedFile) + if err == nil { + // read from cache. 
+ return b, nil + } + } + + b, err := st.GetBlock(ctx, FormatBlockID, 0, -1) + if err != nil { + return nil, err + } + + if cacheDirectory != "" { + if err := ioutil.WriteFile(cachedFile, b, 0600); err != nil { + log.Warningf("warning: unable to write cache: %v", err) + } + } + + return b, nil +} diff --git a/repository.go b/repository.go new file mode 100644 index 000000000..01f5dff1e --- /dev/null +++ b/repository.go @@ -0,0 +1,87 @@ +package repo + +import ( + "context" + "time" + + "github.com/kopia/repo/block" + "github.com/kopia/repo/manifest" + "github.com/kopia/repo/object" + "github.com/kopia/repo/storage" + "github.com/pkg/errors" +) + +// Repository represents storage where both content-addressable and user-addressable data is kept. +type Repository struct { + Blocks *block.Manager + Objects *object.Manager + Storage storage.Storage + Manifests *manifest.Manager + UniqueID []byte + + ConfigFile string + CacheDirectory string + + formatBlock *formatBlock + masterKey []byte +} + +// Close closes the repository and releases all resources. +func (r *Repository) Close(ctx context.Context) error { + if err := r.Manifests.Flush(ctx); err != nil { + return errors.Wrap(err, "error flushing manifests") + } + if err := r.Blocks.Flush(ctx); err != nil { + return errors.Wrap(err, "error closing blocks") + } + if err := r.Storage.Close(ctx); err != nil { + return errors.Wrap(err, "error closing storage") + } + return nil +} + +// Flush waits for all in-flight writes to complete. +func (r *Repository) Flush(ctx context.Context) error { + if err := r.Manifests.Flush(ctx); err != nil { + return err + } + + return r.Blocks.Flush(ctx) +} + +// Refresh periodically makes external changes visible to repository. +func (r *Repository) Refresh(ctx context.Context) error { + updated, err := r.Blocks.Refresh(ctx) + if err != nil { + return errors.Wrap(err, "error refreshing block index") + } + + if !updated { + return nil + } + + log.Debugf("block index refreshed") + + if err := r.Manifests.Refresh(ctx); err != nil { + return errors.Wrap(err, "error reloading manifests") + } + + log.Debugf("manifests refreshed") + + return nil +} + +// RefreshPeriodically periodically refreshes the repository to reflect the changes made by other hosts. 
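+//
+// It blocks until ctx is cancelled, so it is normally run on its own goroutine;
+// a minimal sketch (the interval below is chosen arbitrarily for illustration):
+//
+//	ctx, cancel := context.WithCancel(context.Background())
+//	defer cancel()
+//	go rep.RefreshPeriodically(ctx, 15*time.Second)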
+func (r *Repository) RefreshPeriodically(ctx context.Context, interval time.Duration) { + for { + select { + case <-ctx.Done(): + return + + case <-time.After(interval): + if err := r.Refresh(ctx); err != nil { + log.Warningf("error refreshing repository: %v", err) + } + } + } +} diff --git a/repository_test.go b/repository_test.go new file mode 100644 index 000000000..822ffe6e0 --- /dev/null +++ b/repository_test.go @@ -0,0 +1,328 @@ +package repo_test + +import ( + "bytes" + "context" + cryptorand "crypto/rand" + "fmt" + "io/ioutil" + "math/rand" + "reflect" + "runtime/debug" + "testing" + + "github.com/kopia/repo" + "github.com/kopia/repo/block" + "github.com/kopia/repo/internal/repotesting" + "github.com/kopia/repo/object" + "github.com/kopia/repo/storage" +) + +func TestWriters(t *testing.T) { + cases := []struct { + data []byte + objectID object.ID + }{ + { + []byte("the quick brown fox jumps over the lazy dog"), + "345acef0bcf82f1daf8e49fab7b7fac7ec296c518501eabea3645b99345a4e08", + }, + {make([]byte, 100), "1d804f1f69df08f3f59070bf962de69433e3d61ac18522a805a84d8c92741340"}, // 100 zero bytes + } + + ctx := context.Background() + + for _, c := range cases { + var env repotesting.Environment + defer env.Setup(t).Close(t) + + writer := env.Repository.Objects.NewWriter(ctx, object.WriterOptions{}) + if _, err := writer.Write(c.data); err != nil { + t.Fatalf("write error: %v", err) + } + + result, err := writer.Result() + if err != nil { + t.Errorf("error getting writer results for %v, expected: %v", c.data, c.objectID.String()) + continue + } + + if !objectIDsEqual(result, c.objectID) { + t.Errorf("incorrect result for %v, expected: %v got: %v", c.data, c.objectID.String(), result.String()) + } + + env.Repository.Blocks.Flush(ctx) + } +} + +func objectIDsEqual(o1 object.ID, o2 object.ID) bool { + return reflect.DeepEqual(o1, o2) +} + +func TestWriterCompleteChunkInTwoWrites(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + ctx := context.Background() + + bytes := make([]byte, 100) + writer := env.Repository.Objects.NewWriter(ctx, object.WriterOptions{}) + writer.Write(bytes[0:50]) //nolint:errcheck + writer.Write(bytes[0:50]) //nolint:errcheck + result, err := writer.Result() + if result != "1d804f1f69df08f3f59070bf962de69433e3d61ac18522a805a84d8c92741340" { + t.Errorf("unexpected result: %v err: %v", result, err) + } +} + +func TestPackingSimple(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + + ctx := context.Background() + + content1 := "hello, how do you do?" + content2 := "hi, how are you?" + content3 := "thank you!" 
+ + oid1a := writeObject(ctx, t, env.Repository, []byte(content1), "packed-object-1a") + oid1b := writeObject(ctx, t, env.Repository, []byte(content1), "packed-object-1b") + oid2a := writeObject(ctx, t, env.Repository, []byte(content2), "packed-object-2a") + oid2b := writeObject(ctx, t, env.Repository, []byte(content2), "packed-object-2b") + + oid3a := writeObject(ctx, t, env.Repository, []byte(content3), "packed-object-3a") + oid3b := writeObject(ctx, t, env.Repository, []byte(content3), "packed-object-3b") + verify(ctx, t, env.Repository, oid1a, []byte(content1), "packed-object-1") + verify(ctx, t, env.Repository, oid2a, []byte(content2), "packed-object-2") + oid2c := writeObject(ctx, t, env.Repository, []byte(content2), "packed-object-2c") + oid1c := writeObject(ctx, t, env.Repository, []byte(content1), "packed-object-1c") + + env.Repository.Blocks.Flush(ctx) + + if got, want := oid1a.String(), oid1b.String(); got != want { + t.Errorf("oid1a(%q) != oid1b(%q)", got, want) + } + if got, want := oid1a.String(), oid1c.String(); got != want { + t.Errorf("oid1a(%q) != oid1c(%q)", got, want) + } + if got, want := oid2a.String(), oid2b.String(); got != want { + t.Errorf("oid2(%q)a != oidb(%q)", got, want) + } + if got, want := oid2a.String(), oid2c.String(); got != want { + t.Errorf("oid2(%q)a != oidc(%q)", got, want) + } + if got, want := oid3a.String(), oid3b.String(); got != want { + t.Errorf("oid3a(%q) != oid3b(%q)", got, want) + } + + env.VerifyStorageBlockCount(t, 3) + + env.MustReopen(t) + + verify(ctx, t, env.Repository, oid1a, []byte(content1), "packed-object-1") + verify(ctx, t, env.Repository, oid2a, []byte(content2), "packed-object-2") + verify(ctx, t, env.Repository, oid3a, []byte(content3), "packed-object-3") + + if err := env.Repository.Blocks.CompactIndexes(ctx, block.CompactOptions{MinSmallBlocks: 1, MaxSmallBlocks: 1}); err != nil { + t.Errorf("optimize error: %v", err) + } + + env.MustReopen(t) + + verify(ctx, t, env.Repository, oid1a, []byte(content1), "packed-object-1") + verify(ctx, t, env.Repository, oid2a, []byte(content2), "packed-object-2") + verify(ctx, t, env.Repository, oid3a, []byte(content3), "packed-object-3") + + if err := env.Repository.Blocks.CompactIndexes(ctx, block.CompactOptions{MinSmallBlocks: 1, MaxSmallBlocks: 1}); err != nil { + t.Errorf("optimize error: %v", err) + } + + env.MustReopen(t) + + verify(ctx, t, env.Repository, oid1a, []byte(content1), "packed-object-1") + verify(ctx, t, env.Repository, oid2a, []byte(content2), "packed-object-2") + verify(ctx, t, env.Repository, oid3a, []byte(content3), "packed-object-3") +} + +func TestHMAC(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + ctx := context.Background() + + content := bytes.Repeat([]byte{0xcd}, 50) + + w := env.Repository.Objects.NewWriter(ctx, object.WriterOptions{}) + w.Write(content) //nolint:errcheck + result, err := w.Result() + if result.String() != "367352007ee6ca9fa755ce8352347d092c17a24077fd33c62f655574a8cf906d" { + t.Errorf("unexpected result: %v err: %v", result.String(), err) + } +} + +func TestUpgrade(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + ctx := context.Background() + + if err := env.Repository.Upgrade(ctx); err != nil { + t.Errorf("upgrade error: %v", err) + } + + if err := env.Repository.Upgrade(ctx); err != nil { + t.Errorf("2nd upgrade error: %v", err) + } +} + +func TestReaderStoredBlockNotFound(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + ctx := 
context.Background() + + objectID, err := object.ParseID("Ddeadbeef") + if err != nil { + t.Errorf("cannot parse object ID: %v", err) + } + reader, err := env.Repository.Objects.Open(ctx, objectID) + if err != storage.ErrBlockNotFound || reader != nil { + t.Errorf("unexpected result: reader: %v err: %v", reader, err) + } +} + +func TestEndToEndReadAndSeek(t *testing.T) { + var env repotesting.Environment + defer env.Setup(t).Close(t) + ctx := context.Background() + + for _, size := range []int{1, 199, 200, 201, 9999, 512434} { + // Create some random data sample of the specified size. + randomData := make([]byte, size) + cryptorand.Read(randomData) //nolint:errcheck + + writer := env.Repository.Objects.NewWriter(ctx, object.WriterOptions{}) + writer.Write(randomData) //nolint:errcheck + objectID, err := writer.Result() + writer.Close() + if err != nil { + t.Errorf("cannot get writer result for %v: %v", size, err) + continue + } + + verify(ctx, t, env.Repository, objectID, randomData, fmt.Sprintf("%v %v", objectID, size)) + } +} + +func writeObject(ctx context.Context, t *testing.T, rep *repo.Repository, data []byte, testCaseID string) object.ID { + w := rep.Objects.NewWriter(ctx, object.WriterOptions{}) + if _, err := w.Write(data); err != nil { + t.Fatalf("can't write object %q - write failed: %v", testCaseID, err) + + } + oid, err := w.Result() + if err != nil { + t.Fatalf("can't write object %q - result failed: %v", testCaseID, err) + } + + return oid +} + +func verify(ctx context.Context, t *testing.T, rep *repo.Repository, objectID object.ID, expectedData []byte, testCaseID string) { + t.Helper() + reader, err := rep.Objects.Open(ctx, objectID) + if err != nil { + t.Errorf("cannot get reader for %v (%v): %v %v", testCaseID, objectID, err, string(debug.Stack())) + return + } + + for i := 0; i < 20; i++ { + sampleSize := int(rand.Int31n(300)) + seekOffset := int(rand.Int31n(int32(len(expectedData)))) + if seekOffset+sampleSize > len(expectedData) { + sampleSize = len(expectedData) - seekOffset + } + if sampleSize > 0 { + got := make([]byte, sampleSize) + if offset, err := reader.Seek(int64(seekOffset), 0); err != nil || offset != int64(seekOffset) { + t.Errorf("seek error: %v offset=%v expected:%v", err, offset, seekOffset) + } + if n, err := reader.Read(got); err != nil || n != sampleSize { + t.Errorf("invalid data: n=%v, expected=%v, err:%v", n, sampleSize, err) + } + + expected := expectedData[seekOffset : seekOffset+sampleSize] + + if !bytes.Equal(expected, got) { + t.Errorf("incorrect data read for %v: expected: %x, got: %x", testCaseID, expected, got) + } + } + } +} + +func TestFormats(t *testing.T) { + ctx := context.Background() + makeFormat := func(hash, encryption string) func(*repo.NewRepositoryOptions) { + return func(n *repo.NewRepositoryOptions) { + n.BlockFormat.Hash = hash + n.BlockFormat.Encryption = encryption + n.BlockFormat.HMACSecret = []byte("key") + n.ObjectFormat.MaxBlockSize = 10000 + n.ObjectFormat.Splitter = "FIXED" + } + } + + cases := []struct { + format func(*repo.NewRepositoryOptions) + oids map[string]object.ID + }{ + { + format: func(n *repo.NewRepositoryOptions) { + n.ObjectFormat.MaxBlockSize = 10000 + }, + oids: map[string]object.ID{ + "": "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad", + "The quick brown fox jumps over the lazy dog": "fb011e6154a19b9a4c767373c305275a5a69e8b68b0b4c9200c383dced19a416", + }, + }, + { + format: makeFormat("HMAC-SHA256", "NONE"), + oids: map[string]object.ID{ + "The quick brown fox jumps over the lazy 
dog": "f7bc83f430538424b13298e6aa6fb143ef4d59a14946175997479dbc2d1a3cd8", + }, + }, + { + format: makeFormat("HMAC-SHA256-128", "NONE"), + oids: map[string]object.ID{ + "The quick brown fox jumps over the lazy dog": "f7bc83f430538424b13298e6aa6fb143", + }, + }, + } + + for caseIndex, c := range cases { + var env repotesting.Environment + defer env.Setup(t, c.format).Close(t) + + for k, v := range c.oids { + bytesToWrite := []byte(k) + w := env.Repository.Objects.NewWriter(ctx, object.WriterOptions{}) + w.Write(bytesToWrite) //nolint:errcheck + oid, err := w.Result() + if err != nil { + t.Errorf("error: %v", err) + } + if !objectIDsEqual(oid, v) { + t.Errorf("invalid oid for #%v\ngot:\n%#v\nexpected:\n%#v", caseIndex, oid.String(), v.String()) + } + + rc, err := env.Repository.Objects.Open(ctx, oid) + if err != nil { + t.Errorf("open failed: %v", err) + continue + } + bytesRead, err := ioutil.ReadAll(rc) + if err != nil { + t.Errorf("error reading: %v", err) + } + if !bytes.Equal(bytesRead, bytesToWrite) { + t.Errorf("data mismatch, read:%x vs written:%v", bytesRead, bytesToWrite) + } + } + } +} diff --git a/storage/config.go b/storage/config.go new file mode 100644 index 000000000..5f8f77bcd --- /dev/null +++ b/storage/config.go @@ -0,0 +1,47 @@ +package storage + +import ( + "encoding/json" + "fmt" +) + +// ConnectionInfo represents JSON-serializable configuration of a blob storage. +type ConnectionInfo struct { + Type string + Config interface{} +} + +// UnmarshalJSON parses the JSON-encoded data into ConnectionInfo. +func (c *ConnectionInfo) UnmarshalJSON(b []byte) error { + raw := struct { + Type string `json:"type"` + Data json.RawMessage `json:"config"` + }{} + + if err := json.Unmarshal(b, &raw); err != nil { + return err + } + + c.Type = raw.Type + f := factories[raw.Type] + if f == nil { + return fmt.Errorf("storage type '%v' not registered", raw.Type) + } + c.Config = f.defaultConfigFunc() + if err := json.Unmarshal(raw.Data, c.Config); err != nil { + return fmt.Errorf("unable to unmarshal config: %v", err) + } + + return nil +} + +// MarshalJSON returns JSON-encoded storage configuration. +func (c ConnectionInfo) MarshalJSON() ([]byte, error) { + return json.Marshal(struct { + Type string `json:"type"` + Data interface{} `json:"config"` + }{ + Type: c.Type, + Data: c.Config, + }) +} diff --git a/storage/doc.go b/storage/doc.go new file mode 100644 index 000000000..a2c193b6f --- /dev/null +++ b/storage/doc.go @@ -0,0 +1,2 @@ +// Package storage implements simple storage of immutable, unstructured binary large objects (BLOBs). +package storage diff --git a/storage/filesystem/filesystem_options.go b/storage/filesystem/filesystem_options.go new file mode 100644 index 000000000..2ca3cf385 --- /dev/null +++ b/storage/filesystem/filesystem_options.go @@ -0,0 +1,40 @@ +package filesystem + +import "os" + +// Options defines options for Filesystem-backed storage. 
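+//
+// A hypothetical configuration (values are illustrative only) using a single
+// two-character shard level and restrictive permissions:
+//
+//	opt := &Options{
+//		Path:            "/var/lib/kopia/blocks",
+//		DirectoryShards: []int{2},
+//		FileMode:        0600,
+//		DirectoryMode:   0700,
+//	}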
+type Options struct { + Path string `json:"path"` + + DirectoryShards []int `json:"dirShards"` + + FileMode os.FileMode `json:"fileMode,omitempty"` + DirectoryMode os.FileMode `json:"dirMode,omitempty"` + + FileUID *int `json:"uid,omitempty"` + FileGID *int `json:"gid,omitempty"` +} + +func (fso *Options) fileMode() os.FileMode { + if fso.FileMode == 0 { + return fsDefaultFileMode + } + + return fso.FileMode +} + +func (fso *Options) dirMode() os.FileMode { + if fso.DirectoryMode == 0 { + return fsDefaultDirMode + } + + return fso.DirectoryMode +} + +func (fso *Options) shards() []int { + if fso.DirectoryShards == nil { + return fsDefaultShards + } + + return fso.DirectoryShards +} diff --git a/storage/filesystem/filesystem_storage.go b/storage/filesystem/filesystem_storage.go new file mode 100644 index 000000000..f9735b1e1 --- /dev/null +++ b/storage/filesystem/filesystem_storage.go @@ -0,0 +1,248 @@ +// Package filesystem implements filesystem-based Storage. +package filesystem + +import ( + "context" + "fmt" + "io" + "io/ioutil" + "math/rand" + "os" + "path/filepath" + "strings" + "time" + + "github.com/kopia/repo/internal/repologging" + "github.com/kopia/repo/storage" +) + +var log = repologging.Logger("repo/filesystem") + +const ( + fsStorageType = "filesystem" + fsStorageChunkSuffix = ".f" +) + +var ( + fsDefaultShards = []int{3, 3} + fsDefaultFileMode os.FileMode = 0600 + fsDefaultDirMode os.FileMode = 0700 +) + +type fsStorage struct { + Options +} + +func (fs *fsStorage) GetBlock(ctx context.Context, blockID string, offset, length int64) ([]byte, error) { + _, path := fs.getShardedPathAndFilePath(blockID) + + f, err := os.Open(path) + if os.IsNotExist(err) { + return nil, storage.ErrBlockNotFound + } + + if err != nil { + return nil, err + } + defer f.Close() //nolint:errcheck + + if length < 0 { + return ioutil.ReadAll(f) + } + + if _, err = f.Seek(offset, io.SeekStart); err != nil { + return nil, err + } + b, err := ioutil.ReadAll(io.LimitReader(f, length)) + if err != nil { + return nil, err + } + if int64(len(b)) != length { + return nil, fmt.Errorf("invalid length") + } + return b, nil +} + +func getstringFromFileName(name string) (string, bool) { + if strings.HasSuffix(name, fsStorageChunkSuffix) { + return name[0 : len(name)-len(fsStorageChunkSuffix)], true + } + + return string(""), false +} + +func makeFileName(blockID string) string { + return blockID + fsStorageChunkSuffix +} + +func (fs *fsStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + var walkDir func(string, string) error + + walkDir = func(directory string, currentPrefix string) error { + entries, err := ioutil.ReadDir(directory) + if err != nil { + return err + } + + for _, e := range entries { + if e.IsDir() { + newPrefix := currentPrefix + e.Name() + var match bool + + if len(prefix) > len(newPrefix) { + match = strings.HasPrefix(prefix, newPrefix) + } else { + match = strings.HasPrefix(newPrefix, prefix) + } + + if match { + if err := walkDir(directory+"/"+e.Name(), currentPrefix+e.Name()); err != nil { + return err + } + } + } else if fullID, ok := getstringFromFileName(currentPrefix + e.Name()); ok { + if strings.HasPrefix(fullID, prefix) { + if err := callback(storage.BlockMetadata{ + BlockID: fullID, + Length: e.Size(), + Timestamp: e.ModTime(), + }); err != nil { + return err + } + } + } + } + + return nil + } + + return walkDir(fs.Path, "") +} + +// TouchBlock updates file modification time to current time if it's sufficiently old. 
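+//
+// For example (sketch only; the block ID is a placeholder), refreshing a
+// block's timestamp only when it is older than 10 minutes:
+//
+//	err := fs.TouchBlock(ctx, "0f1e2d3c", 10*time.Minute)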
+func (fs *fsStorage) TouchBlock(ctx context.Context, blockID string, threshold time.Duration) error { + _, path := fs.getShardedPathAndFilePath(blockID) + st, err := os.Stat(path) + if err != nil { + return err + } + + n := time.Now() + age := n.Sub(st.ModTime()) + if age < threshold { + return nil + } + + log.Debugf("updating timestamp on %v to %v", path, n) + return os.Chtimes(path, n, n) +} + +func (fs *fsStorage) PutBlock(ctx context.Context, blockID string, data []byte) error { + _, path := fs.getShardedPathAndFilePath(blockID) + + tempFile := fmt.Sprintf("%s.tmp.%d", path, rand.Int()) + f, err := fs.createTempFileAndDir(tempFile) + if err != nil { + return fmt.Errorf("cannot create temporary file: %v", err) + } + + if _, err = f.Write(data); err != nil { + return fmt.Errorf("can't write temporary file: %v", err) + } + if err = f.Close(); err != nil { + return fmt.Errorf("can't close temporary file: %v", err) + } + + err = os.Rename(tempFile, path) + if err != nil { + if removeErr := os.Remove(tempFile); removeErr != nil { + log.Warningf("can't remove temp file: %v", removeErr) + } + return err + } + + if fs.FileUID != nil && fs.FileGID != nil && os.Geteuid() == 0 { + if chownErr := os.Chown(path, *fs.FileUID, *fs.FileGID); chownErr != nil { + log.Warningf("can't change file permissions: %v", chownErr) + } + } + + return nil +} + +func (fs *fsStorage) createTempFileAndDir(tempFile string) (*os.File, error) { + flags := os.O_CREATE | os.O_WRONLY | os.O_EXCL + f, err := os.OpenFile(tempFile, flags, fs.fileMode()) + if os.IsNotExist(err) { + if err = os.MkdirAll(filepath.Dir(tempFile), fs.dirMode()); err != nil { + return nil, fmt.Errorf("cannot create directory: %v", err) + } + return os.OpenFile(tempFile, flags, fs.fileMode()) + } + + return f, err +} + +func (fs *fsStorage) DeleteBlock(ctx context.Context, blockID string) error { + _, path := fs.getShardedPathAndFilePath(blockID) + err := os.Remove(path) + if err == nil || os.IsNotExist(err) { + return nil + } + + return err +} + +func (fs *fsStorage) getShardDirectory(blockID string) (string, string) { + shardPath := fs.Path + if len(blockID) < 20 { + return shardPath, blockID + } + for _, size := range fs.shards() { + shardPath = filepath.Join(shardPath, blockID[0:size]) + blockID = blockID[size:] + } + + return shardPath, blockID +} + +func (fs *fsStorage) getShardedPathAndFilePath(blockID string) (string, string) { + shardPath, blockID := fs.getShardDirectory(blockID) + result := filepath.Join(shardPath, makeFileName(blockID)) + return shardPath, result +} + +func (fs *fsStorage) ConnectionInfo() storage.ConnectionInfo { + return storage.ConnectionInfo{ + Type: fsStorageType, + Config: &fs.Options, + } +} + +func (fs *fsStorage) Close(ctx context.Context) error { + return nil +} + +// New creates new filesystem-backed storage in a specified directory. 
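+//
+// A minimal, hypothetical construction; the directory must already exist,
+// since New only verifies that the path is accessible:
+//
+//	st, err := New(ctx, &Options{Path: "/var/lib/kopia/blocks"})
+//	if err != nil {
+//		return err
+//	}
+//	defer st.Close(ctx) //nolint:errcheck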
+func New(ctx context.Context, opts *Options) (storage.Storage, error) { + var err error + + if _, err = os.Stat(opts.Path); err != nil { + return nil, fmt.Errorf("cannot access storage path: %v", err) + } + + r := &fsStorage{ + Options: *opts, + } + + return r, nil +} + +func init() { + storage.AddSupportedStorage( + fsStorageType, + func() interface{} { return &Options{} }, + func(ctx context.Context, o interface{}) (storage.Storage, error) { + return New(ctx, o.(*Options)) + }) +} diff --git a/storage/filesystem/filesystem_storage_test.go b/storage/filesystem/filesystem_storage_test.go new file mode 100644 index 000000000..1812f51ef --- /dev/null +++ b/storage/filesystem/filesystem_storage_test.go @@ -0,0 +1,120 @@ +package filesystem + +import ( + "context" + "io/ioutil" + "os" + "reflect" + "sort" + "testing" + "time" + + "github.com/kopia/repo/storage" + + "github.com/kopia/repo/internal/storagetesting" +) + +func TestFileStorage(t *testing.T) { + t.Parallel() + ctx := context.Background() + + // Test varioush shard configurations. + for _, shardSpec := range [][]int{ + {0}, + {1}, + {3, 3}, + {2}, + {1, 1}, + {1, 2}, + {2, 2, 2}, + } { + path, _ := ioutil.TempDir("", "r-fs") + defer os.RemoveAll(path) + + r, err := New(ctx, &Options{ + Path: path, + DirectoryShards: shardSpec, + }) + + if r == nil || err != nil { + t.Errorf("unexpected result: %v %v", r, err) + } + + storagetesting.VerifyStorage(ctx, t, r) + storagetesting.AssertConnectionInfoRoundTrips(ctx, t, r) + if err := r.Close(ctx); err != nil { + t.Fatalf("err: %v", err) + } + } +} + +func TestFileStorageTouch(t *testing.T) { + t.Parallel() + ctx := context.Background() + + t1 := "392ee1bc299db9f235e046a62625afb84902" + t2 := "2a7ff4f29eddbcd4c18fa9e73fec20bbb71f" + t3 := "0dae5918f83e6a24c8b3e274ca1026e43f24" + + path, _ := ioutil.TempDir("", "r-fs") + defer os.RemoveAll(path) + + r, err := New(ctx, &Options{ + Path: path, + }) + + if r == nil || err != nil { + t.Errorf("unexpected result: %v %v", r, err) + } + + fs := r.(*fsStorage) + assertNoError(t, fs.PutBlock(ctx, t1, []byte{1})) + time.Sleep(1 * time.Second) // sleep a bit to accommodate Apple filesystems with low timestamp resolution + assertNoError(t, fs.PutBlock(ctx, t2, []byte{1})) + time.Sleep(1 * time.Second) + assertNoError(t, fs.PutBlock(ctx, t3, []byte{1})) + + verifyBlockTimestampOrder(t, fs, t1, t2, t3) + + assertNoError(t, fs.TouchBlock(ctx, t2, 1*time.Hour)) // has no effect, all timestamps are very new + verifyBlockTimestampOrder(t, fs, t1, t2, t3) + + assertNoError(t, fs.TouchBlock(ctx, t1, 0)) // moves t1 to the top of the pile + verifyBlockTimestampOrder(t, fs, t2, t3, t1) + time.Sleep(1 * time.Second) + + assertNoError(t, fs.TouchBlock(ctx, t2, 0)) // moves t2 to the top of the pile + verifyBlockTimestampOrder(t, fs, t3, t1, t2) + time.Sleep(1 * time.Second) + + assertNoError(t, fs.TouchBlock(ctx, t1, 0)) // moves t1 to the top of the pile + verifyBlockTimestampOrder(t, fs, t3, t2, t1) +} + +func verifyBlockTimestampOrder(t *testing.T, st storage.Storage, want ...string) { + blocks, err := storage.ListAllBlocks(context.Background(), st, "") + if err != nil { + t.Errorf("error listing blocks: %v", err) + return + } + + sort.Slice(blocks, func(i, j int) bool { + return blocks[i].Timestamp.Before(blocks[j].Timestamp) + }) + + var got []string + for _, b := range blocks { + got = append(got, b.BlockID) + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("incorrect block order: %v, wanted %v", blocks, want) + } +} + +func assertNoError(t *testing.T, err 
error) { + t.Helper() + if err != nil { + t.Errorf("err: %v", err) + } +} diff --git a/storage/gcs/gcs_options.go b/storage/gcs/gcs_options.go new file mode 100644 index 000000000..9935fb8e3 --- /dev/null +++ b/storage/gcs/gcs_options.go @@ -0,0 +1,20 @@ +package gcs + +// Options defines options Google Cloud Storage-backed storage. +type Options struct { + // BucketName is the name of the GCS bucket where data is stored. + BucketName string `json:"bucket"` + + // Prefix specifies additional string to prepend to all objects. + Prefix string `json:"prefix,omitempty"` + + // ServiceAccountCredentials specifies the name of the file with GCS credentials. + ServiceAccountCredentials string `json:"credentialsFile,omitempty"` + + // ReadOnly causes GCS connection to be opened with read-only scope to prevent accidental mutations. + ReadOnly bool `json:"readOnly,omitempty"` + + MaxUploadSpeedBytesPerSecond int `json:"maxUploadSpeedBytesPerSecond,omitempty"` + + MaxDownloadSpeedBytesPerSecond int `json:"maxDownloadSpeedBytesPerSecond,omitempty"` +} diff --git a/storage/gcs/gcs_storage.go b/storage/gcs/gcs_storage.go new file mode 100644 index 000000000..b990fd334 --- /dev/null +++ b/storage/gcs/gcs_storage.go @@ -0,0 +1,270 @@ +// Package gcs implements Storage based on Google Cloud Storage bucket. +package gcs + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + + "google.golang.org/api/googleapi" + + "github.com/efarrer/iothrottler" + "github.com/kopia/repo/internal/retry" + "github.com/kopia/repo/internal/throttle" + "github.com/kopia/repo/storage" + "golang.org/x/oauth2" + "golang.org/x/oauth2/google" + "google.golang.org/api/iterator" + "google.golang.org/api/option" + + gcsclient "cloud.google.com/go/storage" +) + +const ( + gcsStorageType = "gcs" +) + +type gcsStorage struct { + Options + + ctx context.Context + storageClient *gcsclient.Client + bucket *gcsclient.BucketHandle + + downloadThrottler *iothrottler.IOThrottlerPool + uploadThrottler *iothrottler.IOThrottlerPool +} + +func (gcs *gcsStorage) GetBlock(ctx context.Context, b string, offset, length int64) ([]byte, error) { + if offset < 0 { + return nil, fmt.Errorf("invalid offset") + } + + attempt := func() (interface{}, error) { + reader, err := gcs.bucket.Object(gcs.getObjectNameString(b)).NewRangeReader(gcs.ctx, offset, length) + if err != nil { + return nil, err + } + defer reader.Close() //nolint:errcheck + + return ioutil.ReadAll(reader) + } + + v, err := exponentialBackoff(fmt.Sprintf("GetBlock(%q,%v,%v)", b, offset, length), attempt) + if err != nil { + return nil, translateError(err) + } + + fetched := v.([]byte) + if len(fetched) != int(length) && length >= 0 { + return nil, fmt.Errorf("invalid offset/length") + } + + return fetched, nil +} + +func exponentialBackoff(desc string, att retry.AttemptFunc) (interface{}, error) { + return retry.WithExponentialBackoff(desc, att, isRetriableError) +} + +func isRetriableError(err error) bool { + if apiError, ok := err.(*googleapi.Error); ok { + return apiError.Code >= 500 + } + + switch err { + case nil: + return false + case gcsclient.ErrObjectNotExist: + return false + case gcsclient.ErrBucketNotExist: + return false + default: + return true + } +} + +func translateError(err error) error { + switch err { + case nil: + return nil + case gcsclient.ErrObjectNotExist: + return storage.ErrBlockNotFound + case gcsclient.ErrBucketNotExist: + return storage.ErrBlockNotFound + default: + return fmt.Errorf("unexpected GCS error: %v", err) + } +} +func (gcs *gcsStorage) 
PutBlock(ctx context.Context, b string, data []byte) error { + ctx, cancel := context.WithCancel(ctx) + + obj := gcs.bucket.Object(gcs.getObjectNameString(b)) + writer := obj.NewWriter(ctx) + writer.ChunkSize = 1 << 20 + writer.ContentType = "application/x-kopia" + + progressCallback := storage.ProgressCallback(ctx) + + if progressCallback != nil { + progressCallback(b, 0, int64(len(data))) + defer progressCallback(b, int64(len(data)), int64(len(data))) + + writer.ProgressFunc = func(completed int64) { + if completed != int64(len(data)) { + progressCallback(b, completed, int64(len(data))) + } + } + } + + _, err := io.Copy(writer, bytes.NewReader(data)) + if err != nil { + // cancel context before closing the writer causes it to abandon the upload. + cancel() + writer.Close() //nolint:errcheck + return translateError(err) + } + defer cancel() + + // calling close before cancel() causes it to commit the upload. + return translateError(writer.Close()) +} + +func (gcs *gcsStorage) DeleteBlock(ctx context.Context, b string) error { + attempt := func() (interface{}, error) { + return nil, gcs.bucket.Object(gcs.getObjectNameString(b)).Delete(gcs.ctx) + } + + _, err := exponentialBackoff(fmt.Sprintf("DeleteBlock(%q)", b), attempt) + err = translateError(err) + if err == storage.ErrBlockNotFound { + return nil + } + + return err +} + +func (gcs *gcsStorage) getObjectNameString(blockID string) string { + return gcs.Prefix + blockID +} + +func (gcs *gcsStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + lst := gcs.bucket.Objects(gcs.ctx, &gcsclient.Query{ + Prefix: gcs.getObjectNameString(prefix), + }) + + oa, err := lst.Next() + for err == nil { + if err = callback(storage.BlockMetadata{ + BlockID: oa.Name[len(gcs.Prefix):], + Length: oa.Size, + Timestamp: oa.Created, + }); err != nil { + return err + } + oa, err = lst.Next() + } + + if err != iterator.Done { + return err + } + + return nil +} + +func (gcs *gcsStorage) ConnectionInfo() storage.ConnectionInfo { + return storage.ConnectionInfo{ + Type: gcsStorageType, + Config: &gcs.Options, + } +} + +func (gcs *gcsStorage) Close(ctx context.Context) error { + gcs.storageClient.Close() //nolint:errcheck + return nil +} + +func toBandwidth(bytesPerSecond int) iothrottler.Bandwidth { + if bytesPerSecond <= 0 { + return iothrottler.Unlimited + } + + return iothrottler.Bandwidth(bytesPerSecond) * iothrottler.BytesPerSecond +} + +func tokenSourceFromCredentialsFile(ctx context.Context, fn string, scopes ...string) (oauth2.TokenSource, error) { + data, err := ioutil.ReadFile(fn) + if err != nil { + return nil, err + } + + cfg, err := google.JWTConfigFromJSON(data, scopes...) + if err != nil { + return nil, fmt.Errorf("google.JWTConfigFromJSON: %v", err) + } + return cfg.TokenSource(ctx), nil +} + +// New creates new Google Cloud Storage-backed storage with specified options: +// +// - the 'BucketName' field is required and all other parameters are optional. +// +// By default the connection reuses credentials managed by (https://cloud.google.com/sdk/), +// but this can be disabled by setting IgnoreDefaultCredentials to true. 
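+//
+// A hypothetical read-only connection using an explicit service-account key
+// file (bucket name and path below are placeholders):
+//
+//	st, err := New(ctx, &Options{
+//		BucketName:                "my-kopia-bucket",
+//		ServiceAccountCredentials: "/path/to/key.json",
+//		ReadOnly:                  true,
+//	})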
+func New(ctx context.Context, opt *Options) (storage.Storage, error) { + var ts oauth2.TokenSource + var err error + + scope := gcsclient.ScopeReadWrite + if opt.ReadOnly { + scope = gcsclient.ScopeReadOnly + } + + if sa := opt.ServiceAccountCredentials; sa != "" { + ts, err = tokenSourceFromCredentialsFile(ctx, sa, scope) + } else { + ts, err = google.DefaultTokenSource(ctx, scope) + } + + if err != nil { + return nil, err + } + + downloadThrottler := iothrottler.NewIOThrottlerPool(toBandwidth(opt.MaxDownloadSpeedBytesPerSecond)) + uploadThrottler := iothrottler.NewIOThrottlerPool(toBandwidth(opt.MaxUploadSpeedBytesPerSecond)) + + hc := oauth2.NewClient(ctx, ts) + hc.Transport = throttle.NewRoundTripper(hc.Transport, downloadThrottler, uploadThrottler) + + cli, err := gcsclient.NewClient(ctx, option.WithHTTPClient(hc)) + if err != nil { + return nil, err + } + + if opt.BucketName == "" { + return nil, errors.New("bucket name must be specified") + } + + return &gcsStorage{ + Options: *opt, + ctx: ctx, + storageClient: cli, + bucket: cli.Bucket(opt.BucketName), + downloadThrottler: downloadThrottler, + uploadThrottler: uploadThrottler, + }, nil +} + +func init() { + storage.AddSupportedStorage( + gcsStorageType, + func() interface{} { + return &Options{} + }, + func(ctx context.Context, o interface{}) (storage.Storage, error) { + return New(ctx, o.(*Options)) + }) +} diff --git a/storage/gcs/gcs_storage_test.go b/storage/gcs/gcs_storage_test.go new file mode 100644 index 000000000..b76c40673 --- /dev/null +++ b/storage/gcs/gcs_storage_test.go @@ -0,0 +1,75 @@ +package gcs_test + +import ( + "context" + "os" + "testing" + + "github.com/kopia/repo/internal/storagetesting" + + "github.com/kopia/repo/storage" + "github.com/kopia/repo/storage/gcs" +) + +func TestGCSStorage(t *testing.T) { + bucket := os.Getenv("KOPIA_GCS_TEST_BUCKET") + if bucket == "" { + t.Skip("KOPIA_GCS_TEST_BUCKET not provided") + } + + credsFile := os.Getenv("KOPIA_GCS_CREDENTIALS_FILE") + if _, err := os.Stat(credsFile); err != nil { + t.Skip("skipping test because GCS credentials file can't be opened") + } + + ctx := context.Background() + st, err := gcs.New(ctx, &gcs.Options{ + BucketName: bucket, + ServiceAccountCredentials: credsFile, + }) + + if err != nil { + t.Fatalf("unable to connect to GCS: %v", err) + } + + if err := st.ListBlocks(ctx, "", func(bm storage.BlockMetadata) error { + return st.DeleteBlock(ctx, bm.BlockID) + }); err != nil { + t.Fatalf("unable to clear GCS bucket: %v", err) + } + + storagetesting.VerifyStorage(ctx, t, st) + storagetesting.AssertConnectionInfoRoundTrips(ctx, t, st) + + // delete everything again + if err := st.ListBlocks(ctx, "", func(bm storage.BlockMetadata) error { + return st.DeleteBlock(ctx, bm.BlockID) + }); err != nil { + t.Fatalf("unable to clear GCS bucket: %v", err) + } + if err := st.Close(ctx); err != nil { + t.Fatalf("err: %v", err) + } +} + +func TestGCSStorageInvalid(t *testing.T) { + bucket := os.Getenv("KOPIA_GCS_TEST_BUCKET") + if bucket == "" { + t.Skip("KOPIA_GCS_TEST_BUCKET not provided") + } + + ctx := context.Background() + st, err := gcs.New(ctx, &gcs.Options{ + BucketName: bucket + "-no-such-bucket", + ServiceAccountCredentials: os.Getenv("KOPIA_GCS_CREDENTIALS_FILE"), + }) + + if err != nil { + t.Fatalf("unable to connect to GCS: %v", err) + } + + defer st.Close(ctx) + if err := st.PutBlock(ctx, "xxx", []byte{1, 2, 3}); err == nil { + t.Errorf("unexpecte success when adding to non-existent bucket") + } +} diff --git a/storage/logging/logging_storage.go 
b/storage/logging/logging_storage.go new file mode 100644 index 000000000..79cd7818d --- /dev/null +++ b/storage/logging/logging_storage.go @@ -0,0 +1,96 @@ +// Package logging implements wrapper around Storage that logs all activity. +package logging + +import ( + "context" + "time" + + "github.com/kopia/repo/internal/repologging" + "github.com/kopia/repo/storage" +) + +var log = repologging.Logger("repo/storage") + +type loggingStorage struct { + base storage.Storage + printf func(string, ...interface{}) + prefix string +} + +func (s *loggingStorage) GetBlock(ctx context.Context, id string, offset, length int64) ([]byte, error) { + t0 := time.Now() + result, err := s.base.GetBlock(ctx, id, offset, length) + dt := time.Since(t0) + if len(result) < 20 { + s.printf(s.prefix+"GetBlock(%q,%v,%v)=(%#v, %#v) took %v", id, offset, length, result, err, dt) + } else { + s.printf(s.prefix+"GetBlock(%q,%v,%v)=({%#v bytes}, %#v) took %v", id, offset, length, len(result), err, dt) + } + return result, err +} + +func (s *loggingStorage) PutBlock(ctx context.Context, id string, data []byte) error { + t0 := time.Now() + err := s.base.PutBlock(ctx, id, data) + dt := time.Since(t0) + s.printf(s.prefix+"PutBlock(%q,len=%v)=%#v took %v", id, len(data), err, dt) + return err +} + +func (s *loggingStorage) DeleteBlock(ctx context.Context, id string) error { + t0 := time.Now() + err := s.base.DeleteBlock(ctx, id) + dt := time.Since(t0) + s.printf(s.prefix+"DeleteBlock(%q)=%#v took %v", id, err, dt) + return err +} + +func (s *loggingStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + t0 := time.Now() + cnt := 0 + err := s.base.ListBlocks(ctx, prefix, func(bi storage.BlockMetadata) error { + cnt++ + return callback(bi) + }) + s.printf(s.prefix+"ListBlocks(%q)=%v returned %v items and took %v", prefix, err, cnt, time.Since(t0)) + return err +} + +func (s *loggingStorage) Close(ctx context.Context) error { + t0 := time.Now() + err := s.base.Close(ctx) + dt := time.Since(t0) + s.printf(s.prefix+"Close()=%#v took %v", err, dt) + return err +} + +func (s *loggingStorage) ConnectionInfo() storage.ConnectionInfo { + return s.base.ConnectionInfo() +} + +// Option modifies the behavior of logging storage wrapper. +type Option func(s *loggingStorage) + +// NewWrapper returns a Storage wrapper that logs all storage commands. +func NewWrapper(wrapped storage.Storage, options ...Option) storage.Storage { + s := &loggingStorage{base: wrapped, printf: log.Debugf} + for _, o := range options { + o(s) + } + + return s +} + +// Output is a logging storage option that causes all output to be sent to a given function instead of log.Printf() +func Output(outputFunc func(fmt string, args ...interface{})) Option { + return func(s *loggingStorage) { + s.printf = outputFunc + } +} + +// Prefix specifies prefix to be prepended to all log output. 
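+//
+// Combined with Output, this allows routing annotated storage traces to a
+// custom printf-style function, e.g. (sketch only; myPrintf is an assumed name):
+//
+//	st = NewWrapper(st, Prefix("[STORAGE] "), Output(myPrintf))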
+func Prefix(prefix string) Option { + return func(s *loggingStorage) { + s.prefix = prefix + } +} diff --git a/storage/logging/logging_storage_test.go b/storage/logging/logging_storage_test.go new file mode 100644 index 000000000..83fd6e5dd --- /dev/null +++ b/storage/logging/logging_storage_test.go @@ -0,0 +1,39 @@ +package logging + +import ( + "context" + "strings" + "testing" + + "github.com/kopia/repo/internal/storagetesting" +) + +func TestLoggingStorage(t *testing.T) { + var outputCount int + myPrefix := "myprefix" + myOutput := func(msg string, args ...interface{}) { + if !strings.HasPrefix(msg, myPrefix) { + t.Errorf("unexpected prefix %v", msg) + } + outputCount++ + } + + data := map[string][]byte{} + underlying := storagetesting.NewMapStorage(data, nil, nil) + st := NewWrapper(underlying, Output(myOutput), Prefix(myPrefix)) + if st == nil { + t.Fatalf("unexpected result: %v", st) + } + + ctx := context.Background() + storagetesting.VerifyStorage(ctx, t, st) + if err := st.Close(ctx); err != nil { + t.Fatalf("err: %v", err) + } + if outputCount == 0 { + t.Errorf("did not write any output!") + } + if got, want := st.ConnectionInfo().Type, underlying.ConnectionInfo().Type; got != want { + t.Errorf("unexpected connection infor %v, want %v", got, want) + } +} diff --git a/storage/progress.go b/storage/progress.go new file mode 100644 index 000000000..fcf56d6f3 --- /dev/null +++ b/storage/progress.go @@ -0,0 +1,21 @@ +package storage + +import "context" + +type contextKey string + +var progressCallbackContextKey contextKey = "progress-callback" + +// ProgressFunc is used to report progress of a long-running storage operation. +type ProgressFunc func(desc string, completed, total int64) + +// WithUploadProgressCallback returns a context that passes callback function to be used storage upload progress. +func WithUploadProgressCallback(ctx context.Context, callback ProgressFunc) context.Context { + return context.WithValue(ctx, progressCallbackContextKey, callback) +} + +// ProgressCallback gets the progress callback function from the context. +func ProgressCallback(ctx context.Context) ProgressFunc { + pf, _ := ctx.Value(progressCallbackContextKey).(ProgressFunc) + return pf +} diff --git a/storage/providers/providers.go b/storage/providers/providers.go new file mode 100644 index 000000000..c06c01378 --- /dev/null +++ b/storage/providers/providers.go @@ -0,0 +1,8 @@ +// Package providers registers all storage providers that are included as part of Kopia. +package providers + +import ( + // Register well-known blob storage providers + _ "github.com/kopia/repo/storage/filesystem" + _ "github.com/kopia/repo/storage/gcs" +) diff --git a/storage/registry.go b/storage/registry.go new file mode 100644 index 000000000..776e42cc0 --- /dev/null +++ b/storage/registry.go @@ -0,0 +1,39 @@ +package storage + +import ( + "context" + "fmt" +) + +var ( + factories = map[string]*storageFactory{} +) + +// StorageFactory allows creation of repositories in a generic way. +type storageFactory struct { + defaultConfigFunc func() interface{} + createStorageFunc func(context.Context, interface{}) (Storage, error) +} + +// AddSupportedStorage registers factory function to create storage with a given type name. 
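+//
+// Providers typically register themselves from init(); a hypothetical provider
+// (all names below are illustrative) would look like:
+//
+//	func init() {
+//		AddSupportedStorage(
+//			"mytype",
+//			func() interface{} { return &myOptions{} },
+//			func(ctx context.Context, o interface{}) (Storage, error) {
+//				return newMyStorage(ctx, o.(*myOptions))
+//			})
+//	}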
+func AddSupportedStorage( + urlScheme string, + defaultConfigFunc func() interface{}, + createStorageFunc func(context.Context, interface{}) (Storage, error)) { + + f := &storageFactory{ + defaultConfigFunc: defaultConfigFunc, + createStorageFunc: createStorageFunc, + } + factories[urlScheme] = f +} + +// NewStorage creates new storage based on ConnectionInfo. +// The storage type must be previously registered using AddSupportedStorage. +func NewStorage(ctx context.Context, cfg ConnectionInfo) (Storage, error) { + if factory, ok := factories[cfg.Type]; ok { + return factory.createStorageFunc(ctx, cfg.Config) + } + + return nil, fmt.Errorf("unknown storage type: %s", cfg.Type) +} diff --git a/storage/s3/s3_options.go b/storage/s3/s3_options.go new file mode 100644 index 000000000..c2d675953 --- /dev/null +++ b/storage/s3/s3_options.go @@ -0,0 +1,20 @@ +package s3 + +// Options defines options for S3-based storage. +type Options struct { + // BucketName is the name of the bucket where data is stored. + BucketName string `json:"bucket"` + + // Prefix specifies additional string to prepend to all objects. + Prefix string `json:"prefix,omitempty"` + + Endpoint string `json:"endpoint"` + DoNotUseTLS bool `json:"doNotUseTLS,omitempyy"` + + AccessKeyID string `json:"accessKeyID"` + SecretAccessKey string `json:"secretAccessKey" kopia:"sensitive"` + + MaxUploadSpeedBytesPerSecond int `json:"maxUploadSpeedBytesPerSecond,omitempty"` + + MaxDownloadSpeedBytesPerSecond int `json:"maxDownloadSpeedBytesPerSecond,omitempty"` +} diff --git a/storage/s3/s3_storage.go b/storage/s3/s3_storage.go new file mode 100644 index 000000000..ae78a8788 --- /dev/null +++ b/storage/s3/s3_storage.go @@ -0,0 +1,244 @@ +// Package s3 implements Storage based on an S3 bucket. +package s3 + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + + "github.com/efarrer/iothrottler" + "github.com/kopia/repo/internal/retry" + "github.com/kopia/repo/storage" + "github.com/minio/minio-go" +) + +const ( + s3storageType = "s3" +) + +type s3Storage struct { + Options + + ctx context.Context + + cli *minio.Client + + downloadThrottler *iothrottler.IOThrottlerPool + uploadThrottler *iothrottler.IOThrottlerPool +} + +func (s *s3Storage) GetBlock(ctx context.Context, b string, offset, length int64) ([]byte, error) { + attempt := func() (interface{}, error) { + var opt minio.GetObjectOptions + if length > 0 { + if err := opt.SetRange(offset, offset+length-1); err != nil { + return nil, fmt.Errorf("unable to set range: %v", err) + } + } + + o, err := s.cli.GetObject(s.BucketName, s.getObjectNameString(b), opt) + if err != nil { + return 0, err + } + + defer o.Close() //nolint:errcheck + throttled, err := s.downloadThrottler.AddReader(o) + if err != nil { + return nil, err + } + + b, err := ioutil.ReadAll(throttled) + if err != nil { + return nil, err + } + + if len(b) != int(length) && length > 0 { + return nil, fmt.Errorf("invalid length, got %v bytes, but expected %v", len(b), length) + } + + if length == 0 { + return []byte{}, nil + } + + return b, nil + } + + v, err := exponentialBackoff(fmt.Sprintf("GetBlock(%q,%v,%v)", b, offset, length), attempt) + if err != nil { + return nil, translateError(err) + } + + return v.([]byte), nil +} + +func exponentialBackoff(desc string, att retry.AttemptFunc) (interface{}, error) { + return retry.WithExponentialBackoff(desc, att, isRetriableError) +} + +func isRetriableError(err error) bool { + if me, ok := err.(minio.ErrorResponse); ok { + // retry on server errors, not on client 
errors + return me.StatusCode >= 500 + } + + return false +} + +func translateError(err error) error { + if me, ok := err.(minio.ErrorResponse); ok { + if me.StatusCode == 200 { + return nil + } + if me.StatusCode == 404 { + return storage.ErrBlockNotFound + } + } + + return err +} + +func (s *s3Storage) PutBlock(ctx context.Context, b string, data []byte) error { + throttled, err := s.uploadThrottler.AddReader(ioutil.NopCloser(bytes.NewReader(data))) + if err != nil { + return err + } + + progressCallback := storage.ProgressCallback(ctx) + if progressCallback != nil { + progressCallback(b, 0, int64(len(data))) + defer progressCallback(b, int64(len(data)), int64(len(data))) + } + n, err := s.cli.PutObject(s.BucketName, s.getObjectNameString(b), throttled, -1, minio.PutObjectOptions{ + ContentType: "application/x-kopia", + Progress: newProgressReader(progressCallback, b, int64(len(data))), + }) + if err == io.EOF && n == 0 { + // special case empty stream + _, err = s.cli.PutObject(s.BucketName, s.getObjectNameString(b), bytes.NewBuffer(nil), 0, minio.PutObjectOptions{ + ContentType: "application/x-kopia", + }) + } + + return translateError(err) +} + +func (s *s3Storage) DeleteBlock(ctx context.Context, b string) error { + attempt := func() (interface{}, error) { + return nil, s.cli.RemoveObject(s.BucketName, s.getObjectNameString(b)) + } + + _, err := exponentialBackoff(fmt.Sprintf("DeleteBlock(%q)", b), attempt) + return translateError(err) +} + +func (s *s3Storage) getObjectNameString(b string) string { + return s.Prefix + b +} + +func (s *s3Storage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + oi := s.cli.ListObjects(s.BucketName, s.Prefix+prefix, false, ctx.Done()) + for o := range oi { + if err := o.Err; err != nil { + return err + } + + bm := storage.BlockMetadata{ + BlockID: o.Key[len(s.Prefix):], + Length: o.Size, + Timestamp: o.LastModified, + } + + if err := callback(bm); err != nil { + return err + } + } + + return nil +} + +func (s *s3Storage) ConnectionInfo() storage.ConnectionInfo { + return storage.ConnectionInfo{ + Type: s3storageType, + Config: &s.Options, + } +} + +func (s *s3Storage) Close(ctx context.Context) error { + return nil +} + +func (s *s3Storage) String() string { + return fmt.Sprintf("s3://%v/%v", s.BucketName, s.Prefix) +} + +type progressReader struct { + cb storage.ProgressFunc + blockID string + completed int64 + totalLength int64 + lastReported int64 +} + +func (r *progressReader) Read(b []byte) (int, error) { + r.completed += int64(len(b)) + if r.completed >= r.lastReported+1000000 && r.completed < r.totalLength { + r.cb(r.blockID, r.completed, r.totalLength) + r.lastReported = r.completed + } + return len(b), nil +} + +func newProgressReader(cb storage.ProgressFunc, blockID string, totalLength int64) io.Reader { + if cb == nil { + return nil + } + + return &progressReader{cb: cb, blockID: blockID, totalLength: totalLength} +} + +func toBandwidth(bytesPerSecond int) iothrottler.Bandwidth { + if bytesPerSecond <= 0 { + return iothrottler.Unlimited + } + + return iothrottler.Bandwidth(bytesPerSecond) * iothrottler.BytesPerSecond +} + +// New creates new S3-backed storage with specified options: +// +// - the 'BucketName' field is required and all other parameters are optional. 
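+//
+// A minimal connection sketch (the bucket name, endpoint and credentials below are placeholders, not working values):
+//
+//	st, err := s3.New(ctx, &s3.Options{
+//		BucketName:      "my-bucket",
+//		Endpoint:        "s3.example.com",
+//		AccessKeyID:     "AKIA...",
+//		SecretAccessKey: "secret",
+//	})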
+func New(ctx context.Context, opt *Options) (storage.Storage, error) { + if opt.BucketName == "" { + return nil, errors.New("bucket name must be specified") + } + + cli, err := minio.New(opt.Endpoint, opt.AccessKeyID, opt.SecretAccessKey, !opt.DoNotUseTLS) + if err != nil { + return nil, fmt.Errorf("unable to create client: %v", err) + } + + downloadThrottler := iothrottler.NewIOThrottlerPool(toBandwidth(opt.MaxDownloadSpeedBytesPerSecond)) + uploadThrottler := iothrottler.NewIOThrottlerPool(toBandwidth(opt.MaxUploadSpeedBytesPerSecond)) + + return &s3Storage{ + Options: *opt, + ctx: ctx, + cli: cli, + downloadThrottler: downloadThrottler, + uploadThrottler: uploadThrottler, + }, nil +} + +func init() { + storage.AddSupportedStorage( + s3storageType, + func() interface{} { + return &Options{} + }, + func(ctx context.Context, o interface{}) (storage.Storage, error) { + return New(ctx, o.(*Options)) + }) +} diff --git a/storage/s3/s3_storage_test.go b/storage/s3/s3_storage_test.go new file mode 100644 index 000000000..1c2c794ef --- /dev/null +++ b/storage/s3/s3_storage_test.go @@ -0,0 +1,116 @@ +package s3 + +import ( + "context" + "crypto/rand" + "crypto/sha1" + "fmt" + "log" + "net" + "os" + "testing" + "time" + + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" + "github.com/minio/minio-go" +) + +// https://github.com/minio/minio-go +const ( + endpoint = "play.minio.io:9000" + accessKeyID = "Q3AM3UQ867SPQQA43P2F" + secretAccessKey = "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG" + useSSL = true + + // the test takes a few seconds, delete stuff older than 1h to avoid accumulating cruft + cleanupAge = 1 * time.Hour +) + +var bucketName = getBucketName() + +func getBucketName() string { + hn, err := os.Hostname() + if err != nil { + return "kopia-test-1" + } + h := sha1.New() + fmt.Fprintf(h, "%v", hn) + return fmt.Sprintf("kopia-test-%x", h.Sum(nil)[0:8]) +} + +func endpointReachable() bool { + conn, err := net.DialTimeout("tcp4", endpoint, 5*time.Second) + if err == nil { + conn.Close() + return true + } + + return false +} + +func TestS3Storage(t *testing.T) { + if !endpointReachable() { + t.Skip("endpoint not reachable") + } + + ctx := context.Background() + + // recreate per-host bucket, which sometimes get cleaned up by play.minio.io + createBucket(t) + cleanupOldData(ctx, t) + + data := make([]byte, 8) + rand.Read(data) //nolint:errcheck + + st, err := New(context.Background(), &Options{ + AccessKeyID: accessKeyID, + SecretAccessKey: secretAccessKey, + Endpoint: endpoint, + BucketName: bucketName, + Prefix: fmt.Sprintf("test-%v-%x-", time.Now().Unix(), data), + }) + if err != nil { + t.Fatalf("err: %v", err) + } + + storagetesting.VerifyStorage(ctx, t, st) + storagetesting.AssertConnectionInfoRoundTrips(ctx, t, st) + if err := st.Close(ctx); err != nil { + t.Fatalf("err: %v", err) + } +} + +func createBucket(t *testing.T) { + minioClient, err := minio.New(endpoint, accessKeyID, secretAccessKey, useSSL) + if err != nil { + t.Fatalf("can't initialize minio client: %v", err) + } + // ignore error + _ = minioClient.MakeBucket(bucketName, "us-east-1") +} + +func cleanupOldData(ctx context.Context, t *testing.T) { + // cleanup old data from the bucket + st, err := New(context.Background(), &Options{ + AccessKeyID: accessKeyID, + SecretAccessKey: secretAccessKey, + Endpoint: endpoint, + BucketName: bucketName, + }) + if err != nil { + t.Fatalf("err: %v", err) + } + + _ = st.ListBlocks(ctx, "", func(it storage.BlockMetadata) error { + age := 
time.Since(it.Timestamp)
+		if age > cleanupAge {
+			if err := st.DeleteBlock(ctx, it.BlockID); err != nil {
+				t.Errorf("warning: unable to delete %q: %v", it.BlockID, err)
+			}
+		} else {
+			log.Printf("keeping %v", it.BlockID)
+		}
+		return nil
+	})
+}
diff --git a/storage/storage.go b/storage/storage.go
new file mode 100644
index 000000000..9934c6aed
--- /dev/null
+++ b/storage/storage.go
@@ -0,0 +1,108 @@
+package storage
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// CancelFunc requests cancellation of a storage operation.
+type CancelFunc func()
+
+// Storage encapsulates the API for connecting to blob storage.
+//
+// The underlying storage system must provide:
+//
+// * high durability, availability and bit-rot protection
+// * read-after-write - a block written using PutBlock() must be immediately readable using GetBlock() and ListBlocks()
+// * atomicity - it mustn't be possible to observe partial results of PutBlock() via either GetBlock() or ListBlocks()
+// * timestamps that don't go back in time (small clock skew of up to a few minutes is allowed)
+// * reasonably low latency for retrievals
+//
+// The required semantics are provided by existing commercial cloud storage products (Google Cloud, AWS, Azure).
+type Storage interface {
+	// PutBlock uploads a block with the given data to the repository, replacing any existing block
+	// with the same id.
+	PutBlock(ctx context.Context, id string, data []byte) error
+
+	// DeleteBlock removes the block from storage. Future GetBlock() operations will fail with ErrBlockNotFound.
+	DeleteBlock(ctx context.Context, id string) error
+
+	// GetBlock returns full or partial contents of a block with given ID.
+	// If length>0, the function retrieves a range of bytes [offset,offset+length).
+	// If length<0, the entire block must be fetched.
+	GetBlock(ctx context.Context, id string, offset, length int64) ([]byte, error)
+
+	// ListBlocks invokes the callback with metadata of each block whose ID starts with the given prefix.
+	// Iteration continues until all matching blocks have been listed or until the callback returns a non-nil error.
+	ListBlocks(ctx context.Context, prefix string, cb func(bm BlockMetadata) error) error
+
+	// ConnectionInfo returns JSON-serializable data structure containing information required to
+	// connect to storage.
+	ConnectionInfo() ConnectionInfo
+
+	// Close releases all resources associated with storage.
+	Close(ctx context.Context) error
+}
+
+// BlockMetadata represents metadata about a single block in storage.
+type BlockMetadata struct {
+	BlockID   string
+	Length    int64
+	Timestamp time.Time
+}
+
+// ErrBlockNotFound is returned when a block cannot be found in storage.
+var ErrBlockNotFound = errors.New("block not found")
+
+// ListAllBlocks returns BlockMetadata for all blocks in a given storage that have the provided name prefix.
+func ListAllBlocks(ctx context.Context, st Storage, prefix string) ([]BlockMetadata, error) {
+	var result []BlockMetadata
+
+	err := st.ListBlocks(ctx, prefix, func(bm BlockMetadata) error {
+		result = append(result, bm)
+		return nil
+	})
+
+	return result, err
+}
+
+// ListAllBlocksConsistent lists all blocks with given name prefix in the provided storage until the results are
+// consistent. The results are consistent if the list result fetched twice is identical. This guarantees that while
+// the first scan was in progress, no new block was added or removed. 
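+//
+// A minimal call sketch (the prefix and attempt count are illustrative only):
+//
+//	blocks, err := ListAllBlocksConsistent(ctx, st, "n", 3)
+//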
+// maxAttempts specifies maximum number of list attempts (must be >= 2) +func ListAllBlocksConsistent(ctx context.Context, st Storage, prefix string, maxAttempts int) ([]BlockMetadata, error) { + var previous []BlockMetadata + + for i := 0; i < maxAttempts; i++ { + result, err := ListAllBlocks(ctx, st, prefix) + if err != nil { + return nil, err + } + if i > 0 && sameBlocks(result, previous) { + return result, nil + } + + previous = result + } + + return nil, fmt.Errorf("unable to achieve consistent snapshot despite %v attempts", maxAttempts) +} + +// sameBlocks returns true if b1 & b2 contain the same blocks (ignoring order). +func sameBlocks(b1, b2 []BlockMetadata) bool { + if len(b1) != len(b2) { + return false + } + m := map[string]BlockMetadata{} + for _, b := range b1 { + m[b.BlockID] = b + } + for _, b := range b2 { + if m[b.BlockID] != b { + return false + } + } + return true +} diff --git a/storage/storage_test.go b/storage/storage_test.go new file mode 100644 index 000000000..8f41e4796 --- /dev/null +++ b/storage/storage_test.go @@ -0,0 +1,57 @@ +package storage_test + +import ( + "context" + "testing" + "time" + + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" +) + +func TestListAllBlocksConsistent(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + st := storagetesting.NewMapStorage(data, nil, time.Now) + st.PutBlock(ctx, "foo1", []byte{1, 2, 3}) //nolint:errcheck + st.PutBlock(ctx, "foo2", []byte{1, 2, 3}) //nolint:errcheck + st.PutBlock(ctx, "foo3", []byte{1, 2, 3}) //nolint:errcheck + + // set up faulty storage that will add a block while a scan is in progress. + f := &storagetesting.FaultyStorage{ + Base: st, + Faults: map[string][]*storagetesting.Fault{ + "ListBlocksItem": { + {ErrCallback: func() error { + st.PutBlock(ctx, "foo0", []byte{1, 2, 3}) //nolint:errcheck + return nil + }}, + }, + }, + } + + r, err := storage.ListAllBlocksConsistent(ctx, f, "foo", 3) + if err != nil { + t.Fatalf("error: %v", err) + } + + // make sure we get the list with 4 items, not 3. + if got, want := len(r), 4; got != want { + t.Errorf("unexpected list result count: %v, want %v", got, want) + } +} + +func TestListAllBlocksConsistentEmpty(t *testing.T) { + ctx := context.Background() + data := map[string][]byte{} + st := storagetesting.NewMapStorage(data, nil, time.Now) + + r, err := storage.ListAllBlocksConsistent(ctx, st, "foo", 3) + if err != nil { + t.Fatalf("error: %v", err) + } + + if got, want := len(r), 0; got != want { + t.Errorf("unexpected list result count: %v, want %v", got, want) + } +} diff --git a/storage/webdav/webdav_options.go b/storage/webdav/webdav_options.go new file mode 100644 index 000000000..79e93d544 --- /dev/null +++ b/storage/webdav/webdav_options.go @@ -0,0 +1,17 @@ +package webdav + +// Options defines options for Filesystem-backed storage. +type Options struct { + URL string `json:"url"` + DirectoryShards []int `json:"dirShards"` + Username string `json:"username,omitempty"` + Password string `json:"password,omitempty" kopia:"sensitive"` +} + +func (fso *Options) shards() []int { + if fso.DirectoryShards == nil { + return fsDefaultShards + } + + return fso.DirectoryShards +} diff --git a/storage/webdav/webdav_storage.go b/storage/webdav/webdav_storage.go new file mode 100644 index 000000000..21fd6ce94 --- /dev/null +++ b/storage/webdav/webdav_storage.go @@ -0,0 +1,210 @@ +// Package webdav implements WebDAV-based Storage. 
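+//
+// A typical connection sketch (the URL and credentials are placeholders):
+//
+//	st, err := webdav.New(ctx, &webdav.Options{
+//		URL:      "https://dav.example.com/repo",
+//		Username: "user",
+//		Password: "secret",
+//	})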
+package webdav + +import ( + "context" + "errors" + "fmt" + "math/rand" + "os" + "path/filepath" + "sort" + "strings" + + "github.com/kopia/repo/storage" + "github.com/studio-b12/gowebdav" +) + +const ( + davStorageType = "webdav" + fsStorageChunkSuffix = ".f" +) + +var ( + fsDefaultShards = []int{3, 3} +) + +// davStorage implements blob.Storage on top of remove WebDAV repository. +// It is very similar to File storage, except uses HTTP URLs instead of local files. +// Storage formats are compatible (both use sharded directory structure), so a repository +// may be accessed using WebDAV or File interchangeably. +type davStorage struct { + Options + + cli *gowebdav.Client +} + +func (d *davStorage) GetBlock(ctx context.Context, blockID string, offset, length int64) ([]byte, error) { + _, path := d.getDirPathAndFilePath(blockID) + + data, err := d.cli.Read(path) + if err != nil { + return nil, d.translateError(err) + } + if length < 0 { + return data, nil + } + + if int(offset) > len(data) || offset < 0 { + return nil, errors.New("invalid offset") + } + + data = data[offset:] + if int(length) > len(data) { + return nil, errors.New("invalid length") + } + + return data[0:length], nil +} + +func (d *davStorage) translateError(err error) error { + switch err := err.(type) { + case *os.PathError: + switch err.Err.Error() { + case "404": + return storage.ErrBlockNotFound + } + return err + default: + return err + } +} + +func getBlockIDFromFileName(name string) (string, bool) { + if strings.HasSuffix(name, fsStorageChunkSuffix) { + return name[0 : len(name)-len(fsStorageChunkSuffix)], true + } + + return "", false +} + +func makeFileName(blockID string) string { + return blockID + fsStorageChunkSuffix +} + +func (d *davStorage) ListBlocks(ctx context.Context, prefix string, callback func(storage.BlockMetadata) error) error { + var walkDir func(string, string) error + + walkDir = func(path string, currentPrefix string) error { + entries, err := d.cli.ReadDir(gowebdav.FixSlash(path)) + if err != nil { + return fmt.Errorf("read dir error on %v: %v", path, err) + } + + sort.Slice(entries, func(i, j int) bool { + return entries[i].Name() < entries[j].Name() + }) + + for _, e := range entries { + if e.IsDir() { + newPrefix := currentPrefix + e.Name() + var match bool + + if len(prefix) > len(newPrefix) { + // looking for 'abcd', got 'ab' so far, worth trying + match = strings.HasPrefix(prefix, newPrefix) + } else { + match = strings.HasPrefix(newPrefix, prefix) + } + + if match { + if err := walkDir(path+"/"+e.Name(), currentPrefix+e.Name()); err != nil { + return err + } + } + } else if fullID, ok := getBlockIDFromFileName(currentPrefix + e.Name()); ok { + if strings.HasPrefix(fullID, prefix) { + if err := callback(storage.BlockMetadata{ + BlockID: fullID, + Length: e.Size(), + Timestamp: e.ModTime(), + }); err != nil { + return err + } + } + } + } + + return nil + } + + return walkDir("", "") +} + +func (d *davStorage) PutBlock(ctx context.Context, blockID string, data []byte) error { + dirPath, filePath := d.getDirPathAndFilePath(blockID) + tmpPath := fmt.Sprintf("%v-%v", filePath, rand.Int63()) + if err := d.translateError(d.cli.Write(tmpPath, data, 0600)); err != nil { + if err != storage.ErrBlockNotFound { + return err + } + + d.cli.MkdirAll(dirPath, 0700) //nolint:errcheck + if err = d.translateError(d.cli.Write(tmpPath, data, 0600)); err != nil { + return err + } + } + + return d.translateError(d.cli.Rename(tmpPath, filePath, true)) +} + +func (d *davStorage) DeleteBlock(ctx context.Context, 
blockID string) error { + _, filePath := d.getDirPathAndFilePath(blockID) + return d.translateError(d.cli.Remove(filePath)) +} + +func (d *davStorage) getShardDirectory(blockID string) (string, string) { + shardPath := "/" + if len(blockID) < 20 { + return shardPath, blockID + } + for _, size := range d.shards() { + shardPath = filepath.Join(shardPath, blockID[0:size]) + blockID = blockID[size:] + } + + return shardPath, blockID +} + +func (d *davStorage) getDirPathAndFilePath(blockID string) (string, string) { + shardPath, blockID := d.getShardDirectory(blockID) + result := filepath.Join(shardPath, makeFileName(blockID)) + return shardPath, result +} + +func (d *davStorage) ConnectionInfo() storage.ConnectionInfo { + return storage.ConnectionInfo{ + Type: davStorageType, + Config: &d.Options, + } +} + +func (d *davStorage) Close(ctx context.Context) error { + return nil +} + +// New creates new WebDAV-backed storage in a specified URL. +func New(ctx context.Context, opts *Options) (storage.Storage, error) { + r := &davStorage{ + Options: *opts, + cli: gowebdav.NewClient(opts.URL, opts.Username, opts.Password), + } + + for _, s := range r.shards() { + if s == 0 { + return nil, fmt.Errorf("invalid shard spec: %v", opts.DirectoryShards) + } + } + + r.Options.URL = strings.TrimSuffix(r.Options.URL, "/") + return r, nil +} + +func init() { + storage.AddSupportedStorage( + davStorageType, + func() interface{} { return &Options{} }, + func(ctx context.Context, o interface{}) (storage.Storage, error) { + return New(ctx, o.(*Options)) + }) +} diff --git a/storage/webdav/webdav_storage_test.go b/storage/webdav/webdav_storage_test.go new file mode 100644 index 000000000..ddfb24a22 --- /dev/null +++ b/storage/webdav/webdav_storage_test.go @@ -0,0 +1,65 @@ +package webdav + +import ( + "context" + "fmt" + "io/ioutil" + "net/http" + "net/http/httptest" + "os" + "testing" + + "golang.org/x/net/webdav" + + "github.com/kopia/repo/internal/storagetesting" +) + +func TestWebDAVStorage(t *testing.T) { + tmpDir, _ := ioutil.TempDir("", "webdav") + defer os.RemoveAll(tmpDir) + + t.Logf("tmpDir: %v", tmpDir) + + mux := http.NewServeMux() + mux.Handle("/", &webdav.Handler{ + FileSystem: webdav.Dir(tmpDir), + LockSystem: webdav.NewMemLS(), + }) + + server := httptest.NewServer(mux) + defer server.Close() + + ctx := context.Background() + + // Test varioush shard configurations. 
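+	// Each spec gives the number of leading block-ID characters that form one directory level
+	// (IDs shorter than 20 characters are stored unsharded), so the same verification run is
+	// repeated against several directory layouts.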
+ for _, shardSpec := range [][]int{ + {1}, + {3, 3}, + {2}, + {1, 1}, + {1, 2}, + {2, 2, 2}, + } { + t.Run(fmt.Sprintf("shards-%v", shardSpec), func(t *testing.T) { + if err := os.RemoveAll(tmpDir); err != nil { + t.Errorf("can't remove all: %q", tmpDir) + } + os.MkdirAll(tmpDir, 0700) //nolint:errcheck + + r, err := New(context.Background(), &Options{ + URL: server.URL, + DirectoryShards: shardSpec, + }) + + if r == nil || err != nil { + t.Errorf("unexpected result: %v %v", r, err) + } + + storagetesting.VerifyStorage(ctx, t, r) + storagetesting.AssertConnectionInfoRoundTrips(ctx, t, r) + if err := r.Close(ctx); err != nil { + t.Fatalf("err: %v", err) + } + }) + } +} diff --git a/test_service_account.json.enc b/test_service_account.json.enc new file mode 100644 index 000000000..20cb866b6 Binary files /dev/null and b/test_service_account.json.enc differ diff --git a/tests/repository_stress_test/repository_stress.go b/tests/repository_stress_test/repository_stress.go new file mode 100644 index 000000000..ec4817f32 --- /dev/null +++ b/tests/repository_stress_test/repository_stress.go @@ -0,0 +1,3 @@ +package repositorystress + +// dummy package diff --git a/tests/repository_stress_test/repository_stress_test.go b/tests/repository_stress_test/repository_stress_test.go new file mode 100644 index 000000000..785686ae8 --- /dev/null +++ b/tests/repository_stress_test/repository_stress_test.go @@ -0,0 +1,319 @@ +package repositorystress_test + +import ( + "context" + "fmt" + "io/ioutil" + "log" + "math/rand" + "os" + "path/filepath" + "runtime" + "strings" + "sync" + "testing" + "time" + + "github.com/kopia/repo" + "github.com/kopia/repo/block" + "github.com/kopia/repo/storage" + "github.com/kopia/repo/storage/filesystem" +) + +const masterPassword = "foo-bar-baz-1234" + +var ( + knownBlocks []string + knownBlocksMutex sync.Mutex +) + +func TestStressRepository(t *testing.T) { + if testing.Short() { + t.Skip("skipping stress test during short tests") + } + ctx := block.UsingListCache(context.Background(), false) + + tmpPath, err := ioutil.TempDir("", "kopia") + if err != nil { + t.Fatalf("unable to create temp directory") + } + + defer func() { + if !t.Failed() { + os.RemoveAll(tmpPath) + } + }() + + t.Logf("path: %v", tmpPath) + + storagePath := filepath.Join(tmpPath, "storage") + configFile1 := filepath.Join(tmpPath, "kopia1.config") + configFile2 := filepath.Join(tmpPath, "kopia2.config") + + assertNoError(t, os.MkdirAll(storagePath, 0700)) + st, err := filesystem.New(ctx, &filesystem.Options{ + Path: storagePath, + }) + if err != nil { + t.Fatalf("unable to initialize storage: %v", err) + } + + // create repository + if err := repo.Initialize(ctx, st, &repo.NewRepositoryOptions{}, masterPassword); err != nil { + t.Fatalf("unable to initialize repository: %v", err) + } + + // set up two parallel kopia connections, each with its own config file and cache. 
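+	// (the worker goroutines started below are split evenly between the two configs, so two
+	// independent caches operate against the same underlying storage at the same time)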
+ if err := repo.Connect(ctx, configFile1, st, masterPassword, repo.ConnectOptions{ + CachingOptions: block.CachingOptions{ + CacheDirectory: filepath.Join(tmpPath, "cache1"), + MaxCacheSizeBytes: 2000000000, + }, + }); err != nil { + t.Fatalf("unable to connect 1: %v", err) + } + + if err := repo.Connect(ctx, configFile2, st, masterPassword, repo.ConnectOptions{ + CachingOptions: block.CachingOptions{ + CacheDirectory: filepath.Join(tmpPath, "cache2"), + MaxCacheSizeBytes: 2000000000, + }, + }); err != nil { + t.Fatalf("unable to connect 2: %v", err) + } + + cancel := make(chan struct{}) + + var wg sync.WaitGroup + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile1, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile1, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile1, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile1, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile2, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile2, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile2, &wg) + wg.Add(1) + go longLivedRepositoryTest(ctx, t, cancel, configFile2, &wg) + + time.Sleep(5 * time.Second) + close(cancel) + + wg.Wait() +} + +func longLivedRepositoryTest(ctx context.Context, t *testing.T, cancel chan struct{}, configFile string, wg *sync.WaitGroup) { + defer wg.Done() + + rep, err := repo.Open(ctx, configFile, masterPassword, &repo.Options{}) + if err != nil { + t.Errorf("error opening repository: %v", err) + return + } + defer rep.Close(ctx) + + var wg2 sync.WaitGroup + + for i := 0; i < 4; i++ { + wg2.Add(1) + go func() { + defer wg2.Done() + + repositoryTest(ctx, t, cancel, rep) + }() + } + + wg2.Wait() +} + +func repositoryTest(ctx context.Context, t *testing.T, cancel chan struct{}, rep *repo.Repository) { + // reopen := func(t *testing.T, r *repo.Repository) error { + // if err := rep.Close(ctx); err != nil { + // return fmt.Errorf("error closing: %v", err) + // } + + // t0 := time.Now() + // rep, err = repo.Open(ctx, configFile, &repo.Options{}) + // log.Printf("reopened in %v", time.Since(t0)) + // return err + // } + + workTypes := []*struct { + name string + fun func(ctx context.Context, t *testing.T, r *repo.Repository) error + weight int + hitCount int + }{ + //{"reopen", reopen, 1, 0}, + {"writeRandomBlock", writeRandomBlock, 100, 0}, + {"writeRandomManifest", writeRandomManifest, 100, 0}, + {"readKnownBlock", readKnownBlock, 500, 0}, + {"listBlocks", listBlocks, 50, 0}, + {"listAndReadAllBlocks", listAndReadAllBlocks, 5, 0}, + {"readRandomManifest", readRandomManifest, 50, 0}, + {"compact", compact, 1, 0}, + {"refresh", refresh, 3, 0}, + {"flush", flush, 1, 0}, + } + + var totalWeight int + for _, w := range workTypes { + totalWeight += w.weight + } + + iter := 0 + for { + select { + case <-cancel: + rep.Close(ctx) + return + default: + } + + if iter%1000 == 0 { + var bits []string + for _, w := range workTypes { + bits = append(bits, fmt.Sprintf("%v:%v", w.name, w.hitCount)) + } + log.Printf("#%v %v %v goroutines", iter, strings.Join(bits, " "), runtime.NumGoroutine()) + } + iter++ + + roulette := rand.Intn(totalWeight) + for _, w := range workTypes { + if roulette < w.weight { + w.hitCount++ + //log.Printf("running %v", w.name) + if err := w.fun(ctx, t, rep); err != nil { + w.hitCount++ + t.Errorf("error: %v", fmt.Errorf("error running %v: %v", w.name, err)) + return + } + break + } + + roulette -= w.weight + } + } + +} + +func 
writeRandomBlock(ctx context.Context, t *testing.T, r *repo.Repository) error { + data := make([]byte, 1000) + rand.Read(data) + blockID, err := r.Blocks.WriteBlock(ctx, data, "") + if err == nil { + knownBlocksMutex.Lock() + if len(knownBlocks) >= 1000 { + n := rand.Intn(len(knownBlocks)) + knownBlocks[n] = blockID + } else { + knownBlocks = append(knownBlocks, blockID) + } + knownBlocksMutex.Unlock() + } + return err +} + +func readKnownBlock(ctx context.Context, t *testing.T, r *repo.Repository) error { + knownBlocksMutex.Lock() + if len(knownBlocks) == 0 { + knownBlocksMutex.Unlock() + return nil + } + blockID := knownBlocks[rand.Intn(len(knownBlocks))] + knownBlocksMutex.Unlock() + + _, err := r.Blocks.GetBlock(ctx, blockID) + if err == nil || err == storage.ErrBlockNotFound { + return nil + } + + return err +} + +func listBlocks(ctx context.Context, t *testing.T, r *repo.Repository) error { + _, err := r.Blocks.ListBlocks("") + return err +} + +func listAndReadAllBlocks(ctx context.Context, t *testing.T, r *repo.Repository) error { + blocks, err := r.Blocks.ListBlocks("") + if err != nil { + return err + } + + for _, bi := range blocks { + _, err := r.Blocks.GetBlock(ctx, bi) + if err != nil { + if err == storage.ErrBlockNotFound && strings.HasPrefix(bi, "m") { + // this is ok, sometimes manifest manager will perform compaction and 'm' blocks will be marked as deleted + continue + } + return fmt.Errorf("error reading block %v: %v", bi, err) + } + } + + return nil +} + +func compact(ctx context.Context, t *testing.T, r *repo.Repository) error { + return r.Blocks.CompactIndexes(ctx, block.CompactOptions{ + MinSmallBlocks: 1, + MaxSmallBlocks: 1, + }) +} + +func flush(ctx context.Context, t *testing.T, r *repo.Repository) error { + return r.Flush(ctx) +} + +func refresh(ctx context.Context, t *testing.T, r *repo.Repository) error { + return r.Refresh(ctx) +} + +func readRandomManifest(ctx context.Context, t *testing.T, r *repo.Repository) error { + manifests, err := r.Manifests.Find(ctx, nil) + if err != nil { + return err + } + if len(manifests) == 0 { + return nil + } + n := rand.Intn(len(manifests)) + _, err = r.Manifests.GetRaw(ctx, manifests[n].ID) + return err +} + +func writeRandomManifest(ctx context.Context, t *testing.T, r *repo.Repository) error { + key1 := fmt.Sprintf("key-%v", rand.Intn(10)) + key2 := fmt.Sprintf("key-%v", rand.Intn(10)) + val1 := fmt.Sprintf("val1-%v", rand.Intn(10)) + val2 := fmt.Sprintf("val2-%v", rand.Intn(10)) + content1 := fmt.Sprintf("content-%v", rand.Intn(10)) + content2 := fmt.Sprintf("content-%v", rand.Intn(10)) + content1val := fmt.Sprintf("val1-%v", rand.Intn(10)) + content2val := fmt.Sprintf("val2-%v", rand.Intn(10)) + _, err := r.Manifests.Put(ctx, map[string]string{ + "type": key1, + key1: val1, + key2: val2, + }, map[string]string{ + content1: content1val, + content2: content2val, + }) + return err +} + +func assertNoError(t *testing.T, err error) { + t.Helper() + if err != nil { + t.Errorf("err: %v", err) + } +} diff --git a/tests/stress_test/stress.go b/tests/stress_test/stress.go new file mode 100644 index 000000000..1d01396c9 --- /dev/null +++ b/tests/stress_test/stress.go @@ -0,0 +1,3 @@ +package stress + +// dummy package diff --git a/tests/stress_test/stress_test.go b/tests/stress_test/stress_test.go new file mode 100644 index 000000000..0b8a1b399 --- /dev/null +++ b/tests/stress_test/stress_test.go @@ -0,0 +1,132 @@ +package stress_test + +import ( + "context" + "fmt" + "math/rand" + "os" + "reflect" + "testing" + "time" + + 
"github.com/kopia/repo/block" + "github.com/kopia/repo/internal/storagetesting" + "github.com/kopia/repo/storage" +) + +const goroutineCount = 16 + +func TestStressBlockManager(t *testing.T) { + if testing.Short() { + t.Skip("skipping stress test during short tests") + } + + data := map[string][]byte{} + keyTimes := map[string]time.Time{} + memst := storagetesting.NewMapStorage(data, keyTimes, time.Now) + + var duration = 3 * time.Second + if os.Getenv("KOPIA_LONG_STRESS_TEST") != "" { + duration = 3 * time.Minute + } + + stressTestWithStorage(t, memst, duration) +} + +func stressTestWithStorage(t *testing.T, st storage.Storage, duration time.Duration) { + ctx := context.Background() + + openMgr := func() (*block.Manager, error) { + return block.NewManager(ctx, st, block.FormattingOptions{ + Version: 1, + Hash: "HMAC-SHA256-128", + Encryption: "AES-256-CTR", + MaxPackSize: 20000000, + MasterKey: []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + }, block.CachingOptions{}, nil) + } + + seed0 := time.Now().Nanosecond() + + t.Logf("running with seed %v", seed0) + + deadline := time.Now().Add(duration) + + t.Run("workers", func(t *testing.T) { + for i := 0; i < goroutineCount; i++ { + i := i + t.Run(fmt.Sprintf("worker-%v", i), func(t *testing.T) { + t.Parallel() + stressWorker(ctx, t, deadline, i, openMgr, int64(seed0+i)) + }) + } + }) +} + +func stressWorker(ctx context.Context, t *testing.T, deadline time.Time, workerID int, openMgr func() (*block.Manager, error), seed int64) { + src := rand.NewSource(seed) + rand := rand.New(src) + + bm, err := openMgr() + if err != nil { + t.Fatalf("error opening manager: %v", err) + } + + type writtenBlock struct { + contentID string + data []byte + } + + var workerBlocks []writtenBlock + + for time.Now().Before(deadline) { + l := rand.Intn(30000) + data := make([]byte, l) + if _, err := rand.Read(data); err != nil { + t.Errorf("err: %v", err) + return + } + dataCopy := append([]byte{}, data...) + contentID, err := bm.WriteBlock(ctx, data, "") + if err != nil { + t.Errorf("err: %v", err) + return + } + + switch rand.Intn(20) { + case 0: + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + return + } + case 1: + if err := bm.Flush(ctx); err != nil { + t.Errorf("flush error: %v", err) + return + } + bm, err = openMgr() + if err != nil { + t.Errorf("error opening: %v", err) + return + } + } + + //log.Printf("wrote %v", contentID) + workerBlocks = append(workerBlocks, writtenBlock{contentID, dataCopy}) + if len(workerBlocks) > 5 { + pos := rand.Intn(len(workerBlocks)) + previous := workerBlocks[pos] + //log.Printf("reading %v", previous.contentID) + d2, err := bm.GetBlock(ctx, previous.contentID) + if err != nil { + t.Errorf("error verifying block %q: %v", previous.contentID, err) + return + } + if !reflect.DeepEqual(previous.data, d2) { + t.Errorf("invalid previous data for %q %x %x", previous.contentID, d2, previous.data) + return + } + workerBlocks = append(workerBlocks[0:pos], workerBlocks[pos+1:]...) + } + } +} diff --git a/upgrade.go b/upgrade.go new file mode 100644 index 000000000..0c2487292 --- /dev/null +++ b/upgrade.go @@ -0,0 +1,35 @@ +package repo + +import ( + "context" + "fmt" + + "github.com/pkg/errors" +) + +// Upgrade upgrades repository data structures to the latest version. 
+func (r *Repository) Upgrade(ctx context.Context) error {
	f := r.formatBlock

	log.Debug("decrypting format...")
	repoConfig, err := f.decryptFormatBytes(r.masterKey)
	if err != nil {
		return errors.Wrap(err, "unable to decrypt repository config")
	}

	var migrated bool

	// TODO(jkowalski): add migration code here
	if !migrated {
		log.Infof("nothing to do")
		return nil
	}

	log.Debug("encrypting format...")
	if err := encryptFormatBytes(f, repoConfig, r.masterKey, f.UniqueID); err != nil {
		return fmt.Errorf("unable to encrypt format bytes: %v", err)
	}

	log.Infof("writing updated format block...")
	return writeFormatBlock(ctx, r.Storage, f)
}