Files
kopia/repo/object/object_manager.go
Jarek Kowalski 792cc874dc repo: allow reusing of object writer buffers (#1315)
This reduces memory consumption and speeds up backups.

1. Backing up kopia repository (3.5 GB files:133102 dirs:20074):

before: 25s, 490 MB
after: 21s, 445 MB

2. Large files (14.8 GB, 76 files)

before: 30s, 597 MB
after: 28s, 495 MB

All tests repeated 5 times for clean local filesystem repo.
2021-09-25 14:54:31 -07:00

212 lines
6.4 KiB
Go

// Package object implements repository support for content-addressable objects of arbitrary size.
package object
import (
"context"
"io"
"sync"
"github.com/pkg/errors"
"github.com/kopia/kopia/internal/gather"
"github.com/kopia/kopia/repo/compression"
"github.com/kopia/kopia/repo/content"
"github.com/kopia/kopia/repo/splitter"
)
// ErrObjectNotFound is returned when an object cannot be found.
//
// It is a package-level sentinel value created once with errors.New, so
// callers can compare a returned error's cause against it instead of matching
// on the message text.
var ErrObjectNotFound = errors.New("object not found")
// Reader allows reading, seeking, getting the length of and closing of a repository object.
//
// It is the combination of io.Reader, io.Seeker and io.Closer plus a Length method.
type Reader interface {
io.Reader
io.Seeker
io.Closer
// Length returns the length of the object as an int64.
// NOTE(review): presumably the total uncompressed size in bytes - confirm against the implementations.
Length() int64
}
// contentReader is the read-only view of the content layer needed by this package;
// appendIndexEntriesForObject passes it to loadSeekTable and Open.
type contentReader interface {
// ContentInfo returns the content.Info for the given content ID, or an error.
ContentInfo(ctx context.Context, contentID content.ID) (content.Info, error)
// GetContent returns the bytes of the content with the given ID, or an error.
GetContent(ctx context.Context, contentID content.ID) ([]byte, error)
}
// contentManager extends contentReader with write support. It is the type of
// the content layer dependency held by Manager (the contentMgr field).
type contentManager interface {
contentReader
// SupportsContentCompression returns a boolean.
// NOTE(review): presumably reports whether the underlying content layer can compress contents itself - confirm.
SupportsContentCompression() bool
// WriteContent writes data using the given content ID prefix and compression header ID
// and returns the ID of the written content, or an error.
WriteContent(ctx context.Context, data gather.Bytes, prefix content.ID, comp compression.HeaderID) (content.ID, error)
}
// Format describes the format of objects in a repository.
//
// It is serialized to JSON; Splitter is omitted from the output when empty.
// NewObjectManager treats an empty Splitter as "FIXED".
type Format struct {
Splitter string `json:"splitter,omitempty"` // splitter used to break objects into pieces of content
}
// Manager implements a content-addressable storage on top of blob storage.
//
// Instances are created with NewObjectManager, which populates the unexported
// fields below; NewWriter relies on both newSplitter and writerPool being set.
type Manager struct {
// Format is the object format this manager was created with.
Format Format
// contentMgr is the content layer used to read and write the pieces of objects.
contentMgr contentManager
// newSplitter is invoked by NewWriter to obtain a splitter for each writer.
newSplitter splitter.Factory
// writerPool holds *objectWriter instances: NewWriter takes writers from it
// and closedWriter puts them back for reuse.
writerPool sync.Pool
}
// NewWriter creates an ObjectWriter for writing to the repository.
//
// The writer is taken from the manager's pool and fully re-initialized, so no
// state from a previous use leaks into the new write. The Manager must have
// been created by NewObjectManager, which installs the pool's constructor and
// the splitter factory used here.
//
// When opt.AsyncWrites is positive, the writer gets a semaphore channel with
// that capacity; a pooled writer's existing channel is kept only if it is
// empty and already has the requested capacity. Otherwise the semaphore is
// cleared.
func (om *Manager) NewWriter(ctx context.Context, opt WriterOptions) Writer {
	ow, _ := om.writerPool.Get().(*objectWriter)

	// Per-call configuration.
	ow.om = om
	ow.ctx = ctx
	ow.prefix = opt.Prefix
	ow.description = opt.Description
	ow.compressor = compression.ByName[opt.Compressor]
	ow.splitter = om.newSplitter()

	// Reset state that may be left over from a previous use of a pooled writer.
	ow.currentPosition = 0
	ow.totalLength = 0
	ow.contentWriteError = nil
	ow.buffer.Reset()

	// Point the slice at the embedded array, so that we avoid allocations most of the time.
	ow.indirectIndex = ow.indirectIndexBuf[:0]

	switch {
	case opt.AsyncWrites <= 0:
		ow.asyncWritesSemaphore = nil

	case len(ow.asyncWritesSemaphore) != 0 || cap(ow.asyncWritesSemaphore) != opt.AsyncWrites:
		// The pooled semaphore is either non-empty or of the wrong capacity.
		ow.asyncWritesSemaphore = make(chan struct{}, opt.AsyncWrites)
	}

	return ow
}
// closedWriter returns ow to the manager's writer pool so that a later
// NewWriter call can reuse it. The caller must not use ow after this call,
// since NewWriter re-initializes pooled writers.
func (om *Manager) closedWriter(ow *objectWriter) {
om.writerPool.Put(ow)
}
// Concatenate creates an object that is the result of concatenating other objects. This is more efficient than
// reading and rewriting the objects, because Concatenate merges index entries without reading the underlying
// contents.
//
// This function exists primarily to facilitate efficient parallel uploads of very large files (>1GB). Because
// splitting is inherently sequential, each Writer can only use one CPU core, which limits throughput.
//
// For example, when uploading a 100 GB file it is beneficial to independently upload sections [0..25GB),
// [25..50GB), [50GB..75GB) and [75GB..100GB) and concatenate them together, as this allows four splitters to run
// in parallel utilizing more CPU cores. Because some split points now start at fixed boundaries rather than
// content-specific ones, there is a slight loss of deduplication at concatenation points (typically 1-2 contents,
// usually <10MB), so this method should only be used for very large files where this overhead is relatively small.
//
// An empty objectIDs list is an error. A single-element list returns that element unchanged without writing
// anything. Otherwise a new index is written and the ID of the resulting indirect object is returned.
func (om *Manager) Concatenate(ctx context.Context, objectIDs []ID) (ID, error) {
	switch len(objectIDs) {
	case 0:
		return "", errors.Errorf("empty list of objects")

	case 1:
		// Nothing to merge.
		return objectIDs[0], nil
	}

	var (
		entries []indirectObjectEntry
		total   int64
	)

	// Accumulate index entries of all inputs, shifting each object's entries
	// by the combined length of the objects that precede it.
	for _, oid := range objectIDs {
		var err error

		entries, total, err = appendIndexEntriesForObject(ctx, om.contentMgr, entries, total, oid)
		if err != nil {
			return "", errors.Wrapf(err, "error appending %v", oid)
		}
	}

	log(ctx).Debugf("concatenated: %v total: %v", entries, total)

	indexWriter := om.NewWriter(ctx, WriterOptions{
		Description: "CONCATENATED INDEX",
		Prefix:      indirectContentPrefix,
	})
	defer indexWriter.Close() // nolint:errcheck

	if err := writeIndirectObject(indexWriter, entries); err != nil {
		return "", err
	}

	indexID, err := indexWriter.Result()
	if err != nil {
		return "", errors.Wrap(err, "error writing concatenated index")
	}

	return IndirectObjectID(indexID), nil
}
// appendIndexEntriesForObject appends the index entries describing objectID to indexEntries, with offsets
// shifted by startingLength, and returns the extended slice together with the new running total length.
//
// If objectID has an index object, the entries from its seek table are appended. Otherwise the object is
// opened to learn its length and a single entry covering the whole object is appended.
//
// On error the returned slice is nil and the returned length is 0.
func appendIndexEntriesForObject(ctx context.Context, cr contentReader, indexEntries []indirectObjectEntry, startingLength int64, objectID ID) (result []indirectObjectEntry, totalLength int64, _ error) {
	indexObjectID, hasIndex := objectID.IndexObjectID()
	if !hasIndex {
		// non-index object - the precise length of the object cannot be determined from content due to
		// compression and padding, so we must open the object to read its length.
		r, err := Open(ctx, cr, objectID)
		if err != nil {
			return nil, 0, errors.Wrapf(err, "error opening %v", objectID)
		}

		defer r.Close() //nolint:errcheck

		whole := indirectObjectEntry{
			Start:  0,
			Length: r.Length(),
			Object: objectID,
		}

		result, totalLength = appendIndexEntries(indexEntries, startingLength, whole)

		return result, totalLength, nil
	}

	seekTable, err := loadSeekTable(ctx, cr, indexObjectID)
	if err != nil {
		return nil, 0, errors.Wrapf(err, "error reading index of %v", objectID)
	}

	result, totalLength = appendIndexEntries(indexEntries, startingLength, seekTable...)

	return result, totalLength, nil
}
// appendIndexEntries appends a copy of each incoming entry to indexEntries, with its Start offset shifted by
// startingLength. It returns the extended slice and startingLength plus the sum of the incoming Lengths.
//
// Every entry is shifted by the same startingLength, so the Start values of incoming are expected to be
// relative to the beginning of the object they describe.
func appendIndexEntries(indexEntries []indirectObjectEntry, startingLength int64, incoming ...indirectObjectEntry) (result []indirectObjectEntry, totalLength int64) {
	result, totalLength = indexEntries, startingLength

	for i := range incoming {
		src := incoming[i]

		result = append(result, indirectObjectEntry{
			Object: src.Object,
			Length: src.Length,
			Start:  startingLength + src.Start,
		})

		totalLength += src.Length
	}

	return result, totalLength
}
// NewObjectManager creates an ObjectManager with the specified content manager and format.
//
// The splitter named by f.Splitter is looked up via splitter.GetFactory; an empty name selects "FIXED".
// An error is returned if no such splitter is registered. The returned Manager uses a pooled variant
// of the splitter factory and a pool of reusable object writers.
//
// ctx is accepted for interface stability and is not used by the visible code.
func NewObjectManager(ctx context.Context, bm contentManager, f Format) (*Manager, error) {
	splitterName := f.Splitter
	if splitterName == "" {
		splitterName = "FIXED"
	}

	factory := splitter.GetFactory(splitterName)
	if factory == nil {
		// Report the name exactly as it was configured.
		return nil, errors.Errorf("unsupported splitter %q", f.Splitter)
	}

	return &Manager{
		Format:      f,
		contentMgr:  bm,
		newSplitter: splitter.Pooled(factory),
		writerPool: sync.Pool{
			New: func() interface{} {
				return new(objectWriter)
			},
		},
	}, nil
}