mirror of
https://github.com/kopia/kopia.git
synced 2026-02-22 02:06:55 -05:00
* Robustness engine actions with stats and logging - Add actions to robustness engine - Actions wrap other functional behavior and serve as a common interface for collecting stats - Add stats for the engine, both per run and cumulative over time - Add a log for actions that the engine has executed - Add recovery logic to re-sync snapshot metadata after a possible failed engine run (e.g. if metadata wasn't properly persisted). Current built-in actions: - snapshot root directory - restore random snapshot ID into a target restore path - delete a random snapshot ID - run GC - write random files to the local data directory - delete a random subdirectory under the local data directory - delete files in a directory - restore a snapshot ID into the local data directory Actions are executed according to a set of options, which dictate the relative probabilities of picking a given action, along with ranges for action-specific parameters that can be randomized.
490 lines
14 KiB
Go
490 lines
14 KiB
Go
// +build darwin,amd64 linux,amd64
|
|
|
|
package engine
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"log"
|
|
"math/rand"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/kopia/kopia/tests/tools/fio"
|
|
)
|
|
|
|
// Errors associated with action-picking.
|
|
var (
|
|
ErrNoActionPicked = errors.New("unable to pick an action with the action control options provided")
|
|
ErrInvalidOption = errors.New("invalid option setting")
|
|
)
|
|
|
|
// ExecAction executes the action denoted by the provided ActionKey.
|
|
func (e *Engine) ExecAction(actionKey ActionKey, opts map[string]string) (map[string]string, error) {
|
|
if opts == nil {
|
|
opts = make(map[string]string)
|
|
}
|
|
|
|
e.RunStats.ActionCounter++
|
|
e.CumulativeStats.ActionCounter++
|
|
log.Printf("Engine executing ACTION: name=%q actionCount=%v totActCount=%v t=%vs (%vs)", actionKey, e.RunStats.ActionCounter, e.CumulativeStats.ActionCounter, e.RunStats.getLifetimeSeconds(), e.getRuntimeSeconds())
|
|
|
|
action := actions[actionKey]
|
|
st := time.Now()
|
|
|
|
logEntry := &LogEntry{
|
|
StartTime: st,
|
|
EngineTimestamp: e.getTimestampS(),
|
|
Action: actionKey,
|
|
ActionOpts: opts,
|
|
}
|
|
|
|
// Execute the action n times
|
|
err := ErrNoOp // Default to no-op error
|
|
|
|
// TODO: return more than the last output
|
|
var out map[string]string
|
|
|
|
n := getOptAsIntOrDefault(ActionRepeaterField, opts, defaultActionRepeats)
|
|
for i := 0; i < n; i++ {
|
|
out, err = action.f(e, opts, logEntry)
|
|
if err != nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
// If error was just a no-op, don't bother logging the action
|
|
switch {
|
|
case errors.Is(err, ErrNoOp):
|
|
e.RunStats.NoOpCount++
|
|
e.CumulativeStats.NoOpCount++
|
|
|
|
return out, err
|
|
|
|
case err != nil:
|
|
log.Printf("error=%q", err.Error())
|
|
}
|
|
|
|
if e.RunStats.PerActionStats != nil && e.RunStats.PerActionStats[actionKey] == nil {
|
|
e.RunStats.PerActionStats[actionKey] = new(ActionStats)
|
|
}
|
|
|
|
if e.CumulativeStats.PerActionStats != nil && e.CumulativeStats.PerActionStats[actionKey] == nil {
|
|
e.CumulativeStats.PerActionStats[actionKey] = new(ActionStats)
|
|
}
|
|
|
|
e.RunStats.PerActionStats[actionKey].Record(st, err)
|
|
e.CumulativeStats.PerActionStats[actionKey].Record(st, err)
|
|
|
|
e.EngineLog.AddCompleted(logEntry, err)
|
|
|
|
return out, err
|
|
}
|
|
|
|
// RandomAction executes a random action picked by the relative weights given
|
|
// in actionOpts[ActionControlActionKey], or uniform probability if that
|
|
// key is not present in the input options.
|
|
func (e *Engine) RandomAction(actionOpts ActionOpts) error {
|
|
actionControlOpts := actionOpts.getActionControlOpts()
|
|
|
|
actionName := pickActionWeighted(actionControlOpts, actions)
|
|
if string(actionName) == "" {
|
|
return ErrNoActionPicked
|
|
}
|
|
|
|
_, err := e.ExecAction(actionName, actionOpts[actionName])
|
|
err = e.checkErrRecovery(err, actionOpts)
|
|
|
|
return err
|
|
}
|
|
|
|
func (e *Engine) checkErrRecovery(incomingErr error, actionOpts ActionOpts) (outgoingErr error) {
|
|
outgoingErr = incomingErr
|
|
|
|
if incomingErr == nil {
|
|
return nil
|
|
}
|
|
|
|
ctrl := actionOpts.getActionControlOpts()
|
|
|
|
if errIsNotEnoughSpace(incomingErr) && ctrl[ThrowNoSpaceOnDeviceErrField] == "" {
|
|
// no space left on device
|
|
// Delete everything in the data directory
|
|
const hundredPcnt = 100
|
|
|
|
deleteDirActionKey := DeleteDirectoryContentsActionKey
|
|
deleteRootOpts := map[string]string{
|
|
MaxDirDepthField: strconv.Itoa(0),
|
|
DeletePercentOfContentsField: strconv.Itoa(hundredPcnt),
|
|
}
|
|
|
|
_, outgoingErr = e.ExecAction(deleteDirActionKey, deleteRootOpts)
|
|
if outgoingErr != nil {
|
|
return outgoingErr
|
|
}
|
|
|
|
e.RunStats.DataPurgeCount++
|
|
e.CumulativeStats.DataPurgeCount++
|
|
|
|
// Restore a previoius snapshot to the data directory
|
|
restoreActionKey := RestoreIntoDataDirectoryActionKey
|
|
_, outgoingErr = e.ExecAction(restoreActionKey, actionOpts[restoreActionKey])
|
|
|
|
if errors.Is(outgoingErr, ErrNoOp) {
|
|
outgoingErr = nil
|
|
} else {
|
|
e.RunStats.DataRestoreCount++
|
|
e.CumulativeStats.DataRestoreCount++
|
|
}
|
|
}
|
|
|
|
if outgoingErr == nil {
|
|
e.RunStats.ErrorRecoveryCount++
|
|
e.CumulativeStats.ErrorRecoveryCount++
|
|
}
|
|
|
|
return outgoingErr
|
|
}
|
|
|
|
// List of action keys.
|
|
const (
|
|
ActionControlActionKey ActionKey = "action-control"
|
|
SnapshotRootDirActionKey ActionKey = "snapshot-root"
|
|
RestoreSnapshotActionKey ActionKey = "restore-random-snapID"
|
|
DeleteRandomSnapshotActionKey ActionKey = "delete-random-snapID"
|
|
WriteRandomFilesActionKey ActionKey = "write-random-files"
|
|
DeleteRandomSubdirectoryActionKey ActionKey = "delete-random-subdirectory"
|
|
DeleteDirectoryContentsActionKey ActionKey = "delete-files"
|
|
RestoreIntoDataDirectoryActionKey ActionKey = "restore-into-data-dir"
|
|
GCActionKey ActionKey = "run-gc"
|
|
)
|
|
|
|
// ActionOpts is a structure that designates the options for
|
|
// picking and running an action.
|
|
type ActionOpts map[ActionKey]map[string]string
|
|
|
|
func (actionOpts ActionOpts) getActionControlOpts() map[string]string {
|
|
actionControlOpts := defaultActionControls()
|
|
if actionOpts != nil && actionOpts[ActionControlActionKey] != nil {
|
|
actionControlOpts = actionOpts[ActionControlActionKey]
|
|
}
|
|
|
|
return actionControlOpts
|
|
}
|
|
|
|
// Action is a unit of functionality that can be executed by
|
|
// the engine.
|
|
type Action struct {
|
|
f func(eng *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error)
|
|
}
|
|
|
|
// ActionKey refers to an action that can be executed by the engine.
|
|
type ActionKey string
|
|
|
|
var actions = map[ActionKey]Action{
|
|
SnapshotRootDirActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
log.Printf("Creating snapshot of root directory %s", e.FileWriter.LocalDataDir)
|
|
|
|
ctx := context.TODO()
|
|
snapID, err := e.Checker.TakeSnapshot(ctx, e.FileWriter.LocalDataDir)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{
|
|
"snap-dir": e.FileWriter.LocalDataDir,
|
|
"snapID": snapID,
|
|
})
|
|
|
|
return map[string]string{
|
|
SnapshotIDField: snapID,
|
|
}, err
|
|
},
|
|
},
|
|
RestoreSnapshotActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
snapID, err := e.getSnapIDOptOrRandLive(opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{"snapID": snapID})
|
|
|
|
log.Printf("Restoring snapshot %s", snapID)
|
|
|
|
ctx := context.Background()
|
|
b := &bytes.Buffer{}
|
|
|
|
err = e.Checker.RestoreSnapshot(ctx, snapID, b)
|
|
if err != nil {
|
|
log.Print(b.String())
|
|
}
|
|
|
|
return nil, err
|
|
},
|
|
},
|
|
DeleteRandomSnapshotActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
snapID, err := e.getSnapIDOptOrRandLive(opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
log.Printf("Deleting snapshot %s", snapID)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{"snapID": snapID})
|
|
|
|
ctx := context.Background()
|
|
err = e.Checker.DeleteSnapshot(ctx, snapID)
|
|
return nil, err
|
|
},
|
|
},
|
|
GCActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
return nil, e.TestRepo.RunGC()
|
|
},
|
|
},
|
|
WriteRandomFilesActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
// Directory depth
|
|
maxDirDepth := getOptAsIntOrDefault(MaxDirDepthField, opts, defaultMaxDirDepth)
|
|
dirDepth := rand.Intn(maxDirDepth + 1) //nolint:gosec
|
|
|
|
// File size range
|
|
maxFileSizeB := getOptAsIntOrDefault(MaxFileSizeField, opts, defaultMaxFileSize)
|
|
minFileSizeB := getOptAsIntOrDefault(MinFileSizeField, opts, defaultMinFileSize)
|
|
|
|
// Number of files to write
|
|
maxNumFiles := getOptAsIntOrDefault(MaxNumFilesPerWriteField, opts, defaultMaxNumFilesPerWrite)
|
|
minNumFiles := getOptAsIntOrDefault(MinNumFilesPerWriteField, opts, defaultMinNumFilesPerWrite)
|
|
|
|
numFiles := rand.Intn(maxNumFiles-minNumFiles+1) + minNumFiles //nolint:gosec
|
|
|
|
// Dedup Percentage
|
|
maxDedupPcnt := getOptAsIntOrDefault(MaxDedupePercentField, opts, defaultMaxDedupePercent)
|
|
minDedupPcnt := getOptAsIntOrDefault(MinDedupePercentField, opts, defaultMinDedupePercent)
|
|
|
|
dedupStep := getOptAsIntOrDefault(DedupePercentStepField, opts, defaultDedupePercentStep)
|
|
|
|
dedupPcnt := dedupStep * (rand.Intn(maxDedupPcnt/dedupStep-minDedupPcnt/dedupStep+1) + minDedupPcnt/dedupStep) //nolint:gosec
|
|
|
|
blockSize := int64(defaultMinFileSize)
|
|
|
|
fioOpts := fio.Options{}.
|
|
WithFileSizeRange(int64(minFileSizeB), int64(maxFileSizeB)).
|
|
WithNumFiles(numFiles).
|
|
WithBlockSize(blockSize).
|
|
WithDedupePercentage(dedupPcnt).
|
|
WithNoFallocate()
|
|
|
|
ioLimit := getOptAsIntOrDefault(IOLimitPerWriteAction, opts, defaultIOLimitPerWriteAction)
|
|
|
|
if ioLimit > 0 {
|
|
freeSpaceLimitB := getOptAsIntOrDefault(FreeSpaceLimitField, opts, defaultFreeSpaceLimit)
|
|
|
|
freeSpaceB, err := getFreeSpaceB(e.FileWriter.LocalDataDir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
log.Printf("Free Space %v B, limit %v B, ioLimit %v B\n", freeSpaceB, freeSpaceLimitB, ioLimit)
|
|
|
|
if int(freeSpaceB)-ioLimit < freeSpaceLimitB {
|
|
ioLimit = int(freeSpaceB) - freeSpaceLimitB
|
|
log.Printf("Cutting down I/O limit for space %v", ioLimit)
|
|
if ioLimit <= 0 {
|
|
return nil, ErrCannotPerformIO
|
|
}
|
|
}
|
|
|
|
fioOpts = fioOpts.WithIOLimit(int64(ioLimit))
|
|
}
|
|
|
|
relBasePath := "."
|
|
|
|
log.Printf("Writing files at depth %v (fileSize: %v-%v, numFiles: %v, blockSize: %v, dedupPcnt: %v, ioLimit: %v)\n", dirDepth, minFileSizeB, maxFileSizeB, numFiles, blockSize, dedupPcnt, ioLimit)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{
|
|
"dirDepth": strconv.Itoa(dirDepth),
|
|
"relBasePath": relBasePath,
|
|
})
|
|
|
|
for k, v := range fioOpts {
|
|
l.CmdOpts[k] = v
|
|
}
|
|
|
|
return nil, e.FileWriter.WriteFilesAtDepthRandomBranch(relBasePath, dirDepth, fioOpts)
|
|
},
|
|
},
|
|
DeleteRandomSubdirectoryActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
maxDirDepth := getOptAsIntOrDefault(MaxDirDepthField, opts, defaultMaxDirDepth)
|
|
if maxDirDepth <= 0 {
|
|
return nil, ErrInvalidOption
|
|
}
|
|
dirDepth := rand.Intn(maxDirDepth) + 1 //nolint:gosec
|
|
|
|
log.Printf("Deleting directory at depth %v\n", dirDepth)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{"dirDepth": strconv.Itoa(dirDepth)})
|
|
|
|
err = e.FileWriter.DeleteDirAtDepth("", dirDepth)
|
|
if errors.Is(err, fio.ErrNoDirFound) {
|
|
log.Print(err)
|
|
return nil, ErrNoOp
|
|
}
|
|
|
|
return nil, err
|
|
},
|
|
},
|
|
DeleteDirectoryContentsActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
maxDirDepth := getOptAsIntOrDefault(MaxDirDepthField, opts, defaultMaxDirDepth)
|
|
dirDepth := rand.Intn(maxDirDepth + 1) //nolint:gosec
|
|
|
|
pcnt := getOptAsIntOrDefault(DeletePercentOfContentsField, opts, defaultDeletePercentOfContents)
|
|
|
|
log.Printf("Deleting %d%% of directory contents at depth %v\n", pcnt, dirDepth)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{
|
|
"dirDepth": strconv.Itoa(dirDepth),
|
|
"percent": strconv.Itoa(pcnt),
|
|
})
|
|
|
|
const pcntConv = 100
|
|
err = e.FileWriter.DeleteContentsAtDepth("", dirDepth, float32(pcnt)/pcntConv)
|
|
if errors.Is(err, fio.ErrNoDirFound) {
|
|
log.Print(err)
|
|
return nil, ErrNoOp
|
|
}
|
|
|
|
return nil, err
|
|
},
|
|
},
|
|
RestoreIntoDataDirectoryActionKey: {
|
|
f: func(e *Engine, opts map[string]string, l *LogEntry) (out map[string]string, err error) {
|
|
snapID, err := e.getSnapIDOptOrRandLive(opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
log.Printf("Restoring snap ID %v into data directory\n", snapID)
|
|
|
|
setLogEntryCmdOpts(l, map[string]string{"snapID": snapID})
|
|
|
|
b := &bytes.Buffer{}
|
|
err = e.Checker.RestoreSnapshotToPath(context.Background(), snapID, e.FileWriter.LocalDataDir, b)
|
|
if err != nil {
|
|
log.Print(b.String())
|
|
return nil, err
|
|
}
|
|
|
|
return nil, nil
|
|
},
|
|
},
|
|
}
|
|
|
|
// Action constants.
|
|
const (
|
|
defaultMaxDirDepth = 20
|
|
defaultMaxFileSize = 1 * 1024 * 1024 * 1024 // 1GB
|
|
defaultMinFileSize = 4096
|
|
defaultMaxNumFilesPerWrite = 10000
|
|
defaultMinNumFilesPerWrite = 1
|
|
defaultIOLimitPerWriteAction = 0 // A zero value does not impose any limit on IO
|
|
defaultFreeSpaceLimit = 100 * 1024 * 1024 // 100 MB
|
|
defaultMaxDedupePercent = 100
|
|
defaultMinDedupePercent = 0
|
|
defaultDedupePercentStep = 25
|
|
defaultDeletePercentOfContents = 20
|
|
defaultActionRepeats = 1
|
|
)
|
|
|
|
// Option field names.
|
|
const (
|
|
MaxDirDepthField = "max-dir-depth"
|
|
MaxFileSizeField = "max-file-size"
|
|
MinFileSizeField = "min-file-size"
|
|
MaxNumFilesPerWriteField = "max-num-files-per-write"
|
|
MinNumFilesPerWriteField = "min-num-files-per-write"
|
|
IOLimitPerWriteAction = "io-limit-per-write"
|
|
FreeSpaceLimitField = "free-space-limit"
|
|
MaxDedupePercentField = "max-dedupe-percent"
|
|
MinDedupePercentField = "min-dedupe-percent"
|
|
DedupePercentStepField = "dedupe-percent"
|
|
ActionRepeaterField = "repeat-action"
|
|
ThrowNoSpaceOnDeviceErrField = "throw-no-space-error"
|
|
DeletePercentOfContentsField = "delete-contents-percent"
|
|
SnapshotIDField = "snapshot-ID"
|
|
)
|
|
|
|
func getOptAsIntOrDefault(key string, opts map[string]string, def int) int {
|
|
if opts == nil {
|
|
return def
|
|
}
|
|
|
|
if opts[key] == "" {
|
|
return def
|
|
}
|
|
|
|
retInt, err := strconv.Atoi(opts[key])
|
|
if err != nil {
|
|
return def
|
|
}
|
|
|
|
return retInt
|
|
}
|
|
|
|
func defaultActionControls() map[string]string {
|
|
ret := make(map[string]string, len(actions))
|
|
|
|
for actionKey := range actions {
|
|
switch actionKey {
|
|
case RestoreIntoDataDirectoryActionKey:
|
|
// Don't restore into data directory by default
|
|
ret[string(actionKey)] = strconv.Itoa(0)
|
|
default:
|
|
ret[string(actionKey)] = strconv.Itoa(1)
|
|
}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
func pickActionWeighted(actionControlOpts map[string]string, actionList map[ActionKey]Action) ActionKey {
|
|
var keepKey ActionKey
|
|
|
|
sum := 0
|
|
|
|
for actionName := range actionList {
|
|
weight := getOptAsIntOrDefault(string(actionName), actionControlOpts, 0)
|
|
if weight == 0 {
|
|
continue
|
|
}
|
|
|
|
sum += weight
|
|
if rand.Intn(sum) < weight { //nolint:gosec
|
|
keepKey = actionName
|
|
}
|
|
}
|
|
|
|
return keepKey
|
|
}
|
|
|
|
func errIsNotEnoughSpace(err error) bool {
|
|
return errors.Is(err, ErrCannotPerformIO) || strings.Contains(err.Error(), noSpaceOnDeviceMatchStr)
|
|
}
|
|
|
|
func (e *Engine) getSnapIDOptOrRandLive(opts map[string]string) (snapID string, err error) {
|
|
snapID = opts[SnapshotIDField]
|
|
if snapID != "" {
|
|
return snapID, nil
|
|
}
|
|
|
|
snapIDList := e.Checker.GetLiveSnapIDs()
|
|
if len(snapIDList) == 0 {
|
|
return "", ErrNoOp
|
|
}
|
|
|
|
return snapIDList[rand.Intn(len(snapIDList))], nil //nolint:gosec
|
|
}
|