We've had weak/rolling hashing in the code for quite a while. It was a popular request at the time, based on the belief that rsync does this and we should too. However, the benefit is quite small: we save on average about 0.8% of transferred blocks over the population as a whole.

<img width="974" alt="Screenshot 2025-03-28 at 17 09 02" src="https://github.com/user-attachments/assets/bbe10dea-f85e-4043-9823-7cef1220b4a2" />

This would be fine if the cost were comparably low. However, the downside of attempting rolling-hash matching is that we (by default) do a complete read of the existing file on the destination to look for matches before we start pulling blocks for the file. For any larger file this means a sometimes long, I/O-intensive pause before the file starts syncing, usually for no benefit. I propose we simply rip off the bandaid and save the effort.
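For context, the prepass being removed amounts to something like the sketch below: read the whole destination file and compute a rolling checksum at every byte offset, looking for blocks that have merely moved. This is only an illustration of the cost, not syncthing's actual lib/weakhash implementation; the Adler-32-style checksum and the `rollingSum`/`findCandidates` names are assumptions made for the example.

```go
// Illustrative only: a rolling-checksum scan over the whole destination file,
// in the spirit of the weak-hash prepass this change removes. Not the real
// lib/weakhash code; names and the checksum are assumptions.
package weakscan

import (
	"bufio"
	"io"
	"os"
)

// rollingSum is a toy Adler-32-style checksum over a window of fixed size.
type rollingSum struct {
	a, b uint32
	size uint32
}

func (s *rollingSum) init(window []byte) {
	s.a, s.b = 0, 0
	s.size = uint32(len(window))
	for _, c := range window {
		s.a += uint32(c)
		s.b += s.a
	}
}

// roll slides the window one byte forward: drop `out`, add `in`.
func (s *rollingSum) roll(out, in byte) {
	s.a += uint32(in) - uint32(out)
	s.b += s.a - s.size*uint32(out)
}

func (s *rollingSum) sum() uint32 { return s.a&0xffff | s.b<<16 }

// findCandidates reads the entire file sequentially and records every byte
// offset whose rolling checksum matches one of the wanted block checksums.
// This full read, before a single block is pulled, is the I/O cost at issue.
func findCandidates(path string, blockSize int, wanted map[uint32]bool) ([]int64, error) {
	fd, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer fd.Close()

	br := bufio.NewReader(fd)
	window := make([]byte, blockSize)
	if _, err := io.ReadFull(br, window); err != nil {
		return nil, err // shorter than one block; nothing to match
	}

	var s rollingSum
	s.init(window)

	var offsets []int64
	var off int64
	pos := 0 // index of the oldest byte in the circular window
	for {
		if wanted[s.sum()] {
			offsets = append(offsets, off)
		}
		in, err := br.ReadByte()
		if err != nil {
			break // EOF: the whole file has been scanned
		}
		s.roll(window[pos], in)
		window[pos] = in
		pos = (pos + 1) % blockSize
		off++
	}
	return offsets, nil
}
```

Even with a cheap per-byte update, this touches every byte of the existing file before the first block is requested, which is the pause described above; dropping the prepass means we go straight to hashing and pulling blocks.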
// Copyright (C) 2014 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.

package scanner

import (
	"bytes"
	"context"
	"crypto/sha256"
	"fmt"
	mrand "math/rand"
	"testing"
)

var blocksTestData = []struct {
	data      []byte
	blocksize int
	hash      []string
}{
	{
		[]byte(""), 1024,
		[]string{
			"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
		},
	},
	{
		[]byte("contents"), 1024,
		[]string{
			"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8",
		},
	},
	{
		[]byte("contents"), 9,
		[]string{
			"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8",
		},
	},
	{
		[]byte("contents"), 8,
		[]string{
			"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8",
		},
	},
	{
		[]byte("contents"), 7,
		[]string{
			"ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73",
			"043a718774c572bd8a25adbeb1bfcd5c0256ae11cecf9f9c3f925d0e52beaf89",
		},
	},
	{
		[]byte("contents"), 3,
		[]string{
			"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
			"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
			"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4",
		},
	},
	{
		[]byte("conconts"), 3,
		[]string{
			"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
			"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
			"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4",
		},
	},
	{
		[]byte("contenten"), 3,
		[]string{
			"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
			"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
			"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
		},
	},
}

func TestBlocks(t *testing.T) {
	for testNo, test := range blocksTestData {
		buf := bytes.NewBuffer(test.data)
		blocks, err := Blocks(context.TODO(), buf, test.blocksize, -1, nil)
		if err != nil {
			t.Fatal(err)
		}

		if l := len(blocks); l != len(test.hash) {
			t.Fatalf("%d: Incorrect number of blocks %d != %d", testNo, l, len(test.hash))
		} else {
			i := 0
			for off := int64(0); off < int64(len(test.data)); off += int64(test.blocksize) {
				if blocks[i].Offset != off {
					t.Errorf("%d/%d: Incorrect offset %d != %d", testNo, i, blocks[i].Offset, off)
				}

				bs := test.blocksize
				if rem := len(test.data) - int(off); bs > rem {
					bs = rem
				}
				if int(blocks[i].Size) != bs {
					t.Errorf("%d/%d: Incorrect length %d != %d", testNo, i, blocks[i].Size, bs)
				}
				if h := fmt.Sprintf("%x", blocks[i].Hash); h != test.hash[i] {
					t.Errorf("%d/%d: Incorrect block hash %q != %q", testNo, i, h, test.hash[i])
				}

				i++
			}
		}
	}
}

func BenchmarkValidate(b *testing.B) {
	type block struct {
		data []byte
		hash [sha256.Size]byte
	}
	var blocks []block
	const blocksPerType = 100

	r := mrand.New(mrand.NewSource(0x136bea689e851))

	// Valid blocks.
	for i := 0; i < blocksPerType; i++ {
		var b block
		b.data = make([]byte, 128<<10)
		r.Read(b.data)
		b.hash = sha256.Sum256(b.data)
		blocks = append(blocks, b)
	}

	b.ReportAllocs()
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		for _, b := range blocks {
			Validate(b.data, b.hash[:])
		}
	}
}