Files
syncthing/lib/fs/folding_test.go
bt90 05cc6b0f43 chore(fs): speed up case normalization (#10013)
### Purpose

Resurrecting https://github.com/syncthing/syncthing/pull/9365

### Testing

Current benchmark results:

```
goos: linux
goarch: amd64
pkg: github.com/syncthing/syncthing/lib/fs
cpu: AMD EPYC 7763 64-Core Processor                
                                           │  ../old.txt  │             ../new.txt              │
                                           │    sec/op    │   sec/op     vs base                │
UnicodeLowercase/ASCII_lowercase-4            43.68n ± 4%   12.19n ± 1%  -72.09% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_start-4     200.65n ± 2%   59.49n ± 3%  -70.35% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_end-4        95.50n ± 2%   59.10n ± 2%  -38.12% (p=0.000 n=10)
UnicodeLowercase/Latin1_lowercase-4           122.5n ± 1%   131.4n ± 1%   +7.27% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_start-4     339.9n ± 2%   309.2n ± 1%   -9.05% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_end-4       183.6n ± 2%   174.3n ± 1%   -5.04% (p=0.000 n=10)
UnicodeLowercase/Unicode_lowercase-4          456.6n ± 1%   440.5n ± 1%   -3.53% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_start-4    625.9n ± 1%   595.6n ± 1%   -4.83% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_end-4      516.2n ± 1%   495.5n ± 1%   -4.02% (p=0.000 n=10)
geomean                                       214.1n        150.4n       -29.72%

                                           │  ../old.txt  │             ../new.txt              │
                                           │     B/op     │    B/op     vs base                 │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     24.00 ± 0%     24.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       24.00 ± 0%     24.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    32.00 ± 0%     32.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      32.00 ± 0%     32.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   48.00 ± 0%     48.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     48.00 ± 0%     48.00 ± 0%       ~ (p=1.000 n=10) ¹
geomean                                                 ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                                           │  ../old.txt  │             ../new.txt              │
                                           │  allocs/op   │ allocs/op   vs base                 │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                                                 ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean
```

I think the `+7%` regression for the lowercase Latin1 test case is easily
outweighed by the ASCII and Unicode improvements 🙂
2025-04-01 13:41:57 +02:00

83 lines
2.8 KiB
Go

// Copyright (C) 2017 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package fs
import (
"testing"
)
// caseCases is the table driving TestUnicodeLowercaseNormalized. Each entry
// is an {input, expected} pair: element 0 is handed to
// UnicodeLowercaseNormalized and element 1 is the exact string the test
// requires back.
var caseCases = [][2]string{
// Empty and plain ASCII inputs.
{"", ""},
{"hej", "hej"},
{"HeJ!@#", "hej!@#"},
// Western Europe diacritical stuff is trivial.
{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
// As are ligatures.
{"Æglefinus", "æglefinus"},
// NOTE(review): as shown here this input is the two letters "IJ", not the
// single ligature code point U+0132 — confirm which one is intended.
{"IJssel", "ijssel"},
// Cyrillic seems regular as well.
{"Привет", "привет"},
// Greek has multiple lower case characters for things depending on
// context; we should always choose the same one.
{"Ὀδυσσεύς", "ὀδυσσεύσ"},
{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
// German ß doesn't really have an upper case variant, and we
// shouldn't mess things up when lower casing it either. We don't
// attempt to make ß equivalent to "ss".
{"Reichwaldstraße", "reichwaldstraße"},
// The Turks do their thing with the Is.... Like the Greek example
// we pick just the one canonicalized "i" although you can argue
// with this... From what I understand most operating systems don't
// get this right anyway.
{"İI", "ii"},
// Arabic doesn't do case folding.
{"العَرَبِيَّة", "العَرَبِيَّة"},
// Neither does Hebrew.
{"עברית", "עברית"},
// Nor Chinese, in any variant.
{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
// Nor katakana, as far as I can tell.
{"チャーハン", "チャーハン"},
// Some special Unicode characters, however, are folded by OSes.
// U+212A is the Kelvin sign; the expected result is a plain ASCII "k".
{"\u212A", "k"},
// Folding renormalizes to NFC
// The inputs are a base letter followed by "\xCC\x88", the UTF-8 encoding
// of the combining diaeresis U+0308; the expected output "\xC3\xA4" is the
// UTF-8 encoding of the single precomposed code point U+00E4.
{"A\xCC\x88", "\xC3\xA4"}, // ä
{"a\xCC\x88", "\xC3\xA4"}, // ä
}
// benchmarkCases is the table driving BenchmarkUnicodeLowercase. Each entry
// is an {input, name} pair: element 0 is the file name passed to
// UnicodeLowercaseNormalized and element 1 is the sub-benchmark name given
// to b.Run. Note that this is the reverse of the {input, expected} layout
// used by caseCases.
//
// Every group of three uses the same file name in all-lowercase form, with
// upper case letters at the start, and with an upper case extension at the
// end.
var benchmarkCases = [][2]string{
// ASCII-only file names.
{"img_202401241010.jpg", "ASCII lowercase"},
{"IMG_202401241010.jpg", "ASCII mixedcase start"},
{"img_202401241010.JPG", "ASCII mixedcase end"},
// File names containing a single non-ASCII letter (ü).
{"wir_kinder_aus_bullerbü.epub", "Latin1 lowercase"},
{"Wir_Kinder_aus_Bullerbü.epub", "Latin1 mixedcase start"},
{"wir_kinder_aus_bullerbü.EPUB", "Latin1 mixedcase end"},
// File names mixing ASCII with Japanese text.
{"translated_ウェブの国際化.html", "Unicode lowercase"},
{"Translated_ウェブの国際化.html", "Unicode mixedcase start"},
{"translated_ウェブの国際化.HTML", "Unicode mixedcase end"},
}
// TestUnicodeLowercaseNormalized runs every {input, expected} pair in
// caseCases through UnicodeLowercaseNormalized and reports each mismatch.
// Failures are reported with t.Errorf, so one bad entry does not stop the
// remaining entries from being checked.
func TestUnicodeLowercaseNormalized(t *testing.T) {
	for i := range caseCases {
		input, want := caseCases[i][0], caseCases[i][1]
		if got := UnicodeLowercaseNormalized(input); got != want {
			t.Errorf("UnicodeLowercaseNormalized(%q) => %q, expected %q", input, got, want)
		}
	}
}
// BenchmarkUnicodeLowercase times UnicodeLowercaseNormalized on each file
// name in benchmarkCases. Every entry becomes its own sub-benchmark, named
// after the entry's second element, with allocation reporting enabled.
func BenchmarkUnicodeLowercase(b *testing.B) {
	for idx := range benchmarkCases {
		input, label := benchmarkCases[idx][0], benchmarkCases[idx][1]
		b.Run(label, func(sub *testing.B) {
			sub.ReportAllocs()
			for n := 0; n < sub.N; n++ {
				UnicodeLowercaseNormalized(input)
			}
		})
	}
}