mirror of
https://github.com/syncthing/syncthing.git
synced 2026-05-14 01:56:31 -04:00
### Purpose

Resurrecting https://github.com/syncthing/syncthing/pull/9365

### Testing

Current benchmark results:

```
goos: linux
goarch: amd64
pkg: github.com/syncthing/syncthing/lib/fs
cpu: AMD EPYC 7763 64-Core Processor
                                           │ ../old.txt   │             ../new.txt              │
                                           │    sec/op    │    sec/op     vs base               │
UnicodeLowercase/ASCII_lowercase-4            43.68n ± 4%    12.19n ± 1%  -72.09% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_start-4     200.65n ± 2%    59.49n ± 3%  -70.35% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_end-4        95.50n ± 2%    59.10n ± 2%  -38.12% (p=0.000 n=10)
UnicodeLowercase/Latin1_lowercase-4           122.5n ± 1%    131.4n ± 1%   +7.27% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_start-4     339.9n ± 2%    309.2n ± 1%   -9.05% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_end-4       183.6n ± 2%    174.3n ± 1%   -5.04% (p=0.000 n=10)
UnicodeLowercase/Unicode_lowercase-4          456.6n ± 1%    440.5n ± 1%   -3.53% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_start-4    625.9n ± 1%    595.6n ± 1%   -4.83% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_end-4      516.2n ± 1%    495.5n ± 1%   -4.02% (p=0.000 n=10)
geomean                                       214.1n         150.4n       -29.72%

                                           │ ../old.txt  │            ../new.txt             │
                                           │    B/op     │    B/op     vs base               │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     24.00 ± 0%    24.00 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       24.00 ± 0%    24.00 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    32.00 ± 0%    32.00 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      32.00 ± 0%    32.00 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   48.00 ± 0%    48.00 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     48.00 ± 0%    48.00 ± 0%  ~ (p=1.000 n=10) ¹
geomean                                                ²               +0.00%             ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                                           │ ../old.txt  │            ../new.txt             │
                                           │  allocs/op  │  allocs/op  vs base               │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%    0.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     1.000 ± 0%    1.000 ± 0%  ~ (p=1.000 n=10) ¹
geomean                                                ²               +0.00%             ²
¹ all samples are equal
² summaries must be >0 to compute geomean
```

I think the `+7%` for the lowercase Latin1 testcase is easily outweighed by the ASCII and unicode improvements 🙂
83 lines
2.8 KiB
Go
83 lines
2.8 KiB
Go
// Copyright (C) 2017 The Syncthing Authors.
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
// You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
|
|
package fs
|
|
|
|
import (
|
|
"testing"
|
|
)
|
|
|
|
// caseCases is the table driving TestUnicodeLowercaseNormalized. Each entry
// holds the input string at index 0 and the result expected from
// UnicodeLowercaseNormalized at index 1.
var caseCases = [][2]string{
	{"", ""},
	{"hej", "hej"},
	{"HeJ!@#", "hej!@#"},
	// Western Europe diacritical stuff is trivial.
	{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
	// As are ligatures.
	{"Æglefinus", "æglefinus"},
	{"IJssel", "ijssel"},
	// Cyrillic seems regular as well.
	{"Привет", "привет"},
	// Greek has multiple lower case characters for things depending on
	// context; we should always choose the same one.
	{"Ὀδυσσεύς", "ὀδυσσεύσ"},
	{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
	// German ß doesn't really have an upper case variant, and we
	// shouldn't mess things up when lower casing it either. We don't
	// attempt to make ß equivalent to "ss".
	{"Reichwaldstraße", "reichwaldstraße"},
	// The Turks do their thing with the Is.... Like the Greek example
	// we pick just the one canonicalized "i" although you can argue
	// with this... From what I understand most operating systems don't
	// get this right anyway.
	{"İI", "ii"},
	// Arabic doesn't do case folding.
	{"العَرَبِيَّة", "العَرَبِيَّة"},
	// Neither does Hebrew.
	{"עברית", "עברית"},
	// Nor Chinese, in any variant.
	{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
	// Nor katakana, as far as I can tell.
	{"チャーハン", "チャーハン"},
	// Some special Unicode characters, however, are folded by OSes.
	{"\u212A", "k"},
	// Folding renormalizes to NFC
	{"A\xCC\x88", "\xC3\xA4"}, // ä
	{"a\xCC\x88", "\xC3\xA4"}, // ä
}
|
|
|
|
// benchmarkCases lists the inputs for BenchmarkUnicodeLowercase. Index 0 is
// the string passed to UnicodeLowercaseNormalized and index 1 is the name of
// the sub-benchmark it runs under.
var benchmarkCases = [][2]string{
	{"img_202401241010.jpg", "ASCII lowercase"},
	{"IMG_202401241010.jpg", "ASCII mixedcase start"},
	{"img_202401241010.JPG", "ASCII mixedcase end"},
	{"wir_kinder_aus_bullerbü.epub", "Latin1 lowercase"},
	{"Wir_Kinder_aus_Bullerbü.epub", "Latin1 mixedcase start"},
	{"wir_kinder_aus_bullerbü.EPUB", "Latin1 mixedcase end"},
	{"translated_ウェブの国際化.html", "Unicode lowercase"},
	{"Translated_ウェブの国際化.html", "Unicode mixedcase start"},
	{"translated_ウェブの国際化.HTML", "Unicode mixedcase end"},
}
|
|
|
|
func TestUnicodeLowercaseNormalized(t *testing.T) {
|
|
for _, tc := range caseCases {
|
|
res := UnicodeLowercaseNormalized(tc[0])
|
|
if res != tc[1] {
|
|
t.Errorf("UnicodeLowercaseNormalized(%q) => %q, expected %q", tc[0], res, tc[1])
|
|
}
|
|
}
|
|
}
|
|
|
|
func BenchmarkUnicodeLowercase(b *testing.B) {
|
|
for _, c := range benchmarkCases {
|
|
b.Run(c[1], func(b *testing.B) {
|
|
b.ReportAllocs()
|
|
for i := 0; i < b.N; i++ {
|
|
UnicodeLowercaseNormalized(c[0])
|
|
}
|
|
})
|
|
}
|
|
}
|