Files
syncthing/lib/fs/folding.go
bt90 05cc6b0f43 chore(fs): speed up case normalization (#10013)
### Purpose

Resurrecting https://github.com/syncthing/syncthing/pull/9365

### Testing

Current benchmark results:

```
goos: linux
goarch: amd64
pkg: github.com/syncthing/syncthing/lib/fs
cpu: AMD EPYC 7763 64-Core Processor                
                                           │  ../old.txt  │             ../new.txt              │
                                           │    sec/op    │   sec/op     vs base                │
UnicodeLowercase/ASCII_lowercase-4            43.68n ± 4%   12.19n ± 1%  -72.09% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_start-4     200.65n ± 2%   59.49n ± 3%  -70.35% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_end-4        95.50n ± 2%   59.10n ± 2%  -38.12% (p=0.000 n=10)
UnicodeLowercase/Latin1_lowercase-4           122.5n ± 1%   131.4n ± 1%   +7.27% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_start-4     339.9n ± 2%   309.2n ± 1%   -9.05% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_end-4       183.6n ± 2%   174.3n ± 1%   -5.04% (p=0.000 n=10)
UnicodeLowercase/Unicode_lowercase-4          456.6n ± 1%   440.5n ± 1%   -3.53% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_start-4    625.9n ± 1%   595.6n ± 1%   -4.83% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_end-4      516.2n ± 1%   495.5n ± 1%   -4.02% (p=0.000 n=10)
geomean                                       214.1n        150.4n       -29.72%

                                           │  ../old.txt  │             ../new.txt              │
                                           │     B/op     │    B/op     vs base                 │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     24.00 ± 0%     24.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       24.00 ± 0%     24.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    32.00 ± 0%     32.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      32.00 ± 0%     32.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   48.00 ± 0%     48.00 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     48.00 ± 0%     48.00 ± 0%       ~ (p=1.000 n=10) ¹
geomean                                                 ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                                           │  ../old.txt  │             ../new.txt              │
                                           │  allocs/op   │ allocs/op   vs base                 │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     1.000 ± 0%     1.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                                                 ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean
```

I think the `+7%` for the lowercase Latin1 testcase is easily outweighed
by the ASCII and unicode improvements 🙂
2025-04-01 13:41:57 +02:00

103 lines
2.1 KiB
Go

// Copyright (C) 2017 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package fs
import (
"strings"
"unicode"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
// UnicodeLowercaseNormalized returns the Unicode lower case variant of s,
// having also normalized it to normalization form C.
func UnicodeLowercaseNormalized(s string) string {
if isASCII, isLower := isASCII(s); isASCII {
if isLower {
return s
}
return toLowerASCII(s)
}
return toLowerUnicode(s)
}
func isASCII(s string) (bool, bool) {
isLower := true
for _, b := range []byte(s) {
if b > unicode.MaxASCII {
return false, isLower
}
if 'A' <= b && b <= 'Z' {
isLower = false
}
}
return true, isLower
}
func toLowerASCII(s string) string {
var (
b strings.Builder
pos int
)
b.Grow(len(s))
for i, c := range []byte(s) {
if c < 'A' || 'Z' < c {
continue
}
if pos < i {
b.WriteString(s[pos:i])
}
pos = i + 1
b.WriteByte(c + 'a' - 'A')
}
if pos != len(s) {
b.WriteString(s[pos:])
}
return b.String()
}
func toLowerUnicode(s string) string {
i := firstCaseChange(s)
if i == -1 {
return norm.NFC.String(s)
}
var rs strings.Builder
// WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes,
// even if it doesn't need all that space. Overallocate now to prevent
// it from ever triggering a reallocation.
rs.Grow(utf8.UTFMax - 1 + len(s))
rs.WriteString(s[:i])
for _, r := range s[i:] {
if r <= unicode.MaxLatin1 && r != 'µ' {
rs.WriteRune(unicode.ToLower(r))
} else {
rs.WriteRune(unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r)))
}
}
return norm.NFC.String(rs.String())
}
// Byte index of the first rune r s.t. lower(upper(r)) != r.
func firstCaseChange(s string) int {
for i, r := range s {
if r <= unicode.MaxASCII {
if r < 'A' || r > 'Z' {
continue
}
return i
}
if unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r)) != r {
return i
}
}
return -1
}