mirror of
https://github.com/syncthing/syncthing.git
synced 2026-05-10 16:16:47 -04:00
### Purpose

Resurrecting https://github.com/syncthing/syncthing/pull/9365

### Testing

Current benchmark results:

```
goos: linux
goarch: amd64
pkg: github.com/syncthing/syncthing/lib/fs
cpu: AMD EPYC 7763 64-Core Processor
                                           │ ../old.txt  │             ../new.txt              │
                                           │   sec/op    │   sec/op     vs base                │
UnicodeLowercase/ASCII_lowercase-4           43.68n ± 4%   12.19n ± 1%  -72.09% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_start-4    200.65n ± 2%   59.49n ± 3%  -70.35% (p=0.000 n=10)
UnicodeLowercase/ASCII_mixedcase_end-4       95.50n ± 2%   59.10n ± 2%  -38.12% (p=0.000 n=10)
UnicodeLowercase/Latin1_lowercase-4          122.5n ± 1%   131.4n ± 1%   +7.27% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_start-4    339.9n ± 2%   309.2n ± 1%   -9.05% (p=0.000 n=10)
UnicodeLowercase/Latin1_mixedcase_end-4      183.6n ± 2%   174.3n ± 1%   -5.04% (p=0.000 n=10)
UnicodeLowercase/Unicode_lowercase-4         456.6n ± 1%   440.5n ± 1%   -3.53% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_start-4   625.9n ± 1%   595.6n ± 1%   -4.83% (p=0.000 n=10)
UnicodeLowercase/Unicode_mixedcase_end-4     516.2n ± 1%   495.5n ± 1%   -4.02% (p=0.000 n=10)
geomean                                      214.1n        150.4n       -29.72%

                                           │ ../old.txt  │             ../new.txt              │
                                           │    B/op     │    B/op      vs base                │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     24.00 ± 0%    24.00 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       24.00 ± 0%    24.00 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    32.00 ± 0%    32.00 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      32.00 ± 0%    32.00 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   48.00 ± 0%    48.00 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     48.00 ± 0%    48.00 ± 0%   ~ (p=1.000 n=10) ¹
geomean                                                ²                +0.00%               ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                                           │ ../old.txt  │             ../new.txt              │
                                           │  allocs/op  │  allocs/op   vs base                │
UnicodeLowercase/ASCII_lowercase-4           0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_start-4     1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/ASCII_mixedcase_end-4       1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_lowercase-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_start-4    1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Latin1_mixedcase_end-4      1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_lowercase-4         0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_start-4   1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
UnicodeLowercase/Unicode_mixedcase_end-4     1.000 ± 0%    1.000 ± 0%   ~ (p=1.000 n=10) ¹
geomean                                                ²                +0.00%               ²
¹ all samples are equal
² summaries must be >0 to compute geomean
```

I think the `+7%` for the lowercase Latin1 test case is easily outweighed by the ASCII and Unicode improvements 🙂
103 lines
2.1 KiB
Go
103 lines
2.1 KiB
Go
// Copyright (C) 2017 The Syncthing Authors.
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
// You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
|
|
package fs
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/unicode/norm"
|
|
)
|
|
|
|
// UnicodeLowercaseNormalized returns the Unicode lower case variant of s,
|
|
// having also normalized it to normalization form C.
|
|
func UnicodeLowercaseNormalized(s string) string {
|
|
if isASCII, isLower := isASCII(s); isASCII {
|
|
if isLower {
|
|
return s
|
|
}
|
|
return toLowerASCII(s)
|
|
}
|
|
|
|
return toLowerUnicode(s)
|
|
}
|
|
|
|
// isASCII scans the bytes of s and reports two things: whether every byte
// is within the ASCII range, and whether no byte in 'A'-'Z' was seen.
//
// The scan stops at the first non-ASCII byte, so when the first result is
// false the second result only describes the bytes preceding that point.
func isASCII(s string) (bool, bool) {
	lower := true
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c > unicode.MaxASCII:
			return false, lower
		case c >= 'A' && c <= 'Z':
			lower = false
		}
	}
	return true, lower
}
|
|
|
|
// toLowerASCII returns a copy of s in which every byte in 'A'-'Z' has been
// replaced by its lowercase counterpart. All other bytes are copied through
// unchanged. The result always has the same length as s.
func toLowerASCII(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// start marks the beginning of the run of unchanged bytes that has not
	// been flushed to the builder yet.
	start := 0
	for i := 0; i < len(s); i++ {
		c := s[i]
		if 'A' <= c && c <= 'Z' {
			out.WriteString(s[start:i])
			out.WriteByte(c + ('a' - 'A'))
			start = i + 1
		}
	}
	out.WriteString(s[start:])
	return out.String()
}
|
|
|
|
func toLowerUnicode(s string) string {
|
|
i := firstCaseChange(s)
|
|
if i == -1 {
|
|
return norm.NFC.String(s)
|
|
}
|
|
|
|
var rs strings.Builder
|
|
// WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes,
|
|
// even if it doesn't need all that space. Overallocate now to prevent
|
|
// it from ever triggering a reallocation.
|
|
rs.Grow(utf8.UTFMax - 1 + len(s))
|
|
rs.WriteString(s[:i])
|
|
|
|
for _, r := range s[i:] {
|
|
if r <= unicode.MaxLatin1 && r != 'µ' {
|
|
rs.WriteRune(unicode.ToLower(r))
|
|
} else {
|
|
rs.WriteRune(unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r)))
|
|
}
|
|
}
|
|
return norm.NFC.String(rs.String())
|
|
}
|
|
|
|
// firstCaseChange returns the byte index of the first rune r in s for which
// lower(upper(r)) != r, or -1 if s contains no such rune.
//
// ASCII runes skip the table lookup: only 'A'-'Z' count as a change.
func firstCaseChange(s string) int {
	for idx, r := range s {
		switch {
		case r > unicode.MaxASCII:
			if unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r)) != r {
				return idx
			}
		case 'A' <= r && r <= 'Z':
			return idx
		}
	}
	return -1
}
|