Files
AdGuardDNS/internal/agdalg/skeletonconstructor.go
Ainar Garipov 3f26cca7e0 Sync v2.22.0
2026-05-28 14:26:40 +03:00

211 lines
5.4 KiB
Go

package agdalg
import (
"bytes"
"unicode"
"github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/syncutil"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"golang.org/x/text/unicode/rangetable"
)
// SkeletonConstructor constructs confusable skeletons from strings. It's based
// on the [Unicode Technical Report #39].
//
// [Unicode Technical Report #39]: https://www.unicode.org/reports/tr39/#Confusable_Detection
type SkeletonConstructor struct {
// transformerPool produces and allows reusing [transform.Transformer]s for
// confusable skeleton construction. The reusing is required since
// [transform.Chain] allocates a stateful [transform.Transformer].
transformerPool *syncutil.Pool[transform.Transformer]
srcBufPool *syncutil.Pool[[]byte]
dstBufPool *syncutil.Pool[[]byte]
}
// NewSkeletonConstructor returns a new properly initialized
// *SkeletonConstructor. srcInitSize and dstInitSize are the initial sizes of
// the source and destination buffers for the skeleton construction.
func NewSkeletonConstructor(srcInitSize, dstInitSize uint) (sc *SkeletonConstructor) {
return &SkeletonConstructor{
transformerPool: syncutil.NewPool(newUTR39Chain),
srcBufPool: syncutil.NewSlicePool[byte](int(srcInitSize)),
dstBufPool: syncutil.NewSlicePool[byte](int(dstInitSize)),
}
}
// Skeleton returns a confusable skeleton for s. s is case-sensitive.
//
// TODO(e.burkov): Consider rewriting the [transform.String] function to use
// buffers. It will allow to avoid unnecessary allocations for the skeleton
// construction.
func (c *SkeletonConstructor) Skeleton(s string) (skel string) {
trPtr := c.transformerPool.Get()
defer c.transformerPool.Put(trPtr)
(*trPtr).Reset()
if s == "" {
// Fast path for the common case for empty input. Taken from
// [transform.String].
if _, _, err := (*trPtr).Transform(nil, nil, true); err == nil {
return ""
}
}
srcPtr := c.srcBufPool.Get()
defer c.srcBufPool.Put(srcPtr)
*srcPtr = append((*srcPtr)[:0], s...)
dstPtr := c.dstBufPool.Get()
defer c.dstBufPool.Put(dstPtr)
var err error
*dstPtr, _, err = transform.Append(*trPtr, (*dstPtr)[:0], *srcPtr)
// It seems transformer can never return an error.
//
// TODO(e.burkov): Consider sending to Sentry.
errors.Check(err)
if bytes.Equal(*srcPtr, *dstPtr) {
return s
}
return string(*dstPtr)
}
// dicpTable is a merged table of all code points with the property
// Default_Ignorable_Code_Point.
//
// See https://www.unicode.org/Public/15.0.0/ucd/DerivedCoreProperties.txt.
//
// TODO(e.burkov): Update to 16.0.0 when the [unicode] package will use it.
var dicpTable = rangetable.Merge(
unicode.Other_Default_Ignorable_Code_Point,
// These are ranges of Default_Ignorable_Code_Point excluding the ones
// already included in [unicode.Other_Default_Ignorable_Code_Point].
&unicode.RangeTable{
R16: []unicode.Range16{{
// SOFT HYPHEN.
Lo: 0x00AD,
// ARABIC LETTER MARK.
Hi: 0x061C,
// Only include two code points.
Stride: (0x061C - 0x00AD),
}, {
// MONGOLIAN FREE VARIATION SELECTOR ONE.
Lo: 0x180B,
// MONGOLIAN FREE VARIATION SELECTOR THREE.
Hi: 0x180D,
// Include all 3 code points.
Stride: 1,
}, {
// MONGOLIAN VOWEL SEPARATOR.
Lo: 0x180E,
// MONGOLIAN FREE VARIATION SELECTOR FOUR.
Hi: 0x180F,
// Include all 2 code points.
Stride: 1,
}, {
// ZERO WIDTH SPACE.
Lo: 0x200B,
// RIGHT-TO-LEFT MARK.
Hi: 0x200F,
// Include all 5 code points.
Stride: 1,
}, {
// LEFT-TO-RIGHT EMBEDDING.
Lo: 0x202A,
// RIGHT-TO-LEFT OVERRIDE.
Hi: 0x202E,
// Include all 5 code points.
Stride: 1,
}, {
// WORD JOINER.
Lo: 0x2060,
// INVISIBLE PLUS.
Hi: 0x2064,
// Include all 5 code points.
Stride: 1,
}, {
// LEFT-TO-RIGHT ISOLATE.
Lo: 0x2066,
// NOMINAL DIGIT SHAPES.
Hi: 0x206F,
// Include all 10 code points.
Stride: 1,
}, {
// VARIATION SELECTOR-1.
Lo: 0xFE00,
// VARIATION SELECTOR-16.
Hi: 0xFE0F,
// Include all 16 code points.
Stride: 1,
}, {
// ZERO WIDTH NO-BREAK SPACE.
Lo: 0xFEFF,
// HALFWIDTH HANGUL FILLER.
Hi: 0xFFA0,
// Only include two code points.
Stride: (0xFFA0 - 0xFEFF),
}},
R32: []unicode.Range32{{
// SHORTHAND FORMAT LETTER OVERLAP.
Lo: 0x1BCA0,
// SHORTHAND FORMAT UP STEP.
Hi: 0x1BCA3,
// Include all 4 code points.
Stride: 1,
}, {
// MUSICAL SYMBOL BEGIN BEAM.
Lo: 0x1D173,
// MUSICAL SYMBOL END PHRASE.
Hi: 0x1D17A,
// Include all 8 code points.
Stride: 1,
}, {
// LANGUAGE TAG.
Lo: 0xE0001,
// Reserved.
Hi: 0xE0002,
// Include all 2 code points.
Stride: 1,
}, {
// TAG SPACE.
Lo: 0xE0020,
// CANCEL TAG.
Hi: 0xE007F,
// Include all 96 code points.
Stride: 1,
}, {
// VARIATION SELECTOR-17.
Lo: 0xE0100,
// VARIATION SELECTOR-256.
Hi: 0xE01EF,
// Include all 240 code points.
Stride: 1,
}},
},
)
// dicpRemover is a [transform.Transformer] that removes all runes with the
// Default_Ignorable_Code_Point property.
var dicpRemover = runes.Remove(runes.In(dicpTable))
// newUTR39Chain is a helper function for creating a new chain of transformers
// for the [UTR #39] confusable skeleton construction.
//
// [UTR #39]: https://www.unicode.org/reports/tr39/#Confusable_Detection
func newUTR39Chain() (tr *transform.Transformer) {
return new(transform.Chain(
norm.NFD,
dicpRemover,
confusablePrototyper,
norm.NFD,
))
}