Files
opencloud/services/thumbnails/pkg/preprocessor/textanalyzer_test.go
2023-10-06 10:29:08 +02:00

615 lines
17 KiB
Go

package preprocessor
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
var (
inputs = [18]string{
"basic latin",
"trailing tab ",
"Small text. \"$\", \"£\" and \"¥\" are currencies.",
"latin with 🖖",
"기본 한국어",
"基本的な日本語",
"ウーロン茶",
"私はエンジニアです",
"ティー私はエンジニアです",
"私はエンジニアです ティー",
"आधारभूत देवनागरी",
"mixed 언어 传入 🚀!",
"/k͜p/",
// ä and a + ¨
"ä ä",
"базовый русский", // cyrillic script isn't part of our default
"latin русский", // latin + cyrillic (cyrillic not supported)
" space justified ",
"",
}
)
func TestAnalyzeString(t *testing.T) {
defaultOpts := AnalysisOpts{
UseMergeMap: true,
MergeMap: DefaultMergeMap,
}
tables := []struct {
input string
opts AnalysisOpts
eOut TextAnalysis
}{
{
input: inputs[0],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 10, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 11},
},
RuneCount: map[string]int{
"Latin": 11,
},
Text: inputs[0],
},
},
{
input: inputs[1],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 12, Spaces: []int{8, 12}, TargetScript: "Latin", RuneCount: 13},
},
RuneCount: map[string]int{
"Latin": 13,
},
Text: inputs[1],
},
},
{
input: inputs[2],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 45, Spaces: []int{5, 11, 16, 21, 25, 30, 34}, TargetScript: "Latin", RuneCount: 44},
},
RuneCount: map[string]int{
"Latin": 44,
},
Text: inputs[2],
},
},
{
input: inputs[3],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 14, Spaces: []int{5, 10}, TargetScript: "Latin", RuneCount: 12},
},
RuneCount: map[string]int{
"Latin": 12,
},
Text: inputs[3],
},
},
{
input: inputs[4],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 15, Spaces: []int{6}, TargetScript: "Hangul", RuneCount: 6},
},
RuneCount: map[string]int{
"Hangul": 6,
},
Text: inputs[4],
},
},
{
input: inputs[5],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 20, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 7},
},
RuneCount: map[string]int{
"Hiragana": 7,
},
Text: inputs[5],
},
},
{
input: inputs[6],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 14, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
},
RuneCount: map[string]int{
"Katakana": 5,
},
Text: inputs[6],
},
},
{
input: inputs[7],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 9},
},
RuneCount: map[string]int{
"Hiragana": 9,
},
Text: inputs[7],
},
},
{
input: inputs[8],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 35, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 12},
},
RuneCount: map[string]int{
"Hiragana": 12,
},
Text: inputs[8],
},
},
{
input: inputs[9],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 36, Spaces: []int{27}, TargetScript: "Hiragana", RuneCount: 13},
},
RuneCount: map[string]int{
"Hiragana": 13,
},
Text: inputs[9],
},
},
{
input: inputs[10],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 45, Spaces: []int{21}, TargetScript: "Devanagari", RuneCount: 16},
},
RuneCount: map[string]int{
"Devanagari": 16,
},
Text: inputs[10],
},
},
{
input: inputs[11],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 6},
{Low: 6, High: 12, Spaces: []int{12}, TargetScript: "Hangul", RuneCount: 3},
{Low: 13, High: 24, Spaces: []int{19}, TargetScript: "Han", RuneCount: 5}, // 🚀 and ! are "Common" script and will be merged with "Han"
},
RuneCount: map[string]int{
"Latin": 6,
"Hangul": 3,
"Han": 5,
},
Text: inputs[11],
},
},
{
input: inputs[12],
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
},
RuneCount: map[string]int{
"Latin": 5,
},
Text: inputs[12],
},
},
{
input: inputs[13], // ä and a + ¨
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{2}, TargetScript: "Latin", RuneCount: 4},
},
RuneCount: map[string]int{
"Latin": 4,
},
Text: inputs[13],
},
},
{
input: inputs[14], // cyrillic script isn't part of our default
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 28, Spaces: []int{14}, TargetScript: "_unknown", RuneCount: 15},
},
RuneCount: map[string]int{
"_unknown": 15,
},
Text: inputs[14],
},
},
{
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 6},
{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
},
RuneCount: map[string]int{
"Latin": 6,
"_unknown": 7,
},
Text: inputs[15],
},
},
{
input: inputs[16], // latin + cyrillic (cyrillic script isn't part of our default)
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 16, Spaces: []int{0, 6, 16}, TargetScript: "Latin", RuneCount: 17},
},
RuneCount: map[string]int{
"Latin": 17,
},
Text: inputs[16],
},
},
{
input: inputs[17], // latin + cyrillic (cyrillic script isn't part of our default)
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{},
RuneCount: map[string]int{},
Text: inputs[17],
},
},
}
for _, table := range tables {
testname := fmt.Sprintf("Analyzing \"%s\" string", table.input)
t.Run(testname, func(t *testing.T) {
ta := NewTextAnalyzer(DefaultScripts)
result := ta.AnalyzeString(table.input, table.opts)
if table.opts.UseMergeMap {
result.MergeCommon(table.opts.MergeMap)
} else {
result.MergeCommon(MergeMap{})
}
assert.Equal(t, table.eOut, result)
})
}
}
func TestAnalyzeStringRaw(t *testing.T) {
tables := []struct {
input string
eOut TextAnalysis
}{
{
input: inputs[0],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
{Low: 6, High: 10, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
},
RuneCount: map[string]int{
"Latin": 10,
"Common": 1,
},
Text: inputs[0],
},
},
{
input: inputs[1],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 7, Spaces: []int{}, TargetScript: "Latin", RuneCount: 8},
{Low: 8, High: 8, Spaces: []int{8}, TargetScript: "Common", RuneCount: 1},
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
{Low: 12, High: 12, Spaces: []int{12}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 11,
"Common": 2,
},
Text: inputs[1],
},
},
{
input: inputs[2],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
{Low: 6, High: 9, Spaces: []int{}, TargetScript: "Latin", RuneCount: 4},
{Low: 10, High: 21, Spaces: []int{11, 16, 21}, TargetScript: "Common", RuneCount: 11}, // £ takes 2 bytes
{Low: 22, High: 24, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
{Low: 25, High: 30, Spaces: []int{25, 30}, TargetScript: "Common", RuneCount: 5}, // ¥ takes 2 bytes
{Low: 31, High: 33, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
{Low: 34, High: 34, Spaces: []int{34}, TargetScript: "Common", RuneCount: 1},
{Low: 35, High: 44, Spaces: []int{}, TargetScript: "Latin", RuneCount: 10},
{Low: 45, High: 45, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 25,
"Common": 19,
},
Text: inputs[2],
},
},
{
input: inputs[3],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
{Low: 6, High: 9, Spaces: []int{}, TargetScript: "Latin", RuneCount: 4},
{Low: 10, High: 14, Spaces: []int{10}, TargetScript: "Common", RuneCount: 2},
},
RuneCount: map[string]int{
"Latin": 9,
"Common": 3,
},
Text: inputs[3],
},
},
{
input: inputs[4],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 2},
{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1},
{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 3},
},
RuneCount: map[string]int{
"Hangul": 5,
"Common": 1,
},
Text: inputs[4],
},
},
{
input: inputs[5],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 8, Spaces: []int{}, TargetScript: "Han", RuneCount: 3},
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
{Low: 12, High: 20, Spaces: []int{}, TargetScript: "Han", RuneCount: 3},
},
RuneCount: map[string]int{
"Hiragana": 1,
"Han": 6,
},
Text: inputs[5],
},
},
{
input: inputs[6],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 1},
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Common", RuneCount: 1}, // ー U+30FC (KATAKANA-HIRAGANA PROLONGED SOUND MARK) seems to be counted as Common
{Low: 6, High: 11, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
{Low: 12, High: 14, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
},
RuneCount: map[string]int{
"Katakana": 3,
"Common": 1,
"Han": 1,
},
Text: inputs[6],
},
},
{
input: inputs[7],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
{Low: 6, High: 20, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
{Low: 21, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
},
RuneCount: map[string]int{
"Han": 1,
"Hiragana": 3,
"Katakana": 5,
},
Text: inputs[7],
},
},
{
input: inputs[8],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
{Low: 6, High: 8, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
{Low: 12, High: 14, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
{Low: 15, High: 29, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
{Low: 30, High: 35, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
},
RuneCount: map[string]int{
"Han": 1,
"Hiragana": 3,
"Katakana": 7,
"Common": 1,
},
Text: inputs[8],
},
},
{
input: inputs[9],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
{Low: 6, High: 20, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
{Low: 21, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
{Low: 27, High: 27, Spaces: []int{27}, TargetScript: "Common", RuneCount: 1},
{Low: 28, High: 33, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
{Low: 34, High: 36, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Han": 1,
"Hiragana": 3,
"Katakana": 7,
"Common": 2,
},
Text: inputs[9],
},
},
{
input: inputs[10],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 20, Spaces: []int{}, TargetScript: "Devanagari", RuneCount: 7},
{Low: 21, High: 21, Spaces: []int{21}, TargetScript: "Common", RuneCount: 1},
{Low: 22, High: 45, Spaces: []int{}, TargetScript: "Devanagari", RuneCount: 8},
},
RuneCount: map[string]int{
"Devanagari": 15,
"Common": 1,
},
Text: inputs[10],
},
},
{
input: inputs[11],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
{Low: 6, High: 11, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 2},
{Low: 12, High: 12, Spaces: []int{12}, TargetScript: "Common", RuneCount: 1},
{Low: 13, High: 18, Spaces: []int{}, TargetScript: "Han", RuneCount: 2},
{Low: 19, High: 24, Spaces: []int{19}, TargetScript: "Common", RuneCount: 3},
},
RuneCount: map[string]int{
"Latin": 5,
"Hangul": 2,
"Han": 2,
"Common": 5,
},
Text: inputs[11],
},
},
{
input: inputs[12],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 0, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
{Low: 1, High: 1, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
{Low: 2, High: 3, Spaces: []int{}, TargetScript: "Inherited", RuneCount: 1},
{Low: 4, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
{Low: 5, High: 5, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 2,
"Common": 2,
"Inherited": 1,
},
Text: inputs[12],
},
},
{
input: inputs[13], // ä and a + ¨
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 1, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
{Low: 2, High: 2, Spaces: []int{2}, TargetScript: "Common", RuneCount: 1},
{Low: 3, High: 3, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
{Low: 4, High: 5, Spaces: []int{}, TargetScript: "Inherited", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 2,
"Common": 1,
"Inherited": 1,
},
Text: inputs[13],
},
},
{
input: inputs[14], // cyrillic script isn't part of our default
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 13, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
{Low: 14, High: 14, Spaces: []int{14}, TargetScript: "Common", RuneCount: 1},
{Low: 15, High: 28, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
},
RuneCount: map[string]int{
"_unknown": 14,
"Common": 1,
},
Text: inputs[14],
},
},
{
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
},
RuneCount: map[string]int{
"Latin": 5,
"Common": 1,
"_unknown": 7,
},
Text: inputs[15],
},
},
{
input: inputs[16],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
{Low: 0, High: 0, Spaces: []int{0}, TargetScript: "Common", RuneCount: 1},
{Low: 1, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1},
{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Latin", RuneCount: 9},
{Low: 16, High: 16, Spaces: []int{16}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 14,
"Common": 3,
},
Text: inputs[16],
},
},
{
input: inputs[17], // empty string
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{},
RuneCount: map[string]int{},
Text: inputs[17],
},
},
}
for _, table := range tables {
testname := fmt.Sprintf("Raw-Analyzing \"%s\" string", table.input)
t.Run(testname, func(t *testing.T) {
ta := NewTextAnalyzer(DefaultScripts)
result := ta.AnalyzeString(table.input, AnalysisOpts{})
assert.Equal(t, table.eOut, result)
})
}
}