mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2025-12-27 00:00:49 -05:00
615 lines
17 KiB
Go
615 lines
17 KiB
Go
package preprocessor
|
|
|
|
import (
|
|
"fmt"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
// inputs is the fixture shared by the analyzer tests in this file. The tests
// address entries by index and hard-code offsets for each string, so neither
// the order nor the exact bytes of an entry may change.
//
// Characters that are invisible or easily altered by editors (trailing
// whitespace, combining marks) are written as escapes so the bytes under test
// are explicit.
var (
	inputs = [18]string{
		"basic latin",
		// Ends in a tab; escaped so the trailing whitespace stays visible.
		"trailing tab\t",
		"Small text. \"$\", \"£\" and \"¥\" are currencies.",
		"latin with 🖖",
		"기본 한국어",
		"基本的な日本語",
		"ウーロン茶",
		"私はエンジニアです",
		"ティー私はエンジニアです",
		"私はエンジニアです ティー",
		"आधारभूत देवनागरी",
		"mixed 언어 传入 🚀!",
		// k and p joined by U+035C (combining double breve below).
		"/k\u035cp/",
		// Precomposed ä (U+00E4), a space, then a followed by U+0308
		// (combining diaeresis). Escaped so Unicode normalization cannot
		// collapse the two forms into one.
		"\u00e4 a\u0308",
		// Cyrillic only; per the tests this script is not part of the defaults.
		"базовый русский",
		// Latin followed by Cyrillic (Cyrillic not supported).
		"latin русский",
		// Leading and trailing space around Latin text.
		" space justified ",
		// Empty string.
		"",
	}
)
|
|
|
|
func TestAnalyzeString(t *testing.T) {
|
|
defaultOpts := AnalysisOpts{
|
|
UseMergeMap: true,
|
|
MergeMap: DefaultMergeMap,
|
|
}
|
|
|
|
tables := []struct {
|
|
input string
|
|
opts AnalysisOpts
|
|
eOut TextAnalysis
|
|
}{
|
|
{
|
|
input: inputs[0],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 10, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 11},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 11,
|
|
},
|
|
Text: inputs[0],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[1],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 12, Spaces: []int{8, 12}, TargetScript: "Latin", RuneCount: 13},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 13,
|
|
},
|
|
Text: inputs[1],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[2],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 45, Spaces: []int{5, 11, 16, 21, 25, 30, 34}, TargetScript: "Latin", RuneCount: 44},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 44,
|
|
},
|
|
Text: inputs[2],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[3],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 14, Spaces: []int{5, 10}, TargetScript: "Latin", RuneCount: 12},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 12,
|
|
},
|
|
Text: inputs[3],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[4],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 15, Spaces: []int{6}, TargetScript: "Hangul", RuneCount: 6},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hangul": 6,
|
|
},
|
|
Text: inputs[4],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[5],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 20, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 7},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hiragana": 7,
|
|
},
|
|
Text: inputs[5],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[6],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 14, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Katakana": 5,
|
|
},
|
|
Text: inputs[6],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[7],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 9},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hiragana": 9,
|
|
},
|
|
Text: inputs[7],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[8],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 35, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 12},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hiragana": 12,
|
|
},
|
|
Text: inputs[8],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[9],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 36, Spaces: []int{27}, TargetScript: "Hiragana", RuneCount: 13},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hiragana": 13,
|
|
},
|
|
Text: inputs[9],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[10],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 45, Spaces: []int{21}, TargetScript: "Devanagari", RuneCount: 16},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Devanagari": 16,
|
|
},
|
|
Text: inputs[10],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[11],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 6},
|
|
{Low: 6, High: 12, Spaces: []int{12}, TargetScript: "Hangul", RuneCount: 3},
|
|
{Low: 13, High: 24, Spaces: []int{19}, TargetScript: "Han", RuneCount: 5}, // 🚀 and ! are "Common" script and will be merged with "Han"
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 6,
|
|
"Hangul": 3,
|
|
"Han": 5,
|
|
},
|
|
Text: inputs[11],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[12],
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 5,
|
|
},
|
|
Text: inputs[12],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[13], // ä and a + ¨
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{2}, TargetScript: "Latin", RuneCount: 4},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 4,
|
|
},
|
|
Text: inputs[13],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[14], // cyrillic script isn't part of our default
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 28, Spaces: []int{14}, TargetScript: "_unknown", RuneCount: 15},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"_unknown": 15,
|
|
},
|
|
Text: inputs[14],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 6},
|
|
{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 6,
|
|
"_unknown": 7,
|
|
},
|
|
Text: inputs[15],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[16], // latin + cyrillic (cyrillic script isn't part of our default)
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 16, Spaces: []int{0, 6, 16}, TargetScript: "Latin", RuneCount: 17},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 17,
|
|
},
|
|
Text: inputs[16],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[17], // latin + cyrillic (cyrillic script isn't part of our default)
|
|
opts: defaultOpts,
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{},
|
|
RuneCount: map[string]int{},
|
|
Text: inputs[17],
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, table := range tables {
|
|
testname := fmt.Sprintf("Analyzing \"%s\" string", table.input)
|
|
t.Run(testname, func(t *testing.T) {
|
|
ta := NewTextAnalyzer(DefaultScripts)
|
|
result := ta.AnalyzeString(table.input, table.opts)
|
|
if table.opts.UseMergeMap {
|
|
result.MergeCommon(table.opts.MergeMap)
|
|
} else {
|
|
result.MergeCommon(MergeMap{})
|
|
}
|
|
assert.Equal(t, table.eOut, result)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestAnalyzeStringRaw(t *testing.T) {
|
|
tables := []struct {
|
|
input string
|
|
eOut TextAnalysis
|
|
}{
|
|
{
|
|
input: inputs[0],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 6, High: 10, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 10,
|
|
"Common": 1,
|
|
},
|
|
Text: inputs[0],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[1],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 7, Spaces: []int{}, TargetScript: "Latin", RuneCount: 8},
|
|
{Low: 8, High: 8, Spaces: []int{8}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
|
|
{Low: 12, High: 12, Spaces: []int{12}, TargetScript: "Common", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 11,
|
|
"Common": 2,
|
|
},
|
|
Text: inputs[1],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[2],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 6, High: 9, Spaces: []int{}, TargetScript: "Latin", RuneCount: 4},
|
|
{Low: 10, High: 21, Spaces: []int{11, 16, 21}, TargetScript: "Common", RuneCount: 11}, // £ takes 2 bytes
|
|
{Low: 22, High: 24, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
|
|
{Low: 25, High: 30, Spaces: []int{25, 30}, TargetScript: "Common", RuneCount: 5}, // ¥ takes 2 bytes
|
|
{Low: 31, High: 33, Spaces: []int{}, TargetScript: "Latin", RuneCount: 3},
|
|
{Low: 34, High: 34, Spaces: []int{34}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 35, High: 44, Spaces: []int{}, TargetScript: "Latin", RuneCount: 10},
|
|
{Low: 45, High: 45, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 25,
|
|
"Common": 19,
|
|
},
|
|
Text: inputs[2],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[3],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 6, High: 9, Spaces: []int{}, TargetScript: "Latin", RuneCount: 4},
|
|
{Low: 10, High: 14, Spaces: []int{10}, TargetScript: "Common", RuneCount: 2},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 9,
|
|
"Common": 3,
|
|
},
|
|
Text: inputs[3],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[4],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 2},
|
|
{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 3},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hangul": 5,
|
|
"Common": 1,
|
|
},
|
|
Text: inputs[4],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[5],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 8, Spaces: []int{}, TargetScript: "Han", RuneCount: 3},
|
|
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
|
|
{Low: 12, High: 20, Spaces: []int{}, TargetScript: "Han", RuneCount: 3},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Hiragana": 1,
|
|
"Han": 6,
|
|
},
|
|
Text: inputs[5],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[6],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 1},
|
|
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Common", RuneCount: 1}, // ー U+30FC (KATAKANA-HIRAGANA PROLONGED SOUND MARK) seems to be counted as Common
|
|
{Low: 6, High: 11, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
|
|
{Low: 12, High: 14, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Katakana": 3,
|
|
"Common": 1,
|
|
"Han": 1,
|
|
},
|
|
Text: inputs[6],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[7],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
|
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
|
|
{Low: 6, High: 20, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
|
|
{Low: 21, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Han": 1,
|
|
"Hiragana": 3,
|
|
"Katakana": 5,
|
|
},
|
|
Text: inputs[7],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[8],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
|
|
{Low: 6, High: 8, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 9, High: 11, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
|
{Low: 12, High: 14, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
|
|
{Low: 15, High: 29, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
|
|
{Low: 30, High: 35, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Han": 1,
|
|
"Hiragana": 3,
|
|
"Katakana": 7,
|
|
"Common": 1,
|
|
},
|
|
Text: inputs[8],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[9],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
|
{Low: 3, High: 5, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 1},
|
|
{Low: 6, High: 20, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 5},
|
|
{Low: 21, High: 26, Spaces: []int{}, TargetScript: "Hiragana", RuneCount: 2},
|
|
{Low: 27, High: 27, Spaces: []int{27}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 28, High: 33, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
|
|
{Low: 34, High: 36, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Han": 1,
|
|
"Hiragana": 3,
|
|
"Katakana": 7,
|
|
"Common": 2,
|
|
},
|
|
Text: inputs[9],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[10],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 20, Spaces: []int{}, TargetScript: "Devanagari", RuneCount: 7},
|
|
{Low: 21, High: 21, Spaces: []int{21}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 22, High: 45, Spaces: []int{}, TargetScript: "Devanagari", RuneCount: 8},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Devanagari": 15,
|
|
"Common": 1,
|
|
},
|
|
Text: inputs[10],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[11],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 6, High: 11, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 2},
|
|
{Low: 12, High: 12, Spaces: []int{12}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 13, High: 18, Spaces: []int{}, TargetScript: "Han", RuneCount: 2},
|
|
{Low: 19, High: 24, Spaces: []int{19}, TargetScript: "Common", RuneCount: 3},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 5,
|
|
"Hangul": 2,
|
|
"Han": 2,
|
|
"Common": 5,
|
|
},
|
|
Text: inputs[11],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[12],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 0, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 1, High: 1, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
|
|
{Low: 2, High: 3, Spaces: []int{}, TargetScript: "Inherited", RuneCount: 1},
|
|
{Low: 4, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
|
|
{Low: 5, High: 5, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 2,
|
|
"Common": 2,
|
|
"Inherited": 1,
|
|
},
|
|
Text: inputs[12],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[13], // ä and a + ¨
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 1, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
|
|
{Low: 2, High: 2, Spaces: []int{2}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 3, High: 3, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
|
|
{Low: 4, High: 5, Spaces: []int{}, TargetScript: "Inherited", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 2,
|
|
"Common": 1,
|
|
"Inherited": 1,
|
|
},
|
|
Text: inputs[13],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[14], // cyrillic script isn't part of our default
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 13, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
|
{Low: 14, High: 14, Spaces: []int{14}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 15, High: 28, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"_unknown": 14,
|
|
"Common": 1,
|
|
},
|
|
Text: inputs[14],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 5,
|
|
"Common": 1,
|
|
"_unknown": 7,
|
|
},
|
|
Text: inputs[15],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[16],
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{
|
|
{Low: 0, High: 0, Spaces: []int{0}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 1, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
|
{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1},
|
|
{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Latin", RuneCount: 9},
|
|
{Low: 16, High: 16, Spaces: []int{16}, TargetScript: "Common", RuneCount: 1},
|
|
},
|
|
RuneCount: map[string]int{
|
|
"Latin": 14,
|
|
"Common": 3,
|
|
},
|
|
Text: inputs[16],
|
|
},
|
|
},
|
|
{
|
|
input: inputs[17], // empty string
|
|
eOut: TextAnalysis{
|
|
ScriptRanges: []ScriptRange{},
|
|
RuneCount: map[string]int{},
|
|
Text: inputs[17],
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, table := range tables {
|
|
testname := fmt.Sprintf("Raw-Analyzing \"%s\" string", table.input)
|
|
t.Run(testname, func(t *testing.T) {
|
|
ta := NewTextAnalyzer(DefaultScripts)
|
|
result := ta.AnalyzeString(table.input, AnalysisOpts{})
|
|
|
|
assert.Equal(t, table.eOut, result)
|
|
})
|
|
}
|
|
}
|