opencloud/services/thumbnails/pkg/preprocessor/textanalyzer.go

package preprocessor

import (
	"unicode"
	"unicode/utf8"
)

// DefaultScripts is the default list of scripts to be analyzed within the
// string.
//
// Scripts that aren't present in the list are meant to be considered as part
// of the last "known" script. For example, if "Avestan" script (which isn't
// present) is preceded by "Arabic" script, then the "Avestan" script would
// be considered as "Arabic".
//
// NOTE(review): the script lookup in this file (chooseScriptFor) returns
// "_unknown" for runes that match none of the listed scripts, so such runes
// end up in their own "_unknown" range instead of being attached to the
// previous one -- confirm which behavior is intended.
//
// Punctuation symbols are usually considered part of the "Common" script.
var DefaultScripts = []string{
	"Arabic",
	"Common",
	"Devanagari",
	"Han",
	"Hangul",
	"Hiragana",
	"Inherited",
	"Katakana",
	"Latin",
}

// MergeMap is a convenient map[string]map[string]string type used to merge
// multiple scripts into one. This is mainly used for the Japanese language,
// which uses the "Han", "Hiragana" and "Katakana" scripts.
//
// The map contains the expected previous script as first key, the expected
// current script as second key, and the resulting script (if both keys
// match) as value.
type MergeMap map[string]map[string]string

// DefaultMergeMap is the default MergeMap, containing the merge rules for the
// Japanese scripts ("Han", "Hiragana" and "Katakana").
//
// Each row reads: previous script -> {current script: resulting script}.
var DefaultMergeMap = MergeMap{
	"Han":      {"Hiragana": "Hiragana", "Katakana": "Katakana"},
	"Hiragana": {"Han": "Hiragana", "Katakana": "Hiragana"},
	"Katakana": {"Han": "Katakana", "Hiragana": "Hiragana"},
}

// AnalysisOpts holds the analysis options passed to
// TextAnalyzer.AnalyzeString.
//
// UseMergeMap enables the merge map lookup while analyzing; when it is false
// the MergeMap field is ignored. MergeMap holds the rules used to merge
// scripts into a single range (see the MergeMap type).
type AnalysisOpts struct {
	UseMergeMap bool
	MergeMap    MergeMap
}

// ScriptRange is a script range. The range should be attached to a string
// which could contain multiple scripts: it states that the bytes from "Low"
// to "High" (both inclusive) of that string belong to "TargetScript".
type ScriptRange struct {
	// Low is the offset of the first byte of the range (inclusive).
	Low int

	// High is the offset of the last byte of the range (inclusive).
	High int

	// Spaces contains the byte offsets (inside the range) of the runes that
	// are considered as white space.
	Spaces []int

	// TargetScript is the name of the script assigned to the range.
	TargetScript string

	// RuneCount is the number of runes or chars contained in the range
	// (mostly for debugging purposes).
	RuneCount int
}

// TextAnalysis is the result of a text analysis. It contains the analyzed
// text (Text), a list of script ranges (ScriptRanges, see the ScriptRange
// type) and a map (RuneCount) containing how many runes have been detected
// for a particular script, keyed by the script name.
type TextAnalysis struct {
	ScriptRanges []ScriptRange
	RuneCount    map[string]int
	Text         string
}

// TextAnalyzer assigns scripts to the runes of a string. The object contains
// private members; it should be created via the "NewTextAnalyzer" function.
//
// scripts maps each script name to its unicode range table, and
// scriptListCache holds the script names in the order they are tried when
// looking up the script of a rune.
//
// NOTE(review): chooseScriptFor/reorderScriptList replace scriptListCache
// while analyzing and no locking is visible in this file, so sharing one
// analyzer between goroutines looks unsafe -- confirm with the callers.
type TextAnalyzer struct {
	scripts         map[string]*unicode.RangeTable
	scriptListCache []string
}

// NewTextAnalyzer creates a new TextAnalyzer. A list of scripts must be
// provided. You can use the "DefaultScripts" variable for a default list,
// although it doesn't contain all the available scripts.
// See the unicode.Scripts variable (in the unicode package) for a
// full list.
//
// Script names that aren't present in unicode.Scripts are skipped: keeping
// them would store a nil range table, and unicode.Is panics when it receives
// one. Repeated names are kept only once. The analyzer stores its own copy of
// the list, so later changes to scriptList don't affect it.
func NewTextAnalyzer(scriptList []string) TextAnalyzer {
	scriptRanges := make(map[string]*unicode.RangeTable, len(scriptList))
	knownScripts := make([]string, 0, len(scriptList))
	for _, script := range scriptList {
		if _, seen := scriptRanges[script]; seen {
			// trying the same script twice can't give a different answer
			continue
		}
		table, valid := unicode.Scripts[script]
		if !valid || table == nil {
			// unknown script: there is no range table to match against
			continue
		}
		scriptRanges[script] = table
		knownScripts = append(knownScripts, script)
	}
	return TextAnalyzer{
		scripts:         scriptRanges,
		scriptListCache: knownScripts,
	}
}

// AnalyzeString analyzes the target string using the specified options.
// A TextAnalysis will be returned with the result of the analysis.
//
// The string is split into consecutive ScriptRange items, one per run of
// runes sharing the same script. Runes that don't match any of the scripts
// of the analyzer get the "_unknown" script. If the merge map is enabled in
// the options and it has an entry for the script of the open range followed
// by the script of the next rune, the rune joins the open range, which takes
// the script given by the merge map.
//
// An empty string returns an analysis without ranges.
func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalysis {
	result := TextAnalysis{
		ScriptRanges: []ScriptRange{},
		RuneCount:    make(map[string]int),
		Text:         word,
	}
	if word == "" {
		return result
	}

	// the first rune opens the first range
	head, headLen := utf8.DecodeRuneInString(word)
	open := ScriptRange{
		Low:          0,
		Spaces:       []int{},
		TargetScript: ta.chooseScriptFor(head),
		RuneCount:    1,
	}
	if unicode.Is(unicode.White_Space, head) {
		open.Spaces = append(open.Spaces, 0)
	}

	// closeRange stores the open range, ending at byte "high", in the result
	closeRange := func(high int) {
		open.High = high
		result.RuneCount[open.TargetScript] += open.RuneCount
		result.ScriptRanges = append(result.ScriptRanges, open)
	}

	for offset, char := range word[headLen:] {
		position := offset + headLen // byte index in the original string
		script := ta.chooseScriptFor(char)

		if script != open.TargetScript {
			if merged, found := ta.getMergeMapValue(opts, open.TargetScript, script); found {
				// the rune stays in the open range, which changes its script
				open.TargetScript = merged
			} else {
				closeRange(position - 1)
				open = ScriptRange{
					Low:          position,
					Spaces:       []int{},
					TargetScript: script,
				}
			}
		}

		// at this point the rune belongs to the open range
		open.RuneCount++
		if unicode.Is(unicode.White_Space, char) {
			open.Spaces = append(open.Spaces, position)
		}
	}

	// close the last range
	closeRange(len(word) - 1)

	return result
}

// chooseScriptFor returns the name of the first script in the
// scriptListCache whose range table contains the char, or "_unknown" if
// none of them does.
//
// When the match is found beyond the fourth position of the list, the script
// is moved to the front of the list (see reorderScriptList).
func (ta *TextAnalyzer) chooseScriptFor(char rune) string {
	for position, candidate := range ta.scriptListCache {
		if !unicode.Is(ta.scripts[candidate], char) {
			// not this one, try the next script
			continue
		}
		if position > 3 {
			// we might expect more chars with the same script
			// so move the script first to match it faster next time
			ta.reorderScriptList(candidate)
		}
		return candidate
	}
	// the char doesn't belong to any of the known scripts
	return "_unknown"
}

// reorderScriptList moves the matched script to the first position of the
// scriptListCache in order to speed up the next script searches. A "Latin"
// script is expected to be surrounded by "Latin" chars, although "Common"
// script chars might be present too.
//
// The rest of the scripts keep their relative order. Nothing happens if the
// script is already the first one or if it isn't in the list. The reordered
// list is always a new slice; the previous one isn't modified.
func (ta *TextAnalyzer) reorderScriptList(matchedScript string) {
	position := -1
	for index, name := range ta.scriptListCache {
		if name == matchedScript {
			position = index
			break
		}
	}
	if position <= 0 {
		// either not found or already the first element
		return
	}

	promoted := make([]string, 0, len(ta.scriptListCache))
	promoted = append(promoted, matchedScript)
	promoted = append(promoted, ta.scriptListCache[:position]...)
	promoted = append(promoted, ta.scriptListCache[position+1:]...)
	ta.scriptListCache = promoted
}

// getMergeMapValue gets the value from the merge map based on the previous
// and current scripts. Both the flag to use the merge map and the merge map
// itself are taken from the AnalysisOpts passed as parameter.
//
// It returns the resulting script and true if the merge map is enabled and
// has an entry for the previous script followed by the current one;
// otherwise it returns an empty string and false.
//
// This mainly targets japanese chars: multiple scripts can be used in the
// same piece of text (Han, Hiragana and Katakana), so instead of starting a
// new range the caller adjusts the target script of the last range.
func (ta *TextAnalyzer) getMergeMapValue(opts AnalysisOpts, previous, current string) (string, bool) {
	if !opts.UseMergeMap {
		return "", false
	}
	// a missing "previous" key gives a nil inner map, whose lookup reports
	// the key as not found
	merged, found := opts.MergeMap[previous][current]
	return merged, found
}

// MergeCommon changes the "Common" and "Inherited" ranges to the script used
// in the previous script range (or in the next one if the text starts with
// them). The ranges will be readjusted and merged if they're adjacent.
// This naive approach should be good enough for normal use cases.
//
// The MergeMap is needed in case of the japanese language: the ranges
// "Han"-"Common"-"Katakana" become "Han"-"Katakana" once the "Common" range
// is absorbed, and those two ranges should be merged together into a single
// big range. Whenever the mergeMap has an entry for two adjacent ranges,
// both are merged into one range with the script given by the map.
// If the MergeMap isn't needed, use an empty one.
//
// Both ScriptRanges and RuneCount are updated. The runes of every merged
// range are counted for the script the resulting range ends with, even if
// the mergeMap gives a script different from the two merged ones. The
// "Common" and "Inherited" entries, as well as the entries that end up with
// 0 runes, are removed from RuneCount.
func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) {
	if len(tr.ScriptRanges) < 1 {
		// no ranges -> nothing to do
		return
	}

	// scripts that are absorbed by the neighbour ranges
	isNeutral := func(script string) bool {
		return script == "Common" || script == "Inherited"
	}
	// moveRunes transfers "count" runes from one script to another in the
	// rune count of the analysis
	moveRunes := func(from, to string, count int) {
		tr.RuneCount[from] -= count
		tr.RuneCount[to] += count
	}

	var finalRanges []ScriptRange
	previousRange := tr.ScriptRanges[0]
	for _, sRange := range tr.ScriptRanges[1:] {
		switch {
		case previousRange.TargetScript == sRange.TargetScript:
			// same script: the runes are already counted for it
		case isNeutral(sRange.TargetScript):
			// new range will be absorbed into the previous one
			moveRunes(sRange.TargetScript, previousRange.TargetScript, sRange.RuneCount)
		case isNeutral(previousRange.TargetScript):
			// might happen if the text starts with a Common script:
			// the previous range takes the script of the new one
			moveRunes(previousRange.TargetScript, sRange.TargetScript, previousRange.RuneCount)
			previousRange.TargetScript = sRange.TargetScript
		default:
			mapScript, isOk := tr.getMergeMapValue(mergeMap, previousRange.TargetScript, sRange.TargetScript)
			if !isOk {
				// the ranges can't be merged: the previous range is final
				finalRanges = append(finalRanges, previousRange)
				previousRange = sRange
				continue
			}
			// the runes of both ranges belong now to the script of the map,
			// which might be the previous one, the new one or another one
			moveRunes(previousRange.TargetScript, mapScript, previousRange.RuneCount)
			moveRunes(sRange.TargetScript, mapScript, sRange.RuneCount)
			previousRange.TargetScript = mapScript
		}

		// extend the previous range in order to cover the new one
		previousRange.High = sRange.High
		previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
		previousRange.RuneCount += sRange.RuneCount
	}

	finalRanges = append(finalRanges, previousRange)
	tr.ScriptRanges = finalRanges

	// "Common" and "Inherited" are dropped from the count even if a range
	// with those scripts remains (text without any other script)
	delete(tr.RuneCount, "Common")
	delete(tr.RuneCount, "Inherited")
	for script, rCount := range tr.RuneCount {
		if rCount == 0 {
			delete(tr.RuneCount, script)
		}
	}
}

// getMergeMapValue gets the value from the merge map based on the previous
// and current scripts. It returns the resulting script and true if the map
// has an entry for the previous script followed by the current one;
// otherwise it returns an empty string and false.
//
// This mainly targets japanese chars: multiple scripts can be used in the
// same piece of text (Han, Hiragana and Katakana), so instead of keeping
// separate ranges the caller adjusts the target script of the last range.
func (tr *TextAnalysis) getMergeMapValue(mMap MergeMap, previous, current string) (string, bool) {
	// a missing "previous" key gives a nil inner map, whose lookup reports
	// the key as not found
	merged, found := mMap[previous][current]
	return merged, found
}