[db] Better support search for CJK languages

by inserting zero-width whitespace between their characters, which helps the existing SQLite FTS tokenizers split them up.

We considered splitting them up only at word boundaries, but after consulting native speakers we decided to split by character instead.

Doing this is a hack, but due to the limitations of tokenizers currently available with sqlite, we saw no better solution. While the ICU tokenizer is available as well, it doesn't handle diacritics in other languages.

The zero-width whitespace is added to text in the zh, ja and ko locales when it is saved to the database. This happens for app names, summaries and descriptions, both when loading a full index and when applying diffs. Tests have been added for both cases.
This commit is contained in:
Torsten Grote
2025-07-31 15:36:31 -03:00
parent 0f3f2d6f80
commit 23dde0bc9b
9 changed files with 107 additions and 23 deletions

View File

@@ -114,7 +114,12 @@ internal abstract class DbTest {
assertEquals(sortedIndex.packages.size, appDao.countApps(), "number of packages")
sortedIndex.packages.forEach { (packageName, packageV2) ->
assertEquals(
packageV2.metadata,
// zero-whitespace hack needs to get applied to expected data
packageV2.metadata.copy(
name = packageV2.metadata.name.zero(),
summary = packageV2.metadata.summary.zero(),
description = packageV2.metadata.description.zero(),
),
appDao.getApp(repoId, packageName)?.toMetadataV2()?.sort()
)
val versions = versionDao.getAppVersions(repoId, packageName).getOrFail().map {

View File

@@ -390,6 +390,37 @@ internal class IndexV2DiffTest : DbTest() {
)
}
/**
 * Applies a diff that sets CJK-locale texts (a `zh-CN` name, a `ja` summary and a
 * `ko-KR` description) for `org.fdroid.min1` on top of `index-min-v2.json` and
 * checks the outcome against [TestDataMinV2] with the same texts in its metadata.
 */
@Test
fun testMinAddChinese() {
    val diffJson = """{
"packages": {
"org.fdroid.min1": {
"metadata": {
"name": { "zh-CN": "自由软件仓库" },
"summary": { "ja": "这个仓库中的" },
"description": { "ko-KR": "切始终是从" }
}
}
}
}""".trimIndent()
    // The expected texts are written without separators here:
    // zero whitespaces (to separate tokens) will be added in testJsonDiff()
    val expectedPackages = TestDataMinV2.index.packages.mapValues { (_, pkg) ->
        val cjkMetadata = pkg.metadata.copy(
            name = mapOf("zh-CN" to "自由软件仓库"),
            summary = mapOf("ja" to "这个仓库中的"),
            description = mapOf("ko-KR" to "切始终是从"),
        )
        pkg.copy(metadata = cjkMetadata)
    }
    testJsonDiff(
        startPath = "index-min-v2.json",
        diff = diffJson,
        endIndex = TestDataMinV2.index.copy(packages = expectedPackages),
    )
}
/**
 * Convenience wrapper around [testDiff] for diffs given as a JSON string.
 *
 * @param startPath passed through to [testDiff] unchanged.
 * @param diff the JSON diff text; it is handed to [testDiff] as a byte stream.
 * @param endIndex passed through to [testDiff] unchanged.
 */
private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2) {
    val diffStream = ByteArrayInputStream(diff.toByteArray())
    testDiff(startPath, diffStream, endIndex)
}