mirror of
https://github.com/f-droid/fdroidclient.git
synced 2026-05-19 14:10:38 -04:00
[db] Better support search for CJK languages
by inserting zero-width spaces between their characters to help the existing SQLite FTS tokenizers split them up. We considered splitting only at word boundaries, but after consulting native speakers decided to split by character instead. This is a hack, but given the limitations of the tokenizers currently available with SQLite, we saw no better solution. While the ICU tokenizer is also available, it doesn't handle diacritics in other languages. The zero-width spaces are added to text in the zh, ja and ko locales when it is saved to the database. This happens for app names, summaries and descriptions, both when loading a full index and when applying diffs. Tests have been added for both cases.
This commit is contained in:
@@ -114,7 +114,12 @@ internal abstract class DbTest {
|
||||
assertEquals(sortedIndex.packages.size, appDao.countApps(), "number of packages")
|
||||
sortedIndex.packages.forEach { (packageName, packageV2) ->
|
||||
assertEquals(
|
||||
packageV2.metadata,
|
||||
// zero-whitespace hack needs to get applied to expected data
|
||||
packageV2.metadata.copy(
|
||||
name = packageV2.metadata.name.zero(),
|
||||
summary = packageV2.metadata.summary.zero(),
|
||||
description = packageV2.metadata.description.zero(),
|
||||
),
|
||||
appDao.getApp(repoId, packageName)?.toMetadataV2()?.sort()
|
||||
)
|
||||
val versions = versionDao.getAppVersions(repoId, packageName).getOrFail().map {
|
||||
|
||||
@@ -390,6 +390,37 @@ internal class IndexV2DiffTest : DbTest() {
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Applies a JSON diff on top of "index-min-v2.json" that sets CJK text for the
 * name (zh-CN), summary (ja) and description (ko-KR) of package
 * "org.fdroid.min1", and hands the expected resulting index to [testJsonDiff].
 *
 * The expected index is [TestDataMinV2.index] with name, summary and
 * description of every package replaced by the same three localized strings.
 * NOTE(review): this presumably relies on the min index containing only
 * "org.fdroid.min1" - confirm against TestDataMinV2.
 */
@Test
|
||||
fun testMinAddChinese() {
|
||||
val diffJson = """{
|
||||
"packages": {
|
||||
"org.fdroid.min1": {
|
||||
"metadata": {
|
||||
"name": { "zh-CN": "自由软件仓库" },
|
||||
"summary": { "ja": "这个仓库中的" },
|
||||
"description": { "ko-KR": "切始终是从" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}""".trimIndent()
|
||||
testJsonDiff(
|
||||
startPath = "index-min-v2.json",
|
||||
diff = diffJson,
|
||||
endIndex = TestDataMinV2.index.copy(
|
||||
packages = TestDataMinV2.index.packages.mapValues {
|
||||
it.value.copy(
|
||||
metadata = it.value.metadata.copy(
|
||||
// expected text is listed here without separators: zero whitespaces
|
||||
// (to separate tokens) will be added in testJsonDiff()
|
||||
name = mapOf("zh-CN" to "自由软件仓库"),
|
||||
summary = mapOf("ja" to "这个仓库中的"),
|
||||
description = mapOf("ko-KR" to "切始终是从"),
|
||||
)
|
||||
)
|
||||
}
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper around `testDiff` for diffs given as a JSON string:
 * the string is converted with `toByteArray()` and wrapped in a
 * [ByteArrayInputStream] before being passed on.
 *
 * @param startPath forwarded unchanged to `testDiff`; presumably the resource
 * path of the index the diff is applied to - confirm against `testDiff`.
 * @param diff the diff as a JSON string.
 * @param endIndex forwarded unchanged to `testDiff`; presumably the index
 * expected after applying the diff - confirm against `testDiff`.
 */
private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2) {
|
||||
testDiff(startPath, ByteArrayInputStream(diff.toByteArray()), endIndex)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user