[db] Better support search for CJK languages

by inserting zero-width spaces between their characters to help the existing SQLite FTS tokenizers split them up. We considered splitting only at word boundaries, but after consulting native speakers decided to split by character instead. This is a hack, but given the limitations of the tokenizers currently available with SQLite, we saw no better solution. While the ICU tokenizer is available as well, it doesn't handle diacritics in other languages. The zero-width spaces are added for the zh, ja and ko locales when saving their text to the database. This happens for app names, summaries and descriptions, both when loading a full index and when applying diffs. Tests have been added for both cases.
2026-05-19 14:10:38 -04:00 · 2025-07-31 15:36:31 -03:00
parent 0f3f2d6f80
commit 23dde0bc9b
9 changed files with 107 additions and 23 deletions
--- a/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt
+++ b/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt
@@ -114,7 +114,12 @@ internal abstract class DbTest {
        assertEquals(sortedIndex.packages.size, appDao.countApps(), "number of packages")
        sortedIndex.packages.forEach { (packageName, packageV2) ->
            assertEquals(
-                packageV2.metadata,
+                // zero-whitespace hack needs to get applied to expected data
+                packageV2.metadata.copy(
+                    name = packageV2.metadata.name.zero(),
+                    summary = packageV2.metadata.summary.zero(),
+                    description = packageV2.metadata.description.zero(),
+                ),
                appDao.getApp(repoId, packageName)?.toMetadataV2()?.sort()
            )
            val versions = versionDao.getAppVersions(repoId, packageName).getOrFail().map {
--- a/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt
+++ b/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt
@@ -390,6 +390,37 @@ internal class IndexV2DiffTest : DbTest() {
        )
    }

+    @Test
+    fun testMinAddChinese() {
+        val diffJson = """{
+          "packages": {
+            "org.fdroid.min1": {
+              "metadata": {
+                "name": { "zh-CN": "自由软件仓库" },
+                "summary": { "ja": "这个仓库中的" },
+                "description": { "ko-KR": "切始终是从" }
+              }
+            }
+          }
+        }""".trimIndent()
+        testJsonDiff(
+            startPath = "index-min-v2.json",
+            diff = diffJson,
+            endIndex = TestDataMinV2.index.copy(
+                packages = TestDataMinV2.index.packages.mapValues {
+                    it.value.copy(
+                        metadata = it.value.metadata.copy(
+                            // zero-width spaces (to separate tokens) get added in testJsonDiff()
+                            name = mapOf("zh-CN" to "自由软件仓库"),
+                            summary = mapOf("ja" to "这个仓库中的"),
+                            description = mapOf("ko-KR" to "切始终是从"),
+                        )
+                    )
+                }
+            ),
+        )
+    }
+
    private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2) {
        testDiff(startPath, ByteArrayInputStream(diff.toByteArray()), endIndex)
    }