From 23dde0bc9bd3e99c69d5dce4f09bba1687744106 Mon Sep 17 00:00:00 2001 From: Torsten Grote Date: Thu, 31 Jul 2025 15:36:31 -0300 Subject: [PATCH] [db] Better support search for CJK languages by inserting zero-width spaces (U+200B) between their characters to help the existing sqlite FTS tokenizers to split them up. We have considered splitting them up only at word boundaries, but after consulting native speakers decided to do splitting by chars instead. Doing this is a hack, but due to the limitations of tokenizers currently available with sqlite, we saw no better solution. While the ICU tokenizer is available as well, it doesn't handle diacritics in other languages. The zero-width spaces are added to text in zh, ja and ko locales when saving it to the database. It happens for app names, summaries and descriptions either when loading a full index or when applying diffs. Tests have been added for both cases. --- .../dbTest/java/org/fdroid/database/DbTest.kt | 7 +++- .../org/fdroid/database/IndexV2DiffTest.kt | 31 +++++++++++++++++ .../src/main/java/org/fdroid/database/App.kt | 33 +++++++++++++++++-- .../main/java/org/fdroid/database/AppDao.kt | 20 +++++++---- .../src/main/assets/diff-empty-max/1337.json | 9 +++-- .../src/main/assets/diff-empty-max/23.json | 9 +++-- .../src/main/assets/diff-empty-max/42.json | 9 +++-- .../src/main/assets/index-max-v2.json | 9 +++-- .../main/kotlin/org/fdroid/test/TestDataV2.kt | 3 ++ 9 files changed, 107 insertions(+), 23 deletions(-) diff --git a/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt b/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt index f4208e62a..e4b31214c 100644 --- a/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt +++ b/libs/database/src/dbTest/java/org/fdroid/database/DbTest.kt @@ -114,7 +114,12 @@ internal abstract class DbTest { assertEquals(sortedIndex.packages.size, appDao.countApps(), "number of packages") sortedIndex.packages.forEach { (packageName, packageV2) -> assertEquals( - 
packageV2.metadata, + // zero-whitespace hack needs to get applied to expected data + packageV2.metadata.copy( + name = packageV2.metadata.name.zero(), + summary = packageV2.metadata.summary.zero(), + description = packageV2.metadata.description.zero(), + ), appDao.getApp(repoId, packageName)?.toMetadataV2()?.sort() ) val versions = versionDao.getAppVersions(repoId, packageName).getOrFail().map { diff --git a/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt b/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt index d3d37c70a..f0e4db3c7 100644 --- a/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt +++ b/libs/database/src/dbTest/java/org/fdroid/database/IndexV2DiffTest.kt @@ -390,6 +390,37 @@ internal class IndexV2DiffTest : DbTest() { ) } + @Test + fun testMinAddChinese() { + val diffJson = """{ + "packages": { + "org.fdroid.min1": { + "metadata": { + "name": { "zh-CN": "自由软件仓库" }, + "summary": { "ja": "这个仓库中的" }, + "description": { "ko-KR": "切始终是从" } + } + } + } + }""".trimIndent() + testJsonDiff( + startPath = "index-min-v2.json", + diff = diffJson, + endIndex = TestDataMinV2.index.copy( + packages = TestDataMinV2.index.packages.mapValues { + it.value.copy( + metadata = it.value.metadata.copy( + // zero whitespaces (to separate tokens) will be added in testJsonDiff() + name = mapOf("zh-CN" to "自由软件仓库"), + summary = mapOf("ja" to "这个仓库中的"), + description = mapOf("ko-KR" to "切始终是从"), + ) + ) + } + ), + ) + } + private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2) { testDiff(startPath, ByteArrayInputStream(diff.toByteArray()), endIndex) } diff --git a/libs/database/src/main/java/org/fdroid/database/App.kt b/libs/database/src/main/java/org/fdroid/database/App.kt index 7c4fbcfd2..e5dfbf6cc 100644 --- a/libs/database/src/main/java/org/fdroid/database/App.kt +++ b/libs/database/src/main/java/org/fdroid/database/App.kt @@ -97,9 +97,9 @@ internal fun MetadataV2.toAppMetadata( packageName = 
packageName, added = added, lastUpdated = lastUpdated, - name = name, - summary = summary, - description = description, + name = name.zero(), + summary = summary.zero(), + description = description.zero(), localizedName = name.getBestLocale(locales), localizedSummary = summary.getBestLocale(locales), webSite = webSite, @@ -125,6 +125,33 @@ internal fun MetadataV2.toAppMetadata( isCompatible = isCompatible, ) +/** + * Introduce zero whitespace for CJK (Chinese, Japanese, Korean) languages. + * This is needed, because the sqlite tokenizers available to us either handle those languages + * or do diacritics removals. + * Since we can't remove diacritics here ourselves, + * we help the tokenizer for CJK languages instead. + */ +internal fun LocalizedTextV2?.zero(): LocalizedTextV2? { + if (this == null) return null + return toMutableMap().mapValues { (locale, text) -> + if (locale.startsWith("zh") || locale.startsWith("ja") || locale.startsWith("ko")) { + StringBuilder().apply { + text.forEachIndexed { i, char -> + if (Character.isIdeographic(char.code) && i + 1 < text.length) { + append(char) + append("\u200B") + } else { + append(char) + } + } + }.toString() + } else { + text + } + } +} + @Entity(tableName = AppMetadataFts.TABLE) @Fts4( contentEntity = AppMetadata::class, diff --git a/libs/database/src/main/java/org/fdroid/database/AppDao.kt b/libs/database/src/main/java/org/fdroid/database/AppDao.kt index 2e62839e7..2cc79e034 100644 --- a/libs/database/src/main/java/org/fdroid/database/AppDao.kt +++ b/libs/database/src/main/java/org/fdroid/database/AppDao.kt @@ -237,13 +237,19 @@ internal interface AppDaoInt : AppDao { } // diff metadata val diffedApp = applyDiff(metadata, jsonObject) - val updatedApp = - if (jsonObject.containsKey("name") || jsonObject.containsKey("summary")) { - diffedApp.copy( - localizedName = diffedApp.name.getBestLocale(locales), - localizedSummary = diffedApp.summary.getBestLocale(locales), - ) - } else diffedApp + val containsName = 
jsonObject.containsKey("name") + val containsSummary = jsonObject.containsKey("summary") + val containsDescription = jsonObject.containsKey("description") + val updatedApp = if (containsName || containsSummary || containsDescription) { + diffedApp.copy( + name = if (containsName) diffedApp.name.zero() else diffedApp.name, + summary = if (containsSummary) diffedApp.summary.zero() else diffedApp.summary, + description = if (containsDescription) diffedApp.description.zero() + else diffedApp.description, + localizedName = diffedApp.name.getBestLocale(locales), + localizedSummary = diffedApp.summary.getBestLocale(locales), + ) + } else diffedApp updateAppMetadata(updatedApp) // diff localizedFiles val localizedFiles = getLocalizedFiles(repoId, packageName) diff --git a/libs/sharedTest/src/main/assets/diff-empty-max/1337.json b/libs/sharedTest/src/main/assets/diff-empty-max/1337.json index a07819f0c..9a76377dd 100644 --- a/libs/sharedTest/src/main/assets/diff-empty-max/1337.json +++ b/libs/sharedTest/src/main/assets/diff-empty-max/1337.json @@ -320,15 +320,18 @@ "metadata": { "name": { "en-US": "App3", - "en": "en " + "en": "en ", + "zh-CN": "自由软件仓库" }, "summary": { "en-US": "App3 summary", - "en": "en " + "en": "en ", + "ja": "这个仓库中的" }, "description": { "en-US": "App3 description", - "en": "en " + "en": "en ", + "ko-KR": "切始终是从" }, "added": 1234567890, "lastUpdated": 9223372036854775807, diff --git a/libs/sharedTest/src/main/assets/diff-empty-max/23.json b/libs/sharedTest/src/main/assets/diff-empty-max/23.json index 51e398b82..314a6844b 100644 --- a/libs/sharedTest/src/main/assets/diff-empty-max/23.json +++ b/libs/sharedTest/src/main/assets/diff-empty-max/23.json @@ -596,15 +596,18 @@ "metadata": { "name": { "en-US": "App3", - "en": "en " + "en": "en ", + "zh-CN": "自由软件仓库" }, "summary": { "en-US": "App3 summary", - "en": "en " + "en": "en ", + "ja": "这个仓库中的" }, "description": { "en-US": "App3 description", - "en": "en " + "en": "en ", + "ko-KR": "切始终是从" }, "added": 
1234567890, "lastUpdated": 9223372036854775807, diff --git a/libs/sharedTest/src/main/assets/diff-empty-max/42.json b/libs/sharedTest/src/main/assets/diff-empty-max/42.json index aebb63085..ce3679463 100644 --- a/libs/sharedTest/src/main/assets/diff-empty-max/42.json +++ b/libs/sharedTest/src/main/assets/diff-empty-max/42.json @@ -593,15 +593,18 @@ "metadata": { "name": { "en-US": "App3", - "en": "en " + "en": "en ", + "zh-CN": "自由软件仓库" }, "summary": { "en-US": "App3 summary", - "en": "en " + "en": "en ", + "ja": "这个仓库中的" }, "description": { "en-US": "App3 description", - "en": "en " + "en": "en ", + "ko-KR": "切始终是从" }, "added": 1234567890, "lastUpdated": 9223372036854775807, diff --git a/libs/sharedTest/src/main/assets/index-max-v2.json b/libs/sharedTest/src/main/assets/index-max-v2.json index 6be919aac..4da00689b 100644 --- a/libs/sharedTest/src/main/assets/index-max-v2.json +++ b/libs/sharedTest/src/main/assets/index-max-v2.json @@ -603,15 +603,18 @@ "metadata": { "name": { "en-US": "App3", - "en": "en " + "en": "en ", + "zh-CN": "自由软件仓库" }, "summary": { "en-US": "App3 summary", - "en": "en " + "en": "en ", + "ja": "这个仓库中的" }, "description": { "en-US": "App3 description", - "en": "en " + "en": "en ", + "ko-KR": "切始终是从" }, "added": 1234567890, "lastUpdated": 9223372036854775807, diff --git a/libs/sharedTest/src/main/kotlin/org/fdroid/test/TestDataV2.kt b/libs/sharedTest/src/main/kotlin/org/fdroid/test/TestDataV2.kt index 6dcc611b5..de53c1b8a 100644 --- a/libs/sharedTest/src/main/kotlin/org/fdroid/test/TestDataV2.kt +++ b/libs/sharedTest/src/main/kotlin/org/fdroid/test/TestDataV2.kt @@ -1055,14 +1055,17 @@ object TestDataMaxV2 { name = mapOf( LOCALE to "App3", "en" to "en ", + "zh-CN" to "自由软件仓库", ), summary = mapOf( LOCALE to "App3 summary", "en" to "en ", + "ja" to "这个仓库中的", ), description = mapOf( LOCALE to "App3 description", "en" to "en ", + "ko-KR" to "切始终是从", ), added = 1234567890, lastUpdated = Long.MAX_VALUE,