[db] fix diff bug where zero-whitespace gets added more than once

This bug only affects CJK languages. Apart from DB growth, the symptom is that word filtering in app lists doesn't find affected apps, because we look for a single whitespace between tokens.
This commit is contained in:
Torsten Grote
2026-03-18 15:49:00 -03:00
parent bd8d1b9261
commit 34b3fa31f4
3 changed files with 82 additions and 27 deletions

View File

@@ -14,6 +14,7 @@ import org.fdroid.test.TestDataMaxV2.PACKAGE_NAME_3
import org.fdroid.test.TestDataMaxV2.app3
import org.fdroid.test.TestDataMidV2
import org.fdroid.test.TestDataMinV2
import org.fdroid.test.TestDataMinV2.PACKAGE_NAME
import org.fdroid.test.TestUtils.getRes
import org.junit.Ignore
import org.junit.Test
@@ -31,52 +32,58 @@ internal class IndexV2DiffTest : DbTest() {
}
@Test
fun testEmptyToMin() =
fun testEmptyToMin() {
testDiff(
startPath = "index-empty-v2.json",
diffPath = "diff-empty-min/23.json",
endIndex = TestDataMinV2.index,
)
}
@Test
fun testEmptyToMid() =
fun testEmptyToMid() {
testDiff(
startPath = "index-empty-v2.json",
diffPath = "diff-empty-mid/23.json",
endIndex = TestDataMidV2.index,
)
}
@Test
fun testEmptyToMax() =
fun testEmptyToMax() {
testDiff(
startPath = "index-empty-v2.json",
diffPath = "diff-empty-max/23.json",
endIndex = TestDataMaxV2.index,
)
}
@Test
fun testMinToMid() =
fun testMinToMid() {
testDiff(
startPath = "index-min-v2.json",
diffPath = "diff-empty-mid/42.json",
endIndex = TestDataMidV2.index,
)
}
@Test
fun testMinToMax() =
fun testMinToMax() {
testDiff(
startPath = "index-min-v2.json",
diffPath = "diff-empty-max/42.json",
endIndex = TestDataMaxV2.index,
)
}
@Test
fun testMidToMax() =
fun testMidToMax() {
testDiff(
startPath = "index-mid-v2.json",
diffPath = "diff-empty-max/1337.json",
endIndex = TestDataMaxV2.index,
)
}
@Test
fun testMinRemoveApp() {
@@ -450,20 +457,55 @@ internal class IndexV2DiffTest : DbTest() {
}
"""
.trimIndent()
testJsonDiff(
startPath = "index-min-v2.json",
diff = diffJson,
endIndex =
val metadata =
TestDataMinV2.index.packages[PACKAGE_NAME]!!
.metadata
.copy(
// zero whitespaces (to separate tokens) will be added in testJsonDiff()
name = mapOf("zh-CN" to "自由软件仓库"),
summary = mapOf("ja" to "这个仓库中的"),
description = mapOf("ko-KR" to "切始终是从"),
)
val endIndex =
TestDataMinV2.index.copy(
packages = TestDataMinV2.index.packages.mapValues { it.value.copy(metadata = metadata) }
)
val repoId = testJsonDiff(startPath = "index-min-v2.json", diff = diffJson, endIndex = endIndex)
// now apply another diff to ensure we don't add zero whitespace multiple times
val newDiffJson =
"""
{
"packages": {
"org.fdroid.min1": {
"metadata": {
"name": { "en-US": "foo bar" },
"summary": { "en-US": "foo bar" },
"description": { "en-US": "foo bar" }
}
}
}
}
"""
.trimIndent()
// apply diff stream to the DB
val streamReceiver = DbV2DiffStreamReceiver(db, repoId) { true }
val streamProcessor = IndexV2DiffStreamProcessor(streamReceiver)
val diffStream = ByteArrayInputStream(newDiffJson.toByteArray())
db.runInTransaction { streamProcessor.process(42, diffStream) {} }
// assert that changed DB data is equal to given endIndex
assertDbEquals(
repoId = repoId,
index =
TestDataMinV2.index.copy(
packages =
TestDataMinV2.index.packages.mapValues {
it.value.copy(
metadata =
it.value.metadata.copy(
// zero whitespaces (to separate tokens) will be added in testJsonDiff()
name = mapOf("zh-CN" to "自由软件仓库"),
summary = mapOf("ja" to "这个仓库中的"),
description = mapOf("ko-KR" to "切始终是从"),
metadata.copy(
name = mapOf("en-US" to "foo bar", "zh-CN" to "自由软件仓库"),
summary = mapOf("en-US" to "foo bar", "ja" to "这个仓库中的"),
description = mapOf("en-US" to "foo bar", "ko-KR" to "切始终是从"),
)
)
}
@@ -471,15 +513,15 @@ internal class IndexV2DiffTest : DbTest() {
)
}
private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2) {
testDiff(startPath, ByteArrayInputStream(diff.toByteArray()), endIndex)
private fun testJsonDiff(startPath: String, diff: String, endIndex: IndexV2): Long {
return testDiff(startPath, ByteArrayInputStream(diff.toByteArray()), endIndex)
}
private fun testDiff(startPath: String, diffPath: String, endIndex: IndexV2) {
testDiff(startPath, getRes(diffPath), endIndex)
private fun testDiff(startPath: String, diffPath: String, endIndex: IndexV2): Long {
return testDiff(startPath, getRes(diffPath), endIndex)
}
private fun testDiff(startPath: String, diffStream: InputStream, endIndex: IndexV2) {
private fun testDiff(startPath: String, diffStream: InputStream, endIndex: IndexV2): Long {
// stream start index into the DB
val repoId = streamIndexV2IntoDb(startPath)
@@ -489,5 +531,6 @@ internal class IndexV2DiffTest : DbTest() {
db.runInTransaction { streamProcessor.process(42, diffStream) {} }
// assert that changed DB data is equal to given endIndex
assertDbEquals(repoId, endIndex)
return repoId
}
}

View File

@@ -140,9 +140,10 @@ internal fun MetadataV2.toAppMetadata(
* the SQLite tokenizers available to us either handle those languages or do diacritics removal.
* Since we can't remove diacritics here ourselves, we help the tokenizer for CJK languages instead.
*/
internal fun LocalizedTextV2?.zero(): LocalizedTextV2? {
internal fun LocalizedTextV2?.zero(localeAllowList: Set<String>? = null): LocalizedTextV2? {
if (this == null) return null
return toMutableMap().mapValues { (locale, text) ->
if (localeAllowList != null && locale !in localeAllowList) return@mapValues text
if (locale.startsWith("zh") || locale.startsWith("ja") || locale.startsWith("ko")) {
StringBuilder()
.apply {

View File

@@ -28,6 +28,7 @@ import kotlinx.serialization.SerializationException
import kotlinx.serialization.json.JsonNull
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.decodeFromJsonElement
import kotlinx.serialization.json.jsonObject
import org.fdroid.LocaleChooser.getBestLocale
import org.fdroid.database.AppListSortOrder.LAST_UPDATED
import org.fdroid.database.AppListSortOrder.NAME
@@ -280,16 +281,26 @@ internal interface AppDaoInt : AppDao {
}
// diff metadata
val diffedApp = applyDiff(metadata, jsonObject)
val containsName = jsonObject.containsKey("name")
val containsSummary = jsonObject.containsKey("summary")
val containsDescription = jsonObject.containsKey("description")
val containsName = jsonObject["name"] is JsonObject
val containsSummary = jsonObject["summary"] is JsonObject
val containsDescription = jsonObject["description"] is JsonObject
val updatedApp =
if (containsName || containsSummary || containsDescription) {
// applies zero whitespace hack (needed for Fts search) for new/changed locales only
// also updates localizedName and localizedSummary cache
diffedApp.copy(
name = if (containsName) diffedApp.name.zero() else diffedApp.name,
summary = if (containsSummary) diffedApp.summary.zero() else diffedApp.summary,
name =
if (containsName) {
diffedApp.name.zero(jsonObject["name"]?.jsonObject?.keys)
} else diffedApp.name,
summary =
if (containsSummary) {
diffedApp.summary.zero(jsonObject["summary"]?.jsonObject?.keys)
} else diffedApp.summary,
description =
if (containsDescription) diffedApp.description.zero() else diffedApp.description,
if (containsDescription) {
diffedApp.description.zero(jsonObject["description"]?.jsonObject?.keys)
} else diffedApp.description,
localizedName = diffedApp.name.getBestLocale(locales),
localizedSummary = diffedApp.summary.getBestLocale(locales),
)