Enter SpellingsDB

This commit is contained in:
Veloman Yunkan
2025-10-02 20:52:39 +04:00
parent b799c0648b
commit 286649e8c3
6 changed files with 311 additions and 1 deletions

View File

@@ -7,6 +7,7 @@ headers = [
'downloader.h',
'search_renderer.h',
'server.h',
'spelling_correction.h',
'kiwixserve.h',
'name_mapper.h',
'tools.h',

View File

@@ -0,0 +1,57 @@
/*
* Copyright (C) 2025 Veloman Yunkan
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#ifndef KIWIX_SPELLING_CORRECTION_H
#define KIWIX_SPELLING_CORRECTION_H
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
namespace zim
{
class Archive;
}
namespace Xapian
{
class Database;
}
namespace kiwix
{
/**
 * Spelling-correction database built from the entry titles of a ZIM archive.
 *
 * The constructor feeds the titles of the archive's entries into a Xapian
 * database located at the given path; getSpellingCorrections() then asks
 * that database for a suggestion.
 *
 * Objects of this class cannot be copied.
 */
class SpellingsDB
{
public: // functions
  /**
   * Opens (or creates) the Xapian database at `path` and registers the
   * titles of the entries of `archive` as known spellings.
   *
   * @param archive ZIM archive whose entry titles are used as spellings.
   * @param path    Filesystem location of the Xapian database.
   */
  explicit SpellingsDB(const zim::Archive& archive, std::string path);

  // Defined in the .cpp file because Xapian::Database is only
  // forward-declared in this header.
  ~SpellingsDB();

  SpellingsDB(const SpellingsDB& ) = delete;
  SpellingsDB& operator=(const SpellingsDB& ) = delete;

  /**
   * Returns spelling corrections for `word`.
   *
   * @param word     The (possibly misspelled) word to look up.
   * @param maxCount Upper bound on the number of corrections. Only a single
   *                 correction is currently supported.
   * @return A vector that is either empty (no suggestion found) or holds
   *         exactly one suggested spelling.
   * @throws std::runtime_error if `maxCount` is greater than 1.
   */
  std::vector<std::string> getSpellingCorrections(const std::string& word, std::uint32_t maxCount) const;

private: // data
  std::unique_ptr<Xapian::Database> impl_;
};
} // namespace kiwix
#endif // KIWIX_SPELLING_CORRECTION_H

View File

@@ -31,6 +31,7 @@ kiwix_sources = [
'server/internalServer_catalog.cpp',
'server/i18n.cpp',
'opds_catalog.cpp',
'spelling_correction.cpp',
'version.cpp'
]
kiwix_sources += lib_resources

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2025 Veloman Yunkan
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#include "spelling_correction.h"
#include "zim/archive.h"
#include <stdexcept>
#include <xapian.h>
namespace kiwix
{
namespace
{
// Gathers the title of every entry produced by the archive's by-path
// iteration, preserving the iteration order.
std::vector<std::string> getAllTitles(const zim::Archive& a)
{
  std::vector<std::string> titles;

  for (const auto& item : a.iterByPath()) {
    titles.emplace_back(item.getTitle());
  }

  return titles;
}
// Opens a writable glass-backend Xapian database at `path` and adds every
// title of `archive` to its spelling dictionary. The returned handle is the
// writable database itself, exposed through the Xapian::Database interface.
//
// NOTE(review): no create/overwrite action flag is passed, which per the
// Xapian documentation means "create or open". If a database already exists
// at `path`, the titles are therefore added to it again - confirm whether
// that is intended.
std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::string path, const zim::Archive& archive)
{
  auto db = std::make_unique<Xapian::WritableDatabase>(path, Xapian::DB_BACKEND_GLASS);

  for (const auto& title : getAllTitles(archive)) {
    db->add_spelling(title);
  }

  // Commit explicitly so that a failure to write the spellings is reported
  // here as an exception. Otherwise the commit would only happen implicitly
  // in the WritableDatabase destructor, where Xapian swallows any error.
  db->commit();

  // The local's type (unique_ptr<WritableDatabase>) differs from the return
  // type, so the move is spelled out to convert to unique_ptr<Database>.
  return std::move(db);
}
} // unnamed namespace
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::string path)
: impl_(openOrCreateXapianDB(path, archive))
{
}
// Kept out of line in this translation unit, where <xapian.h> is included,
// so that impl_ is destroyed with Xapian::Database being a complete type.
SpellingsDB::~SpellingsDB() = default;
// Returns at most `maxCount` spelling corrections for `word`.
//
// Only a single correction can currently be produced, so any request for
// more than one is rejected with std::runtime_error. The result is empty
// when `maxCount` is 0 or when the database has no suggestion for `word`;
// otherwise it holds exactly one suggested spelling.
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
  if ( maxCount > 1 ) {
    throw std::runtime_error("More than one spelling correction was requested");
  }

  // Honour the upper bound: zero requested corrections means an empty result.
  if ( maxCount == 0 ) {
    return {};
  }

  // Maximum edit distance passed to Xapian when looking for a suggestion.
  constexpr unsigned kMaxEditDistance = 3;

  std::vector<std::string> result;
  const auto term = impl_->get_spelling_suggestion(word, kMaxEditDistance);
  if ( !term.empty() ) {
    result.push_back(term);
  }
  return result;
}
} // namespace kiwix

View File

@@ -15,7 +15,8 @@ tests = [
'server_helper',
'lrucache',
'i18n',
'response'
'response',
'spelling_correction'
]
if build_machine.system() != 'windows'
@@ -42,6 +43,7 @@ if gtest_dep.found() and not meson.is_cross_build()
'zimfile_raycharles_uncategorized.zim',
'corner_cases#&.zim',
'poor.zim',
'spelling_correction_test.zim',
'library.xml',
'lib_for_server_search_test.xml',
'customized_resources.txt',

View File

@@ -0,0 +1,173 @@
/*
* Copyright (C) 2025 Veloman Yunkan
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "gtest/gtest.h"
#include "../include/spelling_correction.h"
#include "zim/archive.h"
#include <filesystem>
// Location of the throw-away spellings database used by the tests below.
const std::string TEST_DB_PATH{"./spellings.db"};
class SpellingCorrectionTest : public ::testing::Test
{
void removeDb()
{
std::filesystem::remove_all(TEST_DB_PATH);
}
protected:
void SetUp() override {
removeDb();
}
void TearDown() override {
removeDb();
}
};
// Checks that spellingsDB.getSpellingCorrections(query, maxSuggestions)
// equals the expected std::vector<std::string>. A variable named
// `spellingsDB` must be in scope at the point of use.
//
// The expected result is passed as a parenthesized braced list, e.g.
// ({"King", "Kong"}), so that commas inside the braces are not mistaken for
// macro-argument separators; after expansion the parentheses become the
// argument list of the vector's constructor.
#define EXPECT_SPELLING_CORRECTION(query, maxSuggestions, parenthesizedExpectedResult) \
EXPECT_EQ( \
spellingsDB.getSpellingCorrections(query, maxSuggestions), \
std::vector<std::string> parenthesizedExpectedResult \
)
// Builds a SpellingsDB at TEST_DB_PATH from the test archive and checks, for
// a list of misspelled queries, the single correction that is (or is not)
// returned, followed by edge cases and known limitations.
TEST_F(SpellingCorrectionTest, allInOne)
{
const auto archive = zim::Archive("./test/spelling_correction_test.zim");
kiwix::SpellingsDB spellingsDB(archive, TEST_DB_PATH);
// An empty query yields no correction
EXPECT_SPELLING_CORRECTION("", 1, ({}));
EXPECT_SPELLING_CORRECTION("geflekt", 1, ({"gefleckt"}));
EXPECT_SPELLING_CORRECTION("Teler", 1, ({"Teller"}));
// NOTE(review): identical to the preceding check - possibly a different
// misspelling was intended here.
EXPECT_SPELLING_CORRECTION("Teler", 1, ({"Teller"}));
EXPECT_SPELLING_CORRECTION("kämen", 1, ({"kämmen"}));
EXPECT_SPELLING_CORRECTION("abonieren", 1, ({"abonnieren"}));
EXPECT_SPELLING_CORRECTION("abbonnieren", 1, ({"abonnieren"}));
EXPECT_SPELLING_CORRECTION("abbonieren", 1, ({"abonnieren"}));
EXPECT_SPELLING_CORRECTION("Aplaus", 1, ({"Applaus"}));
EXPECT_SPELLING_CORRECTION("konkurieren", 1, ({"konkurrieren"}));
EXPECT_SPELLING_CORRECTION("Asisstent", 1, ({"Assistent"}));
EXPECT_SPELLING_CORRECTION("Assisstent", 1, ({"Assistent"}));
EXPECT_SPELLING_CORRECTION("Atacke", 1, ({"Attacke"}));
EXPECT_SPELLING_CORRECTION("atestieren", 1, ({"attestieren"}));
EXPECT_SPELLING_CORRECTION("entäuschen", 1, ({"enttäuschen"}));
EXPECT_SPELLING_CORRECTION("Enzündung", 1, ({"Entzündung"}));
EXPECT_SPELLING_CORRECTION("Schirmütze", 1, ({"Schirmmütze"}));
EXPECT_SPELLING_CORRECTION("Termoskanne", 1, ({"Thermoskanne"}));
EXPECT_SPELLING_CORRECTION("Tsunge", 1, ({"Zunge"}));
EXPECT_SPELLING_CORRECTION("vort", 1, ({"fort"}));
EXPECT_SPELLING_CORRECTION("Schtuhl", 1, ({"Stuhl"}));
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
// The spelling correction "Lax -> Lachs" is not returned because the max
// edit distance is capped at (length(query_word) - 1) which reduces our
// passed value of the max edit distance argument from 3 to 2. This
// change was brought by
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
// and first appears in v1.4.19 release of Xapian.
//EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
EXPECT_SPELLING_CORRECTION("Trok", 1, ({"Trog"}));
EXPECT_SPELLING_CORRECTION("Unfuk", 1, ({"Unfug"}));
EXPECT_SPELLING_CORRECTION("schupsen", 1, ({"schubsen"}));
EXPECT_SPELLING_CORRECTION("warscheinlich", 1, ({"wahrscheinlich"}));
EXPECT_SPELLING_CORRECTION("gefärlich", 1, ({"gefährlich"}));
EXPECT_SPELLING_CORRECTION("Son", 1, ({"Sohn"}));
EXPECT_SPELLING_CORRECTION("nähmlich", 1, ({"nämlich"}));
EXPECT_SPELLING_CORRECTION("Grahl", 1, ({"Gral"}));
EXPECT_SPELLING_CORRECTION("Bine", 1, ({"Biene"}));
EXPECT_SPELLING_CORRECTION("Hirarchie", 1, ({"Hierarchie"}));
EXPECT_SPELLING_CORRECTION("Priese", 1, ({"Prise"}));
EXPECT_SPELLING_CORRECTION("auslehren", 1, ({"ausleeren"}));
EXPECT_SPELLING_CORRECTION("Phenomen", 1, ({"Phänomen"}));
EXPECT_SPELLING_CORRECTION("Phänomän", 1, ({"Phänomen"}));
EXPECT_SPELLING_CORRECTION("Phenomän", 1, ({"Phänomen"}));
EXPECT_SPELLING_CORRECTION("gewehren", 1, ({"gewähren"}));
EXPECT_SPELLING_CORRECTION("aba", 1, ({"aber"}));
EXPECT_SPELLING_CORRECTION("gestan", 1, ({"gestern"}));
EXPECT_SPELLING_CORRECTION("ronterfallen", 1, ({"runterfallen"}));
EXPECT_SPELLING_CORRECTION("Hönig", 1, ({"Honig"}));
EXPECT_SPELLING_CORRECTION("mussen", 1, ({"müssen"}));
EXPECT_SPELLING_CORRECTION("Bewandnis", 1, ({"Bewandtnis"}));
EXPECT_SPELLING_CORRECTION("hässlig", 1, ({"hässlich"}));
EXPECT_SPELLING_CORRECTION("lustich", 1, ({"lustig"}));
EXPECT_SPELLING_CORRECTION("Botschaftler", 1, ({"Botschafter"}));
EXPECT_SPELLING_CORRECTION("ebemfalls", 1, ({"ebenfalls"}));
EXPECT_SPELLING_CORRECTION("samft", 1, ({"sanft"}));
EXPECT_SPELLING_CORRECTION("Wohenzimmer", 1, ({"Wohnzimmer"}));
EXPECT_SPELLING_CORRECTION("Flaster", 1, ({"Pflaster"}));
EXPECT_SPELLING_CORRECTION("Imfung", 1, ({"Impfung"}));
EXPECT_SPELLING_CORRECTION("amptieren", 1, ({"amtieren"}));
EXPECT_SPELLING_CORRECTION("Endgeld", 1, ({"Entgelt"}));
EXPECT_SPELLING_CORRECTION("Abendteuer", 1, ({"Abenteuer"}));
EXPECT_SPELLING_CORRECTION("sampft", 1, ({"sanft"}));
EXPECT_SPELLING_CORRECTION("forgestan", 1, ({"vorgestern"}));
EXPECT_SPELLING_CORRECTION("Füreschein", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("ronterfalen", 1, ({"runterfallen"}));
EXPECT_SPELLING_CORRECTION("Farradschluss", 1, ({"Fahrradschloss"}));
EXPECT_SPELLING_CORRECTION("Konkorenz", 1, ({"Konkurrenz"}));
EXPECT_SPELLING_CORRECTION("Hirachie", 1, ({"Hierarchie"}));
//////////////////////////////////////////////////////////////////////////////
// Edge cases
//////////////////////////////////////////////////////////////////////////////
// Exact match is not considered a spelling correction
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
// Max edit distance is 3
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
// Case matters in edit distance
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
// Diacritics matters in edit distance
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
// of a Unicode codepoint).
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
//////////////////////////////////////////////////////////////////////////////
// Shortcomings of the proof-of-concept implementation
//////////////////////////////////////////////////////////////////////////////
// Multiword titles are treated as a single entity
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
// Only one spelling correction can be requested
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
// Asking for two corrections is rejected with std::runtime_error
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
}