Use nuspell for spelling correction

This is an initial version of nuspell-based spelling correction,
which has yet to be tuned.

Note that libnuspell must be available as a dependency.

Xapian-based code for spelling correction is not deleted.
This commit is contained in:
Veloman Yunkan
2025-11-21 18:29:19 +04:00
parent 33f22eb966
commit 88d8f2788e
4 changed files with 53 additions and 39 deletions

View File

@@ -35,6 +35,14 @@ namespace Xapian
class Database;
}
// Forward declaration of nuspell::Dictionary, so that a smart pointer to it
// can be declared without including the nuspell headers here.
// NOTE(review): the inline namespace name (v5) presumably has to match the one
// used by the installed nuspell headers - confirm when upgrading nuspell.
namespace nuspell {
inline namespace v5 {

class Dictionary;

} // inline namespace v5
} // namespace nuspell
namespace kiwix
{
@@ -51,6 +59,7 @@ public: // functions
private: // data
std::unique_ptr<Xapian::Database> impl_;
std::unique_ptr<nuspell::Dictionary> nuspell_;
};
} // namespace kiwix

View File

@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
libnuspell_dep = dependency('libnuspell', static:static_deps)
if compiler.has_header('mustache.hpp')
extra_include = []
@@ -94,7 +95,7 @@ endif
# Dependencies as string
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
# Dependencies as array
all_deps += libicu_deps

View File

@@ -24,6 +24,7 @@
#include <stdexcept>
#include <xapian.h>
#include <nuspell/dictionary.hxx>
namespace kiwix
{
@@ -80,10 +81,30 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
}
}
// Contents of the in-memory affix (.aff) data passed to nuspell together with
// the generated word list.
// Per the Hunspell affix file format (see hunspell(5)), SET declares the
// character encoding of the dictionary and TRY lists characters that are
// tried when generating suggestions.
// NOTE(review): the TRY list contains ASCII letters only; no accented letters
// are listed.
const char nuspellAffFileData[] = R"(
SET UTF-8
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
)";
// Builds an in-memory nuspell dictionary whose word list consists of the
// titles of the given archive (as returned by getAllTitles()).
//
// The affix part is taken from nuspellAffFileData. The word-list (.dic) part
// is generated on the fly: a first line holding the number of entries,
// followed by one title per line.
//
// In the Hunspell .dic format an unescaped '/' separates a word from its
// flags, so a '/' occurring inside a title is written as "\/". Without the
// escape such a title would be cut at the slash and the rest of it would be
// parsed as flags.
//
// No error handling is done here; anything thrown while loading the
// dictionary propagates to the caller.
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
{
  const auto& allTitles = getAllTitles(archive);

  std::stringstream dicSS;
  dicSS << allTitles.size() << "\n";
  for ( const auto& t : allTitles ) {
    for ( const char c : t ) {
      if ( c == '/' ) {
        // Escape the flag separator so that it stays part of the word
        dicSS << '\\';
      }
      dicSS << c;
    }
    dicSS << "\n";
  }

  std::istringstream affSS(nuspellAffFileData);
  auto d = std::make_unique<nuspell::Dictionary>();
  d->load_aff_dic(affSS, dicSS);
  return d;
}
} // unnamed namespace
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
, nuspell_(createNuspellDictionary(archive))
{
}
@@ -93,14 +114,13 @@ SpellingsDB::~SpellingsDB()
// Returns spelling corrections for `word`, limited to `maxCount` entries.
// NOTE(review): this hunk is rendered without +/- markers, so lines removed by
// the commit and the lines replacing them appear interleaved (the braces do
// not balance as shown). Presumably the `maxCount > 1` guard and the Xapian
// get_spelling_suggestion() path are the removed part, and the nuspell
// suggest() call with the truncation and exact-match handling is the added
// part - confirm against the raw diff.
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
// Rejects requests for more than one correction
if ( maxCount > 1 ) {
throw std::runtime_error("More than one spelling correction was requested");
}
std::vector<std::string> result;
// Xapian-based lookup; 3 is the max edit distance argument
const auto term = impl_->get_spelling_suggestion(word, 3);
if ( !term.empty() ) {
result.push_back(term);
// nuspell-based lookup; suggestions are delivered through `result`
nuspell_->suggest(word, result);
// Keep at most maxCount suggestions
if ( result.size() > maxCount ) {
result.resize(maxCount);
}
// A lone suggestion identical to the input is an exact match, which is not
// reported as a spelling correction
if ( result.size() == 1 && result[0] == word ) {
result.clear();
}
return result;
}

View File

@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
// The spelling correction "Lax -> Lachs" is affected by commit
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
// which caps the edit distance at (length(query_word) - 1). As a result, the
// max edit distance parameter that we pass into get_spelling_suggestion() is
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
Xapian::minor_version(),
Xapian::revision());
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
} else {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
}
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
// Exact match is not considered a spelling correction
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
// Max edit distance is 3
// Max edit distance can be quite large
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
// Case matters in edit distance
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("h", 1, ({}));
// Case doesn't matter in edit distance
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
// Diacritics matter in edit distance
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
// of a Unicode codepoint).
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
// More than one spelling correction can be requested
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
//////////////////////////////////////////////////////////////////////////////
// Shortcomings of the proof-of-concept implementation
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
// Only one spelling correction can be requested
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
}
using StrCollection = std::vector<std::string>;