Compare commits

...

2 Commits

Author SHA1 Message Date
Veloman Yunkan
68c9702772 User affix file can be used for spelling correction
This is a temporary change to facilitate playing with different affix
rules.
2025-11-24 17:53:04 +04:00
Veloman Yunkan
88d8f2788e Use nuspell for spelling correction
This is the initial version of using nuspell for spelling correction,
which has yet to be tuned.

Note that libnuspell must be available as a dependency.

Xapian-based code for spelling correction is not deleted.
2025-11-21 18:32:46 +04:00
4 changed files with 63 additions and 39 deletions

View File

@@ -35,6 +35,14 @@ namespace Xapian
class Database;
}
// Forward declaration of nuspell::Dictionary, so that this header can refer
// to the type without including the nuspell headers.
// NOTE(review): the inline namespace `v5` presumably has to mirror the
// versioned inline namespace of the installed libnuspell - confirm this when
// the library is upgraded.
namespace nuspell
{
inline namespace v5
{
class Dictionary;
}
}
namespace kiwix
{
@@ -51,6 +59,7 @@ public: // functions
private: // data
std::unique_ptr<Xapian::Database> impl_;
std::unique_ptr<nuspell::Dictionary> nuspell_;
};
} // namespace kiwix

View File

@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
libnuspell_dep = dependency('libnuspell', static:static_deps)
if compiler.has_header('mustache.hpp')
extra_include = []
@@ -94,7 +95,7 @@ endif
# Dependencies as string
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
# Dependencies as array
all_deps += libicu_deps

View File

@@ -20,10 +20,12 @@
#include "spelling_correction.h"
#include "zim/archive.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <xapian.h>
#include <nuspell/dictionary.hxx>
namespace kiwix
{
@@ -80,10 +82,39 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
}
}
// Built-in affix (.aff) data that is used when no user affix file is
// configured. The raw string literal is kept byte-for-byte (including the
// newline right after the opening delimiter).
const char nuspellAffFileData[] = R"(
SET UTF-8
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
)";

// Returns a stream from which the affix data must be read.
//
// If the environment variable KIWIX_NUSPELL_AFF_FILE_PATH is set, the file at
// that path is opened and returned; otherwise a stream over the built-in
// nuspellAffFileData is returned.
//
// Throws std::runtime_error if the user-supplied file cannot be opened, so
// that a mistyped path is reported instead of silently producing a stream
// that yields no affix data.
std::unique_ptr<std::istream> getAffDataStream()
{
  const char* const userAffFilePath = ::getenv("KIWIX_NUSPELL_AFF_FILE_PATH");
  if ( userAffFilePath == nullptr ) {
    return std::make_unique<std::istringstream>(nuspellAffFileData);
  }

  std::unique_ptr<std::istream> userAffFile = std::make_unique<std::ifstream>(userAffFilePath);
  // A failed open leaves the stream in a failed state
  if ( !*userAffFile ) {
    throw std::runtime_error(std::string("Cannot open nuspell affix file: ") + userAffFilePath);
  }
  return userAffFile;
}
// Writes `word` as one line of .dic data. In the .dic format an unescaped '/'
// separates the word from its flags, therefore every '/' belonging to the
// word itself is written as "\/".
void writeEscapedDicWord(std::ostream& dic, const std::string& word)
{
  for ( const char c : word ) {
    if ( c == '/' ) {
      dic << '\\';
    }
    dic << c;
  }
  dic << "\n";
}

// Creates a nuspell dictionary whose word list consists of the titles
// returned by getAllTitles() for the given archive. The affix data is taken
// from getAffDataStream().
// NOTE(review): assumes that each title is (convertible to) a std::string -
// confirm against the declaration of getAllTitles().
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
{
  const auto& allTitles = getAllTitles(archive);

  // .dic data: the entry count on the first line, then one word per line
  std::stringstream dicSS;
  dicSS << allTitles.size() << "\n";
  for ( const auto& t : allTitles ) {
    writeEscapedDicWord(dicSS, t);
  }

  auto d = std::make_unique<nuspell::Dictionary>();
  d->load_aff_dic(*getAffDataStream(), dicSS);
  return d;
}
} // unnamed namespace
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
, nuspell_(createNuspellDictionary(archive))
{
}
@@ -93,14 +124,13 @@ SpellingsDB::~SpellingsDB()
// Returns spelling corrections for `word` as a vector of strings.
// NOTE(review): this span comes from a diff view in which the +/- markers are
// not visible, so statements of the previous Xapian-based implementation and
// of the new nuspell-based one appear interleaved - confirm against the
// actual commit which lines belong to the current code.
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
// Rejects requests for more than one correction.
if ( maxCount > 1 ) {
throw std::runtime_error("More than one spelling correction was requested");
}
std::vector<std::string> result;
// Xapian lookup; the second argument (3) is the max edit distance.
const auto term = impl_->get_spelling_suggestion(word, 3);
if ( !term.empty() ) {
result.push_back(term);
// nuspell lookup; presumably the suggestions are written into `result`.
nuspell_->suggest(word, result);
// Keep at most maxCount suggestions.
if ( result.size() > maxCount ) {
result.resize(maxCount);
}
// A single suggestion that equals the input word is not reported.
if ( result.size() == 1 && result[0] == word ) {
result.clear();
}
return result;
}

View File

@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
// The spelling correction "Lax -> Lachs" is affected by commit
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
// which caps the edit distance at (length(query_word) - 1). As a result, the
// max edit distance parameter that we pass into get_spelling_suggestion() is
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
Xapian::minor_version(),
Xapian::revision());
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
} else {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
}
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
// Exact match is not considered a spelling correction
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
// Max edit distance is 3
// Max edit distance can be quite large
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
// Case matters in edit distance
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("h", 1, ({}));
// Case doesn't matter in edit distance
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
// Diacritics matters in edit distance
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
// of a Unicode codepoint).
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
// More than one spelling correction can be requested
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
//////////////////////////////////////////////////////////////////////////////
// Shortcomings of the proof-of-concept implementation
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
// Only one spelling correction can be requested
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
}
using StrCollection = std::vector<std::string>;