mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-12-23 22:47:57 -05:00
Use nuspell for spelling correction
This is the initial version of using nuspell for spelling correction, which yet has to be tuned. Note that libnuspell must be available as a dependency. Xapian-based code for spelling correction is not deleted.
This commit is contained in:
@@ -35,6 +35,14 @@ namespace Xapian
|
||||
class Database;
|
||||
}
|
||||
|
||||
namespace nuspell
|
||||
{
|
||||
inline namespace v5
|
||||
{
|
||||
class Dictionary;
|
||||
}
|
||||
}
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
|
||||
@@ -51,6 +59,7 @@ public: // functions
|
||||
|
||||
private: // data
|
||||
std::unique_ptr<Xapian::Database> impl_;
|
||||
std::unique_ptr<nuspell::Dictionary> nuspell_;
|
||||
};
|
||||
|
||||
} // namespace kiwix
|
||||
|
||||
@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
|
||||
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
|
||||
zlib_dep = dependency('zlib', static:static_deps)
|
||||
xapian_dep = dependency('xapian-core', static:static_deps)
|
||||
libnuspell_dep = dependency('libnuspell', static:static_deps)
|
||||
|
||||
if compiler.has_header('mustache.hpp')
|
||||
extra_include = []
|
||||
@@ -94,7 +95,7 @@ endif
|
||||
|
||||
|
||||
# Dependencies as string
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
|
||||
|
||||
# Dependencies as array
|
||||
all_deps += libicu_deps
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include <stdexcept>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <nuspell/dictionary.hxx>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
@@ -80,10 +81,30 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
|
||||
}
|
||||
}
|
||||
|
||||
const char nuspellAffFileData[] = R"(
|
||||
SET UTF-8
|
||||
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
|
||||
)";
|
||||
|
||||
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
|
||||
{
|
||||
auto d = std::make_unique<nuspell::Dictionary>();
|
||||
const auto& allTitles = getAllTitles(archive);
|
||||
std::istringstream affSS(nuspellAffFileData);
|
||||
std::stringstream dicSS;
|
||||
dicSS << allTitles.size() << "\n";
|
||||
for ( const auto& t : allTitles ) {
|
||||
dicSS << t << "\n";
|
||||
}
|
||||
d->load_aff_dic(affSS, dicSS);
|
||||
return d;
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
|
||||
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
|
||||
, nuspell_(createNuspellDictionary(archive))
|
||||
{
|
||||
}
|
||||
|
||||
@@ -93,14 +114,13 @@ SpellingsDB::~SpellingsDB()
|
||||
|
||||
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
|
||||
{
|
||||
if ( maxCount > 1 ) {
|
||||
throw std::runtime_error("More than one spelling correction was requested");
|
||||
}
|
||||
|
||||
std::vector<std::string> result;
|
||||
const auto term = impl_->get_spelling_suggestion(word, 3);
|
||||
if ( !term.empty() ) {
|
||||
result.push_back(term);
|
||||
nuspell_->suggest(word, result);
|
||||
if ( result.size() > maxCount ) {
|
||||
result.resize(maxCount);
|
||||
}
|
||||
if ( result.size() == 1 && result[0] == word ) {
|
||||
result.clear();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
|
||||
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
|
||||
|
||||
// The spelling correction "Lax -> Lachs" is affected by commit
|
||||
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
|
||||
// which caps the edit distance at (length(query_word) - 1). As a result, the
|
||||
// max edit distance parameter that we pass into get_spelling_suggestion() is
|
||||
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
|
||||
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
|
||||
Xapian::minor_version(),
|
||||
Xapian::revision());
|
||||
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
} else {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
|
||||
}
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
|
||||
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
|
||||
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
|
||||
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
// Exact match is not considered a spelling correction
|
||||
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
|
||||
|
||||
// Max edit distance is 3
|
||||
// Max edit distance can be quite large
|
||||
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
|
||||
// Case matters in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Füh", 1, ({}));
|
||||
// Case doesn't matter in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
|
||||
// Diacritics matters in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
|
||||
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
|
||||
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
|
||||
// of a Unicode codepoint).
|
||||
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
|
||||
|
||||
// More than one spelling correction can be requested
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Shortcomings of the proof-of-concept implementation
|
||||
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
|
||||
|
||||
// Only one spelling correction can be requested
|
||||
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
|
||||
}
|
||||
|
||||
using StrCollection = std::vector<std::string>;
|
||||
|
||||
Reference in New Issue
Block a user