Compare commits

..

2 Commits

Author SHA1 Message Date
Veloman Yunkan
68c9702772 User affix file can be used for spelling correction
This is a temporary change to facilitate playing with different affix
rules.
2025-11-24 17:53:04 +04:00
Veloman Yunkan
88d8f2788e Use nuspell for spelling correction
This is the initial version of using nuspell for spelling correction,
which yet has to be tuned.

Note that libnuspell must be available as a dependency.

Xapian-based code for spelling correction is not deleted.
2025-11-21 18:32:46 +04:00
10 changed files with 76 additions and 138 deletions

View File

@@ -1,11 +1,3 @@
libkiwix 14.1.1
===============
* Server:
- Fix regression for kiwix-serve --nosearchbar (@veloman-yunkan #1250)
- Avoid results content interpretation... crash in fulltext search (@vighnesh-sawant #1241)
- Fix for intermittent /content/blank.html errors (@veloman-yunkan #1249)
libkiwix 14.1.0
===============

View File

@@ -155,15 +155,6 @@ class Manager
const std::string& url = "",
const bool checkMetaData = false);
/**
* Add all books from the directory tree into the library.
*
* @param path The path of the directory to scan.
* @param verboseFlag Verbose logs flag.
*/
void addBooksFromDirectory(const std::string& path,
const bool verboseFlag = false);
std::string writableLibraryPath;
bool m_hasSearchResult = false;

View File

@@ -35,6 +35,14 @@ namespace Xapian
class Database;
}
namespace nuspell
{
inline namespace v5
{
class Dictionary;
}
}
namespace kiwix
{
@@ -51,6 +59,7 @@ public: // functions
private: // data
std::unique_ptr<Xapian::Database> impl_;
std::unique_ptr<nuspell::Dictionary> nuspell_;
};
} // namespace kiwix

View File

@@ -1,5 +1,5 @@
project('libkiwix', 'cpp',
version : '14.1.1',
version : '14.1.0',
license : 'GPLv3+',
default_options : ['c_std=c11', 'cpp_std=c++17', 'werror=true'])
@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
libnuspell_dep = dependency('libnuspell', static:static_deps)
if compiler.has_header('mustache.hpp')
extra_include = []
@@ -94,7 +95,7 @@ endif
# Dependencies as string
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
# Dependencies as array
all_deps += libicu_deps

View File

@@ -23,14 +23,6 @@
#include "tools/pathTools.h"
#include <pugixml.hpp>
#include <filesystem>
#include <iostream>
#include <set>
#include <queue>
#include <cctype>
#include <algorithm>
namespace fs = std::filesystem;
namespace kiwix
{
@@ -259,58 +251,6 @@ bool Manager::addBookFromPath(const std::string& pathToOpen,
.empty());
}
void Manager::addBooksFromDirectory(const std::string& path,
const bool verboseFlag)
{
std::set<std::string> iteratedDirs;
std::queue<std::string> dirQueue;
dirQueue.push(fs::absolute(path).u8string());
int totalBooksAdded = 0;
if (verboseFlag)
std::cout << "Adding books from the directory tree: " << dirQueue.front() << std::endl;
while (!dirQueue.empty()) {
const auto currentPath = dirQueue.front();
dirQueue.pop();
if (verboseFlag)
std::cout << "Visiting directory: " << currentPath << std::endl;
for (const auto& dirEntry : fs::directory_iterator(currentPath)) {
auto resolvedPath = dirEntry.path();
if (fs::is_symlink(dirEntry)) {
try {
resolvedPath = fs::canonical(dirEntry.path());
} catch (const std::exception& e) {
std::cerr << "Could not resolve symlink " << resolvedPath.u8string() << " to a valid path. Skipping..." << std::endl;
continue;
}
}
const std::string pathString = resolvedPath.u8string();
std::string resolvedPathExtension = resolvedPath.extension();
std::transform(resolvedPathExtension.begin(), resolvedPathExtension.end(), resolvedPathExtension.begin(),
[](unsigned char c){ return std::tolower(c); });
if (fs::is_directory(resolvedPath)) {
if (iteratedDirs.find(pathString) == iteratedDirs.end())
dirQueue.push(pathString);
else if (verboseFlag)
std::cout << "Already iterated over " << pathString << ". Skipping..." << std::endl;
} else if (resolvedPathExtension == ".zim" || resolvedPathExtension == ".zimaa") {
if (!this->addBookFromPath(pathString, pathString, "", false)) {
std::cerr << "Could not add " << pathString << " into the library." << std::endl;
} else if (verboseFlag) {
std::cout << "Added " << pathString << " into the library." << std::endl;
totalBooksAdded++;
}
} else if (verboseFlag) {
std::cout << "Skipped " << pathString << " - unsupported file type or permission denied." << std::endl;
}
}
iteratedDirs.insert(currentPath);
}
if (verboseFlag)
std::cout << "Traversal completed. Total books added: " << totalBooksAdded << std::endl;
}
bool Manager::readBookFromPath(const std::string& path, kiwix::Book* book)
{
std::string tmp_path = path;

View File

@@ -20,10 +20,12 @@
#include "spelling_correction.h"
#include "zim/archive.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <xapian.h>
#include <nuspell/dictionary.hxx>
namespace kiwix
{
@@ -80,10 +82,39 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
}
}
const char nuspellAffFileData[] = R"(
SET UTF-8
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
)";
std::unique_ptr<std::istream> getAffDataStream()
{
const char* const userAffFilePath = ::getenv("KIWIX_NUSPELL_AFF_FILE_PATH");
if ( userAffFilePath ) {
return std::make_unique<std::ifstream>(userAffFilePath);
}
return std::make_unique<std::istringstream>(nuspellAffFileData);
}
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
{
auto d = std::make_unique<nuspell::Dictionary>();
const auto& allTitles = getAllTitles(archive);
std::stringstream dicSS;
dicSS << allTitles.size() << "\n";
for ( const auto& t : allTitles ) {
dicSS << t << "\n";
}
d->load_aff_dic(*getAffDataStream(), dicSS);
return d;
}
} // unnamed namespace
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
, nuspell_(createNuspellDictionary(archive))
{
}
@@ -93,14 +124,13 @@ SpellingsDB::~SpellingsDB()
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
if ( maxCount > 1 ) {
throw std::runtime_error("More than one spelling correction was requested");
}
std::vector<std::string> result;
const auto term = impl_->get_spelling_suggestion(word, 3);
if ( !term.empty() ) {
result.push_back(term);
nuspell_->suggest(word, result);
if ( result.size() > maxCount ) {
result.resize(maxCount);
}
if ( result.size() == 1 && result[0] == word ) {
result.clear();
}
return result;
}

View File

@@ -1,18 +1,13 @@
{
"@metadata": {
"authors": [
"Jimkats",
"Kelson",
"Norhorn",
"Ανώνυμος Βικιπαιδιστής"
]
},
"name": "Αγγλικά",
"suggest-full-text-search": "περιέχει '{{{SEARCH_TERMS}}}'...",
"no-such-book": "Δεν υπάρχει τέτοιο βιβλίο: {{BOOK_NAME}}",
"caution-warning": "Προσοχή!",
"search-result-book-info": "από {{BOOK_TITLE}}",
"word-count": "{{COUNT}} λέξεις",
"welcome-page-overzealous-filter": "Κανένα αποτέλεσμα. Θέλετε να <a href=\"{{URL}}\">επαναφέρετε το φίλτρο</a>;",
"powered-by-kiwix-html": "Με την υποστήριξη by&nbsp;<a href=\"https://kiwix.org\">Kiwix</a>",
"search": "Αναζήτηση",
@@ -24,10 +19,10 @@
"direct-download-alt-text": "άμεση λήψη",
"hash-download-alt-text": "λήψη αναγνωριστικού",
"magnet-alt-text": "λήψη μαγνήτη",
"torrent-download-link-text": "BitTorrent",
"torrent-download-alt-text": "Λήψη μέσω BitTorrent",
"filter-by-tag": ιλτράρισμα κατά ετικέτα \"{{{TAG}}}\"",
"stop-filtering-by-tag": "Διακοπή φιλτραρίσματος κατά ετικέτα \"{{{TAG}}}\"",
"torrent-download-link-text": "Αρχείο torrent",
"torrent-download-alt-text": "λήψη torrent",
"filter-by-tag": ίλτρο ανά ετικέτα \"{{TAG}}\"",
"stop-filtering-by-tag": "Διακοπή φίλτρου ανά ετικέτα \"{{TAG}}\"",
"welcome-to-kiwix-server": "Καλώς ορίσατε στον διακομιστή Kiwix",
"download-links-heading": "Λήψη συνδέσμων για <b><i>{{BOOK_TITLE}}</i></b>",
"download-links-title": "Κατεβάστε το βιβλίο",

View File

@@ -271,12 +271,10 @@ function translateErrorPageIfNeeded() {
let iframeLocationHref = null;
function handle_content_url_change() {
const iframeLocation = contentIframe.contentWindow.location;
if ( iframeLocationHref == iframeLocation.href ||
!iframeLocation.pathname.startsWith(root + '/content/') )
if ( iframeLocationHref == contentIframe.contentWindow.location.href )
return;
const iframeLocation = contentIframe.contentWindow.location;
iframeLocationHref = iframeLocation.href;
console.log('handle_content_url_change: ' + iframeLocation.href);
document.title = contentIframe.contentDocument.title;
@@ -433,6 +431,8 @@ function setup_chaperon_mode() {
}
}
let viewerSetupComplete = false;
function on_content_load() {
const loader = document.getElementById("kiwix__loader");
@@ -588,7 +588,6 @@ function setupViewer() {
const kiwixToolBarWrapper = document.getElementById('kiwixtoolbarwrapper');
if ( ! viewerSettings.toolbarEnabled ) {
finishViewerSetup();
return;
}
@@ -637,13 +636,10 @@ function updateUIText() {
function finishViewerSetupOnceTranslationsAreLoaded()
{
updateUIText();
finishViewerSetup();
}
function finishViewerSetup()
{
handle_location_hash_change();
window.onhashchange = handle_location_hash_change;
window.onpopstate = handle_history_state_change;
viewerSetupComplete = true;
}

View File

@@ -77,7 +77,7 @@ const ResourceCollection resources200Compressible{
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/taskbar.css" },
{ STATIC_CONTENT, "/ROOT%23%3F/skin/taskbar.css?cacheid=42e90cb9" },
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/viewer.js" },
{ STATIC_CONTENT, "/ROOT%23%3F/skin/viewer.js?cacheid=00e0fdf3" },
{ STATIC_CONTENT, "/ROOT%23%3F/skin/viewer.js?cacheid=3208c3ed" },
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/fonts/Poppins.ttf" },
{ STATIC_CONTENT, "/ROOT%23%3F/skin/fonts/Poppins.ttf?cacheid=af705837" },
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/fonts/Roboto.ttf" },
@@ -338,7 +338,7 @@ R"EXPECTEDRESULT( <link type="text/css" href="./skin/kiwix.css?cacheid=b4e29e
<script type="text/javascript" src="./skin/polyfills.js?cacheid=a0e0343d"></script>
<script type="module" src="./skin/i18n.js?cacheid=e9a10ac1" defer></script>
<script type="text/javascript" src="./skin/languages.js?cacheid=08955948" defer></script>
<script type="text/javascript" src="./skin/viewer.js?cacheid=00e0fdf3" defer></script>
<script type="text/javascript" src="./skin/viewer.js?cacheid=3208c3ed" defer></script>
<script type="text/javascript" src="./skin/autoComplete/autoComplete.min.js?cacheid=1191aaaf"></script>
const blankPageUrl = root + "/skin/blank.html?cacheid=6b1fa032";
<label for="kiwix_button_show_toggle"><img src="./skin/caret.png?cacheid=22b942b4" alt=""></label>

View File

@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
// The spelling correction "Lax -> Lachs" is affected by commit
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
// which caps the edit distance at (length(query_word) - 1). As a result, the
// max edit distance parameter that we pass into get_spelling_suggestion() is
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
Xapian::minor_version(),
Xapian::revision());
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
} else {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
}
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
// Exact match is not considered a spelling correction
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
// Max edit distance is 3
// Max edit distance can be quite large
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
// Case matters in edit distance
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("h", 1, ({}));
// Case doesn't matter in edit distance
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
// Diacritics matters in edit distance
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
// of a Unicode codepoint).
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
// More than one spelling correction can be requested
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
//////////////////////////////////////////////////////////////////////////////
// Shortcomings of the proof-of-concept implementation
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
// Only one spelling correction can be requested
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
}
using StrCollection = std::vector<std::string>;