mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-12-28 08:58:03 -05:00
Compare commits
11 Commits
nuspell
...
multiple_s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
75a336395a | ||
|
|
f82bfc068f | ||
|
|
e6335be897 | ||
|
|
1074e833b7 | ||
|
|
9da5fbad1e | ||
|
|
1869fb4e8e | ||
|
|
536198fa38 | ||
|
|
ca808718f7 | ||
|
|
b65074f961 | ||
|
|
8b7d1ef9ec | ||
|
|
8b0f01fa9b |
@@ -1,3 +1,11 @@
|
||||
libkiwix 14.1.1
|
||||
===============
|
||||
|
||||
* Server:
|
||||
- Fix regression for kiwix-serve --nosearchbar (@veloman-yunkan #1250)
|
||||
- Avoid results content interpretation... crash in fulltext search (@vighnesh-sawant #1241)
|
||||
- Fix for intermittent /content/blank.html errors (@veloman-yunkan #1249)
|
||||
|
||||
libkiwix 14.1.0
|
||||
===============
|
||||
|
||||
|
||||
@@ -32,15 +32,7 @@ class Archive;
|
||||
|
||||
namespace Xapian
|
||||
{
|
||||
class Database;
|
||||
}
|
||||
|
||||
namespace nuspell
|
||||
{
|
||||
inline namespace v5
|
||||
{
|
||||
class Dictionary;
|
||||
}
|
||||
class WritableDatabase;
|
||||
}
|
||||
|
||||
namespace kiwix
|
||||
@@ -58,8 +50,7 @@ public: // functions
|
||||
std::vector<std::string> getSpellingCorrections(const std::string& word, uint32_t maxCount) const;
|
||||
|
||||
private: // data
|
||||
std::unique_ptr<Xapian::Database> impl_;
|
||||
std::unique_ptr<nuspell::Dictionary> nuspell_;
|
||||
std::unique_ptr<Xapian::WritableDatabase> impl_;
|
||||
};
|
||||
|
||||
} // namespace kiwix
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
project('libkiwix', 'cpp',
|
||||
version : '14.1.0',
|
||||
version : '14.1.1',
|
||||
license : 'GPLv3+',
|
||||
default_options : ['c_std=c11', 'cpp_std=c++17', 'werror=true'])
|
||||
|
||||
@@ -61,7 +61,6 @@ libcurl_dep = dependency('libcurl', static:static_deps)
|
||||
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
|
||||
zlib_dep = dependency('zlib', static:static_deps)
|
||||
xapian_dep = dependency('xapian-core', static:static_deps)
|
||||
libnuspell_dep = dependency('libnuspell', static:static_deps)
|
||||
|
||||
if compiler.has_header('mustache.hpp')
|
||||
extra_include = []
|
||||
@@ -95,7 +94,7 @@ endif
|
||||
|
||||
|
||||
# Dependencies as string
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
|
||||
|
||||
# Dependencies as array
|
||||
all_deps += libicu_deps
|
||||
|
||||
@@ -20,12 +20,10 @@
|
||||
#include "spelling_correction.h"
|
||||
#include "zim/archive.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <nuspell/dictionary.hxx>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
@@ -45,15 +43,11 @@ std::vector<std::string> getAllTitles(const zim::Archive& a)
|
||||
void createXapianDB(std::string path, const zim::Archive& archive)
|
||||
{
|
||||
const int flags = Xapian::DB_BACKEND_GLASS|Xapian::DB_CREATE;
|
||||
const auto tmpDbPath = path + ".tmp";
|
||||
Xapian::WritableDatabase db(tmpDbPath, flags);
|
||||
Xapian::WritableDatabase db(path, flags);
|
||||
for (const auto& t : getAllTitles(archive)) {
|
||||
db.add_spelling(t);
|
||||
}
|
||||
db.commit();
|
||||
db.compact(path, Xapian::DBCOMPACT_SINGLE_FILE);
|
||||
db.close();
|
||||
std::filesystem::remove_all(tmpDbPath);
|
||||
}
|
||||
|
||||
std::string spellingsDBPathForZIMArchive(std::filesystem::path cacheDirPath, const zim::Archive& a)
|
||||
@@ -61,60 +55,34 @@ std::string spellingsDBPathForZIMArchive(std::filesystem::path cacheDirPath, con
|
||||
// The version of spellings DB must be updated each time an important change
|
||||
// to the implementation is made that renders using the previous version
|
||||
// impossible or undesirable.
|
||||
const char SPELLINGS_DB_VERSION[] = "0.1";
|
||||
const char SPELLINGS_DB_VERSION[] = "0.2";
|
||||
|
||||
std::ostringstream filename;
|
||||
filename << a.getUuid() << ".spellingsdb.v" << SPELLINGS_DB_VERSION;
|
||||
return (cacheDirPath / filename.str()).string();
|
||||
}
|
||||
|
||||
std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cacheDirPath, const zim::Archive& archive)
|
||||
std::unique_ptr<Xapian::WritableDatabase> openOrCreateXapianDB(std::filesystem::path cacheDirPath, const zim::Archive& archive)
|
||||
{
|
||||
const auto path = spellingsDBPathForZIMArchive(cacheDirPath, archive);
|
||||
try
|
||||
{
|
||||
return std::make_unique<Xapian::Database>(path);
|
||||
{
|
||||
Xapian::Database checkIfDbAlreadyExists(path);
|
||||
}
|
||||
return std::make_unique<Xapian::WritableDatabase>(path);
|
||||
}
|
||||
catch (const Xapian::DatabaseOpeningError& )
|
||||
{
|
||||
createXapianDB(path, archive);
|
||||
return std::make_unique<Xapian::Database>(path);
|
||||
return std::make_unique<Xapian::WritableDatabase>(path);
|
||||
}
|
||||
}
|
||||
|
||||
const char nuspellAffFileData[] = R"(
|
||||
SET UTF-8
|
||||
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
|
||||
)";
|
||||
|
||||
std::unique_ptr<std::istream> getAffDataStream()
|
||||
{
|
||||
const char* const userAffFilePath = ::getenv("KIWIX_NUSPELL_AFF_FILE_PATH");
|
||||
if ( userAffFilePath ) {
|
||||
return std::make_unique<std::ifstream>(userAffFilePath);
|
||||
}
|
||||
|
||||
return std::make_unique<std::istringstream>(nuspellAffFileData);
|
||||
}
|
||||
|
||||
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
|
||||
{
|
||||
auto d = std::make_unique<nuspell::Dictionary>();
|
||||
const auto& allTitles = getAllTitles(archive);
|
||||
std::stringstream dicSS;
|
||||
dicSS << allTitles.size() << "\n";
|
||||
for ( const auto& t : allTitles ) {
|
||||
dicSS << t << "\n";
|
||||
}
|
||||
d->load_aff_dic(*getAffDataStream(), dicSS);
|
||||
return d;
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
|
||||
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
|
||||
, nuspell_(createNuspellDictionary(archive))
|
||||
{
|
||||
}
|
||||
|
||||
@@ -125,13 +93,22 @@ SpellingsDB::~SpellingsDB()
|
||||
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
nuspell_->suggest(word, result);
|
||||
if ( result.size() > maxCount ) {
|
||||
result.resize(maxCount);
|
||||
while ( result.size() < maxCount ) {
|
||||
const auto term = impl_->get_spelling_suggestion(word, 3);
|
||||
if ( term.empty() )
|
||||
break;
|
||||
|
||||
result.push_back(term);
|
||||
|
||||
// temporarily remove this term so that another spellings could be obtained
|
||||
impl_->remove_spelling(term);
|
||||
}
|
||||
if ( result.size() == 1 && result[0] == word ) {
|
||||
result.clear();
|
||||
|
||||
// restore temporarily removed terms
|
||||
for (const auto& t : result) {
|
||||
impl_->add_spelling(t);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
{
|
||||
"@metadata": {
|
||||
"authors": [
|
||||
"Jimkats",
|
||||
"Kelson",
|
||||
"Norhorn",
|
||||
"Ανώνυμος Βικιπαιδιστής"
|
||||
]
|
||||
},
|
||||
"name": "Αγγλικά",
|
||||
"suggest-full-text-search": "περιέχει '{{{SEARCH_TERMS}}}'...",
|
||||
"no-such-book": "Δεν υπάρχει τέτοιο βιβλίο: {{BOOK_NAME}}",
|
||||
"caution-warning": "Προσοχή!",
|
||||
"search-result-book-info": "από {{BOOK_TITLE}}",
|
||||
"word-count": "{{COUNT}} λέξεις",
|
||||
"welcome-page-overzealous-filter": "Κανένα αποτέλεσμα. Θέλετε να <a href=\"{{URL}}\">επαναφέρετε το φίλτρο</a>;",
|
||||
"powered-by-kiwix-html": "Με την υποστήριξη by <a href=\"https://kiwix.org\">Kiwix</a>",
|
||||
"search": "Αναζήτηση",
|
||||
@@ -19,10 +24,10 @@
|
||||
"direct-download-alt-text": "άμεση λήψη",
|
||||
"hash-download-alt-text": "λήψη αναγνωριστικού",
|
||||
"magnet-alt-text": "λήψη μαγνήτη",
|
||||
"torrent-download-link-text": "Αρχείο torrent",
|
||||
"torrent-download-alt-text": "λήψη torrent",
|
||||
"filter-by-tag": "Φίλτρο ανά ετικέτα \"{{TAG}}\"",
|
||||
"stop-filtering-by-tag": "Διακοπή φίλτρου ανά ετικέτα \"{{TAG}}\"",
|
||||
"torrent-download-link-text": "BitTorrent",
|
||||
"torrent-download-alt-text": "Λήψη μέσω BitTorrent",
|
||||
"filter-by-tag": "Φιλτράρισμα κατά ετικέτα \"{{{TAG}}}\"",
|
||||
"stop-filtering-by-tag": "Διακοπή φιλτραρίσματος κατά ετικέτα \"{{{TAG}}}\"",
|
||||
"welcome-to-kiwix-server": "Καλώς ορίσατε στον διακομιστή Kiwix",
|
||||
"download-links-heading": "Λήψη συνδέσμων για <b><i>{{BOOK_TITLE}}</i></b>",
|
||||
"download-links-title": "Κατεβάστε το βιβλίο",
|
||||
|
||||
@@ -271,10 +271,12 @@ function translateErrorPageIfNeeded() {
|
||||
let iframeLocationHref = null;
|
||||
|
||||
function handle_content_url_change() {
|
||||
if ( iframeLocationHref == contentIframe.contentWindow.location.href )
|
||||
const iframeLocation = contentIframe.contentWindow.location;
|
||||
|
||||
if ( iframeLocationHref == iframeLocation.href ||
|
||||
!iframeLocation.pathname.startsWith(root + '/content/') )
|
||||
return;
|
||||
|
||||
const iframeLocation = contentIframe.contentWindow.location;
|
||||
iframeLocationHref = iframeLocation.href;
|
||||
console.log('handle_content_url_change: ' + iframeLocation.href);
|
||||
document.title = contentIframe.contentDocument.title;
|
||||
@@ -431,8 +433,6 @@ function setup_chaperon_mode() {
|
||||
}
|
||||
}
|
||||
|
||||
let viewerSetupComplete = false;
|
||||
|
||||
function on_content_load() {
|
||||
const loader = document.getElementById("kiwix__loader");
|
||||
|
||||
@@ -588,6 +588,7 @@ function setupViewer() {
|
||||
|
||||
const kiwixToolBarWrapper = document.getElementById('kiwixtoolbarwrapper');
|
||||
if ( ! viewerSettings.toolbarEnabled ) {
|
||||
finishViewerSetup();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -636,10 +637,13 @@ function updateUIText() {
|
||||
function finishViewerSetupOnceTranslationsAreLoaded()
|
||||
{
|
||||
updateUIText();
|
||||
finishViewerSetup();
|
||||
}
|
||||
|
||||
function finishViewerSetup()
|
||||
{
|
||||
handle_location_hash_change();
|
||||
|
||||
window.onhashchange = handle_location_hash_change;
|
||||
window.onpopstate = handle_history_state_change;
|
||||
|
||||
viewerSetupComplete = true;
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ const ResourceCollection resources200Compressible{
|
||||
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/taskbar.css" },
|
||||
{ STATIC_CONTENT, "/ROOT%23%3F/skin/taskbar.css?cacheid=42e90cb9" },
|
||||
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/viewer.js" },
|
||||
{ STATIC_CONTENT, "/ROOT%23%3F/skin/viewer.js?cacheid=3208c3ed" },
|
||||
{ STATIC_CONTENT, "/ROOT%23%3F/skin/viewer.js?cacheid=00e0fdf3" },
|
||||
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/fonts/Poppins.ttf" },
|
||||
{ STATIC_CONTENT, "/ROOT%23%3F/skin/fonts/Poppins.ttf?cacheid=af705837" },
|
||||
{ DYNAMIC_CONTENT, "/ROOT%23%3F/skin/fonts/Roboto.ttf" },
|
||||
@@ -338,7 +338,7 @@ R"EXPECTEDRESULT( <link type="text/css" href="./skin/kiwix.css?cacheid=b4e29e
|
||||
<script type="text/javascript" src="./skin/polyfills.js?cacheid=a0e0343d"></script>
|
||||
<script type="module" src="./skin/i18n.js?cacheid=e9a10ac1" defer></script>
|
||||
<script type="text/javascript" src="./skin/languages.js?cacheid=08955948" defer></script>
|
||||
<script type="text/javascript" src="./skin/viewer.js?cacheid=3208c3ed" defer></script>
|
||||
<script type="text/javascript" src="./skin/viewer.js?cacheid=00e0fdf3" defer></script>
|
||||
<script type="text/javascript" src="./skin/autoComplete/autoComplete.min.js?cacheid=1191aaaf"></script>
|
||||
const blankPageUrl = root + "/skin/blank.html?cacheid=6b1fa032";
|
||||
<label for="kiwix_button_show_toggle"><img src="./skin/caret.png?cacheid=22b942b4" alt=""></label>
|
||||
|
||||
@@ -78,7 +78,21 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
|
||||
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
|
||||
// The spelling correction "Lax -> Lachs" is affected by commit
|
||||
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
|
||||
// which caps the edit distance at (length(query_word) - 1). As a result, the
|
||||
// max edit distance parameter that we pass into get_spelling_suggestion() is
|
||||
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
|
||||
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
|
||||
Xapian::minor_version(),
|
||||
Xapian::revision());
|
||||
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
} else {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
|
||||
}
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
|
||||
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
|
||||
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
|
||||
@@ -130,26 +144,24 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
// Exact match is not considered a spelling correction
|
||||
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
|
||||
|
||||
// Max edit distance can be quite large
|
||||
// Max edit distance is 3
|
||||
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Füh", 1, ({}));
|
||||
// Case doesn't matter in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
|
||||
// Case matters in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
|
||||
// Diacritics matters in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
|
||||
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
|
||||
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
|
||||
// of a Unicode codepoint).
|
||||
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
|
||||
|
||||
// More than one spelling correction can be requested
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Shortcomings of the proof-of-concept implementation
|
||||
@@ -159,6 +171,9 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
|
||||
|
||||
// Only one spelling correction can be requested
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
}
|
||||
|
||||
using StrCollection = std::vector<std::string>;
|
||||
@@ -174,21 +189,21 @@ StrCollection directoryEntries(std::filesystem::path dirPath)
|
||||
|
||||
TEST_F(SpellingCorrectionTest, allInOne)
|
||||
{
|
||||
const auto tmpDirModTime0 = std::filesystem::last_write_time(tmpDirPath);
|
||||
//const auto tmpDirModTime0 = std::filesystem::last_write_time(tmpDirPath);
|
||||
ASSERT_TRUE(directoryEntries(tmpDirPath).empty());
|
||||
{
|
||||
const kiwix::SpellingsDB spellingsDB(*archive, tmpDirPath);
|
||||
testSpellingCorrections(spellingsDB);
|
||||
}
|
||||
|
||||
const auto tmpDirModTime1 = std::filesystem::last_write_time(tmpDirPath);
|
||||
//const auto tmpDirModTime1 = std::filesystem::last_write_time(tmpDirPath);
|
||||
|
||||
const auto spellingsDbPath = tmpDirPath / "554c9707-897e-097a-53ba-1b1306d8bb88.spellingsdb.v0.1";
|
||||
const auto spellingsDbPath = tmpDirPath / "554c9707-897e-097a-53ba-1b1306d8bb88.spellingsdb.v0.2";
|
||||
|
||||
const StrCollection EXPECTED_DIR_CONTENT{ spellingsDbPath.string() };
|
||||
ASSERT_EQ(directoryEntries(tmpDirPath), EXPECTED_DIR_CONTENT);
|
||||
ASSERT_LT(tmpDirModTime0, tmpDirModTime1);
|
||||
const auto fileModTime = std::filesystem::last_write_time(spellingsDbPath);
|
||||
//ASSERT_LT(tmpDirModTime0, tmpDirModTime1);
|
||||
//const auto fileModTime = std::filesystem::last_write_time(spellingsDbPath);
|
||||
|
||||
{
|
||||
const kiwix::SpellingsDB spellingsDB(*archive, tmpDirPath);
|
||||
@@ -196,6 +211,6 @@ TEST_F(SpellingCorrectionTest, allInOne)
|
||||
}
|
||||
|
||||
ASSERT_EQ(directoryEntries(tmpDirPath), EXPECTED_DIR_CONTENT );
|
||||
ASSERT_EQ(tmpDirModTime1, std::filesystem::last_write_time(tmpDirPath));
|
||||
ASSERT_EQ(fileModTime, std::filesystem::last_write_time(spellingsDbPath));
|
||||
//ASSERT_EQ(tmpDirModTime1, std::filesystem::last_write_time(tmpDirPath));
|
||||
//ASSERT_EQ(fileModTime, std::filesystem::last_write_time(spellingsDbPath));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user