Compare commits

...

6 Commits

Author SHA1 Message Date
Veloman Yunkan
68c9702772 User affix file can be used for spelling correction
This is a temporary change to facilitate playing with different affix
rules.
2025-11-24 17:53:04 +04:00
Veloman Yunkan
88d8f2788e Use nuspell for spelling correction
This is the initial version of using nuspell for spelling correction,
which yet has to be tuned.

Note that libnuspell must be available as a dependency.

Xapian-based code for spelling correction is not deleted.
2025-11-21 18:32:46 +04:00
Kelson
33f22eb966 Merge pull request #1241 from vighnesh-sawant/mustache-tag-escaping
Avoid interpretation of content coming from zim by mustache
2025-11-10 20:10:16 +01:00
Vighnesh
55c13c3d24 Avoid interpretation of content coming from zim by mustache 2025-11-10 20:10:06 +01:00
Veloman Yunkan
2b1f556c20 Merge pull request #1239 from kiwix/translatewiki
Localisation updates from https://translatewiki.net.
2025-11-10 18:41:03 +04:00
translatewiki.net
e0cd5a1642 Localisation updates from https://translatewiki.net. 2025-11-10 13:13:07 +01:00
8 changed files with 102 additions and 59 deletions

View File

@@ -35,6 +35,14 @@ namespace Xapian
class Database;
}
namespace nuspell
{
inline namespace v5
{
class Dictionary;
}
}
namespace kiwix
{
@@ -51,6 +59,7 @@ public: // functions
private: // data
std::unique_ptr<Xapian::Database> impl_;
std::unique_ptr<nuspell::Dictionary> nuspell_;
};
} // namespace kiwix

View File

@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
libnuspell_dep = dependency('libnuspell', static:static_deps)
if compiler.has_header('mustache.hpp')
extra_include = []
@@ -94,7 +95,7 @@ endif
# Dependencies as string
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
# Dependencies as array
all_deps += libicu_deps

View File

@@ -20,10 +20,12 @@
#include "spelling_correction.h"
#include "zim/archive.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <xapian.h>
#include <nuspell/dictionary.hxx>
namespace kiwix
{
@@ -80,10 +82,39 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
}
}
const char nuspellAffFileData[] = R"(
SET UTF-8
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
)";
std::unique_ptr<std::istream> getAffDataStream()
{
const char* const userAffFilePath = ::getenv("KIWIX_NUSPELL_AFF_FILE_PATH");
if ( userAffFilePath ) {
return std::make_unique<std::ifstream>(userAffFilePath);
}
return std::make_unique<std::istringstream>(nuspellAffFileData);
}
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
{
auto d = std::make_unique<nuspell::Dictionary>();
const auto& allTitles = getAllTitles(archive);
std::stringstream dicSS;
dicSS << allTitles.size() << "\n";
for ( const auto& t : allTitles ) {
dicSS << t << "\n";
}
d->load_aff_dic(*getAffDataStream(), dicSS);
return d;
}
} // unnamed namespace
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
, nuspell_(createNuspellDictionary(archive))
{
}
@@ -93,14 +124,13 @@ SpellingsDB::~SpellingsDB()
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
if ( maxCount > 1 ) {
throw std::runtime_error("More than one spelling correction was requested");
}
std::vector<std::string> result;
const auto term = impl_->get_spelling_suggestion(word, 3);
if ( !term.empty() ) {
result.push_back(term);
nuspell_->suggest(word, result);
if ( result.size() > maxCount ) {
result.resize(maxCount);
}
if ( result.size() == 1 && result[0] == word ) {
result.clear();
}
return result;
}

View File

@@ -1,6 +1,7 @@
{
"@metadata": {
"authors": [
"Apq",
"Jopparn",
"Larsa",
"Rofiatmustapha12",
@@ -25,9 +26,25 @@
"400-page-heading": "Ogiltig begäran",
"404-page-title": "Innehållet hittades inte",
"404-page-heading": "Hittades inte",
"new-404-page-title": "Sidan kunde inte hittas",
"new-404-page-heading": "Hoppsan. Sidan hittades inte.",
"404-img-text": "Hittades ej!",
"path-was-not-found": "Den begärda sökvägen hittades ej:",
"404-advice.p1": "Innehållet du letar efter kan fortfarande vara tillgängligt, men det kan finnas på en annan plats i ZIM-filen.",
"404-advice.p2": "Vänligen:",
"404-advice.p3": "Försök att använda sökfunktionen för att hitta det innehåll du vill ha",
"404-advice.p4": "Leta efter nyckelord eller titlar relaterade till den information du söker",
"404-advice.p5": "Den här metoden bör hjälpa dig att hitta önskat innehåll, även om den ursprungliga länken inte fungerar korrekt.",
"500-page-title": "Internt serverfel",
"500-page-heading": "Internt serverfel",
"500-page-text": "Ett internt serverfel uppstod. Vi ber om ursäkt för det :/",
"500-page-heading": "Hoppsan. Sidan fungerar inte.",
"500-page-text": "Den begärda sökvägen kan inte levereras korrekt:",
"500-img-text": "Sidan fungerar ej",
"external-link-detected": "Extern länk upptäckt",
"caution-warning": "Varning!",
"external-link-intro": "Du är på väg att lämna Kiwix ZIM-läsare för att gå online till",
"external-link-advice.p1": "Länken du försöker komma åt är inte en del av ditt offlinepaket och kräver en internetanslutning.",
"external-link-advice.p2": "Om du kan gå online kan du försöka öppna länken.",
"external-link-advice.p3": "Du kan annars återgå till ditt ZIM-innehåll offline genom att använda webbläsarens bakåtknapp.",
"fulltext-search-unavailable": "Fulltextsökning är inte tillgänglig",
"no-search-results": "Sökmaskinen för fulltext är inte tillgänglig för detta innehåll.",
"search-results-page-title": "Sök: {{SEARCH_PATTERN}}",
@@ -36,9 +53,9 @@
"search-result-book-info": "från {{BOOK_TITLE}}",
"word-count": "{{COUNT}} ord",
"library-button-text": "Gå till hemsidan",
"home-button-text": "Gå till huvudsidan för \"{{BOOK_TITLE}}\"",
"home-button-text": "Gå till huvudsidan för '{{{BOOK_TITLE}}}'",
"random-page-button-text": "Gå till en slumpmässigt utvald sida",
"searchbox-tooltip": "Sök efter \"{{BOOK_TITLE}}\"",
"searchbox-tooltip": "Sök '{{{BOOK_TITLE}}}'",
"confusion-of-tongues": "Två eller fler böcker på olika språk skulle delta i sökningen, vilket kan ge förvirrande resultat.",
"welcome-page-overzealous-filter": "Inga resultat. Vill du <a href=\"{{URL}}\">återställa filtret</a>?",
"powered-by-kiwix-html": "Drivs av&nbsp;<a href=\"https://kiwix.org\">Kiwix</a>",
@@ -73,5 +90,6 @@
"book-category.wikiversity": "Wikiversity",
"book-category.wikivoyage": "Wikivoyage",
"book-category.wiktionary": "Wiktionary",
"book-category.other": "Övriga"
"book-category.other": "Övriga",
"text-loading-content": "Laddar innehåll"
}

View File

@@ -2,7 +2,8 @@
"@metadata": {
"authors": [
"Hedda",
"Rofiatmustapha12"
"Rofiatmustapha12",
"SaldırganSincap"
]
},
"name": "Türkçe",
@@ -23,8 +24,8 @@
"404-page-title": "içerik bulunamadı",
"404-page-heading": "Bulunamadı",
"500-page-title": "İç Sunucu Hatası",
"500-page-heading": "İç Sunucu Hatası",
"500-page-text": "Dahili bir sunucu hatası oluştu. Bunun için üzgünüz :/",
"500-page-heading": "Üzgünüz. Sayfa çalışmıyor.",
"500-page-text": "İstenen yol düzgün bir şekilde teslim edilemiyor:",
"fulltext-search-unavailable": "Tam metin araması kullanılamıyor",
"no-search-results": "Tam metin arama motoru bu içerik için kullanılamaz.",
"search-results-page-title": "Arama: {{SEARCH_PATTERN}}",
@@ -33,9 +34,9 @@
"search-result-book-info": "{{BOOK_TITLE}} adlı kitaptan",
"word-count": "{{COUNT}} kelime",
"library-button-text": "Karşılama sayfasına git",
"home-button-text": "'{{BOOK_TITLE}}' anasayfasına gidin",
"home-button-text": "'{{{BOOK_TITLE}}}' ana sayfasına git",
"random-page-button-text": "Rastgele seçilen bir sayfaya git",
"searchbox-tooltip": "'{{BOOK_TITLE}}' ara",
"searchbox-tooltip": "'{{{BOOK_TITLE}}}' ara",
"confusion-of-tongues": "Aramaya farklı dillerde iki veya daha fazla kitap katılacak ve bu da kafa karıştırıcı sonuçlara yol açabilecektir.",
"welcome-page-overzealous-filter": "Sonuç yok. <a href=\"{{URL}}\">Filtreyi sıfırlamak</a> ister misiniz?",
"powered-by-kiwix-html": "<a href=\"https://kiwix.org\">Kiwix</a> tarafından desteklenmektedir",
@@ -45,16 +46,16 @@
"count-of-matching-books": "{{COUNT}} kitap",
"download": "İndir",
"direct-download-link-text": "Doğrudan",
"direct-download-alt-text": "direkt indirme",
"hash-download-link-text": "Sha256 haşesi",
"hash-download-alt-text": "csv indir",
"direct-download-alt-text": "Doğrudan HTTP(S) üzerinden indir",
"hash-download-link-text": "SHA-256 sağlama toplamı",
"hash-download-alt-text": "SHA-256 dosya toplam kontrolünü görüntüle",
"magnet-link-text": "Mıknatıs bağlantısı",
"magnet-alt-text": "mıknatısı indir",
"torrent-download-link-text": "Hedef dosya",
"torrent-download-alt-text": "torrenti indir",
"magnet-alt-text": "Mıknatıs bağlantısıyla indir",
"torrent-download-link-text": "BitTorrent",
"torrent-download-alt-text": "BitTorrent üzerinden indir",
"library-opds-feed-all-entries": "Kütüphane OPDS Akışı - Tüm girişler",
"filter-by-tag": "\"{{TAG}}\" etiketine göre filtrele",
"stop-filtering-by-tag": "\"{{TAG}}\" etiketine göre filtrelemeyi durdur",
"filter-by-tag": "\"{{{TAG}}}\" etiketine göre filtrele",
"stop-filtering-by-tag": "\"{{{TAG}}}\" etiketine göre filtrelemeyi durdur",
"library-opds-feed-parameterised": "Kütüphane OPDS Özet Akışı - {{#LANG}}\nLanguage: {{LANG}} {{/LANG}}{{#CATEGORY}}\nCategory: {{CATEGORY}} {{/CATEGORY}} ile eşleşen girişler {{#TAG}}\nTag: {{TAG}} {{/TAG}}{{#Q}}\nQuery: {{Q}} {{/Q}}",
"welcome-to-kiwix-server": "Kiwix Sunucusuna Hoş Geldiniz",
"download-links-heading": "<b><i>{{BOOK_TITLE}}</i></b> için indirme bağlantıları",

View File

@@ -117,7 +117,7 @@
{{title}}
</a>
{{#snippet}}
<cite>{{>snippet}}...</cite>
<cite>{{{snippet}}}...</cite>
{{/snippet}}
{{#bookInfo}}
<div class="book-title">{{{bookInfo}}}</div>

View File

@@ -21,7 +21,7 @@
<title>{{title}}</title>
<link>{{absolutePath}}</link>
{{#snippet}}
<description>{{>snippet}}...</description>
<description>{{{snippet}}}...</description>
{{/snippet}}
{{#bookTitle}}
<book>

View File

@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
// The spelling correction "Lax -> Lachs" is affected by commit
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
// which caps the edit distance at (length(query_word) - 1). As a result, the
// max edit distance parameter that we pass into get_spelling_suggestion() is
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
Xapian::minor_version(),
Xapian::revision());
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
} else {
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
}
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
// Exact match is not considered a spelling correction
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
// Max edit distance is 3
// Max edit distance can be quite large
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
// Case matters in edit distance
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("h", 1, ({}));
// Case doesn't matter in edit distance
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
// Diacritics matters in edit distance
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
// of a Unicode codepoint).
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
// More than one spelling correction can be requested
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
//////////////////////////////////////////////////////////////////////////////
// Shortcomings of the proof-of-concept implementation
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
// Only one spelling correction can be requested
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
}
using StrCollection = std::vector<std::string>;