mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-12-31 02:18:05 -05:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
68c9702772 | ||
|
|
88d8f2788e | ||
|
|
33f22eb966 | ||
|
|
55c13c3d24 | ||
|
|
2b1f556c20 | ||
|
|
e0cd5a1642 |
@@ -35,6 +35,14 @@ namespace Xapian
|
||||
class Database;
|
||||
}
|
||||
|
||||
// Forward declaration of nuspell::Dictionary so that this header does not
// have to include the nuspell headers. The class is declared inside the
// inline namespace v5, so it can be named either as nuspell::Dictionary or
// as nuspell::v5::Dictionary.
namespace nuspell {
inline namespace v5 {

class Dictionary;

} // inline namespace v5
} // namespace nuspell
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
|
||||
@@ -51,6 +59,7 @@ public: // functions
|
||||
|
||||
private: // data
|
||||
std::unique_ptr<Xapian::Database> impl_;
|
||||
std::unique_ptr<nuspell::Dictionary> nuspell_;
|
||||
};
|
||||
|
||||
} // namespace kiwix
|
||||
|
||||
@@ -61,6 +61,7 @@ libcurl_dep = dependency('libcurl', static:static_deps)
|
||||
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
|
||||
zlib_dep = dependency('zlib', static:static_deps)
|
||||
xapian_dep = dependency('xapian-core', static:static_deps)
|
||||
libnuspell_dep = dependency('libnuspell', static:static_deps)
|
||||
|
||||
if compiler.has_header('mustache.hpp')
|
||||
extra_include = []
|
||||
@@ -94,7 +95,7 @@ endif
|
||||
|
||||
|
||||
# Dependencies as string
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
|
||||
all_deps = [thread_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep, libnuspell_dep]
|
||||
|
||||
# Dependencies as array
|
||||
all_deps += libicu_deps
|
||||
|
||||
@@ -20,10 +20,12 @@
|
||||
#include "spelling_correction.h"
|
||||
#include "zim/archive.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <xapian.h>
|
||||
#include <nuspell/dictionary.hxx>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
@@ -80,10 +82,39 @@ std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cac
|
||||
}
|
||||
}
|
||||
|
||||
// Built-in affix data used when no external .aff file is configured.
// It consists of a SET UTF-8 line and a TRY character list.
const char nuspellAffFileData[] = R"(
SET UTF-8
TRY qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM
)";

// Returns the stream from which the nuspell affix (.aff) data is read.
//
// If the KIWIX_NUSPELL_AFF_FILE_PATH environment variable is set, the file it
// names is opened and returned; otherwise the built-in nuspellAffFileData is
// served from an in-memory stream.
//
// Throws std::runtime_error if the user-specified file cannot be opened, so
// that a wrong path is reported by name instead of being passed on as a
// stream that is already in a failed state.
std::unique_ptr<std::istream> getAffDataStream()
{
  const char* const userAffFilePath = ::getenv("KIWIX_NUSPELL_AFF_FILE_PATH");
  if ( !userAffFilePath ) {
    return std::make_unique<std::istringstream>(nuspellAffFileData);
  }

  auto affFile = std::make_unique<std::ifstream>(userAffFilePath);
  if ( !affFile->is_open() ) {
    throw std::runtime_error(std::string("Cannot open nuspell .aff file: ")
                             + userAffFilePath);
  }
  return affFile;
}
|
||||
|
||||
// Builds a nuspell dictionary from the titles of the given archive.
//
// The dictionary (.dic) data is assembled in memory: the first line holds the
// number of titles, followed by one title per line. The affix data comes from
// getAffDataStream().
//
// NOTE(review): titles are written verbatim. In Hunspell-style .dic files a
// '/' introduces flags - confirm that titles containing '/' are handled as
// intended by nuspell.
std::unique_ptr<nuspell::Dictionary> createNuspellDictionary(const zim::Archive& archive)
{
  const auto& titles = getAllTitles(archive);

  std::stringstream dicData;
  dicData << titles.size() << "\n";
  for ( const auto& title : titles ) {
    dicData << title << "\n";
  }

  const auto affData = getAffDataStream();
  auto dictionary = std::make_unique<nuspell::Dictionary>();
  dictionary->load_aff_dic(*affData, dicData);
  return dictionary;
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
// Initializes impl_ with the result of openOrCreateXapianDB(cacheDirPath,
// archive) and nuspell_ with the result of createNuspellDictionary(archive).
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
  : impl_(openOrCreateXapianDB(cacheDirPath, archive)),
    nuspell_(createNuspellDictionary(archive))
{}
|
||||
|
||||
@@ -93,14 +124,13 @@ SpellingsDB::~SpellingsDB()
|
||||
|
||||
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
|
||||
{
|
||||
if ( maxCount > 1 ) {
|
||||
throw std::runtime_error("More than one spelling correction was requested");
|
||||
}
|
||||
|
||||
std::vector<std::string> result;
|
||||
const auto term = impl_->get_spelling_suggestion(word, 3);
|
||||
if ( !term.empty() ) {
|
||||
result.push_back(term);
|
||||
nuspell_->suggest(word, result);
|
||||
if ( result.size() > maxCount ) {
|
||||
result.resize(maxCount);
|
||||
}
|
||||
if ( result.size() == 1 && result[0] == word ) {
|
||||
result.clear();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"@metadata": {
|
||||
"authors": [
|
||||
"Apq",
|
||||
"Jopparn",
|
||||
"Larsa",
|
||||
"Rofiatmustapha12",
|
||||
@@ -25,9 +26,25 @@
|
||||
"400-page-heading": "Ogiltig begäran",
|
||||
"404-page-title": "Innehållet hittades inte",
|
||||
"404-page-heading": "Hittades inte",
|
||||
"new-404-page-title": "Sidan kunde inte hittas",
|
||||
"new-404-page-heading": "Hoppsan. Sidan hittades inte.",
|
||||
"404-img-text": "Hittades ej!",
|
||||
"path-was-not-found": "Den begärda sökvägen hittades ej:",
|
||||
"404-advice.p1": "Innehållet du letar efter kan fortfarande vara tillgängligt, men det kan finnas på en annan plats i ZIM-filen.",
|
||||
"404-advice.p2": "Vänligen:",
|
||||
"404-advice.p3": "Försök att använda sökfunktionen för att hitta det innehåll du vill ha",
|
||||
"404-advice.p4": "Leta efter nyckelord eller titlar relaterade till den information du söker",
|
||||
"404-advice.p5": "Den här metoden bör hjälpa dig att hitta önskat innehåll, även om den ursprungliga länken inte fungerar korrekt.",
|
||||
"500-page-title": "Internt serverfel",
|
||||
"500-page-heading": "Internt serverfel",
|
||||
"500-page-text": "Ett internt serverfel uppstod. Vi ber om ursäkt för det :/",
|
||||
"500-page-heading": "Hoppsan. Sidan fungerar inte.",
|
||||
"500-page-text": "Den begärda sökvägen kan inte levereras korrekt:",
|
||||
"500-img-text": "Sidan fungerar ej",
|
||||
"external-link-detected": "Extern länk upptäckt",
|
||||
"caution-warning": "Varning!",
|
||||
"external-link-intro": "Du är på väg att lämna Kiwix ZIM-läsare för att gå online till",
|
||||
"external-link-advice.p1": "Länken du försöker komma åt är inte en del av ditt offlinepaket och kräver en internetanslutning.",
|
||||
"external-link-advice.p2": "Om du kan gå online kan du försöka öppna länken.",
|
||||
"external-link-advice.p3": "Du kan annars återgå till ditt ZIM-innehåll offline genom att använda webbläsarens bakåtknapp.",
|
||||
"fulltext-search-unavailable": "Fulltextsökning är inte tillgänglig",
|
||||
"no-search-results": "Sökmaskinen för fulltext är inte tillgänglig för detta innehåll.",
|
||||
"search-results-page-title": "Sök: {{SEARCH_PATTERN}}",
|
||||
@@ -36,9 +53,9 @@
|
||||
"search-result-book-info": "från {{BOOK_TITLE}}",
|
||||
"word-count": "{{COUNT}} ord",
|
||||
"library-button-text": "Gå till hemsidan",
|
||||
"home-button-text": "Gå till huvudsidan för \"{{BOOK_TITLE}}\"",
|
||||
"home-button-text": "Gå till huvudsidan för '{{{BOOK_TITLE}}}'",
|
||||
"random-page-button-text": "Gå till en slumpmässigt utvald sida",
|
||||
"searchbox-tooltip": "Sök efter \"{{BOOK_TITLE}}\"",
|
||||
"searchbox-tooltip": "Sök '{{{BOOK_TITLE}}}'",
|
||||
"confusion-of-tongues": "Två eller fler böcker på olika språk skulle delta i sökningen, vilket kan ge förvirrande resultat.",
|
||||
"welcome-page-overzealous-filter": "Inga resultat. Vill du <a href=\"{{URL}}\">återställa filtret</a>?",
|
||||
"powered-by-kiwix-html": "Drivs av <a href=\"https://kiwix.org\">Kiwix</a>",
|
||||
@@ -73,5 +90,6 @@
|
||||
"book-category.wikiversity": "Wikiversity",
|
||||
"book-category.wikivoyage": "Wikivoyage",
|
||||
"book-category.wiktionary": "Wiktionary",
|
||||
"book-category.other": "Övriga"
|
||||
"book-category.other": "Övriga",
|
||||
"text-loading-content": "Laddar innehåll"
|
||||
}
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
"@metadata": {
|
||||
"authors": [
|
||||
"Hedda",
|
||||
"Rofiatmustapha12"
|
||||
"Rofiatmustapha12",
|
||||
"SaldırganSincap"
|
||||
]
|
||||
},
|
||||
"name": "Türkçe",
|
||||
@@ -23,8 +24,8 @@
|
||||
"404-page-title": "içerik bulunamadı",
|
||||
"404-page-heading": "Bulunamadı",
|
||||
"500-page-title": "İç Sunucu Hatası",
|
||||
"500-page-heading": "İç Sunucu Hatası",
|
||||
"500-page-text": "Dahili bir sunucu hatası oluştu. Bunun için üzgünüz :/",
|
||||
"500-page-heading": "Üzgünüz. Sayfa çalışmıyor.",
|
||||
"500-page-text": "İstenen yol düzgün bir şekilde teslim edilemiyor:",
|
||||
"fulltext-search-unavailable": "Tam metin araması kullanılamıyor",
|
||||
"no-search-results": "Tam metin arama motoru bu içerik için kullanılamaz.",
|
||||
"search-results-page-title": "Arama: {{SEARCH_PATTERN}}",
|
||||
@@ -33,9 +34,9 @@
|
||||
"search-result-book-info": "{{BOOK_TITLE}} adlı kitaptan",
|
||||
"word-count": "{{COUNT}} kelime",
|
||||
"library-button-text": "Karşılama sayfasına git",
|
||||
"home-button-text": "'{{BOOK_TITLE}}' anasayfasına gidin",
|
||||
"home-button-text": "'{{{BOOK_TITLE}}}' ana sayfasına git",
|
||||
"random-page-button-text": "Rastgele seçilen bir sayfaya git",
|
||||
"searchbox-tooltip": "'{{BOOK_TITLE}}' ara",
|
||||
"searchbox-tooltip": "'{{{BOOK_TITLE}}}' ara",
|
||||
"confusion-of-tongues": "Aramaya farklı dillerde iki veya daha fazla kitap katılacak ve bu da kafa karıştırıcı sonuçlara yol açabilecektir.",
|
||||
"welcome-page-overzealous-filter": "Sonuç yok. <a href=\"{{URL}}\">Filtreyi sıfırlamak</a> ister misiniz?",
|
||||
"powered-by-kiwix-html": "<a href=\"https://kiwix.org\">Kiwix</a> tarafından desteklenmektedir",
|
||||
@@ -45,16 +46,16 @@
|
||||
"count-of-matching-books": "{{COUNT}} kitap",
|
||||
"download": "İndir",
|
||||
"direct-download-link-text": "Doğrudan",
|
||||
"direct-download-alt-text": "direkt indirme",
|
||||
"hash-download-link-text": "Sha256 haşesi",
|
||||
"hash-download-alt-text": "csv indir",
|
||||
"direct-download-alt-text": "Doğrudan HTTP(S) üzerinden indir",
|
||||
"hash-download-link-text": "SHA-256 sağlama toplamı",
|
||||
"hash-download-alt-text": "SHA-256 dosya toplam kontrolünü görüntüle",
|
||||
"magnet-link-text": "Mıknatıs bağlantısı",
|
||||
"magnet-alt-text": "mıknatısı indir",
|
||||
"torrent-download-link-text": "Hedef dosya",
|
||||
"torrent-download-alt-text": "torrenti indir",
|
||||
"magnet-alt-text": "Mıknatıs bağlantısıyla indir",
|
||||
"torrent-download-link-text": "BitTorrent",
|
||||
"torrent-download-alt-text": "BitTorrent üzerinden indir",
|
||||
"library-opds-feed-all-entries": "Kütüphane OPDS Akışı - Tüm girişler",
|
||||
"filter-by-tag": "\"{{TAG}}\" etiketine göre filtrele",
|
||||
"stop-filtering-by-tag": "\"{{TAG}}\" etiketine göre filtrelemeyi durdur",
|
||||
"filter-by-tag": "\"{{{TAG}}}\" etiketine göre filtrele",
|
||||
"stop-filtering-by-tag": "\"{{{TAG}}}\" etiketine göre filtrelemeyi durdur",
|
||||
"library-opds-feed-parameterised": "Kütüphane OPDS Özet Akışı - {{#LANG}}\nLanguage: {{LANG}} {{/LANG}}{{#CATEGORY}}\nCategory: {{CATEGORY}} {{/CATEGORY}} ile eşleşen girişler {{#TAG}}\nTag: {{TAG}} {{/TAG}}{{#Q}}\nQuery: {{Q}} {{/Q}}",
|
||||
"welcome-to-kiwix-server": "Kiwix Sunucusuna Hoş Geldiniz",
|
||||
"download-links-heading": "<b><i>{{BOOK_TITLE}}</i></b> için indirme bağlantıları",
|
||||
|
||||
@@ -117,7 +117,7 @@
|
||||
{{title}}
|
||||
</a>
|
||||
{{#snippet}}
|
||||
<cite>{{>snippet}}...</cite>
|
||||
<cite>{{{snippet}}}...</cite>
|
||||
{{/snippet}}
|
||||
{{#bookInfo}}
|
||||
<div class="book-title">{{{bookInfo}}}</div>
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
<title>{{title}}</title>
|
||||
<link>{{absolutePath}}</link>
|
||||
{{#snippet}}
|
||||
<description>{{>snippet}}...</description>
|
||||
<description>{{{snippet}}}...</description>
|
||||
{{/snippet}}
|
||||
{{#bookTitle}}
|
||||
<book>
|
||||
|
||||
@@ -78,21 +78,7 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
|
||||
EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));
|
||||
|
||||
// The spelling correction "Lax -> Lachs" is affected by commit
|
||||
// https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
|
||||
// which caps the edit distance at (length(query_word) - 1). As a result, the
|
||||
// max edit distance parameter that we pass into get_spelling_suggestion() is
|
||||
// reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
|
||||
const auto xapianVersion = std::make_tuple(Xapian::major_version(),
|
||||
Xapian::minor_version(),
|
||||
Xapian::revision());
|
||||
if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
} else {
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
|
||||
}
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
|
||||
EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
|
||||
EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
|
||||
EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
|
||||
@@ -144,24 +130,26 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
// Exact match is not considered a spelling correction
|
||||
EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));
|
||||
|
||||
// Max edit distance is 3
|
||||
// Max edit distance can be quite large
|
||||
EXPECT_SPELLING_CORRECTION( "Führersch", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
|
||||
// Case matters in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führersc", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führ", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Füh", 1, ({}));
|
||||
// Case doesn't matter in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("führ", 1, ({"Führerschein"}));
|
||||
// Diacritics matter in edit distance
|
||||
EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
|
||||
// Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
|
||||
// because from the UTF-8 perspective it is a one-byte vs two-byte encoding
|
||||
// of a Unicode codepoint).
|
||||
EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Fuhr", 1, ({}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("führershine", 1, ({"Führerschein"}));
|
||||
|
||||
EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({"Führerschein"}));
|
||||
|
||||
// More than one spelling correction can be requested
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
EXPECT_SPELLING_CORRECTION("Kung", 3, ({"King", "Kong"}));
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Shortcomings of the proof-of-concept implementation
|
||||
@@ -171,10 +159,6 @@ void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
|
||||
EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
|
||||
EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));
|
||||
|
||||
// Only one spelling correction can be requested
|
||||
// EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
|
||||
EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);
|
||||
}
|
||||
|
||||
using StrCollection = std::vector<std::string>;
|
||||
|
||||
Reference in New Issue
Block a user