mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-12-23 22:47:57 -05:00
Merge pull request #1230 from kiwix/spelling_correction
Spelling correction of full titles
This commit is contained in:
@@ -7,6 +7,7 @@ headers = [
|
|||||||
'downloader.h',
|
'downloader.h',
|
||||||
'search_renderer.h',
|
'search_renderer.h',
|
||||||
'server.h',
|
'server.h',
|
||||||
|
'spelling_correction.h',
|
||||||
'kiwixserve.h',
|
'kiwixserve.h',
|
||||||
'name_mapper.h',
|
'name_mapper.h',
|
||||||
'tools.h',
|
'tools.h',
|
||||||
|
|||||||
58
include/spelling_correction.h
Normal file
58
include/spelling_correction.h
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2025 Veloman Yunkan
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 3 of the License, or
|
||||||
|
* any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
* MA 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef KIWIX_SPELLING_CORRECTION_H
|
||||||
|
#define KIWIX_SPELLING_CORRECTION_H
|
||||||
|
|
||||||
|
#include <cstdint>
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
|
||||||
|
|
||||||
|
namespace zim
|
||||||
|
{
|
||||||
|
class Archive;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace Xapian
|
||||||
|
{
|
||||||
|
class Database;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace kiwix
|
||||||
|
{
|
||||||
|
|
||||||
|
class SpellingsDB
|
||||||
|
{
|
||||||
|
public: // functions
|
||||||
|
SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath);
|
||||||
|
~SpellingsDB();
|
||||||
|
|
||||||
|
SpellingsDB(const SpellingsDB& ) = delete;
|
||||||
|
void operator=(const SpellingsDB& ) = delete;
|
||||||
|
|
||||||
|
std::vector<std::string> getSpellingCorrections(const std::string& word, uint32_t maxCount) const;
|
||||||
|
|
||||||
|
private: // data
|
||||||
|
std::unique_ptr<Xapian::Database> impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace kiwix
|
||||||
|
|
||||||
|
#endif // KIWIX_SPELLING_CORRECTION_H
|
||||||
@@ -31,6 +31,7 @@ kiwix_sources = [
|
|||||||
'server/internalServer_catalog.cpp',
|
'server/internalServer_catalog.cpp',
|
||||||
'server/i18n.cpp',
|
'server/i18n.cpp',
|
||||||
'opds_catalog.cpp',
|
'opds_catalog.cpp',
|
||||||
|
'spelling_correction.cpp',
|
||||||
'version.cpp'
|
'version.cpp'
|
||||||
]
|
]
|
||||||
kiwix_sources += lib_resources
|
kiwix_sources += lib_resources
|
||||||
|
|||||||
108
src/spelling_correction.cpp
Normal file
108
src/spelling_correction.cpp
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2025 Veloman Yunkan
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 3 of the License, or
|
||||||
|
* any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
* MA 02110-1301, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "spelling_correction.h"
|
||||||
|
#include "zim/archive.h"
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
|
namespace kiwix
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
// Gathers the titles of all entries visited when iterating over the archive
// by path. Titles are returned in iteration order.
std::vector<std::string> getAllTitles(const zim::Archive& a)
{
  std::vector<std::string> titles;
  for (const auto& zimEntry : a.iterByPath()) {
    titles.emplace_back(zimEntry.getTitle());
  }
  return titles;
}
|
||||||
|
|
||||||
|
// Builds the spellings database of `archive` as a single-file Xapian DB at
// `path`.
//
// The titles are first added to a temporary writable glass database located
// at `path + ".tmp"`, which is then compacted into the final single-file
// database and removed.
//
// Exceptions raised by Xapian or by the final removal of the temporary
// database propagate to the caller.
void createXapianDB(std::string path, const zim::Archive& archive)
{
  const auto tmpDbPath = path + ".tmp";

  // Best-effort removal of the temporary database on every exit path, so
  // that a failure while indexing or compacting doesn't leave it behind.
  // Declared before `db` so that the database is closed (destroyed) before
  // its files are deleted.
  struct TmpDbRemover
  {
    const std::string& tmpPath;

    ~TmpDbRemover()
    {
      std::error_code ignoredError;
      std::filesystem::remove_all(tmpPath, ignoredError);
    }
  } tmpDbRemover{tmpDbPath};

  // DB_CREATE_OR_OVERWRITE (rather than DB_CREATE) so that leftovers of a
  // previously interrupted run don't make every subsequent attempt fail.
  const int flags = Xapian::DB_BACKEND_GLASS|Xapian::DB_CREATE_OR_OVERWRITE;
  Xapian::WritableDatabase db(tmpDbPath, flags);
  for (const auto& t : getAllTitles(archive)) {
    db.add_spelling(t);
  }
  db.commit();
  db.compact(path, Xapian::DBCOMPACT_SINGLE_FILE);
  db.close();

  // On success a failure to remove the temporary database is still reported.
  std::filesystem::remove_all(tmpDbPath);
}
|
||||||
|
|
||||||
|
// Returns the path, inside `cacheDirPath`, of the spellings database file
// belonging to the given archive. The file name is made of the archive's
// UUID followed by a suffix carrying the spellings DB version.
std::string spellingsDBPathForZIMArchive(std::filesystem::path cacheDirPath, const zim::Archive& a)
{
  // Bump the spellings DB version whenever the implementation changes in a
  // way that makes databases of the previous version unusable or
  // undesirable.
  const char SPELLINGS_DB_VERSION[] = "0.1";

  std::ostringstream dbFileName;
  dbFileName << a.getUuid();
  dbFileName << ".spellingsdb.v";
  dbFileName << SPELLINGS_DB_VERSION;

  cacheDirPath /= dbFileName.str();
  return cacheDirPath.string();
}
|
||||||
|
|
||||||
|
// Opens the spellings database of `archive` stored under `cacheDirPath`.
// If Xapian reports that the database cannot be opened, the database is
// created from the archive and then opened.
std::unique_ptr<Xapian::Database> openOrCreateXapianDB(std::filesystem::path cacheDirPath, const zim::Archive& archive)
{
  const std::string dbPath = spellingsDBPathForZIMArchive(cacheDirPath, archive);
  const auto openDb = [&dbPath]() {
    return std::make_unique<Xapian::Database>(dbPath);
  };

  try {
    return openDb();
  } catch (const Xapian::DatabaseOpeningError& ) {
    // Fall through to (re)creating the database below.
  }

  createXapianDB(dbPath, archive);
  return openDb();
}
|
||||||
|
|
||||||
|
} // unnamed namespace
|
||||||
|
|
||||||
|
SpellingsDB::SpellingsDB(const zim::Archive& archive, std::filesystem::path cacheDirPath)
|
||||||
|
: impl_(openOrCreateXapianDB(cacheDirPath, archive))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// Defined in this file, where Xapian::Database is a complete type, so that
// the owning smart pointer can destroy it.
SpellingsDB::~SpellingsDB() = default;
|
||||||
|
|
||||||
|
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
|
||||||
|
{
|
||||||
|
if ( maxCount > 1 ) {
|
||||||
|
throw std::runtime_error("More than one spelling correction was requested");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> result;
|
||||||
|
const auto term = impl_->get_spelling_suggestion(word, 3);
|
||||||
|
if ( !term.empty() ) {
|
||||||
|
result.push_back(term);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace kiwix
|
||||||
143
test/data/create_zim_file_for_testing_spelling_correction
Executable file
143
test/data/create_zim_file_for_testing_spelling_correction
Executable file
@@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Work from the directory containing this script, so that the ZIM file is
# (re)created next to it regardless of where the script is invoked from.
mydir=$(readlink -f "$(dirname "$0")")
myname=$(basename "$0")
# Abort rather than delete/create files in an unexpected directory.
cd "$mydir" || exit 1

zimfilename='spelling_correction_test.zim'

# Start from scratch: drop the ZIM file produced by a previous run.
rm -f "$zimfilename"

# Scratch directory for the generated HTML files. The template is quoted so
# that a script name containing whitespace doesn't split into several
# arguments; without a scratch directory there is no point in continuing.
datadir=$(mktemp -d --tmpdir "$myname.XXXXXX") || exit 1
|
||||||
|
# Removes the scratch directory with the generated HTML files.
cleanup()
{
  rm -rf "$datadir"
}

# Clean up when the script exits as well as on the listed signals.
trap cleanup EXIT SIGINT SIGQUIT SIGHUP SIGTERM
|
||||||
|
|
||||||
|
# Usage: generate_html_file WORD
#
# Writes WORD.html into the current directory: a minimal HTML page titled
# WORD whose body spells out the letters of WORD.
generate_html_file()
{
  local word="$1"
  local last=$(( ${#word} - 1 ))
  local letters=""
  local pos ch

  # Build a list like "'a', 'b', and 'c'" from the characters of the word.
  for (( pos = 0; pos <= last; pos++ ))
  do
    ch=${word:pos:1}
    if (( pos < last ))
    then
      letters+="'$ch', "
    else
      letters+="and '$ch'"
    fi
  done

  cat >"$word".html <<END
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>$word</title>
</head>
<body>
<p>'$word' is a word consisting of the letters $letters.</p>
</body>
</html>
END
}
|
||||||
|
|
||||||
|
# Populates "$datadir" with the input of the ZIM file: the favicon and one
# HTML page per title listed below.
generate_zim_file_data()
{
  local -a page_titles=(
    "Abenteuer"
    "Applaus"
    "Assistent"
    "Attacke"
    "Bewandtnis"
    "Biene"
    "Botschafter"
    "Chaos"
    "Entgelt"
    "Entzündung"
    "Fahrradschloss"
    "Führerschein"
    "Gral"
    "Hierarchie"
    "Honig"
    "Impfung"
    "Kamera"
    "Konkurrenz"
    "Lachs"
    "Mond"
    "Pflaster"
    "Phänomen"
    "Prise"
    "Schirmmütze"
    "Sohn"
    "Stuhl"
    "Teller"
    "Thermoskanne"
    "Trog"
    "Umweltstandard"
    "Unfug"
    "Wohnzimmer"
    "Zunge"
    "aber"
    "abonnieren"
    "amtieren"
    "attestieren"
    "ausleeren"
    "beißen"
    "ebenfalls"
    "enttäuschen"
    "fort"
    "gefleckt"
    "gefährlich"
    "gestern"
    "gewähren"
    "hässlich"
    "konkurrieren"
    "kämmen"
    "lustig"
    "müssen"
    "nämlich"
    "runterfallen"
    "sanft"
    "schubsen"
    "seit"
    "vorgestern"
    "wahrscheinlich"

    "Willkommen"

    # Entries for demonstrating shortcomings of the PoC implementation
    "Lorem ipsum"
    "King"
    "Kong"
  )

  local title
  # The subshell keeps the directory change local to this function.
  (
    cd "$datadir"
    cp "$mydir"/../../static/skin/favicon/favicon-32x32.png favicon.png
    for title in "${page_titles[@]}"
    do
      generate_html_file "$title"
    done
  )
}
|
||||||
|
|
||||||
|
# Generate the input data, then pack it into the ZIM file (without a
# full-text index). Willkommen.html serves as the welcome page.
generate_zim_file_data

if zimwriterfs --withoutFTIndex --dont-check-arguments \
               -w Willkommen.html \
               -I favicon.png \
               -l deu \
               -n spelling_correction_test \
               -t "Spelling corrections test" \
               -d "ZIM file for testing spelling corrections" \
               -c "Kiwix" \
               -p "Kiwix" \
               "$datadir" \
               "$zimfilename"
then
  echo "$zimfilename was successfully created"
else
  echo '!!! Failed to create' "$zimfilename" '!!!' >&2
  # Report the failure through the exit status too, not only via the message.
  exit 1
fi
|
||||||
BIN
test/data/spelling_correction_test.zim
Normal file
BIN
test/data/spelling_correction_test.zim
Normal file
Binary file not shown.
@@ -15,7 +15,8 @@ tests = [
|
|||||||
'server_helper',
|
'server_helper',
|
||||||
'lrucache',
|
'lrucache',
|
||||||
'i18n',
|
'i18n',
|
||||||
'response'
|
'response',
|
||||||
|
'spelling_correction'
|
||||||
]
|
]
|
||||||
|
|
||||||
if build_machine.system() != 'windows'
|
if build_machine.system() != 'windows'
|
||||||
@@ -42,6 +43,7 @@ if gtest_dep.found() and not meson.is_cross_build()
|
|||||||
'zimfile_raycharles_uncategorized.zim',
|
'zimfile_raycharles_uncategorized.zim',
|
||||||
'corner_cases#&.zim',
|
'corner_cases#&.zim',
|
||||||
'poor.zim',
|
'poor.zim',
|
||||||
|
'spelling_correction_test.zim',
|
||||||
'library.xml',
|
'library.xml',
|
||||||
'lib_for_server_search_test.xml',
|
'lib_for_server_search_test.xml',
|
||||||
'customized_resources.txt',
|
'customized_resources.txt',
|
||||||
|
|||||||
217
test/spelling_correction.cpp
Normal file
217
test/spelling_correction.cpp
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2025 Veloman Yunkan
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License as
|
||||||
|
* published by the Free Software Foundation; either version 2 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but
|
||||||
|
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||||
|
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||||
|
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include "../include/spelling_correction.h"
|
||||||
|
#include "../src/tools/pathTools.h"
|
||||||
|
#include "zim/archive.h"
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
|
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
|
const std::string TEST_DB_PATH = "./spellings.db";
|
||||||
|
|
||||||
|
class SpellingCorrectionTest : public ::testing::Test
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
void SetUp() override {
|
||||||
|
tmpDirPath = makeTmpDirectory();
|
||||||
|
archive = std::make_unique<zim::Archive>("./test/spelling_correction_test.zim");
|
||||||
|
}
|
||||||
|
|
||||||
|
void TearDown() override {
|
||||||
|
std::filesystem::remove_all(tmpDirPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::filesystem::path tmpDirPath;
|
||||||
|
std::unique_ptr<zim::Archive> archive;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Checks the spelling corrections produced by `spellingsDB` (which must have
// been built from the spelling correction test ZIM file) for a collection of
// misspelled German words as well as for a number of edge cases.
void testSpellingCorrections(const kiwix::SpellingsDB& spellingsDB)
{
// The expected result is passed parenthesized so that a braced list
// containing commas is handled as a single macro argument.
#define EXPECT_SPELLING_CORRECTION(query, maxSuggestions, parenthesizedExpectedResult) \
  EXPECT_EQ(                                                                    \
      spellingsDB.getSpellingCorrections(query, maxSuggestions),                \
      std::vector<std::string> parenthesizedExpectedResult                      \
  )

  EXPECT_SPELLING_CORRECTION("", 1, ({}));

  EXPECT_SPELLING_CORRECTION("geflekt", 1, ({"gefleckt"}));
  EXPECT_SPELLING_CORRECTION("Teler", 1, ({"Teller"}));
  EXPECT_SPELLING_CORRECTION("kämen", 1, ({"kämmen"}));
  EXPECT_SPELLING_CORRECTION("abonieren", 1, ({"abonnieren"}));
  EXPECT_SPELLING_CORRECTION("abbonnieren", 1, ({"abonnieren"}));
  EXPECT_SPELLING_CORRECTION("abbonieren", 1, ({"abonnieren"}));
  EXPECT_SPELLING_CORRECTION("Aplaus", 1, ({"Applaus"}));
  EXPECT_SPELLING_CORRECTION("konkurieren", 1, ({"konkurrieren"}));
  EXPECT_SPELLING_CORRECTION("Asisstent", 1, ({"Assistent"}));
  EXPECT_SPELLING_CORRECTION("Assisstent", 1, ({"Assistent"}));
  EXPECT_SPELLING_CORRECTION("Atacke", 1, ({"Attacke"}));
  EXPECT_SPELLING_CORRECTION("atestieren", 1, ({"attestieren"}));
  EXPECT_SPELLING_CORRECTION("entäuschen", 1, ({"enttäuschen"}));
  EXPECT_SPELLING_CORRECTION("Enzündung", 1, ({"Entzündung"}));
  EXPECT_SPELLING_CORRECTION("Schirmütze", 1, ({"Schirmmütze"}));
  EXPECT_SPELLING_CORRECTION("Termoskanne", 1, ({"Thermoskanne"}));
  EXPECT_SPELLING_CORRECTION("Tsunge", 1, ({"Zunge"}));
  EXPECT_SPELLING_CORRECTION("vort", 1, ({"fort"}));
  EXPECT_SPELLING_CORRECTION("Schtuhl", 1, ({"Stuhl"}));
  EXPECT_SPELLING_CORRECTION("beissen", 1, ({"beißen"}));
  EXPECT_SPELLING_CORRECTION("Camera", 1, ({"Kamera"}));
  EXPECT_SPELLING_CORRECTION("Kaos", 1, ({"Chaos"}));

  // The spelling correction "Lax -> Lachs" is affected by commit
  // https://github.com/xapian/xapian/commit/0cbe35de5c392623388946e6769aa03f912fdde4
  // which caps the edit distance at (length(query_word) - 1). As a result, the
  // max edit distance parameter that we pass into get_spelling_suggestion() is
  // reduced from 3 to 2 and is below the edit distance of "Lachs" from "Lax".
  const auto xapianVersion = std::make_tuple(Xapian::major_version(),
                                             Xapian::minor_version(),
                                             Xapian::revision());
  if ( xapianVersion < std::make_tuple(1, 4, 19) ) {
    EXPECT_SPELLING_CORRECTION("Lax", 1, ({"Lachs"}));
  } else {
    EXPECT_SPELLING_CORRECTION("Lax", 1, ({}));
  }

  EXPECT_SPELLING_CORRECTION("Mont", 1, ({"Mond"}));
  EXPECT_SPELLING_CORRECTION("Umweltstandart", 1, ({"Umweltstandard"}));
  EXPECT_SPELLING_CORRECTION("seid", 1, ({"seit"}));
  EXPECT_SPELLING_CORRECTION("Trok", 1, ({"Trog"}));
  EXPECT_SPELLING_CORRECTION("Unfuk", 1, ({"Unfug"}));
  EXPECT_SPELLING_CORRECTION("schupsen", 1, ({"schubsen"}));
  EXPECT_SPELLING_CORRECTION("warscheinlich", 1, ({"wahrscheinlich"}));
  EXPECT_SPELLING_CORRECTION("gefärlich", 1, ({"gefährlich"}));
  EXPECT_SPELLING_CORRECTION("Son", 1, ({"Sohn"}));
  EXPECT_SPELLING_CORRECTION("nähmlich", 1, ({"nämlich"}));
  EXPECT_SPELLING_CORRECTION("Grahl", 1, ({"Gral"}));
  EXPECT_SPELLING_CORRECTION("Bine", 1, ({"Biene"}));
  EXPECT_SPELLING_CORRECTION("Hirarchie", 1, ({"Hierarchie"}));
  EXPECT_SPELLING_CORRECTION("Priese", 1, ({"Prise"}));
  EXPECT_SPELLING_CORRECTION("auslehren", 1, ({"ausleeren"}));
  EXPECT_SPELLING_CORRECTION("Phenomen", 1, ({"Phänomen"}));
  EXPECT_SPELLING_CORRECTION("Phänomän", 1, ({"Phänomen"}));
  EXPECT_SPELLING_CORRECTION("Phenomän", 1, ({"Phänomen"}));
  EXPECT_SPELLING_CORRECTION("gewehren", 1, ({"gewähren"}));
  EXPECT_SPELLING_CORRECTION("aba", 1, ({"aber"}));
  EXPECT_SPELLING_CORRECTION("gestan", 1, ({"gestern"}));
  EXPECT_SPELLING_CORRECTION("ronterfallen", 1, ({"runterfallen"}));
  EXPECT_SPELLING_CORRECTION("Hönig", 1, ({"Honig"}));
  EXPECT_SPELLING_CORRECTION("mussen", 1, ({"müssen"}));
  EXPECT_SPELLING_CORRECTION("Bewandnis", 1, ({"Bewandtnis"}));
  EXPECT_SPELLING_CORRECTION("hässlig", 1, ({"hässlich"}));
  EXPECT_SPELLING_CORRECTION("lustich", 1, ({"lustig"}));
  EXPECT_SPELLING_CORRECTION("Botschaftler", 1, ({"Botschafter"}));
  EXPECT_SPELLING_CORRECTION("ebemfalls", 1, ({"ebenfalls"}));
  EXPECT_SPELLING_CORRECTION("samft", 1, ({"sanft"}));
  EXPECT_SPELLING_CORRECTION("Wohenzimmer", 1, ({"Wohnzimmer"}));
  EXPECT_SPELLING_CORRECTION("Flaster", 1, ({"Pflaster"}));
  EXPECT_SPELLING_CORRECTION("Imfung", 1, ({"Impfung"}));
  EXPECT_SPELLING_CORRECTION("amptieren", 1, ({"amtieren"}));
  EXPECT_SPELLING_CORRECTION("Endgeld", 1, ({"Entgelt"}));
  EXPECT_SPELLING_CORRECTION("Abendteuer", 1, ({"Abenteuer"}));
  EXPECT_SPELLING_CORRECTION("sampft", 1, ({"sanft"}));
  EXPECT_SPELLING_CORRECTION("forgestan", 1, ({"vorgestern"}));
  EXPECT_SPELLING_CORRECTION("Füreschein", 1, ({"Führerschein"}));
  EXPECT_SPELLING_CORRECTION("ronterfalen", 1, ({"runterfallen"}));
  EXPECT_SPELLING_CORRECTION("Farradschluss", 1, ({"Fahrradschloss"}));
  EXPECT_SPELLING_CORRECTION("Konkorenz", 1, ({"Konkurrenz"}));
  EXPECT_SPELLING_CORRECTION("Hirachie", 1, ({"Hierarchie"}));

  //////////////////////////////////////////////////////////////////////////////
  // Edge cases
  //////////////////////////////////////////////////////////////////////////////

  // Exact match is not considered a spelling correction
  EXPECT_SPELLING_CORRECTION("Führerschein", 1, ({}));

  // Max edit distance is 3
  EXPECT_SPELLING_CORRECTION("Führersch", 1, ({"Führerschein"}));
  EXPECT_SPELLING_CORRECTION("Führersc", 1, ({}));
  // Case matters in edit distance
  EXPECT_SPELLING_CORRECTION("führersch", 1, ({}));
  // Diacritics matter in edit distance
  EXPECT_SPELLING_CORRECTION("Fuhrersch", 1, ({}));
  // Mismatch in diacritics counts as 1 in edit distance (this is not trivial,
  // because from the UTF-8 perspective it is a one-byte vs two-byte encoding
  // of a Unicode codepoint).
  EXPECT_SPELLING_CORRECTION("Führersche", 1, ({"Führerschein"}));

  EXPECT_SPELLING_CORRECTION("Führershine", 1, ({"Führerschein"}));
  EXPECT_SPELLING_CORRECTION("Führershyne", 1, ({}));
  EXPECT_SPELLING_CORRECTION("führershine", 1, ({}));

  EXPECT_SPELLING_CORRECTION("Führerschrom", 1, ({"Führerschein"}));
  EXPECT_SPELLING_CORRECTION("Führerscdrom", 1, ({}));

  //////////////////////////////////////////////////////////////////////////////
  // Shortcomings of the proof-of-concept implementation
  //////////////////////////////////////////////////////////////////////////////

  // Multiword titles are treated as a single entity
  EXPECT_SPELLING_CORRECTION("Laurem", 1, ({}));
  EXPECT_SPELLING_CORRECTION("ibsum", 1, ({}));
  EXPECT_SPELLING_CORRECTION("Loremipsum", 1, ({"Lorem ipsum"}));

  // Only one spelling correction can be requested
  // EXPECT_SPELLING_CORRECTION("Kung", 2, ({"King", "Kong"}));
  EXPECT_THROW(spellingsDB.getSpellingCorrections("Kung", 2), std::runtime_error);

// The helper macro is only meaningful inside this function; don't let it
// leak into the rest of the translation unit.
#undef EXPECT_SPELLING_CORRECTION
}
|
||||||
|
|
||||||
|
using StrCollection = std::vector<std::string>;

// Returns the paths (as strings) of the immediate entries of the directory
// `dirPath`, in the order in which the directory iterator yields them.
StrCollection directoryEntries(std::filesystem::path dirPath)
{
  StrCollection entryPaths;
  const std::filesystem::directory_iterator end;
  for ( std::filesystem::directory_iterator it(dirPath); it != end; ++it ) {
    entryPaths.push_back(it->path().string());
  }
  return entryPaths;
}
|
||||||
|
|
||||||
|
// Runs the spelling correction checks twice: first against a spellings
// database that has to be created in the (initially empty) temporary
// directory, then against a second SpellingsDB instance, verifying that
// neither the directory nor the database file is modified the second time.
TEST_F(SpellingCorrectionTest, allInOne)
{
  namespace fs = std::filesystem;

  const auto openDbAndCheckCorrections = [this]() {
    const kiwix::SpellingsDB spellingsDB(*archive, tmpDirPath);
    testSpellingCorrections(spellingsDB);
  };

  const auto dirModTimeBefore = fs::last_write_time(tmpDirPath);
  ASSERT_TRUE(directoryEntries(tmpDirPath).empty());

  // First pass: the database doesn't exist yet
  openDbAndCheckCorrections();

  const auto dirModTimeAfterCreation = fs::last_write_time(tmpDirPath);

  const fs::path spellingsDbPath = tmpDirPath / "554c9707-897e-097a-53ba-1b1306d8bb88.spellingsdb.v0.1";

  // The database file must be the only thing in the directory
  const StrCollection EXPECTED_DIR_CONTENT{ spellingsDbPath.string() };
  ASSERT_EQ(directoryEntries(tmpDirPath), EXPECTED_DIR_CONTENT);
  ASSERT_LT(dirModTimeBefore, dirModTimeAfterCreation);
  const auto dbFileModTime = fs::last_write_time(spellingsDbPath);

  // Second pass: the database is already in place
  openDbAndCheckCorrections();

  ASSERT_EQ(directoryEntries(tmpDirPath), EXPECTED_DIR_CONTENT );
  ASSERT_EQ(dirModTimeAfterCreation, fs::last_write_time(tmpDirPath));
  ASSERT_EQ(dbFileModTime, fs::last_write_time(spellingsDbPath));
}
|
||||||
Reference in New Issue
Block a user