Compare commits

...

5 Commits

Author SHA1 Message Date
Matthieu Gautier
be6bdb66ea Adapt fuzzy_rules
Entries are now stored in the ZIM file without the `http://` prefix,
so there is no leading `//` to search for.
2023-05-27 10:14:28 +02:00
Matthieu Gautier
e1acac2bc7 Fuzzy match url in the server. 2023-05-26 15:57:36 +03:00
Matthieu Gautier
8b8a038f1f fixup! Move building a response from a path in a helper method. 2023-05-25 14:00:15 +03:00
Matthieu Gautier
6818c38ccb Remove accents when we do a search 2023-05-25 13:21:19 +03:00
Matthieu Gautier
d187409336 Move building a response from a path in a helper method. 2023-05-25 13:20:58 +03:00
3 changed files with 168 additions and 27 deletions

View File

@@ -360,7 +360,7 @@ zim::Query SearchInfo::getZimQuery(bool verbose) const {
if (verbose) {
std::cout << "Performing query '" << pattern<< "'";
}
query.setQuery(pattern);
query.setQuery(removeAccents(pattern));
if (geoQuery) {
if (verbose) {
std::cout << " with geo query '" << geoQuery.distance << "&(" << geoQuery.latitude << ";" << geoQuery.longitude << ")'";
@@ -1106,6 +1106,164 @@ InternalServer::build_redirect(const std::string& bookName, const zim::Item& ite
return Response::build_redirect(*this, url);
}
/**
 * Build the response serving the entry located at `path` inside `archive`.
 *
 * A redirect response (built for `bookName`) is returned when the entry found
 * is itself a redirect or when its path differs from the requested one.
 * Otherwise the item content is served, with the archive UUID used as ETag
 * body and, for anything but PDF content, the content security headers added.
 *
 * No exception is caught here: build_response_for_fuzzypath() relies on
 * zim::EntryNotFound propagating out of this function.
 */
std::unique_ptr<Response> InternalServer::build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
{
  auto entry = getEntryFromPath(archive, path);

  const bool mustRedirect = entry.isRedirect() || path != entry.getPath();
  if (mustRedirect) {
    // Besides genuine redirect entries, an entry whose URL differs from the
    // requested one may come back in the following situations:
    // 1. path is empty or equal to "/" and the ZIM file doesn't contain
    //    such an entry, in which case the main entry is returned instead.
    // 2. The ZIM file uses old namespace scheme, and the resource at path
    //    is not present but can be found under one of the 'A', 'I', 'J' or
    //    '-' namespaces, in which case that resource is returned instead.
    return build_redirect(bookName, getFinalItem(archive, entry));
  }

  if (m_verbose.load()) {
    printf("Found %s\n", entry.getPath().c_str());
    printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
  }

  auto itemResponse = ItemResponse::build(*this, request, entry.getItem());
  itemResponse->set_etag_body(std::string(archive.getUuid()));

  const bool servesPdf = startsWith(entry.getItem().getMimetype(), "application/pdf");
  if (!servesPdf) {
    // NOTE: Content security policy is not applied to PDF content so that
    // NOTE: it can be displayed in the viewer in Chromium-based browsers.
    itemResponse->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
    itemResponse->add_header("Referrer-Policy", "no-referrer");
  }
  return itemResponse;
}
/**
 * One fuzzy-matching rule applied to a queried URL.
 *
 * Instances are brace-initialised positionally (see FUZZY_RULES), so the
 * order of the members below must not change.
 */
struct FuzzyRule {
  // Regular expression tested against the queried URL (see get_rule()).
  std::string match;
  // Replacement pattern handed to replaceRegex() together with `match` to
  // build the canonical URL. When empty, the URL prefix is used instead.
  std::string fuzzyCannonReplace;
  // Separator used to cut the prefix out of the URL; empty means "?".
  std::string split;
  // When true the last occurrence of the separator is used, else the first.
  bool splitlast;
  // Alternative sets of query-parameter names. For each set, one candidate
  // URL is generated from the canonical URL plus those parameters.
  std::vector<std::vector<std::string>> args;
};

using FuzzyRules = std::vector<FuzzyRule>;

// Rules are tried in order by get_rule(); the first matching one wins.
const FuzzyRules FUZZY_RULES{
  {
    // NOTE(review): this rule still anchors on a leading scheme
    // ("https?://") whereas the other rules carry no scheme prefix; confirm
    // it can still match the URLs that are actually looked up.
    /*match:*/              "^(https?://(?:www\\.)?)(youtube\\.com/@[^?]+)[?].*",
    /*fuzzyCannonReplace:*/ "$1$2",
    /*split:*/              "",
    /*splitlast:*/          false,
    /*args:*/               {},
  },
  {
    /*match:*/              "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info)",
    /*fuzzyCannonReplace:*/ "youtube.fuzzy.replayweb.page/$1",
    /*split:*/              "",
    /*splitlast:*/          false,
    /*args:*/               {{"video_id"}},
  },
  {
    /*match:*/              "(?:www\\.)?youtube(?:-nocookie)?\\.com/(youtubei/v1/[^?]+\\?).*(videoId[^&]+).*",
    /*fuzzyCannonReplace:*/ "youtube.fuzzy.replayweb.page/$1$2",
    /*split:*/              "",
    /*splitlast:*/          false,
    /*args:*/               {{"videoId"}},
  },
  {
    // The dot of "googlevideo.com" is escaped so that it only matches a
    // literal '.', consistently with the host names of the other rules.
    /*match:*/              ".*googlevideo\\.com/(videoplayback)",
    /*fuzzyCannonReplace:*/ "youtube.fuzzy.replayweb.page/$1",
    /*split:*/              "",
    /*splitlast:*/          false,
    /*args:*/               {
      {"id", "itag"},
      {"id"}
    },
  },
};
/**
 * Return a copy of the first rule of FUZZY_RULES whose `match` expression
 * matches `path`.
 *
 * A "?" is appended to `path` beforehand when it contains none, so that the
 * expressions expecting a query separator can still be tested against it.
 *
 * @throws std::runtime_error("No Rule") when no rule matches.
 */
FuzzyRule get_rule(std::string path) {
  const bool hasQuerySeparator = path.find("?") != std::string::npos;
  if (!hasQuerySeparator) {
    path.append("?");
  }

  for (auto candidate = FUZZY_RULES.begin(); candidate != FUZZY_RULES.end(); ++candidate) {
    std::cout << "try to match " << candidate->match << std::endl;
    if (!matchRegex(path, candidate->match)) {
      continue;
    }
    return *candidate;
  }

  throw std::runtime_error("No Rule");
}
std::vector<std::string> gen_fuzzy_urls(const RequestContext& request, const std::string& path) {
std::vector<std::string> fuzzy_urls;
// First of all, add the query_string
auto url_queried = path + "?" + request.get_query();
fuzzy_urls.push_back(url_queried);
try {
auto rule = get_rule(url_queried);
std::cout << "Matching rule : " << rule.match << std::endl;
std::string sep = rule.split.size() != 0 ? rule.split : "?";
auto split_idx = rule.splitlast ? url_queried.rfind(sep) : url_queried.find(sep);
auto prefix = split_idx == std::string::npos ? url_queried: url_queried.substr(0, split_idx+sep.size());
std::cout << "Prefix is : " << prefix << std::endl;
std::string fuzzy_cannon_url;
if (rule.fuzzyCannonReplace.size() != 0) {
std::cout << "replace " << rule.match << " with " << rule.fuzzyCannonReplace << std::endl;
fuzzy_cannon_url = replaceRegex(url_queried, rule.fuzzyCannonReplace, rule.match);
} else {
fuzzy_cannon_url = prefix;
}
// remove querystring from fuzzy_cannon_url.
split_idx = fuzzy_cannon_url.find("?");
fuzzy_cannon_url = fuzzy_cannon_url.substr(0, split_idx);
std::cout << "fuzzy_cannon_url is : " << fuzzy_cannon_url << std::endl;
fuzzy_urls.push_back(fuzzy_cannon_url);
for (auto args: rule.args) {
std::stringstream query;
std::string sep="?";
for (auto arg: args) {
query << sep << arg << "=" << request.get_optional_param(arg, std::string());
sep = "&";
}
fuzzy_urls.push_back(fuzzy_cannon_url+query.str());
}
} catch(const std::runtime_error&) {
auto split_idx = url_queried.find("?");
fuzzy_urls.push_back(split_idx == std::string::npos ? url_queried : url_queried.substr(0, split_idx+1));
}
return fuzzy_urls;
}
/**
 * Build the response for `path`, falling back on fuzzy matching.
 *
 * The exact `path` is tried first. If it is not found, every URL produced by
 * gen_fuzzy_urls() is tried in order and the first one found is served.
 *
 * @throws zim::EntryNotFound when neither `path` nor any fuzzy URL is found.
 */
std::unique_ptr<Response> InternalServer::build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
{
  std::cout << "Try url : " << path << std::endl;
  try {
    return build_response_for_path(request, archive, bookName, path);
  } catch (const zim::EntryNotFound&) {
    // Exact lookup failed: we have to do fuzzy matching below.
  }

  const std::vector<std::string> candidates = gen_fuzzy_urls(request, path);
  for (const auto& candidate : candidates) {
    std::cout << "Try fuzzy url : " << candidate << std::endl;
    try {
      return build_response_for_path(request, archive, bookName, candidate);
    } catch (const zim::EntryNotFound&) {
      // Not this one, move on to the next candidate.
    }
  }

  // No fuzzy path matches,
  std::cout << "Not found, sorry.." << std::endl;
  throw zim::EntryNotFound("No fuzzy rule matches.");
}
std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& request)
{
const std::string url = request.get_url();
@@ -1143,32 +1301,7 @@ std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& r
}
try {
auto entry = getEntryFromPath(*archive, urlStr);
if (entry.isRedirect() || urlStr != entry.getPath()) {
// In the condition above, the second case (an entry with a different
// URL was returned) can occur in the following situations:
// 1. urlStr is empty or equal to "/" and the ZIM file doesn't contain
// such an entry, in which case the main entry is returned instead.
// 2. The ZIM file uses old namespace scheme, and the resource at urlStr
// is not present but can be found under one of the 'A', 'I', 'J' or
// '-' namespaces, in which case that resource is returned instead.
return build_redirect(bookName, getFinalItem(*archive, entry));
}
auto response = ItemResponse::build(*this, request, entry.getItem());
response->set_etag_body(archiveUuid);
if ( !startsWith(entry.getItem().getMimetype(), "application/pdf") ) {
// NOTE: Content security policy is not applied to PDF content so that
// NOTE: it can be displayed in the viewer in Chromium-based browsers.
response->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
response->add_header("Referrer-Policy", "no-referrer");
}
if (m_verbose.load()) {
printf("Found %s\n", entry.getPath().c_str());
printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
}
auto response = build_response_for_fuzzypath(request, *archive, bookName, urlStr);
return response;
} catch(zim::EntryNotFound& e) {
if (m_verbose.load())

View File

@@ -123,6 +123,8 @@ class InternalServer {
std::unique_ptr<Response> handle_request(const RequestContext& request);
std::unique_ptr<Response> build_redirect(const std::string& bookName, const zim::Item& item) const;
std::unique_ptr<Response> build_homepage(const RequestContext& request);
// Builds the response for the entry found at `path` in `archive`: a redirect
// when the entry is a redirect or its path differs from `path`, the item
// content otherwise.
std::unique_ptr<Response> build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
// Same as build_response_for_path(), but when `path` is not found the fuzzy
// URLs derived from `path` and the request are tried in turn.
// Throws zim::EntryNotFound when none of them is found either.
std::unique_ptr<Response> build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
std::unique_ptr<Response> handle_viewer_settings(const RequestContext& request);
std::unique_ptr<Response> handle_skin(const RequestContext& request);
std::unique_ptr<Response> handle_catalog(const RequestContext& request);

View File

@@ -122,6 +122,12 @@ zim::Entry getEntryFromPath(const zim::Archive& archive, const std::string& path
if (path.empty() || path == "/") {
return archive.getMainEntry();
}
std::cout << "Search for H/"<<path << std::endl;
auto entry = archive.getEntryByPath("H/"+path);
while (entry.isRedirect()) {
entry = entry.getRedirectEntry();
}
return entry;
}
throw zim::EntryNotFound("Cannot find entry for non empty path");
}