|
|
|
@@ -360,7 +360,7 @@ zim::Query SearchInfo::getZimQuery(bool verbose) const {
|
|
|
|
|
if (verbose) {
|
|
|
|
|
std::cout << "Performing query '" << pattern<< "'";
|
|
|
|
|
}
|
|
|
|
|
query.setQuery(pattern);
|
|
|
|
|
query.setQuery(removeAccents(pattern));
|
|
|
|
|
if (geoQuery) {
|
|
|
|
|
if (verbose) {
|
|
|
|
|
std::cout << " with geo query '" << geoQuery.distance << "&(" << geoQuery.latitude << ";" << geoQuery.longitude << ")'";
|
|
|
|
@@ -1106,6 +1106,164 @@ InternalServer::build_redirect(const std::string& bookName, const zim::Item& ite
|
|
|
|
|
return Response::build_redirect(*this, url);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the response for the entry found at `path` in `archive`.
 *
 * When the lookup yields a redirect entry, or an entry whose path differs
 * from the requested one, the result is a redirect to the final item.
 * Otherwise the item itself is served, with the archive UUID used as ETag
 * body and (for anything but PDF content) the content security headers set.
 *
 * Exceptions raised by the entry lookup are not caught here.
 * NOTE(review): presumably zim::EntryNotFound when nothing matches - confirm
 * against getEntryFromPath().
 */
std::unique_ptr<Response> InternalServer::build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
{
  auto entry = getEntryFromPath(archive, path);

  // The entry can be served directly only if it is a real item stored under
  // exactly the requested path. A different path can come back when:
  //   1. path is empty or equal to "/" and the ZIM file doesn't contain such
  //      an entry, in which case the main entry is returned instead.
  //   2. The ZIM file uses the old namespace scheme, and the resource at path
  //      is not present but can be found under one of the 'A', 'I', 'J' or
  //      '-' namespaces, in which case that resource is returned instead.
  const bool servedDirectly = !entry.isRedirect() && path == entry.getPath();
  if (!servedDirectly) {
    return build_redirect(bookName, getFinalItem(archive, entry));
  }

  if (m_verbose.load()) {
    printf("Found %s\n", entry.getPath().c_str());
    printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
  }

  const auto item = entry.getItem();
  auto response = ItemResponse::build(*this, request, item);
  response->set_etag_body(std::string(archive.getUuid()));

  // NOTE: Content security policy is not applied to PDF content so that
  // NOTE: it can be displayed in the viewer in Chromium-based browsers.
  const bool isPdf = startsWith(item.getMimetype(), "application/pdf");
  if (!isPdf) {
    response->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
    response->add_header("Referrer-Policy", "no-referrer");
  }

  return response;
}
|
|
|
|
|
|
|
|
|
|
/**
 * One URL fuzzy-matching rule.
 *
 * This is a plain aggregate: it is brace-initialised member by member (see
 * FUZZY_RULES), so the order of the fields below must not change and no
 * constructor may be added.
 */
struct FuzzyRule {
  // Regular expression tested against the queried URL by get_rule(), and
  // used as the pattern when rewriting the URL in gen_fuzzy_urls().
  std::string match;

  // Replacement string applied with `match` to build the canonical URL.
  // When empty, the canonical URL is a prefix of the queried URL cut at
  // `split` instead.
  // (The "Cannon" spelling is kept as is: other code refers to this name.)
  std::string fuzzyCannonReplace;

  // Separator at which the queried URL is cut when `fuzzyCannonReplace` is
  // empty. An empty value means "?".
  std::string split;

  // If true, cut at the last occurrence of the separator rather than the
  // first one.
  bool splitlast;

  // Groups of query parameter names. For each group, one extra candidate URL
  // is generated from the canonical URL plus those parameters (with the
  // values found in the request).
  std::vector<std::vector<std::string>> args;
};

// Ordered collection of fuzzy rules.
using FuzzyRules = std::vector<FuzzyRule>;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Built-in fuzzy rules. get_rule() walks this table in order and returns the
// first rule whose `match` pattern matches, so more specific rules must come
// before more general ones.
//
// Each entry lists the FuzzyRule members positionally:
//   match, fuzzyCannonReplace, split, splitlast, args
const FuzzyRules FUZZY_RULES{
  { // YouTube channel pages ("youtube.com/@name?..."): drop the query string.
    /* match              */ "^(https?://(?:www\\.)?)(youtube\\.com/@[^?]+)[?].*",
    /* fuzzyCannonReplace */ "$1$2",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {},
  },
  { // YouTube get_video_info, keyed on video_id.
    /* match              */ "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info)",
    /* fuzzyCannonReplace */ "youtube.fuzzy.replayweb.page/$1",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {{"video_id"}},
  },
  { // YouTube "youtubei/v1" API calls, keyed on videoId.
    /* match              */ "(?:www\\.)?youtube(?:-nocookie)?\\.com/(youtubei/v1/[^?]+\\?).*(videoId[^&]+).*",
    /* fuzzyCannonReplace */ "youtube.fuzzy.replayweb.page/$1$2",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {{"videoId"}},
  },
  { // googlevideo "videoplayback", tried with (id, itag) and then id alone.
    // NOTE(review): the '.' in "googlevideo.com" is not escaped, so it
    // matches any character - confirm whether "\\." was intended.
    /* match              */ ".*googlevideo.com/(videoplayback)",
    /* fuzzyCannonReplace */ "youtube.fuzzy.replayweb.page/$1",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {
                               {"id", "itag"},
                               {"id"}
                             },
  },
};
|
|
|
|
|
|
|
|
|
|
/**
 * Looks up the fuzzy rule applying to `path`.
 *
 * The rules of FUZZY_RULES are tested in order and a copy of the first one
 * whose `match` pattern matches is returned.
 *
 * @param path URL (path and optional query string) to test. Taken by value
 *             because a '?' is appended when it contains none (some patterns
 *             require a literal '?').
 * @return a copy of the first matching rule.
 * @throws std::runtime_error ("No Rule") when no rule matches.
 */
FuzzyRule get_rule(std::string path) {
  if (path.find('?') == std::string::npos) {
    path.push_back('?');
  }

  for (const auto& fuzzy_rule : FUZZY_RULES) {
    if (matchRegex(path, fuzzy_rule.match)) {
      return fuzzy_rule;
    }
  }

  throw std::runtime_error("No Rule");
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::string> gen_fuzzy_urls(const RequestContext& request, const std::string& path) {
|
|
|
|
|
std::vector<std::string> fuzzy_urls;
|
|
|
|
|
// First of all, add the query_string
|
|
|
|
|
auto url_queried = path + "?" + request.get_query();
|
|
|
|
|
fuzzy_urls.push_back(url_queried);
|
|
|
|
|
try {
|
|
|
|
|
auto rule = get_rule(url_queried);
|
|
|
|
|
|
|
|
|
|
std::cout << "Matching rule : " << rule.match << std::endl;
|
|
|
|
|
|
|
|
|
|
std::string sep = rule.split.size() != 0 ? rule.split : "?";
|
|
|
|
|
auto split_idx = rule.splitlast ? url_queried.rfind(sep) : url_queried.find(sep);
|
|
|
|
|
auto prefix = split_idx == std::string::npos ? url_queried: url_queried.substr(0, split_idx+sep.size());
|
|
|
|
|
|
|
|
|
|
std::cout << "Prefix is : " << prefix << std::endl;
|
|
|
|
|
|
|
|
|
|
std::string fuzzy_cannon_url;
|
|
|
|
|
if (rule.fuzzyCannonReplace.size() != 0) {
|
|
|
|
|
std::cout << "replace " << rule.match << " with " << rule.fuzzyCannonReplace << std::endl;
|
|
|
|
|
fuzzy_cannon_url = replaceRegex(url_queried, rule.fuzzyCannonReplace, rule.match);
|
|
|
|
|
} else {
|
|
|
|
|
fuzzy_cannon_url = prefix;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// remove querystring from fuzzy_cannon_url.
|
|
|
|
|
split_idx = fuzzy_cannon_url.find("?");
|
|
|
|
|
fuzzy_cannon_url = fuzzy_cannon_url.substr(0, split_idx);
|
|
|
|
|
|
|
|
|
|
std::cout << "fuzzy_cannon_url is : " << fuzzy_cannon_url << std::endl;
|
|
|
|
|
|
|
|
|
|
fuzzy_urls.push_back(fuzzy_cannon_url);
|
|
|
|
|
|
|
|
|
|
for (auto args: rule.args) {
|
|
|
|
|
std::stringstream query;
|
|
|
|
|
std::string sep="?";
|
|
|
|
|
for (auto arg: args) {
|
|
|
|
|
query << sep << arg << "=" << request.get_optional_param(arg, std::string());
|
|
|
|
|
sep = "&";
|
|
|
|
|
}
|
|
|
|
|
fuzzy_urls.push_back(fuzzy_cannon_url+query.str());
|
|
|
|
|
}
|
|
|
|
|
} catch(const std::runtime_error&) {
|
|
|
|
|
auto split_idx = url_queried.find("?");
|
|
|
|
|
fuzzy_urls.push_back(split_idx == std::string::npos ? url_queried : url_queried.substr(0, split_idx+1));
|
|
|
|
|
}
|
|
|
|
|
return fuzzy_urls;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the response for `path`, falling back to fuzzy matching.
 *
 * The exact path is tried first. If no entry is found, every candidate URL
 * produced by gen_fuzzy_urls() is tried in order and the response for the
 * first one found is returned.
 *
 * Diagnostic output is only printed in verbose mode.
 *
 * @throws zim::EntryNotFound when neither the exact path nor any fuzzy
 *         candidate can be found. Other exceptions are not caught here.
 */
std::unique_ptr<Response> InternalServer::build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
{
  const bool verbose = m_verbose.load();

  if (verbose) {
    std::cout << "Try url : " << path << '\n';
  }
  try {
    return build_response_for_path(request, archive, bookName, path);
  } catch (const zim::EntryNotFound&) {
    // Exact path not found: we have to do fuzzy matching below.
  }

  for (const auto& fuzzy_url : gen_fuzzy_urls(request, path)) {
    if (verbose) {
      std::cout << "Try fuzzy url : " << fuzzy_url << '\n';
    }
    try {
      return build_response_for_path(request, archive, bookName, fuzzy_url);
    } catch (const zim::EntryNotFound&) {
      // Deliberately ignored: this candidate is absent, try the next one.
    }
  }

  // No fuzzy path matches.
  if (verbose) {
    std::cout << "Not found, sorry.." << '\n';
  }
  throw zim::EntryNotFound("No fuzzy rule matches.");
}
|
|
|
|
|
|
|
|
|
|
std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& request)
|
|
|
|
|
{
|
|
|
|
|
const std::string url = request.get_url();
|
|
|
|
@@ -1143,32 +1301,7 @@ std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& r
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
auto entry = getEntryFromPath(*archive, urlStr);
|
|
|
|
|
if (entry.isRedirect() || urlStr != entry.getPath()) {
|
|
|
|
|
// In the condition above, the second case (an entry with a different
|
|
|
|
|
// URL was returned) can occur in the following situations:
|
|
|
|
|
// 1. urlStr is empty or equal to "/" and the ZIM file doesn't contain
|
|
|
|
|
// such an entry, in which case the main entry is returned instead.
|
|
|
|
|
// 2. The ZIM file uses old namespace scheme, and the resource at urlStr
|
|
|
|
|
// is not present but can be found under one of the 'A', 'I', 'J' or
|
|
|
|
|
// '-' namespaces, in which case that resource is returned instead.
|
|
|
|
|
return build_redirect(bookName, getFinalItem(*archive, entry));
|
|
|
|
|
}
|
|
|
|
|
auto response = ItemResponse::build(*this, request, entry.getItem());
|
|
|
|
|
response->set_etag_body(archiveUuid);
|
|
|
|
|
|
|
|
|
|
if ( !startsWith(entry.getItem().getMimetype(), "application/pdf") ) {
|
|
|
|
|
// NOTE: Content security policy is not applied to PDF content so that
|
|
|
|
|
// NOTE: it can be displayed in the viewer in Chromium-based browsers.
|
|
|
|
|
response->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
|
|
|
|
|
response->add_header("Referrer-Policy", "no-referrer");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (m_verbose.load()) {
|
|
|
|
|
printf("Found %s\n", entry.getPath().c_str());
|
|
|
|
|
printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto response = build_response_for_fuzzypath(request, *archive, bookName, urlStr);
|
|
|
|
|
return response;
|
|
|
|
|
} catch(zim::EntryNotFound& e) {
|
|
|
|
|
if (m_verbose.load())
|
|
|
|
|