Adapt fuzzy_rules

Entries are now stored in the ZIM file without the `http://` prefix, so there is no leading `//` to search for at the beginning of the URL.
Fuzzy match url in the server.
2026-02-19 07:34:10 -05:00 · 2023-05-27 10:14:28 +02:00 · 2023-05-26 15:57:36 +03:00 · 2023-05-25 14:00:15 +03:00 · 2023-05-25 13:21:19 +03:00 · 2023-05-25 13:20:58 +03:00
3 changed files with 168 additions and 27 deletions
--- a/src/server/internalServer.cpp
+++ b/src/server/internalServer.cpp
@@ -360,7 +360,7 @@ zim::Query SearchInfo::getZimQuery(bool verbose) const {
  if (verbose) {
    std::cout << "Performing query '" << pattern<< "'";
  }
-  query.setQuery(pattern);
+  query.setQuery(removeAccents(pattern));
  if (geoQuery) {
    if (verbose) {
      std::cout << " with geo query '" << geoQuery.distance << "&(" << geoQuery.latitude << ";" << geoQuery.longitude << ")'";
@@ -1106,6 +1106,164 @@ InternalServer::build_redirect(const std::string& bookName, const zim::Item& ite
  return Response::build_redirect(*this, url);
 }

+std::unique_ptr<Response> InternalServer::build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
+{ // Looks `path` up in `archive` and returns either a redirect response or the item response. There is no try/catch in this function, so an exception raised by the lookup reaches the caller.
+  auto entry = getEntryFromPath(archive, path);
+  if (entry.isRedirect() || path != entry.getPath()) {
+    // In the condition above, the second case (an entry with a different
+    // URL was returned) can occur in the following situations:
+    // 1. path is empty or equal to "/" and the ZIM file doesn't contain
+    //    such an entry, in which case the main entry is returned instead.
+    // 2. The ZIM file uses old namespace scheme, and the resource at path
+    //    is not present but can be found under one of the 'A', 'I', 'J' or
+    //    '-' namespaces, in which case that resource is returned instead.
+    return build_redirect(bookName, getFinalItem(archive, entry));
+  }
+
+  if (m_verbose.load()) { // trace output only when the verbose flag is set
+    printf("Found %s\n", entry.getPath().c_str());
+    printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
+  }
+
+  auto response = ItemResponse::build(*this, request, entry.getItem());
+  response->set_etag_body(std::string(archive.getUuid())); // the ETag body is set from the archive UUID converted to a string
+
+  if ( !startsWith(entry.getItem().getMimetype(), "application/pdf") ) {
+    // NOTE: Content security policy is not applied to PDF content so that
+    // NOTE: it can be displayed in the viewer in Chromium-based browsers.
+    response->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
+    response->add_header("Referrer-Policy", "no-referrer");
+  }
+
+  return response;
+}
+
+struct FuzzyRule { // One URL-rewriting rule consumed by get_rule() and gen_fuzzy_urls().
+  std::string match; // pattern handed to matchRegex() to select the rule and to replaceRegex() to rewrite the URL
+  std::string fuzzyCannonReplace; // replacement handed to replaceRegex(); when empty, the split prefix is used as the canonical URL instead
+  std::string split; // separator used to cut the prefix out of the URL; when empty, "?" is used
+  bool splitlast; // true: cut at the last occurrence of the separator (rfind); false: at the first (find)
+  std::vector<std::vector<std::string>> args; // groups of query-parameter names; each group yields one extra candidate URL
+};
+
+typedef std::vector<FuzzyRule> FuzzyRules; // ordered: get_rule() returns the first matching element
+
+
+const FuzzyRules FUZZY_RULES{ // Tried in order by get_rule(); the first rule whose `match` pattern matches is used.
+  {
+    /*match:*/ "^(https?://(?:www\\.)?)(youtube\\.com/@[^?]+)[?].*", // NOTE(review): unlike the rules below, this pattern still requires a leading http(s) scheme - confirm the paths being matched carry one
+    /*fuzzyCannonReplace:*/ "$1$2",
+    /*split:*/ "",
+    /*splitlast:*/ false,
+    /*args:*/ {},
+  },
+  {
+      /*match:*/ "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info)",
+      /*fuzzyCannonReplace:*/"youtube.fuzzy.replayweb.page/$1",
+      /*split:*/ "",
+      /*splitlast:*/ false,
+      /*args:*/ {{"video_id"}},
+    },
+  {
+    /*match:*/ "(?:www\\.)?youtube(?:-nocookie)?\\.com/(youtubei/v1/[^?]+\\?).*(videoId[^&]+).*",
+    /*fuzzyCannonReplace:*/ "youtube.fuzzy.replayweb.page/$1$2",
+    /*split:*/ "",
+    /*splitlast:*/ false,
+    /*args:*/ {{"videoId"}},
+    },
+     {
+    /*match:*/  ".*googlevideo.com/(videoplayback)", // NOTE(review): the '.' before com is not escaped here (the other rules escape it), so it matches any character
+    /*fuzzyCannonReplace:*/"youtube.fuzzy.replayweb.page/$1",
+    /*split:*/ "",
+    /*splitlast:*/ false,
+    /*args:*/ {
+      {"id", "itag"},
+      {"id"}
+    },
+  },
+};
+
+FuzzyRule get_rule(std::string path) { // Returns a copy of the first FUZZY_RULES entry whose `match` pattern matches `path`; throws std::runtime_error when none does.
+  if (path.find("?") == std::string::npos) { // `path` is taken by value, so appending here does not touch the caller's string
+    path.append("?");
+  }
+
+  for ( const auto& fuzzy_rule : FUZZY_RULES ) {
+    std::cout << "try to match " << fuzzy_rule.match << std::endl; // NOTE(review): printed unconditionally, not gated on any verbose flag
+    if (matchRegex(path, fuzzy_rule.match)) {
+      return fuzzy_rule;
+    }
+  }
+  throw std::runtime_error("No Rule"); // gen_fuzzy_urls() catches std::runtime_error and falls back to a rule-less candidate
+}
+
+std::vector<std::string> gen_fuzzy_urls(const RequestContext& request, const std::string& path) { // Builds the ordered list of candidate URLs to try for `path`.
+  std::vector<std::string> fuzzy_urls;
+  // First candidate: the path followed by '?' and the request's query string.
+  auto url_queried = path + "?" + request.get_query();
+  fuzzy_urls.push_back(url_queried);
+  try {
+    auto rule = get_rule(url_queried);
+
+    std::cout << "Matching rule : " << rule.match << std::endl;
+
+    std::string sep = rule.split.size() != 0 ? rule.split : "?"; // an empty rule.split means the URL is cut at '?'
+    auto split_idx = rule.splitlast ? url_queried.rfind(sep) : url_queried.find(sep); // last occurrence when rule.splitlast is set, otherwise the first
+    auto prefix = split_idx == std::string::npos ? url_queried: url_queried.substr(0, split_idx+sep.size()); // the prefix keeps the separator itself
+
+    std::cout << "Prefix is : " << prefix << std::endl;
+
+    std::string fuzzy_cannon_url;
+    if (rule.fuzzyCannonReplace.size() != 0) {
+      std::cout << "replace " << rule.match << " with " << rule.fuzzyCannonReplace << std::endl;
+      fuzzy_cannon_url = replaceRegex(url_queried, rule.fuzzyCannonReplace, rule.match); // NOTE(review): replaceRegex is defined elsewhere; the (subject, replacement, pattern) argument order is assumed from this call - confirm
+    } else {
+      fuzzy_cannon_url = prefix;
+    }
+
+    // Drop everything from the first '?' on; when there is no '?', find() returns npos and substr() keeps the whole string.
+    split_idx = fuzzy_cannon_url.find("?");
+    fuzzy_cannon_url = fuzzy_cannon_url.substr(0, split_idx);
+
+    std::cout << "fuzzy_cannon_url is : " << fuzzy_cannon_url << std::endl;
+
+    fuzzy_urls.push_back(fuzzy_cannon_url);
+
+    for (auto args: rule.args) { // one extra candidate per group of parameter names
+      std::stringstream query;
+      std::string sep="?"; // shadows the outer `sep`; the first parameter is prefixed with '?', later ones with '&'
+      for (auto arg: args) {
+        query << sep << arg << "=" << request.get_optional_param(arg, std::string()); // presumably the empty string is the value used when the parameter is absent - confirm against RequestContext
+        sep = "&";
+      }
+      fuzzy_urls.push_back(fuzzy_cannon_url+query.str());
+    }
+  } catch(const std::runtime_error&) { // get_rule() throws std::runtime_error when no rule matches
+    auto split_idx = url_queried.find("?");
+    fuzzy_urls.push_back(split_idx == std::string::npos ? url_queried : url_queried.substr(0, split_idx+1)); // url_queried always contains '?' (added above), so this keeps the URL up to and including the first '?'
+  }
+  return fuzzy_urls;
+}
+
+std::unique_ptr<Response> InternalServer::build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
+{ // Tries the exact `path` first, then each candidate from gen_fuzzy_urls() in order; throws a new zim::EntryNotFound when none resolves.
+  try {
+    std::cout << "Try url : " << path << std::endl;
+    return build_response_for_path(request, archive, bookName, path);
+  } catch(zim::EntryNotFound& e) {
+    // We have to do fuzzy matching.
+    for (const auto& fuzzy_url: gen_fuzzy_urls(request, path)) {
+      std::cout << "Try fuzzy url : " << fuzzy_url << std::endl;
+      try {
+        return build_response_for_path(request, archive, bookName, fuzzy_url);
+      } catch(zim::EntryNotFound& e) {} // this candidate was not found; move on to the next one
+    }
+    // No fuzzy candidate resolved to an entry.
+    std::cout << "Not found, sorry.." << std::endl;
+    throw zim::EntryNotFound("No fuzzy rule matches.");
+  }
+}
+
 std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& request)
 {
  const std::string url = request.get_url();
@@ -1143,32 +1301,7 @@ std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& r
  }

  try {
-    auto entry = getEntryFromPath(*archive, urlStr);
-    if (entry.isRedirect() || urlStr != entry.getPath()) {
-      // In the condition above, the second case (an entry with a different
-      // URL was returned) can occur in the following situations:
-      // 1. urlStr is empty or equal to "/" and the ZIM file doesn't contain
-      //    such an entry, in which case the main entry is returned instead.
-      // 2. The ZIM file uses old namespace scheme, and the resource at urlStr
-      //    is not present but can be found under one of the 'A', 'I', 'J' or
-      //    '-' namespaces, in which case that resource is returned instead.
-      return build_redirect(bookName, getFinalItem(*archive, entry));
-    }
-    auto response = ItemResponse::build(*this, request, entry.getItem());
-    response->set_etag_body(archiveUuid);
-
-    if ( !startsWith(entry.getItem().getMimetype(), "application/pdf") ) {
-      // NOTE: Content security policy is not applied to PDF content so that
-      // NOTE: it can be displayed in the viewer in Chromium-based browsers.
-      response->add_header("Content-Security-Policy", CONTENT_CSP_HEADER);
-      response->add_header("Referrer-Policy", "no-referrer");
-    }
-
-    if (m_verbose.load()) {
-      printf("Found %s\n", entry.getPath().c_str());
-      printf("mimeType: %s\n", entry.getItem(true).getMimetype().c_str());
-    }
-
+    auto response = build_response_for_fuzzypath(request, *archive, bookName, urlStr);
    return response;
  } catch(zim::EntryNotFound& e) {
    if (m_verbose.load())
--- a/src/server/internalServer.h
+++ b/src/server/internalServer.h
@@ -123,6 +123,8 @@ class InternalServer {
    std::unique_ptr<Response> handle_request(const RequestContext& request);
    std::unique_ptr<Response> build_redirect(const std::string& bookName, const zim::Item& item) const;
    std::unique_ptr<Response> build_homepage(const RequestContext& request);
+    std::unique_ptr<Response> build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
+    std::unique_ptr<Response> build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
    std::unique_ptr<Response> handle_viewer_settings(const RequestContext& request);
    std::unique_ptr<Response> handle_skin(const RequestContext& request);
    std::unique_ptr<Response> handle_catalog(const RequestContext& request);
--- a/src/tools/archiveTools.cpp
+++ b/src/tools/archiveTools.cpp
@@ -122,6 +122,12 @@ zim::Entry getEntryFromPath(const zim::Archive& archive, const std::string& path
    if (path.empty() || path == "/") {
      return archive.getMainEntry();
    }
+    std::cout << "Search for H/"<<path << std::endl;
+    auto entry = archive.getEntryByPath("H/"+path);
+    while (entry.isRedirect()) {
+      entry = entry.getRedirectEntry();
+    }
+    return entry;
  }
  throw zim::EntryNotFound("Cannot find entry for non empty path");
 }
Author	SHA1	Message	Date
Matthieu Gautier	be6bdb66ea	Adapt fuzzy_rules Now we store entry in zim without `http://`. So there is no `//` to search for at beginning.	2023-05-27 10:14:28 +02:00
Matthieu Gautier	e1acac2bc7	Fuzzy match url in the server.	2023-05-26 15:57:36 +03:00
Matthieu Gautier	8b8a038f1f	fixup! Move building a response from a path in a helper method.	2023-05-25 14:00:15 +03:00
Matthieu Gautier	6818c38ccb	Remove accents when we do a search	2023-05-25 13:21:19 +03:00
Matthieu Gautier	d187409336	Move building a response from a path in a helper method.	2023-05-25 13:20:58 +03:00