#!/usr/bin/env ruby # Wayback Machine Site Map CGI Script # ----------------------------------- # Uses the Wayback Machine's CDX API to retrieve a # list of captured URLs under the given wildcard URL. # # Wayback Classic # Copyright (C) 2024 Jessica Stokes # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . require 'cgi' require_relative 'lib/cdx' require_relative 'lib/encoding' require_relative 'lib/error_reporting' require_relative 'lib/permit_world_writable_temp' if ENV["FORCE_WORLD_WRITABLE_TEMP"] == "true" require_relative 'lib/utils' require_relative 'lib/web_client' module WaybackClassic module Sitemap PAGE_SIZE = 50 def self.run legacy_encoding = LegacyClientEncoding.detect CGI.new.tap do |cgi| ErrorReporting.catch_and_respond(cgi) do if cgi.params.keys - ["q", "page", "filter"] != [] || cgi.params["q"]&.first.nil? || cgi.params["q"]&.first&.empty? raise ErrorReporting::BadRequestError.new("A `q` parameter must be supplied, and `page` and `filter` parameters are optional") end query = cgi.params["q"]&.first page = cgi.params["page"]&.first&.to_i || 1 filter = cgi.params["filter"]&.first response = begin WebClient.open uri("https://web.archive.org/cdx/search/cdx", url: query, output: "json", collapse: "urlkey", fl: "original,mimetype,timestamp,endtimestamp,groupcount,uniqcount", filter: "!statuscode:[45]..") rescue OpenURI::HTTPError raise ErrorReporting::ServerError.new("Couldn't retrieve captured URLs for this wildcard URL") end cdx_results = CDX.objectify response.read # Hold onto the count of results for the entire thing total_count = cdx_results.length # Apply MIME Type and URL filters if filter cdx_results = cdx_results.select do |cdx_result| cdx_result["mimetype"].downcase.include?(filter.downcase) || cdx_result["original"].downcase.include?(filter.downcase) end end # Grab the count of matching results, and calculate the page information scoped_count = cdx_results.length page_count = (scoped_count.to_f / PAGE_SIZE).ceil page_count = 1 if page_count == 0 # Grab this page of results, and go! cdx_results = cdx_results.slice((page - 1) * PAGE_SIZE, PAGE_SIZE) cgi.out "type" => "text/html", "charset" => "UTF-8", "status" => "OK" do render "sitemap.html", query: query, filter: filter, total_count: total_count, scoped_count: scoped_count, page: page || 1, page_count: page_count, cdx_results: cdx_results, legacy_encoding: legacy_encoding end end end end end end WaybackClassic::Sitemap.run if $PROGRAM_NAME == __FILE__