wayback-classic/cgi-bin/sitemap.cgi

#!/usr/bin/env ruby
# Wayback Machine Site Map CGI Script
# -----------------------------------
# Uses the Wayback Machine's CDX API to retrieve a
# list of captured URLs under the given wildcard URL.
#
# Wayback Classic
# Copyright (C) 2024 Jessica Stokes
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
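#
# Example request (illustrative; "example.com" is a placeholder wildcard URL):
#   sitemap.cgi?q=example.com/*&page=2&filter=text/html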
require 'cgi'
require_relative 'lib/cdx'
require_relative 'lib/encoding'
require_relative 'lib/error_reporting'
require_relative 'lib/permit_world_writable_temp' if ENV["FORCE_WORLD_WRITABLE_TEMP"] == "true"
require_relative 'lib/utils'
require_relative 'lib/web_client'

module WaybackClassic
  module Sitemap
    PAGE_SIZE = 50

    def self.run
      legacy_encoding = LegacyClientEncoding.detect

      CGI.new.tap do |cgi|
        ErrorReporting.catch_and_respond(cgi) do
          if cgi.params.keys - ["q", "page", "filter"] != [] || cgi.params["q"]&.first.nil? || cgi.params["q"]&.first&.empty?
            raise ErrorReporting::BadRequestError.new("A `q` parameter must be supplied, and `page` and `filter` parameters are optional")
          end

          query = cgi.params["q"]&.first
          page = cgi.params["page"]&.first&.to_i || 1
          filter = cgi.params["filter"]&.first
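
          # The CDX query below collapses captures by urlkey (roughly one row per
          # distinct URL) and, via the `!statuscode:[45]..` filter, excludes
          # captures whose HTTP status was a 4xx or 5xx error.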
          response = begin
            WebClient.open uri("https://web.archive.org/cdx/search/cdx",
                               url: query,
                               output: "json",
                               collapse: "urlkey",
                               fl: "original,mimetype,timestamp,endtimestamp,groupcount,uniqcount",
                               filter: "!statuscode:[45]..")
          rescue OpenURI::HTTPError
            raise ErrorReporting::ServerError.new("Couldn't retrieve captured URLs for this wildcard URL")
          end
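
          # Note: with output=json the CDX API returns an array of rows whose first
          # row lists the requested `fl` field names; CDX.objectify appears to pair
          # those names with each row to produce hashes like cdx_result["mimetype"].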
          cdx_results = CDX.objectify response.read

          # Hold onto the count of results for the entire thing
          total_count = cdx_results.length

          # Apply MIME Type and URL filters
          if filter
            cdx_results = cdx_results.select do |cdx_result|
              cdx_result["mimetype"].downcase.include?(filter.downcase) || cdx_result["original"].downcase.include?(filter.downcase)
            end
          end

          # Grab the count of matching results, and calculate the page information
          scoped_count = cdx_results.length
          page_count = (scoped_count.to_f / PAGE_SIZE).ceil
          page_count = 1 if page_count == 0

          # Grab this page of results, and go!
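          # (For example, 120 matching rows with PAGE_SIZE = 50 give page_count = 3,
          # and page 2 covers indices 50..99 of the filtered results.)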
          cdx_results = cdx_results.slice((page - 1) * PAGE_SIZE, PAGE_SIZE)

          cgi.out "type" => "text/html",
                  "charset" => "UTF-8",
                  "status" => "OK" do
            render "sitemap.html",
                   query: query,
                   filter: filter,
                   total_count: total_count,
                   scoped_count: scoped_count,
                   page: page || 1,
                   page_count: page_count,
                   cdx_results: cdx_results,
                   legacy_encoding: legacy_encoding
          end
        end
      end
    end
  end
end

WaybackClassic::Sitemap.run if $PROGRAM_NAME == __FILE__