From d8e138ae2d6f082c3252c693af8b7d9e93a2dbe8 Mon Sep 17 00:00:00 2001 From: Daniel O'Connor Date: Mon, 29 Sep 2025 00:27:57 +0930 Subject: [PATCH] Create robots.txt based on wikipedia --- public/robots.txt | 142 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 public/robots.txt diff --git a/public/robots.txt b/public/robots.txt new file mode 100644 index 000000000..da250e302 --- /dev/null +++ b/public/robots.txt @@ -0,0 +1,142 @@ +# robots.txt based on the one for http://www.wikipedia.org/ and friends + +# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN +# and ignoring 429 ratelimit responses, claims to respect robots: +# http://mj12bot.com/ +User-agent: MJ12bot +Disallow: / + +# advertising-related bots: +User-agent: Mediapartners-Google* +Disallow: / + +# Wikipedia work bots: +User-agent: IsraBot +Disallow: + +User-agent: Orthogaffe +Disallow: + +# Crawlers that are kind enough to obey, but which we'd rather not have +# unless they're feeding search engines. +User-agent: UbiCrawler +Disallow: / + +User-agent: DOC +Disallow: / + +User-agent: Zao +Disallow: / + +# Some bots are known to be trouble, particularly those designed to copy +# entire sites. Please obey robots.txt. 
+User-agent: sitecheck.internetseer.com +Disallow: / + +User-agent: Zealbot +Disallow: / + +User-agent: MSIECrawler +Disallow: / + +User-agent: SiteSnagger +Disallow: / + +User-agent: WebStripper +Disallow: / + +User-agent: WebCopier +Disallow: / + +User-agent: Fetch +Disallow: / + +User-agent: Offline Explorer +Disallow: / + +User-agent: Teleport +Disallow: / + +User-agent: TeleportPro +Disallow: / + +User-agent: WebZIP +Disallow: / + +User-agent: linko +Disallow: / + +User-agent: HTTrack +Disallow: / + +User-agent: Microsoft.URL.Control +Disallow: / + +User-agent: Xenu +Disallow: / + +User-agent: larbin +Disallow: / + +User-agent: libwww +Disallow: / + +User-agent: ZyBORG +Disallow: / + +User-agent: Download Ninja +Disallow: / + +# Misbehaving: requests much too fast: +User-agent: fast +Disallow: / + +# +# Sorry, wget in its recursive mode is a frequent problem. +# Please read the man page and use it properly; there is a +# --wait option you can use to set the delay between hits, +# for instance. +# +User-agent: wget +Disallow: / + +# +# The 'grub' distributed client has been *very* poorly behaved. +# +User-agent: grub-client +Disallow: / + +# +# Doesn't follow robots.txt anyway, but... +# +User-agent: k2spider +Disallow: / + +# +# Hits many times per second, not acceptable +# http://www.nameprotect.com/botinfo.html +User-agent: NPBot +Disallow: / + +# A capture bot, downloads gazillions of pages with no public benefit +# http://www.webreaper.net/ +User-agent: WebReaper +Disallow: / + +# Per their statement, semrushbot respects crawl-delay directives +# We want them to overall stay within reasonable request rates to +# the backend (20 rps); keeping in mind that the crawl-delay will +# be applied by site and not globally by the bot, 5 seconds seem +# like a reasonable approximation +User-agent: SemrushBot +Crawl-delay: 5 + +# +# Friendly, low-speed bots are welcome viewing pages, but not +# dynamically-generated pages please. 
+# +# The upstream Wikipedia file also makes an exception for its REST API +# documentation at /api/rest_v1/?doc; no matching Allow rule is set here. +# +User-agent: * +Disallow: /api/