# mirror of https://github.com/Growstuff/growstuff.git
# synced 2026-03-25 18:22:45 -04:00

# robots.txt based on the one for http://www.wikipedia.org/ and friends

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /

#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
#
User-agent: wget
Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

# Per their statement, semrushbot respects crawl-delay directives
# We want them to overall stay within reasonable request rates to
# the backend (20 rps); keeping in mind that the crawl-delay will
# be applied by site and not globally by the bot, 5 seconds seems
# like a reasonable approximation
User-agent: SemrushBot
Crawl-delay: 5

#
# Friendly, low-speed bots are welcome viewing pages, but not
# dynamically-generated pages please.
#
# Another exception is for REST API documentation, located at
# /api/rest_v1/?doc.
#
User-agent: *
Disallow: /api/

Sitemap: https://growstuff-prod.s3.us-west-2.amazonaws.com/sitemap.xml.gz