mirror of
https://github.com/Growstuff/growstuff.git
synced 2026-05-18 05:29:31 -04:00
Merge pull request #4273 from Growstuff/CloCkWeRX-patch-2
Create robots.txt based on wikipedia
This commit is contained in:
142
public/robots.txt
Normal file
142
public/robots.txt
Normal file
@@ -0,0 +1,142 @@
|
||||
# robots.txt based on the one for http://www.wikipedia.org/ and friends
|
||||
|
||||
# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
|
||||
# and ignoring 429 ratelimit responses, claims to respect robots:
|
||||
# http://mj12bot.com/
|
||||
User-agent: MJ12bot
|
||||
Disallow: /
|
||||
|
||||
# advertising-related bots:
|
||||
User-agent: Mediapartners-Google*
|
||||
Disallow: /
|
||||
|
||||
# Wikipedia work bots:
|
||||
User-agent: IsraBot
|
||||
Disallow:
|
||||
|
||||
User-agent: Orthogaffe
|
||||
Disallow:
|
||||
|
||||
# Crawlers that are kind enough to obey, but which we'd rather not have
|
||||
# unless they're feeding search engines.
|
||||
User-agent: UbiCrawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: DOC
|
||||
Disallow: /
|
||||
|
||||
User-agent: Zao
|
||||
Disallow: /
|
||||
|
||||
# Some bots are known to be trouble, particularly those designed to copy
|
||||
# entire sites. Please obey robots.txt.
|
||||
User-agent: sitecheck.internetseer.com
|
||||
Disallow: /
|
||||
|
||||
User-agent: Zealbot
|
||||
Disallow: /
|
||||
|
||||
User-agent: MSIECrawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: SiteSnagger
|
||||
Disallow: /
|
||||
|
||||
User-agent: WebStripper
|
||||
Disallow: /
|
||||
|
||||
User-agent: WebCopier
|
||||
Disallow: /
|
||||
|
||||
User-agent: Fetch
|
||||
Disallow: /
|
||||
|
||||
User-agent: Offline Explorer
|
||||
Disallow: /
|
||||
|
||||
User-agent: Teleport
|
||||
Disallow: /
|
||||
|
||||
User-agent: TeleportPro
|
||||
Disallow: /
|
||||
|
||||
User-agent: WebZIP
|
||||
Disallow: /
|
||||
|
||||
User-agent: linko
|
||||
Disallow: /
|
||||
|
||||
User-agent: HTTrack
|
||||
Disallow: /
|
||||
|
||||
User-agent: Microsoft.URL.Control
|
||||
Disallow: /
|
||||
|
||||
User-agent: Xenu
|
||||
Disallow: /
|
||||
|
||||
User-agent: larbin
|
||||
Disallow: /
|
||||
|
||||
User-agent: libwww
|
||||
Disallow: /
|
||||
|
||||
User-agent: ZyBORG
|
||||
Disallow: /
|
||||
|
||||
User-agent: Download Ninja
|
||||
Disallow: /
|
||||
|
||||
# Misbehaving: requests much too fast:
|
||||
User-agent: fast
|
||||
Disallow: /
|
||||
|
||||
#
|
||||
# Sorry, wget in its recursive mode is a frequent problem.
|
||||
# Please read the man page and use it properly; there is a
|
||||
# --wait option you can use to set the delay between hits,
|
||||
# for instance.
|
||||
#
|
||||
User-agent: wget
|
||||
Disallow: /
|
||||
|
||||
#
|
||||
# The 'grub' distributed client has been *very* poorly behaved.
|
||||
#
|
||||
User-agent: grub-client
|
||||
Disallow: /
|
||||
|
||||
#
|
||||
# Doesn't follow robots.txt anyway, but...
|
||||
#
|
||||
User-agent: k2spider
|
||||
Disallow: /
|
||||
|
||||
#
|
||||
# Hits many times per second, not acceptable
|
||||
# http://www.nameprotect.com/botinfo.html
|
||||
User-agent: NPBot
|
||||
Disallow: /
|
||||
|
||||
# A capture bot, downloads gazillions of pages with no public benefit
|
||||
# http://www.webreaper.net/
|
||||
User-agent: WebReaper
|
||||
Disallow: /
|
||||
|
||||
# Per their statement, semrushbot respects crawl-delay directives
|
||||
# We want them to overall stay within reasonable request rates to
|
||||
# the backend (20 rps); keeping in mind that the crawl-delay will
|
||||
# be applied by site and not globally by the bot, 5 seconds seems
|
||||
# like a reasonable approximation
|
||||
User-agent: SemrushBot
|
||||
Crawl-delay: 5
|
||||
|
||||
#
|
||||
# Friendly, low-speed bots are welcome viewing pages, but not
|
||||
# dynamically-generated pages please.
|
||||
#
|
||||
# NOTE: Wikipedia's robots.txt makes an exception for its REST API docs at
|
||||
# /api/rest_v1/?doc; this file defines no matching Allow rule for it.
|
||||
#
|
||||
User-agent: *
|
||||
Disallow: /api/
|
||||
Reference in New Issue
Block a user