# robots.txt for blockchain-development-company.skin
# Last updated: 2026-04-08
#
# Strategy (must match .htaccess access policy):
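#   - Allow only Google, Bing, Yandex, Mail.ru as traditional search engines
#   - Disallow AI training crawlers; allow AI citation / live-answer crawlers
#     (OAI-SearchBot, ChatGPT-User, PerplexityBot, Perplexity-User,
#     Google-Extended; see the AI CRAWLERS section)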
#   - Disallow every SEO backlink / audit crawler
#   - Disallow every download tool / bulk scraper
#   - Disallow common dev/build paths in the default rule
#
# .htaccess enforces the same policy at the HTTP layer (scrapers get 403,
# allowed search engines get 301 to canonical target, real browsers get
# served the page content). robots.txt is the polite-crawler declaration of
# the same rules for crawlers that respect it.

# =============================================================================
# ALLOWED SEARCH ENGINES (the only search engines that should crawl; the AI
# citation crawlers that are also allowed are listed in the AI CRAWLERS section)
# =============================================================================

# One record per whitelisted crawler, each granting full access.
#
# NOTE: a crawler obeys only the most specific group that matches its name, so
# every bot listed here follows its own "Allow: /" record and never reads the
# "User-agent: *" group at the end of this file. The dev/build-path Disallow
# rules and the Crawl-delay declared there therefore do not apply to these
# search engines.

# Google
User-agent: Googlebot
Allow: /

User-agent: Googlebot-Image
Allow: /

User-agent: Googlebot-News
Allow: /

User-agent: Googlebot-Video
Allow: /

User-agent: Googlebot-Mobile
Allow: /

User-agent: APIs-Google
Allow: /

User-agent: Google-InspectionTool
Allow: /

User-agent: Storebot-Google
Allow: /

# Bing / Microsoft
User-agent: bingbot
Allow: /

User-agent: BingPreview
Allow: /

User-agent: msnbot
Allow: /

User-agent: adidxbot
Allow: /

# Yandex
User-agent: YandexBot
Allow: /

User-agent: YandexImages
Allow: /

User-agent: YandexNews
Allow: /

User-agent: YandexMobileBot
Allow: /

User-agent: YandexMetrika
Allow: /

# Mail.ru
User-agent: Mail.RU_Bot
Allow: /

# =============================================================================
# AI CRAWLERS. Policy: CITATION-ONLY.
# Live-answer / search-index crawlers are ALLOWED so the page can be cited in
# ChatGPT, Perplexity and Google AI surfaces. Pure-training crawlers stay
# disallowed (corpus ingestion only, no citation value).
# =============================================================================
# OpenAI: block training (GPTBot), allow citation (search index + live fetch)
User-agent: GPTBot
Disallow: /

User-agent: OAI-SearchBot
Allow: /

User-agent: ChatGPT-User
Allow: /

# Anthropic: training/ingestion crawlers, disallow
User-agent: ClaudeBot
Disallow: /

User-agent: claude-web
Disallow: /

User-agent: anthropic-ai
Disallow: /

# Google-Extended: allow. This is a control token honored by Google's existing
# crawlers rather than a separate bot; per Google's crawler documentation it
# governs use of content for Gemini model training and grounding and has no
# effect on Google Search features.
# NOTE(review): allowing it also permits training use, which is at odds with
# the "pure-training crawlers stay disallowed" policy stated in this section's
# header. Confirm this is intended.
User-agent: Google-Extended
Allow: /

# Apple: both tokens are disallowed. Applebot-Extended is the AI-training
# opt-out token; Applebot is Apple's search crawler, presumably blocked because
# Apple is not one of the whitelisted search engines (confirm).
User-agent: Applebot-Extended
Disallow: /

User-agent: Applebot
Disallow: /

# Perplexity: allow citation (index + live fetch)
User-agent: PerplexityBot
Allow: /

User-agent: Perplexity-User
Allow: /

# Common Crawl (bulk training dataset)
User-agent: CCBot
Disallow: /

# ByteDance / TikTok
User-agent: Bytespider
Disallow: /

# Meta AI
User-agent: Meta-ExternalAgent
Disallow: /

User-agent: Meta-ExternalFetcher
Disallow: /

User-agent: FacebookBot
Disallow: /

# Amazon
User-agent: Amazonbot
Disallow: /

# Other AI/data crawlers
User-agent: Diffbot
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: Omgili
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: cohere-training-data-crawler
Disallow: /

User-agent: YouBot
Disallow: /

User-agent: Bravebot
Disallow: /

User-agent: Neevabot
Disallow: /

User-agent: FriendlyCrawler
Disallow: /

User-agent: ImagesiftBot
Disallow: /

User-agent: img2dataset
Disallow: /

User-agent: TurnitinBot
Disallow: /

User-agent: Timpibot
Disallow: /

User-agent: PanguBot
Disallow: /

User-agent: webzio-extended
Disallow: /

User-agent: iaskspider
Disallow: /

User-agent: AI2Bot
Disallow: /

User-agent: AwarioSmartBot
Disallow: /

User-agent: VelenPublicWebCrawler
Disallow: /

# NOTE(review): the next two names contain a space and a dot. RFC 9309 product
# tokens consist only of letters, "-" and "_", so strict parsers may truncate
# or fail to match them. Verify the tokens these crawlers actually document.
User-agent: Kangaroo Bot
Disallow: /

User-agent: Scoop.it
Disallow: /

# =============================================================================
# SEO / BACKLINK / AUDIT CRAWLERS (disallow, they extract for competitors)
# =============================================================================
# One record per crawler, each with a blanket "Disallow: /". Some vendors
# appear under more than one token (e.g. SemrushBot/Semrush,
# Serpstat/SerpstatBot, linkdexbot/Linkdex), apparently to cover naming
# variants; each token is matched as its own group.
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: Semrush
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: rogerbot
Disallow: /

User-agent: MegaIndex
Disallow: /

User-agent: Serpstat
Disallow: /

User-agent: SerpstatBot
Disallow: /

User-agent: sistrix
Disallow: /

User-agent: SiteAuditBot
Disallow: /

# NOTE(review): SeznamBot appears to be the crawler of the Seznam.cz search
# engine rather than an SEO tool, so it may belong in the NON-WHITELIST SEARCH
# ENGINES section. The effect (full disallow) is the same either way.
User-agent: SeznamBot
Disallow: /

User-agent: spbot
Disallow: /

User-agent: linkdexbot
Disallow: /

User-agent: SpyFu
Disallow: /

User-agent: MajesticSEO
Disallow: /

User-agent: Majestic-12
Disallow: /

User-agent: BarkRowler
Disallow: /

User-agent: SeekportBot
Disallow: /

User-agent: Exabot
Disallow: /

User-agent: Cliqzbot
Disallow: /

User-agent: Searchmetricsbot
Disallow: /

User-agent: BacklinkCrawler
Disallow: /

User-agent: LinkpadBot
Disallow: /

User-agent: Linkdex
Disallow: /

User-agent: Cocolyzebot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

# =============================================================================
# NON-WHITELIST SEARCH ENGINES (disallow; only Google, Bing, Yandex and
# Mail.ru are whitelisted)
# =============================================================================
# Search engines outside the whitelist: one record each, full disallow.
#
# User-agent matching is case-insensitive (RFC 9309, section 2.2.1), so a
# single Baidu record in the crawler's documented casing covers every
# spelling; the former lowercase duplicate was redundant.
User-agent: Baiduspider
Disallow: /

User-agent: sogou
Disallow: /

User-agent: coccoc
Disallow: /

User-agent: Naver
Disallow: /

User-agent: Yeti
Disallow: /

User-agent: DuckDuckBot
Disallow: /

User-agent: Qwantbot
Disallow: /

User-agent: Qwantify
Disallow: /

User-agent: Ecosia
Disallow: /

User-agent: PetalBot
Disallow: /

# =============================================================================
# DOWNLOAD TOOLS / OFFLINE BROWSERS / BULK SCRAPERS (disallow)
# =============================================================================
# One record per tool, each with a blanket "Disallow: /".
# These rules are advisory: they only affect tools that choose to read and obey
# robots.txt. The file header states that .htaccess enforces the same policy
# with 403 responses, which is the actual barrier for this category.
User-agent: HTTrack
Disallow: /

User-agent: Wget
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: GetRight
Disallow: /

User-agent: FlashGet
Disallow: /

User-agent: LeechGet
Disallow: /

User-agent: MassDownloader
Disallow: /

User-agent: Harvester
Disallow: /

User-agent: EmailCollector
Disallow: /

User-agent: EmailSiphon
Disallow: /

User-agent: EmailWolf
Disallow: /

User-agent: ExtractorPro
Disallow: /

User-agent: LinkExtractorPro
Disallow: /

# =============================================================================
# ARCHIVE CRAWLERS (disallow, prevent Wayback caching of landing content)
# =============================================================================
# One record per archiving crawler, each with a blanket "Disallow: /".
# NOTE(review): robots.txt is only a request; confirm these archivers still
# honor it, otherwise rely on the .htaccess rules described in the file header.
User-agent: ia_archiver
Disallow: /

User-agent: archive.org_bot
Disallow: /

User-agent: Archive-It
Disallow: /

User-agent: wayback
Disallow: /

# =============================================================================
# DEFAULT (any bot not listed above)
#
# Allows crawling by polite crawlers that are not explicitly blacklisted,
# but disallows dev/build artifacts. Note: the .htaccess Mozilla check
# already 403s any UA that is neither a real browser nor an allowed bot, so
# this default rule is a second line of defense for crawlers that DO spoof
# Mozilla but respect robots.txt.
# =============================================================================

# Fallback group for every crawler that has no record of its own above.
# A crawler obeys only its most specific matching group, so the search engines
# and AI bots named earlier in this file never read these rules.
#
# NOTE(review): robots.txt is world-readable, so this list also advertises
# which sensitive paths might exist. It is a crawl hint, not access control;
# confirm the web server itself denies these paths.
User-agent: *
# Secrets, VCS metadata, editor/OS artifacts
Disallow: /.env
Disallow: /.git/
Disallow: /.svn/
Disallow: /.hg/
Disallow: /.idea/
Disallow: /.vscode/
Disallow: /.DS_Store
# Backups, dependency trees, deploy scripts
Disallow: /backups/
Disallow: /node_modules/
Disallow: /vendor/
Disallow: /deploy.sh
Disallow: /up.sh
# Package manifests, lockfiles and repository docs
Disallow: /package.json
Disallow: /package-lock.json
Disallow: /yarn.lock
Disallow: /pnpm-lock.yaml
Disallow: /README.md
Disallow: /CHANGELOG.md
Disallow: /LICENSE
Disallow: /Makefile
# By extension ("*" and "$" are wildcard extensions; parsers without wildcard
# support fall back on the explicit paths above)
Disallow: /*.log$
Disallow: /*.bak$
Disallow: /*.swp$
Disallow: /*.swo$
Disallow: /*.sql$
Disallow: /*.sqlite$
Disallow: /*.env$
Disallow: /*.sh$
Disallow: /*.md$
Disallow: /*.yml$
Disallow: /*.yaml$
Disallow: /*.toml$
Disallow: /*.sample$
Disallow: /*.example$
# Crawl-delay is a non-standard extension. Google ignores it, and Bing and
# Yandex match their own groups earlier in this file, so this value only
# throttles unlisted crawlers that honor the directive. It is kept directly
# inside the group (no blank line before it) because some parsers end a record
# at the first blank line and would otherwise drop it.
Crawl-delay: 2

# =============================================================================
# SITEMAP
# =============================================================================
# Sitemap is a file-level directive: it is not tied to any User-agent group and
# must be given as an absolute URL.
Sitemap: https://blockchain-development-company.skin/sitemap.xml