# robots.txt for VRC Pop! (vrcpop.com)
# VRChat Data Analytics Tracker
# Last updated: May 2026 (most recent change: 2026-05-13)

# Default rules for all crawlers
#
# MAINTENANCE NOTE: keep this group free of blank lines — use "#" separator
# lines instead. RFC 9309 parsers (Google, Bing) ignore blank lines, but
# parsers that follow the original robots.txt record convention (for example
# Python's urllib.robotparser) treat a blank line as end-of-record. A blank
# line right after "User-agent: *" makes those parsers discard the group and
# ignore every rule below it.
User-agent: *
#
# Crawl rate limiting (1 second between requests)
# 2026-05-11: dropped from 10s to 1s. Google ignores this directive, but
# Bing/Yandex honor it — a 10s delay capped sitewide indexing to ~360 URLs/hr,
# which throttled discovery of new events. 1s is the standard polite ceiling.
# NOTE(review): Yandex reportedly stopped honoring Crawl-delay in 2018 (crawl
# rate is configured in Yandex Webmaster instead) — confirm and update the
# rationale above if so.
Crawl-delay: 1
#
# === BLOCKED: Admin & System Directories ===
Disallow: /admin/
Disallow: /api/
Disallow: /cron/
Disallow: /config/
Disallow: /includes/
#
# === HONEYPOT TRIPWIRE ===
# /api/internal/dump returns 410 Gone and auto-bans the requesting IP for 24h.
# No human or legitimate crawler should ever fetch it — well-behaved crawlers
# obey the Disallow below. A hit = stealth scraper that ignores robots.txt = ban.
# (Already covered by "Disallow: /api/" above; listed explicitly on purpose so
# the tripwire path is visible to anyone reading this file.)
# [zoaG9MOH 2026-05-12]
Disallow: /api/internal/dump
#
# === BLOCKED: Internal/Utility Pages ===
Disallow: /tracked
Disallow: /tracked.php
Disallow: /health
Disallow: /health.php
Disallow: /gaps
Disallow: /gaps.php
Disallow: /analytics
Disallow: /analytics.php
Disallow: /overview
Disallow: /overview.php
#
# === BLOCKED: Dynamic/Parameter-heavy Pages ===
# /group + /group.php unblocked 2026-05-12 (WzFOiOIL): active groups are
# now indexable; blacklisted/removed groups self-noindex via page-level meta.
# Sitemap exposes only active groups (events in 90d OR populated club_profiles).
Disallow: /instances
Disallow: /instances.php
Disallow: /compare
Disallow: /compare.php
#
# /groups.php with ?tags= query enumerates combinatorial tag-filter URLs
# (24+ tags = millions of permutations). meta-webindexer hit a 13-second
# burst of these on 2026-05-13, exhausting fcgid slots and 503ing real users
# mid-burst — same pattern as the Apr 29 meta-externalagent incident.
# The canonical /groups.php (no params) stays crawlable via sitemap.
#
# Rules are prefix matches, so "?tags=" alone only catches URLs where tags is
# the FIRST query parameter. The "?*&tags=" variants catch it in any later
# position (e.g. /groups.php?page=2&tags=...). The extensionless /groups
# variants assume the site serves that path the same way it serves the other
# pages listed both with and without .php — harmless if it does not exist.
Disallow: /groups.php?tags=
Disallow: /groups.php?*&tags=
Disallow: /groups?tags=
Disallow: /groups?*&tags=
#
# === BLOCKED: Fix/Debug Scripts ===
# Plain prefixes: a trailing "*" is redundant for wildcard-aware parsers and
# would be taken literally (matching nothing) by prefix-only parsers.
Disallow: /fix_
Disallow: /debug_
#
# === BLOCKED: Log Files ===
# Requires "*" and "$" wildcard support (Google, Bing); prefix-only parsers
# will not honor this rule.
Disallow: /*.log$
#
# === BLOCKED: Images & Logos ===
Disallow: /logos/
Disallow: /assets/images/
#
# === BLOCKED: Static Assets (no SEO value) ===
# NOTE(review): Google advises against blocking CSS/JS that pages need to
# render, because Googlebot renders pages when indexing them. If indexed pages
# depend on these assets for layout or content, consider removing these two
# rules — confirm with a URL Inspection render test before changing.
Disallow: /assets/css/
Disallow: /assets/js/

# ============================================
# AI CRAWLER POLICY
# Distinction (2026-05-11): we BLOCK AI training crawlers
# but ALLOW AI search crawlers (PerplexityBot, ChatGPT-User,
# OAI-SearchBot). The latter drive discovery — modern search
# users increasingly arrive via LLM-mediated SERPs, not just Google.
# ============================================

# --- BLOCKED: training crawlers (would scrape for model training) ---
# One group per crawler, each with a blanket "Disallow: /". A crawler that
# matches one of these groups follows ONLY that group and ignores the
# "User-agent: *" rules; that is fine here because everything is disallowed.
# robots.txt is advisory: crawlers that ignore it must be stopped server-side.

# OpenAI GPTBot (training)
User-agent: GPTBot
Disallow: /

# Google AI Training (Gemini training; separate from Googlebot search)
User-agent: Google-Extended
Disallow: /

# Common Crawl (used for AI training datasets industry-wide)
User-agent: CCBot
Disallow: /

# Anthropic training crawler
User-agent: anthropic-ai
Disallow: /

# Claude Web Crawler (training)
User-agent: ClaudeBot
Disallow: /

# Cohere AI training
User-agent: cohere-ai
Disallow: /

# Facebook/Meta AI training
User-agent: FacebookBot
Disallow: /

# ByteDance/TikTok AI training
User-agent: Bytespider
Disallow: /

# Apple AI training (note: regular Applebot for Siri/Spotlight search is allowed by default)
User-agent: Applebot-Extended
Disallow: /

# Meta external agent (AI training; distinct from facebookexternalhit link previews)
# Caused fcgid slot exhaustion enumerating /groups.php tag combos Apr 29 2026
User-agent: meta-externalagent
Disallow: /

# Meta web indexer (sibling crawler to meta-externalagent; same enumeration
# pattern). 2026-05-13: 13-second burst on /groups.php?tags= combinations
# 503ed real users mid-burst. Hard-disallow.
User-agent: meta-webindexer
Disallow: /

# Amazon AI training crawler
User-agent: Amazonbot
Disallow: /

# --- ALLOWED: search crawlers (no dedicated group; governed by "User-agent: *") ---
# These are NOT in any "Disallow: /" group; listed here as a deliberate-intent marker
# so future audits don't accidentally lump them in with training crawlers.
# Caution: if one of them is ever given its own User-agent group, it will stop
# following the "User-agent: *" rules, so copy the default Disallow list into it.
#   - PerplexityBot      (Perplexity AI search; was previously blocked, unblocked 2026-05-11)
#   - ChatGPT-User       (one-off citations when a user asks ChatGPT a question; was previously blocked)
#   - OAI-SearchBot      (ChatGPT Search index; separate user-agent from training GPTBot)
#   - Applebot           (Siri / Spotlight web results; distinct from Applebot-Extended training)

# ============================================
# SEO / SITE-AUDIT CRAWLERS (not subscribed; block)
# ============================================

# MarketGoo SEO crawler — was bleeding through UA filter, see
# docs/analytics/2026-05-05_traffic-peak.md
User-agent: marketgoo
Disallow: /

# Semrush / Moz / Majestic — SEO crawlers paginating combinatorial filters
# (.htaccess bad_bot already 403s on UA; listed here so policy is explicit)
# Semrush
User-agent: SemrushBot
Disallow: /

# Moz
User-agent: DotBot
Disallow: /

# Majestic
User-agent: MJ12bot
Disallow: /

# ============================================
# SITEMAP
# Not tied to any User-agent group; applies to every crawler that reads
# this file. The value must be an absolute URL.
# ============================================
Sitemap: https://vrcpop.com/sitemap.xml