# robots.txt for VRC Pop! (vrcpop.com)
# VRChat Data Analytics Tracker
# Last updated: May 2026
#
# FORMAT NOTE: robots.txt is line-oriented. Every directive and every comment
# must sit on its own line; a directive that shares a line with a leading "#"
# is parsed as part of the comment and silently ignored. Inside a group, use
# "#" separator lines rather than blank lines: some parsers treat a blank line
# as the end of the group, which would drop the rules that follow it.

# ============================================
# DEFAULT RULES FOR ALL CRAWLERS
# ============================================
User-agent: *
#
# Crawl rate limiting (1 second between requests)
# 2026-05-11: dropped from 10s to 1s. Google ignores this directive, but
# Bing/Yandex honor it — a 10s delay capped sitewide indexing to ~360 URLs/hr,
# which throttled discovery of new events. 1s is the standard polite ceiling.
Crawl-delay: 1
#
# === BLOCKED: Admin & System Directories ===
Disallow: /admin/
Disallow: /api/
Disallow: /cron/
Disallow: /config/
Disallow: /includes/
#
# === HONEYPOT TRIPWIRE ===
# /api/internal/dump returns 410 Gone and auto-bans the requesting IP for 24h.
# No human or legitimate crawler should ever fetch it — well-behaved crawlers
# obey the Disallow below. A hit = stealth scraper that ignores robots.txt = ban.
# (Already covered by "Disallow: /api/" above; listed explicitly on purpose so
# the tripwire path is visible to anything that reads this file.)
# [zoaG9MOH 2026-05-12]
Disallow: /api/internal/dump
#
# === BLOCKED: Internal/Utility Pages ===
Disallow: /tracked
Disallow: /tracked.php
Disallow: /health
Disallow: /health.php
Disallow: /gaps
Disallow: /gaps.php
Disallow: /analytics
Disallow: /analytics.php
Disallow: /overview
Disallow: /overview.php
#
# === BLOCKED: Dynamic/Parameter-heavy Pages ===
# /group + /group.php unblocked 2026-05-12 (WzFOiOIL): active groups are
# now indexable; blacklisted/removed groups self-noindex via page-level meta.
# Sitemap exposes only active groups (events in 90d OR populated club_profiles).
Disallow: /instances
Disallow: /instances.php
Disallow: /compare
Disallow: /compare.php
#
# /groups.php with ?tags= query enumerates combinatorial tag-filter URLs
# (24+ tags = millions of permutations). meta-webindexer hit a 13-second
# burst of these on 2026-05-13, exhausting fcgid slots and 503ing real users
# mid-burst — same pattern as the Apr 29 meta-externalagent incident.
# The canonical /groups.php (no params) stays crawlable via sitemap.
#
# Rules are prefix matches, so "?tags=" alone only catches URLs where tags is
# the FIRST query parameter. The "?*&tags=" variants catch it in any later
# position (e.g. /groups.php?page=2&tags=a,b).
# NOTE(review): the extensionless /groups variants assume that route is served
# the same way /tracked, /health, etc. are — confirm; harmless if it is not.
Disallow: /groups.php?tags=
Disallow: /groups.php?*&tags=
Disallow: /groups?tags=
Disallow: /groups?*&tags=
#
# === BLOCKED: Fix/Debug Scripts ===
Disallow: /fix_*
Disallow: /debug_*
#
# === BLOCKED: Log Files ===
Disallow: /*.log$
#
# === BLOCKED: Images & Logos ===
Disallow: /logos/
Disallow: /assets/images/
#
# === BLOCKED: Static Assets (no SEO value) ===
# NOTE(review): Google's guidance is to keep CSS/JS crawlable so Googlebot can
# render pages the way users see them; blocking these can degrade rendering-
# based indexing. Left in place as-is — confirm this block is still intended.
Disallow: /assets/css/
Disallow: /assets/js/

# ============================================
# AI CRAWLER POLICY
# Distinction (2026-05-11): we BLOCK AI training crawlers
# but ALLOW AI search crawlers (Perplexity, ChatGPT Search,
# OAI-SearchBot). The latter drive discovery — modern search
# users increasingly arrive via LLM-mediated SERPs, not just Google.
#
# A crawler that matches one of the named groups below follows ONLY that
# group; it does not also inherit the "User-agent: *" rules above.
# ============================================

# --- BLOCKED: training crawlers (would scrape for model training) ---

# OpenAI GPTBot (training)
User-agent: GPTBot
Disallow: /

# Google AI Training (Gemini training; separate from Googlebot search)
User-agent: Google-Extended
Disallow: /

# Common Crawl (used for AI training datasets industry-wide)
User-agent: CCBot
Disallow: /

# Anthropic training crawler
User-agent: anthropic-ai
Disallow: /

# Claude Web Crawler (training)
User-agent: ClaudeBot
Disallow: /

# Cohere AI training
User-agent: cohere-ai
Disallow: /

# Facebook/Meta AI training
User-agent: FacebookBot
Disallow: /

# ByteDance/TikTok AI training
User-agent: Bytespider
Disallow: /

# Apple AI training (note: regular Applebot for Siri/Spotlight search is allowed by default)
User-agent: Applebot-Extended
Disallow: /

# Meta external agent (AI training; distinct from facebookexternalhit link previews)
# Caused fcgid slot exhaustion enumerating /groups.php tag combos Apr 29 2026
User-agent: meta-externalagent
Disallow: /

# Meta web indexer (sibling crawler to meta-externalagent; same enumeration
# pattern). 2026-05-13: 13-second burst on /groups.php?tags= combinations
# 503ed real users mid-burst. Hard-disallow.
User-agent: meta-webindexer
Disallow: /

# Amazon AI training crawler
User-agent: Amazonbot
Disallow: /

# --- ALLOWED: search crawlers (explicitly granted; same as default) ---
# These are NOT in any Disallow block; listed here as a deliberate-intent marker
# so future audits don't accidentally lump them in with training crawlers:
#   - PerplexityBot   (Perplexity AI search; was previously blocked, unblocked 2026-05-11)
#   - ChatGPT-User    (one-off citations when a user asks ChatGPT a question; was previously blocked)
#   - OAI-SearchBot   (ChatGPT Search index; separate user-agent from training GPTBot)
#   - Applebot        (Siri / Spotlight web results; distinct from Applebot-Extended training)

# ============================================
# SEO / SITE-AUDIT CRAWLERS (not subscribed; block)
# ============================================

# MarketGoo SEO crawler — was bleeding through UA filter, see
# docs/analytics/2026-05-05_traffic-peak.md
User-agent: marketgoo
Disallow: /

# Semrush / Moz / Majestic — SEO crawlers paginating combinatorial filters
# (.htaccess bad_bot already 403s on UA; listed here so policy is explicit)
User-agent: SemrushBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: MJ12bot
Disallow: /

# ============================================
# SITEMAP
# ============================================
Sitemap: https://vrcpop.com/sitemap.xml