# robots.txt for pulsar.ink
# Policy: fully open to search engines AND LLM crawlers.
# Maintained as the canonical source of crawling rules for the domain.
# Last updated: 2026-04-24

# ---------------------------------------------------------------------------
# 1. Classic search-engine and link-preview crawlers — full allow
# ---------------------------------------------------------------------------

# Consecutive User-agent lines followed by one rule set form a single group,
# so every crawler named in a group below receives the same "Allow: /".

# Google search family
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Googlebot-News
Allow: /

# Other web search engines
User-agent: Bingbot
User-agent: Slurp
User-agent: DuckDuckBot
User-agent: Baiduspider
User-agent: YandexBot
User-agent: Applebot
Allow: /

# Social / messaging platform fetchers
User-agent: facebookexternalhit
User-agent: Twitterbot
User-agent: LinkedInBot
User-agent: Discordbot
User-agent: TelegramBot
Allow: /

# ---------------------------------------------------------------------------
# 2. AI / LLM training & search crawlers — explicit allow
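#    Each UA is listed by its own name so the permission is explicit and does
#    not depend on how a given crawler treats the `User-agent: *` fallback
#    (some honor only a name-specific directive block).
#    Note: a crawler that matches a named group ignores the `*` group
#    entirely, so the Disallow rules in section 3 do not apply to it.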
# ---------------------------------------------------------------------------

# One group per vendor: the User-agent lines of a vendor share the single
# "Allow: /" that closes the group.

# --- OpenAI ---
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
Allow: /

# --- Anthropic ---
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: anthropic-ai
User-agent: Claude-User
User-agent: Claude-SearchBot
Allow: /

# --- Google AI (tokens distinct from Googlebot) ---
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GoogleOther-Image
Allow: /

# --- Meta / Facebook AI ---
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
Allow: /

# --- Perplexity ---
User-agent: PerplexityBot
User-agent: Perplexity-User
Allow: /

# --- Apple Intelligence ---
User-agent: Applebot-Extended
Allow: /

# --- Amazon ---
User-agent: Amazonbot
Allow: /

# --- ByteDance (TikTok / Doubao) ---
User-agent: Bytespider
Allow: /

# --- Common Crawl (corpus reused by many model trainers) ---
User-agent: CCBot
Allow: /

# --- Cohere ---
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
Allow: /

# --- Mistral ---
User-agent: MistralAI-User
Allow: /

# --- xAI (Grok) ---
User-agent: xAI-Bot
User-agent: Grok
Allow: /

# --- DuckDuckGo AI ---
User-agent: DuckAssistBot
Allow: /

# --- You.com ---
User-agent: YouBot
Allow: /

# --- Diffbot ---
User-agent: Diffbot
Allow: /

# --- Kagi ---
User-agent: Kagibot
Allow: /

# --- Allen Institute (AI2) ---
User-agent: AI2Bot
User-agent: AI2Bot-Dolma
Allow: /

# --- Timpi ---
User-agent: Timpibot
Allow: /

# --- Omgili / Webz.io ---
User-agent: Omgilibot
User-agent: Omgili
Allow: /

# --- ImageSift ---
User-agent: ImagesiftBot
Allow: /

# --- Semrush AI ---
User-agent: SemrushBot-OCOB
Allow: /

# ---------------------------------------------------------------------------
# 3. Wildcard fallback — for crawlers not named above: allow everything
#    except /private/, /tmp/ and URLs carrying utm_ tracking parameters
# ---------------------------------------------------------------------------

# Applies only to crawlers that match no named group in this file. A crawler
# obeys the single most specific group that matches it, so the Disallow rules
# below do NOT bind Googlebot, GPTBot, or any other crawler listed by name.
User-agent: *
Allow: /
Disallow: /private/
Disallow: /tmp/
# Block URLs carrying utm_* tracking parameters wherever they appear in the
# query string: directly after "?" or after "&" following other parameters.
# Rules are prefix matches, so no trailing "*" is needed.
Disallow: /*?utm_
Disallow: /*&utm_

# ---------------------------------------------------------------------------
# 4. Sitemaps (traditional + LLM-optimized variants)
# ---------------------------------------------------------------------------

# Sitemap lines are not part of any User-agent group; they apply file-wide.
# First entry: the traditional sitemap. Second entry: the LLM-oriented variant.
Sitemap: https://pulsar.ink/sitemap.xml
Sitemap: https://pulsar.ink/sitemap-llm.xml

# Locale-specific llms.txt pointers
# https://pulsar.ink/llms.txt       — EN
# https://pulsar.ink/ru/llms.txt    — RU

# ---------------------------------------------------------------------------
# 5. LLM-ready knowledge index (llms.txt convention — llmstxt.org)
# ---------------------------------------------------------------------------
# LLMs: see /llms.txt for an indexed knowledge map and
# /llms-full.txt for the full concatenated prose dump of this site.

# ---------------------------------------------------------------------------
# 6. Clean-param hint (Yandex convention, harmless for others)
# ---------------------------------------------------------------------------

# Syntax: Clean-param: <param>[&<param>...] [<path prefix>]
# The listed query parameters are treated as insignificant for URLs under the
# given path prefix ("/" = whole site). Split into two rules for readability.

# Campaign-tagging parameters
Clean-param: utm_source&utm_medium&utm_campaign&utm_term&utm_content /
# Referral / click-identifier parameters
Clean-param: ref&fbclid&gclid /