# citee-methodology/tools/prompt_curation/1_persona_generator.py

"""Stage 1 — Persona Generator.

Generate buyer personas for a category using Claude Sonnet.
Each persona includes demographics, pain points, decision factors, vocabulary.
"""
from __future__ import annotations

import argparse
import json
import os
from pathlib import Path

from anthropic import Anthropic

from config import CONFIG

# Prompt template for the persona-generation request. It is filled with
# ``str.format`` using the fields ``num_personas``, ``category_display_name``,
# ``category_slug``, ``market_size`` and ``sample_brands``. Braces that belong
# to the JSON schema example are doubled (``{{`` / ``}}``) so that ``format``
# emits them as literal single braces instead of treating them as fields.
PERSONA_PROMPT = """You are a market researcher specializing in Polish e-commerce buyer personas.

Generate {num_personas} distinct buyer personas for the category: **{category_display_name}**.

Each persona should represent a realistic, distinct segment of buyers in this category. Avoid generic "millennial professional" — be specific.

For each persona, provide:
1. **Name/label** — short descriptor (e.g., "30+ kobieta kupująca prezent dla mamy")
2. **Demographics** — age range, gender, location type (city/suburb/rural), income bracket
3. **Pain points** — what they're trying to solve when buying in this category
4. **Decision factors** — ranked list of what matters most (price, ingredients, brand, reviews, etc.)
5. **Vocabulary** — how they actually talk: formal/colloquial, technical/lay, Polish-specific phrases they use
6. **Search behavior** — do they research deeply or buy impulsively? Which platforms do they trust?

Categories context:
- Polish e-commerce market
- Buyers using AI assistants (ChatGPT, Perplexity, Gemini, Claude) increasingly to discover and compare brands
- This is for {category_display_name} — a {market_size} market with brands like {sample_brands}

Output as valid JSON in this exact schema:

```json
{{
  "category": "{category_slug}",
  "personas": [
    {{
      "id": "persona_1",
      "label": "30+ kobieta kupująca prezent dla mamy",
      "demographics": {{
        "age_range": "30-45",
        "gender": "female",
        "location_type": "city",
        "income_bracket": "middle to upper-middle"
      }},
      "pain_points": [
        "...",
        "..."
      ],
      "decision_factors_ranked": [
        "...",
        "..."
      ],
      "vocabulary": {{
        "register": "informal but respectful",
        "polish_phrases": ["...", "..."],
        "tech_savvy": "medium"
      }},
      "search_behavior": "researches 2-3 options before deciding, trusts Reddit and friend recommendations"
    }}
  ]
}}
```

Only output the JSON. No prose before or after."""


def load_category_context(category_slug: str, data_dir: Path) -> dict:
    """Read a category's market metadata and brand catalog from disk.

    Both files are expected under ``data_dir / category_slug``:
    ``market_metadata.json`` and ``brand_catalog.json``.

    Args:
        category_slug: Name of the category sub-directory inside ``data_dir``.
        data_dir: Directory that holds one sub-directory per category.

    Returns:
        A dict with the parsed ``metadata`` and ``catalog`` documents, a
        comma-separated ``sample_brands`` string built from the names of the
        first five catalog brands, and ``market_size`` taken from the
        metadata's ``market_size.tier_classification`` entry.

    Raises:
        FileNotFoundError: If either of the two JSON files does not exist.
    """
    category_dir = data_dir / category_slug
    metadata_path = category_dir / "market_metadata.json"
    catalog_path = category_dir / "brand_catalog.json"

    # Check both inputs before parsing anything; metadata is checked first.
    required_inputs = (
        ("Market metadata", metadata_path),
        ("Brand catalog", catalog_path),
    )
    for description, path in required_inputs:
        if not path.exists():
            raise FileNotFoundError(f"{description} not found: {path}")

    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    catalog = json.loads(catalog_path.read_text(encoding="utf-8"))

    # Only the leading five brands are used as examples.
    brand_names = [brand["name"] for brand in catalog["brands"][:5]]
    tier = metadata["market_size"]["tier_classification"]

    return {
        "metadata": metadata,
        "catalog": catalog,
        "sample_brands": ", ".join(brand_names),
        "market_size": tier,
    }


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # The opening fence may carry a "json" language tag in any letter case.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def generate_personas(
    category_slug: str,
    category_display_name: str,
    num_personas: int | None = None,
    max_tokens: int = 4000,
) -> dict:
    """Generate personas via Claude API.

    Loads the category context from the ``data`` directory resolved relative
    to this file, fills ``PERSONA_PROMPT`` with it, sends the prompt as a
    single user message, and parses the reply as JSON.

    Args:
        category_slug: Category directory name; also echoed into the prompt.
        category_display_name: Human-readable category name for the prompt.
        num_personas: Number of personas to request. ``None`` falls back to
            ``CONFIG.num_personas``.
        max_tokens: Upper bound on reply length passed to the API. Raise it
            when requesting many personas so the JSON is not cut off.

    Returns:
        The parsed JSON document from the model's reply.

    Raises:
        FileNotFoundError: If the category's metadata or catalog is missing.
        KeyError: If ``ANTHROPIC_API_KEY`` is not set in the environment.
        json.JSONDecodeError: If the reply is not valid JSON. When the API
            reports that the reply stopped at the token limit, the message
            says so.
    """
    if num_personas is None:
        num_personas = CONFIG.num_personas

    data_dir = Path(__file__).parent.parent.parent / "data"
    context = load_category_context(category_slug, data_dir)

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    prompt = PERSONA_PROMPT.format(
        num_personas=num_personas,
        category_slug=category_slug,
        category_display_name=category_display_name,
        market_size=context["market_size"],
        sample_brands=context["sample_brands"],
    )

    response = client.messages.create(
        model=CONFIG.persona_model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )

    # The model is told to answer with bare JSON but often wraps it in a
    # Markdown fence anyway; accept both forms.
    payload = _strip_code_fence(response.content[0].text)

    try:
        return json.loads(payload)
    except json.JSONDecodeError as err:
        if getattr(response, "stop_reason", None) != "max_tokens":
            raise
        # Keep the exception type, but point at the real cause: a truncated
        # reply rather than malformed output.
        raise json.JSONDecodeError(
            f"reply was cut off at max_tokens={max_tokens}; {err.msg}",
            err.doc,
            err.pos,
        ) from err


def save_personas(category_slug: str, personas: dict) -> Path:
    """Write *personas* as JSON to ``data/<category_slug>/personas.json``.

    The ``data`` directory is resolved relative to this file
    (``Path(__file__).parent.parent.parent / "data"``). The category directory
    is created if it does not exist. The file is UTF-8 encoded, keeps
    non-ASCII characters unescaped, and uses two-space indentation.

    Args:
        category_slug: Name of the category sub-directory to write into.
        personas: JSON-serialisable persona document.

    Returns:
        Path of the file that was written.
    """
    data_root = Path(__file__).parent.parent.parent / "data"
    category_dir = data_root / category_slug
    category_dir.mkdir(parents=True, exist_ok=True)

    destination = category_dir / "personas.json"
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(personas, handle, ensure_ascii=False, indent=2)

    return destination


def main():
    """Command-line entry point: generate and save personas for one category.

    Parses ``--category``, ``--display-name`` and the optional
    ``--num-personas``, calls ``generate_personas``, writes the result with
    ``save_personas``, and prints progress to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate buyer personas for a Citee category.")
    parser.add_argument("--category", required=True, help="Category slug (e.g., 'swiece-sojowe-pl')")
    parser.add_argument("--display-name", required=True, help="Human-readable category name (e.g., 'Świece sojowe PL')")
    parser.add_argument("--num-personas", type=int, default=None, help=f"Number of personas to generate (default: {CONFIG.num_personas})")
    args = parser.parse_args()

    # Resolve the count once with an explicit None check so the number that
    # is logged is always the number that is requested (an `or` fallback
    # would log the default for `--num-personas 0` while still requesting 0).
    num_personas = CONFIG.num_personas if args.num_personas is None else args.num_personas
    if num_personas < 1:
        parser.error(f"number of personas must be at least 1 (got {num_personas})")

    print(f"[Stage 1] Generating {num_personas} personas for {args.category}...")
    personas = generate_personas(args.category, args.display_name, num_personas)
    output_path = save_personas(args.category, personas)
    print(f"[Stage 1] ✅ Saved {len(personas['personas'])} personas to {output_path}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()