"""Stage 1 — Persona Generator. Generate buyer personas for a category using Claude Sonnet. Each persona includes demographics, pain points, decision factors, vocabulary. """ from __future__ import annotations import argparse import json import os from pathlib import Path from anthropic import Anthropic from config import CONFIG PERSONA_PROMPT = """You are a market researcher specializing in Polish e-commerce buyer personas. Generate {num_personas} distinct buyer personas for the category: **{category_display_name}**. Each persona should represent a realistic, distinct segment of buyers in this category. Avoid generic "millennial professional" — be specific. For each persona, provide: 1. **Name/label** — short descriptor (e.g., "30+ kobieta kupująca prezent dla mamy") 2. **Demographics** — age range, gender, location type (city/suburb/rural), income bracket 3. **Pain points** — what they're trying to solve when buying in this category 4. **Decision factors** — ranked list of what matters most (price, ingredients, brand, reviews, etc.) 5. **Vocabulary** — how they actually talk: formal/colloquial, technical/lay, Polish-specific phrases they use 6. **Search behavior** — do they research deeply or buy impulsively? Which platforms do they trust? Categories context: - Polish e-commerce market - Buyers using AI assistants (ChatGPT, Perplexity, Gemini, Claude) increasingly to discover and compare brands - This is for {category_display_name} — a {market_size} market with brands like {sample_brands} Output as valid JSON in this exact schema: ```json {{ "category": "{category_slug}", "personas": [ {{ "id": "persona_1", "label": "30+ kobieta kupująca prezent dla mamy", "demographics": {{ "age_range": "30-45", "gender": "female", "location_type": "city", "income_bracket": "middle to upper-middle" }}, "pain_points": [ "...", "..." ], "decision_factors_ranked": [ "...", "..." ], "vocabulary": {{ "register": "informal but respectful", "polish_phrases": ["...", "..."], "tech_savvy": "medium" }}, "search_behavior": "researches 2-3 options before deciding, trusts Reddit and friend recommendations" }} ] }} ``` Only output the JSON. No prose before or after.""" def load_category_context(category_slug: str, data_dir: Path) -> dict: """Load market metadata + sample brands for the category.""" metadata_file = data_dir / category_slug / "market_metadata.json" catalog_file = data_dir / category_slug / "brand_catalog.json" if not metadata_file.exists(): raise FileNotFoundError(f"Market metadata not found: {metadata_file}") if not catalog_file.exists(): raise FileNotFoundError(f"Brand catalog not found: {catalog_file}") with open(metadata_file, "r", encoding="utf-8") as f: metadata = json.load(f) with open(catalog_file, "r", encoding="utf-8") as f: catalog = json.load(f) sample_brands = ", ".join(b["name"] for b in catalog["brands"][:5]) market_size = metadata["market_size"]["tier_classification"] return { "metadata": metadata, "catalog": catalog, "sample_brands": sample_brands, "market_size": market_size, } def generate_personas(category_slug: str, category_display_name: str, num_personas: int = None) -> dict: """Generate personas via Claude API.""" if num_personas is None: num_personas = CONFIG.num_personas data_dir = Path(__file__).parent.parent.parent / "data" context = load_category_context(category_slug, data_dir) client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) prompt = PERSONA_PROMPT.format( num_personas=num_personas, category_slug=category_slug, category_display_name=category_display_name, market_size=context["market_size"], sample_brands=context["sample_brands"], ) response = client.messages.create( model=CONFIG.persona_model, max_tokens=4000, messages=[{"role": "user", "content": prompt}], ) # Extract JSON from response text = response.content[0].text.strip() if text.startswith("```json"): text = text[7:] if text.endswith("```"): text = text[:-3] text = text.strip() personas_data = json.loads(text) return personas_data def save_personas(category_slug: str, personas: dict) -> Path: """Save personas to data/{category}/personas.json.""" data_dir = Path(__file__).parent.parent.parent / "data" / category_slug data_dir.mkdir(parents=True, exist_ok=True) output_file = data_dir / "personas.json" with open(output_file, "w", encoding="utf-8") as f: json.dump(personas, f, ensure_ascii=False, indent=2) return output_file def main(): parser = argparse.ArgumentParser(description="Generate buyer personas for a Citee category.") parser.add_argument("--category", required=True, help="Category slug (e.g., 'swiece-sojowe-pl')") parser.add_argument("--display-name", required=True, help="Human-readable category name (e.g., 'Świece sojowe PL')") parser.add_argument("--num-personas", type=int, default=None, help=f"Number of personas to generate (default: {CONFIG.num_personas})") args = parser.parse_args() print(f"[Stage 1] Generating {args.num_personas or CONFIG.num_personas} personas for {args.category}...") personas = generate_personas(args.category, args.display_name, args.num_personas) output_path = save_personas(args.category, personas) print(f"[Stage 1] ✅ Saved {len(personas['personas'])} personas to {output_path}") if __name__ == "__main__": main()