# citee-methodology/tools/prompt_curation/2_prompt_brainstormer.py

"""Stage 2 — Prompt Brainstormer.

For each persona generated in Stage 1, generate 30 prompts in the voice of that persona.
Total output: ~200-300 raw prompts per category, distributed across 5 prompt types.
"""
from __future__ import annotations

import argparse
import json
import os
from pathlib import Path

from anthropic import Anthropic

from config import CONFIG, get_target_counts

# Prompt template sent to the model once per persona; filled with str.format().
# Placeholders: persona_label, persona_id, demographics, pain_points,
# decision_factors, vocabulary, category_display_name, prompts_per_persona,
# sample_brands, and the five *_share percentages (buying, comparison,
# specific, info, brand). Literal braces in the JSON example are doubled
# ({{ }}) so that str.format() leaves them alone.
BRAINSTORM_PROMPT = """You are a Polish e-commerce buyer with the following persona:

**{persona_label}**

Demographics: {demographics}
Pain points: {pain_points}
Decision factors: {decision_factors}
Vocabulary style: {vocabulary}

Your task: Generate {prompts_per_persona} realistic queries you would type into ChatGPT, Perplexity, or Gemini when researching purchases in the **{category_display_name}** category.

CRITICAL distribution requirement — your {prompts_per_persona} prompts must be distributed across these 5 types:

| Type | Share | Example pattern |
|---|---|---|
| **buying** (active purchase intent) | {buying_share}% | "gdzie kupić [product] premium na prezent" |
| **comparison** (decision-stage) | {comparison_share}% | "[Brand A] vs [Brand B] który lepszy" |
| **specific_need** (specific use/attribute) | {specific_share}% | "[product] o [attribute] do [use case]" |
| **informational** (research) | {info_share}% | "co to [product] / jak działa [product]" |
| **brand_direct** (direct brand query) | {brand_share}% | "[Brand X] opinie / recenzje" |

Rules:
1. Write in **Polish** as your persona would actually phrase queries — colloquial, possibly with typos, possibly informal.
2. Be **specific** — avoid "best soy candles" generic. Add context: occasion, recipient, attribute, comparison.
3. Distribute across types per the percentages — don't put all {prompts_per_persona} in "buying."
4. Each prompt must be 5-15 words typical. Avoid 1-word ("świeczki") or 30-word essays.
5. Use real brand names from this list when relevant: {sample_brands}

Output format — JSON array of objects, each with:
- `prompt`: the query text
- `type`: one of `buying`, `comparison`, `specific_need`, `informational`, `brand_direct`
- `persona_id`: "{persona_id}"

```json
[
  {{"prompt": "gdzie kupić premium świecę sojową na prezent dla mamy", "type": "buying", "persona_id": "{persona_id}"}},
  ...
]
```

Only output the JSON array. No prose."""


def load_personas(category_slug: str) -> dict:
    """Read the Stage 1 persona file for a category.

    The file is looked up at ``<three directories above this file>/data/
    <category_slug>/personas.json``.

    Args:
        category_slug: Name of the category sub-directory under ``data``.

    Returns:
        The parsed JSON content of ``personas.json``.

    Raises:
        FileNotFoundError: If ``personas.json`` does not exist for the category.
    """
    base_dir = Path(__file__).parent.parent.parent
    personas_file = base_dir / "data" / category_slug / "personas.json"

    if personas_file.exists():
        with personas_file.open("r", encoding="utf-8") as handle:
            return json.load(handle)

    raise FileNotFoundError(
        f"Personas not found: {personas_file}. Run 1_persona_generator.py first."
    )


def load_brand_catalog(category_slug: str) -> dict:
    """Read the brand catalog JSON for a category.

    The file is looked up at ``<three directories above this file>/data/
    <category_slug>/brand_catalog.json``. No existence check is performed, so
    a missing file surfaces as the ``FileNotFoundError`` raised on open.

    Args:
        category_slug: Name of the category sub-directory under ``data``.

    Returns:
        The parsed JSON content of ``brand_catalog.json``.
    """
    catalog_file = (
        Path(__file__).parent.parent.parent
        / "data"
        / category_slug
        / "brand_catalog.json"
    )
    with catalog_file.open("r", encoding="utf-8") as handle:
        catalog = json.load(handle)
    return catalog


def _share_percent(share: float) -> int:
    """Convert a fractional share such as 0.29 into a whole-number percentage."""
    # round() rather than int(): binary floats make 0.29 * 100 evaluate to
    # 28.999999999999996, which int() would truncate to 28.
    return round(share * 100)


def _strip_code_fence(reply: str) -> str:
    """Remove a surrounding Markdown code fence (with or without a tag) from a reply."""
    text = reply.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag directly after the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _parse_json_reply(reply: str):
    """Parse a model reply as JSON, tolerating fences and prose around the array."""
    cleaned = _strip_code_fence(reply)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost [...] span, which recovers replies that
        # wrap the array in a sentence of preamble or trailing commentary.
        start, end = cleaned.find("["), cleaned.rfind("]")
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start : end + 1])


def brainstorm_for_persona(
    client: Anthropic,
    persona: dict,
    category_display_name: str,
    sample_brands: str,
    prompts_per_persona: int,
) -> list[dict]:
    """Generate prompts for one persona.

    Fills ``BRAINSTORM_PROMPT`` with the persona's fields and the type shares
    from ``CONFIG.type_distribution``, sends it as a single user message to
    ``CONFIG.brainstormer_model``, and parses the reply as a JSON array.

    Args:
        client: Anthropic client used for the ``messages.create`` call.
        persona: Persona record; the keys ``label``, ``id``, ``demographics``,
            ``pain_points``, ``decision_factors_ranked`` and ``vocabulary``
            are read.
        category_display_name: Category name inserted into the prompt.
        sample_brands: Brand names, already joined into one string, that the
            model is told it may use.
        prompts_per_persona: Number of prompts requested from the model.

    Returns:
        The parsed JSON array from the model's reply.

    Raises:
        KeyError: If ``persona`` or the type distribution lacks a required key.
        json.JSONDecodeError: If no JSON could be parsed from the reply.
        ValueError: If the reply parses to something other than a JSON array.
    """
    distribution = CONFIG.type_distribution
    prompt = BRAINSTORM_PROMPT.format(
        persona_label=persona["label"],
        persona_id=persona["id"],
        demographics=json.dumps(persona["demographics"], ensure_ascii=False),
        pain_points="; ".join(persona["pain_points"]),
        decision_factors="; ".join(persona["decision_factors_ranked"]),
        vocabulary=json.dumps(persona["vocabulary"], ensure_ascii=False),
        category_display_name=category_display_name,
        prompts_per_persona=prompts_per_persona,
        sample_brands=sample_brands,
        buying_share=_share_percent(distribution["buying"]),
        comparison_share=_share_percent(distribution["comparison"]),
        specific_share=_share_percent(distribution["specific_need"]),
        info_share=_share_percent(distribution["informational"]),
        brand_share=_share_percent(distribution["brand_direct"]),
    )

    response = client.messages.create(
        model=CONFIG.brainstormer_model,
        max_tokens=8000,
        messages=[{"role": "user", "content": prompt}],
    )

    parsed = _parse_json_reply(response.content[0].text)
    # The caller extends a list with this value; a dict would silently
    # contribute only its keys, so reject anything that is not an array.
    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON array of prompts for persona {persona['id']!r}, "
            f"got {type(parsed).__name__}."
        )
    return parsed


def brainstorm_all(category_slug: str, category_display_name: str) -> dict:
    """Run the brainstormer for every persona of a category.

    Loads the personas and brand catalog for ``category_slug``, builds the
    sample-brand string from the names of the first eight catalog brands,
    creates an Anthropic client from the ``ANTHROPIC_API_KEY`` environment
    variable, and collects the prompts returned for each persona in order.

    Args:
        category_slug: Name of the category sub-directory under ``data``.
        category_display_name: Category name inserted into the model prompt.

    Returns:
        A dict with the category slug, the total number of raw prompts, the
        number of personas processed, the configured prompts-per-persona and
        target type distribution, and the list of raw prompts.
    """
    personas_payload = load_personas(category_slug)
    brand_catalog = load_brand_catalog(category_slug)
    leading_brands = brand_catalog["brands"][:8]
    sample_brands = ", ".join(entry["name"] for entry in leading_brands)

    anthropic_client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    collected: list[dict] = []
    for current in personas_payload["personas"]:
        print(f"[Stage 2] Brainstorming for {current['label']}...")
        collected += brainstorm_for_persona(
            client=anthropic_client,
            persona=current,
            category_display_name=category_display_name,
            sample_brands=sample_brands,
            prompts_per_persona=CONFIG.prompts_per_persona,
        )

    summary = {
        "category": category_slug,
        "total_raw_prompts": len(collected),
        "personas_processed": len(personas_payload["personas"]),
        "prompts_per_persona": CONFIG.prompts_per_persona,
        "type_distribution_target": CONFIG.type_distribution,
        "raw_prompts": collected,
    }
    return summary


def save_raw_prompts(category_slug: str, data: dict) -> Path:
    """Write the brainstormed prompts to the category's ``raw_prompts.json``.

    The target is ``<three directories above this file>/data/<category_slug>/
    raw_prompts.json``; the directory is created if it does not exist. The
    JSON is written as UTF-8 with non-ASCII characters kept as-is and a
    two-space indent.

    Args:
        category_slug: Name of the category sub-directory under ``data``.
        data: JSON-serialisable payload to write.

    Returns:
        Path of the file that was written.
    """
    destination = (
        Path(__file__).parent.parent.parent
        / "data"
        / category_slug
        / "raw_prompts.json"
    )
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)
    return destination


def main():
    """Command-line entry point for Stage 2.

    Requires ``--category`` and ``--display-name``, brainstorms prompts for
    that category, saves them, and prints the output path together with the
    target counts reported by ``get_target_counts()``.
    """
    cli = argparse.ArgumentParser(description="Brainstorm raw prompts for category.")
    for flag in ("--category", "--display-name"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    category = options.category
    print(f"[Stage 2] Brainstorming prompts for {category}...")
    result = brainstorm_all(category, options.display_name)
    saved_to = save_raw_prompts(category, result)
    print(f"[Stage 2] ✅ Saved {result['total_raw_prompts']} raw prompts to {saved_to}")
    print(f"[Stage 2] Target distribution: {get_target_counts()}")


if __name__ == "__main__":
    main()