DATA — Public reference datasets for methodology:
- data/README.md: schema + format definitions for brand catalogs
- data/swiece-sojowe-pl/brand_catalog.json: 35 tracked brands (33 manufacturers + 2 importers) + 5 excluded marketplaces/resellers
- data/swiece-sojowe-pl/brand_catalog.md: human-readable companion
- data/swiece-sojowe-pl/market_metadata.json: GMV estimate, personas, seasonality, expected dynamics
TOOLS — prompt curation pipeline: 6 stages plus a post-approval finalize step (Python 3.12+):
- tools/prompt_curation/README.md: process documentation + cost estimates
- tools/prompt_curation/config.py: tunable parameters per stage
- tools/prompt_curation/.env.example: required API keys template
- tools/prompt_curation/requirements.txt: dependencies
- tools/prompt_curation/1_persona_generator.py: Claude generates 7 buyer personas
- tools/prompt_curation/2_prompt_brainstormer.py: 30 prompts per persona, written in that persona's voice
- tools/prompt_curation/3_reality_checker.py: Google Trends + Reddit cross-check
- tools/prompt_curation/4_validation_agents.py: 3 critic agents async (real_buyer/methodology/exploit_hunter)
- tools/prompt_curation/5_pilot_test_runner.py: sample × 3 LLM models pre-flight
- tools/prompt_curation/6_human_review_export.py: CSV export for founder approval
- tools/prompt_curation/7_finalize.py: post-approval → closed prompts/{cat}/v{N}.json
- tools/prompt_curation/pipeline.py: orchestrator (stages 1–6, then human review, then 7)
GITIGNORE — Fixed .env.* exclusion to allow .env.example.
This commit completes Faza 1 (Phase 1). Stage outputs (data/{cat}/personas.json,
raw_prompts.json, validated_prompts.json, critic_review.json, pilot_test_results.json,
for_human_review.csv) are runtime artifacts — public when committed, and derived from the
public methodology and the public brand catalog. The final approved prompt strings in
prompts/{cat}/v{N}.json remain CLOSED (gitignored, to guard against Goodhart's Law).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
172 lines
6.4 KiB
Python
172 lines
6.4 KiB
Python
"""Stage 2 — Prompt Brainstormer.
|
|
|
|
For each persona generated in Stage 1, generate 30 prompts in the voice of that persona.
|
|
Total output: ~200-300 raw prompts per category, distributed across 5 prompt types.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from anthropic import Anthropic
|
|
|
|
from config import CONFIG, get_target_counts
|
|
|
|
# Prompt template sent to the brainstormer model, filled in with str.format().
# Placeholders: persona_label, persona_id, demographics, pain_points,
# decision_factors, vocabulary, category_display_name, prompts_per_persona,
# sample_brands, buying_share, comparison_share, specific_share, info_share,
# brand_share.
# Literal braces in the JSON example are doubled ({{ }}) so that str.format()
# emits them verbatim instead of treating them as placeholders.
# Rule 3 uses {prompts_per_persona} rather than a hard-coded count so the rule
# always agrees with the number of prompts requested at the top of the prompt.
BRAINSTORM_PROMPT = """You are a Polish e-commerce buyer with the following persona:

**{persona_label}**

Demographics: {demographics}
Pain points: {pain_points}
Decision factors: {decision_factors}
Vocabulary style: {vocabulary}

Your task: Generate {prompts_per_persona} realistic queries you would type into ChatGPT, Perplexity, or Gemini when researching purchases in the **{category_display_name}** category.

CRITICAL distribution requirement — your {prompts_per_persona} prompts must be distributed across these 5 types:

| Type | Share | Example pattern |
|---|---|---|
| **buying** (active purchase intent) | {buying_share}% | "gdzie kupić [product] premium na prezent" |
| **comparison** (decision-stage) | {comparison_share}% | "[Brand A] vs [Brand B] który lepszy" |
| **specific_need** (specific use/attribute) | {specific_share}% | "[product] o [attribute] do [use case]" |
| **informational** (research) | {info_share}% | "co to [product] / jak działa [product]" |
| **brand_direct** (direct brand query) | {brand_share}% | "[Brand X] opinie / recenzje" |

Rules:
1. Write in **Polish** as your persona would actually phrase queries — colloquial, possibly with typos, possibly informal.
2. Be **specific** — avoid "best soy candles" generic. Add context: occasion, recipient, attribute, comparison.
3. Distribute across types per the percentages — don't put all {prompts_per_persona} in "buying."
4. Each prompt must be 5-15 words typical. Avoid 1-word ("świeczki") or 30-word essays.
5. Use real brand names from this list when relevant: {sample_brands}

Output format — JSON array of objects, each with:
- `prompt`: the query text
- `type`: one of `buying`, `comparison`, `specific_need`, `informational`, `brand_direct`
- `persona_id`: "{persona_id}"

```json
[
  {{"prompt": "gdzie kupić premium świecę sojową na prezent dla mamy", "type": "buying", "persona_id": "{persona_id}"}},
  ...
]
```

Only output the JSON array. No prose."""
|
|
|
|
|
|
def load_personas(category_slug: str) -> dict:
    """Read the persona file written by Stage 1 for one category.

    The file is looked up at ``data/<category_slug>/personas.json`` under the
    directory three levels above this module.

    Args:
        category_slug: Name of the category directory under ``data``.

    Returns:
        The parsed JSON content of ``personas.json``.

    Raises:
        FileNotFoundError: If ``personas.json`` does not exist for the category.
    """
    base_dir = Path(__file__).parent.parent.parent
    personas_file = base_dir / "data" / category_slug / "personas.json"

    if personas_file.exists():
        with open(personas_file, "r", encoding="utf-8") as handle:
            return json.load(handle)

    # Stage 2 depends on Stage 1 output, so point the user at the missing step.
    raise FileNotFoundError(
        f"Personas not found: {personas_file}. Run 1_persona_generator.py first."
    )
|
|
|
|
|
|
def load_brand_catalog(category_slug: str) -> dict:
    """Read the brand catalog for one category.

    The file is looked up at ``data/<category_slug>/brand_catalog.json`` under
    the directory three levels above this module.

    Args:
        category_slug: Name of the category directory under ``data``.

    Returns:
        The parsed JSON content of ``brand_catalog.json``.

    Raises:
        FileNotFoundError: If ``brand_catalog.json`` does not exist for the
            category.
    """
    data_dir = Path(__file__).parent.parent.parent / "data" / category_slug
    catalog_file = data_dir / "brand_catalog.json"
    # Fail with a descriptive message, matching load_personas, instead of the
    # bare errno text that open() would produce (the exception type is the same).
    if not catalog_file.exists():
        raise FileNotFoundError(f"Brand catalog not found: {catalog_file}.")
    with open(catalog_file, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    # The prompt shows a ```json fence, but a reply may open with a bare ```;
    # accept both so the JSON parser never sees the fence characters.
    for opener in ("```json", "```"):
        if text.startswith(opener):
            text = text[len(opener):]
            break
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def brainstorm_for_persona(
    client: Anthropic,
    persona: dict,
    category_display_name: str,
    sample_brands: str,
    prompts_per_persona: int,
) -> list[dict]:
    """Generate prompts for one persona.

    Fills BRAINSTORM_PROMPT with the persona's fields and the configured type
    distribution, sends it as a single user message, and parses the reply as
    JSON.

    Args:
        client: Anthropic client used to call ``messages.create``.
        persona: Persona record; the keys ``label``, ``id``, ``demographics``,
            ``pain_points``, ``decision_factors_ranked`` and ``vocabulary``
            are read.
        category_display_name: Category name inserted into the prompt.
        sample_brands: Brand names, already joined into one string, offered to
            the model as examples.
        prompts_per_persona: Number of prompts to request.

    Returns:
        The parsed JSON array from the model's reply.

    Raises:
        KeyError: If ``persona`` or the configured distribution lacks an
            expected key.
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``)
            or is valid JSON but not an array.
    """
    distribution = CONFIG.type_distribution

    def percent(prompt_type: str) -> int:
        # Shares are multiplied by 100 for display, so they are presumably
        # fractions. round() is used because int() truncates float artefacts
        # such as 0.29 * 100 == 28.999999999999996 down to 28.
        return round(distribution[prompt_type] * 100)

    prompt = BRAINSTORM_PROMPT.format(
        persona_label=persona["label"],
        persona_id=persona["id"],
        demographics=json.dumps(persona["demographics"], ensure_ascii=False),
        pain_points="; ".join(persona["pain_points"]),
        decision_factors="; ".join(persona["decision_factors_ranked"]),
        vocabulary=json.dumps(persona["vocabulary"], ensure_ascii=False),
        category_display_name=category_display_name,
        prompts_per_persona=prompts_per_persona,
        sample_brands=sample_brands,
        buying_share=percent("buying"),
        comparison_share=percent("comparison"),
        specific_share=percent("specific_need"),
        info_share=percent("informational"),
        brand_share=percent("brand_direct"),
    )

    response = client.messages.create(
        model=CONFIG.brainstormer_model,
        max_tokens=8000,
        messages=[{"role": "user", "content": prompt}],
    )

    prompts = json.loads(_strip_code_fence(response.content[0].text))
    # The caller extends a list with this value; a JSON object would silently
    # contribute its keys instead of prompt records, so reject it here.
    if not isinstance(prompts, list):
        raise ValueError(
            f"Expected a JSON array of prompts for persona {persona['id']!r}, "
            f"got {type(prompts).__name__}."
        )
    return prompts
|
|
|
|
|
|
def brainstorm_all(category_slug: str, category_display_name: str) -> dict:
    """Brainstorm raw prompts for every persona of a category.

    Loads the category's personas and brand catalog, builds a sample-brand
    string from the first eight catalog entries, and calls
    brainstorm_for_persona once per persona using a client created from the
    ``ANTHROPIC_API_KEY`` environment variable.

    Args:
        category_slug: Name of the category directory under ``data``.
        category_display_name: Category name inserted into each prompt.

    Returns:
        A dict with the category slug, the total number of raw prompts, the
        number of personas processed, the configured prompts-per-persona and
        type distribution, and the combined list of raw prompts.

    Raises:
        KeyError: If ``ANTHROPIC_API_KEY`` is not set, or an expected key is
            missing from the loaded data.
    """
    personas_data = load_personas(category_slug)
    catalog = load_brand_catalog(category_slug)

    # Only the first eight brands are offered to the model as examples.
    brand_names = [brand["name"] for brand in catalog["brands"][:8]]
    sample_brands = ", ".join(brand_names)

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    personas = personas_data["personas"]
    collected = []
    for current in personas:
        print(f"[Stage 2] Brainstorming for {current['label']}...")
        collected += brainstorm_for_persona(
            client=client,
            persona=current,
            category_display_name=category_display_name,
            sample_brands=sample_brands,
            prompts_per_persona=CONFIG.prompts_per_persona,
        )

    summary = {
        "category": category_slug,
        "total_raw_prompts": len(collected),
        "personas_processed": len(personas),
        "prompts_per_persona": CONFIG.prompts_per_persona,
        "type_distribution_target": CONFIG.type_distribution,
        "raw_prompts": collected,
    }
    return summary
|
|
|
|
|
|
def save_raw_prompts(category_slug: str, data: dict) -> Path:
    """Write the Stage 2 result to ``data/<category_slug>/raw_prompts.json``.

    The category directory is created if it does not exist yet. The file is
    written as UTF-8 JSON with a two-space indent and non-ASCII characters
    kept as-is.

    Args:
        category_slug: Name of the category directory under ``data``.
        data: JSON-serialisable payload to store.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path(__file__).parent.parent.parent / "data" / category_slug
    target_dir.mkdir(parents=True, exist_ok=True)

    target = target_dir / "raw_prompts.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    return target
|
|
|
|
|
|
def main():
    """Run Stage 2 from the command line.

    Parses the required ``--category`` and ``--display-name`` options,
    brainstorms prompts for that category, saves them, and prints a short
    progress summary.
    """
    cli = argparse.ArgumentParser(description="Brainstorm raw prompts for category.")
    for option in ("--category", "--display-name"):
        cli.add_argument(option, required=True)
    options = cli.parse_args()
    slug = options.category

    print(f"[Stage 2] Brainstorming prompts for {slug}...")
    result = brainstorm_all(slug, options.display_name)
    saved_to = save_raw_prompts(slug, result)

    total = result["total_raw_prompts"]
    print(f"[Stage 2] ✅ Saved {total} raw prompts to {saved_to}")
    print(f"[Stage 2] Target distribution: {get_target_counts()}")


# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|