DATA — Public reference datasets for methodology:
- data/README.md: schema + format definitions for brand catalogs
- data/swiece-sojowe-pl/brand_catalog.json: 35 tracked brands (33 manufacturers + 2 importers) + 5 excluded marketplaces/resellers
- data/swiece-sojowe-pl/brand_catalog.md: human-readable companion
- data/swiece-sojowe-pl/market_metadata.json: GMV estimate, personas, seasonality, expected dynamics
TOOLS — 6-stage prompt curation pipeline (Python 3.12+):
- tools/prompt_curation/README.md: process documentation + cost estimates
- tools/prompt_curation/config.py: tunable parameters per stage
- tools/prompt_curation/.env.example: required API keys template
- tools/prompt_curation/requirements.txt: dependencies
- tools/prompt_curation/1_persona_generator.py: Claude generates 7 buyer personas
- tools/prompt_curation/2_prompt_brainstormer.py: 30 prompts per persona, written in that persona's voice
- tools/prompt_curation/3_reality_checker.py: Google Trends + Reddit cross-check
- tools/prompt_curation/4_validation_agents.py: 3 critic agents async (real_buyer/methodology/exploit_hunter)
- tools/prompt_curation/5_pilot_test_runner.py: sample × 3 LLM models pre-flight
- tools/prompt_curation/6_human_review_export.py: CSV export for founder approval
- tools/prompt_curation/7_finalize.py: post-approval → closed prompts/{cat}/v{N}.json
- tools/prompt_curation/pipeline.py: orchestrator (stages 1–6, then human review, then 7)
GITIGNORE — Fixed .env.* exclusion to allow .env.example.
This commit completes Faza 1. Stage outputs (data/{cat}/personas.json,
raw_prompts.json, validated_prompts.json, critic_review.json, pilot_test_results.json,
for_human_review.csv) are runtime artifacts — public when committed, derived from
public methodology + public brand catalog. Final approved prompt strings in
prompts/{cat}/v{N}.json remain CLOSED (gitignored, anti-Goodhart's Law).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
158 lines
5.4 KiB
Python
158 lines
5.4 KiB
Python
"""Stage 7 — Finalize.
|
|
|
|
Read the human-decided CSV (`for_human_review_decided.csv`) and produce the
|
|
final closed prompt pool: `prompts/{category}/v{N}.json` (gitignored).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from config import CONFIG, get_target_counts
|
|
|
|
|
|
def load_decided_csv(category_slug: str) -> list[dict]:
    """Load every row of the human-decided review CSV for one category.

    The file is expected at ``data/<category_slug>/for_human_review_decided.csv``,
    resolved against the directory two levels above the directory that
    contains this module.

    Args:
        category_slug: Name of the category directory under ``data/``.

    Returns:
        One dict per data row, keyed by the column names of the header row.

    Raises:
        FileNotFoundError: If the decided CSV does not exist.
    """
    base_dir = Path(__file__).parent.parent.parent
    csv_path = base_dir / "data" / category_slug / "for_human_review_decided.csv"

    if not csv_path.exists():
        raise FileNotFoundError(
            f"Decided CSV not found: {csv_path}. "
            f"Did you fill out 'for_human_review.csv' and rename to 'for_human_review_decided.csv'?"
        )

    # "utf-8-sig" drops a leading byte-order mark when one is present; fields
    # are separated by semicolons rather than commas.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as handle:
        return list(csv.DictReader(handle, delimiter=";"))
|
|
|
|
|
|
def determine_next_version(category_slug: str) -> str:
    """Find the next version label (v1, v2, ...) for a category's closed prompts.

    Looks in ``prompts/<category_slug>/`` (resolved against the directory two
    levels above the directory containing this module) for files named
    ``v<digits>.json`` and returns one more than the highest number found.

    Side effect: the prompts directory is created when it does not exist yet.

    Args:
        category_slug: Name of the category directory under ``prompts/``.

    Returns:
        ``"v1"`` when the directory is new or holds no versioned files,
        otherwise ``"v<max + 1>"``.
    """
    prompts_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug
    if not prompts_dir.exists():
        prompts_dir.mkdir(parents=True, exist_ok=True)
        return "v1"

    existing_versions = []
    for candidate in prompts_dir.glob("v*.json"):
        stem = candidate.stem
        digits = stem[1:]
        # Only "v" + plain ASCII digits counts as a version file. Stripping
        # every "v" and trusting int() would misread stray files such as
        # "v1v9.json" (19), "v1_00.json" (100) or "v-1.json" (-1) as versions.
        if stem.startswith("v") and digits.isascii() and digits.isdigit():
            existing_versions.append(int(digits))

    return f"v{max(existing_versions, default=0) + 1}"
|
|
|
|
|
|
def finalize(category_slug: str) -> tuple[Path, dict]:
    """Build the final prompt pool from the human-decided CSV and write it to disk.

    Each CSV row is handled according to its ``decision`` column
    (case-insensitive, surrounding whitespace ignored):

    * ``REJECT``  - counted and dropped.
    * ``EDIT``    - kept with the ``edited_prompt`` text; the original text is
      stored under ``edited_from``. Skipped with a warning if the edited text
      is empty.
    * ``APPROVE`` - kept as-is.
    * empty       - treated as not yet decided and skipped silently.
    * other text  - skipped with a warning, so a typo does not silently drop
      a prompt.

    The approved prompts are counted per ``type`` and compared with the
    targets from ``get_target_counts()``; a warning is recorded for any type
    below 70% or above 130% of its target. The pool is written as JSON to
    ``prompts/<category_slug>/<version>.json``.

    Args:
        category_slug: Category directory name under ``data/`` and ``prompts/``.

    Returns:
        A ``(output_file, final_pool)`` tuple: the path that was written and
        the dict that was serialized into it.

    Raises:
        FileNotFoundError: Propagated from ``load_decided_csv`` when the
            decided CSV is missing.
        KeyError: If a kept row lacks the ``type`` column (or ``prompt`` for
            an approved row).
    """
    rows = load_decided_csv(category_slug)

    approved_prompts = []
    rejected_count = 0
    edited_count = 0

    for row in rows:
        decision = (row.get("decision") or "").strip().upper()
        if decision == "REJECT":
            rejected_count += 1
        elif decision == "EDIT":
            edited_text = (row.get("edited_prompt") or "").strip()
            if not edited_text:
                print(f"[Stage 7] ⚠ Row {row.get('row_id')} marked EDIT but no edited_prompt — skipping")
                continue
            approved_prompts.append({
                "prompt": edited_text,
                "type": row["type"],
                "persona_id": row.get("persona_id", ""),
                "edited_from": row.get("prompt", ""),
            })
            edited_count += 1
        elif decision == "APPROVE":
            approved_prompts.append({
                "prompt": row["prompt"],
                "type": row["type"],
                "persona_id": row.get("persona_id", ""),
            })
        elif decision:
            # A non-empty value that is none of the known decisions is most
            # likely a typo; surface it instead of dropping the row silently.
            print(f"[Stage 7] ⚠ Row {row.get('row_id')} has unrecognized decision {decision!r} — skipping")
        # Empty decision = treat as not yet decided, skip

    # Distribution check
    by_type: dict[str, int] = {}
    for p in approved_prompts:
        by_type[p["type"]] = by_type.get(p["type"], 0) + 1

    target_counts = get_target_counts()

    distribution_warnings = []
    for ptype, target in target_counts.items():
        actual = by_type.get(ptype, 0)
        # Tolerate a +/-30% deviation from the target before warning.
        if actual < target * 0.7:
            distribution_warnings.append(f"{ptype}: only {actual} approved (target {target})")
        elif actual > target * 1.3:
            distribution_warnings.append(f"{ptype}: too many — {actual} approved (target {target})")

    version = determine_next_version(category_slug)
    output_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{version}.json"

    final_pool = {
        "category": category_slug,
        "version": version,
        "finalized_at": datetime.now(timezone.utc).isoformat(),
        "total_prompts": len(approved_prompts),
        "distribution": by_type,
        "target_distribution": target_counts,
        "rejected_count": rejected_count,
        "edited_count": edited_count,
        "distribution_warnings": distribution_warnings,
        "prompts": approved_prompts,
    }

    # ensure_ascii=False keeps non-ASCII prompt text readable in the file.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_pool, f, ensure_ascii=False, indent=2)

    return output_file, final_pool
|
|
|
|
|
|
def main():
    """Command-line entry point: finalize one category and print a summary.

    Parses the required ``--category`` argument, runs ``finalize`` for it and
    prints the saved path, the approval/rejection/edit counts and the per-type
    distribution. Then prints either the distribution warnings or a
    confirmation that the pool is ready.
    """
    cli = argparse.ArgumentParser(description="Finalize prompt pool from human-decided CSV.")
    cli.add_argument("--category", required=True)
    category = cli.parse_args().category

    print(f"[Stage 7] Finalizing prompt pool for {category}...")
    output_file, pool = finalize(category)

    print(f"[Stage 7] ✅ Saved {output_file}")
    print(f"[Stage 7] Total approved: {pool['total_prompts']}")
    print(f"[Stage 7] Rejected: {pool['rejected_count']}, Edited: {pool['edited_count']}")
    print(f"[Stage 7] Distribution: {pool['distribution']}")

    # Both outcomes start with a blank separator line.
    print()

    problems = pool["distribution_warnings"]
    if not problems:
        print("✅ Distribution looks good! Final pool ready at:")
        print(f" {output_file}")
        print()
        print("This file is in `prompts/` directory which is gitignored — exact strings remain CLOSED.")
        # NOTE(review): "{category}" is printed literally here (plain string,
        # not an f-string) — confirm that a generic placeholder is intended.
        print("Public reference (illustrative examples) lives in `prompts/example-{category}.md`.")
        return

    print("⚠ Distribution warnings:")
    for problem in problems:
        print(f" - {problem}")
    print()
    print("Consider re-running Stage 6 to add/remove prompts to fix distribution.")


if __name__ == "__main__":
    main()
|