citee-methodology/tools/prompt_curation/5_pilot_test_runner.py
Jacek Kubas 03a397343e Faza 1: brand catalog (świece sojowe PL) + prompt curation pipeline
DATA — Public reference datasets for methodology:
- data/README.md: schema + format definitions for brand catalogs
- data/swiece-sojowe-pl/brand_catalog.json: 35 tracked brands (33 manufacturers + 2 importers) + 5 excluded marketplaces/resellers
- data/swiece-sojowe-pl/brand_catalog.md: human-readable companion
- data/swiece-sojowe-pl/market_metadata.json: GMV estimate, personas, seasonality, expected dynamics

TOOLS — 6-stage prompt curation pipeline (Python 3.12+):
- tools/prompt_curation/README.md: process documentation + cost estimates
- tools/prompt_curation/config.py: tunable parameters per stage
- tools/prompt_curation/.env.example: required API keys template
- tools/prompt_curation/requirements.txt: dependencies
- tools/prompt_curation/1_persona_generator.py: Claude generates 7 buyer personas
- tools/prompt_curation/2_prompt_brainstormer.py: per persona × 30 prompts in voice
- tools/prompt_curation/3_reality_checker.py: Google Trends + Reddit cross-check
- tools/prompt_curation/4_validation_agents.py: 3 critic agents async (real_buyer/methodology/exploit_hunter)
- tools/prompt_curation/5_pilot_test_runner.py: sample × 3 LLM models pre-flight
- tools/prompt_curation/6_human_review_export.py: CSV export for founder approval
- tools/prompt_curation/7_finalize.py: post-approval → closed prompts/{cat}/v{N}.json
- tools/prompt_curation/pipeline.py: orchestrator (stages 1–6, then human review, then 7)

GITIGNORE — Fixed .env.* exclusion to allow .env.example.

This commit completes Faza 1. Stage outputs (data/{cat}/personas.json,
raw_prompts.json, validated_prompts.json, critic_review.json, pilot_test_results.json,
for_human_review.csv) are runtime artifacts — public when committed, derived from
public methodology + public brand catalog. Final approved prompt strings in
prompts/{cat}/v{N}.json remain CLOSED (gitignored, anti-Goodhart's Law).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:40:12 +02:00

219 lines
8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Stage 5 — Pilot Test Runner.
Run a sample of N prompts (default 10) through 3 LLM models in parallel.
Reject prompts that:
- Get "I don't know" / "depends on preferences" responses
- Produce zero overlap across models (incoherent)
- Get misinterpreted (LLM answers about wrong topic)
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import random
from pathlib import Path
import httpx
from anthropic import AsyncAnthropic
from openai import AsyncOpenAI
from config import CONFIG
async def query_chatgpt(client: AsyncOpenAI, prompt: str) -> dict:
    """Send a single prompt to the GPT-4o web-search model.

    Args:
        client: Async OpenAI client used to issue the chat completion.
        prompt: User message sent as the only turn of the conversation.

    Returns:
        A dict tagged ``"model": "chatgpt"``. On success it holds the reply
        text under ``"response"`` with ``"ok": True``. Any exception is caught
        and reported as ``"ok": False`` with ``"response": None`` and the
        exception message under ``"error"``.
    """
    request = {
        "model": "gpt-4o-search-preview",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 600,
    }
    try:
        completion = await client.chat.completions.create(**request)
        answer = completion.choices[0].message.content
    except Exception as failure:
        # Best-effort by design: one failing model must not abort the pilot
        # run, so the failure is recorded in the result instead of raised.
        return {"model": "chatgpt", "response": None, "ok": False, "error": str(failure)}
    return {"model": "chatgpt", "response": answer, "ok": True}
async def query_perplexity(prompt: str) -> dict:
    """Query Perplexity Sonar Pro over its REST chat-completions endpoint.

    The API key is read from the ``PERPLEXITY_API_KEY`` environment variable.

    Args:
        prompt: User message sent as the only turn of the conversation.

    Returns:
        A dict tagged ``"model": "perplexity"``. On success it holds the reply
        text under ``"response"`` with ``"ok": True``. Any exception (missing
        API key, network error, non-2xx status, unexpected payload shape) is
        caught and reported as ``"ok": False`` with ``"response": None`` and
        the exception message under ``"error"``.
    """
    try:
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(
                "https://api.perplexity.ai/chat/completions",
                headers={
                    "Authorization": f"Bearer {os.environ['PERPLEXITY_API_KEY']}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "sonar-pro",
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 600,
                },
            )
            # Fail on 4xx/5xx here so the recorded error names the HTTP status
            # instead of a bare KeyError('choices') from parsing an error body.
            response.raise_for_status()
            data = response.json()
            text = data["choices"][0]["message"]["content"]
            return {"model": "perplexity", "response": text, "ok": True}
    except Exception as exc:
        # Best-effort by design: the failure is recorded, not raised, so the
        # other models' results for the same prompt are still collected.
        return {"model": "perplexity", "response": None, "ok": False, "error": str(exc)}
async def query_gemini(prompt: str) -> dict:
    """Query Gemini Pro through the ``generateContent`` REST endpoint.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable and
    sent in the ``x-goog-api-key`` header rather than the URL query string, so
    it cannot end up in request logs or in URL-bearing error messages that
    are later written to the results file.

    Args:
        prompt: Text sent as the single content part of the request.

    Returns:
        A dict tagged ``"model": "gemini"``. On success it holds the reply
        text under ``"response"`` with ``"ok": True``. Any exception (missing
        API key, network error, non-2xx status, unexpected payload shape) is
        caught and reported as ``"ok": False`` with ``"response": None`` and
        the exception message under ``"error"``.
    """
    try:
        # Using direct REST call (google-generativeai SDK is verbose)
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(
                "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent",
                headers={"x-goog-api-key": os.environ["GOOGLE_API_KEY"]},
                json={
                    "contents": [{"parts": [{"text": prompt}]}],
                    "generationConfig": {"maxOutputTokens": 600},
                },
            )
            # Fail on 4xx/5xx here so the recorded error names the HTTP status
            # instead of a bare KeyError('candidates') from an error body.
            response.raise_for_status()
            data = response.json()
            text = data["candidates"][0]["content"]["parts"][0]["text"]
            return {"model": "gemini", "response": text, "ok": True}
    except Exception as exc:
        # Best-effort by design: the failure is recorded, not raised, so the
        # other models' results for the same prompt are still collected.
        return {"model": "gemini", "response": None, "ok": False, "error": str(exc)}
async def query_all_models(prompt_obj: dict, openai_client: AsyncOpenAI) -> dict:
    """Fan one prompt out to ChatGPT, Perplexity and Gemini concurrently.

    Args:
        prompt_obj: Prompt record; ``"prompt"`` and ``"type"`` are required,
            ``"persona_id"`` is optional.
        openai_client: Async OpenAI client passed through to the ChatGPT query.

    Returns:
        A dict echoing the prompt text, type and persona id, plus
        ``"model_responses"`` mapping each model tag to its result dict.
    """
    question = prompt_obj["prompt"]
    answers = await asyncio.gather(
        query_chatgpt(openai_client, question),
        query_perplexity(question),
        query_gemini(question),
    )

    # Index results by the model tag each query function stamps on its output.
    by_model = {}
    for answer in answers:
        by_model[answer["model"]] = answer

    return {
        "prompt": question,
        "type": prompt_obj["type"],
        "persona_id": prompt_obj.get("persona_id"),
        "model_responses": by_model,
    }
def evaluate_response_quality(test_result: dict, brand_catalog: dict) -> dict:
    """Detect bad signals in the model responses for one prompt.

    Signals checked:
    - fewer than two models answered successfully;
    - a short (< 200 chars) answer containing an "I don't know" phrase;
    - an answer mentioning no catalog brand (name or alias, matched as a
      case-insensitive substring);
    - successful answers that mention brands but share none across models.

    Side effect: every successful entry in ``test_result["model_responses"]``
    gets a ``"brands_detected"`` list holding at most 10 detected brand names,
    lowercased and alphabetically ordered.

    Args:
        test_result: Dict with ``"model_responses"`` mapping model name to a
            result dict (``"ok"``, ``"response"``).
        brand_catalog: Dict with ``"brands"``, each having ``"name"`` and
            optionally ``"aliases"``.

    Returns:
        ``{"bad_signals": [...], "verdict": ...}`` where the verdict is
        ``"reject"`` for two or more signals, ``"flag"`` for exactly one and
        ``"pass"`` for none.
    """
    # Sorted so detection order, and therefore the stored top-10 slice, is
    # reproducible between runs. Empty names/aliases are dropped because ""
    # is a substring of every response and would count as a brand mention.
    brand_names = sorted(
        {
            name.lower()
            for brand in brand_catalog["brands"]
            for name in (brand["name"], *(brand.get("aliases") or []))
            if name
        }
    )
    # "I don't know" patterns (Polish); only counted in short answers.
    idk_patterns = ("nie wiem", "trudno powiedzieć", "to zależy od", "preferencje")

    responses = test_result["model_responses"]
    bad_signals = []

    successful = [r for r in responses.values() if r.get("ok")]
    if len(successful) < 2:
        bad_signals.append("less_than_2_models_responded")

    brands_per_model: dict[str, set[str]] = {}
    for model_name, r in responses.items():
        if not r.get("ok"):
            continue
        text = (r.get("response") or "").lower()

        if len(text) < 200 and any(pattern in text for pattern in idk_patterns):
            bad_signals.append(f"{model_name}_uncertain_short")

        brands_mentioned = [b for b in brand_names if b in text]
        # Only the first 10 are stored for the report; the overlap check
        # below uses the full set so truncation cannot fake a zero overlap.
        r["brands_detected"] = brands_mentioned[:10]
        brands_per_model[model_name] = set(brands_mentioned)
        if not brands_mentioned:
            bad_signals.append(f"{model_name}_no_brands_mentioned")

    # Cross-model overlap: flagged only when at least one model named a brand
    # and no brand is common to every successful model.
    if len(brands_per_model) >= 2:
        overlap = set.intersection(*brands_per_model.values())
        if not overlap and any(brands_per_model.values()):
            bad_signals.append("zero_brand_overlap_across_models")

    if len(bad_signals) >= 2:
        verdict = "reject"
    elif bad_signals:
        verdict = "flag"
    else:
        verdict = "pass"
    return {"bad_signals": bad_signals, "verdict": verdict}
async def run_pilot_test(category_slug: str) -> dict:
    """Pilot-test a stratified sample of kept prompts against all 3 models.

    Reads ``critic_review.json`` and ``brand_catalog.json`` from
    ``data/<category_slug>/`` (three directory levels above this file), samples
    up to ``CONFIG.pilot_sample_size`` prompts across prompt types, queries the
    models one prompt at a time and evaluates each set of responses.

    Args:
        category_slug: Name of the category directory under ``data/``.

    Returns:
        A dict with ``"category"``, ``"sample_size"``, a ``"summary"`` of
        pass/flag/reject counts and the per-prompt ``"pilot_results"``.

    Raises:
        FileNotFoundError: If ``critic_review.json`` (stage 4 output) or the
            brand catalog is missing.
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    data_dir = Path(__file__).parent.parent.parent / "data" / category_slug
    critic_file = data_dir / "critic_review.json"
    if not critic_file.exists():
        raise FileNotFoundError(f"Run 4_validation_agents.py first. Missing: {critic_file}")
    with open(critic_file, "r", encoding="utf-8") as f:
        critic_data = json.load(f)
    catalog_file = data_dir / "brand_catalog.json"
    with open(catalog_file, "r", encoding="utf-8") as f:
        brand_catalog = json.load(f)

    candidates = critic_data["kept_prompts"]
    sample_size = min(CONFIG.pilot_sample_size, len(candidates))

    # Stratified sample across types
    by_type: dict[str, list[dict]] = {}
    for p in candidates:
        by_type.setdefault(p["type"], []).append(p)
    sampled = []
    for ptype, prompts in by_type.items():
        # Every type gets at least one slot; types missing from the configured
        # distribution fall back to a 20% share.
        n = max(1, int(round(sample_size * CONFIG.type_distribution.get(ptype, 0.2))))
        sampled.extend(random.sample(prompts, min(n, len(prompts))))
    # Trim to sample_size
    # NOTE(review): trimming cuts from the tail, so when the per-type quotas
    # overshoot, the types encountered last lose their slots first.
    sampled = sampled[:sample_size]

    print(f"[Stage 5] Pilot testing {len(sampled)} prompts × 3 models = {len(sampled) * 3} API calls")

    pilot_results = []
    # The client owns an HTTP connection pool; the context manager closes it
    # even if a prompt fails, instead of leaving it to garbage collection.
    async with AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]) as openai_client:
        for i, prompt_obj in enumerate(sampled):
            print(f"[Stage 5] Testing prompt {i+1}/{len(sampled)}: {prompt_obj['prompt'][:50]}...")
            test_result = await query_all_models(prompt_obj, openai_client)
            evaluation = evaluate_response_quality(test_result, brand_catalog)
            test_result["evaluation"] = evaluation
            pilot_results.append(test_result)

    summary = {
        verdict: sum(1 for r in pilot_results if r["evaluation"]["verdict"] == verdict)
        for verdict in ("pass", "flag", "reject")
    }
    return {
        "category": category_slug,
        "sample_size": len(sampled),
        "summary": summary,
        "pilot_results": pilot_results,
    }
def save_pilot_results(category_slug: str, data: dict) -> Path:
    """Write the pilot-test results as pretty-printed UTF-8 JSON.

    The file is ``data/<category_slug>/pilot_test_results.json``, resolved
    three directory levels above this file.

    Args:
        category_slug: Name of the category directory under ``data/``.
        data: JSON-serializable results to persist.

    Returns:
        Path of the file that was written.
    """
    base_dir = Path(__file__).parent.parent.parent
    destination = base_dir / "data" / category_slug / "pilot_test_results.json"
    with destination.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(data, handle, ensure_ascii=False, indent=2)
    return destination
def main():
    """CLI entry point: pilot-test one category and save the results.

    Requires ``--category``; runs the async pilot test to completion, writes
    the results file and prints its path and the verdict summary.
    """
    cli = argparse.ArgumentParser(description="Pilot test sample prompts on 3 models.")
    cli.add_argument("--category", required=True)
    category = cli.parse_args().category

    print(f"[Stage 5] Pilot testing {category}...")
    results = asyncio.run(run_pilot_test(category))
    saved_to = save_pilot_results(category, results)
    print(f"[Stage 5] ✅ Saved {saved_to}")
    print(f"[Stage 5] Summary: {results['summary']}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()