DATA — Public reference datasets for methodology:
- data/README.md: schema + format definitions for brand catalogs
- data/swiece-sojowe-pl/brand_catalog.json: 35 tracked brands (33 manufacturers + 2 importers) + 5 excluded marketplaces/resellers
- data/swiece-sojowe-pl/brand_catalog.md: human-readable companion
- data/swiece-sojowe-pl/market_metadata.json: GMV estimate, personas, seasonality, expected dynamics
TOOLS — prompt curation pipeline: 6 automated stages plus a post-approval finalize step (Python 3.12+):
- tools/prompt_curation/README.md: process documentation + cost estimates
- tools/prompt_curation/config.py: tunable parameters per stage
- tools/prompt_curation/.env.example: required API keys template
- tools/prompt_curation/requirements.txt: dependencies
- tools/prompt_curation/1_persona_generator.py: Claude generates 7 buyer personas
- tools/prompt_curation/2_prompt_brainstormer.py: generates 30 prompts per persona, written in that persona's voice
- tools/prompt_curation/3_reality_checker.py: Google Trends + Reddit cross-check
- tools/prompt_curation/4_validation_agents.py: 3 critic agents async (real_buyer/methodology/exploit_hunter)
- tools/prompt_curation/5_pilot_test_runner.py: sample × 3 LLM models pre-flight
- tools/prompt_curation/6_human_review_export.py: CSV export for founder approval
- tools/prompt_curation/7_finalize.py: post-approval → closed prompts/{cat}/v{N}.json
- tools/prompt_curation/pipeline.py: orchestrator (stages 1–6, then human review, then 7)
GITIGNORE — Fixed .env.* exclusion to allow .env.example.
This commit completes Faza 1 (Phase 1). Stage outputs (data/{cat}/personas.json,
raw_prompts.json, validated_prompts.json, critic_review.json, pilot_test_results.json,
for_human_review.csv) are runtime artifacts — public when committed, derived from
public methodology + public brand catalog. Final approved prompt strings in
prompts/{cat}/v{N}.json remain CLOSED (gitignored, anti-Goodhart's Law).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
194 lines
6.6 KiB
Python
194 lines
6.6 KiB
Python
"""Stage 3 — Reality Checker.
|
|
|
|
Cross-reference raw prompts against real-world signals:
|
|
- Google Trends (PL, past 12 months)
|
|
- Reddit search (PL niche subreddits)
|
|
- Quora PL questions (listed as a signal source, but no Quora check is implemented in this module)
|
|
|
|
Prompts with zero/marginal real-world signal are flagged for removal.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from config import CONFIG
|
|
|
|
try:
|
|
from pytrends.request import TrendReq
|
|
HAS_PYTRENDS = True
|
|
except ImportError:
|
|
HAS_PYTRENDS = False
|
|
|
|
try:
|
|
import praw
|
|
HAS_PRAW = True
|
|
except ImportError:
|
|
HAS_PRAW = False
|
|
|
|
|
|
def load_raw_prompts(category_slug: str) -> dict:
    """Load the raw prompts JSON for one category.

    The file is looked up at ``<three levels above this file>/data/
    <category_slug>/raw_prompts.json``.

    Args:
        category_slug: Name of the category directory under ``data``.

    Returns:
        The parsed JSON content of ``raw_prompts.json``.

    Raises:
        FileNotFoundError: If ``raw_prompts.json`` does not exist for the
            category (the message points at the stage that produces it).
    """
    base_dir = Path(__file__).parent.parent.parent
    raw_file = base_dir / "data" / category_slug / "raw_prompts.json"

    if not raw_file.exists():
        message = f"Raw prompts not found: {raw_file}. Run 2_prompt_brainstormer.py first."
        raise FileNotFoundError(message)

    with raw_file.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
|
|
def check_google_trends(prompt: str, pytrends_client) -> dict:
    """Look up a shortened form of *prompt* on Google Trends (geo ``PL``).

    Args:
        prompt: Full prompt text; only its first four whitespace-separated
            words are sent as the keyword.
        pytrends_client: Object exposing ``build_payload`` and
            ``interest_over_time``; a falsy value skips the check.

    Returns:
        A dict whose ``signal`` is one of ``"skipped"``, ``"none"``,
        ``"marginal"``, ``"present"`` or ``"error"``. ``volume_estimate``
        holds the mean interest over the queried period (``0`` for no data,
        ``None`` when skipped or failed).
    """
    if not pytrends_client:
        return {"signal": "skipped", "volume_estimate": None, "reason": "pytrends not available"}

    # Keep the query short: only the leading four words of the prompt are
    # used (Trends has a 100 char limit, and simpler keywords work better).
    leading_words = prompt.split()[:4]
    keyword = " ".join(leading_words)

    try:
        pytrends_client.build_payload(
            kw_list=[keyword],
            cat=0,
            timeframe="today 12-m",
            geo="PL",
        )
        frame = pytrends_client.interest_over_time()

        has_series = not frame.empty and keyword in frame.columns
        if not has_series:
            return {"signal": "none", "volume_estimate": 0, "keyword_tested": keyword}

        mean_interest = frame[keyword].mean()
        if mean_interest > 1:
            label = "present"
        else:
            label = "marginal"
        return {
            "signal": label,
            "volume_estimate": float(mean_interest),
            "keyword_tested": keyword,
        }
    except Exception as exc:
        # Best-effort check: any client failure is reported in the result
        # instead of aborting the caller.
        return {
            "signal": "error",
            "volume_estimate": None,
            "error": str(exc),
            "keyword_tested": keyword,
        }
|
|
|
|
|
|
def check_reddit(prompt: str, reddit_client, subreddits: list[str]) -> dict:
    """Search the given subreddits for discussions matching *prompt*.

    Args:
        prompt: Full prompt text; only its first five whitespace-separated
            words are used as the search keyword.
        reddit_client: Object exposing ``subreddit(name).search(...)``;
            a falsy value skips the check.
        subreddits: Subreddit names to search (past year, up to five hits
            requested per subreddit).

    Returns:
        A dict whose ``signal`` is one of ``"skipped"``, ``"none"``,
        ``"marginal"``, ``"present"`` or ``"error"``. ``mention_count`` is
        the total number of hits across subreddits (``None`` when skipped
        or failed). Every non-skipped result carries ``keyword_tested``.
    """
    if not reddit_client:
        return {"signal": "skipped", "mention_count": None, "reason": "praw not available"}

    keyword = " ".join(prompt.split()[:5])

    try:
        total = 0
        examples = []
        for subreddit_name in subreddits:
            subreddit = reddit_client.subreddit(subreddit_name)
            results = list(subreddit.search(keyword, limit=5, time_filter="year"))
            total += len(results)
            # Keep at most two sample posts per subreddit.
            examples.extend(
                {"subreddit": subreddit_name, "title": r.title, "score": r.score}
                for r in results[:2]
            )

        if total >= CONFIG.reddit_min_organic_mentions:
            signal = "present"
        elif total > 0:
            signal = "marginal"
        else:
            signal = "none"

        return {
            "signal": signal,
            "mention_count": total,
            "keyword_tested": keyword,
            "examples": examples[:3],
        }
    except Exception as exc:
        # Best-effort check: report the failure instead of aborting the
        # caller. The keyword is included so a failed lookup can be traced
        # to its query, as in the Google Trends error result.
        return {
            "signal": "error",
            "mention_count": None,
            "error": str(exc),
            "keyword_tested": keyword,
        }
|
|
|
|
|
|
def aggregate_signals(prompt_obj: dict) -> str:
    """Reduce the Trends and Reddit signals of a prompt to one verdict.

    Args:
        prompt_obj: Prompt dict that may hold ``google_trends_check`` and
            ``reddit_check`` sub-dicts, each with a ``signal`` entry. A
            missing check or missing ``signal`` counts as ``"skipped"``.

    Returns:
        ``"pass"`` if either source reports ``"present"``; otherwise
        ``"marginal"`` if either reports ``"marginal"``; ``"fail"`` only
        when both report ``"none"``; ``"marginal"`` in every remaining
        case (skipped or error states).
    """
    observed = [
        prompt_obj.get(check_key, {}).get("signal", "skipped")
        for check_key in ("google_trends_check", "reddit_check")
    ]

    if "present" in observed:
        return "pass"
    if "marginal" in observed:
        return "marginal"
    if all(value == "none" for value in observed):
        return "fail"
    # Skipped/error combinations carry no evidence either way.
    return "marginal"
|
|
|
|
|
|
def _build_pytrends_client():
    """Return a ``TrendReq`` client, or ``None`` if pytrends is missing or fails to initialise."""
    if not HAS_PYTRENDS:
        return None
    try:
        return TrendReq(hl="pl-PL", tz=120)
    except Exception as exc:
        # Best-effort: a failed init downgrades the Trends check to "skipped".
        print(f"[Stage 3] ⚠ pytrends init failed: {exc}")
        return None


def _build_reddit_client():
    """Return a ``praw.Reddit`` client, or ``None`` if praw or REDDIT_CLIENT_ID is missing or setup fails."""
    if not (HAS_PRAW and os.environ.get("REDDIT_CLIENT_ID")):
        return None
    try:
        return praw.Reddit(
            client_id=os.environ["REDDIT_CLIENT_ID"],
            client_secret=os.environ["REDDIT_CLIENT_SECRET"],
            user_agent=os.environ.get("REDDIT_USER_AGENT", "citee-methodology/1.0"),
        )
    except Exception as exc:
        # Best-effort: e.g. a missing REDDIT_CLIENT_SECRET lands here and
        # downgrades the Reddit check to "skipped".
        print(f"[Stage 3] ⚠ Reddit auth failed: {exc}")
        return None


def check_all_prompts(category_slug: str, subreddits: list[str] | None = None) -> dict:
    """Run the Trends and Reddit checks over every raw prompt of a category.

    Each prompt dict loaded from ``raw_prompts.json`` is annotated in place
    with ``google_trends_check``, ``reddit_check`` and ``reality_signal``.

    Args:
        category_slug: Category whose raw prompts are loaded.
        subreddits: Subreddit names to search. Defaults to the built-in
            Polish list when ``None``; pass a category-specific list to
            override it.

    Returns:
        A dict with ``category``, ``total_checked``, a ``summary`` of
        pass/marginal/fail counts, and the annotated ``validated_prompts``.

    Raises:
        FileNotFoundError: Propagated from ``load_raw_prompts`` when the raw
            prompts file is missing.
        KeyError: If the loaded data has no ``raw_prompts`` entry or a
            prompt has no ``prompt`` entry.
    """
    raw_data = load_raw_prompts(category_slug)
    raw_prompts = raw_data["raw_prompts"]

    pytrends_client = _build_pytrends_client()
    reddit_client = _build_reddit_client()

    # PL niche subreddits — adjust per category via the ``subreddits`` argument.
    if subreddits is None:
        subreddits = ["Polska", "Polska_Marka", "PolskieAukcje", "ksiazki"]

    summary = {"pass": 0, "marginal": 0, "fail": 0}
    validated_prompts = []
    total = len(raw_prompts)
    for i, prompt_obj in enumerate(raw_prompts):
        prompt_text = prompt_obj["prompt"]
        if i % 10 == 0:
            print(f"[Stage 3] Checking prompt {i+1}/{total}...")

        prompt_obj["google_trends_check"] = check_google_trends(prompt_text, pytrends_client)
        prompt_obj["reddit_check"] = check_reddit(prompt_text, reddit_client, subreddits)
        verdict = aggregate_signals(prompt_obj)
        prompt_obj["reality_signal"] = verdict
        if verdict in summary:
            summary[verdict] += 1

        validated_prompts.append(prompt_obj)

        # Rate-limit pytrends (otherwise 429s). Skipped when no Trends
        # request was made, so a run without pytrends does not idle for
        # half a second per prompt.
        if pytrends_client:
            time.sleep(0.5)

    return {
        "category": category_slug,
        "total_checked": len(validated_prompts),
        "summary": summary,
        "validated_prompts": validated_prompts,
    }
|
|
|
|
|
|
def save_validated(category_slug: str, data: dict) -> Path:
    """Write *data* as ``validated_prompts.json`` for the category.

    The file is written to ``<three levels above this file>/data/
    <category_slug>/validated_prompts.json`` as UTF-8, indented JSON with
    non-ASCII characters kept as-is.

    Args:
        category_slug: Name of the category directory under ``data``.
        data: JSON-serialisable payload to store.

    Returns:
        Path of the file that was written.
    """
    base_dir = Path(__file__).parent.parent.parent
    target = base_dir / "data" / category_slug / "validated_prompts.json"

    with target.open("w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)

    return target
|
|
|
|
|
|
def main():
    """CLI entry point: reality-check one category and save the result.

    Reads the required ``--category`` argument, runs ``check_all_prompts``
    for it, stores the result with ``save_validated`` and prints the output
    path and the pass/marginal/fail summary.
    """
    cli = argparse.ArgumentParser(description="Reality-check raw prompts.")
    cli.add_argument("--category", required=True)
    category = cli.parse_args().category

    print(f"[Stage 3] Reality-checking prompts for {category}...")
    report = check_all_prompts(category)
    saved_to = save_validated(category, report)
    print(f"[Stage 3] ✅ Saved validated prompts to {saved_to}")
    print(f"[Stage 3] Summary: {report['summary']}")


if __name__ == "__main__":
    main()
|