"""Stage 3 — Reality Checker. Cross-reference raw prompts against real-world signals: - Google Trends (PL, past 12 months) - Reddit search (PL niche subreddits) - Quora PL questions Prompts with zero/marginal real-world signal are flagged for removal. """ from __future__ import annotations import argparse import json import os import time from pathlib import Path from config import CONFIG try: from pytrends.request import TrendReq HAS_PYTRENDS = True except ImportError: HAS_PYTRENDS = False try: import praw HAS_PRAW = True except ImportError: HAS_PRAW = False def load_raw_prompts(category_slug: str) -> dict: data_dir = Path(__file__).parent.parent.parent / "data" / category_slug raw_file = data_dir / "raw_prompts.json" if not raw_file.exists(): raise FileNotFoundError( f"Raw prompts not found: {raw_file}. Run 2_prompt_brainstormer.py first." ) with open(raw_file, "r", encoding="utf-8") as f: return json.load(f) def check_google_trends(prompt: str, pytrends_client) -> dict: """Check if prompt phrase has any Google Trends signal in PL.""" if not pytrends_client: return {"signal": "skipped", "volume_estimate": None, "reason": "pytrends not available"} # Take first 4 words as keyword (Trends has 100 char limit, simpler is better) keyword = " ".join(prompt.split()[:4]) try: pytrends_client.build_payload( kw_list=[keyword], cat=0, timeframe="today 12-m", geo="PL", ) interest = pytrends_client.interest_over_time() if interest.empty or keyword not in interest.columns: return {"signal": "none", "volume_estimate": 0, "keyword_tested": keyword} avg_interest = interest[keyword].mean() return { "signal": "present" if avg_interest > 1 else "marginal", "volume_estimate": float(avg_interest), "keyword_tested": keyword, } except Exception as exc: return {"signal": "error", "volume_estimate": None, "error": str(exc), "keyword_tested": keyword} def check_reddit(prompt: str, reddit_client, subreddits: list[str]) -> dict: """Search Reddit for prompt-related discussions.""" if not reddit_client: return {"signal": "skipped", "mention_count": None, "reason": "praw not available"} keyword = " ".join(prompt.split()[:5]) try: total = 0 examples = [] for subreddit_name in subreddits: subreddit = reddit_client.subreddit(subreddit_name) results = list(subreddit.search(keyword, limit=5, time_filter="year")) total += len(results) for r in results[:2]: examples.append({"subreddit": subreddit_name, "title": r.title, "score": r.score}) signal = "present" if total >= CONFIG.reddit_min_organic_mentions else ( "marginal" if total > 0 else "none" ) return { "signal": signal, "mention_count": total, "keyword_tested": keyword, "examples": examples[:3], } except Exception as exc: return {"signal": "error", "mention_count": None, "error": str(exc)} def aggregate_signals(prompt_obj: dict) -> str: """Combine signals into pass/marginal/fail decision.""" trends = prompt_obj.get("google_trends_check", {}).get("signal", "skipped") reddit = prompt_obj.get("reddit_check", {}).get("signal", "skipped") if trends == "present" or reddit == "present": return "pass" if trends == "marginal" or reddit == "marginal": return "marginal" if trends == "none" and reddit == "none": return "fail" return "marginal" # Default for skipped/error states def check_all_prompts(category_slug: str) -> dict: raw_data = load_raw_prompts(category_slug) raw_prompts = raw_data["raw_prompts"] pytrends_client = None if HAS_PYTRENDS: try: pytrends_client = TrendReq(hl="pl-PL", tz=120) except Exception as exc: print(f"[Stage 3] ⚠ pytrends init failed: {exc}") reddit_client = None if HAS_PRAW and os.environ.get("REDDIT_CLIENT_ID"): try: reddit_client = praw.Reddit( client_id=os.environ["REDDIT_CLIENT_ID"], client_secret=os.environ["REDDIT_CLIENT_SECRET"], user_agent=os.environ.get("REDDIT_USER_AGENT", "citee-methodology/1.0"), ) except Exception as exc: print(f"[Stage 3] ⚠ Reddit auth failed: {exc}") # PL niche subreddits — adjust per category pl_subreddits = ["Polska", "Polska_Marka", "PolskieAukcje", "ksiazki"] validated_prompts = [] for i, prompt_obj in enumerate(raw_prompts): prompt_text = prompt_obj["prompt"] if i % 10 == 0: print(f"[Stage 3] Checking prompt {i+1}/{len(raw_prompts)}...") prompt_obj["google_trends_check"] = check_google_trends(prompt_text, pytrends_client) prompt_obj["reddit_check"] = check_reddit(prompt_text, reddit_client, pl_subreddits) prompt_obj["reality_signal"] = aggregate_signals(prompt_obj) validated_prompts.append(prompt_obj) # Rate-limit pytrends (otherwise 429s) time.sleep(0.5) pass_count = sum(1 for p in validated_prompts if p["reality_signal"] == "pass") marginal_count = sum(1 for p in validated_prompts if p["reality_signal"] == "marginal") fail_count = sum(1 for p in validated_prompts if p["reality_signal"] == "fail") return { "category": category_slug, "total_checked": len(validated_prompts), "summary": { "pass": pass_count, "marginal": marginal_count, "fail": fail_count, }, "validated_prompts": validated_prompts, } def save_validated(category_slug: str, data: dict) -> Path: output_file = ( Path(__file__).parent.parent.parent / "data" / category_slug / "validated_prompts.json" ) with open(output_file, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2) return output_file def main(): parser = argparse.ArgumentParser(description="Reality-check raw prompts.") parser.add_argument("--category", required=True) args = parser.parse_args() print(f"[Stage 3] Reality-checking prompts for {args.category}...") data = check_all_prompts(args.category) output_path = save_validated(args.category, data) print(f"[Stage 3] ✅ Saved validated prompts to {output_path}") print(f"[Stage 3] Summary: {data['summary']}") if __name__ == "__main__": main()