# citee-methodology/tools/prompt_curation/4_validation_agents.py

"""Stage 4 — Multi-agent Validation.

Three Claude critic agents review prompts in parallel:
- Agent A: Real buyer critique (does this sound like real persona phrasing?)
- Agent B: Methodology critic (statistical balance, distribution, vocabulary)
- Agent C: Vendor exploit hunter (which prompts are too easy to game by content marketing?)

Prompts flagged by N+ agents (default: 2) are removed.
"""
from __future__ import annotations

import argparse
import asyncio
import json
import os
from pathlib import Path

from anthropic import AsyncAnthropic

from config import CONFIG

# Prompt template for critic agent A ("real buyer" critique).
# Rendered with str.format(category=..., prompt_count=..., prompts_list=...);
# doubled braces ({{ }}) become literal braces in the rendered JSON example.
# The reply must be a JSON *object* (not a bare array): downstream code reads
# its "flagged_indices" and "reasons" keys.
AGENT_A_PROMPT = """You are reviewing a list of prompts that buyer personas would supposedly type into ChatGPT/Perplexity/Gemini when researching purchases in **{category}**.

Your job: identify prompts that DON'T sound natural for any realistic Polish e-commerce buyer.

Flag prompts that:
1. Are too formal/academic (no buyer phrases queries like a research paper)
2. Are too long (real users don't type 30-word queries)
3. Are too short / generic (single words or 2-word phrases)
4. Use vocabulary no real Polish buyer would use
5. Are buyer-impossible (e.g., asking about specs only B2B buyer would care about, in a B2C context)

Here are the {prompt_count} prompts to review:

{prompts_list}

Output a JSON object with the flagged prompt indices (use the index as ID, 0-indexed):

```json
{{
  "flagged_indices": [3, 7, 12],
  "reasons": {{
    "3": "Too formal — no real buyer types like this",
    "7": "Single word, no buying intent",
    "12": "B2B language in B2C context"
  }}
}}
```

Only output JSON. No prose."""


# Prompt template for critic agent B (methodology critic).
# Rendered with str.format(prompt_count=..., prompts_list=...); doubled braces
# ({{ }}) become literal braces in the rendered JSON example.
# Besides "flagged_indices" and "reasons", this critic is asked for a
# "structural_issues" list; aggregate_flags() does not read that key, it is
# only carried along inside the saved critic results.
AGENT_B_PROMPT = """You are a methodology critic for a Polish e-commerce AI visibility ranking project.

Review this prompt list for **statistical and structural issues**:

Target distribution per the methodology:
- buying: 30% (weight 2.0)
- comparison: 25% (weight 1.5)
- specific_need: 20% (weight 1.5)
- informational: 15% (weight 0.3)
- brand_direct: 10% (weight 0.3)

Total prompts: {prompt_count}

Flag issues:
1. Type distribution off by >10% from target
2. Vocabulary too repetitive (same phrases recurring)
3. Subcategory bias (e.g., 80% prompts about prezenty, 20% about everything else)
4. Length distribution unreasonable (all prompts are very long or very short)
5. Missing realistic buyer scenarios (e.g., no prompts about specific occasions, sizes, attributes)

Prompts list:

{prompts_list}

Output:

```json
{{
  "flagged_indices": [...],
  "reasons": {{...}},
  "structural_issues": [
    "Type 'comparison' is over-represented at 35% (target 25%)",
    "20+ prompts mention 'prezent dla mamy' — too repetitive"
  ]
}}
```

Only JSON output."""


# Prompt template for critic agent C (vendor exploit hunter).
# Rendered with str.format(prompts_list=...) only — it has no {category} or
# {prompt_count} placeholder. Doubled braces ({{ }}) become literal braces in
# the rendered JSON example.
AGENT_C_PROMPT = """You are a vendor exploit hunter for a Polish e-commerce AI visibility ranking.

Your job: identify prompts that are TOO EASY for a vendor to game by content marketing fluff.

A prompt is "exploitable" if:
1. The answer can be dominated by writing one good blog post
2. The answer comes primarily from Wikipedia (vendors can edit Wikipedia)
3. The answer is brand-agnostic (any vendor can position to win it via SEO content)
4. The prompt would be answered by listing Wikipedia / blog content rather than specific brand recommendations

We WANT prompts where:
- AI must recommend specific brands (with real reviews, real authority, multi-source citation)
- Prompt requires real product positioning, not just content production
- Multiple sources (reviews, Reddit, brand sites) need to align for ranking

Flag prompts that are too gameable:

{prompts_list}

Output:

```json
{{
  "flagged_indices": [...],
  "reasons": {{
    "5": "Generic 'co to świeca sojowa' — easily gamed by Wikipedia + blog post",
    "12": "Brand-agnostic 'jak działa świeca sojowa' — content marketing fluff target"
  }}
}}
```

Only JSON output."""


def _parse_critic_json(text: str) -> dict:
    """Extract the JSON object from a critic reply, tolerating fences and stray prose."""
    cleaned = text.strip()

    # Drop an opening Markdown fence, with or without a "json" language tag.
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    # Drop a closing fence.
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The model may wrap the object in prose despite the instructions;
        # fall back to the outermost {...} span before giving up.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(cleaned[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(
            f"Critic reply must be a JSON object, got {type(parsed).__name__}"
        )
    return parsed


async def run_agent(
    client: AsyncAnthropic,
    prompt: str,
    *,
    model: str | None = None,
) -> dict:
    """Send one critic prompt to the model and return its parsed JSON reply.

    Args:
        client: Async Anthropic client used to issue the request.
        prompt: Fully rendered critic prompt, sent as a single user message.
        model: Optional model identifier. When omitted, the model configured
            under ``CONFIG.critic_models["real_buyer_critique"]`` is used
            (the previous, only behaviour).

    Returns:
        The JSON object decoded from the model's text reply.

    Raises:
        ValueError: If the reply has no text content, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or the
            decoded JSON is not an object.
    """
    if model is None:
        model = CONFIG.critic_models["real_buyer_critique"]

    response = await client.messages.create(
        model=model,
        max_tokens=4000,
        messages=[{"role": "user", "content": prompt}],
    )

    # Use the first content block that actually carries text instead of
    # assuming it is always at position 0.
    text = next(
        (block.text for block in response.content if getattr(block, "text", None)),
        None,
    )
    if text is None:
        raise ValueError("Critic response contained no text content")

    return _parse_critic_json(text)


async def run_three_critics(prompts: list[dict], category_display_name: str) -> dict:
    """Run the three critic agents concurrently over the same prompt list.

    Args:
        prompts: Candidate prompts; each item must provide ``"type"`` and
            ``"prompt"`` keys, which are rendered into the numbered list the
            critics review (numbering is 0-based, matching list position).
        category_display_name: Category name substituted into agent A's prompt.

    Returns:
        A dict with the parsed reply of each critic under the keys
        ``"agent_a_real_buyer"``, ``"agent_b_methodology"`` and
        ``"agent_c_exploit_hunter"``.

    Raises:
        KeyError: If ``ANTHROPIC_API_KEY`` is not set in the environment or a
            prompt lacks the ``"type"``/``"prompt"`` keys.
    """
    # Read the key up front so a missing credential fails before any other work.
    api_key = os.environ["ANTHROPIC_API_KEY"]

    # Format prompts for review: "<index>. [<type>] <prompt>" per line.
    prompts_text = "\n".join(
        f"{i}. [{p['type']}] {p['prompt']}" for i, p in enumerate(prompts)
    )
    prompt_count = len(prompts)

    agent_a = AGENT_A_PROMPT.format(
        category=category_display_name,
        prompt_count=prompt_count,
        prompts_list=prompts_text,
    )
    agent_b = AGENT_B_PROMPT.format(
        prompt_count=prompt_count,
        prompts_list=prompts_text,
    )
    agent_c = AGENT_C_PROMPT.format(
        prompts_list=prompts_text,
    )

    # The context manager closes the client's underlying HTTP connections once
    # all three requests have finished (or one of them has raised).
    async with AsyncAnthropic(api_key=api_key) as client:
        # Run 3 agents in parallel
        print("[Stage 4] Running 3 critic agents in parallel...")
        result_a, result_b, result_c = await asyncio.gather(
            run_agent(client, agent_a),
            run_agent(client, agent_b),
            run_agent(client, agent_c),
        )

    return {
        "agent_a_real_buyer": result_a,
        "agent_b_methodology": result_b,
        "agent_c_exploit_hunter": result_c,
    }


def _normalize_index(raw: object, total_prompts: int) -> int | None:
    """Return ``raw`` as an in-range integer prompt index, or None if unusable."""
    if isinstance(raw, bool):  # bool is an int subclass; never a real index
        return None
    if isinstance(raw, str):
        try:
            raw = int(raw)
        except ValueError:
            return None
    if not isinstance(raw, int):
        return None
    return raw if 0 <= raw < total_prompts else None


def aggregate_flags(
    critic_results: dict,
    total_prompts: int,
    *,
    threshold: int | None = None,
) -> dict:
    """Count how many agents flagged each prompt index and pick removals.

    Each agent contributes at most one flag per prompt: indices repeated in
    one agent's reply are counted once. Indices returned as numeric strings
    are converted to integers, and indices outside ``[0, total_prompts)`` or of
    any other type are ignored, so the totals always refer to real prompts.

    Args:
        critic_results: Mapping of agent name to that agent's parsed reply
            (a dict with optional ``"flagged_indices"`` and ``"reasons"``).
        total_prompts: Number of prompts that were reviewed.
        threshold: Minimum number of agents that must flag a prompt for it to
            be removed. Defaults to ``CONFIG.flagged_by_n_critics_to_remove``.

    Returns:
        A dict with the per-prompt flag reasons (``flag_counts_by_prompt``),
        the sorted indices to remove (``flagged_for_removal``), the threshold
        used, and the total / removed / kept counts.
    """
    if threshold is None:
        threshold = CONFIG.flagged_by_n_critics_to_remove

    flag_counts: dict[int, list[str]] = {}

    for agent_name, result in critic_results.items():
        reasons = result.get("reasons")
        if not isinstance(reasons, dict):
            reasons = {}

        seen: set[int] = set()  # indices this agent has already flagged
        for raw_idx in result.get("flagged_indices") or []:
            idx = _normalize_index(raw_idx, total_prompts)
            if idx is None or idx in seen:
                continue
            seen.add(idx)
            # JSON object keys are always strings, hence the str() lookup.
            reason = reasons.get(str(idx), "no reason given")
            flag_counts.setdefault(idx, []).append(f"{agent_name}: {reason}")

    flagged_for_removal = sorted(
        idx for idx, agent_reasons in flag_counts.items()
        if len(agent_reasons) >= threshold
    )

    return {
        "flag_counts_by_prompt": flag_counts,
        "flagged_for_removal": flagged_for_removal,
        "removal_threshold_critics": threshold,
        "total_prompts": total_prompts,
        "total_removed": len(flagged_for_removal),
        "total_kept": total_prompts - len(flagged_for_removal),
    }


def main() -> None:
    """CLI entry point: review reality-checked prompts and save the verdict.

    Reads ``validated_prompts.json`` from the category's data directory, drops
    prompts whose ``reality_signal`` is ``"fail"``, runs the three critics over
    the rest, and writes ``critic_review.json`` (critic replies, aggregation
    and the prompts that were kept) next to the input file.

    Raises:
        FileNotFoundError: If ``validated_prompts.json`` does not exist.
    """
    parser = argparse.ArgumentParser(description="Multi-agent validation of prompts.")
    parser.add_argument("--category", required=True)
    parser.add_argument("--display-name", required=True)
    args = parser.parse_args()

    # resolve() first: on Python < 3.9 a script's __file__ can be a relative
    # path, in which case walking .parent three times never leaves ".".
    data_dir = Path(__file__).resolve().parent.parent.parent / "data" / args.category
    validated_file = data_dir / "validated_prompts.json"
    if not validated_file.exists():
        raise FileNotFoundError(f"Run 3_reality_checker.py first. Missing: {validated_file}")

    with open(validated_file, "r", encoding="utf-8") as f:
        validated_data = json.load(f)

    # Filter out reality-check failures first
    candidates = [p for p in validated_data["validated_prompts"] if p["reality_signal"] != "fail"]
    print(f"[Stage 4] Reviewing {len(candidates)} prompts (post-reality-check)...")

    critic_results = asyncio.run(run_three_critics(candidates, args.display_name))
    aggregation = aggregate_flags(critic_results, len(candidates))

    # Build the lookup set once instead of scanning the list for every prompt.
    removed_indices = set(aggregation["flagged_for_removal"])

    output = {
        "category": args.category,
        "input_count": len(candidates),
        "critic_results": critic_results,
        "aggregation": aggregation,
        "kept_prompts": [
            p for i, p in enumerate(candidates) if i not in removed_indices
        ],
    }

    output_file = data_dir / "critic_review.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"[Stage 4] ✅ Saved {output_file}")
    print(f"[Stage 4] Removed: {aggregation['total_removed']}, Kept: {aggregation['total_kept']}")


# Run the validation stage only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()