# citee-methodology/tools/prompt_curation/7_finalize.py

"""Stage 7 — Finalize.

Read the human-decided CSV (`for_human_review_decided.csv`) and produce the
final closed prompt pool: `prompts/{category}/v{N}.json` (gitignored).
"""
from __future__ import annotations

import argparse
import csv
import json
from datetime import datetime, timezone
from pathlib import Path

from config import CONFIG, get_target_counts


def load_decided_csv(category_slug: str) -> list[dict]:
    """Load every row of the human-decided review CSV for one category.

    The file is expected at
    ``<root>/data/<category_slug>/for_human_review_decided.csv``, where
    ``<root>`` is ``Path(__file__).parent.parent.parent``. It is parsed as
    semicolon-delimited text and decoded with ``utf-8-sig``.

    Args:
        category_slug: Name of the category directory under ``data/``.

    Returns:
        One dict per data row, keyed by the CSV header names.

    Raises:
        FileNotFoundError: If the decided CSV does not exist.
    """
    repo_root = Path(__file__).parent.parent.parent
    csv_path = repo_root / "data" / category_slug / "for_human_review_decided.csv"

    if not csv_path.exists():
        raise FileNotFoundError(
            f"Decided CSV not found: {csv_path}. "
            f"Did you fill out 'for_human_review.csv' and rename to 'for_human_review_decided.csv'?"
        )

    with open(csv_path, "r", encoding="utf-8-sig", newline="") as handle:
        return list(csv.DictReader(handle, delimiter=";"))


def determine_next_version(category_slug: str) -> str:
    """Return the next unused version label (``v1``, ``v2``, ...) for a category.

    Scans ``<root>/prompts/<category_slug>/`` (``<root>`` being
    ``Path(__file__).parent.parent.parent``) for files named
    ``v<digits>.json`` and returns one more than the highest number found.

    Side effect: if the category directory does not exist yet it is created,
    and ``"v1"`` is returned.

    Args:
        category_slug: Name of the category directory under ``prompts/``.

    Returns:
        The version label, e.g. ``"v3"``.
    """
    prompts_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug
    if not prompts_dir.exists():
        prompts_dir.mkdir(parents=True, exist_ok=True)
        return "v1"

    existing_versions = []
    for f in prompts_dir.glob("v*.json"):
        # Drop only the leading "v" matched by the glob and require the rest
        # to be plain ASCII digits. Stripping every "v" and relying on int()
        # would misread stray files such as "v1_0.json" (10), "vv7.json" (7)
        # or "v-9.json" (-9) as real versions.
        digits = f.stem[1:]
        if digits.isascii() and digits.isdigit():
            existing_versions.append(int(digits))

    next_num = max(existing_versions, default=0) + 1
    return f"v{next_num}"


def finalize(category_slug: str) -> tuple[Path, dict]:
    """Build and write the final prompt pool for a category from the decided CSV.

    Each CSV row is handled according to its ``decision`` column
    (case-insensitive, surrounding whitespace ignored):

    * ``APPROVE`` - the row's ``prompt`` is kept as is.
    * ``EDIT`` - ``edited_prompt`` is kept and the original text is stored
      under ``edited_from``; rows with a blank ``edited_prompt`` are skipped
      with a warning.
    * ``REJECT`` - the row is dropped and counted.
    * empty - treated as not yet decided and skipped silently.
    * anything else - skipped with a printed warning, so that a typo in the
      decision column does not silently drop a prompt.

    Approved counts per ``type`` are compared with the targets returned by
    ``get_target_counts()``; a type below 70% or above 130% of its target
    produces a distribution warning. The pool is written as JSON to
    ``<root>/prompts/<category_slug>/<version>.json`` using the label from
    ``determine_next_version``.

    Args:
        category_slug: Category whose decided CSV should be finalized.

    Returns:
        ``(output_file, final_pool)``: the path that was written and the dict
        that was serialized into it.

    Raises:
        FileNotFoundError: Propagated from ``load_decided_csv`` when the
            decided CSV is missing.
        KeyError: If a kept row lacks the ``type`` column (or ``prompt`` for
            an ``APPROVE`` row).
    """
    rows = load_decided_csv(category_slug)

    approved_prompts = []
    rejected_count = 0
    edited_count = 0

    for row in rows:
        decision = (row.get("decision") or "").strip().upper()
        if not decision:
            # Empty decision = treat as not yet decided, skip
            continue
        if decision == "REJECT":
            rejected_count += 1
        elif decision == "EDIT":
            edited_text = (row.get("edited_prompt") or "").strip()
            if not edited_text:
                print(f"[Stage 7] ⚠ Row {row.get('row_id')} marked EDIT but no edited_prompt — skipping")
                continue
            approved_prompts.append({
                "prompt": edited_text,
                "type": row["type"],
                "persona_id": row.get("persona_id", ""),
                "edited_from": row.get("prompt", ""),
            })
            edited_count += 1
        elif decision == "APPROVE":
            approved_prompts.append({
                "prompt": row["prompt"],
                "type": row["type"],
                "persona_id": row.get("persona_id", ""),
            })
        else:
            # A misspelled decision would otherwise vanish without a trace.
            print(
                f"[Stage 7] ⚠ Row {row.get('row_id')} has unrecognized decision "
                f"{decision!r} — skipping"
            )

    # Distribution check: count approved prompts per type.
    by_type: dict[str, int] = {}
    for p in approved_prompts:
        by_type[p["type"]] = by_type.get(p["type"], 0) + 1

    target_counts = get_target_counts()

    # Flag any target type that is off by more than 30% in either direction.
    distribution_warnings = []
    for ptype, target in target_counts.items():
        actual = by_type.get(ptype, 0)
        if actual < target * 0.7:
            distribution_warnings.append(f"{ptype}: only {actual} approved (target {target})")
        elif actual > target * 1.3:
            distribution_warnings.append(f"{ptype}: too many — {actual} approved (target {target})")

    version = determine_next_version(category_slug)
    output_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{version}.json"

    final_pool = {
        "category": category_slug,
        "version": version,
        "finalized_at": datetime.now(timezone.utc).isoformat(),
        "total_prompts": len(approved_prompts),
        "distribution": by_type,
        "target_distribution": target_counts,
        "rejected_count": rejected_count,
        "edited_count": edited_count,
        "distribution_warnings": distribution_warnings,
        "prompts": approved_prompts,
    }

    # ensure_ascii=False keeps non-ASCII prompt text readable in the file.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_pool, f, ensure_ascii=False, indent=2)

    return output_file, final_pool


def main():
    """CLI entry point: finalize the pool for ``--category`` and print a report.

    Prints the saved path, the approved/rejected/edited counts and the
    per-type distribution, followed by either the distribution warnings or a
    confirmation that the pool is ready.
    """
    cli = argparse.ArgumentParser(description="Finalize prompt pool from human-decided CSV.")
    cli.add_argument("--category", required=True)
    args = cli.parse_args()

    print(f"[Stage 7] Finalizing prompt pool for {args.category}...")
    output_file, summary = finalize(args.category)

    print(f"[Stage 7] ✅ Saved {output_file}")
    print(f"[Stage 7] Total approved: {summary['total_prompts']}")
    print(f"[Stage 7] Rejected: {summary['rejected_count']}, Edited: {summary['edited_count']}")
    print(f"[Stage 7] Distribution: {summary['distribution']}")

    warnings = summary["distribution_warnings"]
    # Both outcomes start with a blank separator line.
    print()

    if not warnings:
        print("✅ Distribution looks good! Final pool ready at:")
        print(f"   {output_file}")
        print()
        print("This file is in `prompts/` directory which is gitignored — exact strings remain CLOSED.")
        print("Public reference (illustrative examples) lives in `prompts/example-{category}.md`.")
        return

    print("⚠ Distribution warnings:")
    for warning in warnings:
        print(f"  - {warning}")
    print()
    print("Consider re-running Stage 6 to add/remove prompts to fix distribution.")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()