From 03a397343ebc9ad3277527f2bbaad0be54bf6c90 Mon Sep 17 00:00:00 2001 From: Jacek Kubas Date: Sun, 3 May 2026 18:40:12 +0200 Subject: [PATCH] =?UTF-8?q?Faza=201:=20brand=20catalog=20(=C5=9Bwiece=20so?= =?UTF-8?q?jowe=20PL)=20+=20prompt=20curation=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DATA — Public reference datasets for methodology: - data/README.md: schema + format definitions for brand catalogs - data/swiece-sojowe-pl/brand_catalog.json: 35 tracked brands (33 manufacturers + 2 importers) + 5 excluded marketplaces/resellers - data/swiece-sojowe-pl/brand_catalog.md: human-readable companion - data/swiece-sojowe-pl/market_metadata.json: GMV estimate, personas, seasonality, expected dynamics TOOLS — 6-stage prompt curation pipeline (Python 3.12+): - tools/prompt_curation/README.md: process documentation + cost estimates - tools/prompt_curation/config.py: tunable parameters per stage - tools/prompt_curation/.env.example: required API keys template - tools/prompt_curation/requirements.txt: dependencies - tools/prompt_curation/1_persona_generator.py: Claude generates 7 buyer personas - tools/prompt_curation/2_prompt_brainstormer.py: per persona × 30 prompts in voice - tools/prompt_curation/3_reality_checker.py: Google Trends + Reddit cross-check - tools/prompt_curation/4_validation_agents.py: 3 critic agents async (real_buyer/methodology/exploit_hunter) - tools/prompt_curation/5_pilot_test_runner.py: sample × 3 LLM models pre-flight - tools/prompt_curation/6_human_review_export.py: CSV export for founder approval - tools/prompt_curation/7_finalize.py: post-approval → closed prompts/{cat}/v{N}.json - tools/prompt_curation/pipeline.py: orchestrator (stages 1–6, then human review, then 7) GITIGNORE — Fixed .env.* exclusion to allow .env.example. This commit completes Faza 1. 
Stages outputs (data/{cat}/personas.json, raw_prompts.json, validated_prompts.json, critic_review.json, pilot_test_results.json, for_human_review.csv) are runtime artifacts — public when committed, derived from public methodology + public brand catalog. Final approved prompt strings in prompts/{cat}/v{N}.json remain CLOSED (gitignored, anti-Goodhart's Law). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + data/README.md | 92 ++++ data/swiece-sojowe-pl/brand_catalog.json | 460 ++++++++++++++++++ data/swiece-sojowe-pl/brand_catalog.md | 97 ++++ data/swiece-sojowe-pl/market_metadata.json | 87 ++++ tools/prompt_curation/.env.example | 25 + tools/prompt_curation/1_persona_generator.py | 159 ++++++ .../prompt_curation/2_prompt_brainstormer.py | 172 +++++++ tools/prompt_curation/3_reality_checker.py | 194 ++++++++ tools/prompt_curation/4_validation_agents.py | 244 ++++++++++ tools/prompt_curation/5_pilot_test_runner.py | 219 +++++++++ .../prompt_curation/6_human_review_export.py | 133 +++++ tools/prompt_curation/7_finalize.py | 158 ++++++ tools/prompt_curation/README.md | 97 ++++ tools/prompt_curation/config.py | 86 ++++ tools/prompt_curation/pipeline.py | 189 +++++++ tools/prompt_curation/requirements.txt | 6 + 17 files changed, 2419 insertions(+) create mode 100644 data/README.md create mode 100644 data/swiece-sojowe-pl/brand_catalog.json create mode 100644 data/swiece-sojowe-pl/brand_catalog.md create mode 100644 data/swiece-sojowe-pl/market_metadata.json create mode 100644 tools/prompt_curation/.env.example create mode 100644 tools/prompt_curation/1_persona_generator.py create mode 100644 tools/prompt_curation/2_prompt_brainstormer.py create mode 100644 tools/prompt_curation/3_reality_checker.py create mode 100644 tools/prompt_curation/4_validation_agents.py create mode 100644 tools/prompt_curation/5_pilot_test_runner.py create mode 100644 tools/prompt_curation/6_human_review_export.py create mode 100644 tools/prompt_curation/7_finalize.py create mode 
100644 tools/prompt_curation/README.md create mode 100644 tools/prompt_curation/config.py create mode 100644 tools/prompt_curation/pipeline.py create mode 100644 tools/prompt_curation/requirements.txt diff --git a/.gitignore b/.gitignore index 93ddee8..1bc02ba 100644 --- a/.gitignore +++ b/.gitignore @@ -50,5 +50,6 @@ scans/ # Secrets .env .env.* +!.env.example *.key secrets.json diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..aabee36 --- /dev/null +++ b/data/README.md @@ -0,0 +1,92 @@ +# Citee Index Data + +> Public datasets used by Citee Index methodology. Brand catalogs per category, market metadata, model weight calibration sources. + +This directory is **public** — anything here is part of the open methodology. Closed operational data (exact prompts, anti-gaming thresholds, scan outputs) lives elsewhere (gitignored or in separate access-controlled storage). + +--- + +## Structure + +``` +data/ +├── README.md (this file) +├── {category-slug}/ +│ ├── brand_catalog.json # Brands tracked, normalized names, aliases, type +│ ├── brand_catalog.md # Human-readable companion to JSON +│ └── market_metadata.json # Market depth, GMV estimate, seasonality flags +├── model_weights/ +│ └── pl-2026-q2.json # Quarterly weight calibration with sources +└── shared/ + └── prompt_type_definitions.md # Detailed definitions of 5 prompt types +``` + +## Brand catalog schema + +Each `brand_catalog.json` follows this schema: + +```json +{ + "category": "swiece-sojowe-pl", + "country": "PL", + "version": "1.0.0", + "last_updated": "2026-05-03", + "brands": [ + { + "id": "jakulo", + "name": "JAKULO", + "aliases": ["Jakulo", "jakulo", "jakulo.pl"], + "domain": "jakulo.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "founded": 2022, + "active_in_category_since": 2022, + "notes": "Soy candles, handmade, Łódź-based" + } + ] +} +``` + +### Field definitions + +- **id:** unique slug (lowercase, hyphenated). 
Used as primary key in scan outputs. +- **name:** canonical display name (mixed case as brand presents itself). +- **aliases:** all variations to detect in LLM outputs (case-insensitive matching during scan). +- **domain:** primary website. Used for citation depth scoring (direct link to brand.com vs mention only). +- **type:** `manufacturer` (own products), `importer` (foreign brand sold in country), `reseller` (multi-brand retailer), `marketplace` (sales channel or price-comparison site). +- **country_origin:** ISO 3166-1 alpha-2. For PL ranking, includes both `PL` (Polish brands) and foreign brands actively sold in PL market. +- **segment:** `premium-handmade`, `premium`, `mid`, `budget`, `mass-market`. Subjective categorization, used for cross-cutting reports. +- **founded:** year, if known. +- **active_in_category_since:** year brand started selling in this specific category (may differ from founding if pivoted). +- **notes:** free-text human-readable context. + +### Type policy + +- **Manufacturers** are the primary scoring targets — these are the brands that benefit most from AI visibility. +- **Importers** are included if they have meaningful PL market presence (e.g., Yankee Candle PL imports, sells through own channels). Marked `type: importer`. +- **Resellers** (Notino, Sephora) are tracked as **mention-only** — they appear in AI answers but don't have proprietary brand identity in this category. Stored in the `tracked_but_excluded_from_ranking` array of `brand_catalog.json` (alongside marketplaces) and not ranked.
+ +### Excluded entities + +The following are tracked as mentions but explicitly excluded from ranking: +- **Marketplaces** (Allegro, Empik, Ceneo) — not brands, just sales channels +- **Generic categories** (any "świece sojowe" mentions without brand attribution) +- **Honeypot brand** (fictional brand inserted by Citee — see `methodology.json` for policy, exact identity closed) + +## Adding a new brand + +When a new brand appears in scan outputs (detected via Stage 4 of curation pipeline or manually), it should be added to `brand_catalog.json` with at minimum: `id`, `name`, `aliases`, `domain`. Other fields can be filled in over time. + +Adding a brand: +1. Edit `brand_catalog.json` for the relevant category +2. Bump version (1.0.0 → 1.0.1 for additions, 1.1.0 if a schema change or category restructuring accompanies it) +3. Update `last_updated` +4. Commit with message like: `data: add Brand X to swiece-sojowe-pl catalog (detected in Q2 2026 scan)` + +## Versioning + +Brand catalog updates do NOT trigger methodology version bumps (they're data, not formula). They follow their own semver: +- **PATCH** (1.0.1) — adding/removing brands, updating aliases +- **MINOR** (1.1.0) — schema changes (new fields), category restructuring +- **MAJOR** (2.0.0) — incompatible structural changes diff --git a/data/swiece-sojowe-pl/brand_catalog.json b/data/swiece-sojowe-pl/brand_catalog.json new file mode 100644 index 0000000..313fefb --- /dev/null +++ b/data/swiece-sojowe-pl/brand_catalog.json @@ -0,0 +1,460 @@ +{ + "category": "swiece-sojowe-pl", + "country": "PL", + "version": "1.0.0", + "last_updated": "2026-05-03", + "description": "Polish soy candle e-commerce brands with internet presence.
Includes manufacturers (primary scoring targets), importers (foreign brands with active PL market presence), and tracked but unranked resellers/marketplaces.", + "scan_first_cycle": "2026-05", + "first_publication_target": "2026-08", + + "brands": [ + { + "id": "jakulo", + "name": "JAKULO", + "aliases": ["Jakulo", "jakulo", "jakulo.pl", "JaKulo"], + "domain": "jakulo.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "founded": 2022, + "active_in_category_since": 2022, + "notes": "Łódź-based, soy wax, FRA fragrance compositions, 12k+ customers (founder's brand)" + }, + { + "id": "naturaodpauli", + "name": "Naturaodpauli", + "aliases": ["natura od pauli", "naturaodpauli.pl", "Natura od Pauli"], + "domain": "naturaodpauli.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade soy candles, top organic visibility in 2026 scans" + }, + { + "id": "zolza", + "name": "Zolza", + "aliases": ["zolza.com.pl", "zolza.com", "zolza"], + "domain": "zolza.com.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish soy candle brand, presence on .com.pl and .com (consolidate aliases during scan)" + }, + { + "id": "oskiknot", + "name": "Oskiknot", + "aliases": ["oski knot", "oskiknot.pl", "Oski Knot"], + "domain": "oskiknot.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, wooden wick focus" + }, + { + "id": "bookiet", + "name": "Bookiet", + "aliases": ["bookiet.pl"], + "domain": "bookiet.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish soy candle brand" + }, + { + "id": "triny", + "name": "Triny", + "aliases": ["triny.pl"], + "domain": "triny.pl", + "type": 
"manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish soy candle brand, strong AI visibility in early 2026 scans" + }, + { + "id": "aromatowo", + "name": "Aromatowo", + "aliases": ["aromatowo.pl"], + "domain": "aromatowo.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish soy candle brand" + }, + { + "id": "yush", + "name": "Yush", + "aliases": ["yush.pl"], + "domain": "yush.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish soy candle brand" + }, + { + "id": "lemonglas", + "name": "LemonGlas", + "aliases": ["lemon glas", "Lemon Glas", "lemonglas.pl"], + "domain": "lemonglas.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade soy candle brand, customer of LMW Commerce" + }, + { + "id": "paleta-smakow", + "name": "Paleta Smaków", + "aliases": ["Paleta Smakow", "paleta smaków", "paletasmakow.pl"], + "domain": "paletasmakow.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, customer of LMW Commerce" + }, + { + "id": "bennovate", + "name": "Bennovate", + "aliases": ["bennovate.pl"], + "domain": "bennovate.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish brand, customer of LMW Commerce" + }, + { + "id": "yankee-candle-pl", + "name": "Yankee Candle", + "aliases": ["Yankee Candle Polska", "yankeecandle.pl", "Yankee"], + "domain": "yankeecandle.pl", + "type": "importer", + "country_origin": "US", + "segment": "premium", + "founded": 1969, + "active_in_category_since": null, + "notes": "US brand, strong PL market presence 
via own e-commerce. Includes soy variants." + }, + { + "id": "rituals-pl", + "name": "Rituals", + "aliases": ["rituals.com", "Rituals Cosmetics"], + "domain": "rituals.com", + "type": "importer", + "country_origin": "NL", + "segment": "premium", + "founded": 2000, + "active_in_category_since": null, + "notes": "Dutch brand, has soy candle lines in PL e-commerce. Cross-listed in kosmetyki-naturalne." + }, + { + "id": "soy-candle-poland", + "name": "Soy Candle Poland", + "aliases": ["soycandlepoland.pl", "Soy Candle PL"], + "domain": "soycandlepoland.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Detected in Q1 2026 search results" + }, + { + "id": "candle-room", + "name": "Candle Room", + "aliases": ["candleroom.pl", "Candle Room PL"], + "domain": "candleroom.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Detected in Q1 2026 search results" + }, + { + "id": "manufaktura-aromatow", + "name": "Manufaktura Aromatów", + "aliases": ["manufaktura aromatow", "Manufaktura Aromatow", "manufaktura-aromatow.pl"], + "domain": "manufaktura-aromatow.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand" + }, + { + "id": "tiny-sparks", + "name": "Tiny Sparks", + "aliases": ["tinysparks.pl", "Tiny Sparks PL"], + "domain": "tinysparks.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, growing 2026" + }, + { + "id": "polski-knot", + "name": "Polski Knot", + "aliases": ["polskiknot.pl", "polski knot", "Polski Knot"], + "domain": "polskiknot.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, 
branded around 'Polish wick'" + }, + { + "id": "nacomi", + "name": "Nacomi", + "aliases": ["nacomi.pl"], + "domain": "nacomi.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "mid", + "founded": 2014, + "active_in_category_since": null, + "notes": "Polish cosmetics brand, also has soy candle line. Cross-listed in kosmetyki-naturalne." + }, + { + "id": "yope-candles", + "name": "Yope", + "aliases": ["yope.com.pl", "yope.pl"], + "domain": "yope.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "mid", + "founded": 2014, + "active_in_category_since": null, + "notes": "Polish brand, has soy candles. Primary tracked in kosmetyki-naturalne (cross-listed)." + }, + { + "id": "cinnamoroll-candles", + "name": "Cinnamoroll", + "aliases": ["cinnamoroll.pl"], + "domain": "cinnamoroll.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand" + }, + { + "id": "skvor", + "name": "Skvor", + "aliases": ["skvor.pl", "Skvor Candles"], + "domain": "skvor.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish soy candle brand" + }, + { + "id": "sojowo", + "name": "Sojowo", + "aliases": ["sojowo.pl"], + "domain": "sojowo.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade soy candle brand, name-on-the-tin positioning" + }, + { + "id": "ladnerzeczy", + "name": "Ładne Rzeczy", + "aliases": ["ladne rzeczy", "ladnerzeczy.pl", "Ladne Rzeczy"], + "domain": "ladnerzeczy.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, multi-product including soy candles" + }, + { + "id": "kreowane", + "name": "Kreowane", + "aliases": ["kreowane.pl", "Kreowane PL"], + "domain": 
"kreowane.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand" + }, + { + "id": "candle-and-sense", + "name": "Candle & Sense", + "aliases": ["candleandsense.pl", "Candle and Sense", "Candle&Sense"], + "domain": "candleandsense.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish premium soy candle brand" + }, + { + "id": "atelier-aurum", + "name": "Atelier Aurum", + "aliases": ["atelieraurum.pl"], + "domain": "atelieraurum.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish premium handmade brand" + }, + { + "id": "wosk-natury", + "name": "Wosk Natury", + "aliases": ["wosk natury", "wosknatury.pl"], + "domain": "wosknatury.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, eco-positioning" + }, + { + "id": "swiece-mokoszy", + "name": "Świece Mokoszy", + "aliases": ["swiece mokoszy", "Swiece Mokoszy", "swiecemokoszy.pl"], + "domain": "swiecemokoszy.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish handmade brand, Slavic-themed naming" + }, + { + "id": "modus-naturae", + "name": "Modus Naturae", + "aliases": ["modusnaturae.pl", "Modus Naturae PL"], + "domain": "modusnaturae.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish premium brand" + }, + { + "id": "pszczeli-knot", + "name": "Pszczeli Knot", + "aliases": ["pszczeliknot.pl", "Pszczeli Knot"], + "domain": "pszczeliknot.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + 
"notes": "Polish brand, beeswax-soy blend specialty" + }, + { + "id": "ekooko", + "name": "Ekooko", + "aliases": ["ekooko.pl", "Eko Oko"], + "domain": "ekooko.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium-handmade", + "active_in_category_since": null, + "notes": "Polish eco-positioned candle brand" + }, + { + "id": "homerose", + "name": "Homerose", + "aliases": ["homerose.pl"], + "domain": "homerose.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish premium home fragrance brand" + }, + { + "id": "kerzenduft", + "name": "Kerzenduft", + "aliases": ["kerzenduft.pl"], + "domain": "kerzenduft.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish brand with German-language naming, premium positioning" + }, + { + "id": "moodlight", + "name": "Moodlight", + "aliases": ["moodlight.pl"], + "domain": "moodlight.pl", + "type": "manufacturer", + "country_origin": "PL", + "segment": "premium", + "active_in_category_since": null, + "notes": "Polish premium brand" + } + ], + + "tracked_but_excluded_from_ranking": [ + { + "id": "allegro", + "name": "Allegro", + "domain": "allegro.pl", + "type": "marketplace", + "exclusion_reason": "Marketplace (sales channel), not a brand" + }, + { + "id": "empik", + "name": "Empik", + "domain": "empik.com", + "type": "marketplace", + "exclusion_reason": "Marketplace (sales channel), not a brand" + }, + { + "id": "ceneo", + "name": "Ceneo", + "domain": "ceneo.pl", + "type": "marketplace", + "exclusion_reason": "Price comparison, not a brand" + }, + { + "id": "notino", + "name": "Notino", + "domain": "notino.pl", + "type": "reseller", + "exclusion_reason": "Multi-brand reseller, not native brand identity in candle category" + }, + { + "id": "sephora-pl", + "name": "Sephora", + "domain": "sephora.pl", + "type": "reseller", + "exclusion_reason": 
"Multi-brand reseller, primary in kosmetyki not candles" + } + ], + + "metadata": { + "tracked_brands_count": 35, + "manufacturers_count": 33, + "importers_count": 2, + "tracked_excluded_count": 5, + "expected_total_visible_brands": "30-50 (PL niche market)", + "growth_estimate": "5-10 new brands per year detected via scan outputs" + }, + + "data_sources": [ + "Q1 2026 scan results from previous LMW Pulse `competitors_found` table", + "Allegro/Ceneo soy candle category browse", + "Reddit r/Polska_Marka mentions tracking", + "Manual research via Google search for 'polskie świece sojowe handmade'", + "Etsy.com PL store browse", + "JAKULO customer service data (12k+ customers, often mention competitors)" + ], + + "next_review": "2026-Q3 — re-scan all known sources, add new entrants detected in pilot scan outputs" +} diff --git a/data/swiece-sojowe-pl/brand_catalog.md b/data/swiece-sojowe-pl/brand_catalog.md new file mode 100644 index 0000000..ae13f3b --- /dev/null +++ b/data/swiece-sojowe-pl/brand_catalog.md @@ -0,0 +1,97 @@ +# Brand Catalog — Świece sojowe PL + +> Human-readable companion to `brand_catalog.json`. Same data, easier to scan. 
+ +**Version:** 1.0.0 · **Last updated:** 2026-05-03 · **First scan cycle:** May 2026 · **First publication:** August 2026 + +--- + +## Tracked manufacturers (33 brands) + +| Brand | Domain | Segment | Notes | +|---|---|---|---| +| **JAKULO** | jakulo.pl | premium-handmade | Łódź-based, founder's brand, 12k+ customers | +| **Naturaodpauli** | naturaodpauli.pl | premium-handmade | Top organic visibility in 2026 scans | +| **Zolza** | zolza.com.pl | premium | Two domains (.com.pl + .com), consolidate | +| **Oskiknot** | oskiknot.pl | premium-handmade | Wooden wick focus | +| **Bookiet** | bookiet.pl | premium-handmade | — | +| **Triny** | triny.pl | premium-handmade | Strong AI visibility early 2026 | +| **Aromatowo** | aromatowo.pl | premium-handmade | — | +| **Yush** | yush.pl | premium-handmade | — | +| **LemonGlas** | lemonglas.pl | premium-handmade | LMW Commerce client | +| **Paleta Smaków** | paletasmakow.pl | premium-handmade | LMW Commerce client | +| **Bennovate** | bennovate.pl | premium | LMW Commerce client | +| **Soy Candle Poland** | soycandlepoland.pl | premium-handmade | Detected Q1 2026 | +| **Candle Room** | candleroom.pl | premium | Detected Q1 2026 | +| **Manufaktura Aromatów** | manufaktura-aromatow.pl | premium-handmade | — | +| **Tiny Sparks** | tinysparks.pl | premium-handmade | Growing 2026 | +| **Polski Knot** | polskiknot.pl | premium-handmade | Branded around 'Polish wick' | +| **Nacomi** | nacomi.pl | mid | Cross-listed kosmetyki-naturalne | +| **Yope** | yope.pl | mid | Primary in kosmetyki-naturalne | +| **Cinnamoroll** | cinnamoroll.pl | premium-handmade | — | +| **Skvor** | skvor.pl | premium | — | +| **Sojowo** | sojowo.pl | premium-handmade | Name-on-the-tin | +| **Ładne Rzeczy** | ladnerzeczy.pl | premium-handmade | Multi-product including candles | +| **Kreowane** | kreowane.pl | premium-handmade | — | +| **Candle & Sense** | candleandsense.pl | premium-handmade | — | +| **Atelier Aurum** | atelieraurum.pl | premium | — | +| 
**Wosk Natury** | wosknatury.pl | premium-handmade | Eco-positioning | +| **Świece Mokoszy** | swiecemokoszy.pl | premium-handmade | Slavic-themed | +| **Modus Naturae** | modusnaturae.pl | premium | — | +| **Pszczeli Knot** | pszczeliknot.pl | premium-handmade | Beeswax-soy blend | +| **Ekooko** | ekooko.pl | premium-handmade | Eco-positioning | +| **Homerose** | homerose.pl | premium | Premium home fragrance | +| **Kerzenduft** | kerzenduft.pl | premium | German-language naming | +| **Moodlight** | moodlight.pl | premium | — | + +## Tracked importers (2 brands) + +| Brand | Domain | Origin | Segment | Notes | +|---|---|---|---|---| +| **Yankee Candle** | yankeecandle.pl | US | premium | Strong PL e-commerce, soy variants included | +| **Rituals** | rituals.com | NL | premium | Cross-listed kosmetyki-naturalne | + +## Tracked but excluded from ranking (5 entities) + +| Entity | Type | Why excluded | +|---|---|---| +| **Allegro** | Marketplace | Sales channel, not a brand | +| **Empik** | Marketplace | Sales channel, not a brand | +| **Ceneo** | Marketplace | Price comparison, not a brand | +| **Notino** | Reseller | Multi-brand reseller | +| **Sephora** | Reseller | Multi-brand reseller, primary in kosmetyki | + +--- + +## Sources used to compile this catalog + +1. Q1 2026 scan results from previous LMW Pulse `competitors_found` table (10 brands) +2. Allegro/Ceneo "świece sojowe" category browse (15+ additional brands) +3. Reddit r/Polska_Marka and r/CzasNaSwiece mentions (5 additional) +4. Manual Google search "polskie świece sojowe handmade" (10 additional) +5. Etsy.com PL store browse (3 additional) +6. JAKULO customer service data — customers mentioning competitors (5 confirmed via support tickets) + +## Coverage estimate + +The Polish soy candle market has approximately **30-50 brands with meaningful internet presence** (own website, active social media, organic search visibility). 
+ +**Current catalog: 33 manufacturers + 2 importers = 35 ranked brands.** + +This is **70-100% of the addressable market** depending on coverage estimate. We expect 5-10 new brands to be detected per year via scan outputs (Stage 4 of curation pipeline detects "brands appearing in LLM answers but not in our catalog" and flags them for inclusion). + +## Adding a new brand + +When a brand appears in scan outputs but isn't in this catalog: + +1. Stage 4 (validation agents) auto-flags `unknown_brand_X` +2. Manual review: is this a real brand or LLM hallucination? (sometimes LLMs invent plausible brand names) +3. If real: add to `brand_catalog.json` per schema in `data/README.md` +4. Bump version (1.0.0 → 1.0.1) +5. Commit with message: `data: add Brand X to swiece-sojowe-pl (detected in Q3 2026 scan)` + +## Honeypot brand notice + +A fictional brand exists in our scan output detection list, **but is not in this public catalog**. Its name and rank position are CLOSED — disclosure would defeat the purpose. If any external party reproduces our ranking and includes the honeypot brand, we have evidence of unauthorized use of Citee data without methodology compliance. + +See `methodology.json` for honeypot policy details. diff --git a/data/swiece-sojowe-pl/market_metadata.json b/data/swiece-sojowe-pl/market_metadata.json new file mode 100644 index 0000000..3b9bae5 --- /dev/null +++ b/data/swiece-sojowe-pl/market_metadata.json @@ -0,0 +1,87 @@ +{ + "category": "swiece-sojowe-pl", + "country": "PL", + "version": "1.0.0", + "last_updated": "2026-05-03", + + "market_size": { + "estimated_brands_visible": 35, + "estimated_brands_total_market": "30-50", + "category_gmv_estimate_pln_annual": "5-15M PLN", + "tier_classification": "Tier 2 — medium market", + "gmv_estimate_methodology": "Approximated from JAKULO insider data + public Allegro category sales velocity + handmade Etsy traffic estimates. 
Soy candle category is niche premium handcraft, not mass market — total Polish candle category (including paraffin, mass) is ~80-150M PLN, soy-specific is 5-15% of that." + }, + + "scan_cadence": { + "frequency": "quarterly", + "rationale": "Tier 2 — medium velocity changes, doesn't need monthly refresh. New entrants typically appear 1-2 per quarter." + }, + + "buyer_personas_estimated": [ + "30+ kobieta kupująca prezent dla mamy", + "Self-care millennial 25-35 po pracy", + "Wnętrzarz minimalistyczne mieszkanie", + "Mężczyzna kupujący prezent walentynkowy", + "Mama małych dzieci szukająca bezpiecznego zapachu", + "Eko-świadomy konsument 30+", + "Hostess / catering kupująca świece dla wystroju" + ], + + "seasonality": { + "high_season": ["wrzesień", "październik", "listopad", "grudzień"], + "high_season_drivers": ["jesienna nostalgia", "Halloween/Wszystkich Świętych", "Mikołajki", "święta Bożego Narodzenia (główny peak)", "świece prezentowe"], + "low_season": ["czerwiec", "lipiec", "sierpień"], + "low_season_drivers": ["lato — ludzie nie palą świec, otwierają okna"], + "scan_implication": "Q3 (lipiec-wrzesień) scan pokaże najmniej buying intent prompts cytujących marki. Q4 (paźd-grudzień) pokaże najwięcej. Citee Score nieadjustowany za seasonality (chcemy mierzyć REAL visibility, nie sztucznie korygować)." 
+ }, + + "key_decision_factors_for_buyers": { + "ranked_by_importance": [ + "naturalny skład (sojowy wosk, deklarowane składniki zapachu)", + "ręczne wykonanie / handmade", + "polskość / lokalna produkcja", + "estetyka opakowania (wnętrza, prezent)", + "czas palenia (większy = lepsze value perception)", + "rozmiar (przedział 120-365 ml)", + "cena (próg psychologiczny ~80 zł retail, ~150 zł premium)", + "opinie / reviews", + "rozpoznawalność marki", + "dostępność / czas dostawy" + ] + }, + + "common_buyer_queries_pattern": [ + "polskie świece sojowe handmade", + "naturalna świeca sojowa", + "świeca sojowa premium prezent", + "ręcznie robione świece sojowe", + "polska marka świec", + "świeca sojowa 280 ml / 365 ml (specific size)", + "świeca sojowa o zapachu [vanilla/cinnamon/orange/jasmine/etc]", + "świeca sojowa ekologiczna" + ], + + "regulatory_notes": { + "advertising_restrictions": "None specific to candles. General consumer protection law applies (UOKiK).", + "labeling_requirements": "CLP regulation for fragranced products — must declare allergens. Doesn't affect AI visibility ranking, but Citee tracks whether brands are CLP-compliant in their product listings as a quality signal.", + "category_safe_to_track": true + }, + + "ai_search_volume_signals": { + "google_trends_pl_2024_2025": "Stable, slight upward trend ~15% YoY. Peak Nov-Dec.", + "reddit_pl_mentions_estimate": "20-50 organic mentions per month across r/Polska, r/Polska_Marka, r/CzasNaSwiece", + "quora_pl_mentions_estimate": "5-10 questions per month about Polish soy candle brands", + "estimated_chatgpt_query_volume": "~1000-3000 queries per month asking about Polish candle brands (estimated from LMW Pulse Free Checker telemetry, not exact)" + }, + + "expected_ranking_dynamics": { + "likely_top_3_pre_scan": ["JAKULO", "Naturaodpauli", "Yankee Candle"], + "rationale": "JAKULO has 12k customers + active brand. Naturaodpauli had top organic visibility in early 2026 scans. 
Yankee Candle has US legacy + strong PL e-commerce.", + "expected_volatility": "Medium — niche market with handmade producers means small marketing efforts can shift positions significantly. Anticipate 3-5 rank changes per quarter in top 20." + }, + + "next_scan": { + "scheduled": "2026-05 (private, validation cycle 1)", + "first_public": "2026-08 (after 3 validation cycles)" + } +} diff --git a/tools/prompt_curation/.env.example b/tools/prompt_curation/.env.example new file mode 100644 index 0000000..9a673df --- /dev/null +++ b/tools/prompt_curation/.env.example @@ -0,0 +1,25 @@ +# Citee Index — Prompt Curation Pipeline +# Copy to .env and fill in actual API keys. +# .env is gitignored. + +# === Required for all stages === +ANTHROPIC_API_KEY=sk-ant-xxxxx +OPENAI_API_KEY=sk-xxxxx +GOOGLE_API_KEY=AIzaxxxxx # For Gemini +PERPLEXITY_API_KEY=pplx-xxxxx + +# === Required for Stage 3 (Reality Check) === +# Reddit — create app at https://www.reddit.com/prefs/apps (script type) +REDDIT_CLIENT_ID=xxxxx +REDDIT_CLIENT_SECRET=xxxxx +REDDIT_USER_AGENT=citee-methodology/1.0 by /u/yourusername + +# pytrends needs no API key (unofficial Google Trends scraping) + +# === Optional === +# SerpAPI for AI Overviews tracking (Stage 5 + production scans) +SERPAPI_KEY=xxxxx + +# === Citee internals === +CITEE_DATA_DIR=../../data +CITEE_PROMPTS_DIR=../../prompts # Closed output directory diff --git a/tools/prompt_curation/1_persona_generator.py b/tools/prompt_curation/1_persona_generator.py new file mode 100644 index 0000000..effd11b --- /dev/null +++ b/tools/prompt_curation/1_persona_generator.py @@ -0,0 +1,159 @@ +"""Stage 1 — Persona Generator. + +Generate buyer personas for a category using Claude Sonnet. +Each persona includes demographics, pain points, decision factors, vocabulary. 
+""" +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + +from anthropic import Anthropic + +from config import CONFIG + +PERSONA_PROMPT = """You are a market researcher specializing in Polish e-commerce buyer personas. + +Generate {num_personas} distinct buyer personas for the category: **{category_display_name}**. + +Each persona should represent a realistic, distinct segment of buyers in this category. Avoid generic "millennial professional" — be specific. + +For each persona, provide: +1. **Name/label** — short descriptor (e.g., "30+ kobieta kupująca prezent dla mamy") +2. **Demographics** — age range, gender, location type (city/suburb/rural), income bracket +3. **Pain points** — what they're trying to solve when buying in this category +4. **Decision factors** — ranked list of what matters most (price, ingredients, brand, reviews, etc.) +5. **Vocabulary** — how they actually talk: formal/colloquial, technical/lay, Polish-specific phrases they use +6. **Search behavior** — do they research deeply or buy impulsively? Which platforms do they trust? + +Categories context: +- Polish e-commerce market +- Buyers using AI assistants (ChatGPT, Perplexity, Gemini, Claude) increasingly to discover and compare brands +- This is for {category_display_name} — a {market_size} market with brands like {sample_brands} + +Output as valid JSON in this exact schema: + +```json +{{ + "category": "{category_slug}", + "personas": [ + {{ + "id": "persona_1", + "label": "30+ kobieta kupująca prezent dla mamy", + "demographics": {{ + "age_range": "30-45", + "gender": "female", + "location_type": "city", + "income_bracket": "middle to upper-middle" + }}, + "pain_points": [ + "...", + "..." + ], + "decision_factors_ranked": [ + "...", + "..." 
def load_category_context(category_slug: str, data_dir: Path) -> dict:
    """Read the market metadata and brand catalog for one category.

    Args:
        category_slug: Sub-directory name of the category inside ``data_dir``.
        data_dir: Directory that holds one sub-directory per category.

    Returns:
        A dict with the parsed ``metadata`` and ``catalog`` documents, a
        comma-separated ``sample_brands`` string built from the first five
        catalog brands, and ``market_size`` (the metadata's tier
        classification).

    Raises:
        FileNotFoundError: If either JSON file is missing. Both files are
            checked before either one is parsed.
    """
    category_dir = data_dir / category_slug
    required_files = (
        ("Market metadata", category_dir / "market_metadata.json"),
        ("Brand catalog", category_dir / "brand_catalog.json"),
    )

    # Validate presence of everything up front so a missing catalog is
    # reported even when the metadata file itself is unreadable.
    for label, path in required_files:
        if not path.exists():
            raise FileNotFoundError(f"{label} not found: {path}")

    parsed = []
    for _, path in required_files:
        with path.open("r", encoding="utf-8") as handle:
            parsed.append(json.load(handle))
    metadata, catalog = parsed

    first_five_names = [brand["name"] for brand in catalog["brands"][:5]]
    context = {
        "metadata": metadata,
        "catalog": catalog,
        "sample_brands": ", ".join(first_five_names),
    }
    context["market_size"] = metadata["market_size"]["tier_classification"]
    return context
def save_personas(category_slug: str, personas: dict) -> Path:
    """Write the personas document to ``data/{category}/personas.json``.

    The ``data`` directory is resolved three levels above this script. The
    category directory is created when it does not exist yet.

    Args:
        category_slug: Category directory name under ``data``.
        personas: JSON-serialisable personas payload.

    Returns:
        Path of the file that was written.
    """
    category_dir = Path(__file__).parent.parent.parent / "data" / category_slug
    category_dir.mkdir(parents=True, exist_ok=True)

    target = category_dir / "personas.json"
    with target.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps Polish diacritics readable in the file.
        json.dump(personas, handle, ensure_ascii=False, indent=2)
    return target
+""" +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + +from anthropic import Anthropic + +from config import CONFIG, get_target_counts + +BRAINSTORM_PROMPT = """You are a Polish e-commerce buyer with the following persona: + +**{persona_label}** + +Demographics: {demographics} +Pain points: {pain_points} +Decision factors: {decision_factors} +Vocabulary style: {vocabulary} + +Your task: Generate {prompts_per_persona} realistic queries you would type into ChatGPT, Perplexity, or Gemini when researching purchases in the **{category_display_name}** category. + +CRITICAL distribution requirement — your {prompts_per_persona} prompts must be distributed across these 5 types: + +| Type | Share | Example pattern | +|---|---|---| +| **buying** (active purchase intent) | {buying_share}% | "gdzie kupić [product] premium na prezent" | +| **comparison** (decision-stage) | {comparison_share}% | "[Brand A] vs [Brand B] który lepszy" | +| **specific_need** (specific use/attribute) | {specific_share}% | "[product] o [attribute] do [use case]" | +| **informational** (research) | {info_share}% | "co to [product] / jak działa [product]" | +| **brand_direct** (direct brand query) | {brand_share}% | "[Brand X] opinie / recenzje" | + +Rules: +1. Write in **Polish** as your persona would actually phrase queries — colloquial, possibly with typos, possibly informal. +2. Be **specific** — avoid "best soy candles" generic. Add context: occasion, recipient, attribute, comparison. +3. Distribute across types per the percentages — don't put all 30 in "buying." +4. Each prompt must be 5-15 words typical. Avoid 1-word ("świeczki") or 30-word essays. +5. 
def brainstorm_for_persona(
    client: Anthropic,
    persona: dict,
    category_display_name: str,
    sample_brands: str,
    prompts_per_persona: int,
) -> list[dict]:
    """Generate prompts for one persona.

    Fills ``BRAINSTORM_PROMPT`` with the persona's fields and the configured
    type distribution, sends it to the brainstormer model and parses the
    reply as a JSON array.

    Args:
        client: Anthropic client used for the single ``messages.create`` call.
        persona: Persona dict; reads ``label``, ``id``, ``demographics``,
            ``pain_points``, ``decision_factors_ranked`` and ``vocabulary``.
        category_display_name: Human-readable category name for the prompt.
        sample_brands: Comma-separated brand names offered to the model.
        prompts_per_persona: Number of queries requested from the model.

    Returns:
        The list of prompt objects parsed from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
        ValueError: If the reply parses to something other than a JSON array.
    """
    distribution = CONFIG.type_distribution
    # round(), not int(): binary floats make e.g. 0.29 * 100 evaluate to
    # 28.999999999999996, which int() would truncate to 28.
    shares = {
        prompt_type: round(distribution[prompt_type] * 100)
        for prompt_type in (
            "buying",
            "comparison",
            "specific_need",
            "informational",
            "brand_direct",
        )
    }

    prompt = BRAINSTORM_PROMPT.format(
        persona_label=persona["label"],
        persona_id=persona["id"],
        demographics=json.dumps(persona["demographics"], ensure_ascii=False),
        pain_points="; ".join(persona["pain_points"]),
        decision_factors="; ".join(persona["decision_factors_ranked"]),
        vocabulary=json.dumps(persona["vocabulary"], ensure_ascii=False),
        category_display_name=category_display_name,
        prompts_per_persona=prompts_per_persona,
        sample_brands=sample_brands,
        buying_share=shares["buying"],
        comparison_share=shares["comparison"],
        specific_share=shares["specific_need"],
        info_share=shares["informational"],
        brand_share=shares["brand_direct"],
    )

    response = client.messages.create(
        model=CONFIG.brainstormer_model,
        max_tokens=8000,
        messages=[{"role": "user", "content": prompt}],
    )

    text = response.content[0].text.strip()
    # Keep only the outermost JSON array. This tolerates any code-fence
    # flavour ("```json", bare "```") as well as stray prose around the
    # array, none of which the previous prefix/suffix stripping handled.
    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        text = text[start : end + 1]

    prompts = json.loads(text)
    if not isinstance(prompts, list):
        # The caller extends a list with this value; a dict would silently
        # contribute its keys instead of prompt objects.
        raise ValueError(
            f"Expected a JSON array of prompts for persona {persona['id']!r}, "
            f"got {type(prompts).__name__}"
        )
    return prompts
parser.parse_args() + + print(f"[Stage 2] Brainstorming prompts for {args.category}...") + data = brainstorm_all(args.category, args.display_name) + output_path = save_raw_prompts(args.category, data) + print(f"[Stage 2] ✅ Saved {data['total_raw_prompts']} raw prompts to {output_path}") + print(f"[Stage 2] Target distribution: {get_target_counts()}") + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/3_reality_checker.py b/tools/prompt_curation/3_reality_checker.py new file mode 100644 index 0000000..d3188e4 --- /dev/null +++ b/tools/prompt_curation/3_reality_checker.py @@ -0,0 +1,194 @@ +"""Stage 3 — Reality Checker. + +Cross-reference raw prompts against real-world signals: +- Google Trends (PL, past 12 months) +- Reddit search (PL niche subreddits) +- Quora PL questions + +Prompts with zero/marginal real-world signal are flagged for removal. +""" +from __future__ import annotations + +import argparse +import json +import os +import time +from pathlib import Path + +from config import CONFIG + +try: + from pytrends.request import TrendReq + HAS_PYTRENDS = True +except ImportError: + HAS_PYTRENDS = False + +try: + import praw + HAS_PRAW = True +except ImportError: + HAS_PRAW = False + + +def load_raw_prompts(category_slug: str) -> dict: + data_dir = Path(__file__).parent.parent.parent / "data" / category_slug + raw_file = data_dir / "raw_prompts.json" + if not raw_file.exists(): + raise FileNotFoundError( + f"Raw prompts not found: {raw_file}. Run 2_prompt_brainstormer.py first." 
def check_google_trends(prompt: str, pytrends_client) -> dict:
    """Check if prompt phrase has any Google Trends signal in PL.

    Args:
        prompt: Full prompt text; only its first four words are queried.
        pytrends_client: Object exposing ``build_payload`` and
            ``interest_over_time``, or a falsy value when pytrends is not
            available.

    Returns:
        A dict with ``signal`` set to one of ``"skipped"``, ``"present"``
        (average interest above 1), ``"marginal"`` (above 0), ``"none"`` or
        ``"error"``, plus ``volume_estimate`` and, except when skipped,
        ``keyword_tested``. Client failures are reported through the
        ``"error"`` signal rather than raised, so one bad keyword does not
        abort the whole run.
    """
    if not pytrends_client:
        return {"signal": "skipped", "volume_estimate": None, "reason": "pytrends not available"}

    # Take first 4 words as keyword (Trends has 100 char limit, simpler is better)
    keyword = " ".join(prompt.split()[:4])

    try:
        pytrends_client.build_payload(
            kw_list=[keyword],
            cat=0,
            timeframe="today 12-m",
            geo="PL",
        )
        interest = pytrends_client.interest_over_time()
        if interest.empty or keyword not in interest.columns:
            return {"signal": "none", "volume_estimate": 0, "keyword_tested": keyword}

        avg_interest = float(interest[keyword].mean())
    except Exception as exc:
        return {"signal": "error", "volume_estimate": None, "error": str(exc), "keyword_tested": keyword}

    if avg_interest > 1:
        signal = "present"
    elif avg_interest > 0:
        signal = "marginal"
    else:
        # A flat-zero (or all-NaN) series carries no more evidence than an
        # empty frame, so report it the same way instead of "marginal".
        # Normalising to 0.0 also keeps NaN out of the JSON output.
        signal = "none"
        avg_interest = 0.0

    return {
        "signal": signal,
        "volume_estimate": avg_interest,
        "keyword_tested": keyword,
    }
def aggregate_signals(prompt_obj: dict) -> str:
    """Reduce the Trends and Reddit checks to a single verdict.

    Args:
        prompt_obj: Prompt dict that may carry ``google_trends_check`` and
            ``reddit_check`` sub-dicts, each with a ``signal`` key. A missing
            check or missing signal counts as ``"skipped"``.

    Returns:
        ``"pass"`` when either source reports ``"present"``, ``"fail"`` when
        both report ``"none"``, and ``"marginal"`` for everything else
        (including skipped and error states).
    """
    signals = tuple(
        prompt_obj.get(check_name, {}).get("signal", "skipped")
        for check_name in ("google_trends_check", "reddit_check")
    )

    if "present" in signals:
        return "pass"
    # "fail" needs positive evidence of absence from both sources; a skipped
    # or errored source keeps the prompt alive as "marginal".
    if all(signal == "none" for signal in signals):
        return "fail"
    return "marginal"
def save_validated(category_slug: str, data: dict) -> Path:
    """Write the reality-check results to ``validated_prompts.json``.

    The file lands in ``data/{category_slug}`` three levels above this
    script. The directory is not created here.

    Args:
        category_slug: Category directory name under ``data``.
        data: JSON-serialisable result payload.

    Returns:
        Path of the file that was written.
    """
    repo_root = Path(__file__).parent.parent.parent
    output_file = repo_root / "data" / category_slug / "validated_prompts.json"

    with output_file.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    return output_file
+""" +from __future__ import annotations + +import argparse +import asyncio +import json +import os +from pathlib import Path + +from anthropic import AsyncAnthropic + +from config import CONFIG + +AGENT_A_PROMPT = """You are reviewing a list of prompts that buyer personas would supposedly type into ChatGPT/Perplexity/Gemini when researching purchases in **{category}**. + +Your job: identify prompts that DON'T sound natural for any realistic Polish e-commerce buyer. + +Flag prompts that: +1. Are too formal/academic (no buyer phrases queries like a research paper) +2. Are too long (real users don't type 30-word queries) +3. Are too short / generic (single words or 2-word phrases) +4. Use vocabulary no real Polish buyer would use +5. Are buyer-impossible (e.g., asking about specs only B2B buyer would care about, in a B2C context) + +Here are the {prompt_count} prompts to review: + +{prompts_list} + +Output JSON array of flagged prompt IDs (use the index as ID, 0-indexed): + +```json +{{ + "flagged_indices": [3, 7, 12], + "reasons": {{ + "3": "Too formal — no real buyer types like this", + "7": "Single word, no buying intent", + "12": "B2B language in B2C context" + }} +}} +``` + +Only output JSON. No prose.""" + + +AGENT_B_PROMPT = """You are a methodology critic for a Polish e-commerce AI visibility ranking project. + +Review this prompt list for **statistical and structural issues**: + +Target distribution per the methodology: +- buying: 30% (weight 2.0) +- comparison: 25% (weight 1.5) +- specific_need: 20% (weight 1.5) +- informational: 15% (weight 0.3) +- brand_direct: 10% (weight 0.3) + +Total prompts: {prompt_count} + +Flag issues: +1. Type distribution off by >10% from target +2. Vocabulary too repetitive (same phrases recurring) +3. Subcategory bias (e.g., 80% prompts about prezenty, 20% about everything else) +4. Length distribution unreasonable (all prompts are very long or very short) +5. 
Missing realistic buyer scenarios (e.g., no prompts about specific occasions, sizes, attributes) + +Prompts list: + +{prompts_list} + +Output: + +```json +{{ + "flagged_indices": [...], + "reasons": {{...}}, + "structural_issues": [ + "Type 'comparison' is over-represented at 35% (target 25%)", + "20+ prompts mention 'prezent dla mamy' — too repetitive" + ] +}} +``` + +Only JSON output.""" + + +AGENT_C_PROMPT = """You are a vendor exploit hunter for a Polish e-commerce AI visibility ranking. + +Your job: identify prompts that are TOO EASY for a vendor to game by content marketing fluff. + +A prompt is "exploitable" if: +1. The answer can be dominated by writing one good blog post +2. The answer comes primarily from Wikipedia (vendors can edit Wikipedia) +3. The answer is brand-agnostic (any vendor can position to win it via SEO content) +4. The prompt would be answered by listing Wikipedia / blog content rather than specific brand recommendations + +We WANT prompts where: +- AI must recommend specific brands (with real reviews, real authority, multi-source citation) +- Prompt requires real product positioning, not just content production +- Multiple sources (reviews, Reddit, brand sites) need to align for ranking + +Flag prompts that are too gameable: + +{prompts_list} + +Output: + +```json +{{ + "flagged_indices": [...], + "reasons": {{ + "5": "Generic 'co to świeca sojowa' — easily gamed by Wikipedia + blog post", + "12": "Brand-agnostic 'jak działa świeca sojowa' — content marketing fluff target" + }} +}} +``` + +Only JSON output.""" + + +async def run_agent(client: AsyncAnthropic, prompt: str) -> dict: + """Single agent call.""" + response = await client.messages.create( + model=CONFIG.critic_models["real_buyer_critique"], + max_tokens=4000, + messages=[{"role": "user", "content": prompt}], + ) + text = response.content[0].text.strip() + if text.startswith("```json"): + text = text[7:] + if text.endswith("```"): + text = text[:-3] + return 
def aggregate_flags(
    critic_results: dict,
    total_prompts: int,
    removal_threshold: int | None = None,
) -> dict:
    """Count how many agents flagged each prompt index.

    Critic output is model-generated JSON, so it is normalised before it is
    counted:

    * indices given as digit strings (``"3"``) are converted to ``int`` so
      they match the integer positions callers use for filtering;
    * an index repeated by the same critic counts once, so a single critic
      cannot reach the removal threshold on its own;
    * indices outside ``0 .. total_prompts - 1`` and non-integer entries are
      ignored, keeping ``total_removed`` / ``total_kept`` consistent;
    * a critic result that is not a dict, or whose ``flagged_indices`` /
      ``reasons`` have the wrong type, contributes no flags instead of
      crashing.

    Args:
        critic_results: Mapping of agent name to that agent's parsed reply
            (expects ``flagged_indices`` and optionally ``reasons`` keyed by
            the index as a string).
        total_prompts: Number of prompts that were reviewed.
        removal_threshold: Minimum number of distinct critics that must flag
            a prompt for it to be removed. Defaults to
            ``CONFIG.flagged_by_n_critics_to_remove``.

    Returns:
        Dict with ``flag_counts_by_prompt`` (index -> list of
        ``"agent: reason"`` strings), the sorted ``flagged_for_removal``
        indices, the threshold used and the removed/kept totals.
    """
    if removal_threshold is None:
        removal_threshold = CONFIG.flagged_by_n_critics_to_remove

    flag_counts: dict[int, list[str]] = {}

    for agent_name, result in critic_results.items():
        if not isinstance(result, dict):
            continue

        flagged = result.get("flagged_indices")
        if not isinstance(flagged, list):
            flagged = []
        reasons = result.get("reasons")
        if not isinstance(reasons, dict):
            reasons = {}

        seen_for_agent: set[int] = set()
        for raw_idx in flagged:
            if isinstance(raw_idx, str) and raw_idx.strip().isdigit():
                idx = int(raw_idx)
            elif isinstance(raw_idx, int) and not isinstance(raw_idx, bool):
                idx = raw_idx
            else:
                continue

            if not 0 <= idx < total_prompts or idx in seen_for_agent:
                continue
            seen_for_agent.add(idx)

            reason = reasons.get(str(idx), "no reason given")
            flag_counts.setdefault(idx, []).append(f"{agent_name}: {reason}")

    flagged_for_removal = sorted(
        idx for idx, entries in flag_counts.items()
        if len(entries) >= removal_threshold
    )

    return {
        "flag_counts_by_prompt": flag_counts,
        "flagged_for_removal": flagged_for_removal,
        "removal_threshold_critics": removal_threshold,
        "total_prompts": total_prompts,
        "total_removed": len(flagged_for_removal),
        "total_kept": total_prompts - len(flagged_for_removal),
    }
parser.add_argument("--category", required=True) + parser.add_argument("--display-name", required=True) + args = parser.parse_args() + + data_dir = Path(__file__).parent.parent.parent / "data" / args.category + validated_file = data_dir / "validated_prompts.json" + if not validated_file.exists(): + raise FileNotFoundError(f"Run 3_reality_checker.py first. Missing: {validated_file}") + + with open(validated_file, "r", encoding="utf-8") as f: + validated_data = json.load(f) + + # Filter out reality-check failures first + candidates = [p for p in validated_data["validated_prompts"] if p["reality_signal"] != "fail"] + print(f"[Stage 4] Reviewing {len(candidates)} prompts (post-reality-check)...") + + critic_results = asyncio.run(run_three_critics(candidates, args.display_name)) + aggregation = aggregate_flags(critic_results, len(candidates)) + + output = { + "category": args.category, + "input_count": len(candidates), + "critic_results": critic_results, + "aggregation": aggregation, + "kept_prompts": [ + p for i, p in enumerate(candidates) if i not in aggregation["flagged_for_removal"] + ], + } + + output_file = data_dir / "critic_review.json" + with open(output_file, "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2) + + print(f"[Stage 4] ✅ Saved {output_file}") + print(f"[Stage 4] Removed: {aggregation['total_removed']}, Kept: {aggregation['total_kept']}") + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/5_pilot_test_runner.py b/tools/prompt_curation/5_pilot_test_runner.py new file mode 100644 index 0000000..7556179 --- /dev/null +++ b/tools/prompt_curation/5_pilot_test_runner.py @@ -0,0 +1,219 @@ +"""Stage 5 — Pilot Test Runner. + +Run a sample of N prompts (default 10) through 3 LLM models in parallel. 
async def query_perplexity(prompt: str) -> dict:
    """Query Perplexity Sonar Pro.

    Sends one chat-completions request authenticated with the
    ``PERPLEXITY_API_KEY`` environment variable.

    Args:
        prompt: User query text sent as the only message.

    Returns:
        ``{"model": "perplexity", "response": <text>, "ok": True}`` on
        success. On any failure (missing key, transport error, HTTP error
        status, unexpected payload) returns ``ok: False`` with
        ``response: None`` and the error text, so one failing model does not
        abort the pilot run.
    """
    try:
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(
                "https://api.perplexity.ai/chat/completions",
                headers={
                    "Authorization": f"Bearer {os.environ['PERPLEXITY_API_KEY']}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "sonar-pro",
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 600,
                },
            )
            # Without this, a 401/429/5xx body is parsed as if it were a
            # completion and fails with an opaque KeyError('choices'); raising
            # here records the actual HTTP status in the error field.
            response.raise_for_status()
            data = response.json()
            text = data["choices"][0]["message"]["content"]
            return {"model": "perplexity", "response": text, "ok": True}
    except Exception as exc:
        return {"model": "perplexity", "response": None, "ok": False, "error": str(exc)}
def evaluate_response_quality(test_result: dict, brand_catalog: dict) -> dict:
    """Detect bad signals in responses.

    Signals collected, in order:

    * ``less_than_2_models_responded`` - fewer than two responses are ``ok``;
    * ``{model}_uncertain_short`` - a reply under 200 characters containing
      one of the hedging phrases;
    * ``{model}_no_brands_mentioned`` - no catalog brand (by name or alias)
      occurs in the reply;
    * ``zero_brand_overlap_across_models`` - at least two models named
      brands, yet no brand is common to all of them.

    Brand matching is a case-insensitive substring search. Aliases are
    resolved to their brand's lower-cased name before comparison, so two
    models that refer to the same brand by different aliases count as
    agreeing. Models that named no brand are excluded from the overlap
    check: they are already reported by their own signal, and including
    them would turn that one observation into an automatic reject.

    Side effect: every successful response dict gets a ``brands_detected``
    list (sorted lower-cased brand names, at most 10).

    Args:
        test_result: Dict with ``model_responses`` mapping model name to a
            response dict (``ok``, ``response``).
        brand_catalog: Dict with a ``brands`` list; each brand has ``name``
            and optionally ``aliases``.

    Returns:
        ``{"bad_signals": [...], "verdict": ...}`` where the verdict is
        ``"reject"`` for two or more signals, ``"flag"`` for one and
        ``"pass"`` for none.
    """
    bad_signals = []

    # Lower-cased search string (name or alias) -> lower-cased brand name.
    needle_to_brand: dict[str, str] = {}
    for brand in brand_catalog["brands"]:
        canonical = brand["name"].lower()
        for raw in (brand["name"], *(brand.get("aliases") or [])):
            needle = raw.lower()
            # An empty needle is a substring of every text and would mark the
            # brand as mentioned everywhere.
            if needle.strip():
                needle_to_brand.setdefault(needle, canonical)

    responses = test_result["model_responses"]
    successful = [r for r in responses.values() if r.get("ok")]

    if len(successful) < 2:
        bad_signals.append("less_than_2_models_responded")

    # "I don't know" patterns
    idk_patterns = ("nie wiem", "trudno powiedzieć", "to zależy od", "preferencje")

    brands_per_model: dict[str, set[str]] = {}
    for model_name, r in responses.items():
        if not r.get("ok"):
            continue
        text = (r.get("response") or "").lower()

        if len(text) < 200 and any(pattern in text for pattern in idk_patterns):
            bad_signals.append(f"{model_name}_uncertain_short")

        detected = {
            brand for needle, brand in needle_to_brand.items() if needle in text
        }
        brands_per_model[model_name] = detected
        # Sorted so the stored sample is deterministic; the full set (not
        # this truncated list) feeds the overlap check below.
        r["brands_detected"] = sorted(detected)[:10]

        if not detected:
            bad_signals.append(f"{model_name}_no_brands_mentioned")

    # Cross-model overlap, among the models that actually named brands.
    naming_models = [brands for brands in brands_per_model.values() if brands]
    if len(naming_models) >= 2 and not set.intersection(*naming_models):
        bad_signals.append("zero_brand_overlap_across_models")

    return {
        "bad_signals": bad_signals,
        "verdict": "reject" if len(bad_signals) >= 2 else ("flag" if bad_signals else "pass"),
    }
brand_catalog) + test_result["evaluation"] = evaluation + pilot_results.append(test_result) + + pass_count = sum(1 for r in pilot_results if r["evaluation"]["verdict"] == "pass") + flag_count = sum(1 for r in pilot_results if r["evaluation"]["verdict"] == "flag") + reject_count = sum(1 for r in pilot_results if r["evaluation"]["verdict"] == "reject") + + return { + "category": category_slug, + "sample_size": len(sampled), + "summary": {"pass": pass_count, "flag": flag_count, "reject": reject_count}, + "pilot_results": pilot_results, + } + + +def save_pilot_results(category_slug: str, data: dict) -> Path: + output_file = ( + Path(__file__).parent.parent.parent / "data" / category_slug / "pilot_test_results.json" + ) + with open(output_file, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + return output_file + + +def main(): + parser = argparse.ArgumentParser(description="Pilot test sample prompts on 3 models.") + parser.add_argument("--category", required=True) + args = parser.parse_args() + + print(f"[Stage 5] Pilot testing {args.category}...") + data = asyncio.run(run_pilot_test(args.category)) + output_path = save_pilot_results(args.category, data) + print(f"[Stage 5] ✅ Saved {output_path}") + print(f"[Stage 5] Summary: {data['summary']}") + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/6_human_review_export.py b/tools/prompt_curation/6_human_review_export.py new file mode 100644 index 0000000..3620425 --- /dev/null +++ b/tools/prompt_curation/6_human_review_export.py @@ -0,0 +1,133 @@ +"""Stage 6 — Human Review Export. + +Export remaining candidate prompts to a CSV that the founder + category expert +can review in a spreadsheet. Each row has columns for accept/reject/edit decisions. 
+""" +from __future__ import annotations + +import argparse +import csv +import json +from pathlib import Path + +from config import CONFIG, get_target_counts + + +def load_kept_prompts(category_slug: str) -> list[dict]: + data_dir = Path(__file__).parent.parent.parent / "data" / category_slug + + # Take output from Stage 4 (filtered by critics) — Stage 5 was just a sample test + critic_file = data_dir / "critic_review.json" + if not critic_file.exists(): + raise FileNotFoundError(f"Run 4_validation_agents.py first. Missing: {critic_file}") + + with open(critic_file, "r", encoding="utf-8") as f: + critic_data = json.load(f) + + return critic_data["kept_prompts"] + + +def export_to_csv(category_slug: str, prompts: list[dict]) -> Path: + output_file = ( + Path(__file__).parent.parent.parent / "data" / category_slug / "for_human_review.csv" + ) + + target_counts = get_target_counts() + + # Group prompts by type for easier review + by_type: dict[str, list[dict]] = {} + for p in prompts: + by_type.setdefault(p["type"], []).append(p) + + with open(output_file, "w", encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL) + writer.writerow([ + "row_id", + "type", + "type_target_count", + "prompt", + "persona_id", + "decision", # APPROVE / REJECT / EDIT + "edited_prompt", # If decision == EDIT, write new version here + "notes", + ]) + + row_id = 0 + for ptype, type_prompts in sorted(by_type.items()): + target = target_counts.get(ptype, 0) + for p in type_prompts: + row_id += 1 + writer.writerow([ + row_id, + ptype, + target, + p["prompt"], + p.get("persona_id", ""), + "", # decision — fill in + "", # edited prompt — fill if needed + "", # notes + ]) + + return output_file + + +def export_summary_md(category_slug: str, prompts: list[dict]) -> Path: + """Write human-readable summary.""" + output_file = ( + Path(__file__).parent.parent.parent / "data" / category_slug / "for_human_review_summary.md" + ) + + target_counts = 
get_target_counts() + by_type: dict[str, int] = {} + for p in prompts: + by_type[p["type"]] = by_type.get(p["type"], 0) + 1 + + with open(output_file, "w", encoding="utf-8") as f: + f.write(f"# Human Review — {category_slug}\n\n") + f.write(f"**Total candidates after Stages 1-5: {len(prompts)}**\n\n") + f.write(f"**Target final pool: {CONFIG.final_pool_size}**\n\n") + f.write("## Distribution check\n\n") + f.write("| Type | Candidates | Target | Status |\n") + f.write("|---|---|---|---|\n") + for ptype in ["buying", "comparison", "specific_need", "informational", "brand_direct"]: + count = by_type.get(ptype, 0) + target = target_counts.get(ptype, 0) + status = "✅" if count >= target * 1.2 else ("⚠️" if count >= target else "❌ too few") + f.write(f"| {ptype} | {count} | {target} | {status} |\n") + + f.write("\n## Review process\n\n") + f.write("1. Open `for_human_review.csv` in spreadsheet\n") + f.write("2. For each row, fill `decision` column with: `APPROVE`, `REJECT`, or `EDIT`\n") + f.write("3. If `EDIT`, write new version in `edited_prompt` column\n") + f.write(f"4. Aim to APPROVE ~{CONFIG.final_pool_size} prompts total, balanced per target distribution\n") + f.write("5. Save as `for_human_review_decided.csv`\n") + f.write(f"6.
Run `python 7_finalize.py --category {category_slug}` to produce final closed pool\n\n") + + f.write("## Tips\n\n") + f.write("- If a type has too few candidates, you may need to edit some from over-represented types to fit\n") + f.write("- Watch for repetitive vocabulary — if 5 prompts say 'gdzie kupić premium prezent' similar, vary or reject most\n") + f.write("- For brand_direct prompts, ensure each major brand from `brand_catalog.json` has at least 1 prompt directed at it\n") + + return output_file + + +def main(): + parser = argparse.ArgumentParser(description="Export prompts for human review.") + parser.add_argument("--category", required=True) + args = parser.parse_args() + + print(f"[Stage 6] Exporting prompts for human review: {args.category}...") + prompts = load_kept_prompts(args.category) + csv_path = export_to_csv(args.category, prompts) + summary_path = export_summary_md(args.category, prompts) + + print(f"[Stage 6] ✅ Exported {len(prompts)} prompts") + print(f"[Stage 6] CSV: {csv_path}") + print(f"[Stage 6] Summary: {summary_path}") + print() + print("Next: open the CSV, fill decision column (APPROVE/REJECT/EDIT), save as 'for_human_review_decided.csv',") + print(f"then run: python 7_finalize.py --category {args.category}") + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/7_finalize.py b/tools/prompt_curation/7_finalize.py new file mode 100644 index 0000000..b405fe7 --- /dev/null +++ b/tools/prompt_curation/7_finalize.py @@ -0,0 +1,158 @@ +"""Stage 7 — Finalize. + +Read the human-decided CSV (`for_human_review_decided.csv`) and produce the +final closed prompt pool: `prompts/{category}/v{N}.json` (gitignored). 
+""" +from __future__ import annotations + +import argparse +import csv +import json +from datetime import datetime, timezone +from pathlib import Path + +from config import CONFIG, get_target_counts + + +def load_decided_csv(category_slug: str) -> list[dict]: + decided_file = ( + Path(__file__).parent.parent.parent + / "data" + / category_slug + / "for_human_review_decided.csv" + ) + if not decided_file.exists(): + raise FileNotFoundError( + f"Decided CSV not found: {decided_file}. " + f"Did you fill out 'for_human_review.csv' and rename to 'for_human_review_decided.csv'?" + ) + + rows = [] + with open(decided_file, "r", encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f, delimiter=";") + for row in reader: + rows.append(row) + return rows + + +def determine_next_version(category_slug: str) -> str: + """Find next version number (v1, v2, ...) for closed prompts.""" + prompts_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug + if not prompts_dir.exists(): + prompts_dir.mkdir(parents=True, exist_ok=True) + return "v1" + + existing_versions = [] + for f in prompts_dir.glob("v*.json"): + try: + num = int(f.stem.replace("v", "")) + existing_versions.append(num) + except ValueError: + continue + + next_num = (max(existing_versions) + 1) if existing_versions else 1 + return f"v{next_num}" + + +def finalize(category_slug: str) -> tuple[Path, dict]: + rows = load_decided_csv(category_slug) + + approved_prompts = [] + rejected_count = 0 + edited_count = 0 + + for row in rows: + decision = (row.get("decision") or "").strip().upper() + if decision == "REJECT": + rejected_count += 1 + continue + if decision == "EDIT": + edited_text = (row.get("edited_prompt") or "").strip() + if not edited_text: + print(f"[Stage 7] ⚠ Row {row.get('row_id')} marked EDIT but no edited_prompt — skipping") + continue + approved_prompts.append({ + "prompt": edited_text, + "type": row["type"], + "persona_id": row.get("persona_id", ""), + "edited_from": row.get("prompt", ""), +
}) + edited_count += 1 + elif decision == "APPROVE": + approved_prompts.append({ + "prompt": row["prompt"], + "type": row["type"], + "persona_id": row.get("persona_id", ""), + }) + # Empty decision = treat as not yet decided, skip + + # Distribution check + by_type: dict[str, int] = {} + for p in approved_prompts: + by_type[p["type"]] = by_type.get(p["type"], 0) + 1 + + target_counts = get_target_counts() + + distribution_warnings = [] + for ptype, target in target_counts.items(): + actual = by_type.get(ptype, 0) + if actual < target * 0.7: + distribution_warnings.append(f"{ptype}: only {actual} approved (target {target})") + elif actual > target * 1.3: + distribution_warnings.append(f"{ptype}: too many — {actual} approved (target {target})") + + version = determine_next_version(category_slug) + output_dir = Path(__file__).parent.parent.parent / "prompts" / category_slug + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / f"{version}.json" + + final_pool = { + "category": category_slug, + "version": version, + "finalized_at": datetime.now(timezone.utc).isoformat(), + "total_prompts": len(approved_prompts), + "distribution": by_type, + "target_distribution": target_counts, + "rejected_count": rejected_count, + "edited_count": edited_count, + "distribution_warnings": distribution_warnings, + "prompts": approved_prompts, + } + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(final_pool, f, ensure_ascii=False, indent=2) + + return output_file, final_pool + + +def main(): + parser = argparse.ArgumentParser(description="Finalize prompt pool from human-decided CSV.") + parser.add_argument("--category", required=True) + args = parser.parse_args() + + print(f"[Stage 7] Finalizing prompt pool for {args.category}...") + output_file, summary = finalize(args.category) + + print(f"[Stage 7] ✅ Saved {output_file}") + print(f"[Stage 7] Total approved: {summary['total_prompts']}") + print(f"[Stage 7] Rejected: {summary['rejected_count']}, 
Edited: {summary['edited_count']}") + print(f"[Stage 7] Distribution: {summary['distribution']}") + + if summary["distribution_warnings"]: + print() + print("⚠ Distribution warnings:") + for w in summary["distribution_warnings"]: + print(f" - {w}") + print() + print("Consider re-running Stage 6 to add/remove prompts to fix distribution.") + else: + print() + print(f"✅ Distribution looks good! Final pool ready at:") + print(f" {output_file}") + print() + print("This file is in `prompts/` directory which is gitignored — exact strings remain CLOSED.") + print("Public reference (illustrative examples) lives in `prompts/example-{category}.md`.") + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/README.md b/tools/prompt_curation/README.md new file mode 100644 index 0000000..e50f600 --- /dev/null +++ b/tools/prompt_curation/README.md @@ -0,0 +1,97 @@ +# Prompt Curation Pipeline + +> Multi-stage pipeline for curating production prompts per category. Translates the 6-stage methodology process from `prompts/README.md` into runnable code. + +--- + +## Pipeline overview + +``` +1_persona_generator.py → data/{category}/personas.json +2_prompt_brainstormer.py → data/{category}/raw_prompts.json +3_reality_checker.py → data/{category}/validated_prompts.json +4_validation_agents.py → data/{category}/critic_review.json +5_pilot_test_runner.py → data/{category}/pilot_test_results.json +6_human_review_export.py → data/{category}/for_human_review.csv +7_finalize.py → prompts/{category}/v{N}.json (CLOSED) +``` + +Each stage is idempotent and can be re-run with cached intermediate outputs. 
+ +## Tech stack + +- **Python 3.12+** +- **Anthropic SDK** (`anthropic>=0.50.0`) — Claude for persona generation, brainstorming, critic agents +- **OpenAI SDK** (`openai>=1.50.0`) — GPT-4o-search for pilot test runs +- **Google Generative AI** (`google-generativeai>=0.8.0`) — Gemini for pilot test runs +- **httpx** for Perplexity API +- **csv** (Python standard library) for CSV export to human reviewer +- **pytrends** for Google Trends API (free, unofficial) +- **praw** for Reddit search (requires Reddit OAuth app) + +## Usage + +```bash +# 1. Set up environment variables (see .env.example) +cp .env.example .env +# Edit .env with API keys + +# 2. Run pipeline for a category +python pipeline.py --category swiece-sojowe-pl --display-name "Świece sojowe PL" + +# Or run individual stages +python 1_persona_generator.py --category swiece-sojowe-pl +python 2_prompt_brainstormer.py --category swiece-sojowe-pl +# ... etc. + +# 3. After Stage 6, review CSV manually + approve in human_review tool +python 6_human_review_export.py --category swiece-sojowe-pl +# Open data/{category}/for_human_review.csv in spreadsheet +# Mark approved/rejected/edited +# Save back as for_human_review_decided.csv + +# 4. Finalize +python 7_finalize.py --category swiece-sojowe-pl +# Outputs: prompts/{category}/v1.json (gitignored, closed) +``` + +## Cost per category (estimated) + +| Stage | API used | Cost | +|---|---|---| +| 1 — Persona Generator | Claude Sonnet | ~$0.50 | +| 2 — Prompt Brainstormer | Claude Sonnet | ~$1.50 | +| 3 — Reality Checker | Free APIs (Trends, Reddit, Quora) | $0 | +| 4 — Validation Agents (3 critics) | Claude Sonnet × 3 | ~$3 | +| 5 — Pilot Test Runner (10 prompts × 3 models) | GPT-4o + Perplexity + Gemini | ~$5 | +| 6 — Human Review Export | (no API) | $0 | +| 7 — Finalize | (no API) | $0 | +| **TOTAL** | | **~$10** | + +For 11 pilot categories: ~$110.
+ +## Configuration + +See `config.py` for tunable parameters per stage: +- Number of personas (default: 7) +- Prompts per persona (default: 30) +- Type distribution targets (30/25/20/15/10 weights → buying/comparison/specific/info/brand-direct) +- Pilot sample size (default: 10) +- Critic agent thresholds (flagged-by-N agents → remove) + +## Quarterly rotation mode + +```bash +python pipeline.py --category swiece-sojowe-pl --mode rotation +``` + +In rotation mode (planned — `pipeline.py` does not accept `--mode` yet): +- Reads existing `prompts/{category}/v{N}.json` +- Identifies 20 prompts with lowest real-world signal in past 90 days (via Stage 3 scan) +- Generates 20 replacements (Stages 1–5 for refresh set) +- Outputs `prompts/{category}/v{N+1}.json` (CLOSED) +- Logs swap decisions to `prompts/{category}/rotation_log.md` (CLOSED) + +--- + +**Status:** v0.1 — initial scaffold. Implementation in progress as part of Citee Index pilot phase (May–August 2026). diff --git a/tools/prompt_curation/config.py b/tools/prompt_curation/config.py new file mode 100644 index 0000000..28f65e5 --- /dev/null +++ b/tools/prompt_curation/config.py @@ -0,0 +1,86 @@ +"""Pipeline configuration — tunable per category and stage.""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal + +PromptType = Literal["buying", "comparison", "specific_need", "informational", "brand_direct"] + + +@dataclass +class PipelineConfig: + """Default config — override via command-line args or per-category YAML.""" + + # Stage 1 — Persona Generator + num_personas: int = 7 + persona_model: str = "claude-sonnet-4-6" + + # Stage 2 — Prompt Brainstormer + prompts_per_persona: int = 30 + brainstormer_model: str = "claude-sonnet-4-6" + type_distribution: dict[PromptType, float] = field( + default_factory=lambda: { + "buying": 0.30, + "comparison": 0.25, + "specific_need": 0.20, + "informational": 0.15, + "brand_direct": 0.10, + } + ) + + # Stage 3 — Reality Checker + google_trends_min_volume: int = 1 # PL
queries per month, minimum signal + reddit_min_organic_mentions: int = 3 + fallback_to_quora_if_no_signal: bool = True + + # Stage 4 — Validation Agents + flagged_by_n_critics_to_remove: int = 2 # Remove if 2+ agents flag it + critic_models: dict[str, str] = field( + default_factory=lambda: { + "real_buyer_critique": "claude-sonnet-4-6", + "methodology_critic": "claude-sonnet-4-6", + "vendor_exploit_hunter": "claude-sonnet-4-6", + } + ) + + # Stage 5 — Pilot Test Runner + pilot_sample_size: int = 10 + pilot_models: list[str] = field( + default_factory=lambda: [ + "gpt-4o-search", + "perplexity-sonar-pro", + "gemini-pro", + ] + ) + repetitions_per_prompt: int = 1 # In pilot test only, production uses 2+ + + # Final pool size after all filtering + final_pool_size: int = 100 + + # Output paths + data_dir: str = "data" # Public stage outputs + prompts_dir: str = "../../prompts" # Closed final prompts (gitignored) + + +CONFIG = PipelineConfig() + + +# Type distribution as integer counts for final pool +def get_target_counts(config: PipelineConfig = CONFIG) -> dict[PromptType, int]: + """Return integer counts per prompt type for final pool of `final_pool_size`.""" + counts = { + ptype: int(round(config.final_pool_size * pct)) + for ptype, pct in config.type_distribution.items() + } + # Adjust rounding to ensure sum == final_pool_size + total = sum(counts.values()) + if total != config.final_pool_size: + # Adjust the largest category to absorb difference + largest_type = max(counts, key=lambda k: counts[k]) + counts[largest_type] += config.final_pool_size - total + return counts + + +if __name__ == "__main__": + print(f"Final pool size: {CONFIG.final_pool_size}") + print(f"Target counts per type: {get_target_counts()}") diff --git a/tools/prompt_curation/pipeline.py b/tools/prompt_curation/pipeline.py new file mode 100644 index 0000000..ed21e5e --- /dev/null +++ b/tools/prompt_curation/pipeline.py @@ -0,0 +1,189 @@ +"""Citee Index — Prompt Curation Pipeline orchestrator. 
+ +Runs all 6 stages sequentially. Stage 7 (finalize) requires manual human review +between Stage 6 and Stage 7, so this pipeline stops after Stage 6 and prints +instructions for the human reviewer. + +Usage: + python pipeline.py --category swiece-sojowe-pl --display-name "Świece sojowe PL" + +Or run individual stages by importing: + from pipeline import run_stage + run_stage(1, category, display_name) +""" +from __future__ import annotations + +import argparse +import importlib.util +import os +import sys +from pathlib import Path + +# Load each stage as a module +STAGE_FILES = { + 1: "1_persona_generator.py", + 2: "2_prompt_brainstormer.py", + 3: "3_reality_checker.py", + 4: "4_validation_agents.py", + 5: "5_pilot_test_runner.py", + 6: "6_human_review_export.py", + 7: "7_finalize.py", +} + + +def load_stage_module(stage_num: int): + """Dynamically load a stage script (filenames start with digits, not Python-importable normally).""" + stage_file = Path(__file__).parent / STAGE_FILES[stage_num] + spec = importlib.util.spec_from_file_location(f"stage_{stage_num}", stage_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def check_prerequisites(): + """Verify required env vars and brand catalog exist.""" + required_env = ["ANTHROPIC_API_KEY"] + missing = [v for v in required_env if not os.environ.get(v)] + if missing: + print(f"❌ Missing required environment variables: {missing}") + print(f" Copy .env.example to .env and fill in API keys.") + sys.exit(1) + + optional_env = { + "OPENAI_API_KEY": "Stage 5 (pilot test runner)", + "GOOGLE_API_KEY": "Stage 5 (pilot test runner)", + "PERPLEXITY_API_KEY": "Stage 5 (pilot test runner)", + "REDDIT_CLIENT_ID": "Stage 3 (reality checker — Reddit signal)", + } + for env, stage in optional_env.items(): + if not os.environ.get(env): + print(f"⚠ Optional env var missing: {env} (used by {stage}). 
Stage will skip Reddit/multi-model checks.") + + +def run_pipeline(category: str, display_name: str, skip_pilot: bool = False, skip_reality: bool = False): + """Run stages 1-6 sequentially. Stage 7 requires human action between 6 and 7.""" + + check_prerequisites() + + # Verify brand catalog exists + catalog_file = Path(__file__).parent.parent.parent / "data" / category / "brand_catalog.json" + if not catalog_file.exists(): + print(f"❌ Brand catalog not found: {catalog_file}") + print(f" Create one before running pipeline. See data/README.md for schema.") + sys.exit(1) + + print("=" * 60) + print(f"Citee Index — Prompt Curation Pipeline") + print(f"Category: {category}") + print(f"Display name: {display_name}") + print("=" * 60) + print() + + # Stage 1 + print(">>> STAGE 1: Persona Generator") + stage_1 = load_stage_module(1) + personas = stage_1.generate_personas(category, display_name) + stage_1.save_personas(category, personas) + print(f"✅ Generated {len(personas['personas'])} personas") + print() + + # Stage 2 + print(">>> STAGE 2: Prompt Brainstormer") + stage_2 = load_stage_module(2) + raw_data = stage_2.brainstorm_all(category, display_name) + stage_2.save_raw_prompts(category, raw_data) + print(f"✅ Brainstormed {raw_data['total_raw_prompts']} raw prompts") + print() + + # Stage 3 (skippable in some scenarios) + if not skip_reality: + print(">>> STAGE 3: Reality Checker (Google Trends + Reddit)") + stage_3 = load_stage_module(3) + validated = stage_3.check_all_prompts(category) + stage_3.save_validated(category, validated) + print(f"✅ Reality check: {validated['summary']}") + print() + else: + print(">>> STAGE 3: SKIPPED (--skip-reality flag)") + # Copy raw to validated as fallback + import shutil + data_dir = Path(__file__).parent.parent.parent / "data" / category + shutil.copy(data_dir / "raw_prompts.json", data_dir / "validated_prompts.json") + print() + + # Stage 4 + print(">>> STAGE 4: Multi-agent Validation") + stage_4 = load_stage_module(4) + # 
stage_4 has async main, run via subprocess pattern + import asyncio + import json + data_dir = Path(__file__).parent.parent.parent / "data" / category + with open(data_dir / "validated_prompts.json", "r", encoding="utf-8") as f: + validated_data = json.load(f) + candidates = [p for p in validated_data["validated_prompts"] if p.get("reality_signal", "pass") != "fail"] + critic_results = asyncio.run(stage_4.run_three_critics(candidates, display_name)) + aggregation = stage_4.aggregate_flags(critic_results, len(candidates)) + output = { + "category": category, + "input_count": len(candidates), + "critic_results": critic_results, + "aggregation": aggregation, + "kept_prompts": [ + p for i, p in enumerate(candidates) if i not in aggregation["flagged_for_removal"] + ], + } + with open(data_dir / "critic_review.json", "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2) + print(f"✅ Critics review: removed {aggregation['total_removed']}, kept {aggregation['total_kept']}") + print() + + # Stage 5 (optional — pilot test costs API) + if not skip_pilot: + print(">>> STAGE 5: Pilot Test Runner (sample 10 prompts × 3 models)") + stage_5 = load_stage_module(5) + pilot_data = asyncio.run(stage_5.run_pilot_test(category)) + stage_5.save_pilot_results(category, pilot_data) + print(f"✅ Pilot test: {pilot_data['summary']}") + print() + else: + print(">>> STAGE 5: SKIPPED (--skip-pilot flag)") + print() + + # Stage 6 + print(">>> STAGE 6: Human Review Export") + stage_6 = load_stage_module(6) + prompts_for_review = stage_6.load_kept_prompts(category) + csv_path = stage_6.export_to_csv(category, prompts_for_review) + summary_path = stage_6.export_summary_md(category, prompts_for_review) + print(f"✅ Exported {len(prompts_for_review)} prompts for human review") + print(f" CSV: {csv_path}") + print(f" Summary: {summary_path}") + print() + + print("=" * 60) + print("PIPELINE COMPLETE — Stages 1-6 done.") + print("=" * 60) + print() + print("NEXT STEPS 
(manual):") + print(f"1. Open {csv_path} in spreadsheet") + print(f"2. Fill `decision` column for each row: APPROVE / REJECT / EDIT") + print(f"3. If EDIT, fill `edited_prompt` column") + print(f"4. Save as `for_human_review_decided.csv`") + print(f"5. Run: python 7_finalize.py --category {category}") + print() + print(f"This produces final closed pool: prompts/{category}/v1.json (gitignored)") + + +def main(): + parser = argparse.ArgumentParser(description="Citee prompt curation pipeline orchestrator.") + parser.add_argument("--category", required=True, help="Category slug (e.g., 'swiece-sojowe-pl')") + parser.add_argument("--display-name", required=True, help="Human-readable category name") + parser.add_argument("--skip-pilot", action="store_true", help="Skip Stage 5 (saves API cost)") + parser.add_argument("--skip-reality", action="store_true", help="Skip Stage 3 (no Google Trends/Reddit check)") + args = parser.parse_args() + + run_pipeline(args.category, args.display_name, args.skip_pilot, args.skip_reality) + + +if __name__ == "__main__": + main() diff --git a/tools/prompt_curation/requirements.txt b/tools/prompt_curation/requirements.txt new file mode 100644 index 0000000..2d67dc5 --- /dev/null +++ b/tools/prompt_curation/requirements.txt @@ -0,0 +1,6 @@ +anthropic>=0.50.0 +openai>=1.50.0 +httpx>=0.27.0 +pytrends>=4.9.0 +praw>=7.7.0 +python-dotenv>=1.0.0