Prompt Evaluation Basics
Prompt engineering without evaluation is guesswork. You might improve a prompt based on intuition, see better results on your test case, and ship it — only to find it regresses on cases you didn't test. Systematic evaluation turns prompt development from art into engineering.
What Is an Eval?
An eval (short for evaluation) is a structured test of how well a prompt performs across a range of inputs. At minimum, an eval needs:
- A dataset: A collection of inputs representative of real-world usage
- Expected outputs (or evaluation criteria): What does a good response look like?
- A scoring function: How do you measure quality numerically?
- An aggregation: A score over the full dataset (e.g., accuracy, average score)
from dataclasses import dataclass, field
from typing import Any, Callable
@dataclass
class EvalCase:
"""A single test case for prompt evaluation."""
input: str
expected_output: str | None = None
metadata: dict = None
@dataclass
class EvalResult:
    """Outcome of scoring a single EvalCase."""
    case: EvalCase  # the case that was evaluated
    actual_output: str  # what the prompt actually produced
    score: float  # 0.0 to 1.0
    passed: bool  # binary pass/fail judgement assigned by the scorer
    details: str = ""  # optional human-readable explanation (e.g. per-rule results)
def run_eval(
    prompt_fn: Callable[[str], str],
    dataset: list[EvalCase],
    scorer: Callable[[EvalCase, str], EvalResult],
) -> dict[str, Any]:
    """Run an evaluation and return aggregate metrics.

    Args:
        prompt_fn: Maps a raw input string to the model's output string.
        dataset: The eval cases to run.
        scorer: Scores one (case, output) pair into an EvalResult.

    Returns:
        A dict with total_cases, passed, pass_rate, avg_score, min_score,
        and the per-case results. An empty dataset yields zeroed metrics
        instead of raising ZeroDivisionError.
    """
    results = [scorer(case, prompt_fn(case.input)) for case in dataset]
    if not results:
        # Guard: the original divided by len(results), crashing on an
        # empty dataset. Report zeroed metrics instead.
        return {
            "total_cases": 0,
            "passed": 0,
            "pass_rate": 0.0,
            "avg_score": 0.0,
            "min_score": 0.0,
            "results": [],
        }
    scores = [r.score for r in results]
    passed_count = sum(1 for r in results if r.passed)  # computed once, reused below
    return {
        "total_cases": len(results),
        "passed": passed_count,
        "pass_rate": passed_count / len(results),
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "results": results,
    }
Types of Scoring Functions
1. Exact Match (for structured outputs)
import json
def exact_match_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Check if output exactly matches expected. Good for JSON/classification.

    Both sides are parsed as JSON before comparing, so formatting differences
    (whitespace, key order) do not cause false failures.
    """
    if case.expected_output is None:
        # The field defaults to None; without this guard json.loads(None)
        # raises an uncaught TypeError instead of producing a scored result.
        return EvalResult(case=case, actual_output=actual, score=0.0, passed=False,
                          details="Case has no expected_output to compare against")
    try:
        actual_parsed = json.loads(actual)
    except json.JSONDecodeError:
        return EvalResult(case=case, actual_output=actual, score=0.0, passed=False,
                          details="Output was not valid JSON")
    try:
        expected_parsed = json.loads(case.expected_output)
    except json.JSONDecodeError:
        # Distinguish a bad reference answer from a bad model output so
        # failures point at the dataset, not the prompt.
        return EvalResult(case=case, actual_output=actual, score=0.0, passed=False,
                          details="Expected output was not valid JSON")
    passed = actual_parsed == expected_parsed
    score = 1.0 if passed else 0.0
    return EvalResult(case=case, actual_output=actual, score=score, passed=passed)
2. Semantic Similarity (for open-ended outputs where paraphrases of the expected answer should count)
from openai import OpenAI
import numpy as np
# Shared module-level API client, reused by the embedding-based scorer below.
client = OpenAI()
def cosine_similarity(a: list[float], b: list[float]) -> float:
a, b = np.array(a), np.array(b)
return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
def semantic_similarity_scorer(case: EvalCase, actual: str, threshold: float = 0.85) -> EvalResult:
    """Score based on embedding similarity between actual and expected output.

    Args:
        case: The eval case; its expected_output is embedded for comparison.
            NOTE(review): assumes expected_output is not None — confirm callers.
        actual: The model output being scored.
        threshold: Minimum cosine similarity counted as a pass.
    """
    # One API call embeds both strings; the [0]/[1] indexing below assumes
    # response.data preserves input order — verify against the API contract.
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[actual, case.expected_output]
    )
    actual_embedding = response.data[0].embedding
    expected_embedding = response.data[1].embedding
    similarity = cosine_similarity(actual_embedding, expected_embedding)
    passed = similarity >= threshold
    # Unlike exact-match scoring, score is the raw similarity (continuous),
    # while passed applies the threshold cutoff.
    return EvalResult(
        case=case,
        actual_output=actual,
        score=similarity,
        passed=passed,
        details=f"Similarity: {similarity:.3f} (threshold: {threshold})"
    )
3. Rule-Based (for specific requirements)
import re
def rule_based_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Check specific structural requirements.

    Each rule pairs a human-readable label with a predicate over the output;
    the score is the fraction of rules satisfied.
    """
    checks = [
        ("Contains JSON", lambda text: '{' in text and '}' in text),
        ("Under 200 words", lambda text: len(text.split()) < 200),
        ("No first-person", lambda text: not re.search(r'\bI\b|\bme\b|\bmy\b', text)),
        ("Starts with recommendation", lambda text: text.lower().startswith("recommend")),
    ]
    outcomes = []
    for label, predicate in checks:
        outcomes.append((label, predicate(actual)))
    fraction_passed = sum(1 for _, ok in outcomes if ok) / len(checks)
    report = "\n".join(f"{'✓' if ok else '✗'} {label}" for label, ok in outcomes)
    return EvalResult(
        case=case,
        actual_output=actual,
        score=fraction_passed,
        passed=fraction_passed >= 0.8,  # must satisfy 80% of the rules
        details=report,
    )
Building Your First Eval Dataset
Start small and focused:
# A minimal eval dataset for a sentiment classifier.
# Eight hand-written cases covering all four labels, with deliberate variety:
# lowercase/unpunctuated input, and "mixed" cases pairing praise with criticism.
sentiment_dataset = [
    EvalCase(input="I love this product!", expected_output="positive"),
    EvalCase(input="Terrible quality, waste of money.", expected_output="negative"),
    EvalCase(input="It arrived on time.", expected_output="neutral"),
    EvalCase(input="Great features but terrible support.", expected_output="mixed"),
    EvalCase(input="worst purchase ever", expected_output="negative"),
    EvalCase(input="Pretty good for the price", expected_output="positive"),
    EvalCase(input="Normal product, nothing special", expected_output="neutral"),
    EvalCase(input="Excellent build but battery dies in 2 hours", expected_output="mixed"),
]
# Run the eval
# NOTE(review): OpenAI is already imported and `client` already constructed
# earlier in this file; this repetition is redundant (but harmless) and only
# lets the snippet run standalone.
from openai import OpenAI
client = OpenAI()
def my_sentiment_classifier(text: str) -> str:
    """Classify `text` as one of: positive, negative, neutral, mixed."""
    system_instruction = "Classify sentiment as: positive, negative, neutral, or mixed. Respond with only one word."
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": text},
        ],
        temperature=0,  # deterministic sampling so eval runs are repeatable
    )
    # Normalize so trivial casing/whitespace differences don't fail exact match.
    return completion.choices[0].message.content.strip().lower()
def classification_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Exact string match against the expected label: full credit or none."""
    is_correct = actual == case.expected_output
    if is_correct:
        score = 1.0
    else:
        score = 0.0
    return EvalResult(case=case, actual_output=actual, score=score, passed=is_correct)
# Execute the eval and report the aggregate pass rate plus every failure.
metrics = run_eval(my_sentiment_classifier, sentiment_dataset, classification_scorer)
print(f"Pass rate: {metrics['pass_rate']:.1%}")
# Plain string: the original used an f-string with no placeholders (lint F541).
print("Failing cases:")
for r in metrics["results"]:
    if not r.passed:
        # Show input/expected/actual so each failure is diagnosable at a glance.
        print(f"  Input: '{r.case.input}' | Expected: {r.case.expected_output} | Got: {r.actual_output}")
Good evals tell you not just whether something works, but what specifically fails and for which types of inputs. This specificity is what makes prompt improvement systematic rather than random.