Evaluation and Testing

Prompt Evaluation Basics

13m read

Prompt Evaluation Basics

Prompt engineering without evaluation is guesswork. You might improve a prompt based on intuition, see better results on your test case, and ship it — only to find it regresses on cases you didn't test. Systematic evaluation turns prompt development from art into engineering.

What Is an Eval?

An eval (short for evaluation) is a structured test of how well a prompt performs across a range of inputs. At minimum, an eval needs:

  1. A dataset: A collection of inputs representative of real-world usage
  2. Expected outputs (or evaluation criteria): What does a good response look like?
  3. A scoring function: How do you measure quality numerically?
  4. An aggregation: A score over the full dataset (e.g., accuracy, average score)
from dataclasses import dataclass
from typing import Callable, Any

@dataclass
class EvalCase:
    """A single test case for prompt evaluation."""
    input: str
    # Reference answer; None when the scorer uses criteria other than
    # direct comparison (e.g. rule-based scoring).
    expected_output: str | None = None
    # Optional free-form tags (e.g. category, difficulty). The annotation
    # must be `dict | None` — the default really is None, and a mutable
    # `{}` default would be shared across all instances.
    metadata: dict | None = None

@dataclass
class EvalResult:
    """Outcome of scoring one EvalCase against the model's actual output."""
    case: EvalCase  # the test case that produced this result
    actual_output: str  # raw model output that was scored
    score: float  # 0.0 to 1.0
    passed: bool  # whether the case met the scorer's pass criterion
    details: str = ""  # human-readable explanation (e.g. which rules failed)

def run_eval(
    prompt_fn: Callable[[str], str],
    dataset: list[EvalCase],
    scorer: Callable[[EvalCase, str], EvalResult],
) -> dict[str, Any]:
    """Run an evaluation and return aggregate metrics.

    Args:
        prompt_fn: Maps one raw input string to the model's output string.
        dataset: Eval cases to run; must be non-empty.
        scorer: Scores a (case, actual_output) pair into an EvalResult.

    Returns:
        Dict with ``total_cases``, ``passed``, ``pass_rate``, ``avg_score``,
        ``min_score``, and the per-case ``results`` list.

    Raises:
        ValueError: If ``dataset`` is empty (aggregate metrics would be
            undefined; the original code crashed with a bare division error).
    """
    if not dataset:
        raise ValueError("dataset must contain at least one EvalCase")

    results = [scorer(case, prompt_fn(case.input)) for case in dataset]

    scores = [r.score for r in results]
    # Count passes once instead of re-scanning for each aggregate.
    passed_count = sum(1 for r in results if r.passed)
    return {
        "total_cases": len(results),
        "passed": passed_count,
        "pass_rate": passed_count / len(results),
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "results": results,
    }

Types of Scoring Functions

1. Exact Match (for structured outputs)

import json

def exact_match_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Check if output exactly matches expected. Good for JSON/classification.

    Both strings are parsed as JSON before comparing, so formatting
    differences (whitespace, key order) don't cause false failures.
    """
    # Guard: without a reference answer, json.loads(None) would raise an
    # uncaught TypeError. Exact-match scoring needs expected_output set.
    if case.expected_output is None:
        return EvalResult(case=case, actual_output=actual, score=0.0, passed=False,
                          details="Case has no expected_output to compare against")
    try:
        actual_parsed = json.loads(actual)
        expected_parsed = json.loads(case.expected_output)
    except json.JSONDecodeError:
        return EvalResult(case=case, actual_output=actual, score=0.0, passed=False,
                          details="Output was not valid JSON")
    passed = actual_parsed == expected_parsed
    score = 1.0 if passed else 0.0
    # Non-empty details on failure so the report shows *why* it failed.
    return EvalResult(case=case, actual_output=actual, score=score, passed=passed,
                      details="" if passed else "Parsed JSON did not match expected")

2. Semantic Similarity (for paraphrase-tolerant matching)

from openai import OpenAI
import numpy as np

# Module-level OpenAI client, reused by the embedding-based scorer below.
client = OpenAI()

def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Returns 0.0 when either vector has zero magnitude — the angle is
    undefined there, and the unguarded expression divides by zero
    (NaN plus a runtime warning).
    """
    a_arr, b_arr = np.array(a), np.array(b)
    denom = float(np.linalg.norm(a_arr) * np.linalg.norm(b_arr))
    if denom == 0.0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / denom)

def semantic_similarity_scorer(case: EvalCase, actual: str, threshold: float = 0.85) -> EvalResult:
    """Score a response by embedding-space closeness to the expected output.

    Embeds both strings in a single API call, then passes the case when
    their cosine similarity reaches ``threshold``.
    """
    # One request embeds both texts; result order matches input order.
    embeddings = client.embeddings.create(
        model="text-embedding-3-small",
        input=[actual, case.expected_output]
    ).data

    similarity = cosine_similarity(embeddings[0].embedding, embeddings[1].embedding)
    return EvalResult(
        case=case,
        actual_output=actual,
        score=similarity,
        passed=similarity >= threshold,
        details=f"Similarity: {similarity:.3f} (threshold: {threshold})"
    )

3. Rule-Based (for specific requirements)

import re

def rule_based_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Score ``actual`` against a fixed checklist of structural rules.

    Every rule contributes equally to the score; the case passes when
    the fraction of satisfied rules is at least 0.8.
    """
    checks = [
        ("Contains JSON", lambda t: '{' in t and '}' in t),
        ("Under 200 words", lambda t: len(t.split()) < 200),
        ("No first-person", lambda t: not re.search(r'\bI\b|\bme\b|\bmy\b', t)),
        ("Starts with recommendation", lambda t: t.lower().startswith("recommend")),
    ]

    outcomes = [(label, predicate(actual)) for label, predicate in checks]
    score = sum(1 for _, ok in outcomes if ok) / len(checks)

    report = "\n".join(f"{'✓' if ok else '✗'} {label}" for label, ok in outcomes)
    return EvalResult(
        case=case,
        actual_output=actual,
        score=score,
        passed=score >= 0.8,  # Must pass 80% of rules
        details=report,
    )

Building Your First Eval Dataset

Start small and focused:

# A minimal eval dataset for a sentiment classifier
# Covers all four labels, plus lowercase/informal phrasing ("worst purchase
# ever") and mixed-signal inputs that trip up naive classifiers.
sentiment_dataset = [
    EvalCase(input="I love this product!", expected_output="positive"),
    EvalCase(input="Terrible quality, waste of money.", expected_output="negative"),
    EvalCase(input="It arrived on time.", expected_output="neutral"),
    EvalCase(input="Great features but terrible support.", expected_output="mixed"),
    EvalCase(input="worst purchase ever", expected_output="negative"),
    EvalCase(input="Pretty good for the price", expected_output="positive"),
    EvalCase(input="Normal product, nothing special", expected_output="neutral"),
    EvalCase(input="Excellent build but battery dies in 2 hours", expected_output="mixed"),
]

# Run the eval
from openai import OpenAI
client = OpenAI()  # NOTE(review): re-creates the client defined earlier in this file; one shared instance would suffice

def my_sentiment_classifier(text: str) -> str:
    """Classify ``text`` as positive, negative, neutral, or mixed.

    Delegates to gpt-4o-mini at temperature 0 (deterministic-ish output)
    and normalizes the one-word reply to lowercase.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Classify sentiment as: positive, negative, neutral, or mixed. Respond with only one word."},
            {"role": "user", "content": text},
        ],
        temperature=0,
    )
    reply = completion.choices[0].message.content
    return reply.strip().lower()

def classification_scorer(case: EvalCase, actual: str) -> EvalResult:
    """Binary scorer: full credit iff the label exactly matches expected."""
    is_correct = actual == case.expected_output
    score = 1.0 if is_correct else 0.0
    return EvalResult(case=case, actual_output=actual, score=score, passed=is_correct)

# Aggregate metrics plus the per-case results list.
metrics = run_eval(my_sentiment_classifier, sentiment_dataset, classification_scorer)
print(f"Pass rate: {metrics['pass_rate']:.1%}")
# Print only failures — the actionable part of an eval run.
print(f"Failing cases:")
for r in metrics["results"]:
    if not r.passed:
        print(f"  Input: '{r.case.input}' | Expected: {r.case.expected_output} | Got: {r.actual_output}")

Good evals tell you not just whether something works, but what specifically fails and for which types of inputs. This specificity is what makes prompt improvement systematic rather than random.