Prompt Regression Testing
A regression test verifies that a change to one part of a system doesn't break behavior that previously worked. In prompt engineering, a regression occurs when a change intended to improve performance on new cases inadvertently degrades performance on cases that were already passing. Systematic regression testing catches these issues before they reach production.
Why Prompts Regress
Prompts are sensitive to small changes in ways that differ from traditional code. Adding a sentence to a system prompt can:
- Change the model's "interpretation frame" for all inputs
- Shift the output distribution toward new patterns
- Interact unexpectedly with certain input types
- Improve performance on the targeted case while hurting others
Without a test suite, you're flying blind after every change.
Building a Regression Test Suite
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass
from typing import Callable


@dataclass
class PromptVersion:
    version_id: str
    system_prompt: str
    created_at: str
    notes: str = ""

    @classmethod
    def from_prompt(cls, prompt: str, notes: str = "") -> "PromptVersion":
        # Hash the prompt text so each distinct prompt gets a stable, short ID
        version_id = hashlib.sha256(prompt.encode()).hexdigest()[:8]
        return cls(
            version_id=version_id,
            system_prompt=prompt,
            created_at=datetime.now(timezone.utc).isoformat(),
            notes=notes,
        )


@dataclass
class RegressionTestCase:
    test_id: str
    description: str
    input: str
    assertions: list[dict]  # assertion configs, e.g. {"type": "contains", "value": "..."}
    tags: list[str]  # e.g., ["edge_case", "critical", "happy_path"]


@dataclass
class RegressionReport:
    prompt_version: PromptVersion
    run_at: str
    total_cases: int
    passed: int
    failed: int
    regressions: list[str]  # test_ids that failed this run but passed before
    improvements: list[str]  # test_ids that passed this run but failed before

    @property
    def pass_rate(self) -> float:
        return self.passed / self.total_cases if self.total_cases > 0 else 0.0


class PromptRegressionSuite:
    """
    A regression testing suite for prompt-based systems.
    Stores results to disk so regressions can be detected across runs.
    """

    def __init__(self, suite_name: str, results_dir: Path = Path(".prompt_evals")):
        self.suite_name = suite_name
        self.results_dir = results_dir / suite_name
        self.results_dir.mkdir(parents=True, exist_ok=True)
        self.test_cases: list[RegressionTestCase] = []

    def add_test(self, case: RegressionTestCase) -> None:
        self.test_cases.append(case)

    def run(self, prompt_version: PromptVersion, prompt_fn: Callable[[str], str]) -> RegressionReport:
        """Run all test cases and compare to the previous run."""
        current_results: dict[str, bool] = {}
        for case in self.test_cases:
            output = prompt_fn(case.input)
            passed = self._evaluate_assertions(output, case.assertions)
            current_results[case.test_id] = passed

        # Load previous results for regression detection
        prev_results = self._load_last_results()
        # A regression: fails now, passed before (new tests default to False, so
        # a brand-new failing test is not flagged as a regression)
        regressions = [
            tid for tid, passed in current_results.items()
            if not passed and prev_results.get(tid, False)
        ]
        # An improvement: passes now, failed before (new tests default to True,
        # so a brand-new passing test is not counted as an improvement)
        improvements = [
            tid for tid, passed in current_results.items()
            if passed and not prev_results.get(tid, True)
        ]

        report = RegressionReport(
            prompt_version=prompt_version,
            run_at=datetime.now(timezone.utc).isoformat(),
            total_cases=len(current_results),
            passed=sum(1 for p in current_results.values() if p),
            failed=sum(1 for p in current_results.values() if not p),
            regressions=regressions,
            improvements=improvements,
        )

        # Persist results for the next run's comparison
        self._save_results(current_results)
        return report

    def _evaluate_assertions(self, output: str, assertions: list[dict]) -> bool:
        for assertion in assertions:
            if assertion["type"] == "contains":
                if assertion["value"].lower() not in output.lower():
                    return False
            elif assertion["type"] == "not_contains":
                if assertion["value"].lower() in output.lower():
                    return False
            elif assertion["type"] == "json_valid":
                try:
                    json.loads(output)
                except json.JSONDecodeError:
                    return False
            elif assertion["type"] == "max_length":
                # Length is measured in words, not characters
                if len(output.split()) > assertion["value"]:
                    return False
        return True

    def _save_results(self, results: dict[str, bool]) -> None:
        results_file = self.results_dir / "last_results.json"
        results_file.write_text(json.dumps(results, indent=2))

    def _load_last_results(self) -> dict[str, bool]:
        results_file = self.results_dir / "last_results.json"
        if results_file.exists():
            return json.loads(results_file.read_text())
        return {}
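Wiring it together looks something like this. The suite name, test case, and stubbed prompt_fn below are illustrative, not part of any fixed API:

# Hypothetical usage: a stub prompt_fn stands in for a real model call
def my_llm_call(user_input: str) -> str:
    return "negative"  # replace with your actual LLM client call

suite = PromptRegressionSuite("sentiment")
suite.add_test(RegressionTestCase(
    test_id="refund_request_negative",
    description="Refund complaints should be classified as negative",
    input="I want my money back, this is the third time it broke.",
    assertions=[
        {"type": "contains", "value": "negative"},
        {"type": "max_length", "value": 50},  # at most 50 words
    ],
    tags=["happy_path", "critical"],
))

version = PromptVersion.from_prompt("You are a sentiment classifier...", notes="baseline")
report = suite.run(version, prompt_fn=my_llm_call)
print(f"{report.pass_rate:.1%} passing; regressions: {report.regressions}")

The first run has no baseline, so nothing can register as a regression; from the second run onward, any test that flips from passing to failing shows up in report.regressions.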
Integrating with CI/CD
# .github/workflows/prompt-tests.yml
name: Prompt Regression Tests

on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'agent/system_prompts.py'

jobs:
  test-prompts:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: python -m pytest tests/prompt_regression/ -v --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
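One caveat: the suite persists its baseline under .prompt_evals/, but CI runners start from a clean filesystem, so the previous run's results must be restored for cross-run regression detection to work. You can commit the baseline to the repository, or restore it with a cache step before pytest runs. A sketch of the cache approach, with an illustrative key naming scheme:

      # Restore the previous baseline before the pytest step above
      - uses: actions/cache@v4
        with:
          path: .prompt_evals
          key: prompt-evals-${{ github.ref }}-${{ github.run_id }}
          restore-keys: |
            prompt-evals-

The always-unique key forces a save at the end of every run, while restore-keys falls back to the most recent baseline at the start.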
# tests/prompt_regression/test_sentiment.py
import pytest


def test_no_regressions(sentiment_suite, current_prompt_version, sentiment_fn):
    """Fail CI if any previously-passing test now fails."""
    report = sentiment_suite.run(current_prompt_version, sentiment_fn)
    if report.regressions:
        pytest.fail(
            f"REGRESSIONS DETECTED: {len(report.regressions)} test(s) that previously "
            f"passed now fail: {report.regressions}\n\n"
            f"Pass rate: {report.pass_rate:.1%} ({report.passed}/{report.total_cases})"
        )


def test_minimum_pass_rate(sentiment_suite, current_prompt_version, sentiment_fn):
    """Ensure the overall pass rate doesn't drop below a threshold."""
    report = sentiment_suite.run(current_prompt_version, sentiment_fn)
    assert report.pass_rate >= 0.90, (
        f"Pass rate {report.pass_rate:.1%} below 90% threshold"
    )
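These tests depend on three fixtures: sentiment_suite, current_prompt_version, and sentiment_fn. The conftest.py sketch below is one plausible wiring; the import path, prompt text, test cases, and gpt-4o-mini model choice are all assumptions for illustration, not requirements:

# tests/prompt_regression/conftest.py -- hypothetical fixture wiring
import pytest

from prompt_regression import (  # assumed import path for the classes above
    PromptRegressionSuite,
    PromptVersion,
    RegressionTestCase,
)

# Illustrative prompt; in practice import this from your application code
SENTIMENT_PROMPT = (
    "Classify the sentiment of the user's message as positive, negative, or neutral."
)


@pytest.fixture(scope="session")
def sentiment_suite() -> PromptRegressionSuite:
    suite = PromptRegressionSuite("sentiment")
    suite.add_test(RegressionTestCase(
        test_id="happy_path_positive",
        description="Clearly positive review",
        input="I love this product, it works perfectly!",
        assertions=[{"type": "contains", "value": "positive"}],
        tags=["happy_path", "critical"],
    ))
    suite.add_test(RegressionTestCase(
        test_id="edge_empty_input",
        description="Empty input should be handled as neutral",
        input="",
        assertions=[{"type": "contains", "value": "neutral"}],
        tags=["edge_case"],
    ))
    return suite


@pytest.fixture(scope="session")
def current_prompt_version() -> PromptVersion:
    return PromptVersion.from_prompt(SENTIMENT_PROMPT, notes="current main branch")


@pytest.fixture(scope="session")
def sentiment_fn():
    from openai import OpenAI  # assumes the openai client is in requirements.txt

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def _call_model(user_input: str) -> str:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # illustrative model choice
            messages=[
                {"role": "system", "content": SENTIMENT_PROMPT},
                {"role": "user", "content": user_input},
            ],
        )
        return response.choices[0].message.content or ""

    return _call_model

One design note: because both tests call run(), and run() persists results, the second test compares against the first test's just-saved baseline. If that matters for your thresholds, compute the report once in a session-scoped fixture and assert against it in both tests.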
The Golden Dataset
Your regression suite is only as good as its test cases. Build your golden dataset by:
- Sampling from production: Capture real user inputs (anonymized) that represent actual usage
- Including edge cases: Empty inputs, very long inputs, adversarial inputs, foreign languages
- Marking critical cases: Tag the tests that represent the most important behaviors — these should never regress even if the overall pass rate dips (a gate test for this is sketched at the end of this section)
- Growing it over time: Every bug found in production should become a test case
A regression suite with 50 well-chosen cases is worth more than 500 randomly selected cases that miss your real failure modes.
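To make the "critical cases never regress" rule mechanical rather than aspirational, you can gate on the tags field from RegressionTestCase. A minimal sketch reusing the fixtures above; note it calls the suite's private assertion evaluator, and a public helper would be cleaner in real code:

# tests/prompt_regression/test_critical.py -- hypothetical critical-case gate
def test_critical_cases_all_pass(sentiment_suite, sentiment_fn):
    """Critical-tagged cases must pass outright, regardless of overall pass rate."""
    failing = []
    for case in sentiment_suite.test_cases:
        if "critical" not in case.tags:
            continue
        output = sentiment_fn(case.input)
        if not sentiment_suite._evaluate_assertions(output, case.assertions):
            failing.append(case.test_id)
    assert not failing, f"Critical cases failing: {failing}"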