Prompt Regression Testing
A regression test verifies that a change to one part of a system doesn't break behavior that previously worked. In prompt engineering, a regression occurs when a change intended to improve performance on new cases inadvertently degrades performance on cases that were already passing. Systematic regression testing catches these issues before they reach production.
Why Prompts Regress
Prompts are sensitive to small changes in ways that differ from traditional code. Adding a sentence to a system prompt can:
- Change the model's "interpretation frame" for all inputs
- Shift the output distribution toward new patterns
- Interact unexpectedly with certain input types
- Improve performance on the targeted case while hurting others
Without a test suite, you're flying blind after every change.
Building a Regression Test Suite
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass
from typing import Callable


@dataclass
class PromptVersion:
    version_id: str
    system_prompt: str
    created_at: str
    notes: str = ""

    @classmethod
    def from_prompt(cls, prompt: str, notes: str = "") -> "PromptVersion":
        # Hash the prompt text so each distinct prompt gets a stable, short ID
        version_id = hashlib.sha256(prompt.encode()).hexdigest()[:8]
        return cls(
            version_id=version_id,
            system_prompt=prompt,
            created_at=datetime.now(timezone.utc).isoformat(),
            notes=notes,
        )


@dataclass
class RegressionTestCase:
    test_id: str
    description: str
    input: str
    assertions: list[dict]  # assertion configs, e.g. {"type": "contains", "value": "..."}
    tags: list[str]  # e.g., ["edge_case", "critical", "happy_path"]


@dataclass
class RegressionReport:
    prompt_version: PromptVersion
    run_at: str
    total_cases: int
    passed: int
    failed: int
    regressions: list[str]  # test_ids that failed this run but passed before
    improvements: list[str]  # test_ids that passed this run but failed before

    @property
    def pass_rate(self) -> float:
        return self.passed / self.total_cases if self.total_cases > 0 else 0.0


class PromptRegressionSuite:
    """
    A regression testing suite for prompt-based systems.
    Stores results to disk so regressions can be detected across runs.
    """

    def __init__(self, suite_name: str, results_dir: Path = Path(".prompt_evals")):
        self.suite_name = suite_name
        self.results_dir = results_dir / suite_name
        self.results_dir.mkdir(parents=True, exist_ok=True)
        self.test_cases: list[RegressionTestCase] = []

    def add_test(self, case: RegressionTestCase) -> None:
        self.test_cases.append(case)

    def run(self, prompt_version: PromptVersion, prompt_fn: Callable[[str], str]) -> RegressionReport:
        """Run all test cases and compare to the previous run."""
        current_results: dict[str, bool] = {}
        for case in self.test_cases:
            output = prompt_fn(case.input)
            passed = self._evaluate_assertions(output, case.assertions)
            current_results[case.test_id] = passed

        # Load previous results for regression detection
        prev_results = self._load_last_results()
        # A regression: fails now, passed before (new tests default to False, so
        # a brand-new failing test is not flagged as a regression)
        regressions = [
            tid for tid, passed in current_results.items()
            if not passed and prev_results.get(tid, False)
        ]
        # An improvement: passes now, failed before (new tests default to True,
        # so a brand-new passing test is not counted as an improvement)
        improvements = [
            tid for tid, passed in current_results.items()
            if passed and not prev_results.get(tid, True)
        ]

        report = RegressionReport(
            prompt_version=prompt_version,
            run_at=datetime.now(timezone.utc).isoformat(),
            total_cases=len(current_results),
            passed=sum(1 for p in current_results.values() if p),
            failed=sum(1 for p in current_results.values() if not p),
            regressions=regressions,
            improvements=improvements,
        )

        # Persist results for the next run's comparison
        self._save_results(current_results)
        return report

    def _evaluate_assertions(self, output: str, assertions: list[dict]) -> bool:
        for assertion in assertions:
            if assertion["type"] == "contains":
                if assertion["value"].lower() not in output.lower():
                    return False
            elif assertion["type"] == "not_contains":
                if assertion["value"].lower() in output.lower():
                    return False
            elif assertion["type"] == "json_valid":
                try:
                    json.loads(output)
                except json.JSONDecodeError:
                    return False
            elif assertion["type"] == "max_length":
                # Length is measured in words, not characters
                if len(output.split()) > assertion["value"]:
                    return False
        return True

    def _save_results(self, results: dict[str, bool]) -> None:
        results_file = self.results_dir / "last_results.json"
        results_file.write_text(json.dumps(results, indent=2))

    def _load_last_results(self) -> dict[str, bool]:
        results_file = self.results_dir / "last_results.json"
        if results_file.exists():
            return json.loads(results_file.read_text())
        return {}
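Wiring it together looks something like this. The suite name, test case, and stubbed prompt_fn below are illustrative, not part of any fixed API:

# Hypothetical usage: a stub prompt_fn stands in for a real model call
def my_llm_call(user_input: str) -> str:
    return "negative"  # replace with your actual LLM client call

suite = PromptRegressionSuite("sentiment")
suite.add_test(RegressionTestCase(
    test_id="refund_request_negative",
    description="Refund complaints should be classified as negative",
    input="I want my money back, this is the third time it broke.",
    assertions=[
        {"type": "contains", "value": "negative"},
        {"type": "max_length", "value": 50},  # at most 50 words
    ],
    tags=["happy_path", "critical"],
))

version = PromptVersion.from_prompt("You are a sentiment classifier...", notes="baseline")
report = suite.run(version, prompt_fn=my_llm_call)
print(f"{report.pass_rate:.1%} passing; regressions: {report.regressions}")

The first run has no baseline, so nothing can register as a regression; from the second run onward, any test that flips from passing to failing shows up in report.regressions.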
Integrating with CI/CD
# .github/workflows/prompt-tests.yml
name: Prompt Regression Tests

on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'agent/system_prompts.py'

jobs:
  test-prompts:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: python -m pytest tests/prompt_regression/ -v --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
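One caveat: the suite persists its baseline under .prompt_evals/, but CI runners start from a clean filesystem, so the previous run's results must be restored for cross-run regression detection to work. You can commit the baseline to the repository, or restore it with a cache step before pytest runs. A sketch of the cache approach, with an illustrative key naming scheme:

      # Restore the previous baseline before the pytest step above
      - uses: actions/cache@v4
        with:
          path: .prompt_evals
          key: prompt-evals-${{ github.ref }}-${{ github.run_id }}
          restore-keys: |
            prompt-evals-

The always-unique key forces a save at the end of every run, while restore-keys falls back to the most recent baseline at the start.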
# tests/prompt_regression/test_sentiment.py
import pytest


def test_no_regressions(sentiment_suite, current_prompt_version, sentiment_fn):
    """Fail CI if any previously-passing test now fails."""
    report = sentiment_suite.run(current_prompt_version, sentiment_fn)
    if report.regressions:
        pytest.fail(
            f"REGRESSIONS DETECTED: {len(report.regressions)} test(s) that previously "
            f"passed now fail: {report.regressions}\n\n"
            f"Pass rate: {report.pass_rate:.1%} ({report.passed}/{report.total_cases})"
        )


def test_minimum_pass_rate(sentiment_suite, current_prompt_version, sentiment_fn):
    """Ensure the overall pass rate doesn't drop below a threshold."""
    report = sentiment_suite.run(current_prompt_version, sentiment_fn)
    assert report.pass_rate >= 0.90, (
        f"Pass rate {report.pass_rate:.1%} below 90% threshold"
    )
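These tests depend on three fixtures: sentiment_suite, current_prompt_version, and sentiment_fn. The conftest.py sketch below is one plausible wiring; the import path, prompt text, test cases, and gpt-4o-mini model choice are all assumptions for illustration, not requirements:

# tests/prompt_regression/conftest.py -- hypothetical fixture wiring
import pytest

from prompt_regression import (  # assumed import path for the classes above
    PromptRegressionSuite,
    PromptVersion,
    RegressionTestCase,
)

# Illustrative prompt; in practice import this from your application code
SENTIMENT_PROMPT = (
    "Classify the sentiment of the user's message as positive, negative, or neutral."
)


@pytest.fixture(scope="session")
def sentiment_suite() -> PromptRegressionSuite:
    suite = PromptRegressionSuite("sentiment")
    suite.add_test(RegressionTestCase(
        test_id="happy_path_positive",
        description="Clearly positive review",
        input="I love this product, it works perfectly!",
        assertions=[{"type": "contains", "value": "positive"}],
        tags=["happy_path", "critical"],
    ))
    suite.add_test(RegressionTestCase(
        test_id="edge_empty_input",
        description="Empty input should be handled as neutral",
        input="",
        assertions=[{"type": "contains", "value": "neutral"}],
        tags=["edge_case"],
    ))
    return suite


@pytest.fixture(scope="session")
def current_prompt_version() -> PromptVersion:
    return PromptVersion.from_prompt(SENTIMENT_PROMPT, notes="current main branch")


@pytest.fixture(scope="session")
def sentiment_fn():
    from openai import OpenAI  # assumes the openai client is in requirements.txt

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def _call_model(user_input: str) -> str:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # illustrative model choice
            messages=[
                {"role": "system", "content": SENTIMENT_PROMPT},
                {"role": "user", "content": user_input},
            ],
        )
        return response.choices[0].message.content or ""

    return _call_model

One design note: because both tests call run(), and run() persists results, the second test compares against the first test's just-saved baseline. If that matters for your thresholds, compute the report once in a session-scoped fixture and assert against it in both tests.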
The Golden Dataset
Your regression suite is only as good as its test cases. Build your golden dataset by:
- Sampling from production: Capture real user inputs (anonymized) that represent actual usage
- Including edge cases: Empty inputs, very long inputs, adversarial inputs, foreign languages
- Marking critical cases: Tag the tests that represent the most important behaviors — these should never regress even if the overall pass rate dips (a gate test for this is sketched at the end of this section)
- Growing it over time: Every bug found in production should become a test case
A regression suite with 50 well-chosen cases is worth more than 500 randomly selected cases that miss your real failure modes.
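To make the "critical cases never regress" rule mechanical rather than aspirational, you can gate on the tags field from RegressionTestCase. A minimal sketch reusing the fixtures above; note it calls the suite's private assertion evaluator, and a public helper would be cleaner in real code:

# tests/prompt_regression/test_critical.py -- hypothetical critical-case gate
def test_critical_cases_all_pass(sentiment_suite, sentiment_fn):
    """Critical-tagged cases must pass outright, regardless of overall pass rate."""
    failing = []
    for case in sentiment_suite.test_cases:
        if "critical" not in case.tags:
            continue
        output = sentiment_fn(case.input)
        if not sentiment_suite._evaluate_assertions(output, case.assertions):
            failing.append(case.test_id)
    assert not failing, f"Critical cases failing: {failing}"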