LLM Evaluation
Evaluating LLM applications is fundamentally harder than evaluating traditional ML models. There's no single accuracy number --- the quality of a response depends on correctness, helpfulness, safety, style, and context. This lesson covers the full spectrum of evaluation techniques from automated metrics to human evaluation frameworks.
Why Evaluation Matters
Without robust evaluation, you're flying blind: you can't tell whether a prompt change, model swap, or retrieval tweak actually improved quality, and regressions slip into production unnoticed.
Automated Reference-Based Metrics
These metrics compare generated text against reference (ground truth) text.
Perplexity
Measures how "surprised" the model is by the text. Lower = better.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def compute_perplexity(text: str, model_name: str = "gpt2") -> float:
    """Compute the perplexity of *text* under a causal language model.

    Perplexity is exp(mean cross-entropy loss) of the model on the text;
    lower values mean the model finds the text more predictable/fluent.

    Args:
        text: The text to score.
        model_name: Hugging Face model id to load (default "gpt2").

    Returns:
        The perplexity as a plain float (>= 1.0).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()  # disable dropout etc. for deterministic scoring

    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings.input_ids

    with torch.no_grad():
        # Passing labels=input_ids makes the model compute the
        # cross-entropy loss over the (shifted) token sequence.
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss  # Cross-entropy loss

    perplexity = torch.exp(loss).item()
    return perplexity
# Lower perplexity = more fluent/natural text.
text_good = "The cat sat on the mat and watched the birds outside."
text_bad = "Mat the on sat cat the birds outside watched and."

ppl_good = compute_perplexity(text_good)
ppl_bad = compute_perplexity(text_bad)

print(f"Good text perplexity: {ppl_good:.2f}")  # ~30-50
print(f"Bad text perplexity: {ppl_bad:.2f}")  # ~200-500
BLEU Score
Measures n-gram overlap between generated and reference text (originally for machine translation):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "The cat is sitting on the mat".split()
candidate = "The cat sat on the mat".split()

# BLEU-4 (considers 1-gram through 4-gram overlap).  Smoothing avoids a
# zero score when a higher-order n-gram has no overlap at all.
smooth = SmoothingFunction().method1
score = sentence_bleu(
    [reference],  # Can have multiple references
    candidate,
    smoothing_function=smooth,
)
print(f"BLEU score: {score:.4f}")  # 0-1, higher is better
# Corpus-level BLEU: aggregates n-gram statistics over all sentence pairs
# before computing the score (not just a mean of per-sentence BLEU).
from nltk.translate.bleu_score import corpus_bleu

references_list = [
    [["The", "cat", "sat", "on", "the", "mat"]],
    [["A", "dog", "is", "running", "in", "the", "park"]],
]
candidates_list = [
    ["The", "cat", "is", "on", "the", "mat"],
    ["The", "dog", "runs", "in", "the", "park"],
]
corpus_score = corpus_bleu(references_list, candidates_list, smoothing_function=smooth)
print(f"Corpus BLEU: {corpus_score:.4f}")
ROUGE Score
Measures recall-oriented overlap (originally for summarization):
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,  # match on word stems, not exact surface forms
)

reference = "The cat sat on the mat and watched the birds outside the window."
hypothesis = "A cat was sitting on a mat, watching birds through the window."

# Each entry carries precision, recall, and F1 for that ROUGE variant.
scores = scorer.score(reference, hypothesis)
for metric, score in scores.items():
    print(f"{metric}: Precision={score.precision:.3f}, Recall={score.recall:.3f}, F1={score.fmeasure:.3f}")
rouge1: Unigram overlap
rouge2: Bigram overlap
rougeL: Longest Common Subsequence
Limitations of Automated Metrics
LLM-as-Judge
Use a powerful LLM to evaluate the outputs of another LLM. This is currently the most practical approach for evaluating open-ended generation.
Single-Answer Grading
from openai import OpenAI
from pydantic import BaseModel, Field
import json

client = OpenAI()


class EvalScore(BaseModel):
    """Structured rubric returned by the LLM judge (each criterion 1-5)."""

    relevance: int = Field(description="1-5 score for relevance to the question")
    accuracy: int = Field(description="1-5 score for factual accuracy")
    completeness: int = Field(description="1-5 score for completeness of the answer")
    clarity: int = Field(description="1-5 score for clarity of explanation")
    reasoning: str = Field(description="Brief explanation of the scores")
def llm_judge(question: str, answer: str, reference: str | None = None) -> EvalScore:
    """Use GPT-4o as a judge to evaluate an answer against a 1-5 rubric.

    Args:
        question: The question the answer responds to.
        answer: The candidate answer to evaluate.
        reference: Optional gold answer; when given it is shown to the judge.

    Returns:
        An EvalScore with per-criterion 1-5 scores and the judge's reasoning.
    """
    system_prompt = """You are an expert evaluator. Score the given answer on a 1-5 scale for each criterion.
Scoring guide:
1 = Very poor, 2 = Poor, 3 = Adequate, 4 = Good, 5 = Excellent
Be critical but fair. Provide brief reasoning for your scores."""

    user_prompt = f"Question: {question}\n\nAnswer to evaluate: {answer}"
    if reference:
        user_prompt += f"\n\nReference answer: {reference}"

    # Structured output: the SDK validates the response against EvalScore.
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=EvalScore,
    )
    return response.choices[0].message.parsed
# Evaluate a strong and a weak answer with the judge.
question = "What is gradient descent?"
good_answer = "Gradient descent is an optimization algorithm that iteratively adjusts model parameters by computing the gradient of the loss function and moving in the direction that reduces the loss. The learning rate controls the step size."
bad_answer = "Gradient descent is when the computer learns stuff by going down a hill."

eval_good = llm_judge(question, good_answer)
eval_bad = llm_judge(question, bad_answer)

print(f"Good answer: R={eval_good.relevance} A={eval_good.accuracy} Co={eval_good.completeness} Cl={eval_good.clarity}")
print(f"Bad answer: R={eval_bad.relevance} A={eval_bad.accuracy} Co={eval_bad.completeness} Cl={eval_bad.clarity}")
Pairwise Comparison
class PairwiseResult(BaseModel):
    """Outcome of an A-vs-B comparison made by the LLM judge."""

    winner: str = Field(description="'A', 'B', or 'tie'")
    reasoning: str = Field(description="Why this answer is better")
    confidence: float = Field(description="Confidence 0-1")


def pairwise_judge(question: str, answer_a: str, answer_b: str) -> PairwiseResult:
    """Compare two answers to the same question and pick the better one.

    Args:
        question: The shared question both answers respond to.
        answer_a: Candidate shown to the judge as "A".
        answer_b: Candidate shown to the judge as "B".

    Returns:
        A PairwiseResult naming the winner ('A', 'B', or 'tie').
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Compare two answers to the same question. Pick the better one or call it a tie. Be specific about why."},
            {"role": "user", "content": f"Question: {question}\n\nAnswer A: {answer_a}\n\nAnswer B: {answer_b}"},
        ],
        response_format=PairwiseResult,
    )
    return response.choices[0].message.parsed
# Mitigate position bias: run both orderings and check the verdicts agree.
result_ab = pairwise_judge(question, good_answer, bad_answer)
result_ba = pairwise_judge(question, bad_answer, good_answer)

print(f"A vs B: Winner={result_ab.winner}, Confidence={result_ab.confidence:.2f}")
print(f"B vs A: Winner={result_ba.winner}, Confidence={result_ba.confidence:.2f}")
LLM-as-Judge Best Practices
Standard Benchmarks
MMLU (Massive Multitask Language Understanding)
Tests knowledge across 57 subjects (STEM, humanities, social sciences, etc.):
# Example MMLU question
question = {
    "subject": "machine_learning",
    "question": "Which of the following is NOT a common activation function?",
    "choices": ["ReLU", "Sigmoid", "Softmax", "Gradient"],
    "answer": 3,  # "Gradient" is not an activation function
}

# Running MMLU evaluation
from datasets import load_dataset

# Load one subject's test split.
ds = load_dataset("cais/mmlu", "machine_learning", split="test")

correct = 0
total = 0
for example in ds:
    prompt = f"""Question: {example['question']}
A. {example['choices'][0]}
B. {example['choices'][1]}
C. {example['choices'][2]}
D. {example['choices'][3]}
Answer with just the letter (A, B, C, or D):"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1,  # the expected reply is a single letter
    )
    predicted = response.choices[0].message.content.strip().upper()
    expected = chr(65 + example["answer"])  # 0->A, 1->B, etc.
    if predicted == expected:
        correct += 1
    total += 1

print(f"MMLU Machine Learning: {correct}/{total} = {correct/total:.2%}")
HumanEval (Code Generation)
Tests code generation with 164 Python programming problems:
# Example HumanEval problem: the model completes `prompt`, and the
# completion is executed against the assertions in `test`.
problem = {
    "task_id": "HumanEval/0",
    "prompt": '''from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """Check if in given list of numbers, any two numbers are closer
    to each other than the given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
''',
    "test": """
assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False
assert has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True
""",
}
pass@k metric: probability that at least 1 of k samples passes all tests
Other Important Benchmarks
| Benchmark | Tests | Key Metric |
|---|---|---|
| MMLU | Knowledge across 57 subjects | Accuracy |
| HumanEval | Python code generation | pass@1, pass@10 |
| HELM | Holistic eval (accuracy, fairness, robustness) | Multi-metric |
| GSM8K | Grade school math reasoning | Accuracy |
| MT-Bench | Multi-turn conversation quality | Judge score 1-10 |
| TruthfulQA | Resistance to common misconceptions | % truthful |
| BigBench | 200+ diverse reasoning tasks | Per-task accuracy |
A/B Testing for LLM Applications
import random
import statistics
from dataclasses import dataclass@dataclass
class ABTestResult:
variant: str # "A" or "B"
query: str
response: str
latency_ms: float
judge_score: float
user_rating: float = None
class LLMABTest:
def __init__(self, model_a: str, model_b: str, split_ratio: float = 0.5):
self.model_a = model_a
self.model_b = model_b
self.split_ratio = split_ratio
self.results: list[ABTestResult] = []
def assign_variant(self) -> str:
"""Randomly assign a user to variant A or B."""
return "A" if random.random() < self.split_ratio else "B"
def run_query(self, query: str) -> ABTestResult:
"""Route query to the assigned model and evaluate."""
import time
variant = self.assign_variant()
model = self.model_a if variant == "A" else self.model_b
start = time.perf_counter()
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": query}],
)
latency = (time.perf_counter() - start) * 1000
answer = response.choices[0].message.content
# Auto-evaluate with LLM judge
judge_result = llm_judge(query, answer)
avg_score = (judge_result.relevance + judge_result.accuracy +
judge_result.completeness + judge_result.clarity) / 4
result = ABTestResult(
variant=variant,
query=query,
response=answer,
latency_ms=latency,
judge_score=avg_score,
)
self.results.append(result)
return result
def analyze(self):
"""Analyze A/B test results."""
a_results = [r for r in self.results if r.variant == "A"]
b_results = [r for r in self.results if r.variant == "B"]
print(f"\nA/B Test Results ({len(self.results)} total queries)")
print("=" * 50)
print(f"Model A ({self.model_a}): {len(a_results)} queries")
print(f" Avg Judge Score: {statistics.mean(r.judge_score for r in a_results):.2f}")
print(f" Avg Latency: {statistics.mean(r.latency_ms for r in a_results):.0f}ms")
print(f"\nModel B ({self.model_b}): {len(b_results)} queries")
print(f" Avg Judge Score: {statistics.mean(r.judge_score for r in b_results):.2f}")
print(f" Avg Latency: {statistics.mean(r.latency_ms for r in b_results):.0f}ms")
# Usage: pit gpt-4o-mini against gpt-4o on a small query set, then report.
test = LLMABTest("gpt-4o-mini", "gpt-4o")
queries = [
    "Explain RAG",
    "What is RLHF?",
    "How do transformers work?",
]
for query in queries:
    test.run_query(query)
test.analyze()
Evaluation Frameworks
LangSmith
LangSmith provides tracing, monitoring, and evaluation for LLM applications:
from langsmith import Client
from langsmith.evaluation import evaluate

ls_client = Client()

# Create a dataset of test cases (inputs paired with reference outputs).
dataset = ls_client.create_dataset("rag-eval-set")

ls_client.create_examples(
    inputs=[
        {"question": "What is RAG?"},
        {"question": "How does attention work?"},
        {"question": "What is fine-tuning?"},
    ],
    outputs=[
        {"answer": "RAG is Retrieval-Augmented Generation, a technique that grounds LLM responses in retrieved documents."},
        {"answer": "Attention computes weighted sums of value vectors based on query-key compatibility."},
        {"answer": "Fine-tuning adapts a pre-trained model to a specific task by training on task-specific data."},
    ],
    dataset_id=dataset.id,
)
# Define your application as a function LangSmith can call per example.
def my_rag_app(inputs: dict) -> dict:
    """Adapter: run the RAG pipeline on one dataset example.

    Args:
        inputs: Example inputs; expects a "question" key.

    Returns:
        Dict with an "answer" key, matching the dataset's output schema.
    """
    question = inputs["question"]
    # Your RAG pipeline here (rag_chain is assumed to be defined elsewhere).
    answer = rag_chain.invoke(question)
    return {"answer": answer}
# Define custom evaluators.
def correctness_evaluator(run, example):
    """Check if the answer is factually correct using an LLM judge.

    Args:
        run: LangSmith run; its outputs hold the app's "answer".
        example: Dataset example; its outputs hold the reference "answer".

    Returns:
        A LangSmith-style score dict: key "correctness", score 1.0 or 0.0.
    """
    prediction = run.outputs["answer"]
    reference = example.outputs["answer"]

    judge_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": f"Is this answer factually consistent with the reference?\n\nAnswer: {prediction}\nReference: {reference}\n\nRespond with just 'yes' or 'no'.",
        }],
    )
    # Substring check tolerates replies like "Yes." or "yes, it is".
    is_correct = "yes" in judge_response.choices[0].message.content.lower()
    return {"key": "correctness", "score": 1.0 if is_correct else 0.0}
# Run evaluation: every dataset example flows through the app, then each
# evaluator scores the (run, example) pair.
results = evaluate(
    my_rag_app,
    data="rag-eval-set",
    evaluators=[correctness_evaluator],
    experiment_prefix="rag-v1",
)
print(results)
Building a Custom Eval Pipeline
import json
from datetime import datetime
from pathlib import Path


class EvalPipeline:
    """A lightweight evaluation pipeline for LLM applications.

    Collects test cases, runs an application function over them, applies
    named evaluator functions to each (question, actual, expected) triple,
    and reports aggregate and per-tag averages.
    """

    def __init__(self, name: str):
        """
        Args:
            name: Human-readable name for this evaluation run.
        """
        self.name = name
        self.results = []  # one dict per test case
        self.timestamp = datetime.now().isoformat()  # run creation time

    def add_test_case(self, question: str, expected: str, tags: list[str] = None):
        """Add a test case to the pipeline.

        Args:
            question: Input passed to the app function.
            expected: Reference answer passed to each evaluator.
            tags: Optional labels used for the per-tag breakdown.
        """
        self.results.append({
            "question": question,
            "expected": expected,
            "tags": tags or [],  # None becomes a fresh list per case
            "actual": None,      # filled in by run()
            "scores": {},        # evaluator name -> score, filled by run()
        })

    def run(self, app_fn, evaluators: dict):
        """Run all test cases through the app and evaluators.

        Args:
            app_fn: Callable mapping a question string to an answer.
            evaluators: Mapping of metric name to a callable
                (question, actual, expected) -> numeric score.

        Returns:
            self, so calls can be chained (e.g. run(...).summary()).
        """
        for i, case in enumerate(self.results):
            print(f"Running test {i+1}/{len(self.results)}: {case['question'][:50]}...")
            # Get the app's response
            actual = app_fn(case["question"])
            case["actual"] = actual
            # Run each evaluator
            for eval_name, eval_fn in evaluators.items():
                score = eval_fn(case["question"], actual, case["expected"])
                case["scores"][eval_name] = score
        return self

    def summary(self):
        """Print overall and per-tag average/min/max for every metric."""
        print(f"\nEvaluation: {self.name}")
        print(f"Timestamp: {self.timestamp}")
        print("=" * 60)
        all_scores = {}
        for case in self.results:
            for metric, score in case["scores"].items():
                all_scores.setdefault(metric, []).append(score)
        for metric, scores in all_scores.items():
            avg = sum(scores) / len(scores)
            print(f" {metric}: {avg:.3f} (min={min(scores):.3f}, max={max(scores):.3f})")
        # Per-tag breakdown
        tags = set()
        for case in self.results:
            tags.update(case["tags"])
        if tags:
            print("\nPer-tag breakdown:")
            for tag in sorted(tags):
                tag_cases = [c for c in self.results if tag in c["tags"]]
                for metric in all_scores:
                    tag_scores = [c["scores"][metric] for c in tag_cases]
                    avg = sum(tag_scores) / len(tag_scores)
                    print(f" [{tag}] {metric}: {avg:.3f} ({len(tag_cases)} cases)")

    def save(self, path: str):
        """Save name, timestamp, and all results to a JSON file at *path*."""
        output = {
            "name": self.name,
            "timestamp": self.timestamp,
            "results": self.results,
        }
        Path(path).write_text(json.dumps(output, indent=2))
        print(f"Results saved to {path}")