Basic Agent as Judge
Basic usage of Agent as Judge evaluation with numeric scoring and failure callbacks
This example demonstrates basic Agent as Judge evaluation with numeric scoring (1-10 scale) and an on_fail callback for handling evaluation failures.
Add the following code to your Python file
```python
from kern.agent import Agent
from kern.db.sqlite import SqliteDb
from kern.eval.agent_as_judge import AgentAsJudgeEval, AgentAsJudgeEvaluation
from kern.models.openai import OpenAIResponses


def on_evaluation_failure(evaluation: AgentAsJudgeEvaluation):
    """Callback triggered when evaluation fails (score < threshold)."""
    print(f"Evaluation failed - Score: {evaluation.score}/10")
    print(f"Reason: {evaluation.reason[:100]}...")


# Setup database to persist eval results
db = SqliteDb(db_file="tmp/agent_as_judge_basic.db")

agent = Agent(
    model=OpenAIResponses(id="gpt-5.2"),
    instructions="You are a technical writer. Explain concepts clearly and concisely.",
    db=db,
)

response = agent.run("Explain what an API is")

evaluation = AgentAsJudgeEval(
    name="Explanation Quality",
    criteria="Explanation should be clear, beginner-friendly, and use simple language",
    scoring_strategy="numeric",  # Score 1-10
    threshold=9,  # Pass if score >= 9
    on_fail=on_evaluation_failure,
    db=db,
)

result = evaluation.run(
    input="Explain what an API is",
    output=str(response.content),
    print_results=True,
    print_summary=True,
)

# Query database for stored results
print("Database Results:")
eval_runs = db.get_eval_runs()
print(f"Total evaluations stored: {len(eval_runs)}")
if eval_runs:
    latest = eval_runs[-1]
    print(f"Eval ID: {latest.run_id}")
    print(f"Name: {latest.name}")
```

Set up your virtual environment
macOS/Linux:

```bash
uv venv --python 3.12
source .venv/bin/activate
```

Windows:

```powershell
uv venv --python 3.12
.venv\Scripts\activate
```

Install dependencies
```bash
uv pip install -U kern-ai openai
```

Export your OpenAI API key
macOS/Linux:

```bash
export OPENAI_API_KEY="your_openai_api_key_here"
```

Windows (PowerShell):

```powershell
$Env:OPENAI_API_KEY="your_openai_api_key_here"
```

Run the example
```bash
python agent_as_judge_basic.py
```