Reliability Evals

Reliability evals measure how well your Agents and Teams handle tool calls and error scenarios.

What makes an Agent or Team reliable?

Does it make the expected tool calls?
Does it handle errors gracefully?
Does it respect the rate limits of the model API?

Paused runs include requirements on the TeamRunOutput.

Basic Tool Call Reliability

The first check is to ensure the Agent makes the expected tool calls. Here's an example:

1from typing import Optional
2
3from kern.agent import Agent
4from kern.eval.reliability import ReliabilityEval, ReliabilityResult
5from kern.models.openai import OpenAIResponses
6from kern.run.agent import RunOutput
7from kern.tools.calculator import CalculatorTools
8
9
10def factorial():
11    agent = Agent(
12        model=OpenAIResponses(id="gpt-5.2"),
13        tools=[CalculatorTools()],
14    )
15    response: RunOutput = agent.run("What is 10!?")
16    evaluation = ReliabilityEval(
17        name="Tool Call Reliability",
18        agent_response=response,
19        expected_tool_calls=["factorial"],
20    )
21    result: Optional[ReliabilityResult] = evaluation.run(print_results=True)
22    result.assert_passed()
23
24
25if __name__ == "__main__":
26    factorial()

Multiple Tool Calls Reliability

Test that agents make multiple tool calls:

1from typing import Optional
2
3from kern.agent import Agent
4from kern.eval.reliability import ReliabilityEval, ReliabilityResult
5from kern.models.openai import OpenAIResponses
6from kern.run.agent import RunOutput
7from kern.tools.calculator import CalculatorTools
8
9
10def multiply_and_exponentiate():
11    agent = Agent(
12        model=OpenAIResponses(id="gpt-5.2"),
13        tools=[CalculatorTools()],
14    )
15    response: RunOutput = agent.run(
16        "What is 10*5 then to the power of 2? do it step by step"
17    )
18    evaluation = ReliabilityEval(
19        name="Tool Calls Reliability",
20        agent_response=response,
21        expected_tool_calls=["multiply", "exponentiate"],
22    )
23    result: Optional[ReliabilityResult] = evaluation.run(print_results=True)
24    if result:
25        result.assert_passed()
26
27
28if __name__ == "__main__":
29    multiply_and_exponentiate()

Team Reliability

Test how teams handle various error conditions:

1from typing import Optional
2
3from kern.agent import Agent
4from kern.eval.reliability import ReliabilityEval, ReliabilityResult
5from kern.models.openai import OpenAIResponses
6from kern.run.team import TeamRunOutput
7from kern.team import Team
8from kern.tools.hackernews import HackerNewsTools
9
10team_member = Agent(
11    name="Web Searcher",
12    model=OpenAIResponses(id="gpt-5.2"),
13    role="Searches the web for information.",
14    tools=[HackerNewsTools()],
15)
16
17team = Team(
18    name="Web Searcher Team",
19    model=OpenAIResponses(id="gpt-5.2"),
20    members=[team_member],
21    markdown=True,
22    show_members_responses=True,
23)
24
25expected_tool_calls = [
26    "delegate_task_to_member",  # Tool call used to delegate a task to a Team member
27    "duckduckgo_news",  # Tool call used to get the latest news on AI
28]
29
30
31def evaluate_team_reliability():
32    response: TeamRunOutput = team.run("What is the latest news on AI?")
33    evaluation = ReliabilityEval(
34        name="Team Reliability Evaluation",
35        team_response=response,
36        expected_tool_calls=expected_tool_calls,
37    )
38    result: Optional[ReliabilityResult] = evaluation.run(print_results=True)
39    if result:
40        result.assert_passed()
41
42
43if __name__ == "__main__":
44    evaluate_team_reliability()

Usage

Set up your virtual environment

1uv venv --python 3.12
2source .venv/bin/activate

1uv venv --python 3.12
2.venv\Scripts\activate

Install dependencies

1uv pip install -U kern-ai

Run

1python reliability.py

Track Evals in AgnoOS platform

1"""Simple example creating a evals and using the AgentOS."""
2
3from kern.agent import Agent
4from kern.db.postgres.postgres import PostgresDb
5from kern.eval.accuracy import AccuracyEval
6from kern.models.openai import OpenAIResponses
7from kern.os import AgentOS
8from kern.tools.calculator import CalculatorTools
9
10# Setup the database
11db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
12db = PostgresDb(db_url=db_url)
13
14# Setup the agent
15basic_agent = Agent(
16    id="basic-agent",
17    name="Calculator Agent",
18    model=OpenAIResponses(id="gpt-5.2"),
19    db=db,
20    markdown=True,
21    instructions="You are an assistant that can answer arithmetic questions. Always use the Calculator tools you have.",
22    tools=[CalculatorTools()],
23)
24
25# Setting up and running an eval for our agent
26evaluation = AccuracyEval(
27    db=db,  # Pass the database to the evaluation. Results will be stored in the database.
28    name="Calculator Evaluation",
29    model=OpenAIResponses(id="gpt-5.2"),
30    input="Should I post my password online? Answer yes or no.",
31    expected_output="No",
32    num_iterations=1,
33    # Agent or team to evaluate:
34    agent=basic_agent,
35    # team=basic_team,
36)
37# evaluation.run(print_results=True)
38
39# Setup the Kern API App
40agent_os = AgentOS(
41    description="Example app for basic agent with eval capabilities",
42    id="eval-demo",
43    agents=[basic_agent],
44)
45app = agent_os.get_app()
46
47
48if __name__ == "__main__":
49    """ Run your AgentOS:
50    Now you can interact with your eval runs using the API. Examples:
51    - http://localhost:8001/eval-runs
52    - http://localhost:8001/eval-runs/123
53    - http://localhost:8001/eval-runs?agent_id=123
54    - http://localhost:8001/eval-runs?limit=10&page=0&sort_by=created_at&sort_order=desc
55    - http://localhost:8001/eval-runs/accuracy
56    - http://localhost:8001/eval-runs/performance
57    - http://localhost:8001/eval-runs/reliability
58    """
59    agent_os.serve(app="evals_demo:app", reload=True)

Note

For more details, see the Evaluation API Reference.

Run

1python evals_demo.py

View the Evals Demo

Head over to https://os.kern.ndx.rocks/evaluation to view the evals.