Single Tool Reliability

Example showing how to evaluate reliability of single tool calls.

Create a Python file named reliability_single_tool.py

from typing import Optional

from kern.agent import Agent
from kern.eval.reliability import ReliabilityEval, ReliabilityResult
from kern.models.openai import OpenAIResponses
from kern.run.agent import RunOutput
from kern.tools.calculator import CalculatorTools


def factorial() -> None:
    """Evaluate whether the agent calls the ``factorial`` tool for "What is 10!?".

    Builds an agent equipped with ``CalculatorTools``, runs a single prompt,
    and passes the run output to ``ReliabilityEval`` with ``"factorial"`` as
    the only expected tool call. Results are printed by the evaluation.

    Raises:
        RuntimeError: If the evaluation produces no result object.
    """
    agent = Agent(
        model=OpenAIResponses(id="gpt-5.2"),
        tools=[CalculatorTools()],
    )
    response: RunOutput = agent.run("What is 10!?")
    evaluation = ReliabilityEval(
        name="Tool Call Reliability",
        agent_response=response,
        expected_tool_calls=["factorial"],
    )
    result: Optional[ReliabilityResult] = evaluation.run(print_results=True)
    # The result is annotated as Optional, so fail with a clear message
    # instead of an AttributeError on None.
    if result is None:
        raise RuntimeError(
            "ReliabilityEval.run() returned no result; tool calls could not be verified."
        )
    # NOTE(review): presumably raises if the expected tool calls were not
    # made -- confirm against the ReliabilityResult API.
    result.assert_passed()


if __name__ == "__main__":
    factorial()

Set up your virtual environment

macOS/Linux:
uv venv --python 3.12
source .venv/bin/activate

Windows:
uv venv --python 3.12
.venv\Scripts\activate

Install dependencies

uv pip install -U openai kern-ai

Export your OpenAI API key

macOS/Linux:
export OPENAI_API_KEY="your_openai_api_key_here"

Windows (PowerShell):
$Env:OPENAI_API_KEY="your_openai_api_key_here"

Run Agent

python reliability_single_tool.py