Single Tool Reliability

Example showing how to evaluate reliability of single tool calls.

Create a Python file named reliability_single_tool.py

from typing import Optional

from kern.agent import Agent
from kern.eval.reliability import ReliabilityEval, ReliabilityResult
from kern.models.openai import OpenAIResponses
from kern.run.agent import RunOutput
from kern.tools.calculator import CalculatorTools


def factorial() -> None:
    """Check that the agent calls the ``factorial`` tool for "What is 10!?".

    Builds an ``Agent`` backed by ``OpenAIResponses`` with ``CalculatorTools``
    attached, runs a single prompt, and passes the response to
    ``ReliabilityEval`` with ``factorial`` as the only expected tool call.
    The evaluation is run with ``print_results=True`` and the outcome is then
    asserted via ``assert_passed()``.

    Raises:
        RuntimeError: If the evaluation produces no result object.
    """
    agent = Agent(
        model=OpenAIResponses(id="gpt-5.2"),
        tools=[CalculatorTools()],
    )
    response: RunOutput = agent.run("What is 10!?")
    evaluation = ReliabilityEval(
        name="Tool Call Reliability",
        agent_response=response,
        expected_tool_calls=["factorial"],
    )
    result: Optional[ReliabilityResult] = evaluation.run(print_results=True)
    # The result is typed as Optional, so fail with a clear message instead of
    # an AttributeError on None when no result comes back.
    if result is None:
        raise RuntimeError("Reliability evaluation did not return a result")
    result.assert_passed()


if __name__ == "__main__":
    factorial()

Set up your virtual environment

# macOS/Linux
uv venv --python 3.12
source .venv/bin/activate

# Windows
uv venv --python 3.12
.venv\Scripts\activate

Install dependencies

uv pip install -U openai kern-ai

Export your OpenAI API key

# macOS/Linux
export OPENAI_API_KEY="your_openai_api_key_here"

# Windows (PowerShell)
$Env:OPENAI_API_KEY="your_openai_api_key_here"

Run Agent

python reliability_single_tool.py