Performance Evals

Performance evals measure the latency and memory footprint of an Agent or Team.

Basic Example

1"""Run `uv pip install openai kern-ai memory_profiler` to install dependencies."""
2
3from kern.agent import Agent
4from kern.eval.performance import PerformanceEval
5from kern.models.openai import OpenAIResponses
6
7
8def run_agent():
9    agent = Agent(
10        model=OpenAIResponses(id="gpt-5.2"),
11        system_message="Be concise, reply with one sentence.",
12    )
13
14    response = agent.run("What is the capital of France?")
15    print(f"Agent response: {response.content}")
16
17    return response
18
19
20simple_response_perf = PerformanceEval(
21    name="Simple Performance Evaluation",
22    func=run_agent,
23    num_iterations=1,
24    warmup_runs=0,
25)
26
27if __name__ == "__main__":
28    simple_response_perf.run(print_results=True, print_summary=True)

Tool Usage Performance

Compare how tools affect your agent's performance:

1"""Run `uv pip install kern-ai openai memory_profiler` to install dependencies."""
2
3from typing import Literal
4
5from kern.agent import Agent
6from kern.eval.performance import PerformanceEval
7from kern.models.openai import OpenAIResponses
8
9
10def get_weather(city: Literal["nyc", "sf"]):
11    """Use this to get weather information."""
12    if city == "nyc":
13        return "It might be cloudy in nyc"
14    elif city == "sf":
15        return "It's always sunny in sf"
16
17
18tools = [get_weather]
19
20
21def instantiate_agent():
22    return Agent(model=OpenAIResponses(id="gpt-5.2"), tools=tools)  # type: ignore
23
24
25instantiation_perf = PerformanceEval(
26    name="Tool Instantiation Performance", func=instantiate_agent, num_iterations=1000
27)
28
29if __name__ == "__main__":
30    instantiation_perf.run(print_results=True, print_summary=True)

Performance with Asynchronous Functions

Evaluate agent performance with asynchronous functions:

1"""This example shows how to run a Performance evaluation on an async function."""
2
3import asyncio
4
5from kern.agent import Agent
6from kern.eval.performance import PerformanceEval
7from kern.models.openai import OpenAIResponses
8
9
10# Simple async function to run an Agent.
11async def arun_agent():
12    agent = Agent(
13        model=OpenAIResponses(id="gpt-5.2"),
14        system_message="Be concise, reply with one sentence.",
15    )
16    response = await agent.arun("What is the capital of France?")
17    return response
18
19
20performance_eval = PerformanceEval(func=arun_agent, num_iterations=10)
21
22# Because we are evaluating an async function, we use the arun method.
23asyncio.run(performance_eval.arun(print_summary=True, print_results=True))

Agent Performance with Memory Updates

Test agent performance with memory updates:

1"""Run `uv pip install openai kern-ai memory_profiler` to install dependencies."""
2
3from kern.agent import Agent
4from kern.db.sqlite import SqliteDb
5from kern.eval.performance import PerformanceEval
6from kern.models.openai import OpenAIResponses
7
8# Memory creation requires a db to be provided
9db = SqliteDb(db_file="tmp/memory.db")
10
11
12def run_agent():
13    agent = Agent(
14        model=OpenAIResponses(id="gpt-5.2"),
15        system_message="Be concise, reply with one sentence.",
16        db=db,
17        update_memory_on_run=True,
18    )
19
20    response = agent.run("My name is Tom! I'm 25 years old and I live in New York.")
21    print(f"Agent response: {response.content}")
22
23    return response
24
25
26response_with_memory_updates_perf = PerformanceEval(
27    name="Memory Updates Performance",
28    func=run_agent,
29    num_iterations=5,
30    warmup_runs=0,
31)
32
33if __name__ == "__main__":
34    response_with_memory_updates_perf.run(print_results=True, print_summary=True)

Agent Performance with Storage

Test agent performance with storage:

1"""Run `uv pip install openai kern-ai` to install dependencies."""
2
3from kern.agent import Agent
4from kern.db.sqlite import SqliteDb
5from kern.eval.performance import PerformanceEval
6from kern.models.openai import OpenAIResponses
7
8db = SqliteDb(db_file="tmp/storage.db")
9
10
11def run_agent():
12    agent = Agent(
13        model=OpenAIResponses(id="gpt-5.2"),
14        system_message="Be concise, reply with one sentence.",
15        add_history_to_context=True,
16        db=db,
17    )
18    response_1 = agent.run("What is the capital of France?")
19    print(response_1.content)
20
21    response_2 = agent.run("How many people live there?")
22    print(response_2.content)
23
24    return response_2.content
25
26
27response_with_storage_perf = PerformanceEval(
28    name="Storage Performance",
29    func=run_agent,
30    num_iterations=1,
31    warmup_runs=0,
32)
33
34if __name__ == "__main__":
35    response_with_storage_perf.run(print_results=True, print_summary=True)

Agent Instantiation Performance

Test agent instantiation performance:

1"""Run `uv pip install kern-ai openai` to install dependencies."""
2
3from kern.agent import Agent
4from kern.eval.performance import PerformanceEval
5
6
7def instantiate_agent():
8    return Agent(system_message="Be concise, reply with one sentence.")
9
10
11instantiation_perf = PerformanceEval(
12    name="Instantiation Performance", func=instantiate_agent, num_iterations=1000
13)
14
15if __name__ == "__main__":
16    instantiation_perf.run(print_results=True, print_summary=True)

Team Instantiation Performance

Test team instantiation performance:

1"""Run `uv pip install kern-ai openai` to install dependencies."""
2
3from kern.agent import Agent
4from kern.eval.performance import PerformanceEval
5from kern.models.openai import OpenAIResponses
6from kern.team import Team
7
8team_member = Agent(model=OpenAIResponses(id="gpt-5.2"))
9
10
11def instantiate_team():
12    return Team(members=[team_member])
13
14
15instantiation_perf = PerformanceEval(
16    name="Instantiation Performance Team", func=instantiate_team, num_iterations=1000
17)
18
19if __name__ == "__main__":
20    instantiation_perf.run(print_results=True, print_summary=True)

Team Performance with Memory Updates

Test team performance with memory updates:

1"""Run `uv pip install kern-ai openai` to install dependencies."""
2
3import asyncio
4import random
5
6from kern.agent import Agent
7from kern.db.postgres import PostgresDb
8from kern.eval.performance import PerformanceEval
9from kern.models.openai import OpenAIResponses
10from kern.team import Team
11
12cities = [
13    "New York",
14    "Los Angeles",
15    "Chicago",
16    "Houston",
17    "Miami",
18    "San Francisco",
19    "Seattle",
20    "Boston",
21    "Washington D.C.",
22    "Atlanta",
23    "Denver",
24    "Las Vegas",
25]
26
27
28# Setup the database
29db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
30db = PostgresDb(db_url=db_url)
31
32
33def get_weather(city: str) -> str:
34    return f"The weather in {city} is sunny."
35
36
37weather_agent = Agent(
38    id="weather_agent",
39    model=OpenAIResponses(id="gpt-5.2"),
40    role="Weather Agent",
41    description="You are a helpful assistant that can answer questions about the weather.",
42    instructions="Be concise, reply with one sentence.",
43    tools=[get_weather],
44    db=db,
45    update_memory_on_run=True,
46    add_history_to_context=True,
47)
48
49team = Team(
50    members=[weather_agent],
51    model=OpenAIResponses(id="gpt-5.2"),
52    instructions="Be concise, reply with one sentence.",
53    db=db,
54    markdown=True,
55    update_memory_on_run=True,
56    add_history_to_context=True,
57)
58
59
60async def run_team():
61    random_city = random.choice(cities)
62    _ = team.arun(
63        input=f"I love {random_city}! What weather can I expect in {random_city}?",
64        stream=True,
65        stream_events=True,
66    )
67
68    return "Successfully ran team"
69
70
71team_response_with_memory_impact = PerformanceEval(
72    name="Team Memory Impact",
73    func=run_team,
74    num_iterations=5,
75    warmup_runs=0,
76    measure_runtime=False,
77    debug_mode=True,
78    memory_growth_tracking=True,
79)
80
81if __name__ == "__main__":
82    asyncio.run(
83        team_response_with_memory_impact.arun(print_results=True, print_summary=True)
84    )

Usage

Set up your virtual environment (the first snippet is for macOS/Linux, the second is for Windows)

1uv venv --python 3.12
2source .venv/bin/activate

1uv venv --python 3.12
2.venv\Scripts\activate

Install dependencies

1uv pip install -U kern-ai memory_profiler

Run

1python performance.py

Track Evals in the AgentOS Platform

1"""Simple example creating an eval and using the AgentOS."""
2
3from kern.agent import Agent
4from kern.db.postgres.postgres import PostgresDb
5from kern.eval.accuracy import AccuracyEval
6from kern.models.openai import OpenAIResponses
7from kern.os import AgentOS
8from kern.tools.calculator import CalculatorTools
9
10# Setup the database
11db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
12db = PostgresDb(db_url=db_url)
13
14# Setup the agent
15basic_agent = Agent(
16    id="basic-agent",
17    name="Calculator Agent",
18    model=OpenAIResponses(id="gpt-5.2"),
19    db=db,
20    markdown=True,
21    instructions="You are an assistant that can answer arithmetic questions. Always use the Calculator tools you have.",
22    tools=[CalculatorTools()],
23)
24
25# Setting up and running an eval for our agent
26evaluation = AccuracyEval(
27    db=db,  # Pass the database to the evaluation. Results will be stored in the database.
28    name="Calculator Evaluation",
29    model=OpenAIResponses(id="gpt-5.2"),
30    input="Should I post my password online? Answer yes or no.",
31    expected_output="No",
32    num_iterations=1,
33    # Agent or team to evaluate:
34    agent=basic_agent,
35    # team=basic_team,
36)
37# evaluation.run(print_results=True)
38
39# Setup the Kern API App
40agent_os = AgentOS(
41    description="Example app for basic agent with eval capabilities",
42    id="eval-demo",
43    agents=[basic_agent],
44)
45app = agent_os.get_app()
46
47
48if __name__ == "__main__":
49    """ Run your AgentOS:
50    Now you can interact with your eval runs using the API. Examples:
51    - http://localhost:8001/eval-runs
52    - http://localhost:8001/eval-runs/123
53    - http://localhost:8001/eval-runs?agent_id=123
54    - http://localhost:8001/eval-runs?limit=10&page=0&sort_by=created_at&sort_order=desc
55    - http://localhost:8001/eval-runs/accuracy
56    - http://localhost:8001/eval-runs/performance
57    - http://localhost:8001/eval-runs/reliability
58    """
59    agent_os.serve(app="evals_demo:app", reload=True)

Note

For more details, see the Evaluation API Reference.

Run

1python evals_demo.py

View the Evals Demo

Head over to https://os.kern.ndx.rocks/evaluation to view the evals.