Audio Output Agent

Code

from kern.agent import Agent, RunOutput  # noqa
from kern.models.openai import OpenAIChat
from kern.utils.audio import write_audio_to_file
from kern.db.in_memory import InMemoryDb

# Configure the model to return both text and audio (WAV, "sage" voice).
# The agent is given a db and add_history_to_context=True; presumably this is
# what lets the second prompt below refer back to the first story.
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "sage", "format": "wav"},
    ),
    db=InMemoryDb(),
    add_history_to_context=True,
    markdown=True,
)

# First run: ask for a short story.
run_output: RunOutput = agent.run("Tell me a 5 second scary story")

# Save the response audio to a file (skipped when the run returned no audio).
if run_output.response_audio:
    write_audio_to_file(
        audio=run_output.response_audio.content, filename="tmp/scary_story.wav"
    )

# Second run on the same agent: a follow-up question about the first story.
run_output = agent.run("What would be in a sequal of this story?")

# Save the follow-up response audio to a separate file.
if run_output.response_audio:
    write_audio_to_file(
        audio=run_output.response_audio.content,
        filename="tmp/scary_story_sequal.wav",
    )

Usage

Set up your virtual environment

macOS/Linux:

uv venv --python 3.12
source .venv/bin/activate

Windows:

uv venv --python 3.12
.venv\Scripts\activate

Set your API key

export OPENAI_API_KEY=xxx

Install dependencies

uv pip install -U openai kern-ai

Run Agent

python cookbook/11_models/openai/chat/audio_output_agent.py