Audio Input Output

Fetch a sample WAV file, send its raw bytes to an audio-capable agent as input, and save the agent's audio reply to a file.

This example shows a single agent that accepts audio input and returns both text and audio output.

"""
Audio Input Output
=============================

Send a WAV recording to an audio-capable agent and save its audio reply.

The script downloads a sample WAV file, passes the raw bytes to the agent as
audio input, pretty-prints the text content of the response, and writes the
returned audio to ``tmp/result.wav``.
"""

from pathlib import Path

import requests
from kern.agent import Agent
from kern.media import Audio
from kern.models.openai import OpenAIChat
from kern.utils.audio import write_audio_to_file
from rich.pretty import pprint

# Fetch the sample audio file as raw WAV bytes (no base64 conversion is done
# here; the bytes are handed to ``Audio`` unchanged).
url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
# A timeout keeps the script from hanging forever if the host is unreachable.
response = requests.get(url, timeout=30)
response.raise_for_status()
wav_data = response.content

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "sage", "format": "wav"},
    ),
    markdown=True,
)

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    run_response = agent.run(
        "What's in these recording?",
        audio=[Audio(content=wav_data, format="wav")],
    )

    # Only print and save when the model actually returned audio.
    if run_response.response_audio is not None:
        pprint(run_response.content)

        output_file = "tmp/result.wav"
        # Make sure the output directory exists so the write below does not
        # fail on a fresh checkout.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        write_audio_to_file(
            audio=run_response.response_audio.content, filename=output_file
        )

Run the Example

# Clone and set up the repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
# NOTE(review): these paths are resolved relative to the directory entered
# above; confirm that scripts/ and .venvs/ exist there rather than at the
# repository root.
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

# Run the example
python audio_input_output.py