Audio To Text

Give a transcript of this audio conversation. Use speaker A, speaker B to identify speakers.

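This example downloads a sample MP3 recording and asks a Gemini-backed agent to transcribe it, labelling the speakers as speaker A and speaker B.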

"""
Audio To Text
=============================

Download a sample MP3 recording and ask a Gemini-backed agent to produce a
transcript of the conversation, labelling the speakers as "speaker A" and
"speaker B".
"""

import requests
from kern.agent import Agent
from kern.media import Audio
from kern.models.google import Gemini

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
agent = Agent(
    model=Gemini(id="gemini-3-flash-preview"),
    markdown=True,
)

url = "https://kern-public.s3.us-east-1.amazonaws.com/demo_data/QA-01.mp3"

# Without a timeout a stalled connection would block forever; the value
# bounds the connect phase and each socket read, not the total download time.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of forwarding an error page to the
# model as if it were audio data.
response.raise_for_status()
audio_content = response.content

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Send the transcription prompt together with the downloaded audio bytes.
    agent.print_response(
        "Give a transcript of this audio conversation. Use speaker A, speaker B to identify speakers.",
        audio=[Audio(content=audio_content)],
        stream=True,
    )

Run the Example

# Clone and setup repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
# NOTE(review): this path is resolved relative to the directory entered by
# the `cd` above — confirm the setup script is reachable from there.
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

# Run the example
python audio_to_text.py