Audio Streaming

The example writes mono audio (`CHANNELS = 1`); change `CHANNELS` to 2 for stereo.

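Stream a text + audio response from an agent, print the transcript as it arrives, and save the streamed PCM16 audio to a WAV file.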

"""
Audio Streaming
=============================

Stream a response from an audio-capable model and save the audio to a WAV file.

The agent is run with ``stream=True``. Each streamed event may carry a piece of
the transcript (printed as it arrives) and a base64-encoded chunk of PCM16
audio, which is decoded and written to ``tmp/response_stream.wav``.
"""

import base64
import wave
from pathlib import Path
from typing import Iterator

from kern.agent import Agent, RunOutputEvent
from kern.models.openai import OpenAIChat

# Audio Configuration
# These values are written into the WAV header below; they presumably have to
# match the raw PCM16 stream the model returns -- confirm against the model docs.
SAMPLE_RATE = 24000  # Hz (24kHz)
CHANNELS = 1  # Mono (Change to 2 if Stereo)
SAMPLE_WIDTH = 2  # Bytes (16 bits)

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
# Configure the model to return both text and audio for each response.
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={
            "voice": "alloy",
            "format": "pcm16",
        },  # Only pcm16 is supported with streaming
    ),
)

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    output_stream: Iterator[RunOutputEvent] = agent.run(
        "Tell me a 10 second story", stream=True
    )

    filename = Path("tmp/response_stream.wav")
    # wave.open() does not create missing directories, so make sure the
    # output directory exists before opening the file.
    filename.parent.mkdir(parents=True, exist_ok=True)

    # Open the file once in write-binary mode ("wb" replaces any existing file)
    # and keep it open for the whole stream so every chunk lands in one WAV.
    with wave.open(str(filename), "wb") as wav_file:
        wav_file.setnchannels(CHANNELS)
        wav_file.setsampwidth(SAMPLE_WIDTH)
        wav_file.setframerate(SAMPLE_RATE)

        # Iterate over generated audio
        for response in output_stream:
            # Not every streamed event is guaranteed to expose audio, so read
            # the attribute defensively instead of assuming it exists.
            response_audio = getattr(response, "response_audio", None)
            if not response_audio:
                continue

            if response_audio.transcript:
                print(response_audio.transcript, end="", flush=True)

            if not response_audio.content:
                continue

            # Only the decode step is guarded: a malformed chunk is reported
            # and skipped, while file-write failures are left to propagate
            # rather than being mislabelled as decoding errors.
            try:
                pcm_bytes = base64.b64decode(response_audio.content)
            except (ValueError, TypeError) as e:
                print(f"Error decoding audio: {e}")
                continue
            wav_file.writeframes(pcm_bytes)
    print()

Run the Example

# Clone and setup repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python audio_streaming.py