Image To Audio

Generate a short fiction story from an image with one agent, then narrate that story as audio with a second agent.

Image To Audio.

"""
Image To Audio
=============================

Generate a short story from an image, then narrate that story as audio.

When executed as a script, two agents run in sequence:

1. ``image_agent`` (model id ``gpt-4o``) is given ``sample.jpg`` from this
   file's directory and asked for a three-sentence fiction story.
2. A second agent (model id ``gpt-4o-audio-preview``, configured for text and
   audio output) is asked to narrate that story. If the run returns audio,
   its content is written to ``tmp/sample_story.wav``.
"""

from pathlib import Path

from kern.agent import Agent, RunOutput
from kern.media import Image
from kern.models.openai import OpenAIChat
from kern.utils.audio import write_audio_to_file
from rich import print
from rich.markup import escape
from rich.text import Text

cwd = Path(__file__).parent.resolve()

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
# Agent that receives the image and writes the story.
image_agent = Agent(model=OpenAIChat(id="gpt-4o"))

# The sample image is looked up next to this file, not in the working directory.
image_path = Path(__file__).parent.joinpath("sample.jpg")

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Step 1: ask the image agent for a short story about the picture.
    image_story: RunOutput = image_agent.run(
        "Write a 3 sentence fiction story about the image",
        images=[Image(filepath=image_path)],
    )

    # The story is model-generated text and may contain square brackets.
    # Escape it so Rich does not interpret them as markup tags (which could
    # drop text or raise MarkupError); only our own tags are parsed.
    story_markup = escape(str(image_story.content))
    formatted_text = Text.from_markup(
        f":sparkles: [bold magenta]Story:[/bold magenta] {story_markup} :sparkles:"
    )
    print(formatted_text)

    # Step 2: a separate agent with audio output enabled narrates the story.
    audio_agent = Agent(
        model=OpenAIChat(
            id="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": "sage", "format": "wav"},
        ),
    )

    # The narration prompt uses the raw (unescaped) story text.
    audio_story: RunOutput = audio_agent.run(
        f"Narrate the story with flair: {image_story.content}"
    )

    # Only write a file when the run actually returned audio.
    if audio_story.response_audio is not None:
        # NOTE(review): the path is relative, so it resolves against the
        # process working directory — confirm write_audio_to_file creates
        # the tmp/ directory when it is missing.
        write_audio_to_file(
            audio=audio_story.response_audio.content, filename="tmp/sample_story.wav"
        )

Run the Example

# Clone and setup repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python image_to_audio.py