Image To Audio

Generate a short story from an image with one agent, then have a second agent narrate that story and save the narration as a WAV audio file.

Image To Audio.

"""
Image To Audio
=============================

Generate a three-sentence story about an image, then narrate it as audio.

The first agent (``gpt-4o``) is given ``sample.jpg`` from this script's
directory and asked to write the story. A second agent
(``gpt-4o-audio-preview``, configured for text + audio output in WAV format)
is asked to narrate that story. If the run returns audio, it is written to
``tmp/sample_story.wav``.
"""

from pathlib import Path

from kern.agent import Agent, RunOutput
from kern.media import Image
from kern.models.openai import OpenAIChat
from kern.utils.audio import write_audio_to_file
from rich import print
from rich.markup import escape
from rich.text import Text

cwd = Path(__file__).parent.resolve()

# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
image_agent = Agent(model=OpenAIChat(id="gpt-4o"))

# The sample image is expected to sit next to this script.
image_path = Path(__file__).parent.joinpath("sample.jpg")

# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Step 1: ask the vision-capable agent for a short story about the image.
    image_story: RunOutput = image_agent.run(
        "Write a 3 sentence fiction story about the image",
        images=[Image(filepath=image_path)],
    )

    # The story is model-generated text: escape it so that any square
    # brackets it contains are printed literally instead of being parsed as
    # Rich markup tags (which can raise or silently drop text).
    story_markup = escape(str(image_story.content))
    formatted_text = Text.from_markup(
        f":sparkles: [bold magenta]Story:[/bold magenta] {story_markup} :sparkles:"
    )
    print(formatted_text)

    # Step 2: a second agent whose model is configured to return audio.
    audio_agent = Agent(
        model=OpenAIChat(
            id="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": "sage", "format": "wav"},
        ),
    )

    audio_story: RunOutput = audio_agent.run(
        f"Narrate the story with flair: {image_story.content}"
    )

    if audio_story.response_audio is not None:
        output_file = "tmp/sample_story.wav"
        # Make sure the target directory exists before writing; the helper's
        # implementation is not visible here, so do not rely on it to create
        # missing directories.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        write_audio_to_file(
            audio=audio_story.response_audio.content, filename=output_file
        )
    else:
        # Surface the "no audio" case instead of exiting silently.
        print("[yellow]No audio was returned by the audio agent.[/yellow]")

Run the Example

# Clone and set up repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python image_to_audio.py