Media Input For Tool

Example showing how tools can access media (images, videos, audio, files) passed to the agent.

"""
Media Input For Tool
=============================

Example showing how tools can access media (images, videos, audio, files) passed to the agent.
"""
7
from typing import Optional, Sequence

from kern.agent import Agent
from kern.media import File
from kern.models.google import Gemini
from kern.models.openai import OpenAIResponses  # noqa: F401
from kern.tools import Toolkit
15
16
# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
class DocumentProcessingTools(Toolkit):
    """Toolkit that registers a single simulated PDF text-extraction tool."""

    def __init__(self) -> None:
        # Hand the bound method to the Toolkit base class as this toolkit's only tool.
        super().__init__(
            name="document_processing_tools",
            tools=[self.extract_text_from_pdf],
        )

    def extract_text_from_pdf(self, files: Optional[Sequence[File]] = None) -> str:
        """
        Extract text from uploaded PDF files using OCR.

        This tool can access any files that were passed to the agent.
        In a real implementation, you would use a proper OCR service.

        Args:
            files: Files passed to the agent (automatically injected)

        Returns:
            Extracted text from the PDF files
        """
        # NOTE(review): the docstring above is kept verbatim; presumably the
        # framework surfaces it to the model as the tool description, so
        # confirm before rewording it.
        if not files:
            return "No files were uploaded to process."

        # Echo the received files so the demo shows they reached the tool.
        print(f"--> Files: {files}")

        extracted_texts = []
        # Files are reported to the user with 1-based numbering.
        for position, file in enumerate(files, start=1):
            if file.content:
                # Simulate OCR processing
                # In reality, you'd use a service like Tesseract, AWS Textract, etc.
                file_size = len(file.content)
                # The literal's leading newline and indentation are part of
                # the text returned to the caller.
                extracted_text = f"""
                    [SIMULATED OCR RESULT FOR FILE {position}]
                    Document processed successfully!
                    File size: {file_size} bytes

                    Sample extracted content:
                    "This is a sample document with important information about quarterly sales figures.
                    Q1 Revenue: $125,000
                    Q2 Revenue: $150,000
                    Q3 Revenue: $175,000

                    The growth trend shows a 20% increase quarter over quarter."
                """
                extracted_texts.append(extracted_text)
            else:
                extracted_texts.append(
                    f"File {position}: Content is empty or inaccessible."
                )

        # One entry per input file, separated by a blank line.
        return "\n\n".join(extracted_texts)
72
73
def create_sample_pdf_content() -> bytes:
    """Return placeholder bytes standing in for an uploaded PDF.

    The payload is UTF-8 encoded text that carries a ``%PDF-1.4`` marker; it
    is not a parseable PDF. The leading newline and the four-space indentation
    of each line are part of the returned bytes.

    Returns:
        The encoded placeholder content.
    """
    # This is just sample content - in reality you'd have actual PDF bytes
    sample_text = """
    %PDF-1.4
    Sample PDF content for demonstration
    This would be actual PDF binary data in a real scenario
    """
    return sample_text.encode("utf-8")
83
84
def main() -> None:
    """Run the demo: pass one in-memory sample file to a tool-equipped agent and print its reply."""
    # Create an agent with document processing tools
    agent = Agent(
        # model=OpenAIResponses(id="gpt-5.2"),
        model=Gemini(id="gemini-2.5-pro"),
        tools=[DocumentProcessingTools()],
        name="Document Processing Agent",
        description="An agent that can process uploaded documents. Use the tool to extract text from the PDF.",
        debug_mode=True,
        # NOTE(review): presumably this keeps the raw file out of the model
        # request so that only the tool reads it; confirm against the Agent docs.
        send_media_to_model=False,
        store_media=True,
    )

    print("=== Tool Media Access Example ===\n")

    # Example 1: PDF Processing
    print("1. Testing PDF processing...")

    # Create sample file content
    pdf_content = create_sample_pdf_content()
    sample_file = File(content=pdf_content)

    # The file travels with the run via `files`, separate from the text prompt.
    response = agent.run(
        input="I've uploaded a PDF document. Please extract the text from it and summarize the key financial information.",
        files=[sample_file],
        session_id="test_files",
    )

    print(f"Agent Response: {response.content}")
    print("\n" + "=" * 50 + "\n")
115
116
# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Run the Example

# Clone and set up repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python media_input_for_tool.py