Media Input For Tool

Example showing how tools can access media (images, videos, audio, files) passed to the agent.

"""
Media Input For Tool
=============================

Example showing how tools can access media (images, videos, audio, files) passed to the agent.
"""
7
8from typing import Optional, Sequence
9
10from kern.agent import Agent
11from kern.media import File
12from kern.models.google import Gemini
13from kern.models.openai import OpenAIResponses # noqa: F401
14from kern.tools import Toolkit
15
16
17# ---------------------------------------------------------------------------
18# Create Agent
19# ---------------------------------------------------------------------------
class DocumentProcessingTools(Toolkit):
    """Toolkit with one demo tool that reads the files passed to the agent.

    The tool does not perform real OCR; it returns a canned report per file
    so the example can show a tool receiving media through its ``files``
    parameter.
    """

    def __init__(self):
        """Register ``extract_text_from_pdf`` as this toolkit's only tool."""
        super().__init__(
            name="document_processing_tools",
            tools=[self.extract_text_from_pdf],
        )

    # NOTE(review): the docstring below is kept verbatim because the framework
    # presumably exposes it to the model as the tool description — confirm.
    def extract_text_from_pdf(self, files: Optional[Sequence[File]] = None) -> str:
        """
        Extract text from uploaded PDF files using OCR.

        This tool can access any files that were passed to the agent.
        In a real implementation, you would use a proper OCR service.

        Args:
            files: Files passed to the agent (automatically injected)

        Returns:
            Extracted text from the PDF files
        """
        # Nothing to do when no files were supplied (None or empty sequence).
        if not files:
            return "No files were uploaded to process."

        print(f"--> Files: {files}")

        reports = []
        # Files are reported with 1-based numbering.
        for position, document in enumerate(files, start=1):
            if not document.content:
                reports.append(
                    f"File {position}: Content is empty or inaccessible."
                )
            else:
                # Simulate OCR processing
                # In reality, you'd use a service like Tesseract, AWS Textract, etc.
                byte_count = len(document.content)
                simulated_text = f"""
                [SIMULATED OCR RESULT FOR FILE {position}]
                Document processed successfully!
                File size: {byte_count} bytes

                Sample extracted content:
                "This is a sample document with important information about quarterly sales figures.
                Q1 Revenue: $125,000
                Q2 Revenue: $150,000
                Q3 Revenue: $175,000

                The growth trend shows a 20% increase quarter over quarter."
                """
                reports.append(simulated_text)

        # One blank line separates the per-file reports.
        return "\n\n".join(reports)
72
73
def create_sample_pdf_content() -> bytes:
    """Return UTF-8 encoded placeholder bytes that stand in for a PDF.

    The payload is plain text containing a ``%PDF-1.4`` marker line; it is
    not a real PDF document and exists only for the demonstration.
    """
    # This is just sample content - in reality you'd have actual PDF bytes
    placeholder_text = """
    %PDF-1.4
    Sample PDF content for demonstration
    This would be actual PDF binary data in a real scenario
    """
    return placeholder_text.encode("utf-8")
83
84
def main():
    """Run the demo: hand a sample file to the agent and print its reply.

    Builds the agent with the document-processing toolkit, wraps the sample
    bytes in a ``File``, runs one request with that file attached, and prints
    the response content followed by a separator line.
    """
    # Create an agent with document processing tools
    # Alternative model, kept for easy switching:
    # model=OpenAIResponses(id="gpt-5.2"),
    model = Gemini(id="gemini-2.5-pro")
    toolkit = DocumentProcessingTools()
    document_agent = Agent(
        model=model,
        tools=[toolkit],
        name="Document Processing Agent",
        description="An agent that can process uploaded documents. Use the tool to extract text from the PDF.",
        debug_mode=True,
        # NOTE(review): presumably keeps the media out of the model request so
        # that only the tool reads it — confirm against the Agent API.
        send_media_to_model=False,
        store_media=True,
    )

    print("=== Tool Media Access Example ===\n")

    # Example 1: PDF Processing
    print("1. Testing PDF processing...")

    # Wrap the sample bytes in a File so they can be attached to the run
    attachment = File(content=create_sample_pdf_content())

    run_output = document_agent.run(
        input="I've uploaded a PDF document. Please extract the text from it and summarize the key financial information.",
        files=[attachment],
        session_id="test_files",
    )

    print(f"Agent Response: {run_output.content}")
    print(f"\n{'=' * 50}\n")
115
116
117# ---------------------------------------------------------------------------
118# Run Agent
119# ---------------------------------------------------------------------------
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Run the Example

# Clone and setup repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python media_input_for_tool.py