Media Input For Tool
Example showing how tools can access media (images, videos, audio, files) passed to the agent.
"""
Media Input For Tool
=============================

Example showing how tools can access media (images, videos, audio, files) passed to the agent.
"""

from typing import Optional, Sequence

from kern.agent import Agent
from kern.media import File
from kern.models.google import Gemini
from kern.models.openai import OpenAIResponses  # noqa: F401
from kern.tools import Toolkit


# ---------------------------------------------------------------------------
# Create Agent
# ---------------------------------------------------------------------------
class DocumentProcessingTools(Toolkit):
    """Toolkit that registers a single simulated PDF text-extraction tool."""

    def __init__(self) -> None:
        tools = [
            self.extract_text_from_pdf,
        ]

        super().__init__(name="document_processing_tools", tools=tools)

    def extract_text_from_pdf(self, files: Optional[Sequence[File]] = None) -> str:
        """
        Extract text from uploaded PDF files using OCR.

        This tool can access any files that were passed to the agent.
        In a real implementation, you would use a proper OCR service.

        Args:
            files: Files passed to the agent (automatically injected)

        Returns:
            Extracted text from the PDF files, one section per file joined by
            blank lines, or a notice when no files were supplied.
        """
        if not files:
            return "No files were uploaded to process."

        print(f"--> Files: {files}")

        extracted_texts = []
        for i, file in enumerate(files):
            if file.content:
                # Simulate OCR processing
                # In reality, you'd use a service like Tesseract, AWS Textract, etc.
                file_size = len(file.content)
                extracted_text = f"""
                [SIMULATED OCR RESULT FOR FILE {i + 1}]
                Document processed successfully!
                File size: {file_size} bytes

                Sample extracted content:
                "This is a sample document with important information about quarterly sales figures.
                Q1 Revenue: $125,000
                Q2 Revenue: $150,000
                Q3 Revenue: $175,000

                The growth trend shows a 20% increase quarter over quarter."
                """
                extracted_texts.append(extracted_text)
            else:
                # Falsy content (None or empty) gets a per-file notice instead of a result.
                extracted_texts.append(
                    f"File {i + 1}: Content is empty or inaccessible."
                )

        return "\n\n".join(extracted_texts)


def create_sample_pdf_content() -> bytes:
    """Create a sample PDF-like content for demonstration."""
    # This is just sample binary content - in reality you'd have actual PDF bytes
    sample_content = """
    %PDF-1.4
    Sample PDF content for demonstration
    This would be actual PDF binary data in a real scenario
    """.encode("utf-8")
    return sample_content


def main() -> None:
    """Build the demo agent, hand it a sample file, and print its response."""
    # Create an agent with document processing tools
    agent = Agent(
        # model=OpenAIResponses(id="gpt-5.2"),
        model=Gemini(id="gemini-2.5-pro"),
        tools=[DocumentProcessingTools()],
        name="Document Processing Agent",
        description="An agent that can process uploaded documents. Use the tool to extract text from the PDF.",
        debug_mode=True,
        send_media_to_model=False,
        store_media=True,
    )

    print("=== Tool Media Access Example ===\n")

    # Example 1: PDF Processing
    print("1. Testing PDF processing...")

    # Create sample file content
    pdf_content = create_sample_pdf_content()
    sample_file = File(content=pdf_content)

    response = agent.run(
        input="I've uploaded a PDF document. Please extract the text from it and summarize the key financial information.",
        files=[sample_file],
        session_id="test_files",
    )

    print(f"Agent Response: {response.content}")
    print("\n" + "=" * 50 + "\n")


# ---------------------------------------------------------------------------
# Run Agent
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    main()

# Run the Example
# Clone and setup repo
git clone https://github.com/kern-ai/kern.git
cd kern/cookbook/02_agents/12_multimodal

# Create and activate virtual environment
./scripts/demo_setup.sh
source .venvs/demo/bin/activate

python media_input_for_tool.py