Custom Chunking
Custom chunking allows you to implement your own chunking strategy by creating a class that inherits from ChunkingStrategy. This is useful when you need to split documents based on specific separators, apply custom logic, or handle domain-specific content formats.
Create a Python file
```python
from typing import List
import asyncio
from kern.agent import Agent
from kern.knowledge.chunking.base import ChunkingStrategy
from kern.knowledge.content import Document
from kern.knowledge.knowledge import Knowledge
from kern.knowledge.reader.pdf_reader import PDFReader
from kern.vectordb.pgvector import PgVector

class CustomChunking(ChunkingStrategy):
    def __init__(self, separator: str = "---", **kwargs):
        self.separator = separator

    def chunk(self, document: Document) -> List[Document]:
        # Split by custom separator
        chunks = document.content.split(self.separator)

        result = []
        for i, chunk_content in enumerate(chunks):
            chunk_content = self.clean_text(chunk_content)  # Use inherited method
            if chunk_content:
                meta_data = document.meta_data.copy()
                meta_data["chunk"] = i + 1
                result.append(Document(
                    id=f"{document.id}_{i+1}" if document.id else None,
                    name=document.name,
                    meta_data=meta_data,
                    content=chunk_content
                ))
        return result

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

knowledge = Knowledge(
    vector_db=PgVector(table_name="recipes_custom_chunking", db_url=db_url),
)

asyncio.run(knowledge.ainsert(
    url="https://kern-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf",
    reader=PDFReader(
        name="Custom Chunking Reader",
        chunking_strategy=CustomChunking(separator="---"),
    ),
))

agent = Agent(
    knowledge=knowledge,
    search_knowledge=True,
)

agent.print_response("How to make Thai curry?", markdown=True)
```

Set up your virtual environment
Mac/Linux:

```bash
uv venv --python 3.12
source .venv/bin/activate
```

Windows:

```bash
uv venv --python 3.12
.venv\Scripts\activate
```

Install dependencies
```bash
uv pip install -U kern-ai sqlalchemy psycopg pgvector
```

Run PgVector
```bash
docker run -d \
  -e POSTGRES_DB=ai \
  -e POSTGRES_USER=ai \
  -e POSTGRES_PASSWORD=ai \
  -e PGDATA=/var/lib/postgresql/data/pgdata \
  -v pgvolume:/var/lib/postgresql/data \
  -p 5532:5432 \
  --name pgvector \
  kern/pgvector:16
```

Run the script
```bash
python custom_chunking.py
```

Custom Chunking Params
| Parameter | Type | Default | Description |
|---|---|---|---|
| `separator` | `str` | `"---"` | The string used to split the document content into chunks. |