Custom Chunking

Custom chunking allows you to implement your own chunking strategy by creating a class that inherits from ChunkingStrategy. This is useful when you need to split documents based on specific separators, apply custom logic, or handle domain-specific content formats.

Create a Python file

from typing import List
import asyncio
from kern.agent import Agent
from kern.knowledge.chunking.base import ChunkingStrategy
from kern.knowledge.content import Document
from kern.knowledge.knowledge import Knowledge
from kern.knowledge.reader.pdf_reader import PDFReader
from kern.vectordb.pgvector import PgVector

class CustomChunking(ChunkingStrategy):
    """Chunking strategy that splits a document's content on a fixed separator.

    Args:
        separator: The string on which the document content is split.
            Defaults to ``"---"``.
        **kwargs: Accepted so extra keyword arguments do not raise; they are
            not used by this class.
    """

    def __init__(self, separator: str = "---", **kwargs):
        self.separator = separator

    def chunk(self, document: Document) -> List[Document]:
        """Split ``document`` into one ``Document`` per non-empty piece.

        The content is split on ``self.separator``; each piece is passed
        through the inherited ``clean_text`` method, and pieces that are empty
        after cleaning are skipped.

        Args:
            document: The source document whose ``content`` is split.

        Returns:
            The list of chunk documents, in their original order. Each chunk
            carries a copy of the source ``meta_data`` with an added
            ``"chunk"`` entry.
        """
        # Split by custom separator
        chunks = document.content.split(self.separator)

        result = []
        for i, chunk_content in enumerate(chunks):
            chunk_content = self.clean_text(chunk_content)  # Use inherited method
            if chunk_content:
                # Copy so the chunk's metadata does not alias the source document's dict.
                meta_data = document.meta_data.copy()
                # 1-based position among ALL split pieces; numbers can skip
                # when empty pieces are dropped.
                meta_data["chunk"] = i + 1
                result.append(
                    Document(
                        id=f"{document.id}_{i+1}" if document.id else None,
                        name=document.name,
                        meta_data=meta_data,
                        content=chunk_content,
                    )
                )
        return result
# Connection string for the PgVector container started in the "Run PgVector" step
# (user "ai", password "ai", database "ai", host port 5532).
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Knowledge base backed by a PgVector table.
knowledge = Knowledge(
    vector_db=PgVector(table_name="recipes_custom_chunking", db_url=db_url),
)

# Load the PDF through a reader that uses the custom chunking strategy.
# ainsert is awaited via asyncio.run because this is a synchronous script.
asyncio.run(
    knowledge.ainsert(
        url="https://kern-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf",
        reader=PDFReader(
            name="Custom Chunking Reader",
            chunking_strategy=CustomChunking(separator="---"),
        ),
    )
)

# Agent configured to search the knowledge base.
agent = Agent(
    knowledge=knowledge,
    search_knowledge=True,
)

agent.print_response("How to make Thai curry?", markdown=True)

Set up your virtual environment

macOS/Linux:

uv venv --python 3.12
source .venv/bin/activate

Windows:

uv venv --python 3.12
.venv\Scripts\activate

Install dependencies

uv pip install -U kern-ai sqlalchemy psycopg pgvector

Run PgVector

docker run -d \
  -e POSTGRES_DB=ai \
  -e POSTGRES_USER=ai \
  -e POSTGRES_PASSWORD=ai \
  -e PGDATA=/var/lib/postgresql/data/pgdata \
  -v pgvolume:/var/lib/postgresql/data \
  -p 5532:5432 \
  --name pgvector \
  kern/pgvector:16

Run the script

python custom_chunking.py

Custom Chunking Params

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `separator` | `str` | `"---"` | The string used to split the document content into chunks. |