Custom Chunkers
This guide explains how to implement custom text chunking strategies.
The ContentChunker Trait
All chunkers implement the ContentChunker trait:
use async_trait::async_trait;
/// A strategy for splitting text content into chunks.
///
/// Implementors must be `Send + Sync`; `async_trait` is used so that `chunk`
/// can be declared as an `async fn` on the trait.
#[async_trait]
pub trait ContentChunker: Send + Sync {
/// Splits `content` into chunks and returns them as owned strings.
///
/// `size` is the requested chunk size. Its unit is not fixed by this
/// declaration; each implementation decides how (or whether) to honor it.
async fn chunk(&self, content: &str, size: usize) -> Vec<String>;
/// Returns the name that identifies this chunker.
fn name(&self) -> &str;
}
Basic Example: Sentence Chunker
use embedcache::ContentChunker;
use async_trait::async_trait;
/// Chunker that emits one chunk per sentence.
#[derive(Debug, Clone, Copy, Default)]
pub struct SentenceChunker;

#[async_trait]
impl ContentChunker for SentenceChunker {
    /// Splits `content` after every `.`, `!` or `?` and returns the trimmed
    /// sentences, each keeping its own terminator.
    ///
    /// Trailing text that has no terminator is returned unchanged (no period is
    /// invented for it). Fragments made up only of terminators and whitespace,
    /// such as the extra dots of an ellipsis, are dropped. `_size` is ignored
    /// because sentence boundaries alone determine the chunks.
    ///
    /// Limitation: a `.` inside a number or abbreviation ("3.14", "e.g.") is
    /// still treated as a sentence end.
    async fn chunk(&self, content: &str, _size: usize) -> Vec<String> {
        const TERMINATORS: &[char] = &['.', '!', '?'];

        content
            // `split_inclusive` keeps the terminator attached to its sentence.
            .split_inclusive(TERMINATORS)
            .map(str::trim)
            // Skip pieces that contain nothing besides terminators.
            .filter(|s| !s.trim_end_matches(TERMINATORS).is_empty())
            .map(String::from)
            .collect()
    }

    /// Returns the name this chunker is identified by: `"sentence"`.
    fn name(&self) -> &str {
        "sentence"
    }
}
Advanced Example: Paragraph Chunker
use embedcache::ContentChunker;
use async_trait::async_trait;
/// Chunker that emits one chunk per paragraph (text separated by a blank line).
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    /// Minimum paragraph length, in bytes, for a paragraph to be kept.
    min_length: usize,
}

impl ParagraphChunker {
    /// Creates a chunker that discards paragraphs shorter than `min_length` bytes.
    pub fn new(min_length: usize) -> Self {
        Self { min_length }
    }
}

#[async_trait]
impl ContentChunker for ParagraphChunker {
    /// Splits `content` on blank lines (`"\n\n"`), trims each paragraph, and
    /// keeps those that are non-empty and at least `min_length` bytes long.
    ///
    /// Empty paragraphs are always skipped, even when `min_length` is 0, so the
    /// result never contains empty chunks. `_size` is ignored.
    async fn chunk(&self, content: &str, _size: usize) -> Vec<String> {
        content
            .split("\n\n")
            .map(str::trim)
            .filter(|p| !p.is_empty() && p.len() >= self.min_length)
            // Allocate only for the paragraphs that survive the filter.
            .map(String::from)
            .collect()
    }

    /// Returns the name this chunker is identified by: `"paragraph"`.
    fn name(&self) -> &str {
        "paragraph"
    }
}
Async Chunking
The trait is async to support chunkers that need external services:
use embedcache::ContentChunker;
use async_trait::async_trait;
pub struct ApiBasedChunker {
api_url: String,
client: reqwest::Client,
}
#[async_trait]
impl ContentChunker for ApiBasedChunker {
async fn chunk(&self, content: &str, size: usize) -> Vec<String> {
// Call external API
match self.client
.post(&self.api_url)
.json(&serde_json::json!({
"content": content,
"chunk_size": size
}))
.send()
.await
{
Ok(response) => {
response.json::<Vec<String>>().await.unwrap_or_else(|_| {
// Fallback to simple chunking
vec![content.to_string()]
})
}
Err(_) => vec![content.to_string()],
}
}
fn name(&self) -> &str {
"api-chunker"
}
}
Registering Custom Chunkers
Add your chunker to the chunker map:
use std::collections::HashMap;
use embedcache::{ContentChunker, WordChunker, initialize_chunkers, LLMConfig};
/// Builds the chunker map: the defaults from `initialize_chunkers` plus a
/// `SentenceChunker`, stored under the name returned by its `name()` method.
///
/// An existing entry with the same name is replaced.
fn create_chunkers_with_custom(
    llm_config: Option<&LLMConfig>,
) -> HashMap<String, Box<dyn ContentChunker + Send + Sync>> {
    // Default chunkers form the base of the registry.
    let mut registry = initialize_chunkers(llm_config);

    // Read the key before the chunker is moved into its box.
    let custom = SentenceChunker;
    let key = custom.name().to_owned();
    registry.insert(key, Box::new(custom));

    registry
}
Using Custom Chunkers in API
Once registered, pass the name the chunker was registered under (here, "sentence") as chunking_type in API requests:
curl -X POST http://localhost:8081/v1/embed \
-H "Content-Type: application/json" \
-d '{
"text": ["Your text here..."],
"config": {
"chunking_type": "sentence",
"chunking_size": 256,
"embedding_model": "BGESmallENV15"
}
}'
Best Practices
1. Handle Edge Cases
async fn chunk(&self, content: &str, size: usize) -> Vec<String> {
// Handle empty content
if content.trim().is_empty() {
return vec![];
}
// Handle very short content
if content.len() < size {
return vec![content.to_string()];
}
// Normal chunking logic...
}
2. Preserve Context
/// Template showing how to carry context across chunk boundaries.
///
/// Every chunk after the first is prefixed with the last `size / 4`
/// whitespace-separated words of the chunk before it. The first chunk is
/// returned as-is, so the number of chunks is unchanged and a single chunk is
/// never dropped.
async fn chunk(&self, content: &str, size: usize) -> Vec<String> {
    let chunks: Vec<String> = /* your chunking logic */;

    // Add overlap for context: a quarter of the chunk size, counted in words.
    let overlap = size / 4;
    if overlap == 0 || chunks.len() < 2 {
        // Nothing to carry over; hand the chunks back untouched.
        return chunks;
    }

    let mut with_context = Vec::with_capacity(chunks.len());
    with_context.push(chunks[0].clone());
    for pair in chunks.windows(2) {
        let previous: Vec<&str> = pair[0].split_whitespace().collect();
        // `saturating_sub` keeps the slice start valid when the previous chunk
        // has fewer than `overlap` words.
        let tail = &previous[previous.len().saturating_sub(overlap)..];
        if tail.is_empty() {
            with_context.push(pair[1].clone());
        } else {
            with_context.push(format!("{} {}", tail.join(" "), pair[1]));
        }
    }
    with_context
}
3. Provide Fallbacks
/// Template showing a fallback path: try `complex_chunking` first and fall back
/// to plain word chunking when it fails or returns nothing.
///
/// In the fallback, `size` is the number of whitespace-separated words per
/// chunk. A `size` of 0 is treated as 1, because `slice::chunks` panics on a
/// zero chunk size.
async fn chunk(&self, content: &str, size: usize) -> Vec<String> {
    match self.complex_chunking(content, size).await {
        Ok(chunks) if !chunks.is_empty() => chunks,
        _ => {
            // Fallback to simple word chunking
            let words_per_chunk = size.max(1);
            content
                .split_whitespace()
                .collect::<Vec<&str>>()
                .chunks(words_per_chunk)
                .map(|words| words.join(" "))
                .collect()
        }
    }
}
Testing Chunkers
#[cfg(test)]
mod tests {
    use super::*;

    /// Each sentence becomes its own trimmed chunk that keeps its period.
    #[tokio::test]
    async fn test_sentence_chunker() {
        let chunker = SentenceChunker;
        let content = "First sentence. Second sentence. Third sentence.";
        let chunks = chunker.chunk(content, 10).await;
        // Compare full contents so a wrong split cannot pass on count alone.
        assert_eq!(
            chunks,
            ["First sentence.", "Second sentence.", "Third sentence."]
        );
    }

    /// Empty and whitespace-only input produce no chunks.
    #[tokio::test]
    async fn test_empty_content() {
        let chunker = SentenceChunker;
        let chunks = chunker.chunk("", 10).await;
        assert!(chunks.is_empty());

        let chunks = chunker.chunk("  \n\t ", 10).await;
        assert!(chunks.is_empty());
    }
}