import { encode } from "gpt-tokenizer"; import crypto from "crypto"; export interface DocumentChunk { content: string; chunkIndex: number; title?: string; context?: string; startPosition: number; endPosition: number; contentHash: string; // Hash for change detection } export interface ChunkedDocument { documentId: string; title: string; originalContent: string; chunks: DocumentChunk[]; totalChunks: number; contentHash: string; // Hash of the entire document chunkHashes: string[]; // Array of chunk hashes for change detection } /** * Document chunking service that splits large documents into semantic chunks * Targets 10-15k tokens per chunk with natural paragraph boundaries */ export class DocumentChunker { private readonly TARGET_CHUNK_SIZE = 12500; // Middle of 10-15k range private readonly MIN_CHUNK_SIZE = 10000; private readonly MAX_CHUNK_SIZE = 15000; private readonly MIN_PARAGRAPH_SIZE = 100; // Minimum tokens for a paragraph to be considered /** * Chunk a document into semantic sections with natural boundaries */ async chunkDocument( originalContent: string, title: string, ): Promise { const documentId = crypto.randomUUID(); const contentHash = this.generateContentHash(originalContent); // First, split by major section headers (markdown style) const majorSections = this.splitByMajorSections(originalContent); const chunks: DocumentChunk[] = []; let currentChunk = ""; let currentChunkStart = 0; let chunkIndex = 0; for (const section of majorSections) { const sectionTokens = encode(section.content).length; const currentChunkTokens = encode(currentChunk).length; // If adding this section would exceed max size, finalize current chunk if (currentChunkTokens > 0 && currentChunkTokens + sectionTokens > this.MAX_CHUNK_SIZE) { if (currentChunkTokens >= this.MIN_CHUNK_SIZE) { chunks.push(this.createChunk( currentChunk, chunkIndex, currentChunkStart, currentChunkStart + currentChunk.length, section.title )); chunkIndex++; currentChunk = ""; currentChunkStart = section.startPosition; } } // Add section to current chunk if (currentChunk) { currentChunk += "\n\n" + section.content; } else { currentChunk = section.content; currentChunkStart = section.startPosition; } // If current chunk is large enough and we have a natural break, consider chunking const updatedChunkTokens = encode(currentChunk).length; if (updatedChunkTokens >= this.TARGET_CHUNK_SIZE) { // Try to find a good breaking point within the section const paragraphs = this.splitIntoParagraphs(section.content); if (paragraphs.length > 1) { // Split at paragraph boundary if beneficial const optimalSplit = this.findOptimalParagraphSplit(currentChunk); if (optimalSplit) { chunks.push(this.createChunk( optimalSplit.beforeSplit, chunkIndex, currentChunkStart, currentChunkStart + optimalSplit.beforeSplit.length, section.title )); chunkIndex++; currentChunk = optimalSplit.afterSplit; currentChunkStart = currentChunkStart + optimalSplit.beforeSplit.length; } } } } // Add remaining content as final chunk if (currentChunk.trim() && encode(currentChunk).length >= this.MIN_PARAGRAPH_SIZE) { chunks.push(this.createChunk( currentChunk, chunkIndex, currentChunkStart, originalContent.length )); } // Generate chunk hashes array const chunkHashes = chunks.map(chunk => chunk.contentHash); return { documentId, title, originalContent, chunks, totalChunks: chunks.length, contentHash, chunkHashes, }; } private splitByMajorSections(content: string): Array<{ content: string; title?: string; startPosition: number; endPosition: number; }> { const sections: Array<{ content: string; title?: string; startPosition: number; endPosition: number; }> = []; // Split by markdown headers (# ## ### etc.) or common document patterns const headerRegex = /^(#{1,6}\s+.*$|={3,}$|-{3,}$)/gm; const matches = Array.from(content.matchAll(headerRegex)); if (matches.length === 0) { // No headers found, treat as single section sections.push({ content: content.trim(), startPosition: 0, endPosition: content.length, }); return sections; } let lastIndex = 0; for (let i = 0; i < matches.length; i++) { const match = matches[i]; const nextMatch = matches[i + 1]; const sectionStart = lastIndex; const sectionEnd = nextMatch ? nextMatch.index! : content.length; const sectionContent = content.slice(sectionStart, sectionEnd).trim(); if (sectionContent) { sections.push({ content: sectionContent, title: this.extractSectionTitle(match[0]), startPosition: sectionStart, endPosition: sectionEnd, }); } lastIndex = match.index! + match[0].length; } return sections; } private extractSectionTitle(header: string): string | undefined { // Extract title from markdown header const markdownMatch = header.match(/^#{1,6}\s+(.+)$/); if (markdownMatch) { return markdownMatch[1].trim(); } return undefined; } private splitIntoParagraphs(content: string): string[] { // Split by double newlines (paragraph breaks) and filter out empty strings return content .split(/\n\s*\n/) .map(p => p.trim()) .filter(p => p.length > 0); } private findOptimalParagraphSplit(content: string): { beforeSplit: string; afterSplit: string; } | null { const paragraphs = this.splitIntoParagraphs(content); if (paragraphs.length < 2) return null; let bestSplitIndex = -1; let bestScore = 0; // Find the split that gets us closest to target size for (let i = 1; i < paragraphs.length; i++) { const beforeSplit = paragraphs.slice(0, i).join("\n\n"); const afterSplit = paragraphs.slice(i).join("\n\n"); const beforeTokens = encode(beforeSplit).length; const afterTokens = encode(afterSplit).length; // Score based on how close we get to target, avoiding too small chunks if (beforeTokens >= this.MIN_CHUNK_SIZE && afterTokens >= this.MIN_PARAGRAPH_SIZE) { const beforeDistance = Math.abs(beforeTokens - this.TARGET_CHUNK_SIZE); const score = 1 / (1 + beforeDistance); // Higher score for closer to target if (score > bestScore) { bestScore = score; bestSplitIndex = i; } } } if (bestSplitIndex > 0) { return { beforeSplit: paragraphs.slice(0, bestSplitIndex).join("\n\n"), afterSplit: paragraphs.slice(bestSplitIndex).join("\n\n"), }; } return null; } private createChunk( content: string, chunkIndex: number, startPosition: number, endPosition: number, title?: string ): DocumentChunk { // Generate a concise context/title if not provided const context = title || this.generateChunkContext(content); const contentHash = this.generateContentHash(content.trim()); return { content: content.trim(), chunkIndex, title: context, context: `Chunk ${chunkIndex + 1}${context ? `: ${context}` : ""}`, startPosition, endPosition, contentHash, }; } private generateChunkContext(content: string): string { // Extract first meaningful line as context (avoiding markdown syntax) const lines = content.split('\n').map(line => line.trim()).filter(Boolean); for (const line of lines.slice(0, 3)) { // Skip markdown headers and find first substantial content if (!line.match(/^#{1,6}\s/) && !line.match(/^[=-]{3,}$/) && line.length > 10) { return line.substring(0, 100) + (line.length > 100 ? "..." : ""); } } return "Document content"; } /** * Generate content hash for change detection */ private generateContentHash(content: string): string { return crypto.createHash('sha256').update(content, 'utf8').digest('hex').substring(0, 16); } /** * Compare chunk hashes to detect changes */ static compareChunkHashes(oldHashes: string[], newHashes: string[]): { changedIndices: number[]; changePercentage: number; } { const maxLength = Math.max(oldHashes.length, newHashes.length); const changedIndices: number[] = []; for (let i = 0; i < maxLength; i++) { const oldHash = oldHashes[i]; const newHash = newHashes[i]; // Mark as changed if hash is different or chunk added/removed if (oldHash !== newHash) { changedIndices.push(i); } } const changePercentage = maxLength > 0 ? (changedIndices.length / maxLength) * 100 : 0; return { changedIndices, changePercentage, }; } /** * Calculate document size in tokens for threshold decisions */ static getDocumentSizeInTokens(content: string): number { return encode(content).length; } }