// core/apps/webapp/app/services/documentChunker.server.ts
import { encode } from "gpt-tokenizer";
import crypto from "crypto";

export interface DocumentChunk {
  content: string;
  chunkIndex: number;
  title?: string;
  context?: string;
  startPosition: number;
  endPosition: number;
  contentHash: string; // Hash for change detection
}

export interface ChunkedDocument {
  documentId: string;
  title: string;
  originalContent: string;
  chunks: DocumentChunk[];
  totalChunks: number;
  contentHash: string; // Hash of the entire document
  chunkHashes: string[]; // Array of chunk hashes for change detection
}

/**
 * Document chunking service that splits large documents into semantic chunks.
 * Targets roughly 1k tokens per chunk (500-1,500 by default) for better entity
 * extraction, preferring natural section and paragraph boundaries.
 */
export class DocumentChunker {
  private readonly TARGET_CHUNK_SIZE: number; // Kept small for better entity extraction
  private readonly MIN_CHUNK_SIZE: number;
  private readonly MAX_CHUNK_SIZE: number;
  private readonly MIN_PARAGRAPH_SIZE: number; // Minimum tokens for a paragraph to be considered

  constructor(
    targetChunkSize: number = 1000,
    minChunkSize: number = 500,
    maxChunkSize: number = 1500,
    minParagraphSize: number = 100,
  ) {
    this.TARGET_CHUNK_SIZE = targetChunkSize;
    this.MIN_CHUNK_SIZE = minChunkSize;
    this.MAX_CHUNK_SIZE = maxChunkSize;
    this.MIN_PARAGRAPH_SIZE = minParagraphSize;
  }

  /**
   * Chunk a document into semantic sections with natural boundaries
   */
  async chunkDocument(
    originalContent: string,
    title: string,
  ): Promise<ChunkedDocument> {
    const documentId = crypto.randomUUID();
    const contentHash = this.generateContentHash(originalContent);

    // First, split by major section headers (markdown style)
    const majorSections = this.splitByMajorSections(originalContent);

    const chunks: DocumentChunk[] = [];
    let currentChunk = "";
    let currentChunkStart = 0;
    let currentChunkTitle: string | undefined;
    let chunkIndex = 0;

    for (const section of majorSections) {
      const sectionTokens = encode(section.content).length;
      const currentChunkTokens = encode(currentChunk).length;

      // If adding this section would exceed max size, finalize current chunk
      if (
        currentChunkTokens > 0 &&
        currentChunkTokens + sectionTokens > this.MAX_CHUNK_SIZE
      ) {
        if (currentChunkTokens >= this.MIN_CHUNK_SIZE) {
          chunks.push(
            this.createChunk(
              currentChunk,
              chunkIndex,
              currentChunkStart,
              currentChunkStart + currentChunk.length,
              currentChunkTitle, // title of the content in this chunk, not the incoming section
            ),
          );
          chunkIndex++;
          currentChunk = "";
          currentChunkTitle = undefined;
        }
      }

      // Add section to current chunk
      if (currentChunk) {
        currentChunk += "\n\n" + section.content;
      } else {
        currentChunk = section.content;
        currentChunkStart = section.startPosition;
        currentChunkTitle = section.title;
      }

      // If current chunk is large enough and we have a natural break, consider chunking
      const updatedChunkTokens = encode(currentChunk).length;
      if (updatedChunkTokens >= this.TARGET_CHUNK_SIZE) {
        // Try to find a good breaking point within the section
        const paragraphs = this.splitIntoParagraphs(section.content);
        if (paragraphs.length > 1) {
          // Split at paragraph boundary if beneficial
          const optimalSplit = this.findOptimalParagraphSplit(currentChunk);
          if (optimalSplit) {
            chunks.push(
              this.createChunk(
                optimalSplit.beforeSplit,
                chunkIndex,
                currentChunkStart,
                currentChunkStart + optimalSplit.beforeSplit.length,
                currentChunkTitle,
              ),
            );
            chunkIndex++;
            currentChunk = optimalSplit.afterSplit;
            currentChunkStart += optimalSplit.beforeSplit.length;
            currentChunkTitle = section.title;
          }
        }
      }
    }

    // Add remaining content as final chunk
    if (
      currentChunk.trim() &&
      encode(currentChunk).length >= this.MIN_PARAGRAPH_SIZE
    ) {
      chunks.push(
        this.createChunk(
          currentChunk,
          chunkIndex,
          currentChunkStart,
          originalContent.length,
          currentChunkTitle,
        ),
      );
    }

    // Generate chunk hashes array
    const chunkHashes = chunks.map((chunk) => chunk.contentHash);

    return {
      documentId,
      title,
      originalContent,
      chunks,
      totalChunks: chunks.length,
      contentHash,
      chunkHashes,
    };
  }
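
  // Illustrative sketch of the result shape (hypothetical input; exact chunk
  // boundaries depend on gpt-tokenizer counts and header positions):
  //
  //   const chunker = new DocumentChunker();
  //   const { chunks, chunkHashes, totalChunks } = await chunker.chunkDocument(
  //     "## Setup\n...long text...\n\n## Usage\n...long text...",
  //     "README",
  //   );
  //   // chunks[0].context -> "Chunk 1: Setup" (when "Setup" is the detected title)
  //   // chunkHashes[i] === chunks[i].contentHash, totalChunks === chunks.length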

  private splitByMajorSections(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    // Detect headers from multiple formats
    const headerMatches = this.findAllHeaders(content);

    if (headerMatches.length === 0) {
      // No headers found, try to split by natural boundaries
      return this.splitByNaturalBoundaries(content);
    }

    let lastIndex = 0;
    for (let i = 0; i < headerMatches.length; i++) {
      const match = headerMatches[i];
      const nextMatch = headerMatches[i + 1];
      const sectionStart = lastIndex;
      const sectionEnd = nextMatch ? nextMatch.startIndex : content.length;
      const sectionContent = content.slice(sectionStart, sectionEnd).trim();

      if (sectionContent) {
        sections.push({
          content: sectionContent,
          title: match.title,
          startPosition: sectionStart,
          endPosition: sectionEnd,
        });
      }

      // Advance to the start of the next header so sections never overlap
      // (advancing only past the current header line would duplicate each
      // section body in the section that follows it)
      lastIndex = sectionEnd;
    }

    return sections;
  }

  private findAllHeaders(content: string): Array<{
    title: string;
    startIndex: number;
    endIndex: number;
    level: number;
  }> {
    const headers: Array<{
      title: string;
      startIndex: number;
      endIndex: number;
      level: number;
    }> = [];

    // Markdown headers (# ## ### etc.)
    const markdownRegex = /^(#{1,6})\s+(.+)$/gm;
    let match;
    while ((match = markdownRegex.exec(content)) !== null) {
      headers.push({
        title: match[2].trim(),
        startIndex: match.index,
        endIndex: match.index + match[0].length,
        level: match[1].length,
      });
    }

    // HTML headers (<h1>, <h2>, etc.); the backreference ensures the closing
    // tag matches the opening level
    const htmlRegex = /<h([1-6])[^>]*>(.*?)<\/h\1>/gi;
    while ((match = htmlRegex.exec(content)) !== null) {
      const textContent = match[2].replace(/<[^>]*>/g, "").trim();
      if (textContent) {
        headers.push({
          title: textContent,
          startIndex: match.index,
          endIndex: match.index + match[0].length,
          level: parseInt(match[1], 10),
        });
      }
    }

    // Underlined headers (Setext-style)
    const setextRegex = /^(.+)\n(={3,}|-{3,})$/gm;
    while ((match = setextRegex.exec(content)) !== null) {
      const level = match[2].startsWith("=") ? 1 : 2;
      headers.push({
        title: match[1].trim(),
        startIndex: match.index,
        endIndex: match.index + match[0].length,
        level,
      });
    }

    // Sort by position in document
    return headers.sort((a, b) => a.startIndex - b.startIndex);
  }
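
  // Example of the three header styles detected above (hypothetical input):
  //
  //   "# Markdown header"       -> { level: 1, title: "Markdown header" }
  //   "<h2>HTML header</h2>"    -> { level: 2, title: "HTML header" }
  //   "Setext header\n========" -> { level: 1, title: "Setext header" }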

  private splitByNaturalBoundaries(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    // Look for natural boundaries: double line breaks, HTML block elements, etc.
    const boundaryPatterns = [
      /\n\s*\n\s*\n/g, // Triple line breaks (strong boundary)
      /<\/(?:div|section|article|main|p)>\s*<(?:div|section|article|main|p)/gi, // HTML block boundaries
      /\n\s*[-=*]{4,}\s*\n/g, // Horizontal rules
    ];

    let boundaries: number[] = [0];
    for (const pattern of boundaryPatterns) {
      let match;
      while ((match = pattern.exec(content)) !== null) {
        boundaries.push(match.index);
      }
    }
    boundaries.push(content.length);
    boundaries = [...new Set(boundaries)].sort((a, b) => a - b);

    // If no natural boundaries found, split by token count
    if (boundaries.length <= 2) {
      return this.splitByTokenCount(content);
    }

    for (let i = 0; i < boundaries.length - 1; i++) {
      const start = boundaries[i];
      const end = boundaries[i + 1];
      const sectionContent = content.slice(start, end).trim();
      if (
        sectionContent &&
        encode(sectionContent).length >= this.MIN_PARAGRAPH_SIZE
      ) {
        sections.push({
          content: sectionContent,
          startPosition: start,
          endPosition: end,
        });
      }
    }

    return sections.length > 0 ? sections : this.splitByTokenCount(content);
  }
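
  // Sketch of the fallback above (hypothetical input): in plain text like
  //
  //   "Part one...\n\n\nPart two...\n-----\nPart three..."
  //
  // the triple line break and the horizontal rule each contribute a boundary,
  // yielding three candidate sections; any section below MIN_PARAGRAPH_SIZE
  // tokens is dropped rather than emitted.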

  private splitByTokenCount(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    const totalTokens = encode(content).length;
    const numSections = Math.ceil(totalTokens / this.TARGET_CHUNK_SIZE);
    const charsPerSection = Math.ceil(content.length / numSections);

    // Track the real cursor so that nudging a cut forward to a word boundary
    // never duplicates text at the start of the next section
    let cursor = 0;
    for (let i = 0; i < numSections && cursor < content.length; i++) {
      const start = cursor;
      const end = Math.min(start + charsPerSection, content.length);

      // Try to break at word boundaries
      let actualEnd = end;
      if (end < content.length) {
        const nextSpace = content.indexOf(" ", end);
        const nextNewline = content.indexOf("\n", end);
        const nextBoundary = Math.min(
          nextSpace === -1 ? Infinity : nextSpace,
          nextNewline === -1 ? Infinity : nextNewline,
        );
        if (nextBoundary !== Infinity && nextBoundary - end < 100) {
          actualEnd = nextBoundary;
        }
      }

      const sectionContent = content.slice(start, actualEnd).trim();
      if (sectionContent) {
        sections.push({
          content: sectionContent,
          startPosition: start,
          endPosition: actualEnd,
        });
      }
      cursor = actualEnd;
    }

    return sections;
  }
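
  // Worked example for the arithmetic above: a 4,500-token, 18,000-character
  // document with the default 1,000-token target gives
  //   numSections     = ceil(4500 / 1000) = 5
  //   charsPerSection = ceil(18000 / 5)   = 3600
  // and each cut is nudged forward (by up to 100 characters) to the next space
  // or newline, with the following section starting where the previous ended.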

  private splitIntoParagraphs(content: string): string[] {
    // Handle HTML paragraphs first
    if (
      content.includes("<p") ||
      content.includes("<div") ||
      content.includes("<section")
    ) {
      return this.splitHtmlParagraphs(content);
    }

    // Split by double newlines (paragraph breaks) for text/markdown
    return content
      .split(/\n\s*\n/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0);
  }

  private splitHtmlParagraphs(content: string): string[] {
    const paragraphs: string[] = [];

    // Split by HTML block elements; the \b keeps "p" from matching the start
    // of "pre" (and "li" the start of "link")
    const blockElements = [
      "p",
      "div",
      "section",
      "article",
      "li",
      "blockquote",
      "pre",
    ];
    const blockRegex = new RegExp(
      `<(${blockElements.join("|")})\\b[^>]*>.*?</\\1>`,
      "gis",
    );

    let lastIndex = 0;
    let match;
    while ((match = blockRegex.exec(content)) !== null) {
      // Add content before this block element
      if (match.index > lastIndex) {
        const beforeContent = content.slice(lastIndex, match.index).trim();
        if (beforeContent) {
          paragraphs.push(beforeContent);
        }
      }

      // Add the block element content (strip tags for text content)
      const blockContent = match[0].replace(/<[^>]*>/g, " ").trim();
      if (blockContent) {
        paragraphs.push(blockContent);
      }

      lastIndex = match.index + match[0].length;
    }

    // Add remaining content
    if (lastIndex < content.length) {
      const remainingContent = content.slice(lastIndex).trim();
      if (remainingContent) {
        // Clean up remaining HTML and split by newlines
        const cleaned = remainingContent.replace(/<[^>]*>/g, " ").trim();
        if (cleaned) {
          paragraphs.push(
            ...cleaned.split(/\n\s*\n/).filter((p) => p.trim().length > 0),
          );
        }
      }
    }

    return paragraphs.length > 0
      ? paragraphs
      : this.splitTextParagraphs(content);
  }

  private splitTextParagraphs(content: string): string[] {
    return content
      .split(/\n\s*\n/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0);
  }

  private findOptimalParagraphSplit(content: string): {
    beforeSplit: string;
    afterSplit: string;
  } | null {
    const paragraphs = this.splitIntoParagraphs(content);
    if (paragraphs.length < 2) return null;

    let bestSplitIndex = -1;
    let bestScore = 0;

    // Find the split that gets us closest to target size
    for (let i = 1; i < paragraphs.length; i++) {
      const beforeSplit = paragraphs.slice(0, i).join("\n\n");
      const afterSplit = paragraphs.slice(i).join("\n\n");
      const beforeTokens = encode(beforeSplit).length;
      const afterTokens = encode(afterSplit).length;

      // Score based on how close we get to target, avoiding too-small chunks
      if (
        beforeTokens >= this.MIN_CHUNK_SIZE &&
        afterTokens >= this.MIN_PARAGRAPH_SIZE
      ) {
        const beforeDistance = Math.abs(beforeTokens - this.TARGET_CHUNK_SIZE);
        const score = 1 / (1 + beforeDistance); // Higher score for closer to target
        if (score > bestScore) {
          bestScore = score;
          bestSplitIndex = i;
        }
      }
    }

    if (bestSplitIndex > 0) {
      return {
        beforeSplit: paragraphs.slice(0, bestSplitIndex).join("\n\n"),
        afterSplit: paragraphs.slice(bestSplitIndex).join("\n\n"),
      };
    }

    return null;
  }
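
  // Scoring sketch with the default 1,000-token target: a split leaving a
  // 900-token prefix scores 1 / (1 + |900 - 1000|) = 1/101 (about 0.0099),
  // while an exact 1,000-token prefix scores 1 / (1 + 0) = 1.0; the candidate
  // whose prefix lands closest to TARGET_CHUNK_SIZE wins.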

  private createChunk(
    content: string,
    chunkIndex: number,
    startPosition: number,
    endPosition: number,
    title?: string,
  ): DocumentChunk {
    // Generate a concise context/title if not provided
    const context = title || this.generateChunkContext(content);
    const contentHash = this.generateContentHash(content.trim());

    return {
      content: content.trim(),
      chunkIndex,
      title: context,
      context: `Chunk ${chunkIndex + 1}${context ? `: ${context}` : ""}`,
      startPosition,
      endPosition,
      contentHash,
    };
  }

  private generateChunkContext(content: string): string {
    // Clean content from HTML tags and markup
    const cleanContent = content
      .replace(/<[^>]*>/g, " ") // Remove HTML tags
      .replace(/#{1,6}\s+/g, "") // Remove markdown headers
      .replace(/[=-]{3,}/g, "") // Remove underline headers
      .replace(/\s+/g, " ") // Normalize whitespace
      .trim();

    if (!cleanContent) {
      return "Document content";
    }

    // Find first substantial sentence or line
    const sentences = cleanContent
      .split(/[.!?]+/)
      .map((s) => s.trim())
      .filter(Boolean);
    for (const sentence of sentences.slice(0, 2)) {
      if (sentence.length > 20) {
        return (
          sentence.substring(0, 100) + (sentence.length > 100 ? "..." : "")
        );
      }
    }

    // Fallback to first meaningful chunk
    const words = cleanContent.split(/\s+/).slice(0, 15).join(" ");
    return words.substring(0, 100) + (words.length > 100 ? "..." : "");
  }

  /**
   * Generate content hash for change detection
   */
  private generateContentHash(content: string): string {
    return crypto
      .createHash("sha256")
      .update(content, "utf8")
      .digest("hex")
      .substring(0, 16);
  }
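
  // The hash is the first 16 hex characters (64 bits) of a SHA-256 digest;
  // for example, hashing the empty string yields "e3b0c44298fc1c14". That is
  // ample for change detection, though not for adversarial collision resistance.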

  /**
   * Compare chunk hashes to detect changes
   */
  static compareChunkHashes(
    oldHashes: string[],
    newHashes: string[],
  ): {
    changedIndices: number[];
    changePercentage: number;
  } {
    const maxLength = Math.max(oldHashes.length, newHashes.length);
    const changedIndices: number[] = [];

    for (let i = 0; i < maxLength; i++) {
      const oldHash = oldHashes[i];
      const newHash = newHashes[i];
      // Mark as changed if hash is different or chunk added/removed
      if (oldHash !== newHash) {
        changedIndices.push(i);
      }
    }

    const changePercentage =
      maxLength > 0 ? (changedIndices.length / maxLength) * 100 : 0;

    return {
      changedIndices,
      changePercentage,
    };
  }
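
  // Worked example: compareChunkHashes(["a1", "b2", "c3"], ["a1", "x9", "c3", "d4"])
  // walks max(3, 4) = 4 indices; index 1 differs and index 3 exists only in
  // newHashes, so it returns { changedIndices: [1, 3], changePercentage: 50 }.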

  /**
   * Calculate document size in tokens for threshold decisions
   */
  static getDocumentSizeInTokens(content: string): number {
    return encode(content).length;
  }
}
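
// Usage sketch (illustrative only; `markdownSource` and `previousHashes` are
// hypothetical stand-ins for whatever ingestion pipeline calls this service):
//
//   const chunker = new DocumentChunker();
//   const chunked = await chunker.chunkDocument(markdownSource, "API Guide");
//
//   // On re-ingest, diff against the hashes persisted from the previous run
//   // to decide which chunks need re-processing:
//   const { changedIndices, changePercentage } =
//     DocumentChunker.compareChunkHashes(previousHashes, chunked.chunkHashes);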