mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-11 16:58:28 +00:00
204 lines
6.0 KiB
TypeScript
204 lines
6.0 KiB
TypeScript
import { encode } from "gpt-tokenizer";
|
|
import { DocumentChunker, type ChunkedDocument } from "./documentChunker.server";
|
|
import type { DocumentNode } from "@core/types";
|
|
|
|
export interface DifferentialDecision {
|
|
shouldUseDifferential: boolean;
|
|
strategy: "full_reingest" | "chunk_level_diff" | "new_document" | "skip_processing";
|
|
reason: string;
|
|
changedChunkIndices: number[];
|
|
changePercentage: number;
|
|
documentSizeTokens: number;
|
|
}
|
|
|
|
export interface ChunkComparison {
|
|
chunkIndex: number;
|
|
hasChanged: boolean;
|
|
oldHash?: string;
|
|
newHash: string;
|
|
semanticSimilarity?: number;
|
|
}
|
|
|
|
/**
|
|
* Service for implementing differential document processing logic
|
|
* Determines when to use differential vs full re-ingestion based on
|
|
* document size and change percentage thresholds
|
|
*/
|
|
export class DocumentDifferentialService {
|
|
// Threshold constants based on our enhanced approach
|
|
private readonly SMALL_DOC_THRESHOLD = 5 * 1000; // 5K tokens
|
|
private readonly MEDIUM_DOC_THRESHOLD = 50 * 1000; // 50K tokens
|
|
|
|
// Change percentage thresholds
|
|
private readonly SMALL_CHANGE_THRESHOLD = 20; // 20%
|
|
private readonly MEDIUM_CHANGE_THRESHOLD = 30; // 30%
|
|
|
|
/**
|
|
* Analyze whether to use differential processing for a document update
|
|
*/
|
|
async analyzeDifferentialNeed(
|
|
newContent: string,
|
|
existingDocument: DocumentNode | null,
|
|
newChunkedDocument: ChunkedDocument,
|
|
): Promise<DifferentialDecision> {
|
|
// If no existing document, it's a new document
|
|
if (!existingDocument) {
|
|
return {
|
|
shouldUseDifferential: false,
|
|
strategy: "new_document",
|
|
reason: "No existing document found",
|
|
changedChunkIndices: [],
|
|
changePercentage: 100,
|
|
documentSizeTokens: encode(newContent).length,
|
|
};
|
|
}
|
|
|
|
const documentSizeTokens = encode(newContent).length;
|
|
|
|
// Quick content hash comparison
|
|
if (existingDocument.contentHash === newChunkedDocument.contentHash) {
|
|
return {
|
|
shouldUseDifferential: false,
|
|
strategy: "skip_processing", // No changes detected
|
|
reason: "Document content unchanged",
|
|
changedChunkIndices: [],
|
|
changePercentage: 0,
|
|
documentSizeTokens,
|
|
};
|
|
}
|
|
|
|
// Compare chunk hashes to identify changes
|
|
const chunkComparison = DocumentChunker.compareChunkHashes(
|
|
existingDocument.chunkHashes || [],
|
|
newChunkedDocument.chunkHashes,
|
|
);
|
|
|
|
const { changedIndices, changePercentage } = chunkComparison;
|
|
|
|
// Apply threshold-based decision matrix
|
|
const decision = this.applyThresholdDecision(
|
|
documentSizeTokens,
|
|
changePercentage,
|
|
changedIndices,
|
|
);
|
|
|
|
return {
|
|
...decision,
|
|
changedChunkIndices: changedIndices,
|
|
changePercentage,
|
|
documentSizeTokens,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Apply threshold-based decision matrix
|
|
*/
|
|
private applyThresholdDecision(
|
|
documentSizeTokens: number,
|
|
changePercentage: number,
|
|
changedIndices: number[],
|
|
): Pick<DifferentialDecision, "shouldUseDifferential" | "strategy" | "reason"> {
|
|
// Small documents: always full re-ingest (cheap)
|
|
if (documentSizeTokens < this.SMALL_DOC_THRESHOLD) {
|
|
return {
|
|
shouldUseDifferential: false,
|
|
strategy: "full_reingest",
|
|
reason: `Document too small (${documentSizeTokens} tokens < ${this.SMALL_DOC_THRESHOLD})`,
|
|
};
|
|
}
|
|
|
|
// Medium documents (5-50K tokens)
|
|
if (documentSizeTokens < this.MEDIUM_DOC_THRESHOLD) {
|
|
if (changePercentage < this.SMALL_CHANGE_THRESHOLD) {
|
|
return {
|
|
shouldUseDifferential: true,
|
|
strategy: "chunk_level_diff",
|
|
reason: `Medium document with small changes (${changePercentage.toFixed(1)}% < ${this.SMALL_CHANGE_THRESHOLD}%)`,
|
|
};
|
|
} else {
|
|
return {
|
|
shouldUseDifferential: false,
|
|
strategy: "full_reingest",
|
|
reason: `Medium document with large changes (${changePercentage.toFixed(1)}% >= ${this.SMALL_CHANGE_THRESHOLD}%)`,
|
|
};
|
|
}
|
|
}
|
|
|
|
// Large documents (>50K tokens)
|
|
if (changePercentage < this.MEDIUM_CHANGE_THRESHOLD) {
|
|
return {
|
|
shouldUseDifferential: true,
|
|
strategy: "chunk_level_diff",
|
|
reason: `Large document with moderate changes (${changePercentage.toFixed(1)}% < ${this.MEDIUM_CHANGE_THRESHOLD}%)`,
|
|
};
|
|
} else {
|
|
return {
|
|
shouldUseDifferential: false,
|
|
strategy: "full_reingest",
|
|
reason: `Large document with extensive changes (${changePercentage.toFixed(1)}% >= ${this.MEDIUM_CHANGE_THRESHOLD}%)`,
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get detailed chunk comparison for differential processing
|
|
*/
|
|
getChunkComparisons(
|
|
existingDocument: DocumentNode,
|
|
newChunkedDocument: ChunkedDocument,
|
|
): ChunkComparison[] {
|
|
const oldHashes = existingDocument.chunkHashes || [];
|
|
const newHashes = newChunkedDocument.chunkHashes;
|
|
const maxLength = Math.max(oldHashes.length, newHashes.length);
|
|
|
|
const comparisons: ChunkComparison[] = [];
|
|
|
|
for (let i = 0; i < maxLength; i++) {
|
|
const oldHash = oldHashes[i];
|
|
const newHash = newHashes[i];
|
|
|
|
comparisons.push({
|
|
chunkIndex: i,
|
|
hasChanged: oldHash !== newHash,
|
|
oldHash,
|
|
newHash: newHash || "", // Handle case where new doc has fewer chunks
|
|
});
|
|
}
|
|
|
|
return comparisons;
|
|
}
|
|
|
|
/**
|
|
* Filter chunks that need re-processing
|
|
*/
|
|
getChunksNeedingReprocessing(
|
|
chunkComparisons: ChunkComparison[],
|
|
): number[] {
|
|
return chunkComparisons
|
|
.filter(comparison => comparison.hasChanged)
|
|
.map(comparison => comparison.chunkIndex);
|
|
}
|
|
|
|
/**
|
|
* Calculate processing cost savings estimate
|
|
*/
|
|
calculateCostSavings(
|
|
totalChunks: number,
|
|
changedChunks: number,
|
|
): {
|
|
chunksToProcess: number;
|
|
chunksSkipped: number;
|
|
estimatedSavingsPercentage: number;
|
|
} {
|
|
const chunksSkipped = totalChunks - changedChunks;
|
|
const estimatedSavingsPercentage = totalChunks > 0
|
|
? (chunksSkipped / totalChunks) * 100
|
|
: 0;
|
|
|
|
return {
|
|
chunksToProcess: changedChunks,
|
|
chunksSkipped,
|
|
estimatedSavingsPercentage,
|
|
};
|
|
}
|
|
} |