import { openai } from "@ai-sdk/openai";
import { type CoreMessage, embed } from "ai";
import {
  entityTypes,
  EpisodeType,
  LLMModelEnum,
  type AddEpisodeParams,
  type EntityNode,
  type EpisodicNode,
  type StatementNode,
  type Triple,
} from "@core/types";
import { logger } from "./logger.service";
import crypto from "crypto";
import { dedupeNodes, extractMessage, extractText } from "./prompts/nodes";
import {
  extractStatements,
  resolveStatementPrompt,
} from "./prompts/statements";
import { getRecentEpisodes } from "./graphModels/episode";
import { findSimilarEntities } from "./graphModels/entity";
import {
  findContradictoryStatements,
  findSimilarStatements,
  getTripleForStatement,
  invalidateStatements,
  saveTriple,
} from "./graphModels/statement";
import { makeModelCall } from "~/lib/model.server";

// Default number of previous episodes to retrieve for context
const DEFAULT_EPISODE_WINDOW = 5;
const RELEVANT_SCHEMA_LIMIT = 10;

export class KnowledgeGraphService {
  /**
   * Generate an embedding for a piece of text using OpenAI's
   * text-embedding-3-small model.
   */
  async getEmbedding(text: string) {
    const { embedding } = await embed({
      model: openai.embedding("text-embedding-3-small"),
      value: text,
    });

    return embedding;
  }

  /**
   * Process an episode and update the knowledge graph.
   *
   * This method extracts information from the episode, creates nodes and statements,
   * and updates the HelixDB database according to the reified + temporal approach.
   */
  async addEpisode(params: AddEpisodeParams) {
    const startTime = Date.now();
    const now = new Date();

    try {
      // Step 1: Context Retrieval - Get previous episodes for context
      const previousEpisodes = await getRecentEpisodes({
        referenceTime: params.referenceTime,
        limit: DEFAULT_EPISODE_WINDOW,
        userId: params.userId,
        source: params.source,
      });

      // Step 2: Episode Creation - Create or retrieve the episode
      const episode: EpisodicNode = {
        uuid: crypto.randomUUID(),
        content: params.episodeBody,
        source: params.source,
        type: params.type || EpisodeType.Text,
        createdAt: now,
        validAt: new Date(params.referenceTime),
        labels: [],
        userId: params.userId,
        space: params.spaceId,
        sessionId: params.sessionId,
      };

      // Step 3: Entity Extraction - Extract entities from the episode content
      const extractedNodes = await this.extractEntities(
        episode,
        previousEpisodes,
      );

      // Step 4: Statement Extraction - Extract statements (triples) instead of direct edges
      const extractedStatements = await this.extractStatements(
        episode,
        extractedNodes,
        previousEpisodes,
      );

      // Step 5: Entity Resolution - Resolve extracted nodes to existing nodes or create new ones
      const resolvedTriples = await this.resolveExtractedNodes(
        extractedStatements,
        episode,
        previousEpisodes,
      );

      // Step 6: Statement Resolution - Resolve statements and detect contradictions
      const { resolvedStatements, invalidatedStatements } =
        await this.resolveStatements(resolvedTriples, episode);

      // Save triples sequentially to avoid parallel processing issues
      for (const triple of resolvedStatements) {
        await saveTriple(triple);
      }

      // Invalidate contradicted statements
      await invalidateStatements({ statementIds: invalidatedStatements });

      const endTime = Date.now();
      const processingTimeMs = endTime - startTime;
      logger.log(`Processing time: ${processingTimeMs} ms`);

      return {
        episodeUuid: episode.uuid,
        // nodesCreated: hydratedNodes.length,
        statementsCreated: resolvedStatements.length,
        processingTimeMs,
      };
    } catch (error) {
      logger.error("Error in addEpisode:", { error });
      throw error;
    }
  }
  /**
   * Extract entities from an episode using LLM
   */
  private async extractEntities(
    episode: EpisodicNode,
    previousEpisodes: EpisodicNode[],
  ): Promise<EntityNode[]> {
    // Use the prompt library to get the appropriate prompts
    const context = {
      episodeContent: episode.content,
      previousEpisodes: previousEpisodes.map((ep) => ({
        content: ep.content,
        createdAt: ep.createdAt.toISOString(),
      })),
      entityTypes: entityTypes,
    };

    // Choose the conversation or text extraction prompt from the prompt library
    const messages =
      episode.type === EpisodeType.Conversation
        ? extractMessage(context)
        : extractText(context);

    let responseText = "";
    await makeModelCall(
      false,
      LLMModelEnum.GPT41,
      messages as CoreMessage[],
      (text) => {
        responseText = text;
      },
    );

    // Convert to EntityNode objects
    const entities: EntityNode[] = [];
    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
    if (outputMatch && outputMatch[1]) {
      responseText = outputMatch[1].trim();
      const extractedEntities = JSON.parse(responseText || "{}").entities || [];
      entities.push(
        ...(await Promise.all(
          extractedEntities.map(async (entity: any) => ({
            uuid: crypto.randomUUID(),
            name: entity.name,
            type: entity.type,
            attributes: entity.attributes || {},
            nameEmbedding: await this.getEmbedding(entity.name),
            createdAt: new Date(),
            userId: episode.userId,
          })),
        )),
      );
    }

    return entities;
  }
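  // The extraction prompt is expected to return JSON wrapped in <output> tags.
  // Assumed shape (illustrative only; the actual contract lives in
  // prompts/nodes):
  //
  //   <output>
  //   {
  //     "entities": [
  //       { "name": "Alice", "type": "Person", "attributes": { "role": "engineer" } }
  //     ]
  //   }
  //   </output>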
  /**
   * Extract statements as first-class objects from an episode using LLM.
   * This replaces the previous extractEdges method with a reified approach.
   */
  private async extractStatements(
    episode: EpisodicNode,
    extractedEntities: EntityNode[],
    previousEpisodes: EpisodicNode[],
  ): Promise<Triple[]> {
    // Use the prompt library to get the appropriate prompts
    const context = {
      episodeContent: episode.content,
      previousEpisodes: previousEpisodes.map((ep) => ({
        content: ep.content,
        createdAt: ep.createdAt.toISOString(),
      })),
      entities: extractedEntities.map((node) => ({
        name: node.name,
        type: node.type,
      })),
      referenceTime: episode.validAt.toISOString(),
    };

    // Get the statement extraction prompt from the prompt library
    const messages = extractStatements(context);

    let responseText = "";
    await makeModelCall(
      false,
      LLMModelEnum.GPT41,
      messages as CoreMessage[],
      (text) => {
        responseText = text;
      },
    );

    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
    if (outputMatch && outputMatch[1]) {
      responseText = outputMatch[1].trim();
    } else {
      responseText = "{}";
    }

    // Parse the statements from the LLM response
    const extractedTriples = JSON.parse(responseText || "{}").edges || [];

    // Convert extracted triples to Triple objects with Statement nodes
    const triples = await Promise.all(
      // TODO: replace 'any' with a typed shape for extracted triples
      extractedTriples.map(async (triple: any) => {
        // Find the subject and object nodes
        const subjectNode = extractedEntities.find(
          (node) => node.name.toLowerCase() === triple.source.toLowerCase(),
        );
        const objectNode = extractedEntities.find(
          (node) => node.name.toLowerCase() === triple.target.toLowerCase(),
        );

        // Find or create a predicate node for the relationship type
        const predicateNode = extractedEntities.find(
          (node) => node.name.toLowerCase() === triple.predicate.toLowerCase(),
        ) || {
          uuid: crypto.randomUUID(),
          name: triple.predicate,
          type: "Predicate",
          attributes: {},
          nameEmbedding: await this.getEmbedding(triple.predicate),
          createdAt: new Date(),
          userId: episode.userId,
        };

        if (subjectNode && objectNode) {
          // Create a statement node
          const statement: StatementNode = {
            uuid: crypto.randomUUID(),
            fact: triple.fact,
            factEmbedding: await this.getEmbedding(triple.fact),
            createdAt: new Date(),
            validAt: episode.validAt,
            invalidAt: null,
            attributes: triple.attributes || {},
            userId: episode.userId,
          };

          return {
            statement,
            subject: subjectNode,
            predicate: predicateNode,
            object: objectNode,
            provenance: episode,
          };
        }

        return null;
      }),
    );

    // Filter out null values (where subject or object wasn't found)
    return triples.filter(Boolean) as Triple[];
  }
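  // The statement prompt is expected to return its triples under an "edges"
  // key, wrapped in <output> tags. Assumed shape (illustrative only; inferred
  // from the fields read above):
  //
  //   <output>
  //   {
  //     "edges": [
  //       {
  //         "source": "Alice",
  //         "predicate": "works_at",
  //         "target": "Acme",
  //         "fact": "Alice works at Acme",
  //         "attributes": {}
  //       }
  //     ]
  //   }
  //   </output>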
  /**
   * Resolve extracted nodes to existing nodes or create new ones
   */
  private async resolveExtractedNodes(
    triples: Triple[],
    episode: EpisodicNode,
    previousEpisodes: EpisodicNode[],
  ): Promise<Triple[]> {
    // Step 1: Extract unique entities from triples
    const uniqueEntitiesMap = new Map<string, EntityNode>();
    const entityIdToPositions = new Map<
      string,
      Array<{
        tripleIndex: number;
        position: "subject" | "predicate" | "object";
      }>
    >();

    // First pass: collect all unique entities and their positions in triples
    triples.forEach((triple, tripleIndex) => {
      // Process subject
      if (!uniqueEntitiesMap.has(triple.subject.uuid)) {
        uniqueEntitiesMap.set(triple.subject.uuid, triple.subject);
      }
      if (!entityIdToPositions.has(triple.subject.uuid)) {
        entityIdToPositions.set(triple.subject.uuid, []);
      }
      entityIdToPositions.get(triple.subject.uuid)!.push({
        tripleIndex,
        position: "subject",
      });

      // Process predicate
      if (!uniqueEntitiesMap.has(triple.predicate.uuid)) {
        uniqueEntitiesMap.set(triple.predicate.uuid, triple.predicate);
      }
      if (!entityIdToPositions.has(triple.predicate.uuid)) {
        entityIdToPositions.set(triple.predicate.uuid, []);
      }
      entityIdToPositions.get(triple.predicate.uuid)!.push({
        tripleIndex,
        position: "predicate",
      });

      // Process object
      if (!uniqueEntitiesMap.has(triple.object.uuid)) {
        uniqueEntitiesMap.set(triple.object.uuid, triple.object);
      }
      if (!entityIdToPositions.has(triple.object.uuid)) {
        entityIdToPositions.set(triple.object.uuid, []);
      }
      entityIdToPositions.get(triple.object.uuid)!.push({
        tripleIndex,
        position: "object",
      });
    });

    // Convert to arrays for processing
    const uniqueEntities = Array.from(uniqueEntitiesMap.values());

    // Step 2: Find similar entities for each unique entity
    const similarEntitiesResults = await Promise.all(
      uniqueEntities.map(async (entity) => {
        const similarEntities = await findSimilarEntities({
          queryEmbedding: entity.nameEmbedding,
          limit: 5,
          threshold: 0.85,
        });

        return {
          entity,
          similarEntities,
        };
      }),
    );

    // If there were no entities to resolve, return the original triples
    if (similarEntitiesResults.length === 0) {
      return triples;
    }

    // Step 3: Prepare context for LLM deduplication
    const dedupeContext = {
      extracted_nodes: similarEntitiesResults.map((result, index) => ({
        id: index,
        name: result.entity.name,
        entity_type: result.entity.type,
        duplication_candidates: result.similarEntities.map((candidate, j) => ({
          idx: j,
          name: candidate.name,
          entity_types: candidate.type,
        })),
      })),
      episode_content: episode ? episode.content : "",
      previous_episodes: previousEpisodes
        ? previousEpisodes.map((ep) => ep.content)
        : [],
    };

    // Step 4: Call LLM to resolve duplicates
    const messages = dedupeNodes(dedupeContext);
    let responseText = "";
    await makeModelCall(
      false,
      LLMModelEnum.GPT41,
      messages as CoreMessage[],
      (text) => {
        responseText = text;
      },
    );

    // Step 5: Process LLM response
    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
    if (!outputMatch || !outputMatch[1]) {
      return triples; // Return original triples if parsing fails
    }

    try {
      responseText = outputMatch[1].trim();
      const parsedResponse = JSON.parse(responseText);
      const nodeResolutions = parsedResponse.entity_resolutions || [];

      // Step 6: Create mapping from original entity UUID to resolved entity
      const entityResolutionMap = new Map<string, EntityNode>();
      nodeResolutions.forEach((resolution: any, index: number) => {
        const originalEntity = uniqueEntities[resolution.id ?? index];
        if (!originalEntity) return;

        const duplicateIdx = resolution.duplicate_idx ?? -1;

        // Get the corresponding result from similarEntitiesResults
        const resultEntry = similarEntitiesResults.find(
          (result) => result.entity.uuid === originalEntity.uuid,
        );
        if (!resultEntry) return;

        // If a duplicate was found, use that entity; otherwise keep the original
        const resolvedEntity =
          duplicateIdx >= 0 && duplicateIdx < resultEntry.similarEntities.length
            ? resultEntry.similarEntities[duplicateIdx]
            : originalEntity;

        // Update the name if the LLM provided one
        if (resolution.name) {
          resolvedEntity.name = resolution.name;
        }

        // Map original UUID to resolved entity
        entityResolutionMap.set(originalEntity.uuid, resolvedEntity);
      });

      // Step 7: Reconstruct triples with resolved entities
      const resolvedTriples = triples.map((triple) => {
        const newTriple = { ...triple };

        // Replace subject if resolved
        if (entityResolutionMap.has(triple.subject.uuid)) {
          newTriple.subject = entityResolutionMap.get(triple.subject.uuid)!;
        }

        // Replace predicate if resolved
        if (entityResolutionMap.has(triple.predicate.uuid)) {
          newTriple.predicate = entityResolutionMap.get(triple.predicate.uuid)!;
        }

        // Replace object if resolved
        if (entityResolutionMap.has(triple.object.uuid)) {
          newTriple.object = entityResolutionMap.get(triple.object.uuid)!;
        }

        return newTriple;
      });

      return resolvedTriples;
    } catch (error) {
      logger.error("Error processing entity resolutions:", { error });
      return triples; // Return original triples on error
    }
  }
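  // The dedupe prompt is expected to answer with one resolution per extracted
  // node, wrapped in <output> tags. Assumed shape (illustrative only;
  // duplicate_idx is -1 when no candidate matches):
  //
  //   <output>
  //   {
  //     "entity_resolutions": [
  //       { "id": 0, "duplicate_idx": 2, "name": "Acme Corp" },
  //       { "id": 1, "duplicate_idx": -1 }
  //     ]
  //   }
  //   </output>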
  /**
   * Resolve statements by checking for existing statements and handling contradictions.
   * This replaces the previous resolveExtractedEdges method with a reified approach.
   */
  private async resolveStatements(
    triples: Triple[],
    episode: EpisodicNode,
  ): Promise<{
    resolvedStatements: Triple[];
    invalidatedStatements: string[];
  }> {
    const resolvedStatements: Triple[] = [];
    const invalidatedStatements: string[] = [];

    if (triples.length === 0) {
      return { resolvedStatements, invalidatedStatements };
    }

    // Step 1: Collect all potential matches for all triples at once
    const allPotentialMatches: Map<string, StatementNode[]> = new Map();
    const allExistingTripleData: Map<string, Triple> = new Map();

    // For preparing the LLM context
    const newStatements: any[] = [];
    const similarStatements: any[] = [];

    for (const triple of triples) {
      // Track IDs of statements we've already checked to avoid duplicates
      const checkedStatementIds: string[] = [];
      const potentialMatches: StatementNode[] = [];

      // Phase 1: Find statements with an exact subject-predicate match
      const exactMatches = await findContradictoryStatements({
        subjectId: triple.subject.uuid,
        predicateId: triple.predicate.uuid,
      });
      if (exactMatches && exactMatches.length > 0) {
        potentialMatches.push(...exactMatches);
        checkedStatementIds.push(...exactMatches.map((s) => s.uuid));
      }

      // Phase 2: Find semantically similar statements
      const semanticMatches = await findSimilarStatements({
        factEmbedding: triple.statement.factEmbedding,
        threshold: 0.85,
        excludeIds: checkedStatementIds,
      });
      if (semanticMatches && semanticMatches.length > 0) {
        potentialMatches.push(...semanticMatches);
      }

      if (potentialMatches.length > 0) {
        logger.info(
          `Found ${potentialMatches.length} potential matches for: ${triple.statement.fact}`,
        );
        allPotentialMatches.set(triple.statement.uuid, potentialMatches);

        // Get full triple information for each potential match
        for (const match of potentialMatches) {
          if (!allExistingTripleData.has(match.uuid)) {
            const existingTripleData = await getTripleForStatement({
              statementId: match.uuid,
            });
            if (existingTripleData) {
              allExistingTripleData.set(match.uuid, existingTripleData);

              // Add to similarStatements for LLM context
              similarStatements.push({
                statementId: match.uuid,
                fact: existingTripleData.statement.fact,
                subject: existingTripleData.subject.name,
                predicate: existingTripleData.predicate.name,
                object: existingTripleData.object.name,
              });
            }
          }
        }
      }

      // Add to newStatements for LLM context
      newStatements.push({
        statement: {
          uuid: triple.statement.uuid,
          fact: triple.statement.fact,
        },
        subject: triple.subject.name,
        predicate: triple.predicate.name,
        object: triple.object.name,
      });
    }

    // Step 2: If we have potential matches, use the LLM to analyze them in batch
    if (similarStatements.length > 0) {
      // Prepare context for the LLM
      const promptContext = {
        newStatements,
        similarStatements,
        episodeContent: episode.content,
        referenceTime: episode.validAt.toISOString(),
      };

      // Get the statement resolution prompt
      const messages = resolveStatementPrompt(promptContext);
      let responseText = "";

      // Call the LLM to analyze all statements at once
      await makeModelCall(false, LLMModelEnum.GPT41, messages, (text) => {
        responseText = text;
      });

      try {
        // Extract the JSON response from the output tags
        const jsonMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
        const analysisResult = jsonMatch ? JSON.parse(jsonMatch[1]) : [];
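        // Assumed shape of each analysis entry (illustrative only; the real
        // contract is defined by resolveStatementPrompt):
        //
        //   {
        //     "statementId": "<uuid of the new statement>",
        //     "isDuplicate": false,
        //     "duplicateId": "<uuid of an existing statement, when isDuplicate>",
        //     "contradictions": ["<uuids of contradicted statements>"]
        //   }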
        // Process the analysis results
        for (const result of analysisResult) {
          const tripleIndex = triples.findIndex(
            (t) => t.statement.uuid === result.statementId,
          );
          if (tripleIndex === -1) continue;

          const triple = triples[tripleIndex];

          // Handle duplicates
          if (result.isDuplicate && result.duplicateId) {
            const duplicateTriple = allExistingTripleData.get(
              result.duplicateId,
            );
            if (duplicateTriple) {
              logger.info(`Statement is a duplicate: ${triple.statement.fact}`);
              resolvedStatements.push(duplicateTriple);
              continue;
            }
          }

          // Handle contradictions
          if (result.contradictions && result.contradictions.length > 0) {
            for (const contradictionId of result.contradictions) {
              const contradictedTriple =
                allExistingTripleData.get(contradictionId);
              if (contradictedTriple) {
                invalidatedStatements.push(contradictedTriple.statement.uuid);
              }
            }
          }

          // Add the new statement if it's not a duplicate
          if (!result.isDuplicate) {
            logger.info(`Adding new statement: ${triple.statement.fact}`);
            resolvedStatements.push(triple);
          }
        }
      } catch (e) {
        logger.error("Error processing batch analysis:", { error: e });

        // Fallback: add all statements as new if we couldn't process the analysis
        for (const triple of triples) {
          if (
            !resolvedStatements.some(
              (s) => s.statement.uuid === triple.statement.uuid,
            )
          ) {
            logger.info(
              `Fallback: Adding statement as new: ${triple.statement.fact}`,
            );
            resolvedStatements.push(triple);
          }
        }
      }
    } else {
      // No potential matches found for any statement, so add them all as new
      for (const triple of triples) {
        logger.info(
          `No matches found, adding as new: ${triple.statement.fact}`,
        );
        resolvedStatements.push(triple);
      }
    }

    return { resolvedStatements, invalidatedStatements };
  }
}
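// Minimal usage sketch (illustrative only; parameter values and types are
// assumptions, not taken from the codebase):
//
//   const service = new KnowledgeGraphService();
//   const result = await service.addEpisode({
//     episodeBody: "Alice moved to Berlin in May.",
//     referenceTime: new Date().toISOString(),
//     source: "chat",
//     type: EpisodeType.Conversation,
//     userId: "user-123",
//     spaceId: "space-1",
//     sessionId: "session-1",
//   });
//   // result: { episodeUuid, statementsCreated, processingTimeMs }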