Fix: episode normalization with high complexity model (#80)

This commit is contained in:
Manoj 2025-10-02 08:48:56 +05:30 committed by GitHub
parent 46407b0fac
commit 92ca34a02f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 35 additions and 48 deletions

View File

@ -1,5 +1,5 @@
import { runQuery } from "~/lib/neo4j.server"; import { runQuery } from "~/lib/neo4j.server";
import { type EntityNode, type EpisodicNode } from "@core/types"; import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
export async function saveEpisode(episode: EpisodicNode): Promise<string> { export async function saveEpisode(episode: EpisodicNode): Promise<string> {
const query = ` const query = `
@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
export async function getEpisodeStatements(params: { export async function getEpisodeStatements(params: {
episodeUuid: string; episodeUuid: string;
userId: string; userId: string;
}) { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement) MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
WHERE stmt.invalidAt IS NULL WHERE stmt.invalidAt IS NULL
@ -326,7 +326,6 @@ export async function getEpisodeStatements(params: {
return { return {
uuid: stmt.uuid, uuid: stmt.uuid,
fact: stmt.fact, fact: stmt.fact,
factEmbedding: stmt.factEmbedding,
createdAt: new Date(stmt.createdAt), createdAt: new Date(stmt.createdAt),
validAt: new Date(stmt.validAt), validAt: new Date(stmt.validAt),
invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null, invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,

View File

@ -111,7 +111,7 @@ export async function findContradictoryStatements({
subjectId: string; subjectId: string;
predicateId: string; predicateId: string;
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId}) MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate) MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
@ -131,7 +131,6 @@ export async function findContradictoryStatements({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
objectId: string; objectId: string;
excludePredicateId?: string; excludePredicateId?: string;
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId}) MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object) MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
@ -186,7 +185,6 @@ export async function findStatementsWithSameSubjectObject({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@ -212,7 +210,7 @@ export async function findSimilarStatements({
threshold?: number; threshold?: number;
excludeIds?: string[]; excludeIds?: string[];
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding) CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
YIELD node AS statement, score YIELD node AS statement, score
@ -242,7 +240,6 @@ export async function findSimilarStatements({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,

View File

@ -418,10 +418,15 @@ export class KnowledgeGraphService {
const processingTimeMs = endTime - startTime; const processingTimeMs = endTime - startTime;
logger.log(`Processing time: ${processingTimeMs} ms`); logger.log(`Processing time: ${processingTimeMs} ms`);
// Count only truly new statements (exclude duplicates)
const newStatementsCount = updatedTriples.filter(triple =>
triple.statement.createdAt >= episode.createdAt
).length;
return { return {
episodeUuid: episode.uuid, episodeUuid: episode.uuid,
// nodesCreated: hydratedNodes.length, // nodesCreated: hydratedNodes.length,
statementsCreated: resolvedStatements.length, statementsCreated: newStatementsCount,
processingTimeMs, processingTimeMs,
tokenUsage: tokenMetrics, tokenUsage: tokenMetrics,
}; };
@ -529,6 +534,7 @@ export class KnowledgeGraphService {
referenceTime: episode.validAt.toISOString(), referenceTime: episode.validAt.toISOString(),
}; };
console.log("proprietary model", isProprietaryModel(undefined, 'high'));
// Statement extraction requires HIGH complexity (causal reasoning, emotional context) // Statement extraction requires HIGH complexity (causal reasoning, emotional context)
// Choose between proprietary and OSS prompts based on model type // Choose between proprietary and OSS prompts based on model type
const messages = isProprietaryModel(undefined, 'high') const messages = isProprietaryModel(undefined, 'high')
@ -905,7 +911,7 @@ export class KnowledgeGraphService {
} }
// Step 1: Collect all potential matches for all triples at once // Step 1: Collect all potential matches for all triples at once
const allPotentialMatches: Map<string, StatementNode[]> = new Map(); const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
const allExistingTripleData: Map<string, Triple> = new Map(); const allExistingTripleData: Map<string, Triple> = new Map();
// For preparing the LLM context // For preparing the LLM context
@ -915,7 +921,7 @@ export class KnowledgeGraphService {
for (const triple of triples) { for (const triple of triples) {
// Track IDs of statements we've already checked to avoid duplicates // Track IDs of statements we've already checked to avoid duplicates
const checkedStatementIds: string[] = []; const checkedStatementIds: string[] = [];
let potentialMatches: StatementNode[] = []; let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
// Phase 1a: Find statements with exact subject-predicate match // Phase 1a: Find statements with exact subject-predicate match
// Example: "John lives_in New York" vs "John lives_in San Francisco" // Example: "John lives_in New York" vs "John lives_in San Francisco"
@ -965,7 +971,7 @@ export class KnowledgeGraphService {
} }
// Phase 3: Check related memories for contradictory statements // Phase 3: Check related memories for contradictory statements
const previousEpisodesStatements: StatementNode[] = []; const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
await Promise.all( await Promise.all(
previousEpisodes.map(async (episode) => { previousEpisodes.map(async (episode) => {
@ -1264,7 +1270,7 @@ export class KnowledgeGraphService {
tokenMetrics.low.output += usage.completionTokens; tokenMetrics.low.output += usage.completionTokens;
tokenMetrics.low.total += usage.totalTokens; tokenMetrics.low.total += usage.totalTokens;
} }
}, undefined, 'low'); }, undefined, 'high');
let normalizedEpisodeBody = ""; let normalizedEpisodeBody = "";
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/); const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) { if (outputMatch && outputMatch[1]) {

View File

@ -7,42 +7,25 @@ export const normalizePrompt = (
Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment. Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
<smart_enrichment_process> CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
Evaluate the episode and apply enrichment ONLY where it adds significant value:
1. PRIMARY FACTS - always preserve the core information from the episode <enrichment_strategy>
2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp 1. PRIMARY FACTS - Always preserve the core information from the episode
3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below) 2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images 3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges 4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections 5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
ENRICHMENT DECISION MATRIX: When to add context from related memories:
- Clear, complete statement → minimal enrichment (just temporal + attribution) - Unclear pronouns ("she", "it", "they") → resolve to specific entity
- Unclear references → resolve with context - Vague references ("the agency", "the event") → add clarifying details
- Emotional support → preserve feeling, avoid historical dumping - Continuation phrases ("following up", "as we discussed") → connect to previous topic
- New developments → connect to ongoing narrative
- Visual content → extract specific details as primary facts
</smart_enrichment_process>
<context_usage_decision> When NOT to add context:
When related memories/previous episodes are provided, evaluate if they improve understanding: - Clear, self-contained statements → no enrichment needed beyond temporal
- Emotional responses → preserve tone, avoid over-contextualization
USE CONTEXT when current episode has: - Already established topics → don't repeat details mentioned earlier in conversation
- Unclear pronouns ("she", "it", "they" without clear antecedent) </enrichment_strategy>
- Vague references ("the agency", "the event" without definition in current episode)
- Continuation phrases ("following up", "as we discussed")
- Incomplete information that context clarifies
IGNORE CONTEXT when current episode is:
- Clear and self-contained ("I got a job in New York")
- Simple emotional responses ("Thanks, that's great!")
- Generic encouragement ("You're doing awesome!")
- Complete statements with all necessary information
DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or
resolves ambiguity.
</context_usage_decision>
<temporal_resolution> <temporal_resolution>
Using episode timestamp as anchor, convert ALL relative time references: Using episode timestamp as anchor, convert ALL relative time references:
@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
Transform this document content into enriched factual statements for knowledge graph storage. Transform this document content into enriched factual statements for knowledge graph storage.
CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
<document_processing_approach> <document_processing_approach>
Focus on STRUCTURED CONTENT EXTRACTION optimized for documents: Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:

View File

@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
const documentIngestionQueue = queue({ const documentIngestionQueue = queue({
name: "document-ingestion-queue", name: "document-ingestion-queue",
concurrencyLimit: 5, concurrencyLimit: 1,
}); });
// Register the Document Ingestion Trigger.dev task // Register the Document Ingestion Trigger.dev task