From 92ca34a02fb8dba60608e10fbadf6741cac7da51 Mon Sep 17 00:00:00 2001
From: Manoj
Date: Thu, 2 Oct 2025 08:48:56 +0530
Subject: [PATCH] Fix: episode normalization with high complexity model (#80)

---
 .../app/services/graphModels/episode.ts       |  5 +-
 .../app/services/graphModels/statement.ts     |  9 ++--
 .../app/services/knowledgeGraph.server.ts     | 16 ++++--
 apps/webapp/app/services/prompts/normalize.ts | 51 +++++++------------
 .../app/trigger/ingest/ingest-document.ts     |  2 +-
 5 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/apps/webapp/app/services/graphModels/episode.ts b/apps/webapp/app/services/graphModels/episode.ts
index 30c5de2..d569645 100644
--- a/apps/webapp/app/services/graphModels/episode.ts
+++ b/apps/webapp/app/services/graphModels/episode.ts
@@ -1,5 +1,5 @@
 import { runQuery } from "~/lib/neo4j.server";
-import { type EntityNode, type EpisodicNode } from "@core/types";
+import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
 
 export async function saveEpisode(episode: EpisodicNode): Promise<string> {
   const query = `
@@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
 export async function getEpisodeStatements(params: {
   episodeUuid: string;
   userId: string;
-}) {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
     WHERE stmt.invalidAt IS NULL
@@ -326,7 +326,6 @@
     return {
       uuid: stmt.uuid,
       fact: stmt.fact,
-      factEmbedding: stmt.factEmbedding,
       createdAt: new Date(stmt.createdAt),
       validAt: new Date(stmt.validAt),
       invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,
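The graph-model helpers in this patch stop returning embedding vectors: the queries no longer project factEmbedding, and the signatures advertise that with Omit. A minimal sketch of the pattern, under an assumed shape for StatementNode (the real type lives in @core/types; only the fields visible in the returned objects are modeled here):

    // Assumed minimal shape; the real StatementNode is richer.
    interface StatementNode {
      uuid: string;
      fact: string;
      factEmbedding: number[];
      createdAt: Date;
      validAt: Date;
      invalidAt: Date | null;
    }

    // The read-path shape these queries now return: a statement minus its
    // (potentially large) embedding vector.
    type StatementWithoutEmbedding = Omit<StatementNode, "factEmbedding">;

    // Hypothetical helper showing the equivalent in-memory projection.
    function stripEmbedding({ factEmbedding, ...rest }: StatementNode): StatementWithoutEmbedding {
      return rest;
    }

Dropping the vector from read paths keeps statement fetches cheap; similarity search still receives an embedding explicitly, as findSimilarStatements below shows.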
diff --git a/apps/webapp/app/services/graphModels/statement.ts b/apps/webapp/app/services/graphModels/statement.ts
index d7347c7..35411b3 100644
--- a/apps/webapp/app/services/graphModels/statement.ts
+++ b/apps/webapp/app/services/graphModels/statement.ts
@@ -111,7 +111,7 @@ export async function findContradictoryStatements({
   subjectId: string;
   predicateId: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
@@ -131,7 +131,6 @@
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
      createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
   subjectId: string;
   objectId: string;
   excludePredicateId?: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
@@ -186,7 +185,6 @@
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
       createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -212,7 +210,7 @@ export async function findSimilarStatements({
   threshold?: number;
   excludeIds?: string[];
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
     YIELD node AS statement, score
@@ -242,7 +240,6 @@
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
       createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
diff --git a/apps/webapp/app/services/knowledgeGraph.server.ts b/apps/webapp/app/services/knowledgeGraph.server.ts
index f7f8862..9a7ace8 100644
--- a/apps/webapp/app/services/knowledgeGraph.server.ts
+++ b/apps/webapp/app/services/knowledgeGraph.server.ts
@@ -418,10 +418,15 @@ export class KnowledgeGraphService {
     const processingTimeMs = endTime - startTime;
     logger.log(`Processing time: ${processingTimeMs} ms`);
 
+    // Count only truly new statements (exclude duplicates)
+    const newStatementsCount = updatedTriples.filter(triple =>
+      triple.statement.createdAt >= episode.createdAt
+    ).length;
+
     return {
       episodeUuid: episode.uuid,
       // nodesCreated: hydratedNodes.length,
-      statementsCreated: resolvedStatements.length,
+      statementsCreated: newStatementsCount,
       processingTimeMs,
       tokenUsage: tokenMetrics,
     };
@@ -529,6 +534,7 @@ export class KnowledgeGraphService {
       referenceTime: episode.validAt.toISOString(),
     };
 
+    console.log("proprietary model", isProprietaryModel(undefined, 'high'));
     // Statement extraction requires HIGH complexity (causal reasoning, emotional context)
     // Choose between proprietary and OSS prompts based on model type
     const messages = isProprietaryModel(undefined, 'high')
@@ -905,7 +911,7 @@ export class KnowledgeGraphService {
     }
 
     // Step 1: Collect all potential matches for all triples at once
-    const allPotentialMatches: Map<string, StatementNode[]> = new Map();
+    const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
     const allExistingTripleData: Map = new Map();
     // For preparing the LLM context
 
     for (const triple of triples) {
       // Track IDs of statements we've already checked to avoid duplicates
       const checkedStatementIds: string[] = [];
-      let potentialMatches: StatementNode[] = [];
+      let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
 
       // Phase 1a: Find statements with exact subject-predicate match
       // Example: "John lives_in New York" vs "John lives_in San Francisco"
@@ -965,7 +971,7 @@
     }
 
     // Phase 3: Check related memories for contradictory statements
-    const previousEpisodesStatements: StatementNode[] = [];
+    const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
 
     await Promise.all(
       previousEpisodes.map(async (episode) => {
@@ -1264,7 +1270,7 @@
           tokenMetrics.low.output += usage.completionTokens;
           tokenMetrics.low.total += usage.totalTokens;
         }
-      }, undefined, 'low');
+      }, undefined, 'high');
 
     let normalizedEpisodeBody = "";
     const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
     if (outputMatch && outputMatch[1]) {
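The ingestion result now reports only statements minted during this episode. Statements reused from earlier episodes keep their original createdAt, so comparing each statement's timestamp against the episode's createdAt separates new from deduplicated. A small sketch of the counting rule, under assumed minimal shapes (the real Triple and StatementNode types are richer):

    // Assumed minimal shapes for illustration only.
    interface StatementLike { createdAt: Date }
    interface TripleLike { statement: StatementLike }

    // Reused statements predate this episode's createdAt, so only statements
    // created during this ingestion pass the >= comparison.
    function countNewStatements(triples: TripleLike[], episodeCreatedAt: Date): number {
      return triples.filter((t) => t.statement.createdAt >= episodeCreatedAt).length;
    }

Two caveats visible in the hunks above: the console.log before prompt selection reads like a debugging leftover, and the normalization call now runs at 'high' complexity while its usage callback still accumulates into the tokenMetrics.low bucket.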
diff --git a/apps/webapp/app/services/prompts/normalize.ts b/apps/webapp/app/services/prompts/normalize.ts
index 6b92ee8..a469a9e 100644
--- a/apps/webapp/app/services/prompts/normalize.ts
+++ b/apps/webapp/app/services/prompts/normalize.ts
@@ -7,42 +7,25 @@ export const normalizePrompt = (
 Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
-
-Evaluate the episode and apply enrichment ONLY where it adds significant value:
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
-
 
-1. PRIMARY FACTS - always preserve the core information from the episode
-2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp
-3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below)
-4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images
-5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges
-6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections
+
+1. PRIMARY FACTS - Always preserve the core information from the episode
+2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
+3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
+4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
+5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
 
-ENRICHMENT DECISION MATRIX:
-- Clear, complete statement → minimal enrichment (just temporal + attribution)
-- Unclear references → resolve with context
-- Emotional support → preserve feeling, avoid historical dumping
-- New developments → connect to ongoing narrative
-- Visual content → extract specific details as primary facts
-
+When to add context from related memories:
+- Unclear pronouns ("she", "it", "they") → resolve to specific entity
+- Vague references ("the agency", "the event") → add clarifying details
+- Continuation phrases ("following up", "as we discussed") → connect to previous topic
-
-When related memories/previous episodes are provided, evaluate if they improve understanding:
-
-USE CONTEXT when current episode has:
-- Unclear pronouns ("she", "it", "they" without clear antecedent)
-- Vague references ("the agency", "the event" without definition in current episode)
-- Continuation phrases ("following up", "as we discussed")
-- Incomplete information that context clarifies
-
-IGNORE CONTEXT when current episode is:
-- Clear and self-contained ("I got a job in New York")
-- Simple emotional responses ("Thanks, that's great!")
-- Generic encouragement ("You're doing awesome!")
-- Complete statements with all necessary information
-
-DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or
-resolves ambiguity.
-
+When NOT to add context:
+- Clear, self-contained statements → no enrichment needed beyond temporal
+- Emotional responses → preserve tone, avoid over-contextualization
+- Already established topics → don't repeat details mentioned earlier in conversation
 
 Using episode timestamp as anchor, convert ALL relative time references:
@@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
 Transform this document content into enriched factual statements for knowledge graph storage.
 
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
+
 Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:
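Both prompts now open with the same CRITICAL completeness instruction, trading the old selective-enrichment decision matrix for a simpler add/don't-add rule set. Downstream, the model's answer is expected inside <output> tags, matching the regex in the knowledgeGraph.server.ts hunk above. A hypothetical response and the extraction that consumes it (the names, date, and example sentence here are invented for illustration):

    // Hypothetical model response for an episode recorded on 2025-10-02; the
    // enriched sentence resolves a relative date to an absolute one, per the
    // temporal-resolution rule the prompt retains.
    const responseText =
      "<output>On 2025-10-01, the user met Sarah at the Berlin office.</output>";

    // Mirrors the parsing in the knowledgeGraph.server.ts hunk: take the
    // first <output>...</output> block as the normalized episode body.
    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
    const normalizedEpisodeBody = outputMatch?.[1] ?? "";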
diff --git a/apps/webapp/app/trigger/ingest/ingest-document.ts b/apps/webapp/app/trigger/ingest/ingest-document.ts
index a7d7341..deff20b 100644
--- a/apps/webapp/app/trigger/ingest/ingest-document.ts
+++ b/apps/webapp/app/trigger/ingest/ingest-document.ts
@@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
 
 const documentIngestionQueue = queue({
   name: "document-ingestion-queue",
-  concurrencyLimit: 5,
+  concurrencyLimit: 1,
 });
 
 // Register the Document Ingestion Trigger.dev task
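Dropping concurrencyLimit from 5 to 1 serializes document ingestion: runs on document-ingestion-queue execute strictly one at a time, presumably so concurrent documents cannot race on entity and statement resolution in the graph. A sketch of how a task would attach to this queue, assuming Trigger.dev v3's task() from the same SDK that provides queue() in this file (the task id and payload shape are hypothetical):

    import { task } from "@trigger.dev/sdk/v3";

    export const processDocument = task({
      id: "process-document",        // hypothetical id
      queue: documentIngestionQueue, // single slot: runs execute one at a time
      run: async (payload: { documentId: string }) => {
        // With concurrencyLimit: 1, no two documents are ingested
        // concurrently, so their graph writes cannot interleave.
      },
    });

The trade-off is throughput: with a single slot, a backlog of documents drains sequentially.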