Fix: episode normalization with high complexity model (#80)

This commit is contained in:
Manoj 2025-10-02 08:48:56 +05:30 committed by GitHub
parent 46407b0fac
commit 92ca34a02f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 35 additions and 48 deletions

View File

@ -1,5 +1,5 @@
import { runQuery } from "~/lib/neo4j.server"; import { runQuery } from "~/lib/neo4j.server";
import { type EntityNode, type EpisodicNode } from "@core/types"; import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
export async function saveEpisode(episode: EpisodicNode): Promise<string> { export async function saveEpisode(episode: EpisodicNode): Promise<string> {
const query = ` const query = `
@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
export async function getEpisodeStatements(params: { export async function getEpisodeStatements(params: {
episodeUuid: string; episodeUuid: string;
userId: string; userId: string;
}) { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement) MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
WHERE stmt.invalidAt IS NULL WHERE stmt.invalidAt IS NULL
@ -326,7 +326,6 @@ export async function getEpisodeStatements(params: {
return { return {
uuid: stmt.uuid, uuid: stmt.uuid,
fact: stmt.fact, fact: stmt.fact,
factEmbedding: stmt.factEmbedding,
createdAt: new Date(stmt.createdAt), createdAt: new Date(stmt.createdAt),
validAt: new Date(stmt.validAt), validAt: new Date(stmt.validAt),
invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null, invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,

View File

@ -111,7 +111,7 @@ export async function findContradictoryStatements({
subjectId: string; subjectId: string;
predicateId: string; predicateId: string;
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId}) MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate) MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
@ -131,7 +131,6 @@ export async function findContradictoryStatements({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
objectId: string; objectId: string;
excludePredicateId?: string; excludePredicateId?: string;
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId}) MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object) MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
@ -186,7 +185,6 @@ export async function findStatementsWithSameSubjectObject({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@ -212,7 +210,7 @@ export async function findSimilarStatements({
threshold?: number; threshold?: number;
excludeIds?: string[]; excludeIds?: string[];
userId: string; userId: string;
}): Promise<StatementNode[]> { }): Promise<Omit<StatementNode, "factEmbedding">[]> {
const query = ` const query = `
CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding) CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
YIELD node AS statement, score YIELD node AS statement, score
@ -242,7 +240,6 @@ export async function findSimilarStatements({
return { return {
uuid: statement.uuid, uuid: statement.uuid,
fact: statement.fact, fact: statement.fact,
factEmbedding: statement.factEmbedding,
createdAt: new Date(statement.createdAt), createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt), validAt: new Date(statement.validAt),
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null, invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,

View File

@ -418,10 +418,15 @@ export class KnowledgeGraphService {
const processingTimeMs = endTime - startTime; const processingTimeMs = endTime - startTime;
logger.log(`Processing time: ${processingTimeMs} ms`); logger.log(`Processing time: ${processingTimeMs} ms`);
// Count only truly new statements (exclude duplicates)
const newStatementsCount = updatedTriples.filter(triple =>
triple.statement.createdAt >= episode.createdAt
).length;
return { return {
episodeUuid: episode.uuid, episodeUuid: episode.uuid,
// nodesCreated: hydratedNodes.length, // nodesCreated: hydratedNodes.length,
statementsCreated: resolvedStatements.length, statementsCreated: newStatementsCount,
processingTimeMs, processingTimeMs,
tokenUsage: tokenMetrics, tokenUsage: tokenMetrics,
}; };
@ -529,6 +534,7 @@ export class KnowledgeGraphService {
referenceTime: episode.validAt.toISOString(), referenceTime: episode.validAt.toISOString(),
}; };
console.log("proprietary model", isProprietaryModel(undefined, 'high'));
// Statement extraction requires HIGH complexity (causal reasoning, emotional context) // Statement extraction requires HIGH complexity (causal reasoning, emotional context)
// Choose between proprietary and OSS prompts based on model type // Choose between proprietary and OSS prompts based on model type
const messages = isProprietaryModel(undefined, 'high') const messages = isProprietaryModel(undefined, 'high')
@ -905,7 +911,7 @@ export class KnowledgeGraphService {
} }
// Step 1: Collect all potential matches for all triples at once // Step 1: Collect all potential matches for all triples at once
const allPotentialMatches: Map<string, StatementNode[]> = new Map(); const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
const allExistingTripleData: Map<string, Triple> = new Map(); const allExistingTripleData: Map<string, Triple> = new Map();
// For preparing the LLM context // For preparing the LLM context
@ -915,7 +921,7 @@ export class KnowledgeGraphService {
for (const triple of triples) { for (const triple of triples) {
// Track IDs of statements we've already checked to avoid duplicates // Track IDs of statements we've already checked to avoid duplicates
const checkedStatementIds: string[] = []; const checkedStatementIds: string[] = [];
let potentialMatches: StatementNode[] = []; let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
// Phase 1a: Find statements with exact subject-predicate match // Phase 1a: Find statements with exact subject-predicate match
// Example: "John lives_in New York" vs "John lives_in San Francisco" // Example: "John lives_in New York" vs "John lives_in San Francisco"
@ -965,7 +971,7 @@ export class KnowledgeGraphService {
} }
// Phase 3: Check related memories for contradictory statements // Phase 3: Check related memories for contradictory statements
const previousEpisodesStatements: StatementNode[] = []; const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
await Promise.all( await Promise.all(
previousEpisodes.map(async (episode) => { previousEpisodes.map(async (episode) => {
@ -1264,7 +1270,7 @@ export class KnowledgeGraphService {
tokenMetrics.low.output += usage.completionTokens; tokenMetrics.low.output += usage.completionTokens;
tokenMetrics.low.total += usage.totalTokens; tokenMetrics.low.total += usage.totalTokens;
} }
}, undefined, 'low'); }, undefined, 'high');
let normalizedEpisodeBody = ""; let normalizedEpisodeBody = "";
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/); const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) { if (outputMatch && outputMatch[1]) {

View File

@ -7,42 +7,25 @@ export const normalizePrompt = (
Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment. Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
<smart_enrichment_process> CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
Evaluate the episode and apply enrichment ONLY where it adds significant value:
1. PRIMARY FACTS - always preserve the core information from the episode <enrichment_strategy>
2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp 1. PRIMARY FACTS - Always preserve the core information from the episode
3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below) 2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images 3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges 4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections 5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
ENRICHMENT DECISION MATRIX: When to add context from related memories:
- Clear, complete statement → minimal enrichment (just temporal + attribution) - Unclear pronouns ("she", "it", "they") → resolve to specific entity
- Unclear references → resolve with context - Vague references ("the agency", "the event") → add clarifying details
- Emotional support → preserve feeling, avoid historical dumping - Continuation phrases ("following up", "as we discussed") → connect to previous topic
- New developments → connect to ongoing narrative
- Visual content → extract specific details as primary facts
</smart_enrichment_process>
<context_usage_decision> When NOT to add context:
When related memories/previous episodes are provided, evaluate if they improve understanding: - Clear, self-contained statements → no enrichment needed beyond temporal
- Emotional responses → preserve tone, avoid over-contextualization
USE CONTEXT when current episode has: - Already established topics → don't repeat details mentioned earlier in conversation
- Unclear pronouns ("she", "it", "they" without clear antecedent) </enrichment_strategy>
- Vague references ("the agency", "the event" without definition in current episode)
- Continuation phrases ("following up", "as we discussed")
- Incomplete information that context clarifies
IGNORE CONTEXT when current episode is:
- Clear and self-contained ("I got a job in New York")
- Simple emotional responses ("Thanks, that's great!")
- Generic encouragement ("You're doing awesome!")
- Complete statements with all necessary information
DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or
resolves ambiguity.
</context_usage_decision>
<temporal_resolution> <temporal_resolution>
Using episode timestamp as anchor, convert ALL relative time references: Using episode timestamp as anchor, convert ALL relative time references:
@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
Transform this document content into enriched factual statements for knowledge graph storage. Transform this document content into enriched factual statements for knowledge graph storage.
CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
<document_processing_approach> <document_processing_approach>
Focus on STRUCTURED CONTENT EXTRACTION optimized for documents: Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:

View File

@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
const documentIngestionQueue = queue({ const documentIngestionQueue = queue({
name: "document-ingestion-queue", name: "document-ingestion-queue",
concurrencyLimit: 5, concurrencyLimit: 1,
}); });
// Register the Document Ingestion Trigger.dev task // Register the Document Ingestion Trigger.dev task