Fix: episode normalization with high complexity model

Manoj 2025-10-01 22:38:56 +05:30
parent f539ad1ecd
commit 83b59b1c3b
5 changed files with 35 additions and 48 deletions

View File

@@ -1,5 +1,5 @@
 import { runQuery } from "~/lib/neo4j.server";
-import { type EntityNode, type EpisodicNode } from "@core/types";
+import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
 export async function saveEpisode(episode: EpisodicNode): Promise<string> {
   const query = `
@@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
 export async function getEpisodeStatements(params: {
   episodeUuid: string;
   userId: string;
-}) {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
     WHERE stmt.invalidAt IS NULL
@@ -326,7 +326,6 @@ export async function getEpisodeStatements(params: {
   return {
     uuid: stmt.uuid,
     fact: stmt.fact,
-    factEmbedding: stmt.factEmbedding,
     createdAt: new Date(stmt.createdAt),
     validAt: new Date(stmt.validAt),
     invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,
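The pattern across these query-layer changes: the public return type drops the embedding via Omit, so callers can no longer depend on it, and the row mapper stops copying it out of the Neo4j record. A minimal sketch of the idea, assuming a simplified StatementNode shape (the real type lives in @core/types):

// Minimal sketch, assuming a simplified StatementNode; the real type
// lives in @core/types.
interface StatementNode {
  uuid: string;
  fact: string;
  factEmbedding: number[]; // large vector; wasteful to return to callers
  createdAt: Date;
  validAt: Date;
  invalidAt: Date | null;
}

// Omit<> removes the embedding from the public contract, and the row
// mapper simply never copies it from the database record.
type StatementRow = Omit<StatementNode, "factEmbedding">;

function toStatementRow(stmt: Record<string, any>): StatementRow {
  return {
    uuid: stmt.uuid,
    fact: stmt.fact,
    createdAt: new Date(stmt.createdAt),
    validAt: new Date(stmt.validAt),
    invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,
  };
}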

View File

@@ -111,7 +111,7 @@ export async function findContradictoryStatements({
   subjectId: string;
   predicateId: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
@@ -131,7 +131,6 @@ export async function findContradictoryStatements({
   return {
     uuid: statement.uuid,
     fact: statement.fact,
-    factEmbedding: statement.factEmbedding,
     createdAt: new Date(statement.createdAt),
     validAt: new Date(statement.validAt),
     invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
   objectId: string;
   excludePredicateId?: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
@@ -186,7 +185,6 @@ export async function findStatementsWithSameSubjectObject({
   return {
     uuid: statement.uuid,
     fact: statement.fact,
-    factEmbedding: statement.factEmbedding,
     createdAt: new Date(statement.createdAt),
     validAt: new Date(statement.validAt),
     invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -212,7 +210,7 @@ export async function findSimilarStatements({
   threshold?: number;
   excludeIds?: string[];
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
     YIELD node AS statement, score
@@ -242,7 +240,6 @@ export async function findSimilarStatements({
   return {
     uuid: statement.uuid,
     fact: statement.fact,
-    factEmbedding: statement.factEmbedding,
     createdAt: new Date(statement.createdAt),
     validAt: new Date(statement.validAt),
     invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
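findSimilarStatements drives Neo4j's vector index through db.index.vector.queryNodes(indexName, k, queryVector), which yields (node, score) pairs. A hedged sketch of that call shape through the repo's runQuery helper; the helper's exact signature and the score filter are assumptions based on the diff:

// Hedged sketch: db.index.vector.queryNodes(indexName, k, queryVector)
// is the Neo4j 5 vector-index procedure and yields (node, score).
// Assumes runQuery(cypher, params) forwards to the driver and returns
// records; this helper is hypothetical, not the repo's function.
import { runQuery } from "~/lib/neo4j.server";

async function similarStatementUuids(
  factEmbedding: number[],
  topK = 10,
  threshold = 0.85,
): Promise<string[]> {
  const query = `
    CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
    YIELD node AS statement, score
    WHERE score >= $threshold
    RETURN statement.uuid AS uuid
  `;
  // Note: the procedure expects an integer k; depending on driver setup,
  // $topK may need wrapping with neo4j.int().
  const records = await runQuery(query, { topK, factEmbedding, threshold });
  return records.map((r: any) => r.get("uuid"));
}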

View File

@@ -418,10 +418,15 @@ export class KnowledgeGraphService {
     const processingTimeMs = endTime - startTime;
     logger.log(`Processing time: ${processingTimeMs} ms`);
+    // Count only truly new statements (exclude duplicates)
+    const newStatementsCount = updatedTriples.filter(triple =>
+      triple.statement.createdAt >= episode.createdAt
+    ).length;
     return {
       episodeUuid: episode.uuid,
       // nodesCreated: hydratedNodes.length,
-      statementsCreated: resolvedStatements.length,
+      statementsCreated: newStatementsCount,
       processingTimeMs,
       tokenUsage: tokenMetrics,
     };
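The counting fix relies on deduplication reusing existing Statement nodes: a reused statement keeps its original createdAt, which predates the episode, so comparing timestamps isolates the statements minted in this run. A tiny illustration with made-up data:

// Illustration with hypothetical timestamps: a deduplicated triple
// points at an existing statement whose createdAt predates the episode,
// so it is excluded from the count.
const episodeCreatedAt = new Date("2025-10-01T17:00:00Z");
const updatedTriples = [
  { statement: { createdAt: new Date("2025-10-01T17:00:05Z") } }, // new
  { statement: { createdAt: new Date("2025-09-20T09:00:00Z") } }, // reused
];
const statementsCreated = updatedTriples.filter(
  (t) => t.statement.createdAt >= episodeCreatedAt,
).length; // 1, not 2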
@@ -529,6 +534,7 @@ export class KnowledgeGraphService {
       referenceTime: episode.validAt.toISOString(),
     };
+    console.log("proprietary model", isProprietaryModel(undefined, 'high'));
     // Statement extraction requires HIGH complexity (causal reasoning, emotional context)
     // Choose between proprietary and OSS prompts based on model type
     const messages = isProprietaryModel(undefined, 'high')
@@ -905,7 +911,7 @@ export class KnowledgeGraphService {
   }
   // Step 1: Collect all potential matches for all triples at once
-  const allPotentialMatches: Map<string, StatementNode[]> = new Map();
+  const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
   const allExistingTripleData: Map<string, Triple> = new Map();
   // For preparing the LLM context
@@ -915,7 +921,7 @@ export class KnowledgeGraphService {
   for (const triple of triples) {
     // Track IDs of statements we've already checked to avoid duplicates
     const checkedStatementIds: string[] = [];
-    let potentialMatches: StatementNode[] = [];
+    let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
     // Phase 1a: Find statements with exact subject-predicate match
     // Example: "John lives_in New York" vs "John lives_in San Francisco"
@@ -965,7 +971,7 @@ export class KnowledgeGraphService {
   }
   // Phase 3: Check related memories for contradictory statements
-  const previousEpisodesStatements: StatementNode[] = [];
+  const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
   await Promise.all(
     previousEpisodes.map(async (episode) => {
@@ -1264,7 +1270,7 @@ export class KnowledgeGraphService {
       tokenMetrics.low.output += usage.completionTokens;
       tokenMetrics.low.total += usage.totalTokens;
     }
-  }, undefined, 'low');
+  }, undefined, 'high');
   let normalizedEpisodeBody = "";
   const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
   if (outputMatch && outputMatch[1]) {
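The trailing argument switching from 'low' to 'high' selects the model-complexity tier for episode normalization, matching the commit title. A hypothetical sketch of what such tier routing can look like; the function and model names below are illustrative, not the repo's actual code:

// Hypothetical tier-routing sketch; the helper and model names are
// assumptions. The repo's actual call takes the complexity tier as a
// trailing argument, as the diff above shows.
type Complexity = "low" | "high";

function pickModel(complexity: Complexity): string {
  // HIGH complexity (multi-step normalization, causal reasoning) routes
  // to the stronger, slower model; LOW stays on the cheap, fast one.
  return complexity === "high" ? "strong-reasoning-model" : "fast-cheap-model";
}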

View File

@@ -7,42 +7,25 @@ export const normalizePrompt = (
 Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
-<smart_enrichment_process>
-Evaluate the episode and apply enrichment ONLY where it adds significant value:
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
-1. PRIMARY FACTS - always preserve the core information from the episode
-2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp
-3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below)
-4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images
-5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges
-6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections
+<enrichment_strategy>
+1. PRIMARY FACTS - Always preserve the core information from the episode
+2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
+3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
+4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
+5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
-ENRICHMENT DECISION MATRIX:
-- Clear, complete statement → minimal enrichment (just temporal + attribution)
-- Unclear references → resolve with context
-- Emotional support → preserve feeling, avoid historical dumping
-- New developments → connect to ongoing narrative
-- Visual content → extract specific details as primary facts
-</smart_enrichment_process>
+When to add context from related memories:
+- Unclear pronouns ("she", "it", "they") → resolve to specific entity
+- Vague references ("the agency", "the event") → add clarifying details
+- Continuation phrases ("following up", "as we discussed") → connect to previous topic
-<context_usage_decision>
-When related memories/previous episodes are provided, evaluate if they improve understanding:
-USE CONTEXT when current episode has:
-- Unclear pronouns ("she", "it", "they" without clear antecedent)
-- Vague references ("the agency", "the event" without definition in current episode)
-- Continuation phrases ("following up", "as we discussed")
-- Incomplete information that context clarifies
-IGNORE CONTEXT when current episode is:
-- Clear and self-contained ("I got a job in New York")
-- Simple emotional responses ("Thanks, that's great!")
-- Generic encouragement ("You're doing awesome!")
-- Complete statements with all necessary information
-DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or resolves ambiguity.
-</context_usage_decision>
+When NOT to add context:
+- Clear, self-contained statements → no enrichment needed beyond temporal
+- Emotional responses → preserve tone, avoid over-contextualization
+- Already established topics → don't repeat details mentioned earlier in conversation
+</enrichment_strategy>
 <temporal_resolution>
 Using episode timestamp as anchor, convert ALL relative time references:
@@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
 Transform this document content into enriched factual statements for knowledge graph storage.
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
 <document_processing_approach>
 Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:

View File

@@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
 const documentIngestionQueue = queue({
   name: "document-ingestion-queue",
-  concurrencyLimit: 5,
+  concurrencyLimit: 1,
 });
 // Register the Document Ingestion Trigger.dev task
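Dropping concurrencyLimit from 5 to 1 makes document ingestion run strictly one job at a time, so a document's episodes are normalized in order instead of racing each other. A sketch of binding a task to this queue, assuming the Trigger.dev v3 SDK; the task id, payload shape, and queue binding below are hypothetical and may differ by SDK version:

// Sketch, assuming the Trigger.dev v3 SDK; task id and payload shape
// are hypothetical.
import { queue, task } from "@trigger.dev/sdk/v3";

const documentIngestionQueue = queue({
  name: "document-ingestion-queue",
  concurrencyLimit: 1, // one document at a time; no interleaved episodes
});

export const ingestDocument = task({
  id: "ingest-document",
  queue: documentIngestionQueue,
  run: async (payload: { documentId: string }) => {
    // ...chunk the document into episodes and normalize each in order
  },
});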