feat: normalize episode content before storage

This commit is contained in:
Manoj K 2025-06-11 12:38:24 +05:30
parent 56adc246c8
commit 02c7b90374
11 changed files with 232 additions and 112 deletions

View File

@ -6,6 +6,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
MERGE (e:Episode {uuid: $uuid}) MERGE (e:Episode {uuid: $uuid})
ON CREATE SET ON CREATE SET
e.content = $content, e.content = $content,
e.originalContent = $originalContent,
e.contentEmbedding = $contentEmbedding, e.contentEmbedding = $contentEmbedding,
e.type = $type, e.type = $type,
e.source = $source, e.source = $source,
@ -18,6 +19,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
ON MATCH SET ON MATCH SET
e.content = $content, e.content = $content,
e.contentEmbedding = $contentEmbedding, e.contentEmbedding = $contentEmbedding,
e.originalContent = $originalContent,
e.type = $type, e.type = $type,
e.source = $source, e.source = $source,
e.validAt = $validAt, e.validAt = $validAt,
@ -30,6 +32,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
const params = { const params = {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
source: episode.source, source: episode.source,
type: episode.type, type: episode.type,
userId: episode.userId || null, userId: episode.userId || null,
@ -59,6 +62,7 @@ export async function getEpisode(uuid: string): Promise<EpisodicNode | null> {
return { return {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
contentEmbedding: episode.contentEmbedding, contentEmbedding: episode.contentEmbedding,
type: episode.type, type: episode.type,
source: episode.source, source: episode.source,
@ -112,6 +116,7 @@ export async function getRecentEpisodes(params: {
return { return {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
contentEmbedding: episode.contentEmbedding, contentEmbedding: episode.contentEmbedding,
type: episode.type, type: episode.type,
source: episode.source, source: episode.source,

View File

@ -64,10 +64,14 @@ export async function saveTriple(triple: Triple): Promise<string> {
MATCH (object:Entity {uuid: $objectUuid, userId: $userId}) MATCH (object:Entity {uuid: $objectUuid, userId: $userId})
MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId}) MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})
CREATE (episode)-[:HAS_PROVENANCE {uuid: $provenanceEdgeUuid, createdAt: $createdAt}]->(statement) MERGE (episode)-[prov:HAS_PROVENANCE]->(statement)
CREATE (statement)-[:HAS_SUBJECT {uuid: $subjectEdgeUuid, createdAt: $createdAt}]->(subject) ON CREATE SET prov.uuid = $provenanceEdgeUuid, prov.createdAt = $createdAt
CREATE (statement)-[:HAS_PREDICATE {uuid: $predicateEdgeUuid, createdAt: $createdAt}]->(predicate) MERGE (statement)-[subj:HAS_SUBJECT]->(subject)
CREATE (statement)-[:HAS_OBJECT {uuid: $objectEdgeUuid, createdAt: $createdAt}]->(object) ON CREATE SET subj.uuid = $subjectEdgeUuid, subj.createdAt = $createdAt
MERGE (statement)-[pred:HAS_PREDICATE]->(predicate)
ON CREATE SET pred.uuid = $predicateEdgeUuid, pred.createdAt = $createdAt
MERGE (statement)-[obj:HAS_OBJECT]->(object)
ON CREATE SET obj.uuid = $objectEdgeUuid, obj.createdAt = $createdAt
RETURN statement.uuid as uuid RETURN statement.uuid as uuid
`; `;
@ -267,6 +271,7 @@ export async function getTripleForStatement({
const provenance: EpisodicNode = { const provenance: EpisodicNode = {
uuid: episodeProps.uuid, uuid: episodeProps.uuid,
content: episodeProps.content, content: episodeProps.content,
originalContent: episodeProps.originalContent,
source: episodeProps.source, source: episodeProps.source,
type: episodeProps.type, type: episodeProps.type,
createdAt: new Date(episodeProps.createdAt), createdAt: new Date(episodeProps.createdAt),

View File

@ -1,7 +1,6 @@
import { openai } from "@ai-sdk/openai"; import { openai } from "@ai-sdk/openai";
import { type CoreMessage, embed } from "ai"; import { type CoreMessage, embed } from "ai";
import { import {
entityTypes,
EpisodeType, EpisodeType,
LLMModelEnum, LLMModelEnum,
type AddEpisodeParams, type AddEpisodeParams,
@ -27,6 +26,8 @@ import {
saveTriple, saveTriple,
} from "./graphModels/statement"; } from "./graphModels/statement";
import { makeModelCall } from "~/lib/model.server"; import { makeModelCall } from "~/lib/model.server";
import { Apps, getNodeTypes, getNodeTypesString } from "~/utils/presets/nodes";
import { normalizePrompt } from "./prompts";
// Default number of previous episodes to retrieve for context // Default number of previous episodes to retrieve for context
const DEFAULT_EPISODE_WINDOW = 5; const DEFAULT_EPISODE_WINDOW = 5;
@ -61,10 +62,16 @@ export class KnowledgeGraphService {
source: params.source, source: params.source,
}); });
const normalizedEpisodeBody = await this.normalizeEpisodeBody(
params.episodeBody,
params.source,
);
// Step 2: Episode Creation - Create or retrieve the episode // Step 2: Episode Creation - Create or retrieve the episode
const episode: EpisodicNode = { const episode: EpisodicNode = {
uuid: crypto.randomUUID(), uuid: crypto.randomUUID(),
content: params.episodeBody, content: normalizedEpisodeBody,
originalContent: params.episodeBody,
source: params.source, source: params.source,
type: params.type || EpisodeType.Text, type: params.type || EpisodeType.Text,
createdAt: now, createdAt: now,
@ -130,6 +137,12 @@ export class KnowledgeGraphService {
episode: EpisodicNode, episode: EpisodicNode,
previousEpisodes: EpisodicNode[], previousEpisodes: EpisodicNode[],
): Promise<EntityNode[]> { ): Promise<EntityNode[]> {
// Get all app keys
const allAppEnumValues = Object.values(Apps);
// Get all node types
const entityTypes = getNodeTypes(allAppEnumValues);
// Use the prompt library to get the appropriate prompts // Use the prompt library to get the appropriate prompts
const context = { const context = {
episodeContent: episode.content, episodeContent: episode.content,
@ -172,7 +185,9 @@ export class KnowledgeGraphService {
name: entity.name, name: entity.name,
type: entity.type, type: entity.type,
attributes: entity.attributes || {}, attributes: entity.attributes || {},
nameEmbedding: await this.getEmbedding(entity.name), nameEmbedding: await this.getEmbedding(
`${entity.type}: ${entity.name}`,
),
createdAt: new Date(), createdAt: new Date(),
userId: episode.userId, userId: episode.userId,
})), })),
@ -652,4 +667,35 @@ export class KnowledgeGraphService {
return { resolvedStatements, invalidatedStatements }; return { resolvedStatements, invalidatedStatements };
} }
/**
* Normalize an episode by extracting entities and creating nodes and statements
*/
private async normalizeEpisodeBody(episodeBody: string, source: string) {
let appEnumValues: Apps[] = [];
if (Apps[source.toUpperCase() as keyof typeof Apps]) {
appEnumValues = [Apps[source.toUpperCase() as keyof typeof Apps]];
}
const entityTypes = getNodeTypesString(appEnumValues);
const context = {
episodeContent: episodeBody,
entityTypes: entityTypes,
source,
};
const messages = normalizePrompt(context);
let responseText = "";
await makeModelCall(false, LLMModelEnum.GPT41, messages, (text) => {
responseText = text;
});
let normalizedEpisodeBody = "";
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) {
normalizedEpisodeBody = outputMatch[1].trim();
} else {
normalizedEpisodeBody = episodeBody;
}
return normalizedEpisodeBody;
}
} }

View File

@ -1,2 +1,3 @@
export * from "./nodes"; export * from "./nodes";
export * from "./statements"; export * from "./statements";
export * from "./normalize";

View File

@ -4,30 +4,6 @@
import { type CoreMessage } from "ai"; import { type CoreMessage } from "ai";
export interface ExtractedEntity {
name: string;
type: string;
attributes?: Record<string, any>;
}
export interface ExtractedEntities {
entities: ExtractedEntity[];
}
export interface MissedEntities {
missedEntities: string[];
}
export interface EntityClassificationTriple {
uuid: string;
name: string;
type: string | null;
}
export interface EntityClassification {
entityClassifications: EntityClassificationTriple[];
}
/** /**
* Extract entities from an episode using message-based approach * Extract entities from an episode using message-based approach
*/ */
@ -47,10 +23,14 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
1. **Entity Identification**: 1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE. - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE.
- **Exclude** entities mentioned only in the PREVIOUS EPISODES (they are for context only). - **Exclude** entities mentioned only in the PREVIOUS EPISODES (they are for context only).
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X).
- For pronouns that refer to named entities, extract them as separate Alias entities.
2. **Entity Classification**: 2. **Entity Classification**:
- Use the descriptions in ENTITY TYPES to classify each extracted entity. - Use the descriptions in ENTITY TYPES to classify each extracted entity.
- Assign the appropriate type for each one. - Assign the appropriate type for each one.
- Classify pronouns (I, me, you, etc.) as Alias entities.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
@ -58,6 +38,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
4. **Formatting**: 4. **Formatting**:
- Be **explicit and unambiguous** in naming entities (e.g., use full names when available). - Be **explicit and unambiguous** in naming entities (e.g., use full names when available).
- For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you").
Format your response as a JSON object with the following structure: Format your response as a JSON object with the following structure:
@ -113,10 +94,13 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli
1. **Entity Identification**: 1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT. - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT.
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X).
- For pronouns that refer to named entities, extract them as separate Alias entities.
2. **Entity Classification**: 2. **Entity Classification**:
- Use the descriptions in ENTITY TYPES to classify each extracted entity. - Use the descriptions in ENTITY TYPES to classify each extracted entity.
- Assign the appropriate type for each one. - Assign the appropriate type for each one.
- Classify pronouns (I, me, you, etc.) as Alias entities.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
@ -124,7 +108,7 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli
4. **Formatting**: 4. **Formatting**:
- Be **explicit and unambiguous** when naming entities (e.g., use full names when available). - Be **explicit and unambiguous** when naming entities (e.g., use full names when available).
- For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you").
Format your response as a JSON object with the following structure: Format your response as a JSON object with the following structure:
<output> <output>

View File

@ -0,0 +1,47 @@
import { type CoreMessage } from "ai";
/**
 * Build the chat messages used to normalize raw episode content into
 * third-person factual memory statements before storage.
 *
 * Fixes a garbled sentence in the system prompt where the em-dashes around
 * "such as user input, system events, or assistant actions" were lost.
 *
 * @param context - Expects `episodeContent` (raw text), `entityTypes`
 *   (formatted node-type listing), and `source` (originating app name).
 * @returns A [system, user] message pair for the model call.
 */
export const normalizePrompt = (
  context: Record<string, any>,
): CoreMessage[] => {
  const sysPrompt = `
You are a memory extraction system. Your task is to convert input information — such as user input, system events, or assistant actions — into clear, concise, third-person factual statements suitable for storage in a memory graph. These statements should be easily understandable and retrievable by any system or agent.

## Memory Processing Guidelines
- Always output memory statements in the third person (e.g., "User prefers...", "The assistant performed...", "The system detected...").
- Convert input information into clear, concise memory statements.
- Maintain a neutral, factual tone in all memory entries.
- Structure memories as factual statements, not questions.
- Include relevant context and temporal information when available.

## Node Entity Types
${context.entityTypes}

## Memory Graph Integration
- Each memory will be converted to a node in the memory graph.
- Include relevant relationships between memory items when possible.
- Specify temporal aspects when memories are time-sensitive.
- Format memories to support efficient retrieval by any system or agent.

When processing new information for memory storage, focus on extracting the core facts, preferences, and events that will be most useful for future reference by any system or agent.

<output>
{{processed_statement}}
</output>
`;

  const userPrompt = `
<CONTENT>
${context.episodeContent}
</CONTENT>

<SOURCE>
${context.source}
</SOURCE>
`;

  return [
    { role: "system", content: sysPrompt },
    { role: "user", content: userPrompt },
  ];
};

View File

@ -4,6 +4,7 @@ import { embed } from "ai";
import { logger } from "./logger.service"; import { logger } from "./logger.service";
import { applyCrossEncoderReranking, applyWeightedRRF } from "./search/rerank"; import { applyCrossEncoderReranking, applyWeightedRRF } from "./search/rerank";
import { import {
getEpisodesByStatements,
performBfsSearch, performBfsSearch,
performBM25Search, performBM25Search,
performVectorSearch, performVectorSearch,
@ -34,7 +35,7 @@ export class SearchService {
query: string, query: string,
userId: string, userId: string,
options: SearchOptions = {}, options: SearchOptions = {},
): Promise<string[]> { ): Promise<{ episodes: string[]; facts: string[] }> {
// Default options // Default options
const opts: Required<SearchOptions> = { const opts: Required<SearchOptions> = {
limit: options.limit || 10, limit: options.limit || 10,
@ -71,7 +72,11 @@ export class SearchService {
const filteredResults = this.applyAdaptiveFiltering(rankedStatements, opts); const filteredResults = this.applyAdaptiveFiltering(rankedStatements, opts);
// 3. Return top results // 3. Return top results
return filteredResults.map((statement) => statement.fact); const episodes = await getEpisodesByStatements(filteredResults);
return {
episodes: episodes.map((episode) => episode.content),
facts: filteredResults.map((statement) => statement.fact),
};
} }
/** /**
@ -84,12 +89,14 @@ export class SearchService {
): StatementNode[] { ): StatementNode[] {
if (results.length === 0) return []; if (results.length === 0) return [];
let isRRF = false;
// Extract scores from results // Extract scores from results
const scoredResults = results.map((result) => { const scoredResults = results.map((result) => {
// Find the score based on reranking strategy used // Find the score based on reranking strategy used
let score = 0; let score = 0;
if ((result as any).rrfScore !== undefined) { if ((result as any).rrfScore !== undefined) {
score = (result as any).rrfScore; score = (result as any).rrfScore;
isRRF = true;
} else if ((result as any).mmrScore !== undefined) { } else if ((result as any).mmrScore !== undefined) {
score = (result as any).mmrScore; score = (result as any).mmrScore;
} else if ((result as any).crossEncoderScore !== undefined) { } else if ((result as any).crossEncoderScore !== undefined) {
@ -117,16 +124,31 @@ export class SearchService {
const minScore = Math.min(...scores); const minScore = Math.min(...scores);
const scoreRange = maxScore - minScore; const scoreRange = maxScore - minScore;
// Define a minimum quality threshold as a fraction of the best score let threshold = 0;
// This is relative to the query's score distribution rather than an absolute value if (isRRF || scoreRange < 0.01) {
const relativeThreshold = options.scoreThreshold || 0.3; // 30% of the best score by default // For RRF or other compressed score ranges, use a percentile-based approach
const absoluteMinimum = 0.1; // Absolute minimum threshold to prevent keeping very poor matches // Keep top 70% (or whatever is specified in options) of results
const keepPercentage = 1 - (options.scoreThreshold || 0.3);
const keepCount = Math.max(
1,
Math.ceil(scoredResults.length * keepPercentage),
);
// Calculate the actual threshold as a percentage of the distance from min to max score // Set threshold to the score of the last item we want to keep
const threshold = Math.max( threshold =
keepCount < scoredResults.length
? scoredResults[keepCount - 1].score
: 0;
} else {
// For normal score distributions, use the relative threshold approach
const relativeThreshold = options.scoreThreshold || 0.3;
const absoluteMinimum = 0.1;
threshold = Math.max(
absoluteMinimum, absoluteMinimum,
minScore + scoreRange * relativeThreshold, minScore + scoreRange * relativeThreshold,
); );
}
// Filter out low-quality results // Filter out low-quality results
const filteredResults = scoredResults const filteredResults = scoredResults

View File

@ -89,7 +89,7 @@ export async function applyCrossEncoderReranking(
{ {
role: "system", role: "system",
content: `You are an expert tasked with determining whether the statement is relevant to the query content: `You are an expert tasked with determining whether the statement is relevant to the query
Respond with "True" if PASSAGE is relevant to QUERY and "False" otherwise.`, Respond with "True" if STATEMENT is relevant to QUERY and "False" otherwise.`,
}, },
{ {
role: "user", role: "user",

View File

@ -1,4 +1,4 @@
import type { EntityNode, StatementNode } from "@core/types"; import type { EntityNode, StatementNode, EpisodicNode } from "@core/types";
import type { SearchOptions } from "../search.server"; import type { SearchOptions } from "../search.server";
import type { Embedding } from "ai"; import type { Embedding } from "ai";
import { logger } from "../logger.service"; import { logger } from "../logger.service";
@ -219,3 +219,20 @@ export function combineAndDeduplicateStatements(
).values(), ).values(),
); );
} }
/**
 * Fetch the distinct episodes that provide provenance for the given
 * statements, following (episode)-[:HAS_PROVENANCE]->(statement) edges.
 *
 * @param statements - Statements whose source episodes should be loaded.
 * @returns Distinct episodes linked to any of the statements; empty array
 *   when no statements are supplied (no query is issued in that case).
 */
export async function getEpisodesByStatements(
  statements: StatementNode[],
): Promise<EpisodicNode[]> {
  // Short-circuit: an empty IN list can never match — skip the DB round trip.
  if (statements.length === 0) {
    return [];
  }

  const cypher = `
    MATCH (s:Statement)<-[:HAS_PROVENANCE]-(e:Episode)
    WHERE s.uuid IN $statementUuids
    RETURN distinct e
  `;
  const params = {
    statementUuids: statements.map((s) => s.uuid),
  };

  const records = await runQuery(cypher, params);
  // Raw node properties are trusted to match EpisodicNode's shape —
  // NOTE(review): dates come back in driver format, not Date; confirm callers
  // only read string fields (search.server.ts reads episode.content).
  return records.map((record) => record.get("e").properties as EpisodicNode);
}

View File

@ -1,11 +1,13 @@
export enum Apps { export enum Apps {
LINEAR = "LINEAR", LINEAR = "LINEAR",
SLACK = "SLACK", SLACK = "SLACK",
SOL = "SOL",
} }
export const AppNames = { export const AppNames = {
[Apps.LINEAR]: "Linear", [Apps.LINEAR]: "Linear",
[Apps.SLACK]: "Slack", [Apps.SLACK]: "Slack",
[Apps.SOL]: "Sol",
} as const; } as const;
// General node types that are common across all apps // General node types that are common across all apps
@ -38,6 +40,33 @@ export const GENERAL_NODE_TYPES = {
// App-specific node types // App-specific node types
export const APP_NODE_TYPES = { export const APP_NODE_TYPES = {
[Apps.SOL]: {
TASK: {
name: "Sol Task",
description:
"An independent unit of work in Sol, such as a task, bug report, or feature request. Tasks can be associated with lists or linked as subtasks to other tasks.",
},
LIST: {
name: "Sol List",
description:
"A flexible container in Sol for organizing content such as tasks, text, or references. Lists are used for task tracking, information collections, or reference materials.",
},
PREFERENCE: {
name: "Sol Preference",
description:
"A user-stated intent, setting, or configuration in Sol, such as preferred formats, notification settings, timezones, or other customizations. Preferences reflect how a user wants the system to behave.",
},
COMMAND: {
name: "Sol Command",
description:
"A user-issued command or trigger phrase, often starting with '/' or '@', that directs the system or an app to perform a specific action. Commands should always be extracted as distinct, important user actions.",
},
AUTOMATION: {
name: "Sol Automation",
description:
"A workflow or rule in Sol that automatically performs actions based on specific conditions or triggers, such as recurring tasks, reminders, or integrations with other systems.",
},
},
[Apps.LINEAR]: { [Apps.LINEAR]: {
ISSUE: { ISSUE: {
name: "Linear Issue", name: "Linear Issue",
@ -102,3 +131,33 @@ export function getNodeTypes(apps: Array<keyof typeof APP_NODE_TYPES>) {
appSpecific: appSpecificTypes, appSpecific: appSpecificTypes,
}; };
} }
export function getNodeTypesString(apps: Array<keyof typeof APP_NODE_TYPES>) {
let nodeTypesString = "";
const generalTypes = Object.entries(GENERAL_NODE_TYPES)
.map(([key, value]) => {
return `- ${key}: "${value.description}"`;
})
.join("\n");
nodeTypesString += `General Node Types:\n${generalTypes}\n\n`;
const appSpecificTypes = apps.reduce((acc, appName) => {
return {
...acc,
[appName]: APP_NODE_TYPES[appName],
};
}, {});
const appSpecificTypesString = Object.entries(appSpecificTypes)
.map(([appName, types]) => {
return `For ${appName}:\n${Object.entries(types as any)
.map(([key, value]: any) => {
return `- ${key}: "${value.description}"`;
})
.join("\n")}\n\n`;
})
.join("\n\n");
nodeTypesString += `App-specific Node Types:\n${appSpecificTypesString}`;
return nodeTypesString;
}

View File

@ -10,6 +10,7 @@ export enum EpisodeType {
export interface EpisodicNode { export interface EpisodicNode {
uuid: string; uuid: string;
content: string; content: string;
originalContent: string;
contentEmbedding?: number[]; contentEmbedding?: number[];
type: string; type: string;
source: string; source: string;
@ -82,70 +83,3 @@ export type AddEpisodeResult = {
statementsCreated: number; statementsCreated: number;
processingTimeMs: number; processingTimeMs: number;
}; };
export const entityTypes = {
general: {
PERSON: {
type: "PERSON",
description: "Any named individual mentioned or referenced.",
},
APP: {
type: "APP",
description: "Any third-party service or platform used by the user.",
},
SOL_AUTOMATION: {
type: "SOL_AUTOMATION",
description: "User workflows or flows combining triggers and actions.",
},
SOL_PREFERENCE: {
type: "SOL_PREFERENCE",
description: "User-stated intent, setting, or configuration like format, timezone, etc.",
},
COMMAND: {
type: "COMMAND",
description: "Trigger phrase mapped to an internal action, often starts with / or !",
},
TASK: {
type: "TASK",
description: "User-stated or inferred goal; may link to Person or App.",
},
EVENT: {
type: "EVENT",
description: "Time-based mention; supports parsing of phrases like 'next week', 'tomorrow'.",
},
LABEL: {
type: "LABEL",
description: "Optional categorization tag for organization or filtering.",
},
OBJECT: {
type: "OBJECT",
description: "Named non-person objects in the user's world (e.g., Projector, Car).",
},
TEAM: {
type: "TEAM",
description: "User-defined group of people, useful for permissions or targeting.",
},
},
app_specific: {
SLACK_CHANNEL: {
type: "SLACK_CHANNEL",
description: "Slack channel where automations or communications happen.",
},
SLACK_USER: {
type: "SLACK_USER",
description: "A user in Slack, can be tagged or messaged.",
},
GMAIL_THREAD: {
type: "GMAIL_THREAD",
description: "An email conversation thread in Gmail.",
},
NOTION_PAGE: {
type: "NOTION_PAGE",
description: "A page in Notion workspace.",
},
CALENDAR_EVENT: {
type: "CALENDAR_EVENT",
description: "Event from user's calendar (Google, Outlook, etc.).",
},
},
};