From 02c7b903743db19b54ff1522c8a5854398254eab Mon Sep 17 00:00:00 2001 From: Manoj K Date: Wed, 11 Jun 2025 12:38:24 +0530 Subject: [PATCH] feat: add normalize episode content before storage --- .../app/services/graphModels/episode.ts | 5 ++ .../app/services/graphModels/statement.ts | 13 ++-- .../app/services/knowledgeGraph.server.ts | 52 +++++++++++++- apps/webapp/app/services/prompts/index.ts | 1 + apps/webapp/app/services/prompts/nodes.ts | 34 +++------- apps/webapp/app/services/prompts/normalize.ts | 47 +++++++++++++ apps/webapp/app/services/search.server.ts | 44 +++++++++--- apps/webapp/app/services/search/rerank.ts | 2 +- apps/webapp/app/services/search/utils.ts | 19 +++++- apps/webapp/app/utils/presets/nodes.ts | 59 ++++++++++++++++ packages/types/src/graph/graph.entity.ts | 68 +------------------ 11 files changed, 232 insertions(+), 112 deletions(-) create mode 100644 apps/webapp/app/services/prompts/normalize.ts diff --git a/apps/webapp/app/services/graphModels/episode.ts b/apps/webapp/app/services/graphModels/episode.ts index b78366a..e1657a5 100644 --- a/apps/webapp/app/services/graphModels/episode.ts +++ b/apps/webapp/app/services/graphModels/episode.ts @@ -6,6 +6,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise { MERGE (e:Episode {uuid: $uuid}) ON CREATE SET e.content = $content, + e.originalContent = $originalContent, e.contentEmbedding = $contentEmbedding, e.type = $type, e.source = $source, @@ -18,6 +19,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise { ON MATCH SET e.content = $content, e.contentEmbedding = $contentEmbedding, + e.originalContent = $originalContent, e.type = $type, e.source = $source, e.validAt = $validAt, @@ -30,6 +32,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise { const params = { uuid: episode.uuid, content: episode.content, + originalContent: episode.originalContent, source: episode.source, type: episode.type, userId: episode.userId || null, @@ -59,6 +62,7 @@ export async function getEpisode(uuid: string): Promise { return { uuid: episode.uuid, content: episode.content, + originalContent: episode.originalContent, contentEmbedding: episode.contentEmbedding, type: episode.type, source: episode.source, @@ -112,6 +116,7 @@ export async function getRecentEpisodes(params: { return { uuid: episode.uuid, content: episode.content, + originalContent: episode.originalContent, contentEmbedding: episode.contentEmbedding, type: episode.type, source: episode.source, diff --git a/apps/webapp/app/services/graphModels/statement.ts b/apps/webapp/app/services/graphModels/statement.ts index 3a66e27..b5c2dc4 100644 --- a/apps/webapp/app/services/graphModels/statement.ts +++ b/apps/webapp/app/services/graphModels/statement.ts @@ -64,10 +64,14 @@ export async function saveTriple(triple: Triple): Promise { MATCH (object:Entity {uuid: $objectUuid, userId: $userId}) MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId}) - CREATE (episode)-[:HAS_PROVENANCE {uuid: $provenanceEdgeUuid, createdAt: $createdAt}]->(statement) - CREATE (statement)-[:HAS_SUBJECT {uuid: $subjectEdgeUuid, createdAt: $createdAt}]->(subject) - CREATE (statement)-[:HAS_PREDICATE {uuid: $predicateEdgeUuid, createdAt: $createdAt}]->(predicate) - CREATE (statement)-[:HAS_OBJECT {uuid: $objectEdgeUuid, createdAt: $createdAt}]->(object) + MERGE (episode)-[prov:HAS_PROVENANCE]->(statement) + ON CREATE SET prov.uuid = $provenanceEdgeUuid, prov.createdAt = $createdAt + MERGE (statement)-[subj:HAS_SUBJECT]->(subject) + ON CREATE SET subj.uuid = $subjectEdgeUuid, subj.createdAt = $createdAt + MERGE (statement)-[pred:HAS_PREDICATE]->(predicate) + ON CREATE SET pred.uuid = $predicateEdgeUuid, pred.createdAt = $createdAt + MERGE (statement)-[obj:HAS_OBJECT]->(object) + ON CREATE SET obj.uuid = $objectEdgeUuid, obj.createdAt = $createdAt RETURN statement.uuid as uuid `; @@ -267,6 +271,7 @@ export async function getTripleForStatement({ const provenance: EpisodicNode = { uuid: episodeProps.uuid, content: episodeProps.content, + originalContent: episodeProps.originalContent, source: episodeProps.source, type: episodeProps.type, createdAt: new Date(episodeProps.createdAt), diff --git a/apps/webapp/app/services/knowledgeGraph.server.ts b/apps/webapp/app/services/knowledgeGraph.server.ts index 966382a..3116164 100644 --- a/apps/webapp/app/services/knowledgeGraph.server.ts +++ b/apps/webapp/app/services/knowledgeGraph.server.ts @@ -1,7 +1,6 @@ import { openai } from "@ai-sdk/openai"; import { type CoreMessage, embed } from "ai"; import { - entityTypes, EpisodeType, LLMModelEnum, type AddEpisodeParams, @@ -27,6 +26,8 @@ import { saveTriple, } from "./graphModels/statement"; import { makeModelCall } from "~/lib/model.server"; +import { Apps, getNodeTypes, getNodeTypesString } from "~/utils/presets/nodes"; +import { normalizePrompt } from "./prompts"; // Default number of previous episodes to retrieve for context const DEFAULT_EPISODE_WINDOW = 5; @@ -61,10 +62,16 @@ export class KnowledgeGraphService { source: params.source, }); + const normalizedEpisodeBody = await this.normalizeEpisodeBody( + params.episodeBody, + params.source, + ); + // Step 2: Episode Creation - Create or retrieve the episode const episode: EpisodicNode = { uuid: crypto.randomUUID(), - content: params.episodeBody, + content: normalizedEpisodeBody, + originalContent: params.episodeBody, source: params.source, type: params.type || EpisodeType.Text, createdAt: now, @@ -130,6 +137,12 @@ export class KnowledgeGraphService { episode: EpisodicNode, previousEpisodes: EpisodicNode[], ): Promise { + // Get all app keys + const allAppEnumValues = Object.values(Apps); + + // Get all node types + const entityTypes = getNodeTypes(allAppEnumValues); + // Use the prompt library to get the appropriate prompts const context = { episodeContent: episode.content, @@ -172,7 +185,9 @@ export class KnowledgeGraphService { name: entity.name, type: entity.type, attributes: entity.attributes || {}, - nameEmbedding: await this.getEmbedding(entity.name), + nameEmbedding: await this.getEmbedding( + `${entity.type}: ${entity.name}`, + ), createdAt: new Date(), userId: episode.userId, })), @@ -652,4 +667,35 @@ export class KnowledgeGraphService { return { resolvedStatements, invalidatedStatements }; } + + /** + * Normalize an episode by extracting entities and creating nodes and statements + */ + private async normalizeEpisodeBody(episodeBody: string, source: string) { + let appEnumValues: Apps[] = []; + if (Apps[source.toUpperCase() as keyof typeof Apps]) { + appEnumValues = [Apps[source.toUpperCase() as keyof typeof Apps]]; + } + const entityTypes = getNodeTypesString(appEnumValues); + + const context = { + episodeContent: episodeBody, + entityTypes: entityTypes, + source, + }; + const messages = normalizePrompt(context); + let responseText = ""; + await makeModelCall(false, LLMModelEnum.GPT41, messages, (text) => { + responseText = text; + }); + let normalizedEpisodeBody = ""; + const outputMatch = responseText.match(/([\s\S]*?)<\/output>/); + if (outputMatch && outputMatch[1]) { + normalizedEpisodeBody = outputMatch[1].trim(); + } else { + normalizedEpisodeBody = episodeBody; + } + + return normalizedEpisodeBody; + } } diff --git a/apps/webapp/app/services/prompts/index.ts b/apps/webapp/app/services/prompts/index.ts index 0fc6732..5d28fd2 100644 --- a/apps/webapp/app/services/prompts/index.ts +++ b/apps/webapp/app/services/prompts/index.ts @@ -1,2 +1,3 @@ export * from "./nodes"; export * from "./statements"; +export * from "./normalize"; diff --git a/apps/webapp/app/services/prompts/nodes.ts b/apps/webapp/app/services/prompts/nodes.ts index f0aa792..9139e8d 100644 --- a/apps/webapp/app/services/prompts/nodes.ts +++ b/apps/webapp/app/services/prompts/nodes.ts @@ -4,30 +4,6 @@ import { type CoreMessage } from "ai"; -export interface ExtractedEntity { - name: string; - type: string; - attributes?: Record; -} - -export interface ExtractedEntities { - entities: ExtractedEntity[]; -} - -export interface MissedEntities { - missedEntities: string[]; -} - -export interface EntityClassificationTriple { - uuid: string; - name: string; - type: string | null; -} - -export interface EntityClassification { - entityClassifications: EntityClassificationTriple[]; -} - /** * Extract entities from an episode using message-based approach */ @@ -47,10 +23,14 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr 1. **Entity Identification**: - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE. - **Exclude** entities mentioned only in the PREVIOUS EPISODES (they are for context only). + - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X). + - For pronouns that refer to named entities, extract them as separate Alias entities. + 2. **Entity Classification**: - Use the descriptions in ENTITY TYPES to classify each extracted entity. - Assign the appropriate type for each one. + - Classify pronouns (I, me, you, etc.) as Alias entities. 3. **Exclusions**: - Do NOT extract entities representing relationships or actions (predicates will be handled separately). @@ -58,6 +38,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr 4. **Formatting**: - Be **explicit and unambiguous** in naming entities (e.g., use full names when available). + - For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you"). Format your response as a JSON object with the following structure: @@ -113,10 +94,13 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli 1. **Entity Identification**: - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT. + - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X). + - For pronouns that refer to named entities, extract them as separate Alias entities. 2. **Entity Classification**: - Use the descriptions in ENTITY TYPES to classify each extracted entity. - Assign the appropriate type for each one. + - Classify pronouns (I, me, you, etc.) as Alias entities. 3. **Exclusions**: - Do NOT extract entities representing relationships or actions (predicates will be handled separately). @@ -124,7 +108,7 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli 4. **Formatting**: - Be **explicit and unambiguous** when naming entities (e.g., use full names when available). - + - For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you"). Format your response as a JSON object with the following structure: diff --git a/apps/webapp/app/services/prompts/normalize.ts b/apps/webapp/app/services/prompts/normalize.ts new file mode 100644 index 0000000..689b8e0 --- /dev/null +++ b/apps/webapp/app/services/prompts/normalize.ts @@ -0,0 +1,47 @@ +import { type CoreMessage } from "ai"; + +export const normalizePrompt = ( + context: Record, +): CoreMessage[] => { + const sysPrompt = ` +You are a memory extraction system. Your task is to convert input information—such as user input, system events, or assistant actions—into clear, concise, third-person factual statements suitable for storage in a memory graph. These statements should be easily understandable and retrievable by any system or agent. + +## Memory Processing Guidelines +- Always output memory statements in the third person (e.g., "User prefers...", "The assistant performed...", "The system detected..."). +- Convert input information into clear, concise memory statements. +- Maintain a neutral, factual tone in all memory entries. +- Structure memories as factual statements, not questions. +- Include relevant context and temporal information when available. + +## Node Entity Types +${context.entityTypes} + +## Memory Graph Integration +- Each memory will be converted to a node in the memory graph. +- Include relevant relationships between memory items when possible. +- Specify temporal aspects when memories are time-sensitive. +- Format memories to support efficient retrieval by any system or agent. + +When processing new information for memory storage, focus on extracting the core facts, preferences, and events that will be most useful for future reference by any system or agent. + + +{{processed_statement}} + +`; + + const userPrompt = ` + +${context.episodeContent} + + + +${context.source} + + +`; + + return [ + { role: "system", content: sysPrompt }, + { role: "user", content: userPrompt }, + ]; +}; diff --git a/apps/webapp/app/services/search.server.ts b/apps/webapp/app/services/search.server.ts index 5898869..2a444fc 100644 --- a/apps/webapp/app/services/search.server.ts +++ b/apps/webapp/app/services/search.server.ts @@ -4,6 +4,7 @@ import { embed } from "ai"; import { logger } from "./logger.service"; import { applyCrossEncoderReranking, applyWeightedRRF } from "./search/rerank"; import { + getEpisodesByStatements, performBfsSearch, performBM25Search, performVectorSearch, @@ -34,7 +35,7 @@ export class SearchService { query: string, userId: string, options: SearchOptions = {}, - ): Promise { + ): Promise<{ episodes: string[]; facts: string[] }> { // Default options const opts: Required = { limit: options.limit || 10, @@ -71,7 +72,11 @@ export class SearchService { const filteredResults = this.applyAdaptiveFiltering(rankedStatements, opts); // 3. Return top results - return filteredResults.map((statement) => statement.fact); + const episodes = await getEpisodesByStatements(filteredResults); + return { + episodes: episodes.map((episode) => episode.content), + facts: filteredResults.map((statement) => statement.fact), + }; } /** @@ -84,12 +89,14 @@ export class SearchService { ): StatementNode[] { if (results.length === 0) return []; + let isRRF = false; // Extract scores from results const scoredResults = results.map((result) => { // Find the score based on reranking strategy used let score = 0; if ((result as any).rrfScore !== undefined) { score = (result as any).rrfScore; + isRRF = true; } else if ((result as any).mmrScore !== undefined) { score = (result as any).mmrScore; } else if ((result as any).crossEncoderScore !== undefined) { @@ -117,16 +124,31 @@ export class SearchService { const minScore = Math.min(...scores); const scoreRange = maxScore - minScore; - // Define a minimum quality threshold as a fraction of the best score - // This is relative to the query's score distribution rather than an absolute value - const relativeThreshold = options.scoreThreshold || 0.3; // 30% of the best score by default - const absoluteMinimum = 0.1; // Absolute minimum threshold to prevent keeping very poor matches + let threshold = 0; + if (isRRF || scoreRange < 0.01) { + // For RRF or other compressed score ranges, use a percentile-based approach + // Keep top 70% (or whatever is specified in options) of results + const keepPercentage = 1 - (options.scoreThreshold || 0.3); + const keepCount = Math.max( + 1, + Math.ceil(scoredResults.length * keepPercentage), + ); - // Calculate the actual threshold as a percentage of the distance from min to max score - const threshold = Math.max( - absoluteMinimum, - minScore + scoreRange * relativeThreshold, - ); + // Set threshold to the score of the last item we want to keep + threshold = + keepCount < scoredResults.length + ? scoredResults[keepCount - 1].score + : 0; + } else { + // For normal score distributions, use the relative threshold approach + const relativeThreshold = options.scoreThreshold || 0.3; + const absoluteMinimum = 0.1; + + threshold = Math.max( + absoluteMinimum, + minScore + scoreRange * relativeThreshold, + ); + } // Filter out low-quality results const filteredResults = scoredResults diff --git a/apps/webapp/app/services/search/rerank.ts b/apps/webapp/app/services/search/rerank.ts index caf10fa..b2eadaf 100644 --- a/apps/webapp/app/services/search/rerank.ts +++ b/apps/webapp/app/services/search/rerank.ts @@ -89,7 +89,7 @@ export async function applyCrossEncoderReranking( { role: "system", content: `You are an expert tasked with determining whether the statement is relevant to the query - Respond with "True" if PASSAGE is relevant to QUERY and "False" otherwise.`, + Respond with "True" if STATEMENT is relevant to QUERY and "False" otherwise.`, }, { role: "user", diff --git a/apps/webapp/app/services/search/utils.ts b/apps/webapp/app/services/search/utils.ts index e456981..ec06fbb 100644 --- a/apps/webapp/app/services/search/utils.ts +++ b/apps/webapp/app/services/search/utils.ts @@ -1,4 +1,4 @@ -import type { EntityNode, StatementNode } from "@core/types"; +import type { EntityNode, StatementNode, EpisodicNode } from "@core/types"; import type { SearchOptions } from "../search.server"; import type { Embedding } from "ai"; import { logger } from "../logger.service"; @@ -219,3 +219,20 @@ export function combineAndDeduplicateStatements( ).values(), ); } + +export async function getEpisodesByStatements( + statements: StatementNode[], +): Promise { + const cypher = ` + MATCH (s:Statement)<-[:HAS_PROVENANCE]-(e:Episode) + WHERE s.uuid IN $statementUuids + RETURN distinct e + `; + + const params = { + statementUuids: statements.map((s) => s.uuid), + }; + + const records = await runQuery(cypher, params); + return records.map((record) => record.get("e").properties as EpisodicNode); +} diff --git a/apps/webapp/app/utils/presets/nodes.ts b/apps/webapp/app/utils/presets/nodes.ts index 2e02d23..41c9d99 100644 --- a/apps/webapp/app/utils/presets/nodes.ts +++ b/apps/webapp/app/utils/presets/nodes.ts @@ -1,11 +1,13 @@ export enum Apps { LINEAR = "LINEAR", SLACK = "SLACK", + SOL = "SOL", } export const AppNames = { [Apps.LINEAR]: "Linear", [Apps.SLACK]: "Slack", + [Apps.SOL]: "Sol", } as const; // General node types that are common across all apps @@ -38,6 +40,33 @@ export const GENERAL_NODE_TYPES = { // App-specific node types export const APP_NODE_TYPES = { + [Apps.SOL]: { + TASK: { + name: "Sol Task", + description: + "An independent unit of work in Sol, such as a task, bug report, or feature request. Tasks can be associated with lists or linked as subtasks to other tasks.", + }, + LIST: { + name: "Sol List", + description: + "A flexible container in Sol for organizing content such as tasks, text, or references. Lists are used for task tracking, information collections, or reference materials.", + }, + PREFERENCE: { + name: "Sol Preference", + description: + "A user-stated intent, setting, or configuration in Sol, such as preferred formats, notification settings, timezones, or other customizations. Preferences reflect how a user wants the system to behave.", + }, + COMMAND: { + name: "Sol Command", + description: + "A user-issued command or trigger phrase, often starting with '/' or '@', that directs the system or an app to perform a specific action. Commands should always be extracted as distinct, important user actions.", + }, + AUTOMATION: { + name: "Sol Automation", + description: + "A workflow or rule in Sol that automatically performs actions based on specific conditions or triggers, such as recurring tasks, reminders, or integrations with other systems.", + }, + }, [Apps.LINEAR]: { ISSUE: { name: "Linear Issue", @@ -102,3 +131,33 @@ export function getNodeTypes(apps: Array) { appSpecific: appSpecificTypes, }; } + +export function getNodeTypesString(apps: Array) { + let nodeTypesString = ""; + const generalTypes = Object.entries(GENERAL_NODE_TYPES) + .map(([key, value]) => { + return `- ${key}: "${value.description}"`; + }) + .join("\n"); + nodeTypesString += `General Node Types:\n${generalTypes}\n\n`; + + const appSpecificTypes = apps.reduce((acc, appName) => { + return { + ...acc, + [appName]: APP_NODE_TYPES[appName], + }; + }, {}); + + const appSpecificTypesString = Object.entries(appSpecificTypes) + .map(([appName, types]) => { + return `For ${appName}:\n${Object.entries(types as any) + .map(([key, value]: any) => { + return `- ${key}: "${value.description}"`; + }) + .join("\n")}\n\n`; + }) + .join("\n\n"); + + nodeTypesString += `App-specific Node Types:\n${appSpecificTypesString}`; + return nodeTypesString; +} diff --git a/packages/types/src/graph/graph.entity.ts b/packages/types/src/graph/graph.entity.ts index 8fa00e9..4b838d0 100644 --- a/packages/types/src/graph/graph.entity.ts +++ b/packages/types/src/graph/graph.entity.ts @@ -10,6 +10,7 @@ export enum EpisodeType { export interface EpisodicNode { uuid: string; content: string; + originalContent: string; contentEmbedding?: number[]; type: string; source: string; @@ -82,70 +83,3 @@ export type AddEpisodeResult = { statementsCreated: number; processingTimeMs: number; }; - -export const entityTypes = { - general: { - PERSON: { - type: "PERSON", - description: "Any named individual mentioned or referenced.", - }, - APP: { - type: "APP", - description: "Any third-party service or platform used by the user.", - }, - SOL_AUTOMATION: { - type: "SOL_AUTOMATION", - description: "User workflows or flows combining triggers and actions.", - }, - SOL_PREFERENCE: { - type: "SOL_PREFERENCE", - description: "User-stated intent, setting, or configuration like format, timezone, etc.", - }, - COMMAND: { - type: "COMMAND", - description: "Trigger phrase mapped to an internal action, often starts with / or !", - }, - TASK: { - type: "TASK", - description: "User-stated or inferred goal; may link to Person or App.", - }, - EVENT: { - type: "EVENT", - description: "Time-based mention; supports parsing of phrases like 'next week', 'tomorrow'.", - }, - LABEL: { - type: "LABEL", - description: "Optional categorization tag for organization or filtering.", - }, - OBJECT: { - type: "OBJECT", - description: "Named non-person objects in the user's world (e.g., Projector, Car).", - }, - TEAM: { - type: "TEAM", - description: "User-defined group of people, useful for permissions or targeting.", - }, - }, - app_specific: { - SLACK_CHANNEL: { - type: "SLACK_CHANNEL", - description: "Slack channel where automations or communications happen.", - }, - SLACK_USER: { - type: "SLACK_USER", - description: "A user in Slack, can be tagged or messaged.", - }, - GMAIL_THREAD: { - type: "GMAIL_THREAD", - description: "An email conversation thread in Gmail.", - }, - NOTION_PAGE: { - type: "NOTION_PAGE", - description: "A page in Notion workspace.", - }, - CALENDAR_EVENT: { - type: "CALENDAR_EVENT", - description: "Event from user's calendar (Google, Outlook, etc.).", - }, - }, -};