feat: normalize episode content before storage

This commit is contained in:
Manoj K 2025-06-11 12:38:24 +05:30
parent 56adc246c8
commit 02c7b90374
11 changed files with 232 additions and 112 deletions

View File

@ -6,6 +6,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
MERGE (e:Episode {uuid: $uuid}) MERGE (e:Episode {uuid: $uuid})
ON CREATE SET ON CREATE SET
e.content = $content, e.content = $content,
e.originalContent = $originalContent,
e.contentEmbedding = $contentEmbedding, e.contentEmbedding = $contentEmbedding,
e.type = $type, e.type = $type,
e.source = $source, e.source = $source,
@ -18,6 +19,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
ON MATCH SET ON MATCH SET
e.content = $content, e.content = $content,
e.contentEmbedding = $contentEmbedding, e.contentEmbedding = $contentEmbedding,
e.originalContent = $originalContent,
e.type = $type, e.type = $type,
e.source = $source, e.source = $source,
e.validAt = $validAt, e.validAt = $validAt,
@ -30,6 +32,7 @@ export async function saveEpisode(episode: EpisodicNode): Promise<string> {
const params = { const params = {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
source: episode.source, source: episode.source,
type: episode.type, type: episode.type,
userId: episode.userId || null, userId: episode.userId || null,
@ -59,6 +62,7 @@ export async function getEpisode(uuid: string): Promise<EpisodicNode | null> {
return { return {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
contentEmbedding: episode.contentEmbedding, contentEmbedding: episode.contentEmbedding,
type: episode.type, type: episode.type,
source: episode.source, source: episode.source,
@ -112,6 +116,7 @@ export async function getRecentEpisodes(params: {
return { return {
uuid: episode.uuid, uuid: episode.uuid,
content: episode.content, content: episode.content,
originalContent: episode.originalContent,
contentEmbedding: episode.contentEmbedding, contentEmbedding: episode.contentEmbedding,
type: episode.type, type: episode.type,
source: episode.source, source: episode.source,

View File

@ -64,10 +64,14 @@ export async function saveTriple(triple: Triple): Promise<string> {
MATCH (object:Entity {uuid: $objectUuid, userId: $userId}) MATCH (object:Entity {uuid: $objectUuid, userId: $userId})
MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId}) MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})
CREATE (episode)-[:HAS_PROVENANCE {uuid: $provenanceEdgeUuid, createdAt: $createdAt}]->(statement) MERGE (episode)-[prov:HAS_PROVENANCE]->(statement)
CREATE (statement)-[:HAS_SUBJECT {uuid: $subjectEdgeUuid, createdAt: $createdAt}]->(subject) ON CREATE SET prov.uuid = $provenanceEdgeUuid, prov.createdAt = $createdAt
CREATE (statement)-[:HAS_PREDICATE {uuid: $predicateEdgeUuid, createdAt: $createdAt}]->(predicate) MERGE (statement)-[subj:HAS_SUBJECT]->(subject)
CREATE (statement)-[:HAS_OBJECT {uuid: $objectEdgeUuid, createdAt: $createdAt}]->(object) ON CREATE SET subj.uuid = $subjectEdgeUuid, subj.createdAt = $createdAt
MERGE (statement)-[pred:HAS_PREDICATE]->(predicate)
ON CREATE SET pred.uuid = $predicateEdgeUuid, pred.createdAt = $createdAt
MERGE (statement)-[obj:HAS_OBJECT]->(object)
ON CREATE SET obj.uuid = $objectEdgeUuid, obj.createdAt = $createdAt
RETURN statement.uuid as uuid RETURN statement.uuid as uuid
`; `;
@ -267,6 +271,7 @@ export async function getTripleForStatement({
const provenance: EpisodicNode = { const provenance: EpisodicNode = {
uuid: episodeProps.uuid, uuid: episodeProps.uuid,
content: episodeProps.content, content: episodeProps.content,
originalContent: episodeProps.originalContent,
source: episodeProps.source, source: episodeProps.source,
type: episodeProps.type, type: episodeProps.type,
createdAt: new Date(episodeProps.createdAt), createdAt: new Date(episodeProps.createdAt),

View File

@ -1,7 +1,6 @@
import { openai } from "@ai-sdk/openai"; import { openai } from "@ai-sdk/openai";
import { type CoreMessage, embed } from "ai"; import { type CoreMessage, embed } from "ai";
import { import {
entityTypes,
EpisodeType, EpisodeType,
LLMModelEnum, LLMModelEnum,
type AddEpisodeParams, type AddEpisodeParams,
@ -27,6 +26,8 @@ import {
saveTriple, saveTriple,
} from "./graphModels/statement"; } from "./graphModels/statement";
import { makeModelCall } from "~/lib/model.server"; import { makeModelCall } from "~/lib/model.server";
import { Apps, getNodeTypes, getNodeTypesString } from "~/utils/presets/nodes";
import { normalizePrompt } from "./prompts";
// Default number of previous episodes to retrieve for context // Default number of previous episodes to retrieve for context
const DEFAULT_EPISODE_WINDOW = 5; const DEFAULT_EPISODE_WINDOW = 5;
@ -61,10 +62,16 @@ export class KnowledgeGraphService {
source: params.source, source: params.source,
}); });
const normalizedEpisodeBody = await this.normalizeEpisodeBody(
params.episodeBody,
params.source,
);
// Step 2: Episode Creation - Create or retrieve the episode // Step 2: Episode Creation - Create or retrieve the episode
const episode: EpisodicNode = { const episode: EpisodicNode = {
uuid: crypto.randomUUID(), uuid: crypto.randomUUID(),
content: params.episodeBody, content: normalizedEpisodeBody,
originalContent: params.episodeBody,
source: params.source, source: params.source,
type: params.type || EpisodeType.Text, type: params.type || EpisodeType.Text,
createdAt: now, createdAt: now,
@ -130,6 +137,12 @@ export class KnowledgeGraphService {
episode: EpisodicNode, episode: EpisodicNode,
previousEpisodes: EpisodicNode[], previousEpisodes: EpisodicNode[],
): Promise<EntityNode[]> { ): Promise<EntityNode[]> {
// Get all app keys
const allAppEnumValues = Object.values(Apps);
// Get all node types
const entityTypes = getNodeTypes(allAppEnumValues);
// Use the prompt library to get the appropriate prompts // Use the prompt library to get the appropriate prompts
const context = { const context = {
episodeContent: episode.content, episodeContent: episode.content,
@ -172,7 +185,9 @@ export class KnowledgeGraphService {
name: entity.name, name: entity.name,
type: entity.type, type: entity.type,
attributes: entity.attributes || {}, attributes: entity.attributes || {},
nameEmbedding: await this.getEmbedding(entity.name), nameEmbedding: await this.getEmbedding(
`${entity.type}: ${entity.name}`,
),
createdAt: new Date(), createdAt: new Date(),
userId: episode.userId, userId: episode.userId,
})), })),
@ -652,4 +667,35 @@ export class KnowledgeGraphService {
return { resolvedStatements, invalidatedStatements }; return { resolvedStatements, invalidatedStatements };
} }
/**
* Normalize an episode by extracting entities and creating nodes and statements
*/
private async normalizeEpisodeBody(episodeBody: string, source: string) {
let appEnumValues: Apps[] = [];
if (Apps[source.toUpperCase() as keyof typeof Apps]) {
appEnumValues = [Apps[source.toUpperCase() as keyof typeof Apps]];
}
const entityTypes = getNodeTypesString(appEnumValues);
const context = {
episodeContent: episodeBody,
entityTypes: entityTypes,
source,
};
const messages = normalizePrompt(context);
let responseText = "";
await makeModelCall(false, LLMModelEnum.GPT41, messages, (text) => {
responseText = text;
});
let normalizedEpisodeBody = "";
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) {
normalizedEpisodeBody = outputMatch[1].trim();
} else {
normalizedEpisodeBody = episodeBody;
}
return normalizedEpisodeBody;
}
} }

View File

@ -1,2 +1,3 @@
export * from "./nodes"; export * from "./nodes";
export * from "./statements"; export * from "./statements";
export * from "./normalize";

View File

@ -4,30 +4,6 @@
import { type CoreMessage } from "ai"; import { type CoreMessage } from "ai";
export interface ExtractedEntity {
name: string;
type: string;
attributes?: Record<string, any>;
}
export interface ExtractedEntities {
entities: ExtractedEntity[];
}
export interface MissedEntities {
missedEntities: string[];
}
export interface EntityClassificationTriple {
uuid: string;
name: string;
type: string | null;
}
export interface EntityClassification {
entityClassifications: EntityClassificationTriple[];
}
/** /**
* Extract entities from an episode using message-based approach * Extract entities from an episode using message-based approach
*/ */
@ -47,10 +23,14 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
1. **Entity Identification**: 1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE. - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE.
- **Exclude** entities mentioned only in the PREVIOUS EPISODES (they are for context only). - **Exclude** entities mentioned only in the PREVIOUS EPISODES (they are for context only).
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X).
- For pronouns that refer to named entities, extract them as separate Alias entities.
2. **Entity Classification**: 2. **Entity Classification**:
- Use the descriptions in ENTITY TYPES to classify each extracted entity. - Use the descriptions in ENTITY TYPES to classify each extracted entity.
- Assign the appropriate type for each one. - Assign the appropriate type for each one.
- Classify pronouns (I, me, you, etc.) as Alias entities.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
@ -58,6 +38,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
4. **Formatting**: 4. **Formatting**:
- Be **explicit and unambiguous** in naming entities (e.g., use full names when available). - Be **explicit and unambiguous** in naming entities (e.g., use full names when available).
- For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you").
Format your response as a JSON object with the following structure: Format your response as a JSON object with the following structure:
@ -113,10 +94,13 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli
1. **Entity Identification**: 1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT. - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT.
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X).
- For pronouns that refer to named entities, extract them as separate Alias entities.
2. **Entity Classification**: 2. **Entity Classification**:
- Use the descriptions in ENTITY TYPES to classify each extracted entity. - Use the descriptions in ENTITY TYPES to classify each extracted entity.
- Assign the appropriate type for each one. - Assign the appropriate type for each one.
- Classify pronouns (I, me, you, etc.) as Alias entities.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
@ -124,7 +108,7 @@ You are given a TEXT. Your task is to extract **entity nodes** mentioned **expli
4. **Formatting**: 4. **Formatting**:
- Be **explicit and unambiguous** when naming entities (e.g., use full names when available). - Be **explicit and unambiguous** when naming entities (e.g., use full names when available).
- For pronouns, use the exact form as they appear in the text (e.g., "I", "me", "you").
Format your response as a JSON object with the following structure: Format your response as a JSON object with the following structure:
<output> <output>

View File

@ -0,0 +1,47 @@
import { type CoreMessage } from "ai";
/**
 * Build the chat messages used to normalize raw episode content into
 * third-person factual memory statements before storage.
 *
 * Fixes a garbled sentence in the system prompt where the em-dashes around
 * "such as user input, system events, or assistant actions" were lost.
 *
 * @param context - Expects `episodeContent` (raw text), `entityTypes`
 *   (formatted node-type listing), and `source` (originating app name).
 * @returns A [system, user] message pair for the model call.
 */
export const normalizePrompt = (
  context: Record<string, any>,
): CoreMessage[] => {
  const sysPrompt = `
You are a memory extraction system. Your task is to convert input information — such as user input, system events, or assistant actions — into clear, concise, third-person factual statements suitable for storage in a memory graph. These statements should be easily understandable and retrievable by any system or agent.

## Memory Processing Guidelines
- Always output memory statements in the third person (e.g., "User prefers...", "The assistant performed...", "The system detected...").
- Convert input information into clear, concise memory statements.
- Maintain a neutral, factual tone in all memory entries.
- Structure memories as factual statements, not questions.
- Include relevant context and temporal information when available.

## Node Entity Types
${context.entityTypes}

## Memory Graph Integration
- Each memory will be converted to a node in the memory graph.
- Include relevant relationships between memory items when possible.
- Specify temporal aspects when memories are time-sensitive.
- Format memories to support efficient retrieval by any system or agent.

When processing new information for memory storage, focus on extracting the core facts, preferences, and events that will be most useful for future reference by any system or agent.

<output>
{{processed_statement}}
</output>
`;

  const userPrompt = `
<CONTENT>
${context.episodeContent}
</CONTENT>

<SOURCE>
${context.source}
</SOURCE>
`;

  return [
    { role: "system", content: sysPrompt },
    { role: "user", content: userPrompt },
  ];
};

View File

@ -4,6 +4,7 @@ import { embed } from "ai";
import { logger } from "./logger.service"; import { logger } from "./logger.service";
import { applyCrossEncoderReranking, applyWeightedRRF } from "./search/rerank"; import { applyCrossEncoderReranking, applyWeightedRRF } from "./search/rerank";
import { import {
getEpisodesByStatements,
performBfsSearch, performBfsSearch,
performBM25Search, performBM25Search,
performVectorSearch, performVectorSearch,
@ -34,7 +35,7 @@ export class SearchService {
query: string, query: string,
userId: string, userId: string,
options: SearchOptions = {}, options: SearchOptions = {},
): Promise<string[]> { ): Promise<{ episodes: string[]; facts: string[] }> {
// Default options // Default options
const opts: Required<SearchOptions> = { const opts: Required<SearchOptions> = {
limit: options.limit || 10, limit: options.limit || 10,
@ -71,7 +72,11 @@ export class SearchService {
const filteredResults = this.applyAdaptiveFiltering(rankedStatements, opts); const filteredResults = this.applyAdaptiveFiltering(rankedStatements, opts);
// 3. Return top results // 3. Return top results
return filteredResults.map((statement) => statement.fact); const episodes = await getEpisodesByStatements(filteredResults);
return {
episodes: episodes.map((episode) => episode.content),
facts: filteredResults.map((statement) => statement.fact),
};
} }
/** /**
@ -84,12 +89,14 @@ export class SearchService {
): StatementNode[] { ): StatementNode[] {
if (results.length === 0) return []; if (results.length === 0) return [];
let isRRF = false;
// Extract scores from results // Extract scores from results
const scoredResults = results.map((result) => { const scoredResults = results.map((result) => {
// Find the score based on reranking strategy used // Find the score based on reranking strategy used
let score = 0; let score = 0;
if ((result as any).rrfScore !== undefined) { if ((result as any).rrfScore !== undefined) {
score = (result as any).rrfScore; score = (result as any).rrfScore;
isRRF = true;
} else if ((result as any).mmrScore !== undefined) { } else if ((result as any).mmrScore !== undefined) {
score = (result as any).mmrScore; score = (result as any).mmrScore;
} else if ((result as any).crossEncoderScore !== undefined) { } else if ((result as any).crossEncoderScore !== undefined) {
@ -117,16 +124,31 @@ export class SearchService {
const minScore = Math.min(...scores); const minScore = Math.min(...scores);
const scoreRange = maxScore - minScore; const scoreRange = maxScore - minScore;
// Define a minimum quality threshold as a fraction of the best score let threshold = 0;
// This is relative to the query's score distribution rather than an absolute value if (isRRF || scoreRange < 0.01) {
const relativeThreshold = options.scoreThreshold || 0.3; // 30% of the best score by default // For RRF or other compressed score ranges, use a percentile-based approach
const absoluteMinimum = 0.1; // Absolute minimum threshold to prevent keeping very poor matches // Keep top 70% (or whatever is specified in options) of results
const keepPercentage = 1 - (options.scoreThreshold || 0.3);
const keepCount = Math.max(
1,
Math.ceil(scoredResults.length * keepPercentage),
);
// Calculate the actual threshold as a percentage of the distance from min to max score // Set threshold to the score of the last item we want to keep
const threshold = Math.max( threshold =
keepCount < scoredResults.length
? scoredResults[keepCount - 1].score
: 0;
} else {
// For normal score distributions, use the relative threshold approach
const relativeThreshold = options.scoreThreshold || 0.3;
const absoluteMinimum = 0.1;
threshold = Math.max(
absoluteMinimum, absoluteMinimum,
minScore + scoreRange * relativeThreshold, minScore + scoreRange * relativeThreshold,
); );
}
// Filter out low-quality results // Filter out low-quality results
const filteredResults = scoredResults const filteredResults = scoredResults

View File

@ -89,7 +89,7 @@ export async function applyCrossEncoderReranking(
{ {
role: "system", role: "system",
content: `You are an expert tasked with determining whether the statement is relevant to the query content: `You are an expert tasked with determining whether the statement is relevant to the query
Respond with "True" if PASSAGE is relevant to QUERY and "False" otherwise.`, Respond with "True" if STATEMENT is relevant to QUERY and "False" otherwise.`,
}, },
{ {
role: "user", role: "user",

View File

@ -1,4 +1,4 @@
import type { EntityNode, StatementNode } from "@core/types"; import type { EntityNode, StatementNode, EpisodicNode } from "@core/types";
import type { SearchOptions } from "../search.server"; import type { SearchOptions } from "../search.server";
import type { Embedding } from "ai"; import type { Embedding } from "ai";
import { logger } from "../logger.service"; import { logger } from "../logger.service";
@ -219,3 +219,20 @@ export function combineAndDeduplicateStatements(
).values(), ).values(),
); );
} }
/**
 * Fetch the distinct episodes that provide provenance for the given
 * statements, following (episode)-[:HAS_PROVENANCE]->(statement) edges.
 *
 * @param statements - Statements whose source episodes should be loaded.
 * @returns Distinct episodes linked to any of the statements; empty array
 *   when no statements are supplied (no query is issued in that case).
 */
export async function getEpisodesByStatements(
  statements: StatementNode[],
): Promise<EpisodicNode[]> {
  // Short-circuit: an empty IN list can never match — skip the DB round trip.
  if (statements.length === 0) {
    return [];
  }

  const cypher = `
    MATCH (s:Statement)<-[:HAS_PROVENANCE]-(e:Episode)
    WHERE s.uuid IN $statementUuids
    RETURN distinct e
  `;
  const params = {
    statementUuids: statements.map((s) => s.uuid),
  };

  const records = await runQuery(cypher, params);
  // Raw node properties are trusted to match EpisodicNode's shape —
  // NOTE(review): dates come back in driver format, not Date; confirm callers
  // only read string fields (search.server.ts reads episode.content).
  return records.map((record) => record.get("e").properties as EpisodicNode);
}

View File

@ -1,11 +1,13 @@
export enum Apps { export enum Apps {
LINEAR = "LINEAR", LINEAR = "LINEAR",
SLACK = "SLACK", SLACK = "SLACK",
SOL = "SOL",
} }
export const AppNames = { export const AppNames = {
[Apps.LINEAR]: "Linear", [Apps.LINEAR]: "Linear",
[Apps.SLACK]: "Slack", [Apps.SLACK]: "Slack",
[Apps.SOL]: "Sol",
} as const; } as const;
// General node types that are common across all apps // General node types that are common across all apps
@ -38,6 +40,33 @@ export const GENERAL_NODE_TYPES = {
// App-specific node types // App-specific node types
export const APP_NODE_TYPES = { export const APP_NODE_TYPES = {
[Apps.SOL]: {
TASK: {
name: "Sol Task",
description:
"An independent unit of work in Sol, such as a task, bug report, or feature request. Tasks can be associated with lists or linked as subtasks to other tasks.",
},
LIST: {
name: "Sol List",
description:
"A flexible container in Sol for organizing content such as tasks, text, or references. Lists are used for task tracking, information collections, or reference materials.",
},
PREFERENCE: {
name: "Sol Preference",
description:
"A user-stated intent, setting, or configuration in Sol, such as preferred formats, notification settings, timezones, or other customizations. Preferences reflect how a user wants the system to behave.",
},
COMMAND: {
name: "Sol Command",
description:
"A user-issued command or trigger phrase, often starting with '/' or '@', that directs the system or an app to perform a specific action. Commands should always be extracted as distinct, important user actions.",
},
AUTOMATION: {
name: "Sol Automation",
description:
"A workflow or rule in Sol that automatically performs actions based on specific conditions or triggers, such as recurring tasks, reminders, or integrations with other systems.",
},
},
[Apps.LINEAR]: { [Apps.LINEAR]: {
ISSUE: { ISSUE: {
name: "Linear Issue", name: "Linear Issue",
@ -102,3 +131,33 @@ export function getNodeTypes(apps: Array<keyof typeof APP_NODE_TYPES>) {
appSpecific: appSpecificTypes, appSpecific: appSpecificTypes,
}; };
} }
export function getNodeTypesString(apps: Array<keyof typeof APP_NODE_TYPES>) {
let nodeTypesString = "";
const generalTypes = Object.entries(GENERAL_NODE_TYPES)
.map(([key, value]) => {
return `- ${key}: "${value.description}"`;
})
.join("\n");
nodeTypesString += `General Node Types:\n${generalTypes}\n\n`;
const appSpecificTypes = apps.reduce((acc, appName) => {
return {
...acc,
[appName]: APP_NODE_TYPES[appName],
};
}, {});
const appSpecificTypesString = Object.entries(appSpecificTypes)
.map(([appName, types]) => {
return `For ${appName}:\n${Object.entries(types as any)
.map(([key, value]: any) => {
return `- ${key}: "${value.description}"`;
})
.join("\n")}\n\n`;
})
.join("\n\n");
nodeTypesString += `App-specific Node Types:\n${appSpecificTypesString}`;
return nodeTypesString;
}

View File

@ -10,6 +10,7 @@ export enum EpisodeType {
export interface EpisodicNode { export interface EpisodicNode {
uuid: string; uuid: string;
content: string; content: string;
originalContent: string;
contentEmbedding?: number[]; contentEmbedding?: number[];
type: string; type: string;
source: string; source: string;
@ -82,70 +83,3 @@ export type AddEpisodeResult = {
statementsCreated: number; statementsCreated: number;
processingTimeMs: number; processingTimeMs: number;
}; };
export const entityTypes = {
general: {
PERSON: {
type: "PERSON",
description: "Any named individual mentioned or referenced.",
},
APP: {
type: "APP",
description: "Any third-party service or platform used by the user.",
},
SOL_AUTOMATION: {
type: "SOL_AUTOMATION",
description: "User workflows or flows combining triggers and actions.",
},
SOL_PREFERENCE: {
type: "SOL_PREFERENCE",
description: "User-stated intent, setting, or configuration like format, timezone, etc.",
},
COMMAND: {
type: "COMMAND",
description: "Trigger phrase mapped to an internal action, often starts with / or !",
},
TASK: {
type: "TASK",
description: "User-stated or inferred goal; may link to Person or App.",
},
EVENT: {
type: "EVENT",
description: "Time-based mention; supports parsing of phrases like 'next week', 'tomorrow'.",
},
LABEL: {
type: "LABEL",
description: "Optional categorization tag for organization or filtering.",
},
OBJECT: {
type: "OBJECT",
description: "Named non-person objects in the user's world (e.g., Projector, Car).",
},
TEAM: {
type: "TEAM",
description: "User-defined group of people, useful for permissions or targeting.",
},
},
app_specific: {
SLACK_CHANNEL: {
type: "SLACK_CHANNEL",
description: "Slack channel where automations or communications happen.",
},
SLACK_USER: {
type: "SLACK_USER",
description: "A user in Slack, can be tagged or messaged.",
},
GMAIL_THREAD: {
type: "GMAIL_THREAD",
description: "An email conversation thread in Gmail.",
},
NOTION_PAGE: {
type: "NOTION_PAGE",
description: "A page in Notion workspace.",
},
CALENDAR_EVENT: {
type: "CALENDAR_EVENT",
description: "Event from user's calendar (Google, Outlook, etc.).",
},
},
};