refactor: make entity handling type-free and simplify entity resolution in knowledge graph

This commit is contained in:
Manoj 2025-09-09 20:00:50 +05:30 committed by Harshith Mullapudi
parent 6ddcab873a
commit 952386ca0e
5 changed files with 151 additions and 465 deletions

View File

@ -2,18 +2,8 @@ import type { EntityNode } from "@core/types";
import { runQuery } from "~/lib/neo4j.server"; import { runQuery } from "~/lib/neo4j.server";
export async function saveEntity(entity: EntityNode): Promise<string> { export async function saveEntity(entity: EntityNode): Promise<string> {
// Debug: Log entity to identify missing typeEmbedding // Build query conditionally based on whether typeEmbedding exists
if (!entity.typeEmbedding) { const hasTypeEmbedding = entity.typeEmbedding && entity.typeEmbedding.length > 0;
console.error(`Entity missing typeEmbedding:`, {
uuid: entity.uuid,
name: entity.name,
type: entity.type,
hasNameEmbedding: !!entity.nameEmbedding,
});
throw new Error(
`Entity ${entity.name} (${entity.type}) is missing typeEmbedding`,
);
}
const query = ` const query = `
MERGE (n:Entity {uuid: $uuid}) MERGE (n:Entity {uuid: $uuid})
@ -22,7 +12,7 @@ export async function saveEntity(entity: EntityNode): Promise<string> {
n.type = $type, n.type = $type,
n.attributes = $attributes, n.attributes = $attributes,
n.nameEmbedding = $nameEmbedding, n.nameEmbedding = $nameEmbedding,
n.typeEmbedding = $typeEmbedding, ${hasTypeEmbedding ? 'n.typeEmbedding = $typeEmbedding,' : ''}
n.createdAt = $createdAt, n.createdAt = $createdAt,
n.userId = $userId, n.userId = $userId,
n.space = $space n.space = $space
@ -31,23 +21,27 @@ export async function saveEntity(entity: EntityNode): Promise<string> {
n.type = $type, n.type = $type,
n.attributes = $attributes, n.attributes = $attributes,
n.nameEmbedding = $nameEmbedding, n.nameEmbedding = $nameEmbedding,
n.typeEmbedding = $typeEmbedding, ${hasTypeEmbedding ? 'n.typeEmbedding = $typeEmbedding,' : ''}
n.space = $space n.space = $space
RETURN n.uuid as uuid RETURN n.uuid as uuid
`; `;
const params = { const params: any = {
uuid: entity.uuid, uuid: entity.uuid,
name: entity.name, name: entity.name,
type: entity.type, type: entity.type || "",
attributes: JSON.stringify(entity.attributes || {}), attributes: JSON.stringify(entity.attributes || {}),
nameEmbedding: entity.nameEmbedding, nameEmbedding: entity.nameEmbedding,
typeEmbedding: entity.typeEmbedding,
createdAt: entity.createdAt.toISOString(), createdAt: entity.createdAt.toISOString(),
userId: entity.userId, userId: entity.userId,
space: entity.space || null, space: entity.space || null,
}; };
// Add typeEmbedding to params only if it exists
if (hasTypeEmbedding) {
params.typeEmbedding = entity.typeEmbedding;
}
const result = await runQuery(query, params); const result = await runQuery(query, params);
return result[0].get("uuid"); return result[0].get("uuid");
} }
@ -65,10 +59,10 @@ export async function getEntity(uuid: string): Promise<EntityNode | null> {
return { return {
uuid: entity.uuid, uuid: entity.uuid,
name: entity.name, name: entity.name,
type: entity.type, type: entity.type || null,
attributes: JSON.parse(entity.attributes || "{}"), attributes: JSON.parse(entity.attributes || "{}"),
nameEmbedding: entity.nameEmbedding, nameEmbedding: entity.nameEmbedding,
typeEmbedding: entity.typeEmbedding, typeEmbedding: entity.typeEmbedding || null,
createdAt: new Date(entity.createdAt), createdAt: new Date(entity.createdAt),
userId: entity.userId, userId: entity.userId,
space: entity.space, space: entity.space,

View File

@ -15,8 +15,7 @@ import crypto from "crypto";
import { import {
dedupeNodes, dedupeNodes,
extractAttributes, extractAttributes,
extractMessage, extractEntities,
extractText,
} from "./prompts/nodes"; } from "./prompts/nodes";
import { import {
extractStatements, extractStatements,
@ -25,14 +24,11 @@ import {
import { import {
getEpisodeStatements, getEpisodeStatements,
getRecentEpisodes, getRecentEpisodes,
getRelatedEpisodesEntities,
searchEpisodesByEmbedding, searchEpisodesByEmbedding,
} from "./graphModels/episode"; } from "./graphModels/episode";
import { import {
findExactPredicateMatches, findExactPredicateMatches,
findSimilarEntities, findSimilarEntities,
findSimilarEntitiesWithSameType,
replaceEntityReferences,
} from "./graphModels/entity"; } from "./graphModels/entity";
import { import {
findContradictoryStatements, findContradictoryStatements,
@ -47,9 +43,7 @@ import { getEmbedding, makeModelCall } from "~/lib/model.server";
import { runQuery } from "~/lib/neo4j.server"; import { runQuery } from "~/lib/neo4j.server";
import { import {
Apps, Apps,
getNodeTypes,
getNodeTypesString, getNodeTypesString,
isPresetType,
} from "~/utils/presets/nodes"; } from "~/utils/presets/nodes";
import { normalizePrompt, normalizeDocumentPrompt } from "./prompts"; import { normalizePrompt, normalizeDocumentPrompt } from "./prompts";
import { type PrismaClient } from "@prisma/client"; import { type PrismaClient } from "@prisma/client";
@ -272,8 +266,8 @@ export class KnowledgeGraphService {
params.type, params.type,
); );
const normalizedTime = Date.now() - startTime; const normalizedTime = Date.now();
logger.log(`Normalized episode body in ${normalizedTime} ms`); logger.log(`Normalized episode body in ${normalizedTime - startTime} ms`);
if (normalizedEpisodeBody === "NOTHING_TO_REMEMBER") { if (normalizedEpisodeBody === "NOTHING_TO_REMEMBER") {
logger.log("Nothing to remember"); logger.log("Nothing to remember");
@ -284,15 +278,6 @@ export class KnowledgeGraphService {
}; };
} }
const relatedEpisodesEntities = await getRelatedEpisodesEntities({
embedding: await this.getEmbedding(normalizedEpisodeBody),
userId: params.userId,
minSimilarity: 0.7,
});
const relatedTime = Date.now() - normalizedTime;
logger.log(`Related episodes entities in ${relatedTime} ms`);
// Step 2: Episode Creation - Create or retrieve the episode // Step 2: Episode Creation - Create or retrieve the episode
const episode: EpisodicNode = { const episode: EpisodicNode = {
uuid: crypto.randomUUID(), uuid: crypto.randomUUID(),
@ -316,23 +301,18 @@ export class KnowledgeGraphService {
); );
const extractedTime = Date.now(); const extractedTime = Date.now();
logger.log(`Extracted entities in ${extractedTime - relatedTime} ms`); logger.log(`Extracted entities in ${extractedTime - normalizedTime} ms`);
// Step 3.1: Context-aware entity resolution with preset type evolution // Step 3.1: Simple entity categorization (no type-based expansion needed)
await this.resolveEntitiesWithContext( const categorizedEntities = {
extractedNodes, primary: extractedNodes,
relatedEpisodesEntities, expanded: [], // No expansion needed with type-free approach
); };
// Step 3.2: Handle preset type logic - expand entities for statement extraction
const categorizedEntities = await this.expandEntitiesForStatements(
extractedNodes,
episode,
);
const expandedTime = Date.now(); const expandedTime = Date.now();
logger.log(`Expanded entities in ${expandedTime - extractedTime} ms`); logger.log(`Processed entities in ${expandedTime - extractedTime} ms`);
console.log(extractedNodes.map((e) => e.name));
// Step 4: Statement Extrraction - Extract statements (triples) instead of direct edges // Step 4: Statement Extrraction - Extract statements (triples) instead of direct edges
const extractedStatements = await this.extractStatements( const extractedStatements = await this.extractStatements(
episode, episode,
@ -371,10 +351,12 @@ export class KnowledgeGraphService {
); );
// Step 7: ADd attributes to entity nodes // Step 7: ADd attributes to entity nodes
const updatedTriples = await this.addAttributesToEntities( // const updatedTriples = await this.addAttributesToEntities(
resolvedStatements, // resolvedStatements,
episode, // episode,
); // );
const updatedTriples = resolvedStatements;
const updatedTriplesTime = Date.now(); const updatedTriplesTime = Date.now();
logger.log( logger.log(
@ -439,12 +421,6 @@ export class KnowledgeGraphService {
episode: EpisodicNode, episode: EpisodicNode,
previousEpisodes: EpisodicNode[], previousEpisodes: EpisodicNode[],
): Promise<EntityNode[]> { ): Promise<EntityNode[]> {
// Get all app keys
const allAppEnumValues = Object.values(Apps);
// Get all node types
const entityTypes = getNodeTypes(allAppEnumValues);
// Use the prompt library to get the appropriate prompts // Use the prompt library to get the appropriate prompts
const context = { const context = {
episodeContent: episode.content, episodeContent: episode.content,
@ -452,13 +428,11 @@ export class KnowledgeGraphService {
content: ep.content, content: ep.content,
createdAt: ep.createdAt.toISOString(), createdAt: ep.createdAt.toISOString(),
})), })),
entityTypes: entityTypes,
}; };
// Get the extract_json prompt from the prompt library // Get the unified entity extraction prompt
const messages = episode.sessionId const extractionMode = episode.sessionId ? 'conversation' : 'document';
? extractMessage(context) const messages = extractEntities(context, extractionMode);
: extractText(context);
let responseText = ""; let responseText = "";
@ -474,21 +448,19 @@ export class KnowledgeGraphService {
responseText = outputMatch[1].trim(); responseText = outputMatch[1].trim();
const extractedEntities = JSON.parse(responseText || "{}").entities || []; const extractedEntities = JSON.parse(responseText || "{}").entities || [];
// Batch generate embeddings for better performance // Batch generate embeddings for entity names
const entityNames = extractedEntities.map((entity: any) => entity.name); const entityNames = extractedEntities.map((entity: any) => entity.name);
const entityTypes = extractedEntities.map((entity: any) => entity.type); const nameEmbeddings = await Promise.all(
const [nameEmbeddings, typeEmbeddings] = await Promise.all([ entityNames.map((name: string) => this.getEmbedding(name))
Promise.all(entityNames.map((name: string) => this.getEmbedding(name))), );
Promise.all(entityTypes.map((type: string) => this.getEmbedding(type))),
]);
entities = extractedEntities.map((entity: any, index: number) => ({ entities = extractedEntities.map((entity: any, index: number) => ({
uuid: crypto.randomUUID(), uuid: crypto.randomUUID(),
name: entity.name, name: entity.name,
type: entity.type, type: undefined, // Type will be inferred from statements
attributes: entity.attributes || {}, attributes: entity.attributes || {},
nameEmbedding: nameEmbeddings[index], nameEmbedding: nameEmbeddings[index],
typeEmbedding: typeEmbeddings[index], typeEmbedding: undefined, // No type embedding needed
createdAt: new Date(), createdAt: new Date(),
userId: episode.userId, userId: episode.userId,
})); }));
@ -537,6 +509,8 @@ export class KnowledgeGraphService {
responseText = text; responseText = text;
}); });
console.log(responseText);
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/); const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) { if (outputMatch && outputMatch[1]) {
responseText = outputMatch[1].trim(); responseText = outputMatch[1].trim();
@ -548,6 +522,8 @@ export class KnowledgeGraphService {
const extractedTriples: ExtractedTripleData[] = const extractedTriples: ExtractedTripleData[] =
JSON.parse(responseText || "{}").edges || []; JSON.parse(responseText || "{}").edges || [];
console.log(`extracted triples length: ${extractedTriples.length}`)
// Create maps to deduplicate entities by name within this extraction // Create maps to deduplicate entities by name within this extraction
const predicateMap = new Map<string, EntityNode>(); const predicateMap = new Map<string, EntityNode>();
@ -597,17 +573,13 @@ export class KnowledgeGraphService {
// Convert extracted triples to Triple objects with Statement nodes // Convert extracted triples to Triple objects with Statement nodes
const triples = extractedTriples.map( const triples = extractedTriples.map(
(triple: ExtractedTripleData, tripleIndex: number) => { (triple: ExtractedTripleData, tripleIndex: number) => {
// Find the subject and object nodes by matching both name and type // Find the subject and object nodes by matching name (type-free approach)
const subjectNode = allEntities.find( const subjectNode = allEntities.find(
(node) => (node) => node.name.toLowerCase() === triple.source.toLowerCase()
node.name.toLowerCase() === triple.source.toLowerCase() &&
node.type.toLowerCase() === triple.sourceType.toLowerCase(),
); );
const objectNode = allEntities.find( const objectNode = allEntities.find(
(node) => (node) => node.name.toLowerCase() === triple.target.toLowerCase()
node.name.toLowerCase() === triple.target.toLowerCase() &&
node.type.toLowerCase() === triple.targetType.toLowerCase(),
); );
// Get the deduplicated predicate node // Get the deduplicated predicate node
@ -661,108 +633,7 @@ export class KnowledgeGraphService {
return triples.filter(Boolean) as Triple[]; return triples.filter(Boolean) as Triple[];
} }
/**
* Expand entities for statement extraction by adding existing preset entities
*/
private async expandEntitiesForStatements(
extractedNodes: EntityNode[],
episode: EpisodicNode,
): Promise<{
primary: EntityNode[];
expanded: EntityNode[];
}> {
const allAppEnumValues = Object.values(Apps);
const expandedEntities: EntityNode[] = [];
// For each extracted entity, check if we need to add existing preset entities
for (const entity of extractedNodes) {
const newIsPreset = isPresetType(entity.type, allAppEnumValues);
// Find similar entities with same name
const similarEntities = await findSimilarEntities({
queryEmbedding: entity.nameEmbedding,
limit: 5,
threshold: 0.8,
userId: episode.userId,
});
for (const existingEntity of similarEntities) {
const existingIsPreset = isPresetType(
existingEntity.type,
allAppEnumValues,
);
// If both are preset types, include both for statement extraction
if (newIsPreset && existingIsPreset) {
// Add the existing entity to the list if not already present
if (!expandedEntities.some((e) => e.uuid === existingEntity.uuid)) {
expandedEntities.push(existingEntity);
}
}
}
}
// Deduplicate by name AND type combination
const deduplicateEntities = (entities: EntityNode[]) => {
const seen = new Map<string, EntityNode>();
return entities.filter((entity) => {
const key = `${entity.name.toLowerCase()}_${entity.type.toLowerCase()}`;
if (seen.has(key)) {
return false;
}
seen.set(key, entity);
return true;
});
};
return {
primary: deduplicateEntities(extractedNodes),
expanded: deduplicateEntities(
expandedEntities.filter(
(e) => !extractedNodes.some((primary) => primary.uuid === e.uuid),
),
),
};
}
/**
* Resolve entities with context-aware deduplication and preset type evolution
* Only merges entities that appear in semantically related episodes
*/
private async resolveEntitiesWithContext(
extractedNodes: EntityNode[],
relatedEpisodesEntities: EntityNode[],
): Promise<void> {
const allAppEnumValues = Object.values(Apps);
extractedNodes.map(async (newEntity) => {
// Find same-name entities in related episodes (contextually relevant)
const sameNameInContext = relatedEpisodesEntities.filter(
(existing) =>
existing.name.toLowerCase() === newEntity.name.toLowerCase(),
);
if (sameNameInContext.length > 0) {
let existingEntityIds: string[] = [];
sameNameInContext.forEach(async (existingEntity) => {
const newIsPreset = isPresetType(newEntity.type, allAppEnumValues);
const existingIsPreset = isPresetType(
existingEntity.type,
allAppEnumValues,
);
if (newIsPreset && !existingIsPreset) {
// New is preset, existing is custom - evolve existing entity to preset type
existingEntityIds.push(existingEntity.uuid);
}
});
if (existingEntityIds.length > 0) {
await replaceEntityReferences(newEntity, existingEntityIds);
}
}
});
}
/** /**
* Resolve extracted nodes to existing nodes or create new ones * Resolve extracted nodes to existing nodes or create new ones
@ -835,9 +706,8 @@ export class KnowledgeGraphService {
// Step 2a: Find similar entities for non-predicate entities // Step 2a: Find similar entities for non-predicate entities
const similarEntitiesResults = await Promise.all( const similarEntitiesResults = await Promise.all(
nonPredicates.map(async (entity) => { nonPredicates.map(async (entity) => {
const similarEntities = await findSimilarEntitiesWithSameType({ const similarEntities = await findSimilarEntities({
queryEmbedding: entity.nameEmbedding, queryEmbedding: entity.nameEmbedding,
entityType: entity.type,
limit: 5, limit: 5,
threshold: 0.7, threshold: 0.7,
userId: episode.userId, userId: episode.userId,
@ -1240,20 +1110,12 @@ export class KnowledgeGraphService {
return triples; // No entities to process return triples; // No entities to process
} }
// Get all app keys
const allAppEnumValues = Object.values(Apps);
// Get all node types with their attribute definitions
const entityTypes = getNodeTypes(allAppEnumValues);
// Prepare simplified context for the LLM // Prepare simplified context for the LLM
const context = { const context = {
episodeContent: episode.content, episodeContent: episode.content,
entityTypes: entityTypes,
entities: entities.map((entity) => ({ entities: entities.map((entity) => ({
uuid: entity.uuid, uuid: entity.uuid,
name: entity.name, name: entity.name,
type: entity.type,
currentAttributes: entity.attributes || {}, currentAttributes: entity.attributes || {},
})), })),
}; };

View File

@ -5,16 +5,20 @@
import { type CoreMessage } from "ai"; import { type CoreMessage } from "ai";
/** /**
* Extract entities from an episode using message-based approach * Extract entities from content using unified approach (works for both conversations and documents)
*/ */
export const extractMessage = (context: Record<string, any>): CoreMessage[] => { export const extractEntities = (
context: Record<string, any>,
extractionMode: 'conversation' | 'document' = 'conversation'
): CoreMessage[] => {
const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages for a reified knowledge graph. const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages for a reified knowledge graph.
Your primary task is to extract and classify significant entities mentioned in the conversation. Your primary task is to extract all significant entities mentioned in the conversation, treating both concrete entities and type/concept entities as first-class nodes.
In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements. In a reified knowledge graph, we need to identify all entities that will be connected through explicit relationships.
Focus on extracting: Focus on extracting:
1. Subject entities (people, objects, concepts) 1. Concrete entities (people, objects, specific instances)
2. Object entities (people, objects, concepts) 2. Type/concept entities (categories, classes, abstract concepts)
3. All entities that participate in "X is a Y" relationships
Instructions: Instructions:
@ -22,16 +26,16 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
1. **Entity Identification**: 1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE. - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE.
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X). - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X).
- **ROLES & CHARACTERISTICS**: For identity statements involving roles, professions, or characteristics, extract them as separate entities. - **ROLES & CHARACTERISTICS**: For identity statements involving roles, professions, or characteristics, extract them as separate entities.
- For pronouns that refer to named entities, extract them as separate Alias entities. - For pronouns that refer to named entities, extract them as separate Alias entities.
- **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities.
2. **Entity Classification**: 2. **Type and Concept Entity Extraction**:
- Prefer using appropriate types from the ENTITY_TYPES section when they fit naturally. - **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities.
- DO NOT force-fit entities into inappropriate types from ENTITY_TYPES. - **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities.
- If no type from ENTITY_TYPES fits naturally, create a descriptive type based on context (e.g., "memory_graph_system", "authentication_bug"). - **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities.
- Each entity should have exactly ONE type that best describes what it is. - **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships.
- Classify pronouns (I, me, you, etc.) as "Alias" entities.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
@ -40,13 +44,13 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
- Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm"). - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm").
4. **Entity Name Extraction**: 4. **Entity Name Extraction**:
- Extract ONLY the core entity name, WITHOUT any type descriptors or qualifiers - Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers
- When text mentions "Tesla car", extract name as "Tesla" with type "Vehicle" - When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car"
- When text mentions "John's company", extract name as "John" with type "Person" (company is a separate entity) - When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities
- **CLEAN NAMES**: Remove type words like "app", "system", "platform", "tool", "service", "company", "organization" from the entity name - **CLEAN NAMES**: Remove articles (a, an, the) and quantifiers, but preserve the core concept
- **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you") and classify as "Alias" - **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you")
- **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John") - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John")
- **NO TYPE SUFFIXES**: Never append the entity type to the entity name - **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" "Space")
5. **Temporal and Relationship Context Extraction**: 5. **Temporal and Relationship Context Extraction**:
- EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years") - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years")
@ -57,6 +61,19 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
## Examples of Correct Entity Extraction: ## Examples of Correct Entity Extraction:
**TYPE/CONCEPT ENTITY EXTRACTION:**
**EXTRACT BOTH ENTITIES IN "IS A" RELATIONSHIPS:**
- Text: "Profile is a memory space" Extract: "Profile" AND "MemorySpace"
- Text: "Tesla is a car" Extract: "Tesla" AND "Car"
- Text: "John is a teacher" Extract: "John" AND "Teacher"
- Text: "Goals space connects to Projects" Extract: "Goals", "Space", AND "Projects"
**EXTRACT CONCEPT ENTITIES:**
- Text: "rated 10/10 for usefulness" Extract: "Usefulness", "Rating"
- Text: "essential classification tier" Extract: "Essential", "Classification", "Tier"
- Text: "hierarchical memory system" Extract: "Hierarchical", "Memory", "System"
**TEMPORAL INFORMATION - What to EXTRACT vs EXCLUDE:** **TEMPORAL INFORMATION - What to EXTRACT vs EXCLUDE:**
**EXTRACT - Relationship Temporal Information:** **EXTRACT - Relationship Temporal Information:**
@ -73,47 +90,50 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
- Text: "next week" Don't extract "next week" - Text: "next week" Don't extract "next week"
**RELATIONSHIP CONTEXT ENTITIES:** **RELATIONSHIP CONTEXT ENTITIES:**
- Text: "my close friends" Extract: "close friends" (QualifiedGroup) - Text: "my close friends" Extract: "Close Friends" (QualifiedGroup)
- Text: "strong support system" Extract: "support system" (RelationshipType) - Text: "strong support system" Extract: "Support System" (RelationshipType)
- Text: "work colleagues" Extract: "work colleagues" (ProfessionalGroup) - Text: "work colleagues" Extract: "Work Colleagues" (ProfessionalGroup)
- Text: "family members" Extract: "family members" (FamilyGroup) - Text: "family members" Extract: "Family Members" (FamilyGroup)
**STANDARD ENTITY EXTRACTION:** **STANDARD ENTITY EXTRACTION:**
- Text: "Tesla car" Name: "Tesla", Type: "Vehicle" - Text: "Tesla car" Extract: "Tesla" AND "Car"
- Text: "Google's search engine" Name: "Google", Type: "Company" + Name: "Search Engine", Type: "Product" - Text: "Google's search engine" Extract: "Google" AND "Search Engine"
- Text: "Microsoft Office suite" Name: "Microsoft Office", Type: "Software" - Text: "Microsoft Office suite" Extract: "Microsoft Office" AND "Suite"
- Text: "John's startup company" Name: "John", Type: "Person" + Name: "Startup", Type: "Company" - Text: "John's startup company" Extract: "John", "Startup", AND "Company"
**INCORRECT Examples:** **CORRECT vs INCORRECT Examples:**
- Text: "Tesla car" Name: "Tesla car", Type: "Vehicle"
- Text: "authentication system" Name: "authentication system", Type: "System" **CORRECT:**
- Text: "payment service" Name: "payment service", Type: "Service" - Text: "Profile is a memory space" Extract: "Profile", "MemorySpace"
- Text: "essential classification system" Extract: "Essential", "Classification", "System"
- Text: "10/10 usefulness rating" Extract: "Usefulness", "Rating"
**INCORRECT:**
- Text: "Profile is a memory space" Only extract: "Profile"
- Text: "authentication system" Extract: "authentication system" (should be "Authentication", "System")
- Text: "payment service" Extract: "payment service" (should be "Payment", "Service")
Format your response as a JSON object with the following structure: Format your response as a JSON object with the following structure:
<output> <output>
{ {
"entities": [ "entities": [
{ {
"name": "Entity Name", "name": "Entity Name"
"type": "Entity Type",
} }
// Additional entities... // Additional entities...
] ]
} }
</output>`; </output>`;
const contentLabel = extractionMode === 'conversation' ? 'CURRENT EPISODE' : 'TEXT';
const userPrompt = ` const userPrompt = `
<PREVIOUS EPISODES> ${extractionMode === 'conversation' ? `<PREVIOUS EPISODES>
${JSON.stringify(context.previousEpisodes || [], null, 2)} ${JSON.stringify(context.previousEpisodes || [], null, 2)}
</PREVIOUS EPISODES> </PREVIOUS EPISODES>
<CURRENT EPISODE> ` : ''}<${contentLabel}>
${context.episodeContent} ${context.episodeContent}
</CURRENT EPISODE> </${contentLabel}>
<ENTITY_TYPES>
${JSON.stringify(context.entityTypes || {}, null, 2)}
</ENTITY_TYPES>
`; `;
@ -123,161 +143,6 @@ ${JSON.stringify(context.entityTypes || {}, null, 2)}
]; ];
}; };
/**
* Extract entities from text-based content
*/
export const extractText = (context: Record<string, any>): CoreMessage[] => {
const sysPrompt = `
You are an AI assistant that extracts entity nodes from text for a reified knowledge graph.
Your primary task is to extract and classify significant entities mentioned in the provided text.
In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements.
Focus on extracting:
1. Subject entities
2. Object entities
Instructions:
You are given a TEXT. Your task is to extract **entity nodes** mentioned **explicitly or implicitly** in the TEXT.
1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT.
- For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X).
- **ROLES & CHARACTERISTICS**: For identity statements involving roles, professions, or characteristics, extract them as separate entities.
- For pronouns that refer to named entities, extract them as separate Alias entities.
2. **Entity Classification**:
- Prefer using appropriate types from the ENTITY_TYPES section when they fit naturally.
- DO NOT force-fit entities into inappropriate types from ENTITY_TYPES.
- If no type from ENTITY_TYPES fits naturally, create a descriptive type based on context.
- Each entity should have exactly ONE type that best describes what it is.
- Classify pronouns (I, me, you, etc.) as "Alias" entities.
3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately).
- **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements.
- Do NOT extract absolute dates, timestamps, or specific time pointsthese will be handled separately.
- Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm").
4. **Entity Name Extraction**:
- Extract ONLY the core entity name, WITHOUT any type descriptors or qualifiers
- When text mentions "Tesla car", extract name as "Tesla" with type "Vehicle"
- When text mentions "John's company", extract name as "John" with type "Person" (company is a separate entity)
- **CLEAN NAMES**: Remove type words like "app", "system", "platform", "tool", "service", "company", "organization" from the entity name
- **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you") and classify as "Alias"
- **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John")
- **NO TYPE SUFFIXES**: Never append the entity type to the entity name
5. **Temporal and Relationship Context Extraction**:
- EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years")
- EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college")
- EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members")
- DO NOT extract absolute dates, timestamps, or specific time points ("June 9, 2023", "3pm", "last Saturday")
- DO NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday")
## Examples of Correct Entity Extraction:
**TEMPORAL INFORMATION - What to EXTRACT vs EXCLUDE:**
**EXTRACT - Relationship Temporal Information:**
- Text: "I've known these friends for 4 years" Extract: "4 years" (Duration)
- Text: "since I moved from my home country" Extract: "since moving" (TemporalContext)
- Text: "after that tough breakup" Extract: "after breakup" (TemporalContext)
- Text: "we've been married for 5 years" Extract: "5 years" (Duration)
- Text: "during college" Extract: "during college" (TemporalContext)
**EXCLUDE - Absolute Dates/Times:**
- Text: "on June 9, 2023" Don't extract "June 9, 2023"
- Text: "last Saturday" Don't extract "last Saturday"
- Text: "at 3pm yesterday" Don't extract "3pm" or "yesterday"
- Text: "next week" Don't extract "next week"
**RELATIONSHIP CONTEXT ENTITIES:**
- Text: "my close friends" Extract: "close friends" (QualifiedGroup)
- Text: "strong support system" Extract: "support system" (RelationshipType)
- Text: "work colleagues" Extract: "work colleagues" (ProfessionalGroup)
- Text: "family members" Extract: "family members" (FamilyGroup)
**STANDARD ENTITY EXTRACTION:**
- Text: "Tesla car" Name: "Tesla", Type: "Vehicle"
- Text: "Google's search engine" Name: "Google", Type: "Company" + Name: "Search Engine", Type: "Product"
- Text: "Microsoft Office suite" Name: "Microsoft Office", Type: "Software"
- Text: "John's startup company" Name: "John", Type: "Person" + Name: "Startup", Type: "Company"
**INCORRECT Examples:**
- Text: "Tesla car" Name: "Tesla car", Type: "Vehicle"
- Text: "authentication system" Name: "authentication system", Type: "System"
- Text: "payment service" Name: "payment service", Type: "Service"
Format your response as a JSON object with the following structure:
<output>
{
"entities": [
{
"name": "Entity Name",
"type": "Entity Type"
}
// Additional entities...
]
}
</output>`;
const userPrompt = `
<TEXT>
${context.episodeContent}
</TEXT>
<ENTITY_TYPES>
${JSON.stringify(context.entityTypes || {}, null, 2)}
</ENTITY_TYPES>
`;
return [
{ role: "system", content: sysPrompt },
{ role: "user", content: userPrompt },
];
};
/**
* Extract entities from an episode using JSON-based approach
*/
/**
 * Build the LLM conversation for JSON-based entity extraction from an episode.
 *
 * Produces a two-message exchange: a brief system instruction, and a user
 * message that embeds the previous episodes, the current episode content,
 * the known entity types, and any caller-supplied custom prompt.
 *
 * @param context - Expected keys: `episodeContent` (string),
 *   `previousEpisodes` (array, optional), `entityTypes` (object, optional),
 *   `customPrompt` (string, optional).
 * @returns System + user messages ready to send to the model.
 */
export const extractJson = (context: Record<string, any>): CoreMessage[] => {
  // Serialize the optional collections up front. `||` (not `??`) is kept
  // deliberately so null values also fall back to the empty default.
  const previousEpisodesJson = JSON.stringify(
    context.previousEpisodes || [],
    null,
    2,
  );
  const entityTypesJson = JSON.stringify(context.entityTypes || {}, null, 2);
  const customSection = context.customPrompt || "";

  const systemMessage = {
    role: "system",
    content: `You are an AI assistant that extracts entity nodes from text.
Your primary task is to extract and classify significant entities mentioned in the content.`,
  };

  const userMessage = {
    role: "user",
    content: `
<PREVIOUS EPISODES>
${previousEpisodesJson}
</PREVIOUS EPISODES>
<CURRENT EPISODE>
${context.episodeContent}
</CURRENT EPISODE>
<ENTITY TYPES>
${entityTypesJson}
</ENTITY TYPES>
Instructions:
Extract all significant entities mentioned in the CURRENT EPISODE. For each entity, provide a name and type.
Respond with a JSON object containing an "entities" array of objects, each with "name" and "type" properties.
Guidelines:
1. Extract significant entities, concepts, or actors mentioned in the content.
2. Avoid creating nodes for relationships or actions.
3. Avoid creating nodes for temporal information like dates, times or years (these will be added to edges later).
4. **CLEAN ENTITY NAMES**: Extract ONLY the core entity name WITHOUT type descriptors:
- "Tesla car" Name: "Tesla", Type: "Vehicle"
- Remove words like "app", "system", "platform", "tool", "service", "company" from entity names
5. Use full names when available and avoid abbreviations.
${customSection}
`,
  };

  return [systemMessage, userMessage];
};
/** /**
* Resolve entity duplications * Resolve entity duplications
@ -286,84 +151,53 @@ export const dedupeNodes = (context: Record<string, any>): CoreMessage[] => {
return [ return [
{ {
role: "system", role: "system",
content: `You are a helpful assistant who determines whether or not ENTITIES extracted from a conversation are duplicates of existing entities. content: `You are a helpful assistant who determines whether extracted entities are duplicates of existing entities.
## CRITICAL RULE: Entity Type Matters Focus on name-based similarity and contextual meaning to identify duplicates.
DO NOT mark entities with different types as duplicates, even if they have identical names.
- DO NOT mark "John" (Person) and "John" (Company) as duplicates
- DO NOT mark "Apple" (Company) and "Apple" (Fruit) as duplicates
- DO NOT mark "Core" (App) and "Core" (Concept) as duplicates
Consider entities as potential duplicates ONLY if they have:
1. Similar or identical names AND
2. The EXACT SAME entity type
Each entity in ENTITIES is represented as a JSON object with the following structure: Each entity in ENTITIES is represented as a JSON object with the following structure:
{ {
id: integer id of the entity, id: integer id of the entity,
name: "name of the entity", name: "name of the entity",
entity_type: "ontological classification of the entity",
entity_type_description: "Description of what the entity type represents",
duplication_candidates: [ duplication_candidates: [
{ {
idx: integer index of the candidate entity, idx: integer index of the candidate entity,
name: "name of the candidate entity", name: "name of the candidate entity",
entity_type: "ontological classification of the candidate entity",
...<additional attributes> ...<additional attributes>
} }
] ]
} }
## Duplication Decision Rules ## Duplication Decision Framework
For each entity, determine if it is a duplicate of any of its duplication candidates:
### MARK AS DUPLICATE (duplicate_idx >= 0) when: ### MARK AS DUPLICATE (duplicate_idx >= 0) when:
- Verify the candidate has the SAME entity_type as the current entity - **IDENTICAL NAMES**: Exact same name or obvious synonyms
- AND confirm the entities refer to the same real-world object or concept - **SEMANTIC EQUIVALENCE**: Different names but clearly referring to the same entity
- AND check that the names are very similar or identical - **STRUCTURAL VARIATIONS**: Same entity with minor formatting differences
### SPECIAL RULE FOR PREDICATES:
**ALWAYS mark identical predicates as duplicates** - predicates are universal and reusable:
- Mark "is associated with" (Predicate) vs "is associated with" (Predicate) duplicate_idx = 0
- Mark "works for" (Predicate) vs "works for" (Predicate) duplicate_idx = 0
- Mark "owns" (Predicate) vs "owns" (Predicate) duplicate_idx = 0
### DO NOT mark as duplicate (duplicate_idx = -1) when: ### DO NOT mark as duplicate (duplicate_idx = -1) when:
- Confirm the candidate has a DIFFERENT entity_type (even with identical names) - **DIFFERENT INSTANCES**: Similar names but different real-world entities
- Identify they are related but distinct entities - **CONTEXTUAL DISTINCTION**: Same name but different contexts suggest distinct entities
- Recognize they have similar names or purposes but refer to separate instances or concepts - **HIERARCHICAL RELATIONSHIPS**: One is part of/contains the other
- Distinguish when one is a general concept and the other is a specific instance
- **EXCEPTION**: DO NOT apply this rule to Predicates - always deduplicate identical predicates
## Examples: ## Example Patterns:
**CORRECT - Mark as NOT Duplicates (Different Types):** **DUPLICATE CASES:**
- Set "Tesla" (Company) vs "Tesla" (Car) duplicate_idx = -1 - "John Smith" vs "John Smith" Check context for same person
- Set "Apple" (Company) vs "Apple" (Fruit) duplicate_idx = -1 - "Microsoft" vs "Microsoft Corporation" Same organization (duplicate_idx = 0)
- Set "Core" (App) vs "Core" (System) duplicate_idx = -1 - "iPhone" vs "Apple iPhone" Same product (duplicate_idx = 0)
- "Tier 1" vs "Tier 1" Same classification level (duplicate_idx = 0)
**CORRECT - Mark Predicates AS Duplicates (Same Name, Same Type):** **NOT DUPLICATE CASES:**
- Set "is associated with" (Predicate) vs "is associated with" (Predicate) duplicate_idx = 0 - "Meeting Room A" vs "Meeting Room B" Different rooms (duplicate_idx = -1)
- Set "works for" (Predicate) vs "works for" (Predicate) duplicate_idx = 0 - "Project Alpha" vs "Project Beta" Different projects (duplicate_idx = -1)
- Set "owns" (Predicate) vs "owns" (Predicate) duplicate_idx = 0 - "Essential" vs "Critical" Different priority levels (duplicate_idx = -1)
- "Team Lead" vs "Team Member" Different roles (duplicate_idx = -1)
**CORRECT - Evaluate Potential Duplicates (Same Type):** ## Decision Guidelines:
- Check if "John Smith" (Person) vs "John Smith" (Person) refer to same person - **CONSERVATIVE APPROACH**: When uncertain, prefer NOT marking as duplicate
- Check if "Microsoft" (Company) vs "Microsoft Corporation" (Company) are the same company - **CONTEXT MATTERS**: Consider the episode content and previous episodes
- Check if "iPhone" (Product) vs "Apple iPhone" (Product) are the same product - **SEMANTIC MEANING**: Focus on whether they refer to the same real-world entity
**CORRECT - Mark as NOT Duplicates (Same Type, Different Instances):**
- Set "Meeting" (Event) vs "Meeting" (Event) duplicate_idx = -1 (different meetings)
- Set "Project" (Task) vs "Project" (Task) duplicate_idx = -1 (different projects)
- **NOTE**: DO NOT apply this rule to Predicates - always deduplicate identical predicates
## Task:
Provide your response as a JSON object with an "entity_resolutions" array containing one entry for each entity.
For each entity, include:
- "id": the id of the entity (integer)
- "name": the name of the entity (string)
- "duplicate_idx": the index of the duplicate candidate, or -1 if no duplicate (integer)
Format your response as follows: Format your response as follows:
<output> <output>
@ -380,12 +214,9 @@ Format your response as follows:
</output> </output>
## Important Instructions: ## Important Instructions:
- FIRST check if entity types match before considering any duplication
- If entity types don't match, immediately set duplicate_idx = -1
- Only mark entities with identical types as potential duplicates
- When in doubt, prefer NOT marking as duplicate (duplicate_idx = -1)
- Always include all entities from the input in your response - Always include all entities from the input in your response
- Always wrap the output in these tags <output> </output> - Always wrap the output in these tags <output> </output>
- When in doubt, prefer NOT marking as duplicate (duplicate_idx = -1)
`, `,
}, },
{ {
@ -412,16 +243,21 @@ export const extractAttributes = (
): CoreMessage[] => { ): CoreMessage[] => {
const sysPrompt = ` const sysPrompt = `
You are an AI assistant that extracts and enhances entity attributes based on context. You are an AI assistant that extracts and enhances entity attributes based on context.
Your task is to analyze entities and provide appropriate attribute values for each entity based on its type definition. Your task is to analyze entities and provide appropriate attribute values based on available information.
For each entity: For each entity:
1. Look at its type and identify the required and optional attributes from the entity type definitions 1. Analyze the context to identify relevant attributes for the entity
2. Check if the entity already has values for these attributes 2. Extract appropriate values from the episode content if available
3. For missing attributes, extract appropriate values from the context if possible 3. Focus on factual, descriptive attributes rather than type classifications
4. For existing attributes, enhance or correct them if needed based on the context 4. Give empty attributes object ({}) when there are no attributes to update
5. Give empty attributes object ({}) when there are no attributes to update 5. Only include attributes that you're adding or modifying
6. Only include attributes that you're updating - don't repeat existing attributes that don't need changes 6. I'll merge your new attributes with existing ones, so only provide updates
7. I'll merge your new attributes with the current attributes, so only provide values that should be added or modified
Common attribute types to consider:
- Descriptive properties (color, size, status, etc.)
- Relational context (role, position, relationship, etc.)
- Temporal information (duration, frequency, etc.)
- Qualitative aspects (importance, preference, etc.)
Provide your output in this structure: Provide your output in this structure:
<output> <output>
@ -441,10 +277,6 @@ Provide your output in this structure:
</output>`; </output>`;
const userPrompt = ` const userPrompt = `
<ENTITY_TYPES>
${JSON.stringify(context.entityTypes, null, 2)}
</ENTITY_TYPES>
<ENTITIES> <ENTITIES>
${JSON.stringify(context.entities, null, 2)} ${JSON.stringify(context.entities, null, 2)}
</ENTITIES> </ENTITIES>
@ -453,7 +285,7 @@ ${JSON.stringify(context.entities, null, 2)}
${context.episodeContent} ${context.episodeContent}
</EPISODE_CONTENT> </EPISODE_CONTENT>
Based on the above information, please extract and enhance attributes for each entity according to its type definition. Return only the uuid and updated attributes for each entity.`; Based on the above information, please extract and enhance attributes for each entity based on the context. Return only the uuid and updated attributes for each entity.`;
return [ return [
{ role: "system", content: sysPrompt }, { role: "system", content: sysPrompt },
{ role: "user", content: userPrompt }, { role: "user", content: userPrompt },

View File

@ -132,10 +132,8 @@ Format your response as a JSON object with the following structure:
"edges": [ "edges": [
{ {
"source": "[Subject Entity Name - MUST be from AVAILABLE ENTITIES]", "source": "[Subject Entity Name - MUST be from AVAILABLE ENTITIES]",
"sourceType": "[Source Entity Type]",
"predicate": "[Relationship Type]", "predicate": "[Relationship Type]",
"target": "[Object Entity Name - MUST be from AVAILABLE ENTITIES]", "target": "[Object Entity Name - MUST be from AVAILABLE ENTITIES]",
"targetType": "[Target Entity Type]",
"fact": "[Natural language representation of the fact]", "fact": "[Natural language representation of the fact]",
"attributes": { "attributes": {
"confidence": confidence of the fact, "confidence": confidence of the fact,

View File

@ -48,10 +48,10 @@ export interface EpisodicNode {
export interface EntityNode { export interface EntityNode {
uuid: string; uuid: string;
name: string; name: string;
type: string; // Single type - either from presets or custom type?: string; // Optional type - can be inferred from statements
attributes: Record<string, any>; attributes: Record<string, any>;
nameEmbedding: number[]; nameEmbedding: number[];
typeEmbedding: number[]; typeEmbedding?: number[]; // Optional since type is optional
createdAt: Date; createdAt: Date;
userId: string; userId: string;
space?: string; space?: string;
@ -123,10 +123,10 @@ export type AddEpisodeResult = {
export interface ExtractedTripleData { export interface ExtractedTripleData {
source: string; source: string;
sourceType: string; sourceType?: string; // Optional - can be inferred from statements
predicate: string; predicate: string;
target: string; target: string;
targetType: string; targetType?: string; // Optional - can be inferred from statements
fact: string; fact: string;
attributes?: Record<string, any>; attributes?: Record<string, any>;
} }