From 952386ca0eeb0404ebe2941381bc65a715a2e99e Mon Sep 17 00:00:00 2001
From: Manoj
Date: Tue, 9 Sep 2025 20:00:50 +0530
Subject: [PATCH] refactor: make entity handling type-free and simplify entity resolution in knowledge graph

---
 .../webapp/app/services/graphModels/entity.ts |  34 +-
 .../app/services/knowledgeGraph.server.ts     | 206 ++--------
 apps/webapp/app/services/prompts/nodes.ts     | 366 +++++-------------
 .../webapp/app/services/prompts/statements.ts |   2 -
 packages/types/src/graph/graph.entity.ts      |   8 +-
 5 files changed, 151 insertions(+), 465 deletions(-)

diff --git a/apps/webapp/app/services/graphModels/entity.ts b/apps/webapp/app/services/graphModels/entity.ts
index a11fcd1..fa216a5 100644
--- a/apps/webapp/app/services/graphModels/entity.ts
+++ b/apps/webapp/app/services/graphModels/entity.ts
@@ -2,19 +2,9 @@ import type { EntityNode } from "@core/types";
 import { runQuery } from "~/lib/neo4j.server";
 
 export async function saveEntity(entity: EntityNode): Promise<string> {
-  // Debug: Log entity to identify missing typeEmbedding
-  if (!entity.typeEmbedding) {
-    console.error(`Entity missing typeEmbedding:`, {
-      uuid: entity.uuid,
-      name: entity.name,
-      type: entity.type,
-      hasNameEmbedding: !!entity.nameEmbedding,
-    });
-    throw new Error(
-      `Entity ${entity.name} (${entity.type}) is missing typeEmbedding`,
-    );
-  }
-
+  // Build query conditionally based on whether typeEmbedding exists
+  const hasTypeEmbedding = entity.typeEmbedding && entity.typeEmbedding.length > 0;
+
   const query = `
     MERGE (n:Entity {uuid: $uuid})
     ON CREATE SET
@@ -22,7 +12,7 @@ export async function saveEntity(entity: EntityNode): Promise<string> {
       n.type = $type,
       n.attributes = $attributes,
       n.nameEmbedding = $nameEmbedding,
-      n.typeEmbedding = $typeEmbedding,
+      ${hasTypeEmbedding ? 'n.typeEmbedding = $typeEmbedding,' : ''}
       n.createdAt = $createdAt,
       n.userId = $userId,
       n.space = $space
@@ -31,23 +21,27 @@ export async function saveEntity(entity: EntityNode): Promise<string> {
       n.type = $type,
       n.attributes = $attributes,
       n.nameEmbedding = $nameEmbedding,
-      n.typeEmbedding = $typeEmbedding,
+      ${hasTypeEmbedding ? 
'n.typeEmbedding = $typeEmbedding,' : ''} n.space = $space RETURN n.uuid as uuid `; - const params = { + const params: any = { uuid: entity.uuid, name: entity.name, - type: entity.type, + type: entity.type || "", attributes: JSON.stringify(entity.attributes || {}), nameEmbedding: entity.nameEmbedding, - typeEmbedding: entity.typeEmbedding, createdAt: entity.createdAt.toISOString(), userId: entity.userId, space: entity.space || null, }; + // Add typeEmbedding to params only if it exists + if (hasTypeEmbedding) { + params.typeEmbedding = entity.typeEmbedding; + } + const result = await runQuery(query, params); return result[0].get("uuid"); } @@ -65,10 +59,10 @@ export async function getEntity(uuid: string): Promise { return { uuid: entity.uuid, name: entity.name, - type: entity.type, + type: entity.type || null, attributes: JSON.parse(entity.attributes || "{}"), nameEmbedding: entity.nameEmbedding, - typeEmbedding: entity.typeEmbedding, + typeEmbedding: entity.typeEmbedding || null, createdAt: new Date(entity.createdAt), userId: entity.userId, space: entity.space, diff --git a/apps/webapp/app/services/knowledgeGraph.server.ts b/apps/webapp/app/services/knowledgeGraph.server.ts index ac69c79..d274fd8 100644 --- a/apps/webapp/app/services/knowledgeGraph.server.ts +++ b/apps/webapp/app/services/knowledgeGraph.server.ts @@ -15,8 +15,7 @@ import crypto from "crypto"; import { dedupeNodes, extractAttributes, - extractMessage, - extractText, + extractEntities, } from "./prompts/nodes"; import { extractStatements, @@ -25,14 +24,11 @@ import { import { getEpisodeStatements, getRecentEpisodes, - getRelatedEpisodesEntities, searchEpisodesByEmbedding, } from "./graphModels/episode"; import { findExactPredicateMatches, findSimilarEntities, - findSimilarEntitiesWithSameType, - replaceEntityReferences, } from "./graphModels/entity"; import { findContradictoryStatements, @@ -47,9 +43,7 @@ import { getEmbedding, makeModelCall } from "~/lib/model.server"; import { runQuery } from "~/lib/neo4j.server"; import { Apps, - getNodeTypes, getNodeTypesString, - isPresetType, } from "~/utils/presets/nodes"; import { normalizePrompt, normalizeDocumentPrompt } from "./prompts"; import { type PrismaClient } from "@prisma/client"; @@ -272,8 +266,8 @@ export class KnowledgeGraphService { params.type, ); - const normalizedTime = Date.now() - startTime; - logger.log(`Normalized episode body in ${normalizedTime} ms`); + const normalizedTime = Date.now(); + logger.log(`Normalized episode body in ${normalizedTime - startTime} ms`); if (normalizedEpisodeBody === "NOTHING_TO_REMEMBER") { logger.log("Nothing to remember"); @@ -284,15 +278,6 @@ export class KnowledgeGraphService { }; } - const relatedEpisodesEntities = await getRelatedEpisodesEntities({ - embedding: await this.getEmbedding(normalizedEpisodeBody), - userId: params.userId, - minSimilarity: 0.7, - }); - - const relatedTime = Date.now() - normalizedTime; - logger.log(`Related episodes entities in ${relatedTime} ms`); - // Step 2: Episode Creation - Create or retrieve the episode const episode: EpisodicNode = { uuid: crypto.randomUUID(), @@ -316,23 +301,18 @@ export class KnowledgeGraphService { ); const extractedTime = Date.now(); - logger.log(`Extracted entities in ${extractedTime - relatedTime} ms`); + logger.log(`Extracted entities in ${extractedTime - normalizedTime} ms`); - // Step 3.1: Context-aware entity resolution with preset type evolution - await this.resolveEntitiesWithContext( - extractedNodes, - relatedEpisodesEntities, - ); - - // Step 3.2: Handle preset 
type logic - expand entities for statement extraction - const categorizedEntities = await this.expandEntitiesForStatements( - extractedNodes, - episode, - ); + // Step 3.1: Simple entity categorization (no type-based expansion needed) + const categorizedEntities = { + primary: extractedNodes, + expanded: [], // No expansion needed with type-free approach + }; const expandedTime = Date.now(); - logger.log(`Expanded entities in ${expandedTime - extractedTime} ms`); + logger.log(`Processed entities in ${expandedTime - extractedTime} ms`); + console.log(extractedNodes.map((e) => e.name)); // Step 4: Statement Extrraction - Extract statements (triples) instead of direct edges const extractedStatements = await this.extractStatements( episode, @@ -371,10 +351,12 @@ export class KnowledgeGraphService { ); // Step 7: ADd attributes to entity nodes - const updatedTriples = await this.addAttributesToEntities( - resolvedStatements, - episode, - ); + // const updatedTriples = await this.addAttributesToEntities( + // resolvedStatements, + // episode, + // ); + + const updatedTriples = resolvedStatements; const updatedTriplesTime = Date.now(); logger.log( @@ -439,12 +421,6 @@ export class KnowledgeGraphService { episode: EpisodicNode, previousEpisodes: EpisodicNode[], ): Promise { - // Get all app keys - const allAppEnumValues = Object.values(Apps); - - // Get all node types - const entityTypes = getNodeTypes(allAppEnumValues); - // Use the prompt library to get the appropriate prompts const context = { episodeContent: episode.content, @@ -452,13 +428,11 @@ export class KnowledgeGraphService { content: ep.content, createdAt: ep.createdAt.toISOString(), })), - entityTypes: entityTypes, }; - // Get the extract_json prompt from the prompt library - const messages = episode.sessionId - ? extractMessage(context) - : extractText(context); + // Get the unified entity extraction prompt + const extractionMode = episode.sessionId ? 
'conversation' : 'document'; + const messages = extractEntities(context, extractionMode); let responseText = ""; @@ -474,21 +448,19 @@ export class KnowledgeGraphService { responseText = outputMatch[1].trim(); const extractedEntities = JSON.parse(responseText || "{}").entities || []; - // Batch generate embeddings for better performance + // Batch generate embeddings for entity names const entityNames = extractedEntities.map((entity: any) => entity.name); - const entityTypes = extractedEntities.map((entity: any) => entity.type); - const [nameEmbeddings, typeEmbeddings] = await Promise.all([ - Promise.all(entityNames.map((name: string) => this.getEmbedding(name))), - Promise.all(entityTypes.map((type: string) => this.getEmbedding(type))), - ]); + const nameEmbeddings = await Promise.all( + entityNames.map((name: string) => this.getEmbedding(name)) + ); entities = extractedEntities.map((entity: any, index: number) => ({ uuid: crypto.randomUUID(), name: entity.name, - type: entity.type, + type: undefined, // Type will be inferred from statements attributes: entity.attributes || {}, nameEmbedding: nameEmbeddings[index], - typeEmbedding: typeEmbeddings[index], + typeEmbedding: undefined, // No type embedding needed createdAt: new Date(), userId: episode.userId, })); @@ -537,6 +509,8 @@ export class KnowledgeGraphService { responseText = text; }); + console.log(responseText); + const outputMatch = responseText.match(/([\s\S]*?)<\/output>/); if (outputMatch && outputMatch[1]) { responseText = outputMatch[1].trim(); @@ -548,6 +522,8 @@ export class KnowledgeGraphService { const extractedTriples: ExtractedTripleData[] = JSON.parse(responseText || "{}").edges || []; + console.log(`extracted triples length: ${extractedTriples.length}`) + // Create maps to deduplicate entities by name within this extraction const predicateMap = new Map(); @@ -597,17 +573,13 @@ export class KnowledgeGraphService { // Convert extracted triples to Triple objects with Statement nodes const triples = extractedTriples.map( (triple: ExtractedTripleData, tripleIndex: number) => { - // Find the subject and object nodes by matching both name and type + // Find the subject and object nodes by matching name (type-free approach) const subjectNode = allEntities.find( - (node) => - node.name.toLowerCase() === triple.source.toLowerCase() && - node.type.toLowerCase() === triple.sourceType.toLowerCase(), + (node) => node.name.toLowerCase() === triple.source.toLowerCase() ); const objectNode = allEntities.find( - (node) => - node.name.toLowerCase() === triple.target.toLowerCase() && - node.type.toLowerCase() === triple.targetType.toLowerCase(), + (node) => node.name.toLowerCase() === triple.target.toLowerCase() ); // Get the deduplicated predicate node @@ -661,108 +633,7 @@ export class KnowledgeGraphService { return triples.filter(Boolean) as Triple[]; } - /** - * Expand entities for statement extraction by adding existing preset entities - */ - private async expandEntitiesForStatements( - extractedNodes: EntityNode[], - episode: EpisodicNode, - ): Promise<{ - primary: EntityNode[]; - expanded: EntityNode[]; - }> { - const allAppEnumValues = Object.values(Apps); - const expandedEntities: EntityNode[] = []; - // For each extracted entity, check if we need to add existing preset entities - for (const entity of extractedNodes) { - const newIsPreset = isPresetType(entity.type, allAppEnumValues); - - // Find similar entities with same name - const similarEntities = await findSimilarEntities({ - queryEmbedding: entity.nameEmbedding, - limit: 
5, - threshold: 0.8, - userId: episode.userId, - }); - - for (const existingEntity of similarEntities) { - const existingIsPreset = isPresetType( - existingEntity.type, - allAppEnumValues, - ); - - // If both are preset types, include both for statement extraction - if (newIsPreset && existingIsPreset) { - // Add the existing entity to the list if not already present - if (!expandedEntities.some((e) => e.uuid === existingEntity.uuid)) { - expandedEntities.push(existingEntity); - } - } - } - } - - // Deduplicate by name AND type combination - const deduplicateEntities = (entities: EntityNode[]) => { - const seen = new Map(); - return entities.filter((entity) => { - const key = `${entity.name.toLowerCase()}_${entity.type.toLowerCase()}`; - if (seen.has(key)) { - return false; - } - seen.set(key, entity); - return true; - }); - }; - - return { - primary: deduplicateEntities(extractedNodes), - expanded: deduplicateEntities( - expandedEntities.filter( - (e) => !extractedNodes.some((primary) => primary.uuid === e.uuid), - ), - ), - }; - } - - /** - * Resolve entities with context-aware deduplication and preset type evolution - * Only merges entities that appear in semantically related episodes - */ - private async resolveEntitiesWithContext( - extractedNodes: EntityNode[], - relatedEpisodesEntities: EntityNode[], - ): Promise { - const allAppEnumValues = Object.values(Apps); - - extractedNodes.map(async (newEntity) => { - // Find same-name entities in related episodes (contextually relevant) - const sameNameInContext = relatedEpisodesEntities.filter( - (existing) => - existing.name.toLowerCase() === newEntity.name.toLowerCase(), - ); - - if (sameNameInContext.length > 0) { - let existingEntityIds: string[] = []; - sameNameInContext.forEach(async (existingEntity) => { - const newIsPreset = isPresetType(newEntity.type, allAppEnumValues); - const existingIsPreset = isPresetType( - existingEntity.type, - allAppEnumValues, - ); - - if (newIsPreset && !existingIsPreset) { - // New is preset, existing is custom - evolve existing entity to preset type - existingEntityIds.push(existingEntity.uuid); - } - }); - - if (existingEntityIds.length > 0) { - await replaceEntityReferences(newEntity, existingEntityIds); - } - } - }); - } /** * Resolve extracted nodes to existing nodes or create new ones @@ -835,9 +706,8 @@ export class KnowledgeGraphService { // Step 2a: Find similar entities for non-predicate entities const similarEntitiesResults = await Promise.all( nonPredicates.map(async (entity) => { - const similarEntities = await findSimilarEntitiesWithSameType({ + const similarEntities = await findSimilarEntities({ queryEmbedding: entity.nameEmbedding, - entityType: entity.type, limit: 5, threshold: 0.7, userId: episode.userId, @@ -1240,20 +1110,12 @@ export class KnowledgeGraphService { return triples; // No entities to process } - // Get all app keys - const allAppEnumValues = Object.values(Apps); - - // Get all node types with their attribute definitions - const entityTypes = getNodeTypes(allAppEnumValues); - // Prepare simplified context for the LLM const context = { episodeContent: episode.content, - entityTypes: entityTypes, entities: entities.map((entity) => ({ uuid: entity.uuid, name: entity.name, - type: entity.type, currentAttributes: entity.attributes || {}, })), }; diff --git a/apps/webapp/app/services/prompts/nodes.ts b/apps/webapp/app/services/prompts/nodes.ts index 1937eee..4fe9af7 100644 --- a/apps/webapp/app/services/prompts/nodes.ts +++ b/apps/webapp/app/services/prompts/nodes.ts @@ -5,16 
+5,20 @@ import { type CoreMessage } from "ai"; /** - * Extract entities from an episode using message-based approach + * Extract entities from content using unified approach (works for both conversations and documents) */ -export const extractMessage = (context: Record): CoreMessage[] => { +export const extractEntities = ( + context: Record, + extractionMode: 'conversation' | 'document' = 'conversation' +): CoreMessage[] => { const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages for a reified knowledge graph. -Your primary task is to extract and classify significant entities mentioned in the conversation. +Your primary task is to extract all significant entities mentioned in the conversation, treating both concrete entities and type/concept entities as first-class nodes. -In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements. +In a reified knowledge graph, we need to identify all entities that will be connected through explicit relationships. Focus on extracting: -1. Subject entities (people, objects, concepts) -2. Object entities (people, objects, concepts) +1. Concrete entities (people, objects, specific instances) +2. Type/concept entities (categories, classes, abstract concepts) +3. All entities that participate in "X is a Y" relationships Instructions: @@ -22,16 +26,16 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr 1. **Entity Identification**: - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the CURRENT EPISODE. - - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X). + - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as an Alias entity AND the named entity (X). - **ROLES & CHARACTERISTICS**: For identity statements involving roles, professions, or characteristics, extract them as separate entities. - For pronouns that refer to named entities, extract them as separate Alias entities. + - **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities. -2. **Entity Classification**: - - Prefer using appropriate types from the ENTITY_TYPES section when they fit naturally. - - DO NOT force-fit entities into inappropriate types from ENTITY_TYPES. - - If no type from ENTITY_TYPES fits naturally, create a descriptive type based on context (e.g., "memory_graph_system", "authentication_bug"). - - Each entity should have exactly ONE type that best describes what it is. - - Classify pronouns (I, me, you, etc.) as "Alias" entities. +2. **Type and Concept Entity Extraction**: + - **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities. + - **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities. + - **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities. + - **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships. 3. **Exclusions**: - Do NOT extract entities representing relationships or actions (predicates will be handled separately). @@ -40,13 +44,13 @@ You are given a conversation context and a CURRENT EPISODE. 
Your task is to extr - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm"). 4. **Entity Name Extraction**: - - Extract ONLY the core entity name, WITHOUT any type descriptors or qualifiers - - When text mentions "Tesla car", extract name as "Tesla" with type "Vehicle" - - When text mentions "John's company", extract name as "John" with type "Person" (company is a separate entity) - - **CLEAN NAMES**: Remove type words like "app", "system", "platform", "tool", "service", "company", "organization" from the entity name - - **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you") and classify as "Alias" + - Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers + - When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car" + - When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities + - **CLEAN NAMES**: Remove articles (a, an, the) and quantifiers, but preserve the core concept + - **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you") - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John") - - **NO TYPE SUFFIXES**: Never append the entity type to the entity name + - **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" → "Space") 5. **Temporal and Relationship Context Extraction**: - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years") @@ -57,6 +61,19 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr ## Examples of Correct Entity Extraction: +**TYPE/CONCEPT ENTITY EXTRACTION:** + +✅ **EXTRACT BOTH ENTITIES IN "IS A" RELATIONSHIPS:** +- Text: "Profile is a memory space" → Extract: "Profile" AND "MemorySpace" +- Text: "Tesla is a car" → Extract: "Tesla" AND "Car" +- Text: "John is a teacher" → Extract: "John" AND "Teacher" +- Text: "Goals space connects to Projects" → Extract: "Goals", "Space", AND "Projects" + +✅ **EXTRACT CONCEPT ENTITIES:** +- Text: "rated 10/10 for usefulness" → Extract: "Usefulness", "Rating" +- Text: "essential classification tier" → Extract: "Essential", "Classification", "Tier" +- Text: "hierarchical memory system" → Extract: "Hierarchical", "Memory", "System" + **TEMPORAL INFORMATION - What to EXTRACT vs EXCLUDE:** ✅ **EXTRACT - Relationship Temporal Information:** @@ -73,47 +90,50 @@ You are given a conversation context and a CURRENT EPISODE. 
Your task is to extr - Text: "next week" → Don't extract "next week" **RELATIONSHIP CONTEXT ENTITIES:** -- Text: "my close friends" → Extract: "close friends" (QualifiedGroup) -- Text: "strong support system" → Extract: "support system" (RelationshipType) -- Text: "work colleagues" → Extract: "work colleagues" (ProfessionalGroup) -- Text: "family members" → Extract: "family members" (FamilyGroup) +- Text: "my close friends" → Extract: "Close Friends" (QualifiedGroup) +- Text: "strong support system" → Extract: "Support System" (RelationshipType) +- Text: "work colleagues" → Extract: "Work Colleagues" (ProfessionalGroup) +- Text: "family members" → Extract: "Family Members" (FamilyGroup) **STANDARD ENTITY EXTRACTION:** -- Text: "Tesla car" → Name: "Tesla", Type: "Vehicle" -- Text: "Google's search engine" → Name: "Google", Type: "Company" + Name: "Search Engine", Type: "Product" -- Text: "Microsoft Office suite" → Name: "Microsoft Office", Type: "Software" -- Text: "John's startup company" → Name: "John", Type: "Person" + Name: "Startup", Type: "Company" +- Text: "Tesla car" → Extract: "Tesla" AND "Car" +- Text: "Google's search engine" → Extract: "Google" AND "Search Engine" +- Text: "Microsoft Office suite" → Extract: "Microsoft Office" AND "Suite" +- Text: "John's startup company" → Extract: "John", "Startup", AND "Company" -**INCORRECT Examples:** -- Text: "Tesla car" → ❌ Name: "Tesla car", Type: "Vehicle" -- Text: "authentication system" → ❌ Name: "authentication system", Type: "System" -- Text: "payment service" → ❌ Name: "payment service", Type: "Service" +**CORRECT vs INCORRECT Examples:** + +✅ **CORRECT:** +- Text: "Profile is a memory space" → Extract: "Profile", "MemorySpace" +- Text: "essential classification system" → Extract: "Essential", "Classification", "System" +- Text: "10/10 usefulness rating" → Extract: "Usefulness", "Rating" + +❌ **INCORRECT:** +- Text: "Profile is a memory space" → ❌ Only extract: "Profile" +- Text: "authentication system" → ❌ Extract: "authentication system" (should be "Authentication", "System") +- Text: "payment service" → ❌ Extract: "payment service" (should be "Payment", "Service") Format your response as a JSON object with the following structure: { "entities": [ { - "name": "Entity Name", - "type": "Entity Type", + "name": "Entity Name" } // Additional entities... ] } `; + const contentLabel = extractionMode === 'conversation' ? 'CURRENT EPISODE' : 'TEXT'; const userPrompt = ` - +${extractionMode === 'conversation' ? ` ${JSON.stringify(context.previousEpisodes || [], null, 2)} - +` : ''}<${contentLabel}> ${context.episodeContent} - - - -${JSON.stringify(context.entityTypes || {}, null, 2)} - + `; @@ -123,161 +143,6 @@ ${JSON.stringify(context.entityTypes || {}, null, 2)} ]; }; -/** - * Extract entities from text-based content - */ -export const extractText = (context: Record): CoreMessage[] => { - const sysPrompt = ` -You are an AI assistant that extracts entity nodes from text for a reified knowledge graph. -Your primary task is to extract and classify significant entities mentioned in the provided text. - -In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements. -Focus on extracting: -1. Subject entities -2. Object entities - -Instructions: - -You are given a TEXT. Your task is to extract **entity nodes** mentioned **explicitly or implicitly** in the TEXT. - -1. 
**Entity Identification**: - - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT. - - For identity statements like "I am X" or "I'm X", extract BOTH the pronoun ("I") as a Alias entity AND the named entity (X). - - **ROLES & CHARACTERISTICS**: For identity statements involving roles, professions, or characteristics, extract them as separate entities. - - For pronouns that refer to named entities, extract them as separate Alias entities. - -2. **Entity Classification**: - - Prefer using appropriate types from the ENTITY_TYPES section when they fit naturally. - - DO NOT force-fit entities into inappropriate types from ENTITY_TYPES. - - If no type from ENTITY_TYPES fits naturally, create a descriptive type based on context. - - Each entity should have exactly ONE type that best describes what it is. - - Classify pronouns (I, me, you, etc.) as "Alias" entities. - -3. **Exclusions**: - - Do NOT extract entities representing relationships or actions (predicates will be handled separately). - - **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements. - - Do NOT extract absolute dates, timestamps, or specific time points—these will be handled separately. - - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm"). - -4. **Entity Name Extraction**: - - Extract ONLY the core entity name, WITHOUT any type descriptors or qualifiers - - When text mentions "Tesla car", extract name as "Tesla" with type "Vehicle" - - When text mentions "John's company", extract name as "John" with type "Person" (company is a separate entity) - - **CLEAN NAMES**: Remove type words like "app", "system", "platform", "tool", "service", "company", "organization" from the entity name - - **PRONOUNS**: Use exact form as they appear (e.g., "I", "me", "you") and classify as "Alias" - - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John") - - **NO TYPE SUFFIXES**: Never append the entity type to the entity name - -5. 
**Temporal and Relationship Context Extraction**: - - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years") - - EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college") - - EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members") - - DO NOT extract absolute dates, timestamps, or specific time points ("June 9, 2023", "3pm", "last Saturday") - - DO NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday") - -## Examples of Correct Entity Extraction: - -**TEMPORAL INFORMATION - What to EXTRACT vs EXCLUDE:** - -✅ **EXTRACT - Relationship Temporal Information:** -- Text: "I've known these friends for 4 years" → Extract: "4 years" (Duration) -- Text: "since I moved from my home country" → Extract: "since moving" (TemporalContext) -- Text: "after that tough breakup" → Extract: "after breakup" (TemporalContext) -- Text: "we've been married for 5 years" → Extract: "5 years" (Duration) -- Text: "during college" → Extract: "during college" (TemporalContext) - -❌ **EXCLUDE - Absolute Dates/Times:** -- Text: "on June 9, 2023" → Don't extract "June 9, 2023" -- Text: "last Saturday" → Don't extract "last Saturday" -- Text: "at 3pm yesterday" → Don't extract "3pm" or "yesterday" -- Text: "next week" → Don't extract "next week" - -**RELATIONSHIP CONTEXT ENTITIES:** -- Text: "my close friends" → Extract: "close friends" (QualifiedGroup) -- Text: "strong support system" → Extract: "support system" (RelationshipType) -- Text: "work colleagues" → Extract: "work colleagues" (ProfessionalGroup) -- Text: "family members" → Extract: "family members" (FamilyGroup) - -**STANDARD ENTITY EXTRACTION:** -- Text: "Tesla car" → Name: "Tesla", Type: "Vehicle" -- Text: "Google's search engine" → Name: "Google", Type: "Company" + Name: "Search Engine", Type: "Product" -- Text: "Microsoft Office suite" → Name: "Microsoft Office", Type: "Software" -- Text: "John's startup company" → Name: "John", Type: "Person" + Name: "Startup", Type: "Company" - -**INCORRECT Examples:** -- Text: "Tesla car" → ❌ Name: "Tesla car", Type: "Vehicle" -- Text: "authentication system" → ❌ Name: "authentication system", Type: "System" -- Text: "payment service" → ❌ Name: "payment service", Type: "Service" - -Format your response as a JSON object with the following structure: - -{ - "entities": [ - { - "name": "Entity Name", - "type": "Entity Type" - } - // Additional entities... - ] -} -`; - const userPrompt = ` - -${context.episodeContent} - - - -${JSON.stringify(context.entityTypes || {}, null, 2)} - -`; - - return [ - { role: "system", content: sysPrompt }, - { role: "user", content: userPrompt }, - ]; -}; -/** - * Extract entities from an episode using JSON-based approach - */ -export const extractJson = (context: Record): CoreMessage[] => { - const sysPrompt = `You are an AI assistant that extracts entity nodes from text. -Your primary task is to extract and classify significant entities mentioned in the content.`; - - const userPrompt = ` - -${JSON.stringify(context.previousEpisodes || [], null, 2)} - - - -${context.episodeContent} - - - -${JSON.stringify(context.entityTypes || {}, null, 2)} - - -Instructions: - -Extract all significant entities mentioned in the CURRENT EPISODE. For each entity, provide a name and type. -Respond with a JSON object containing an "entities" array of objects, each with "name" and "type" properties. - -Guidelines: -1. 
Extract significant entities, concepts, or actors mentioned in the content. -2. Avoid creating nodes for relationships or actions. -3. Avoid creating nodes for temporal information like dates, times or years (these will be added to edges later). -4. **CLEAN ENTITY NAMES**: Extract ONLY the core entity name WITHOUT type descriptors: - - "Tesla car" → Name: "Tesla", Type: "Vehicle" - - Remove words like "app", "system", "platform", "tool", "service", "company" from entity names -5. Use full names when available and avoid abbreviations. - -${context.customPrompt || ""} -`; - - return [ - { role: "system", content: sysPrompt }, - { role: "user", content: userPrompt }, - ]; -}; /** * Resolve entity duplications @@ -286,84 +151,53 @@ export const dedupeNodes = (context: Record): CoreMessage[] => { return [ { role: "system", - content: `You are a helpful assistant who determines whether or not ENTITIES extracted from a conversation are duplicates of existing entities. + content: `You are a helpful assistant who determines whether extracted entities are duplicates of existing entities. -## CRITICAL RULE: Entity Type Matters -DO NOT mark entities with different types as duplicates, even if they have identical names. -- DO NOT mark "John" (Person) and "John" (Company) as duplicates -- DO NOT mark "Apple" (Company) and "Apple" (Fruit) as duplicates -- DO NOT mark "Core" (App) and "Core" (Concept) as duplicates - -Consider entities as potential duplicates ONLY if they have: -1. Similar or identical names AND -2. The EXACT SAME entity type +Focus on name-based similarity and contextual meaning to identify duplicates. Each entity in ENTITIES is represented as a JSON object with the following structure: { id: integer id of the entity, name: "name of the entity", - entity_type: "ontological classification of the entity", - entity_type_description: "Description of what the entity type represents", duplication_candidates: [ { idx: integer index of the candidate entity, name: "name of the candidate entity", - entity_type: "ontological classification of the candidate entity", ... 
} ] } -## Duplication Decision Rules -For each entity, determine if it is a duplicate of any of its duplication candidates: +## Duplication Decision Framework ### MARK AS DUPLICATE (duplicate_idx >= 0) when: -- Verify the candidate has the SAME entity_type as the current entity -- AND confirm the entities refer to the same real-world object or concept -- AND check that the names are very similar or identical - -### SPECIAL RULE FOR PREDICATES: -**ALWAYS mark identical predicates as duplicates** - predicates are universal and reusable: -- Mark "is associated with" (Predicate) vs "is associated with" (Predicate) → duplicate_idx = 0 ✓ -- Mark "works for" (Predicate) vs "works for" (Predicate) → duplicate_idx = 0 ✓ -- Mark "owns" (Predicate) vs "owns" (Predicate) → duplicate_idx = 0 ✓ +- **IDENTICAL NAMES**: Exact same name or obvious synonyms +- **SEMANTIC EQUIVALENCE**: Different names but clearly referring to the same entity +- **STRUCTURAL VARIATIONS**: Same entity with minor formatting differences ### DO NOT mark as duplicate (duplicate_idx = -1) when: -- Confirm the candidate has a DIFFERENT entity_type (even with identical names) -- Identify they are related but distinct entities -- Recognize they have similar names or purposes but refer to separate instances or concepts -- Distinguish when one is a general concept and the other is a specific instance -- **EXCEPTION**: DO NOT apply this rule to Predicates - always deduplicate identical predicates +- **DIFFERENT INSTANCES**: Similar names but different real-world entities +- **CONTEXTUAL DISTINCTION**: Same name but different contexts suggest distinct entities +- **HIERARCHICAL RELATIONSHIPS**: One is part of/contains the other -## Examples: +## Example Patterns: -**CORRECT - Mark as NOT Duplicates (Different Types):** -- Set "Tesla" (Company) vs "Tesla" (Car) → duplicate_idx = -1 -- Set "Apple" (Company) vs "Apple" (Fruit) → duplicate_idx = -1 -- Set "Core" (App) vs "Core" (System) → duplicate_idx = -1 +**DUPLICATE CASES:** +- "John Smith" vs "John Smith" → Check context for same person +- "Microsoft" vs "Microsoft Corporation" → Same organization (duplicate_idx = 0) +- "iPhone" vs "Apple iPhone" → Same product (duplicate_idx = 0) +- "Tier 1" vs "Tier 1" → Same classification level (duplicate_idx = 0) -**CORRECT - Mark Predicates AS Duplicates (Same Name, Same Type):** -- Set "is associated with" (Predicate) vs "is associated with" (Predicate) → duplicate_idx = 0 -- Set "works for" (Predicate) vs "works for" (Predicate) → duplicate_idx = 0 -- Set "owns" (Predicate) vs "owns" (Predicate) → duplicate_idx = 0 +**NOT DUPLICATE CASES:** +- "Meeting Room A" vs "Meeting Room B" → Different rooms (duplicate_idx = -1) +- "Project Alpha" vs "Project Beta" → Different projects (duplicate_idx = -1) +- "Essential" vs "Critical" → Different priority levels (duplicate_idx = -1) +- "Team Lead" vs "Team Member" → Different roles (duplicate_idx = -1) -**CORRECT - Evaluate Potential Duplicates (Same Type):** -- Check if "John Smith" (Person) vs "John Smith" (Person) refer to same person -- Check if "Microsoft" (Company) vs "Microsoft Corporation" (Company) are the same company -- Check if "iPhone" (Product) vs "Apple iPhone" (Product) are the same product - -**CORRECT - Mark as NOT Duplicates (Same Type, Different Instances):** -- Set "Meeting" (Event) vs "Meeting" (Event) → duplicate_idx = -1 (different meetings) -- Set "Project" (Task) vs "Project" (Task) → duplicate_idx = -1 (different projects) -- **NOTE**: DO NOT apply this rule to Predicates - 
always deduplicate identical predicates - -## Task: -Provide your response as a JSON object with an "entity_resolutions" array containing one entry for each entity. - -For each entity, include: -- "id": the id of the entity (integer) -- "name": the name of the entity (string) -- "duplicate_idx": the index of the duplicate candidate, or -1 if no duplicate (integer) +## Decision Guidelines: +- **CONSERVATIVE APPROACH**: When uncertain, prefer NOT marking as duplicate +- **CONTEXT MATTERS**: Consider the episode content and previous episodes +- **SEMANTIC MEANING**: Focus on whether they refer to the same real-world entity Format your response as follows: @@ -380,12 +214,9 @@ Format your response as follows: ## Important Instructions: -- FIRST check if entity types match before considering any duplication -- If entity types don't match, immediately set duplicate_idx = -1 -- Only mark entities with identical types as potential duplicates -- When in doubt, prefer NOT marking as duplicate (duplicate_idx = -1) - Always include all entities from the input in your response - Always wrap the output in these tags +- When in doubt, prefer NOT marking as duplicate (duplicate_idx = -1) `, }, { @@ -412,16 +243,21 @@ export const extractAttributes = ( ): CoreMessage[] => { const sysPrompt = ` You are an AI assistant that extracts and enhances entity attributes based on context. -Your task is to analyze entities and provide appropriate attribute values for each entity based on its type definition. +Your task is to analyze entities and provide appropriate attribute values based on available information. For each entity: -1. Look at its type and identify the required and optional attributes from the entity type definitions -2. Check if the entity already has values for these attributes -3. For missing attributes, extract appropriate values from the context if possible -4. For existing attributes, enhance or correct them if needed based on the context -5. Give empty attributes object ({}) when there are no attributes to update -6. Only include attributes that you're updating - don't repeat existing attributes that don't need changes -7. I'll merge your new attributes with the current attributes, so only provide values that should be added or modified +1. Analyze the context to identify relevant attributes for the entity +2. Extract appropriate values from the episode content if available +3. Focus on factual, descriptive attributes rather than type classifications +4. Give empty attributes object ({}) when there are no attributes to update +5. Only include attributes that you're adding or modifying +6. I'll merge your new attributes with existing ones, so only provide updates + +Common attribute types to consider: +- Descriptive properties (color, size, status, etc.) +- Relational context (role, position, relationship, etc.) +- Temporal information (duration, frequency, etc.) +- Qualitative aspects (importance, preference, etc.) Provide your output in this structure: @@ -441,10 +277,6 @@ Provide your output in this structure: `; const userPrompt = ` - -${JSON.stringify(context.entityTypes, null, 2)} - - ${JSON.stringify(context.entities, null, 2)} @@ -453,7 +285,7 @@ ${JSON.stringify(context.entities, null, 2)} ${context.episodeContent} -Based on the above information, please extract and enhance attributes for each entity according to its type definition. 
Return only the uuid and updated attributes for each entity.`;
+Based on the above information, please extract and enhance attributes for each entity based on the context. Return only the uuid and updated attributes for each entity.`;
   return [
     { role: "system", content: sysPrompt },
     { role: "user", content: userPrompt },
diff --git a/apps/webapp/app/services/prompts/statements.ts b/apps/webapp/app/services/prompts/statements.ts
index ed88236..5d956e8 100644
--- a/apps/webapp/app/services/prompts/statements.ts
+++ b/apps/webapp/app/services/prompts/statements.ts
@@ -132,10 +132,8 @@ Format your response as a JSON object with the following structure:
   "edges": [
     {
       "source": "[Subject Entity Name - MUST be from AVAILABLE ENTITIES]",
-      "sourceType": "[Source Entity Type]",
       "predicate": "[Relationship Type]",
       "target": "[Object Entity Name - MUST be from AVAILABLE ENTITIES]",
-      "targetType": "[Target Entity Type]",
       "fact": "[Natural language representation of the fact]",
       "attributes": {
         "confidence": confidence of the fact,
diff --git a/packages/types/src/graph/graph.entity.ts b/packages/types/src/graph/graph.entity.ts
index 3ca9274..32b8566 100644
--- a/packages/types/src/graph/graph.entity.ts
+++ b/packages/types/src/graph/graph.entity.ts
@@ -48,10 +48,10 @@ export interface EpisodicNode {
 export interface EntityNode {
   uuid: string;
   name: string;
-  type: string; // Single type - either from presets or custom
+  type?: string; // Optional type - can be inferred from statements
   attributes: Record<string, any>;
   nameEmbedding: number[];
-  typeEmbedding: number[];
+  typeEmbedding?: number[]; // Optional since type is optional
   createdAt: Date;
   userId: string;
   space?: string;
@@ -123,10 +123,10 @@ export type AddEpisodeResult = {
 
 export interface ExtractedTripleData {
   source: string;
-  sourceType: string;
+  sourceType?: string; // Optional - can be inferred from statements
   predicate: string;
   target: string;
-  targetType: string;
+  targetType?: string; // Optional - can be inferred from statements
   fact: string;
   attributes?: Record<string, any>;
 }
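
A small TypeScript sketch of the two patterns this patch relies on. In graphModels/entity.ts the hard failure on a missing typeEmbedding is replaced by assembling the Cypher SET clause and the parameter map together, so $typeEmbedding is only referenced when a value is actually supplied (Neo4j reports a missing-parameter error when a query names a parameter that is not passed). In knowledgeGraph.server.ts, subject and object resolution now keys on the entity name alone instead of the old name-plus-type pair. The sketch is illustrative only; buildSaveEntityQuery, EntityLike, and resolveByName are invented names for this note and are not code from the patch.

// Minimal sketch of the conditional save pattern, under the assumptions above.
interface EntityLike {
  uuid: string;
  name: string;
  nameEmbedding: number[];
  typeEmbedding?: number[]; // optional, mirroring the updated EntityNode type
}

function buildSaveEntityQuery(entity: EntityLike): {
  query: string;
  params: Record<string, unknown>;
} {
  const hasTypeEmbedding =
    !!entity.typeEmbedding && entity.typeEmbedding.length > 0;

  // Reference $typeEmbedding only when it will also be present in params.
  const query = `
    MERGE (n:Entity {uuid: $uuid})
    ON CREATE SET
      n.name = $name,
      n.nameEmbedding = $nameEmbedding${
        hasTypeEmbedding ? ",\n      n.typeEmbedding = $typeEmbedding" : ""
      }
    RETURN n.uuid as uuid
  `;

  const params: Record<string, unknown> = {
    uuid: entity.uuid,
    name: entity.name,
    nameEmbedding: entity.nameEmbedding,
  };
  if (hasTypeEmbedding) {
    params.typeEmbedding = entity.typeEmbedding;
  }
  return { query, params };
}

// An entity extracted without a type (and therefore without a typeEmbedding)
// can now be persisted instead of tripping the old missing-typeEmbedding guard.
const { query, params } = buildSaveEntityQuery({
  uuid: "00000000-0000-0000-0000-000000000000",
  name: "Profile",
  nameEmbedding: [0.1, 0.2, 0.3],
});
// In the real service, runQuery(query, params) would execute this statement.

// Name-only lookup, mirroring the type-free subject/object matching that
// replaced the earlier name-plus-type comparison during statement extraction.
const resolveByName = (entities: EntityLike[], name: string) =>
  entities.find((e) => e.name.toLowerCase() === name.toLowerCase());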