refactor: implement statement extraction and resolution

This commit is contained in:
Manoj K 2025-06-03 19:20:56 +05:30
parent 0853a30897
commit 96d829642b
5 changed files with 579 additions and 147 deletions

View File

@ -10,7 +10,8 @@ import {
import { LLMMappings, LLMModelEnum } from "@recall/types"; import { LLMMappings, LLMModelEnum } from "@recall/types";
import { logger } from "./logger.service"; import { logger } from "./logger.service";
import crypto from "crypto"; import crypto from "crypto";
import { extract_message, extract_text } from "./prompts/extractNodes"; import { dedupeNodes, extract_message, extract_text } from "./prompts/nodes";
import { extract_statements } from "./prompts/statements";
export enum EpisodeType { export enum EpisodeType {
Conversation = "CONVERSATION", Conversation = "CONVERSATION",
@ -41,7 +42,7 @@ export interface EpisodicNode {
* Entities represent subjects, objects, or predicates in statements * Entities represent subjects, objects, or predicates in statements
*/ */
export interface EntityNode { export interface EntityNode {
uuid?: string; uuid: string;
name: string; name: string;
type: string; type: string;
attributes: Record<string, any>; attributes: Record<string, any>;
@ -211,8 +212,8 @@ export class KnowledgeGraphService {
// Step 5: Statement Extraction - Extract statements (triples) instead of direct edges // Step 5: Statement Extraction - Extract statements (triples) instead of direct edges
const extractedStatements = await this.extractStatements( const extractedStatements = await this.extractStatements(
resolvedNodes,
episode, episode,
resolvedNodes,
previousEpisodes, previousEpisodes,
); );
@ -292,21 +293,27 @@ export class KnowledgeGraphService {
}, },
); );
const extractedEntities = JSON.parse(responseText || "{}").entities || [];
// Convert to EntityNode objects // Convert to EntityNode objects
const entities: EntityNode[] = []; const entities: EntityNode[] = [];
for (const entity of extractedEntities) { const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
entities.push({ if (outputMatch && outputMatch[1]) {
uuid: crypto.randomUUID(), responseText = outputMatch[1].trim();
name: entity.name, const extractedEntities = JSON.parse(responseText || "{}").entities || [];
type: entity.type,
attributes: entity.attributes || {}, entities.push(
nameEmbedding: [], // Will be populated later ...(await Promise.all(
createdAt: new Date(), extractedEntities.map(async (entity: any) => ({
userId: episode.userId, uuid: crypto.randomUUID(),
}); name: entity.name,
type: entity.type,
attributes: entity.attributes || {},
nameEmbedding: await this.getEmbedding(entity.name),
createdAt: new Date(),
userId: episode.userId,
})),
)),
);
} }
return entities; return entities;
@ -321,55 +328,102 @@ export class KnowledgeGraphService {
previousEpisodes: EpisodicNode[], previousEpisodes: EpisodicNode[],
): Promise<{ resolvedNodes: EntityNode[]; uuidMap: Map<string, string> }> { ): Promise<{ resolvedNodes: EntityNode[]; uuidMap: Map<string, string> }> {
const uuidMap = new Map<string, string>(); const uuidMap = new Map<string, string>();
const resolvedNodes: EntityNode[] = [];
for (const extractedNode of extractedNodes) { const existingNodesLists = await Promise.all(
// Generate embedding for the node name extractedNodes.map(async (extractedNode) => {
const nameEmbedding = await this.getEmbedding(extractedNode.name); // Check if a similar node already exists in HelixDB
// Use vector similarity search to find similar entities
// Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance)
const similarEntities = await helixClient.query("findSimilarEntities", {
queryEmbedding: extractedNode.nameEmbedding,
limit: 5, // Get top 5 matches
threshold: 0.85, // 85% similarity threshold
});
// Check if a similar node already exists in HelixDB return similarEntities.nodes;
// Use vector similarity search to find similar entities }),
// Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance) );
const similarEntities = await helixClient.query("findSimilarEntities", {
queryEmbedding: nameEmbedding, if (!existingNodesLists || existingNodesLists.length === 0) {
limit: 5, // Get top 5 matches extractedNodes.forEach((node) => {
threshold: 0.85, // 85% similarity threshold uuidMap.set(node.uuid, node.uuid);
}); });
return { resolvedNodes: extractedNodes, uuidMap };
const existingNodes = similarEntities.nodes;
// Get entity types dictionary or empty object if not provided
const entityTypesDict = entity_types || {};
if (similarEntities.length > 0) {
// If similar nodes exist, we need to decide if we want to merge with an existing one
// This could involve LLM to determine if they're the same entity
const existingNode = similarEntities[0];
// Map the extracted node UUID to the existing node UUID
uuidMap.set(extractedNode.uuid, existingNode.uuid);
// Add the existing node to our resolved nodes if not already present
if (!resolvedNodes.some((node) => node.uuid === existingNode.uuid)) {
resolvedNodes.push({
uuid: existingNode.uuid,
name: existingNode.name,
type: existingNode.type,
attributes: existingNode.attributes || {},
nameEmbedding: existingNode.nameEmbedding,
createdAt: new Date(existingNode.createdAt),
userId: existingNode.userId,
});
}
} else {
// This is a new entity, add embedding and keep as is
extractedNode.nameEmbedding = nameEmbedding;
resolvedNodes.push(extractedNode);
uuidMap.set(extractedNode.uuid, extractedNode.uuid);
}
} }
return { resolvedNodes, uuidMap }; // Prepare context for LLM
const extractedNodesContext = extractedNodes.map(
(node: EntityNode, i: number) => {
return {
id: i,
name: node.name,
entity_type: node.type,
entity_type_description: "Default Entity Type",
duplication_candidates: existingNodesLists[i].map(
(candidate: EntityNode, j: number) => ({
idx: j,
name: candidate.name,
entity_types: candidate.type,
...candidate.attributes,
}),
),
};
},
);
const context = {
extracted_nodes: extractedNodesContext,
episode_content: episode ? episode.content : "",
previous_episodes: previousEpisodes
? previousEpisodes.map((ep) => ep.content)
: [],
};
const messages = dedupeNodes(context);
let responseText = "";
await this.makeModelCall(
false,
LLMModelEnum.GPT41,
messages as CoreMessage[],
(text) => {
responseText = text;
},
);
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) {
responseText = outputMatch[1].trim();
const parsedResponse = JSON.parse(responseText);
const nodeResolutions = parsedResponse.entity_resolutions || [];
// Process each node resolution to either map to an existing node or keep as new
const resolvedNodes = nodeResolutions.map((resolution: any) => {
const resolutionId = resolution.id ?? -1;
const duplicateIdx = resolution.duplicate_idx ?? -1;
const extractedNode = extractedNodes[resolutionId];
// If a duplicate was found, use the existing node, otherwise use the extracted node
const resolvedNode =
duplicateIdx >= 0 &&
duplicateIdx < existingNodesLists[resolutionId]?.length
? existingNodesLists[resolutionId][duplicateIdx]
: extractedNode;
// Update the name if provided in the resolution
if (resolution.name) {
resolvedNode.name = resolution.name;
}
// Map the extracted UUID to the resolved UUID
uuidMap.set(extractedNode.uuid, resolvedNode.uuid);
return resolvedNode;
});
return { resolvedNodes, uuidMap };
}
} }
/** /**
@ -388,18 +442,16 @@ export class KnowledgeGraphService {
content: ep.content, content: ep.content,
createdAt: ep.createdAt.toISOString(), createdAt: ep.createdAt.toISOString(),
})), })),
nodes: resolvedNodes.map((node) => ({ entities: resolvedNodes.map((node) => ({
name: node.name, name: node.name,
type: node.type, type: node.type,
uuid: node.uuid, uuid: node.uuid,
})), })),
referenceTime: episode.validAt.toISOString(), referenceTime: episode.validAt.toISOString(),
relationshipTypes: {}, // Could be populated with relationship definitions
}; };
// Get the statement extraction prompt from the prompt library // Get the statement extraction prompt from the prompt library
// Note: You might need to update your prompts to extract subject-predicate-object patterns const messages = extract_statements(context);
const messages = promptLibrary.extractEdges.edge.call(context);
let responseText = ""; let responseText = "";
@ -413,65 +465,90 @@ export class KnowledgeGraphService {
); );
// Parse the statements from the LLM response // Parse the statements from the LLM response
// This will need to be updated based on your prompt format
const extractedTriples = JSON.parse(responseText || "{}").edges || []; const extractedTriples = JSON.parse(responseText || "{}").edges || [];
// Convert to Triple objects with Statement nodes // Convert extracted triples to Triple objects with Statement nodes
const triples: Triple[] = []; const triples = await Promise.all(
// Fix: Type 'any'.
extractedTriples.map(async (triple: any) => {
// Find the subject and object nodes
const subjectNode = resolvedNodes.find(
(node) => node.name.toLowerCase() === triple.source.toLowerCase(),
);
for (const triple of extractedTriples) { const objectNode = resolvedNodes.find(
const subjectNode = resolvedNodes.find( (node) => node.name.toLowerCase() === triple.target.toLowerCase(),
(node) => );
node.name.toLowerCase() === triple.sourceEntityName.toLowerCase(),
);
const objectNode = resolvedNodes.find( // Find or create a predicate node for the relationship type
(node) => const predicateNode = resolvedNodes.find(
node.name.toLowerCase() === triple.targetEntityName.toLowerCase(), (node) =>
); node.name.toLowerCase() === triple.relationship.toLowerCase(),
) || {
// Find or create a predicate node for the relationship type
const predicateNode = resolvedNodes.find(
(node) =>
node.name.toLowerCase() === triple.relationshipType.toLowerCase(),
) || {
uuid: crypto.randomUUID(),
name: triple.relationshipType,
type: "Predicate",
attributes: {},
nameEmbedding: [], // Will be populated later
createdAt: new Date(),
userId: episode.userId,
};
if (subjectNode && objectNode) {
// Generate embedding for the fact
const factEmbedding = await this.getEmbedding(triple.fact);
// Create a statement node
const statement: StatementNode = {
uuid: crypto.randomUUID(), uuid: crypto.randomUUID(),
fact: triple.fact, name: triple.relationship,
groupId: crypto.randomUUID().slice(0, 8), // Could be used to group related statements type: "Predicate",
attributes: {},
nameEmbedding: await this.getEmbedding(triple.relationship),
createdAt: new Date(), createdAt: new Date(),
validAt: triple.validAt ? new Date(triple.validAt) : episode.validAt,
invalidAt: triple.invalidAt ? new Date(triple.invalidAt) : undefined,
attributesJson: JSON.stringify({}), // Could store additional metadata
embedding: factEmbedding,
userId: episode.userId, userId: episode.userId,
}; };
triples.push({ if (subjectNode && objectNode) {
statement, // Create a statement node
subject: subjectNode, const statement: StatementNode = {
predicate: predicateNode, uuid: crypto.randomUUID(),
object: objectNode, fact: triple.fact,
provenance: episode, factEmbedding: await this.getEmbedding(triple.fact),
}); createdAt: new Date(),
} validAt: episode.validAt,
invalidAt: null,
attributes: triple.attributes || {},
userId: episode.userId,
};
return {
statement,
subject: subjectNode,
predicate: predicateNode,
object: objectNode,
provenance: episode,
};
}
return null;
}),
);
// Filter out null values (where subject or object wasn't found)
return triples.filter(Boolean) as Triple[];
}
private async resolvePredicateNodes(
triples: Triple[],
episode: EpisodicNode,
) {
const predicateNodes: EntityNode[] = triples.map((triple: Triple) => {
return triple.predicate;
});
if (predicateNodes.length === 0) {
return;
} }
return triples; const existingNodesLists = await Promise.all(
predicateNodes.map(async (predicateNode) => {
// Check if a similar node already exists in HelixDB
// Use vector similarity search to find similar entities
// Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance)
const similarEntities = await helixClient.query("findSimilarEntities", {
queryEmbedding: predicateNode.nameEmbedding,
limit: 5, // Get top 5 matches
threshold: 0.85, // 85% similarity threshold
});
return similarEntities.nodes;
}),
);
} }
/** /**

View File

@ -3,6 +3,6 @@
*/ */
// Export types from individual prompt modules // Export types from individual prompt modules
export { type ExtractedEntity, type ExtractedEntities } from "./extractNodes"; export { type ExtractedEntity, type ExtractedEntities } from "./nodes";
export { type Edge, type ExtractedEdges } from "./extractEdges"; export { type Edge, type ExtractedEdges } from "./extractEdges";
export { type ContradictionResult } from "./contradiction"; export { type ContradictionResult } from "./contradiction";

View File

@ -34,21 +34,13 @@ export interface EntityClassification {
export const extract_message = ( export const extract_message = (
context: Record<string, any>, context: Record<string, any>,
): CoreMessage[] => { ): CoreMessage[] => {
const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages. const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages for a reified knowledge graph.
Your primary task is to extract and classify significant entities mentioned in the conversation.`; Your primary task is to extract and classify significant entities mentioned in the conversation.
const userPrompt = ` In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements.
<PREVIOUS EPISODES> Focus on extracting:
${JSON.stringify(context.previousEpisodes || [], null, 2)} 1. Subject entities (people, objects, concepts)
</PREVIOUS EPISODES> 2. Object entities (people, objects, concepts)
<CURRENT EPISODE>
${context.episodeContent}
</CURRENT EPISODE>
<ENTITY TYPES>
${JSON.stringify(context.entityTypes || {}, null, 2)}
</ENTITY TYPES>
Instructions: Instructions:
@ -63,14 +55,34 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
- Assign the appropriate type for each one. - Assign the appropriate type for each one.
3. **Exclusions**: 3. **Exclusions**:
- Do NOT extract entities representing relationships or actions. - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
- Do NOT extract dates, times, or other temporal informationthese will be handled separately. - Do NOT extract dates, times, or other temporal informationthese will be handled separately.
4. **Formatting**: 4. **Formatting**:
- Be **explicit and unambiguous** in naming entities (e.g., use full names when available). - Be **explicit and unambiguous** in naming entities (e.g., use full names when available).
${context.customPrompt || ""}
`; Format your response as a JSON object with the following structure:
<output>
{
"entities": [
{
"name": "Entity Name",
"type": "Entity Type",
}
// Additional entities...
]
}
</output>`;
const userPrompt = `
<PREVIOUS EPISODES>
${JSON.stringify(context.previousEpisodes || [], null, 2)}
</PREVIOUS EPISODES>
<CURRENT EPISODE>
${context.episodeContent}
</CURRENT EPISODE>`;
return [ return [
{ role: "system", content: sysPrompt }, { role: "system", content: sysPrompt },
@ -82,29 +94,50 @@ ${context.customPrompt || ""}
* Extract entities from text-based content * Extract entities from text-based content
*/ */
export const extract_text = (context: Record<string, any>): CoreMessage[] => { export const extract_text = (context: Record<string, any>): CoreMessage[] => {
const sysPrompt = `You are an AI assistant that extracts entity nodes from text. const sysPrompt = `
Your primary task is to extract and classify the speaker and other significant entities mentioned in the provided text.`; You are an AI assistant that extracts entity nodes from text for a reified knowledge graph.
Your primary task is to extract and classify significant entities mentioned in the provided text.
In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements.
Focus on extracting:
1. Subject entities (people, objects, concepts)
2. Object entities (people, objects, concepts)
Instructions:
You are given a TEXT. Your task is to extract **entity nodes** mentioned **explicitly or implicitly** in the TEXT.
1. **Entity Identification**:
- Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT.
2. **Entity Classification**:
- Use the descriptions in ENTITY TYPES to classify each extracted entity.
- Assign the appropriate type for each one.
3. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately).
- Do NOT extract dates, times, or other temporal informationthese will be handled separately.
4. **Formatting**:
- Be **explicit and unambiguous** in naming entities (e.g., use full names when available).
Format your response as a JSON object with the following structure:
<output>
{
"entities": [
{
"name": "Entity Name",
"type": "Entity Type"
}
// Additional entities...
]
}
</output>`;
const userPrompt = ` const userPrompt = `
<TEXT> <TEXT>
${context.episodeContent} ${context.episodeContent}
</TEXT> </TEXT>
<ENTITY TYPES>
${JSON.stringify(context.entityTypes || {}, null, 2)}
</ENTITY TYPES>
Given the above text, extract entities from the TEXT that are explicitly or implicitly mentioned.
For each entity extracted, also determine its entity type based on the provided ENTITY TYPES and their descriptions.
Indicate the classified entity type by providing its entity_type_id.
${context.customPrompt || ""}
Guidelines:
1. Extract significant entities, concepts, or actors mentioned in the conversation.
2. Avoid creating nodes for relationships or actions.
3. Avoid creating nodes for temporal information like dates, times or years (these will be added to edges later).
4. Be as explicit as possible in your node names, using full names and avoiding abbreviations.
`; `;
return [ return [
@ -218,3 +251,82 @@ ${JSON.stringify(context.node, null, 2)}
}, },
]; ];
}; };
/**
 * Build the LLM prompt for entity de-duplication.
 *
 * Given freshly extracted entities plus their vector-search duplication
 * candidates, asks the model to decide which extracted entities refer to the
 * same real-world object/concept as an existing candidate.
 *
 * @param context - expects:
 *   - `extracted_nodes`: entities with `duplication_candidates` arrays
 *   - `episode_content` (or legacy `episodeContent`): current episode text
 *   - `previous_episodes` (or legacy `previousEpisodes`): prior episode contents
 * @returns system + user messages for the model call; the model's answer is
 *   wrapped in `<output>` tags containing an `entity_resolutions` JSON array.
 */
export const dedupeNodes = (context: Record<string, any>): CoreMessage[] => {
  // Bug fix: the resolver builds this context with snake_case keys
  // (`episode_content`, `previous_episodes`), but the template previously read
  // camelCase keys and therefore always interpolated `undefined` into the
  // prompt. Accept both spellings for backward compatibility.
  const episodeContent = context.episode_content ?? context.episodeContent ?? "";
  const previousEpisodes =
    context.previous_episodes ?? context.previousEpisodes ?? [];

  return [
    {
      role: "system",
      content: `You are a helpful assistant who determines whether or not ENTITIES extracted from a conversation are duplicates of existing entities.
Each entity in ENTITIES is represented as a JSON object with the following structure:
{
id: integer id of the entity,
name: "name of the entity",
entity_type: "ontological classification of the entity",
entity_type_description: "Description of what the entity type represents",
duplication_candidates: [
{
idx: integer index of the candidate entity,
name: "name of the candidate entity",
entity_type: "ontological classification of the candidate entity",
...<additional attributes>
}
]
}
For each of the above ENTITIES, determine if the entity is a duplicate of any of its duplication candidates.
Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
Do NOT mark entities as duplicates if:
- They are related but distinct.
- They have similar names or purposes but refer to separate instances or concepts.
Task:
Your response must be a JSON object with an "entity_resolutions" array containing one entry for each entity.
For each entity, include:
- "id": the id of the entity (integer)
- "name": the name of the entity (string)
- "duplicate_idx": the index of the duplicate candidate, or -1 if no duplicate (integer)
Format your response as follows:
<output>
{
"entity_resolutions": [
{
"id": 0,
"name": "Entity Name",
"duplicate_idx": -1
},
// Additional entity resolutions...
]
}
</output>
Notes:
- If an entity is a duplicate of one of its duplication_candidates, set duplicate_idx to the idx of that candidate.
- If an entity is not a duplicate of any candidate, set duplicate_idx to -1.
- Always include all entities from the input in your response.
- Always wrap the output in these tags <output> </output>
`,
    },
    {
      role: "user",
      content: `
<PREVIOUS EPISODES>
${JSON.stringify(previousEpisodes, null, 2)}
</PREVIOUS EPISODES>
<CURRENT EPISODE>
${episodeContent}
</CURRENT EPISODE>
<ENTITIES>
${JSON.stringify(context.extracted_nodes, null, 2)}
</ENTITIES>
`,
    },
  ];
};

View File

@ -0,0 +1,243 @@
import { type CoreMessage } from "ai";
import { type Triple } from "../knowledgeGraph.server";
/**
 * Extract statements (triples) from episode content in a reified knowledge graph model
 * This function generates a prompt for LLM to extract subject-predicate-object statements
 * and represent them as first-class nodes with proper connections
 *
 * @param context - expects `episodeContent` (string), `previousEpisodes`
 *   (JSON-serializable array) and `entities` (the AVAILABLE ENTITIES list the
 *   model must restrict subjects/objects to).
 * @returns system + user messages; the model's answer is an `<output>`-wrapped
 *   JSON object with an `edges` array of {source, relationship, target, fact,
 *   attributes} entries.
 */
export const extract_statements = (
  context: Record<string, any>,
): CoreMessage[] => {
  return [
    // System message: extraction rules, temporal-information requirements and
    // the exact <output> JSON schema the caller parses.
    {
      role: "system",
      content: `You are a knowledge graph expert that extracts factual statements from text as subject-predicate-object triples.
Your task is to identify important facts and represent them in a reified knowledge graph model
where each statement is a first-class node connected to subject, predicate, and object entities.
I need to extract factual statements from the following conversation/text and represent them in a reified knowledge graph.
Follow these instructions carefully:
1. Identify key factual statements from the episode content and previous episodes
2. Represent each statement as a subject-predicate-object triple
3. Only use entities from the AVAILABLE ENTITIES list as subjects and objects
4. For each statement, provide:
- The subject entity name (must match exactly one from AVAILABLE ENTITIES)
- The predicate/relationship (a clear, concise verb or relationship type)
- The object entity name (must match exactly one from AVAILABLE ENTITIES)
- A natural language fact that accurately represents the triple
- Any additional attributes relevant to the relationship
IMPORTANT ABOUT TEMPORAL INFORMATION:
- The system tracks when facts become known (validAt) and contradicted (invalidAt) separately
- You must include any temporal information WITHIN the fact statement itself
- For example, if someone worked at a company from 2015-2020, include this in the "fact" field and "attributes.timespan" field
- Do NOT omit temporal information from facts - it's critical context
- Examples of good temporal facts:
* "John worked at Google from 2015 to 2020"
* "Sarah lived in New York until 2018"
* "The project was completed on March 15, 2023"
Format your response as a JSON object with the following structure:
<output>
{
"edges": [
{
"source": "[Subject Entity Name]",
"relationship": "[Predicate/Relationship Type]",
"target": "[Object Entity Name]",
"fact": "[Natural language representation of the fact INCLUDING any temporal information]",
"attributes": {
"confidence": 0.9, // How confident you are in this fact (0-1)
"source": "explicit", // Whether the fact was explicitly stated or inferred
"timespan": { // Include if the fact has a specific time period
"start": "2015", // When the fact started being true (if known)
"end": "2020" // When the fact stopped being true (if known)
}
}
},
// Additional statements...
]
}
</output>
Important guidelines:
- Only include the most significant and factual statements
- Do not invent entities not present in the AVAILABLE ENTITIES list
- Be precise in representing the relationships
- Each fact should be atomic (representing a single piece of information)
- ALWAYS include temporal information when available (dates, periods, etc.) in both the fact text AND attributes
- Facts should be based on the episode content, not general knowledge
- Aim for quality over quantity, prioritize clear, unambiguous statements
- For ongoing facts (still true), omit the "end" field in timespan`,
    },
    // User message: the actual episode data the model extracts from.
    // NOTE(review): assumes the caller's context uses the camelCase keys
    // `episodeContent` / `previousEpisodes` — the caller hunk in this diff is
    // partially garbled, so confirm against knowledgeGraph.server.
    {
      role: "user",
      content: `
<EPISODE_CONTENT>
${context.episodeContent}
</EPISODE_CONTENT>
<PREVIOUS_EPISODES>
${JSON.stringify(context.previousEpisodes, null, 2)}
</PREVIOUS_EPISODES>
<AVAILABLE_ENTITIES>
${JSON.stringify(context.entities, null, 2)}
</AVAILABLE_ENTITIES>
`,
    },
  ];
};
/**
 * Detect contradictions between statements in the knowledge graph
 *
 * Builds a prompt asking the model whether a new statement contradicts any of
 * a set of existing statements, considering temporal validity.
 *
 * @param context - expects `newStatement` (string), `existingStatements`
 *   (JSON-serializable array) and `referenceTime` (string).
 * @returns system + user messages; the model's answer is a JSON object with
 *   `hasContradiction` and a `contradictedStatements` array.
 *
 * NOTE(review): unlike the other prompts in this module, the requested JSON
 * response here is NOT wrapped in <output> tags — confirm the caller's parsing
 * does not use the <output>-regex pattern used elsewhere.
 */
export const detect_contradictions = (
  context: Record<string, any>,
): CoreMessage[] => {
  return [
    // System message: fixed role description for the contradiction detector.
    {
      role: "system",
      content:
        "You are a knowledge graph reasoning expert that identifies contradictions between statements. " +
        "Your task is to analyze pairs of statements and determine if they contradict each other " +
        "based on their temporal validity and factual content.",
    },
    // User message: the new statement, its candidates and the reference time.
    {
      role: "user",
      content: `
I need to detect contradictions between statements in a temporal knowledge graph.
<NEW STATEMENT>
${context.newStatement}
</NEW STATEMENT>
<EXISTING STATEMENTS>
${JSON.stringify(context.existingStatements, null, 2)}
</EXISTING STATEMENTS>
<REFERENCE TIME>
${context.referenceTime}
</REFERENCE TIME>
Determine if the NEW STATEMENT contradicts any of the EXISTING STATEMENTS.
A contradiction occurs when:
1. Two statements assert incompatible facts about the same subject-predicate pair
2. The statements overlap in their temporal validity periods
For example, if one statement says "John works at Company A from January 2023" and another says
"John works at Company B from March 2023", these would contradict if a person can only work at one
company at a time.
Format your response as a JSON object with the following structure:
{
"hasContradiction": true/false,
"contradictedStatements": [
{
"statementId": "[ID of the contradicted statement]",
"reason": "[Explanation of why these statements contradict]",
"temporalRelationship": "[overlapping/containing/contained/after/before]"
}
]
}
Important guidelines:
- Consider the temporal validity of statements
- Only mark as contradictions if statements are truly incompatible
- Provide clear reasoning for each identified contradiction
- Consider the context and domain constraints
- If no contradictions exist, return an empty contradictedStatements array
`,
    },
  ];
};
/**
 * Analyze similar statements to determine duplications and contradictions
 * This prompt helps the LLM evaluate semantically similar statements found through vector search
 * to determine if they are duplicates or contradictions
 *
 * @param context - expects:
 *   - `newStatements`: Triple[] (statement + subject/predicate/object EntityNodes)
 *   - `similarStatements`: JSON-serializable array of existing statements
 *   - `episodeContent`: current episode text
 *   - `referenceTime`: reference timestamp string
 * @returns system + user messages; the model's answer is an `<output>`-wrapped
 *   JSON array of {statementId, isDuplicate, duplicateId, contradictions}.
 */
export const resolve_statements = (
  context: Record<string, any>,
): CoreMessage[] => {
  return [
    {
      role: "system",
      content: `You are a knowledge graph expert that analyzes statements to detect duplications and contradictions.
You analyze multiple new statements against existing statements to determine whether the new statement duplicates any existing statement or contradicts any existing statement.
Pay special attention to temporal aspects, event updates, and context changes. If an event changes (like a date shift), statements about the original event are likely contradicted by statements about the updated event.
I need to analyze whether a new statement duplicates or contradicts existing statements in a knowledge graph.
Follow these instructions carefully:
1. Analyze if the new statement is a semantic duplicate of any existing statement
- Two statements are duplicates if they express the same meaning even with different wording
- Consider entity resolution has already been done, so different entity names are NOT an issue
2. Determine if the new statement contradicts any existing valid statements
- Contradictions occur when statements cannot both be true at the same time
- Pay special attention to negations, opposites, and mutually exclusive facts
- Consider temporal validity - statements may only be contradictions within specific time periods
3. IMPORTANT: For events that change (like rescheduled appointments, moved dates, changed locations):
- When an event changes date/time/location, new statements about the updated event likely contradict statements about the original event
- Look for contextual clues about event changes, cancellations, or rescheduling
- Example: If "Concert on June 10" moved to "Concert on June 12", then "John attends June 10 concert" contradicts "John doesn't attend June 12 concert"
4. Format your response as a JSON object with the following structure:
<output>
[{
"statementId": "new_statement_uuid",
"isDuplicate": true/false,
"duplicateId": "existing_statement_uuid-if-duplicate-exists",
"contradictions": ["existing_statement_uuid-1", "existing_statement_uuid-2"], // UUIDs of any contradicted statements
}]
</output>
Important guidelines:
- If the new statement is a duplicate, include the UUID of the duplicate statement
- For contradictions, list all statement UUIDs that the new statement contradicts
- If a statement is both a contradiction AND a duplicate (rare case), mark it as a duplicate
- Identify temporal and contextual shifts that may create implicit contradictions
- Don't give any reason, just give the final output.
`,
    },
    {
      role: "user",
      // Bug fix: subject/predicate/object are EntityNode objects; interpolating
      // them directly rendered "[object Object]" in the prompt. Use their
      // `.name` fields so the model sees the actual entity names.
      content: `
<NEW_STATEMENTS>
${context.newStatements
  .map(
    (triple: Triple) => `
StatementId: ${triple.statement.uuid}
Fact: ${triple.statement.fact}
Subject: ${triple.subject.name}
Predicate: ${triple.predicate.name}
Object: ${triple.object.name}
---------------------------
`,
  )
  .join("")}
</NEW_STATEMENTS>
<SIMILAR_STATEMENTS>
${JSON.stringify(context.similarStatements, null, 2)}
</SIMILAR_STATEMENTS>
<EPISODE_CONTENT>
${context.episodeContent}
</EPISODE_CONTENT>
<REFERENCE_TIME>
${context.referenceTime}
</REFERENCE_TIME> `,
    },
  ];
};