mirror of https://github.com/eliasstepanik/core.git
synced 2026-01-11 18:38:27 +00:00

commit 96d829642b (parent 0853a30897)

    refactor: implement statement extraction and resolution

@@ -10,7 +10,8 @@ import {
 import { LLMMappings, LLMModelEnum } from "@recall/types";
 import { logger } from "./logger.service";
 import crypto from "crypto";
-import { extract_message, extract_text } from "./prompts/extractNodes";
+import { dedupeNodes, extract_message, extract_text } from "./prompts/nodes";
+import { extract_statements } from "./prompts/statements";

 export enum EpisodeType {
   Conversation = "CONVERSATION",

@@ -41,7 +42,7 @@ export interface EpisodicNode {
  * Entities represent subjects, objects, or predicates in statements
  */
 export interface EntityNode {
-  uuid?: string;
+  uuid: string;
   name: string;
   type: string;
   attributes: Record<string, any>;

@@ -211,8 +212,8 @@ export class KnowledgeGraphService {

     // Step 5: Statement Extraction - Extract statements (triples) instead of direct edges
     const extractedStatements = await this.extractStatements(
-      resolvedNodes,
       episode,
+      resolvedNodes,
       previousEpisodes,
     );


@@ -292,21 +293,27 @@ export class KnowledgeGraphService {
       },
     );

-    const extractedEntities = JSON.parse(responseText || "{}").entities || [];
-
     // Convert to EntityNode objects
     const entities: EntityNode[] = [];

-    for (const entity of extractedEntities) {
-      entities.push({
-        uuid: crypto.randomUUID(),
-        name: entity.name,
-        type: entity.type,
-        attributes: entity.attributes || {},
-        nameEmbedding: [], // Will be populated later
-        createdAt: new Date(),
-        userId: episode.userId,
-      });
+    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
+    if (outputMatch && outputMatch[1]) {
+      responseText = outputMatch[1].trim();
+      const extractedEntities = JSON.parse(responseText || "{}").entities || [];
+
+      entities.push(
+        ...(await Promise.all(
+          extractedEntities.map(async (entity: any) => ({
+            uuid: crypto.randomUUID(),
+            name: entity.name,
+            type: entity.type,
+            attributes: entity.attributes || {},
+            nameEmbedding: await this.getEmbedding(entity.name),
+            createdAt: new Date(),
+            userId: episode.userId,
+          })),
+        )),
+      );
     }

     return entities;
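
The rewritten extraction path above asks the model to wrap its JSON in <output> tags and only parses what sits inside that wrapper. A minimal standalone sketch of the same parsing step, in TypeScript (ParsedEntity and parseOutputJson are illustrative names, not part of the commit):

// Hypothetical helper mirroring the parsing flow in the hunk above.
interface ParsedEntity {
  name: string;
  type: string;
  attributes?: Record<string, unknown>;
}

function parseOutputJson(responseText: string): ParsedEntity[] {
  // Pull the JSON payload out of the <output>...</output> wrapper the prompt requests.
  const match = responseText.match(/<output>([\s\S]*?)<\/output>/);
  const payload = match?.[1]?.trim() ?? "{}";
  return JSON.parse(payload).entities ?? [];
}

One consequence of the guard: if the model omits the <output> wrapper entirely, extractEntities now returns an empty list instead of attempting to parse the raw response.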

@@ -321,56 +328,103 @@ export class KnowledgeGraphService {
     previousEpisodes: EpisodicNode[],
   ): Promise<{ resolvedNodes: EntityNode[]; uuidMap: Map<string, string> }> {
     const uuidMap = new Map<string, string>();
-    const resolvedNodes: EntityNode[] = [];
-
-    for (const extractedNode of extractedNodes) {
-      // Generate embedding for the node name
-      const nameEmbedding = await this.getEmbedding(extractedNode.name);
-
-      // Check if a similar node already exists in HelixDB
-      // Use vector similarity search to find similar entities
-      // Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance)
-      const similarEntities = await helixClient.query("findSimilarEntities", {
-        queryEmbedding: nameEmbedding,
-        limit: 5, // Get top 5 matches
-        threshold: 0.85, // 85% similarity threshold
-      });
-
-      const existingNodes = similarEntities.nodes;
-
-      // Get entity types dictionary or empty object if not provided
-      const entityTypesDict = entity_types || {};
-
-      if (similarEntities.length > 0) {
-        // If similar nodes exist, we need to decide if we want to merge with an existing one
-        // This could involve LLM to determine if they're the same entity
-        const existingNode = similarEntities[0];
-
-        // Map the extracted node UUID to the existing node UUID
-        uuidMap.set(extractedNode.uuid, existingNode.uuid);
-
-        // Add the existing node to our resolved nodes if not already present
-        if (!resolvedNodes.some((node) => node.uuid === existingNode.uuid)) {
-          resolvedNodes.push({
-            uuid: existingNode.uuid,
-            name: existingNode.name,
-            type: existingNode.type,
-            attributes: existingNode.attributes || {},
-            nameEmbedding: existingNode.nameEmbedding,
-            createdAt: new Date(existingNode.createdAt),
-            userId: existingNode.userId,
-          });
-        }
-      } else {
-        // This is a new entity, add embedding and keep as is
-        extractedNode.nameEmbedding = nameEmbedding;
-        resolvedNodes.push(extractedNode);
-        uuidMap.set(extractedNode.uuid, extractedNode.uuid);
-      }
-    }
-
-    return { resolvedNodes, uuidMap };
-  }
+
+    const existingNodesLists = await Promise.all(
+      extractedNodes.map(async (extractedNode) => {
+        // Check if a similar node already exists in HelixDB
+        // Use vector similarity search to find similar entities
+        // Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance)
+        const similarEntities = await helixClient.query("findSimilarEntities", {
+          queryEmbedding: extractedNode.nameEmbedding,
+          limit: 5, // Get top 5 matches
+          threshold: 0.85, // 85% similarity threshold
+        });
+
+        return similarEntities.nodes;
+      }),
+    );
+
+    if (!existingNodesLists || existingNodesLists.length === 0) {
+      extractedNodes.forEach((node) => {
+        uuidMap.set(node.uuid, node.uuid);
+      });
+      return { resolvedNodes: extractedNodes, uuidMap };
+    }
+
+    // Prepare context for LLM
+    const extractedNodesContext = extractedNodes.map(
+      (node: EntityNode, i: number) => {
+        return {
+          id: i,
+          name: node.name,
+          entity_type: node.type,
+          entity_type_description: "Default Entity Type",
+          duplication_candidates: existingNodesLists[i].map(
+            (candidate: EntityNode, j: number) => ({
+              idx: j,
+              name: candidate.name,
+              entity_types: candidate.type,
+              ...candidate.attributes,
+            }),
+          ),
+        };
+      },
+    );
+
+    const context = {
+      extracted_nodes: extractedNodesContext,
+      episode_content: episode ? episode.content : "",
+      previous_episodes: previousEpisodes
+        ? previousEpisodes.map((ep) => ep.content)
+        : [],
+    };
+
+    const messages = dedupeNodes(context);
+
+    let responseText = "";
+
+    await this.makeModelCall(
+      false,
+      LLMModelEnum.GPT41,
+      messages as CoreMessage[],
+      (text) => {
+        responseText = text;
+      },
+    );
+
+    const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
+    if (outputMatch && outputMatch[1]) {
+      responseText = outputMatch[1].trim();
+      const parsedResponse = JSON.parse(responseText);
+      const nodeResolutions = parsedResponse.entity_resolutions || [];
+
+      // Process each node resolution to either map to an existing node or keep as new
+      const resolvedNodes = nodeResolutions.map((resolution: any) => {
+        const resolutionId = resolution.id ?? -1;
+        const duplicateIdx = resolution.duplicate_idx ?? -1;
+        const extractedNode = extractedNodes[resolutionId];
+
+        // If a duplicate was found, use the existing node, otherwise use the extracted node
+        const resolvedNode =
+          duplicateIdx >= 0 &&
+          duplicateIdx < existingNodesLists[resolutionId]?.length
+            ? existingNodesLists[resolutionId][duplicateIdx]
+            : extractedNode;
+
+        // Update the name if provided in the resolution
+        if (resolution.name) {
+          resolvedNode.name = resolution.name;
+        }
+
+        // Map the extracted UUID to the resolved UUID
+        uuidMap.set(extractedNode.uuid, resolvedNode.uuid);
+
+        return resolvedNode;
+      });
+
+      return { resolvedNodes, uuidMap };
+    }
+  }

   /**
    * Extract statements as first-class objects from an episode using LLM
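
The resolution step above consumes entity_resolutions entries of the form { id, name, duplicate_idx }. A minimal sketch of just the mapping rule (resolveOne is an illustrative name; in the commit this logic lives inline in resolveExtractedNodes), assuming candidates[i] holds the duplication candidates fetched for extracted node i:

type Resolution = { id: number; name?: string; duplicate_idx: number };

function resolveOne<T extends { uuid: string; name: string }>(
  resolution: Resolution,
  extracted: T[],
  candidates: T[][],
): T {
  const extractedNode = extracted[resolution.id];
  // A non-negative duplicate_idx points into this node's candidate list;
  // anything else means the extracted node is kept as a brand-new entity.
  const resolved =
    resolution.duplicate_idx >= 0 &&
    resolution.duplicate_idx < (candidates[resolution.id]?.length ?? 0)
      ? candidates[resolution.id][resolution.duplicate_idx]
      : extractedNode;
  if (resolution.name) resolved.name = resolution.name;
  return resolved;
}

The committed version additionally records uuidMap.set(extractedNode.uuid, resolvedNode.uuid), presumably so later steps can translate the provisional UUIDs of merged entities.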

@@ -388,18 +442,16 @@ export class KnowledgeGraphService {
         content: ep.content,
         createdAt: ep.createdAt.toISOString(),
       })),
-      nodes: resolvedNodes.map((node) => ({
+      entities: resolvedNodes.map((node) => ({
         name: node.name,
         type: node.type,
         uuid: node.uuid,
       })),
       referenceTime: episode.validAt.toISOString(),
-      relationshipTypes: {}, // Could be populated with relationship definitions
     };

     // Get the statement extraction prompt from the prompt library
-    // Note: You might need to update your prompts to extract subject-predicate-object patterns
-    const messages = promptLibrary.extractEdges.edge.call(context);
+    const messages = extract_statements(context);

     let responseText = "";

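
After this rename, the context handed to extract_statements has roughly the following shape. This is a sketch reconstructed from the hunk above; episodeContent is assumed from the template in statements.ts, since its assignment falls outside the hunk:

interface StatementExtractionContext {
  episodeContent: string; // assumed: interpolated by the prompt template
  previousEpisodes: { content: string; createdAt: string }[];
  entities: { name: string; type: string; uuid: string }[];
  referenceTime: string; // episode.validAt as an ISO-8601 string
}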

@@ -413,65 +465,90 @@ export class KnowledgeGraphService {
     );

     // Parse the statements from the LLM response
-    // This will need to be updated based on your prompt format
     const extractedTriples = JSON.parse(responseText || "{}").edges || [];

-    // Convert to Triple objects with Statement nodes
-    const triples: Triple[] = [];
-
-    for (const triple of extractedTriples) {
-      const subjectNode = resolvedNodes.find(
-        (node) =>
-          node.name.toLowerCase() === triple.sourceEntityName.toLowerCase(),
-      );
-
-      const objectNode = resolvedNodes.find(
-        (node) =>
-          node.name.toLowerCase() === triple.targetEntityName.toLowerCase(),
-      );
-
-      // Find or create a predicate node for the relationship type
-      const predicateNode = resolvedNodes.find(
-        (node) =>
-          node.name.toLowerCase() === triple.relationshipType.toLowerCase(),
-      ) || {
-        uuid: crypto.randomUUID(),
-        name: triple.relationshipType,
-        type: "Predicate",
-        attributes: {},
-        nameEmbedding: [], // Will be populated later
-        createdAt: new Date(),
-        userId: episode.userId,
-      };
-
-      if (subjectNode && objectNode) {
-        // Generate embedding for the fact
-        const factEmbedding = await this.getEmbedding(triple.fact);
-
-        // Create a statement node
-        const statement: StatementNode = {
-          uuid: crypto.randomUUID(),
-          fact: triple.fact,
-          groupId: crypto.randomUUID().slice(0, 8), // Could be used to group related statements
-          createdAt: new Date(),
-          validAt: triple.validAt ? new Date(triple.validAt) : episode.validAt,
-          invalidAt: triple.invalidAt ? new Date(triple.invalidAt) : undefined,
-          attributesJson: JSON.stringify({}), // Could store additional metadata
-          embedding: factEmbedding,
-          userId: episode.userId,
-        };
-
-        triples.push({
-          statement,
-          subject: subjectNode,
-          predicate: predicateNode,
-          object: objectNode,
-          provenance: episode,
-        });
-      }
-    }
-
-    return triples;
-  }
+    // Convert extracted triples to Triple objects with Statement nodes
+    const triples = await Promise.all(
+      // Fix: Type 'any'.
+      extractedTriples.map(async (triple: any) => {
+        // Find the subject and object nodes
+        const subjectNode = resolvedNodes.find(
+          (node) => node.name.toLowerCase() === triple.source.toLowerCase(),
+        );
+
+        const objectNode = resolvedNodes.find(
+          (node) => node.name.toLowerCase() === triple.target.toLowerCase(),
+        );
+
+        // Find or create a predicate node for the relationship type
+        const predicateNode = resolvedNodes.find(
+          (node) =>
+            node.name.toLowerCase() === triple.relationship.toLowerCase(),
+        ) || {
+          uuid: crypto.randomUUID(),
+          name: triple.relationship,
+          type: "Predicate",
+          attributes: {},
+          nameEmbedding: await this.getEmbedding(triple.relationship),
+          createdAt: new Date(),
+          userId: episode.userId,
+        };
+
+        if (subjectNode && objectNode) {
+          // Create a statement node
+          const statement: StatementNode = {
+            uuid: crypto.randomUUID(),
+            fact: triple.fact,
+            factEmbedding: await this.getEmbedding(triple.fact),
+            createdAt: new Date(),
+            validAt: episode.validAt,
+            invalidAt: null,
+            attributes: triple.attributes || {},
+            userId: episode.userId,
+          };
+
+          return {
+            statement,
+            subject: subjectNode,
+            predicate: predicateNode,
+            object: objectNode,
+            provenance: episode,
+          };
+        }
+        return null;
+      }),
+    );
+
+    // Filter out null values (where subject or object wasn't found)
+    return triples.filter(Boolean) as Triple[];
+  }
+
+  private async resolvePredicateNodes(
+    triples: Triple[],
+    episode: EpisodicNode,
+  ) {
+    const predicateNodes: EntityNode[] = triples.map((triple: Triple) => {
+      return triple.predicate;
+    });
+
+    if (predicateNodes.length === 0) {
+      return;
+    }
+
+    const existingNodesLists = await Promise.all(
+      predicateNodes.map(async (predicateNode) => {
+        // Check if a similar node already exists in HelixDB
+        // Use vector similarity search to find similar entities
+        // Threshold is 0.85 - meaning at least 85% similarity (lower cosine distance)
+        const similarEntities = await helixClient.query("findSimilarEntities", {
+          queryEmbedding: predicateNode.nameEmbedding,
+          limit: 5, // Get top 5 matches
+          threshold: 0.85, // 85% similarity threshold
+        });
+
+        return similarEntities.nodes;
+      }),
+    );
+  }

   /**
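
The converted extractStatements now maps each candidate triple to either a Triple or null (when the subject or object has no matching resolved entity) and strips the nulls afterwards. A compact, self-contained sketch of that map-then-filter pattern (buildTriples is an illustrative name, not part of the commit):

async function buildTriples<T, R>(
  items: T[],
  build: (item: T) => Promise<R | null>,
): Promise<R[]> {
  // Build all candidates concurrently; unresolved ones come back as null.
  const results = await Promise.all(items.map(build));
  // Drop the nulls in one pass while keeping the element type narrow.
  return results.filter((r): r is R => r !== null);
}

The committed code reaches the same result with triples.filter(Boolean) as Triple[]; a type-guard predicate like the one above would achieve it without the cast.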

@@ -3,6 +3,6 @@
  */

 // Export types from individual prompt modules
-export { type ExtractedEntity, type ExtractedEntities } from "./extractNodes";
+export { type ExtractedEntity, type ExtractedEntities } from "./nodes";
 export { type Edge, type ExtractedEdges } from "./extractEdges";
 export { type ContradictionResult } from "./contradiction";

@@ -34,21 +34,13 @@ export interface EntityClassification {
 export const extract_message = (
   context: Record<string, any>,
 ): CoreMessage[] => {
-  const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages.
-Your primary task is to extract and classify significant entities mentioned in the conversation.`;
+  const sysPrompt = `You are an AI assistant that extracts entity nodes from conversational messages for a reified knowledge graph.
+Your primary task is to extract and classify significant entities mentioned in the conversation.

-  const userPrompt = `
-<PREVIOUS EPISODES>
-${JSON.stringify(context.previousEpisodes || [], null, 2)}
-</PREVIOUS EPISODES>
-
-<CURRENT EPISODE>
-${context.episodeContent}
-</CURRENT EPISODE>
-
-<ENTITY TYPES>
-${JSON.stringify(context.entityTypes || {}, null, 2)}
-</ENTITY TYPES>
+In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements.
+Focus on extracting:
+1. Subject entities (people, objects, concepts)
+2. Object entities (people, objects, concepts)

 Instructions:

@@ -63,14 +55,34 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
    - Assign the appropriate type for each one.

 3. **Exclusions**:
-   - Do NOT extract entities representing relationships or actions.
+   - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
    - Do NOT extract dates, times, or other temporal information—these will be handled separately.

 4. **Formatting**:
    - Be **explicit and unambiguous** in naming entities (e.g., use full names when available).

-${context.customPrompt || ""}
-`;
+Format your response as a JSON object with the following structure:
+<output>
+{
+  "entities": [
+    {
+      "name": "Entity Name",
+      "type": "Entity Type",
+    }
+    // Additional entities...
+  ]
+}
+</output>`;
+
+  const userPrompt = `
+<PREVIOUS EPISODES>
+${JSON.stringify(context.previousEpisodes || [], null, 2)}
+</PREVIOUS EPISODES>
+
+<CURRENT EPISODE>
+${context.episodeContent}
+</CURRENT EPISODE>`;

   return [
     { role: "system", content: sysPrompt },
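
A hedged usage sketch of the reworked extract_message (the episode content is invented for illustration; the import path follows the service-side import in the first hunk of this commit):

import { type CoreMessage } from "ai";
import { extract_message } from "./prompts/nodes";

const messages: CoreMessage[] = extract_message({
  episodeContent: "User: I started at ACME Corp last month.",
  previousEpisodes: [],
});
// messages[0] carries the system rules plus the <output> JSON contract;
// the user message with the episode tags follows, per the template above.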

@@ -82,29 +94,50 @@ ${context.customPrompt || ""}
  * Extract entities from text-based content
  */
 export const extract_text = (context: Record<string, any>): CoreMessage[] => {
-  const sysPrompt = `You are an AI assistant that extracts entity nodes from text.
-Your primary task is to extract and classify the speaker and other significant entities mentioned in the provided text.`;
+  const sysPrompt = `
+You are an AI assistant that extracts entity nodes from text for a reified knowledge graph.
+Your primary task is to extract and classify significant entities mentioned in the provided text.
+
+In a reified knowledge graph, we need to identify subject and object entities that will be connected through statements.
+Focus on extracting:
+1. Subject entities (people, objects, concepts)
+2. Object entities (people, objects, concepts)
+
+Instructions:
+
+You are given a TEXT. Your task is to extract **entity nodes** mentioned **explicitly or implicitly** in the TEXT.
+
+1. **Entity Identification**:
+   - Extract all significant entities, concepts, or actors that are **explicitly or implicitly** mentioned in the TEXT.
+
+2. **Entity Classification**:
+   - Use the descriptions in ENTITY TYPES to classify each extracted entity.
+   - Assign the appropriate type for each one.
+
+3. **Exclusions**:
+   - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
+   - Do NOT extract dates, times, or other temporal information—these will be handled separately.
+
+4. **Formatting**:
+   - Be **explicit and unambiguous** in naming entities (e.g., use full names when available).
+
+Format your response as a JSON object with the following structure:
+<output>
+{
+  "entities": [
+    {
+      "name": "Entity Name",
+      "type": "Entity Type"
+    }
+    // Additional entities...
+  ]
+}
+</output>`;
   const userPrompt = `
 <TEXT>
 ${context.episodeContent}
 </TEXT>

-<ENTITY TYPES>
-${JSON.stringify(context.entityTypes || {}, null, 2)}
-</ENTITY TYPES>
-
-Given the above text, extract entities from the TEXT that are explicitly or implicitly mentioned.
-For each entity extracted, also determine its entity type based on the provided ENTITY TYPES and their descriptions.
-Indicate the classified entity type by providing its entity_type_id.
-
-${context.customPrompt || ""}
-
-Guidelines:
-1. Extract significant entities, concepts, or actors mentioned in the conversation.
-2. Avoid creating nodes for relationships or actions.
-3. Avoid creating nodes for temporal information like dates, times or years (these will be added to edges later).
-4. Be as explicit as possible in your node names, using full names and avoiding abbreviations.
-
 `;

   return [

@@ -218,3 +251,82 @@ ${JSON.stringify(context.node, null, 2)}
     },
   ];
 };
+
+/**
+ * Resolve entity duplications
+ */
+export const dedupeNodes = (context: Record<string, any>): CoreMessage[] => {
+  return [
+    {
+      role: "system",
+      content: `You are a helpful assistant who determines whether or not ENTITIES extracted from a conversation are duplicates of existing entities.
+
+Each entity in ENTITIES is represented as a JSON object with the following structure:
+{
+  id: integer id of the entity,
+  name: "name of the entity",
+  entity_type: "ontological classification of the entity",
+  entity_type_description: "Description of what the entity type represents",
+  duplication_candidates: [
+    {
+      idx: integer index of the candidate entity,
+      name: "name of the candidate entity",
+      entity_type: "ontological classification of the candidate entity",
+      ...<additional attributes>
+    }
+  ]
+}
+
+For each of the above ENTITIES, determine if the entity is a duplicate of any of its duplication candidates.
+Entities should only be considered duplicates if they refer to the *same real-world object or concept*.
+Do NOT mark entities as duplicates if:
+- They are related but distinct.
+- They have similar names or purposes but refer to separate instances or concepts.
+
+Task:
+Your response must be a JSON object with an "entity_resolutions" array containing one entry for each entity.
+
+For each entity, include:
+- "id": the id of the entity (integer)
+- "name": the name of the entity (string)
+- "duplicate_idx": the index of the duplicate candidate, or -1 if no duplicate (integer)
+
+Format your response as follows:
+<output>
+{
+  "entity_resolutions": [
+    {
+      "id": 0,
+      "name": "Entity Name",
+      "duplicate_idx": -1
+    },
+    // Additional entity resolutions...
+  ]
+}
+</output>
+
+Notes:
+- If an entity is a duplicate of one of its duplication_candidates, set duplicate_idx to the idx of that candidate.
+- If an entity is not a duplicate of any candidate, set duplicate_idx to -1.
+- Always include all entities from the input in your response.
+- Always wrap the output in these tags <output> </output>
+`,
+    },
+    {
+      role: "user",
+      content: `
+<PREVIOUS EPISODES>
+${JSON.stringify(context.previousEpisodes || [], null, 2)}
+</PREVIOUS EPISODES>
+
+<CURRENT EPISODE>
+${context.episodeContent}
+</CURRENT EPISODE>
+
+<ENTITIES>
+${JSON.stringify(context.extracted_nodes, null, 2)}
+</ENTITIES>
+`,
+    },
+  ];
+};
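
dedupeNodes builds the two-message payload that resolveExtractedNodes feeds to makeModelCall. A hedged usage sketch (the entity values are invented for illustration; field names mirror the structure documented in the system prompt above):

import { dedupeNodes } from "./prompts/nodes";

const messages = dedupeNodes({
  episodeContent: "Alice met Bob at ACME Corp.",
  previousEpisodes: [],
  extracted_nodes: [
    {
      id: 0,
      name: "Alice",
      entity_type: "Person",
      entity_type_description: "Default Entity Type",
      duplication_candidates: [
        { idx: 0, name: "Alice Smith", entity_types: "Person" },
      ],
    },
  ],
});
// A duplicate_idx of 0 in the response would merge "Alice" into "Alice Smith";
// -1 would keep her as a new entity.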

apps/webapp/app/services/prompts/statements.ts (new file, 243 lines)

@@ -0,0 +1,243 @@
+import { type CoreMessage } from "ai";
+import { type Triple } from "../knowledgeGraph.server";
+
+/**
+ * Extract statements (triples) from episode content in a reified knowledge graph model
+ * This function generates a prompt for LLM to extract subject-predicate-object statements
+ * and represent them as first-class nodes with proper connections
+ */
+export const extract_statements = (
+  context: Record<string, any>,
+): CoreMessage[] => {
+  return [
+    {
+      role: "system",
+      content: `You are a knowledge graph expert that extracts factual statements from text as subject-predicate-object triples.
+Your task is to identify important facts and represent them in a reified knowledge graph model
+where each statement is a first-class node connected to subject, predicate, and object entities.
+
+I need to extract factual statements from the following conversation/text and represent them in a reified knowledge graph.
+
+Follow these instructions carefully:
+
+1. Identify key factual statements from the episode content and previous episodes
+2. Represent each statement as a subject-predicate-object triple
+3. Only use entities from the AVAILABLE ENTITIES list as subjects and objects
+4. For each statement, provide:
+   - The subject entity name (must match exactly one from AVAILABLE ENTITIES)
+   - The predicate/relationship (a clear, concise verb or relationship type)
+   - The object entity name (must match exactly one from AVAILABLE ENTITIES)
+   - A natural language fact that accurately represents the triple
+   - Any additional attributes relevant to the relationship
+
+IMPORTANT ABOUT TEMPORAL INFORMATION:
+- The system tracks when facts become known (validAt) and contradicted (invalidAt) separately
+- You must include any temporal information WITHIN the fact statement itself
+- For example, if someone worked at a company from 2015-2020, include this in the "fact" field and "attributes.timespan" field
+- Do NOT omit temporal information from facts - it's critical context
+- Examples of good temporal facts:
+  * "John worked at Google from 2015 to 2020"
+  * "Sarah lived in New York until 2018"
+  * "The project was completed on March 15, 2023"
+
+Format your response as a JSON object with the following structure:
+<output>
+{
+  "edges": [
+    {
+      "source": "[Subject Entity Name]",
+      "relationship": "[Predicate/Relationship Type]",
+      "target": "[Object Entity Name]",
+      "fact": "[Natural language representation of the fact INCLUDING any temporal information]",
+      "attributes": {
+        "confidence": 0.9, // How confident you are in this fact (0-1)
+        "source": "explicit", // Whether the fact was explicitly stated or inferred
+        "timespan": { // Include if the fact has a specific time period
+          "start": "2015", // When the fact started being true (if known)
+          "end": "2020" // When the fact stopped being true (if known)
+        }
+      }
+    },
+    // Additional statements...
+  ]
+}
+</output>
+
+Important guidelines:
+- Only include the most significant and factual statements
+- Do not invent entities not present in the AVAILABLE ENTITIES list
+- Be precise in representing the relationships
+- Each fact should be atomic (representing a single piece of information)
+- ALWAYS include temporal information when available (dates, periods, etc.) in both the fact text AND attributes
+- Facts should be based on the episode content, not general knowledge
+- Aim for quality over quantity, prioritize clear, unambiguous statements
+- For ongoing facts (still true), omit the "end" field in timespan`,
+    },
+    {
+      role: "user",
+      content: `
+<EPISODE_CONTENT>
+${context.episodeContent}
+</EPISODE_CONTENT>
+
+<PREVIOUS_EPISODES>
+${JSON.stringify(context.previousEpisodes, null, 2)}
+</PREVIOUS_EPISODES>
+
+<AVAILABLE_ENTITIES>
+${JSON.stringify(context.entities, null, 2)}
+</AVAILABLE_ENTITIES>
+`,
+    },
+  ];
+};
+
+/**
+ * Detect contradictions between statements in the knowledge graph
+ */
+export const detect_contradictions = (
+  context: Record<string, any>,
+): CoreMessage[] => {
+  return [
+    {
+      role: "system",
+      content:
+        "You are a knowledge graph reasoning expert that identifies contradictions between statements. " +
+        "Your task is to analyze pairs of statements and determine if they contradict each other " +
+        "based on their temporal validity and factual content.",
+    },
+    {
+      role: "user",
+      content: `
+I need to detect contradictions between statements in a temporal knowledge graph.
+
+<NEW STATEMENT>
+${context.newStatement}
+</NEW STATEMENT>
+
+<EXISTING STATEMENTS>
+${JSON.stringify(context.existingStatements, null, 2)}
+</EXISTING STATEMENTS>
+
+<REFERENCE TIME>
+${context.referenceTime}
+</REFERENCE TIME>
+
+Determine if the NEW STATEMENT contradicts any of the EXISTING STATEMENTS.
+A contradiction occurs when:
+
+1. Two statements assert incompatible facts about the same subject-predicate pair
+2. The statements overlap in their temporal validity periods
+
+For example, if one statement says "John works at Company A from January 2023" and another says
+"John works at Company B from March 2023", these would contradict if a person can only work at one
+company at a time.
+
+Format your response as a JSON object with the following structure:
+{
+  "hasContradiction": true/false,
+  "contradictedStatements": [
+    {
+      "statementId": "[ID of the contradicted statement]",
+      "reason": "[Explanation of why these statements contradict]",
+      "temporalRelationship": "[overlapping/containing/contained/after/before]"
+    }
+  ]
+}
+
+Important guidelines:
+- Consider the temporal validity of statements
+- Only mark as contradictions if statements are truly incompatible
+- Provide clear reasoning for each identified contradiction
+- Consider the context and domain constraints
+- If no contradictions exist, return an empty contradictedStatements array
+`,
+    },
+  ];
+};
+
+/**
+ * Analyze similar statements to determine duplications and contradictions
+ * This prompt helps the LLM evaluate semantically similar statements found through vector search
+ * to determine if they are duplicates or contradictions
+ */
+export const resolve_statements = (
+  context: Record<string, any>,
+): CoreMessage[] => {
+  return [
+    {
+      role: "system",
+      content: `You are a knowledge graph expert that analyzes statements to detect duplications and contradictions.
+You analyze multiple new statements against existing statements to determine whether the new statement duplicates any existing statement or contradicts any existing statement.
+Pay special attention to temporal aspects, event updates, and context changes. If an event changes (like a date shift), statements about the original event are likely contradicted by statements about the updated event.
+
+I need to analyze whether a new statement duplicates or contradicts existing statements in a knowledge graph.
+
+Follow these instructions carefully:
+
+1. Analyze if the new statement is a semantic duplicate of any existing statement
+   - Two statements are duplicates if they express the same meaning even with different wording
+   - Consider entity resolution has already been done, so different entity names are NOT an issue
+
+2. Determine if the new statement contradicts any existing valid statements
+   - Contradictions occur when statements cannot both be true at the same time
+   - Pay special attention to negations, opposites, and mutually exclusive facts
+   - Consider temporal validity - statements may only be contradictions within specific time periods
+
+3. IMPORTANT: For events that change (like rescheduled appointments, moved dates, changed locations):
+   - When an event changes date/time/location, new statements about the updated event likely contradict statements about the original event
+   - Look for contextual clues about event changes, cancellations, or rescheduling
+   - Example: If "Concert on June 10" moved to "Concert on June 12", then "John attends June 10 concert" contradicts "John doesn't attend June 12 concert"
+
+4. Format your response as a JSON object with the following structure:
+<output>
+[{
+  "statementId": "new_statement_uuid",
+  "isDuplicate": true/false,
+  "duplicateId": "existing_statement_uuid-if-duplicate-exists",
+  "contradictions": ["existing_statement_uuid-1", "existing_statement_uuid-2"], // UUIDs of any contradicted statements
+}]
+</output>
+
+Important guidelines:
+- If the new statement is a duplicate, include the UUID of the duplicate statement
+- For contradictions, list all statement UUIDs that the new statement contradicts
+- If a statement is both a contradiction AND a duplicate (rare case), mark it as a duplicate
+- Identify temporal and contextual shifts that may create implicit contradictions
+- Don't give any reason, just give the final output.
+`,
+    },
+    {
+      role: "user",
+      content: `
+<NEW_STATEMENTS>
+${context.newStatements
+  .map(
+    (triple: Triple) => `
+StatementId: ${triple.statement.uuid}
+Fact: ${triple.statement.fact}
+Subject: ${triple.subject}
+Predicate: ${triple.predicate}
+Object: ${triple.object}
+---------------------------
+`,
+  )
+  .join("")}
+</NEW_STATEMENTS>
+
+<SIMILAR_STATEMENTS>
+${JSON.stringify(context.similarStatements, null, 2)}
+</SIMILAR_STATEMENTS>
+
+<EPISODE_CONTENT>
+${context.episodeContent}
+</EPISODE_CONTENT>
+
+<REFERENCE_TIME>
+${context.referenceTime}
+</REFERENCE_TIME> `,
+    },
+  ];
+};
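
For completeness, a sketch of consuming a response in the format extract_statements defines above. The sample payload reuses the prompt's own "John worked at Google from 2015 to 2020" example; the parsing mirrors the <output> handling in knowledgeGraph.server.ts:

const example = `<output>{"edges":[{"source":"John","relationship":"worked at","target":"Google","fact":"John worked at Google from 2015 to 2020","attributes":{"confidence":0.9,"source":"explicit","timespan":{"start":"2015","end":"2020"}}}]}</output>`;

const match = example.match(/<output>([\s\S]*?)<\/output>/);
const edges = match ? JSON.parse(match[1].trim()).edges : [];
console.log(edges[0].fact); // "John worked at Google from 2015 to 2020"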