mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-11 09:48:27 +00:00
358 lines
18 KiB
TypeScript
358 lines
18 KiB
TypeScript
import { type Triple } from "@core/types";
|
|
import { type CoreMessage } from "ai";
|
|
|
|
/**
|
|
* Extract statements (triples) from episode content in a reified knowledge graph model
|
|
* This function generates a prompt for LLM to extract subject-predicate-object statements
|
|
* and represent them as first-class nodes with proper connections
|
|
*/
|
|
export const extractStatements = (
|
|
context: Record<string, any>,
|
|
): CoreMessage[] => {
|
|
return [
|
|
{
|
|
role: "system",
|
|
content: `You are a knowledge graph expert who extracts factual statements from text as subject-predicate-object triples.
|
|
|
|
## PHASE 1: FOUNDATIONAL RELATIONSHIPS (HIGHEST PRIORITY)
|
|
Extract the basic semantic backbone that answers: WHO, WHAT, WHERE, WHEN, WHY, HOW
|
|
|
|
### 1A: ACTOR-ACTION RELATIONSHIPS
|
|
- Subject performs action: "Entity" "performed" "Action"
|
|
- Subject experiences state: "Entity" "experienced" "State"
|
|
- Subject has attribute: "Entity" "has" "Property"
|
|
- Subject creates/produces: "Entity" "created" "Object"
|
|
|
|
### 1B: SPATIAL & HIERARCHICAL RELATIONSHIPS
|
|
- Location membership: "Entity" "located_in" "Location"
|
|
- Categorical membership: "Entity" "is_a" "Category"
|
|
- Hierarchical structure: "Entity" "part_of" "System"
|
|
- Containment: "Container" "contains" "Item"
|
|
|
|
### 1C: TEMPORAL & SEQUENTIAL RELATIONSHIPS
|
|
- Duration facts: "Event" "lasted" "Duration"
|
|
- Sequence facts: "Event" "occurred_before" "Event"
|
|
- Temporal anchoring: "Event" "occurred_during" "Period"
|
|
- Timing: "Action" "happened_on" "Date"
|
|
|
|
### 1D: SUBJECTIVE & EVALUATIVE RELATIONSHIPS
|
|
- Opinions: "Subject" "opinion_about" "Object"
|
|
- Preferences: "Subject" "prefers" "Object"
|
|
- Evaluations: "Subject" "rated" "Object"
|
|
- Desires: "Subject" "wants" "Object"
|
|
|
|
## SYSTEMATIC EXTRACTION METHODOLOGY
|
|
For each entity, systematically check these common patterns:
|
|
|
|
**Type/Category Patterns**: Entity → is_a → Type
|
|
**Ownership Patterns**: Actor → owns/controls → Resource
|
|
**Participation Patterns**: Actor → participates_in → Event
|
|
**Location Patterns**: Entity → located_in/part_of → Place
|
|
**Temporal Patterns**: Event → occurred_during → TimeFrame
|
|
**Rating/Measurement Patterns**: Subject → rated/measured → Object
|
|
**Reference Patterns**: Document → references → Entity
|
|
**Employment Patterns**: Person → works_for → Organization
|
|
|
|
## RELATIONSHIP QUALITY HIERARCHY
|
|
|
|
**ESSENTIAL (Extract Always)**:
|
|
- Categorical membership (is_a, type_of)
|
|
- Spatial relationships (located_in, part_of)
|
|
- Actor-action relationships (performed, experienced, created)
|
|
- Ownership/control relationships (owns, controls, manages)
|
|
- Employment relationships (works_for, employed_by)
|
|
|
|
**VALUABLE (Extract When Present)**:
|
|
- Temporal sequences and durations
|
|
- Subjective opinions and evaluations
|
|
- Cross-references and citations
|
|
- Participation and attendance
|
|
|
|
**CONTEXTUAL (Extract If Space Permits)**:
|
|
- Complex multi-hop inferences
|
|
- Implicit relationships requiring interpretation
|
|
|
|
CRITICAL REQUIREMENT:
|
|
- You MUST ONLY use entities from the AVAILABLE ENTITIES list as subjects and objects.
|
|
- The "source" and "target" fields in your output MUST EXACTLY MATCH entity names from the AVAILABLE ENTITIES list.
|
|
- If you cannot express a fact using only the available entities, DO NOT include that fact in your output.
|
|
- DO NOT create, invent, or modify any entity names.
|
|
- NEVER create statements where the source and target are the same entity (no self-loops).
|
|
|
|
ENTITY PRIORITIZATION:
|
|
- **PRIMARY ENTITIES**: Directly extracted from the current episode - these are your main focus
|
|
- **EXPANDED ENTITIES**: From related contexts - only use if they're explicitly mentioned or contextually relevant
|
|
|
|
RELATIONSHIP FORMATION RULES:
|
|
1. **PRIMARY-PRIMARY**: Always consider relationships between primary entities
|
|
2. **PRIMARY-EXPANDED**: Only if the expanded entity is mentioned in the episode content
|
|
3. **EXPANDED-EXPANDED**: Avoid unless there's explicit connection in the episode
|
|
|
|
Your task is to identify important facts from the provided text and represent them in a knowledge graph format.
|
|
|
|
Follow these instructions:
|
|
|
|
1. **SYSTEMATIC ENTITY ANALYSIS**: For each available entity, check all foundational relationship patterns
|
|
2. **PATTERN COMPLETION**: If pattern appears for one entity, verify coverage for all applicable entities
|
|
3. **STRUCTURAL FOUNDATION**: Ensure basic "backbone" relationships exist before adding nuanced ones
|
|
4. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects
|
|
5. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected
|
|
6. For each valid statement, provide:
|
|
- source: The subject entity (MUST be from AVAILABLE ENTITIES)
|
|
- predicate: The relationship type (can be a descriptive phrase)
|
|
- target: The object entity (MUST be from AVAILABLE ENTITIES)
|
|
|
|
## SAME-NAME ENTITY RELATIONSHIP FORMATION
|
|
When entities share identical names but have different types, CREATE explicit relationship statements:
|
|
- **Person-Organization**: "John (Person)" → "owns", "founded", "works for", or "leads" → "John (Company)"
|
|
- **Person-Location**: "Smith (Person)" → "lives in", "founded", or "is associated with" → "Smith (City)"
|
|
- **Event-Location**: "Conference (Event)" → "takes place at" or "is hosted by" → "Conference (Venue)"
|
|
- **Product-Company**: "Tesla (Product)" → "is manufactured by" or "is developed by" → "Tesla (Company)"
|
|
- **MANDATORY**: Always create at least one relationship statement for same-name entities
|
|
- **CONTEXT-DRIVEN**: Choose predicates that accurately reflect the most likely relationship based on available context
|
|
|
|
## DURATION AND TEMPORAL CONTEXT ENTITY USAGE
|
|
When Duration or TemporalContext entities are available in AVAILABLE ENTITIES:
|
|
- **Duration entities** (e.g., "4 years", "2 months") should be used as "duration" attributes in relationship statements
|
|
- **TemporalContext entities** (e.g., "since moving", "after breakup") should be used as "temporal_context" attributes
|
|
- **DO NOT** use Duration/TemporalContext entities as direct subjects or objects in relationships
|
|
- **DO USE** them to enrich relationship statements with temporal information
|
|
|
|
EXAMPLES of correct Duration/TemporalContext usage:
|
|
- If AVAILABLE ENTITIES contains ["Caroline", "friends", "4 years", "since moving"]:
|
|
* CREATE: "Caroline" "has known" "friends" [attributes: {"duration": "4 years", "temporal_context": "since moving"}]
|
|
* DO NOT CREATE: "Caroline" "relates to" "4 years" (Duration as object)
|
|
* DO NOT CREATE: "since moving" "describes" "friendship" (TemporalContext as subject)
|
|
|
|
## EXTRACTION COMPLETENESS MANDATE
|
|
- **EXTRACT OBVIOUS FACTS**: Basic relationships are STRUCTURAL FOUNDATIONS, not redundant noise
|
|
- **PRIORITIZE SIMPLE OVER COMPLEX**: "X is_in Y" is more valuable than "X contextually_relates_to Y"
|
|
- **QUANTITY OVER NOVELTY**: Comprehensive coverage beats selective "interesting" facts
|
|
- **SYSTEMATIC ENUMERATION**: If pattern exists for one entity, check ALL entities for same pattern
|
|
- Only skip exact duplicate statements, not similar relationship types
|
|
|
|
CRITICAL TEMPORAL INFORMATION HANDLING:
|
|
- For events with specific dates/times, ALWAYS capture temporal information in statement attributes
|
|
- Use the "event_date" attribute to specify when the fact/event actually occurred (not when it was mentioned)
|
|
- Use the "temporal_context" attribute for temporal descriptions like "last week", "recently", etc.
|
|
- MANDATORY: Use the REFERENCE_TIME to resolve relative temporal expressions to absolute ISO dates
|
|
- Calculate event_date by using REFERENCE_TIME as the anchor point for relative time calculations
|
|
- Example: For "Max married to Tina on January 14", add:
|
|
- "event_date": "January 14" (or fully resolved date if available)
|
|
- "temporal_context": "specific date mentioned"
|
|
- For recent events: "went camping last week" → add:
|
|
- "event_date": "[resolved ISO date ~7 days before episode date, e.g., '2023-06-20T00:00:00.000Z']"
|
|
- "temporal_context": "last week"
|
|
- For past events: "read book last year" → add:
|
|
- "event_date": "[resolved ISO date ~1 year before episode date, e.g., '2022-06-27T00:00:00.000Z']"
|
|
- "temporal_context": "last year"
|
|
- For future events: "going to Paris next month" → add:
|
|
- "event_date": "[resolved ISO date ~1 month after episode date, e.g., '2023-07-27T00:00:00.000Z']"
|
|
- "temporal_context": "next month"
|
|
|
|
Format your response as a JSON object with the following structure:
|
|
<output>
|
|
{
|
|
"edges": [
|
|
{
|
|
"source": "[Subject Entity Name - MUST be from AVAILABLE ENTITIES]",
|
|
"predicate": "[Relationship Type]",
|
|
"target": "[Object Entity Name - MUST be from AVAILABLE ENTITIES]",
|
|
"fact": "[Natural language representation of the fact]",
|
|
"attributes": {
|
|
"confidence": confidence of the fact,
|
|
"source": "explicit or implicit source type",
|
|
"event_date": "ISO date when the fact/event actually occurred (if applicable)",
|
|
"temporal_context": "original temporal description (e.g., 'last week', 'recently')",
|
|
"duration": "duration information from Duration entities (e.g., '4 years', '2 months')",
|
|
"context": "contextual information from TemporalContext entities (e.g., 'since moving', 'after breakup')"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
</output>
|
|
|
|
IMPORTANT RULES:
|
|
- **ENTITIES**: ONLY use entities from AVAILABLE ENTITIES as source and target
|
|
- **NO INVENTION**: NEVER create statements where source or target is not in AVAILABLE ENTITIES
|
|
- **NO SELF-LOOPS**: NEVER create statements where the source and target are the same entity
|
|
- **SAME-NAME PRIORITY**: When entities share names but have different types, CREATE explicit relationship statements between them
|
|
- **NEW ONLY**: Do NOT create statements that duplicate relationships already present in previous episodes
|
|
- **TEMPORAL**: Instead of creating self-loops for temporal information, add timespan attributes to relevant statements
|
|
- **FILTER FIRST**: If you cannot express a NEW fact using only available entities, omit it entirely
|
|
- **OUTPUT FORMAT**: Always wrap output in tags <output> </output>
|
|
|
|
Example of CORRECT usage:
|
|
If AVAILABLE ENTITIES contains ["Person", "Individual", "Event", "Organization", "Role"], you can create:
|
|
- "Person" "is" "Role" ✓ (PRIORITY: role/characteristic extraction)
|
|
- "Person" "attends" "Event" ✓ (if not already in previous episodes)
|
|
- "Individual" "married to" "Person" with timespan attribute ✓ (if new relationship)
|
|
- "Person" "founded" "Organization" ✓ (PRIORITY: same name, different types when applicable)
|
|
|
|
Example of CORRECT Duration/TemporalContext usage:
|
|
If AVAILABLE ENTITIES contains ["Caroline", "friends", "4 years", "since moving", "breakup"]:
|
|
- "Caroline" "has known" "friends" [attributes: {"duration": "4 years", "context": "since moving"}] ✓
|
|
- "friends" "supported" "Caroline" [attributes: {"context": "during breakup"}] ✓
|
|
- "Caroline" "met" "friends" [attributes: {"context": "since moving"}] ✓
|
|
|
|
Example of INCORRECT usage:
|
|
- "John" "attends" "Party" ✗ (if "Party" is not in AVAILABLE ENTITIES)
|
|
- "Marriage" "occurs on" "Marriage" ✗ (NEVER create self-loops)
|
|
- "John" "attends" "Wedding" ✗ (if already captured in previous episodes)
|
|
- "Caroline" "relates to" "4 years" ✗ (Duration entity used as direct object)
|
|
- "since moving" "describes" "friendship" ✗ (TemporalContext entity used as direct subject)`,
|
|
},
|
|
{
|
|
role: "user",
|
|
content: `
|
|
<EPISODE_CONTENT>
|
|
${context.episodeContent}
|
|
</EPISODE_CONTENT>
|
|
|
|
<PREVIOUS_EPISODES>
|
|
${JSON.stringify(context.previousEpisodes, null, 2)}
|
|
</PREVIOUS_EPISODES>
|
|
|
|
<AVAILABLE_ENTITIES>
|
|
<PRIMARY_ENTITIES>
|
|
${JSON.stringify(context.entities.primary, null, 2)}
|
|
</PRIMARY_ENTITIES>
|
|
|
|
<EXPANDED_ENTITIES>
|
|
${JSON.stringify(context.entities.expanded, null, 2)}
|
|
</EXPANDED_ENTITIES>
|
|
</AVAILABLE_ENTITIES>
|
|
`,
|
|
},
|
|
];
|
|
};
|
|
|
|
/**
|
|
* Analyze similar statements to determine duplications and contradictions
|
|
* This prompt helps the LLM evaluate semantically similar statements found through vector search
|
|
* to determine if they are duplicates or contradictions
|
|
*/
|
|
export const resolveStatementPrompt = (
|
|
context: Record<string, any>,
|
|
): CoreMessage[] => {
|
|
return [
|
|
{
|
|
role: "system",
|
|
content: `You are a knowledge graph expert that analyzes statements to detect duplications and TRUE contradictions.
|
|
You analyze multiple new statements against existing statements to determine whether the new statement duplicates any existing statement or ACTUALLY contradicts any existing statement.
|
|
|
|
CRITICAL: Distinguish between CONTRADICTIONS, SUPERSEDING EVOLUTION, and PROGRESSIONS:
|
|
- CONTRADICTIONS: Statements that CANNOT both be true (mutually exclusive facts)
|
|
- SUPERSEDING EVOLUTION: Sequential changes where the new state invalidates the previous state (e.g., technology migrations, job changes, relationship status changes)
|
|
- PROGRESSIONS: Sequential states or developments that CAN both be true (e.g., planning → execution, researching → deciding)
|
|
|
|
|
|
I need to analyze whether a new statement duplicates or contradicts existing statements in a knowledge graph.
|
|
|
|
|
|
Follow these instructions carefully:
|
|
|
|
1. Analyze if the new statement is a semantic duplicate of any existing statement
|
|
- Two statements are duplicates if they express the same meaning even with different wording
|
|
- Consider entity resolution has already been done, so different entity names are NOT an issue
|
|
|
|
2. Determine if the new statement ACTUALLY contradicts any existing valid statements
|
|
- TRUE CONTRADICTIONS: Statements that cannot both be true simultaneously
|
|
- Pay attention to direct negations, opposites, and mutually exclusive facts
|
|
- Consider temporal context - statements may be contradictory only within specific time periods
|
|
|
|
3. CRITICAL DISTINCTION - What are NOT contradictions:
|
|
- PROGRESSIONS: "researching X" → "decided on X" (both can be true - research led to decision)
|
|
- TEMPORAL SEQUENCES: "planning camping" → "went camping" (both can be true - plan was executed)
|
|
- STATE CHANGES: "single" → "married" (both can be true at different times)
|
|
- LEARNING/GROWTH: "studying topic X" → "expert in topic X" (both can be true - progression)
|
|
|
|
4. SPECIFIC EXAMPLES:
|
|
|
|
TRUE CONTRADICTIONS (mark as contradictions):
|
|
- "John lives in New York" vs "John lives in San Francisco" (same time period, can't be both)
|
|
- "Meeting at 3pm" vs "Meeting at 5pm" (same meeting, conflicting times)
|
|
- "Project completed" vs "Project cancelled" (mutually exclusive outcomes)
|
|
- "Caroline is single" vs "Caroline is married" (same time period, opposite states)
|
|
|
|
SUPERSEDING EVOLUTION (mark as contradictions - old statement becomes invalid):
|
|
- "Application built with NextJS" vs "Application migrated to Remix" (technology stack change)
|
|
- "John works at CompanyA" vs "John joined CompanyB" (job change invalidates previous employment)
|
|
- "Database uses MySQL" vs "Database migrated to PostgreSQL" (infrastructure change)
|
|
- "System deployed on AWS" vs "System moved to Google Cloud" (platform migration)
|
|
- "Caroline living in Boston" vs "Caroline moved to Seattle" (location change)
|
|
- "Project using Python" vs "Project rewritten in TypeScript" (language migration)
|
|
|
|
NOT CONTRADICTIONS (do NOT mark as contradictions):
|
|
- "Caroline researching adoption agencies" vs "Caroline finalized adoption agency" (research → decision progression)
|
|
- "Caroline planning camping next week" vs "Caroline went camping" (planning → execution progression)
|
|
- "User studying Python" vs "User completed Python course" (learning progression)
|
|
- "Meeting scheduled for 3pm" vs "Meeting was held at 3pm" (planning → execution)
|
|
- "Considering job offers" vs "Accepted job offer" (consideration → decision)
|
|
- "Project in development" vs "Project launched" (development → deployment progression)
|
|
- "Learning React" vs "Built app with React" (skill → application progression)
|
|
|
|
5. MANDATORY OUTPUT FORMAT:
|
|
|
|
You MUST wrap your response in <output> tags. Do not include any text outside these tags.
|
|
|
|
<output>
|
|
[{
|
|
"statementId": "new_statement_uuid",
|
|
"isDuplicate": false,
|
|
"duplicateId": null,
|
|
"contradictions": []
|
|
},
|
|
{
|
|
"statementId": "another_statement_uuid",
|
|
"isDuplicate": true,
|
|
"duplicateId": "existing_duplicate_uuid",
|
|
"contradictions": ["contradicted_statement_uuid"]
|
|
}]
|
|
</output>
|
|
|
|
CRITICAL FORMATTING RULES:
|
|
- ALWAYS use <output> and </output> tags
|
|
- Include NO text before <output> or after </output>
|
|
- Return valid JSON array with all statement IDs from NEW_STATEMENTS
|
|
- If the new statement is a duplicate, include the UUID of the duplicate statement
|
|
- For TRUE contradictions AND superseding evolution, list statement UUIDs that the new statement contradicts
|
|
- If a statement is both a contradiction AND a duplicate (rare case), mark it as a duplicate
|
|
- DO NOT mark progressions, temporal sequences, or cumulative developments as contradictions
|
|
- MARK superseding evolution (technology/job/location changes) as contradictions to invalidate old state
|
|
- ONLY mark genuine mutually exclusive facts and superseding evolution as contradictions
|
|
`,
|
|
},
|
|
{
|
|
role: "user",
|
|
content: `
|
|
<NEW_STATEMENTS>
|
|
${context.newStatements
|
|
.map(
|
|
(triple: Triple) => `
|
|
StatementId: ${triple.statement.uuid}
|
|
Fact: ${triple.statement.fact}
|
|
Subject: ${triple.subject}
|
|
Predicate: ${triple.predicate}
|
|
Object: ${triple.object}
|
|
---------------------------
|
|
`,
|
|
)
|
|
.join("")}
|
|
</NEW_STATEMENTS>
|
|
|
|
<SIMILAR_STATEMENTS>
|
|
${JSON.stringify(context.similarStatements, null, 2)}
|
|
</SIMILAR_STATEMENTS>
|
|
|
|
<EPISODE_CONTENT>
|
|
${context.episodeContent}
|
|
</EPISODE_CONTENT>
|
|
|
|
<REFERENCE_TIME>
|
|
${context.referenceTime}
|
|
</REFERENCE_TIME> `,
|
|
},
|
|
];
|
|
};
|