fix: skip unchanged docs, and enhance entity extraction prompts

This commit is contained in:
Manoj 2025-09-19 00:11:48 +05:30
parent a25b92e384
commit c2e6418769
7 changed files with 181 additions and 57 deletions

View File

@ -4,7 +4,7 @@ import type { DocumentNode } from "@core/types";
export interface DifferentialDecision { export interface DifferentialDecision {
shouldUseDifferential: boolean; shouldUseDifferential: boolean;
strategy: "full_reingest" | "chunk_level_diff" | "new_document"; strategy: "full_reingest" | "chunk_level_diff" | "new_document" | "skip_processing";
reason: string; reason: string;
changedChunkIndices: number[]; changedChunkIndices: number[];
changePercentage: number; changePercentage: number;
@ -59,7 +59,7 @@ export class DocumentDifferentialService {
if (existingDocument.contentHash === newChunkedDocument.contentHash) { if (existingDocument.contentHash === newChunkedDocument.contentHash) {
return { return {
shouldUseDifferential: false, shouldUseDifferential: false,
strategy: "full_reingest", // No changes detected strategy: "skip_processing", // No changes detected
reason: "Document content unchanged", reason: "Document content unchanged",
changedChunkIndices: [], changedChunkIndices: [],
changePercentage: 0, changePercentage: 0,

View File

@ -248,3 +248,75 @@ export async function getDocumentVersions(
}; };
}); });
} }
/**
 * Delete a document together with the episodes and statements attached to it,
 * plus any entities that are left without a referencing statement afterwards.
 *
 * The whole operation is a single Cypher query that:
 *  1. matches the `Document` by `uuid`;
 *  2. collects its episodes (`CONTAINS_CHUNK`), their statements (`CONTAINS`)
 *     and the entities those statements point at (`REFERENCES`);
 *  3. marks an entity as orphaned when no statement outside the collected set
 *     references it;
 *  4. deletes statements, orphaned entities, episodes and finally the document.
 *
 * NOTE(review): statements are removed even if an episode outside this
 * document also contains them — confirm statements are never shared.
 *
 * @param documentUuid - `uuid` property of the `Document` node to remove.
 * @returns How many nodes of each kind were deleted. All counts are zero when
 *   the query returns no row (no document matched the uuid).
 * @throws Re-throws any error raised by `runQuery` after logging it.
 */
export async function deleteDocument(documentUuid: string): Promise<{
  documentsDeleted: number;
  episodesDeleted: number;
  statementsDeleted: number;
  entitiesDeleted: number;
}> {
  const query = `
    MATCH (d:Document {uuid: $documentUuid})
    // Get all related data first
    OPTIONAL MATCH (d)-[:CONTAINS_CHUNK]->(e:Episode)
    OPTIONAL MATCH (e)-[:CONTAINS]->(s:Statement)
    OPTIONAL MATCH (s)-[:REFERENCES]->(entity:Entity)
    WITH d, collect(DISTINCT e) as episodes, collect(DISTINCT s) as statements, collect(DISTINCT entity) as entities
    // UNWIND of an empty list yields zero rows, which would silently skip every
    // delete below for documents without entities; unwind a single null instead.
    UNWIND (CASE WHEN size(entities) = 0 THEN [null] ELSE entities END) as entity
    // Count entities that will become orphaned
    OPTIONAL MATCH (entity)<-[:REFERENCES]-(otherStmt:Statement)
    WHERE NOT otherStmt IN statements
    WITH d, episodes, statements,
      collect(CASE WHEN otherStmt IS NULL THEN entity ELSE null END) as orphanedEntities
    // Delete statements (breaks references to entities)
    FOREACH (stmt IN statements | DETACH DELETE stmt)
    // Delete orphaned entities only (filter nulls first)
    WITH d, episodes, statements, [entity IN orphanedEntities WHERE entity IS NOT NULL] as validOrphanedEntities
    FOREACH (entity IN validOrphanedEntities | DETACH DELETE entity)
    // Delete episodes
    FOREACH (episode IN episodes | DETACH DELETE episode)
    // Delete document
    DETACH DELETE d
    RETURN
      1 as documentsDeleted,
      size(episodes) as episodesDeleted,
      size(statements) as statementsDeleted,
      size(validOrphanedEntities) as entitiesDeleted
  `;

  // Normalise a returned count to a plain number. Handles plain numbers,
  // bigints and objects exposing toNumber() (presumably what the Neo4j driver
  // returns for integers unless lossless integers are disabled — TODO confirm
  // against runQuery). Anything else counts as 0.
  const toCount = (value: unknown): number => {
    if (typeof value === "number") {
      return Number.isFinite(value) ? value : 0;
    }
    if (typeof value === "bigint") {
      return Number(value);
    }
    if (
      typeof value === "object" &&
      value !== null &&
      typeof (value as { toNumber?: unknown }).toNumber === "function"
    ) {
      return (value as { toNumber: () => number }).toNumber();
    }
    return 0;
  };

  try {
    const result = await runQuery(query, { documentUuid });
    const record = result[0];

    // No row back means the initial MATCH found no document.
    if (!record) {
      return {
        documentsDeleted: 0,
        episodesDeleted: 0,
        statementsDeleted: 0,
        entitiesDeleted: 0,
      };
    }

    return {
      documentsDeleted: toCount(record.get("documentsDeleted")),
      episodesDeleted: toCount(record.get("episodesDeleted")),
      statementsDeleted: toCount(record.get("statementsDeleted")),
      entitiesDeleted: toCount(record.get("entitiesDeleted")),
    };
  } catch (error) {
    console.error("Error deleting document:", error);
    throw error;
  }
}

View File

@ -384,8 +384,10 @@ export class KnowledgeGraphService {
}; };
} }
// Save triples in parallel for better performance // Process triples sequentially to avoid race conditions
await Promise.all(updatedTriples.map((triple) => saveTriple(triple))); for (const triple of updatedTriples) {
await saveTriple(triple);
}
const saveTriplesTime = Date.now(); const saveTriplesTime = Date.now();
logger.log(`Saved triples in ${saveTriplesTime - updatedTriplesTime} ms`); logger.log(`Saved triples in ${saveTriplesTime - updatedTriplesTime} ms`);

View File

@ -31,19 +31,33 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
- For pronouns that refer to named entities, extract them as separate Alias entities. - For pronouns that refer to named entities, extract them as separate Alias entities.
- **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities. - **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities.
2. **Type and Concept Entity Extraction**: 2. **IMPLICIT ACTOR EXTRACTION**:
- **EXPERIENCE AGENTS**: Extract the entity who performs actions, makes decisions, or has subjective experiences
- **PERSPECTIVE HOLDERS**: Extract entities behind opinions, preferences, memories, and evaluations
- **DOCUMENT ACTORS**: For personal content (journals, notes, reports), extract the implied author/creator
- **PRONOUN RESOLUTION**: Extract the entity represented by first-person pronouns in narrative content
- **ACTION SUBJECTS**: When actions are described without explicit subjects, infer and extract the acting entity
**Detection Signals**:
- Action descriptions without explicit subjects
- Opinion/evaluation expressions
- Decision-making language
- Personal experience descriptions
- Memory/reflection statements
3. **Type and Concept Entity Extraction**:
- **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities. - **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities.
- **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities. - **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities.
- **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities. - **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities.
- **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships. - **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships.
3. **Exclusions**: 4. **Exclusions**:
- Do NOT extract entities representing relationships or actions (predicates will be handled separately). - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
- **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements. - **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements.
- Do NOT extract absolute dates, timestamps, or specific time points—these will be handled separately. - Do NOT extract absolute dates, timestamps, or specific time points—these will be handled separately.
- Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm"). - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm").
4. **Entity Name Extraction**: 5. **Entity Name Extraction**:
- Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers - Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers
- When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car" - When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car"
- When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities - When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities
@ -52,7 +66,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
- **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John") - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John")
- **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" → "Space") - **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" → "Space")
5. **Temporal and Relationship Context Extraction**: 6. **Temporal and Relationship Context Extraction**:
- EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years") - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years")
- EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college") - EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college")
- EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members") - EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members")

View File

@ -12,7 +12,65 @@ export const extractStatements = (
return [ return [
{ {
role: "system", role: "system",
content: `You are a knowledge graph expert who extracts NEW factual statements from text as subject-predicate-object triples. content: `You are a knowledge graph expert who extracts factual statements from text as subject-predicate-object triples.
## PHASE 1: FOUNDATIONAL RELATIONSHIPS (HIGHEST PRIORITY)
Extract the basic semantic backbone that answers: WHO, WHAT, WHERE, WHEN, WHY, HOW
### 1A: ACTOR-ACTION RELATIONSHIPS
- Subject performs action: "Entity" "performed" "Action"
- Subject experiences state: "Entity" "experienced" "State"
- Subject has attribute: "Entity" "has" "Property"
- Subject creates/produces: "Entity" "created" "Object"
### 1B: SPATIAL & HIERARCHICAL RELATIONSHIPS
- Location membership: "Entity" "located_in" "Location"
- Categorical membership: "Entity" "is_a" "Category"
- Hierarchical structure: "Entity" "part_of" "System"
- Containment: "Container" "contains" "Item"
### 1C: TEMPORAL & SEQUENTIAL RELATIONSHIPS
- Duration facts: "Event" "lasted" "Duration"
- Sequence facts: "Event" "occurred_before" "Event"
- Temporal anchoring: "Event" "occurred_during" "Period"
- Timing: "Action" "happened_on" "Date"
### 1D: SUBJECTIVE & EVALUATIVE RELATIONSHIPS
- Opinions: "Subject" "opinion_about" "Object"
- Preferences: "Subject" "prefers" "Object"
- Evaluations: "Subject" "rated" "Object"
- Desires: "Subject" "wants" "Object"
## SYSTEMATIC EXTRACTION METHODOLOGY
For each entity, systematically check these common patterns:
**Type/Category Patterns**: Entity is_a Type
**Ownership Patterns**: Actor owns/controls Resource
**Participation Patterns**: Actor participates_in Event
**Location Patterns**: Entity located_in/part_of Place
**Temporal Patterns**: Event occurred_during TimeFrame
**Rating/Measurement Patterns**: Subject rated/measured Object
**Reference Patterns**: Document references Entity
**Employment Patterns**: Person works_for Organization
## RELATIONSHIP QUALITY HIERARCHY
**ESSENTIAL (Extract Always)**:
- Categorical membership (is_a, type_of)
- Spatial relationships (located_in, part_of)
- Actor-action relationships (performed, experienced, created)
- Ownership/control relationships (owns, controls, manages)
- Employment relationships (works_for, employed_by)
**VALUABLE (Extract When Present)**:
- Temporal sequences and durations
- Subjective opinions and evaluations
- Cross-references and citations
- Participation and attendance
**CONTEXTUAL (Extract If Space Permits)**:
- Complex multi-hop inferences
- Implicit relationships requiring interpretation
CRITICAL REQUIREMENT: CRITICAL REQUIREMENT:
- You MUST ONLY use entities from the AVAILABLE ENTITIES list as subjects and objects. - You MUST ONLY use entities from the AVAILABLE ENTITIES list as subjects and objects.
@ -30,54 +88,20 @@ RELATIONSHIP FORMATION RULES:
2. **PRIMARY-EXPANDED**: Only if the expanded entity is mentioned in the episode content 2. **PRIMARY-EXPANDED**: Only if the expanded entity is mentioned in the episode content
3. **EXPANDED-EXPANDED**: Avoid unless there's explicit connection in the episode 3. **EXPANDED-EXPANDED**: Avoid unless there's explicit connection in the episode
FOCUS: Create relationships that ADD VALUE to understanding the current episode, not just because entities are available. Your task is to identify important facts from the provided text and represent them in a knowledge graph format.
## PRIMARY MISSION: EXTRACT NEW RELATIONSHIPS
Focus on extracting factual statements that ADD NEW VALUE to the knowledge graph:
- **PRIORITIZE**: New relationships not already captured in previous episodes
- **EMPHASIZE**: Connections between entities with same names but different types
- **FILTER**: Avoid extracting facts already present in previous episodes
- **EVOLVE**: Form relationships that enhance the existing knowledge structure
Your task is to identify NEW important facts from the provided text and represent them in a knowledge graph format.
Follow these instructions: Follow these instructions:
1. **ANALYZE PREVIOUS EPISODES**: Review previous episodes to understand what relationships already exist 1. **SYSTEMATIC ENTITY ANALYSIS**: For each available entity, check all foundational relationship patterns
2. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects 2. **PATTERN COMPLETION**: If pattern appears for one entity, verify coverage for all applicable entities
3. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected 3. **STRUCTURAL FOUNDATION**: Ensure basic "backbone" relationships exist before adding nuanced ones
4. **EXTRACT NEW RELATIONSHIPS**: Identify factual statements that can be expressed using ONLY available entities AND are NOT already captured in previous episodes 4. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects
5. For each NEW valid statement, provide: 5. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected
6. For each valid statement, provide:
- source: The subject entity (MUST be from AVAILABLE ENTITIES) - source: The subject entity (MUST be from AVAILABLE ENTITIES)
- predicate: The relationship type (can be a descriptive phrase) - predicate: The relationship type (can be a descriptive phrase)
- target: The object entity (MUST be from AVAILABLE ENTITIES) - target: The object entity (MUST be from AVAILABLE ENTITIES)
EXTRACT NEW MEANINGFUL RELATIONSHIPS AND CHARACTERISTICS:
- Extract meaningful relationships between available entities that are NOT already captured in previous episodes
- Extract individual entity characteristics, roles, and properties as standalone facts
- Use predicates that accurately describe new relationships between entities
- Be creative but precise in identifying NEW relationships - focus on value-adding connections
- **HIGHEST PRIORITY**: Entities with identical names but different types MUST be connected with explicit relationship statements
- **MANDATORY**: When you find entities like "John (Person)" and "John (Company)", create explicit relationships such as "John" "owns" "John" or "John" "founded" "John"
- **ROLE/CHARACTERISTIC EXTRACTION**: Always extract roles, professions, titles, and key characteristics as separate statements
- Look for both explicit and implicit NEW relationships mentioned in the text
- **FILTER OUT**: Relationships already established in previous episodes unless they represent updates or changes
- Common relationship types include (but are not limited to):
* **Roles and professions** (e.g., "Person" "is" "Role", "Individual" "works as" "Position", "Entity" "has role" "Profession")
* **Identity and characteristics** (e.g., "System" "is" "Characteristic", "Person" "is" "Quality", "Organization" "is" "Type")
* Ownership or association (e.g., "Alice" "owns" "Restaurant")
* Participation or attendance (e.g., "Team" "participates in" "Tournament")
* Personal connections (e.g., "Sarah" "works with" "Michael")
* Aliases and alternative names (e.g., "Robert" "is also known as" "Bob")
* Locations and spatial relationships (e.g., "Office" "located in" "Building")
* Characteristics and properties (e.g., "System" "has property" "Scalability")
* Product-organization relationships (e.g., "Software" "developed by" "Company")
* Technical dependencies and usage (e.g., "Application" "uses" "Database")
* Hierarchical relationships (e.g., "Manager" "supervises" "Employee")
* Duration relationships (e.g., "Caroline" "has known" "friends" [duration: "4 years"])
* Temporal sequence relationships (e.g., "Caroline" "met" "friends" [context: "since moving"])
* Contextual support relationships (e.g., "friends" "supported" "Caroline" [context: "during breakup"])
## SAME-NAME ENTITY RELATIONSHIP FORMATION ## SAME-NAME ENTITY RELATIONSHIP FORMATION
When entities share identical names but have different types, CREATE explicit relationship statements: When entities share identical names but have different types, CREATE explicit relationship statements:
- **Person-Organization**: "John (Person)" "owns", "founded", "works for", or "leads" "John (Company)" - **Person-Organization**: "John (Person)" "owns", "founded", "works for", or "leads" "John (Company)"
@ -100,12 +124,12 @@ EXAMPLES of correct Duration/TemporalContext usage:
* DO NOT CREATE: "Caroline" "relates to" "4 years" (Duration as object) * DO NOT CREATE: "Caroline" "relates to" "4 years" (Duration as object)
* DO NOT CREATE: "since moving" "describes" "friendship" (TemporalContext as subject) * DO NOT CREATE: "since moving" "describes" "friendship" (TemporalContext as subject)
## PREVIOUS EPISODE FILTERING ## EXTRACTION COMPLETENESS MANDATE
Before creating any relationship statement: - **EXTRACT OBVIOUS FACTS**: Basic relationships are STRUCTURAL FOUNDATIONS, not redundant noise
- **CHECK**: Review previous episodes to see if this exact relationship already exists - **PRIORITIZE SIMPLE OVER COMPLEX**: "X is_in Y" is more valuable than "X contextually_relates_to Y"
- **SKIP**: Do not create statements that duplicate existing relationships - **QUANTITY OVER NOVELTY**: Comprehensive coverage beats selective "interesting" facts
- **ENHANCE**: Only create statements if they add new information or represent updates - **SYSTEMATIC ENUMERATION**: If pattern exists for one entity, check ALL entities for same pattern
- **FOCUS**: Prioritize completely new connections not represented in the knowledge graph - Only skip exact duplicate statements, not similar relationship types
CRITICAL TEMPORAL INFORMATION HANDLING: CRITICAL TEMPORAL INFORMATION HANDLING:
- For events with specific dates/times, ALWAYS capture temporal information in statement attributes - For events with specific dates/times, ALWAYS capture temporal information in statement attributes

View File

@ -512,7 +512,7 @@ export async function applyCohereReranking(
cohereScore: result.relevanceScore, cohereScore: result.relevanceScore,
cohereRank: index + 1, cohereRank: index + 1,
})) }))
.filter((result) => result.cohereScore > 0.3); .filter((result) => result.cohereScore >= 0.1);
const responseTime = Date.now() - startTime; const responseTime = Date.now() - startTime;
logger.info( logger.info(

View File

@ -87,6 +87,18 @@ export const ingestDocumentTask = task({
documentSizeTokens: differentialDecision.documentSizeTokens, documentSizeTokens: differentialDecision.documentSizeTokens,
}); });
// Early return for unchanged documents
if (differentialDecision.strategy === "skip_processing") {
logger.log("Document content unchanged, skipping processing");
return {
success: true,
documentsProcessed: 1,
chunksProcessed: 0,
episodesCreated: 0,
entitiesExtracted: 0,
};
}
// Step 3: Save the new document version // Step 3: Save the new document version
await saveDocument(document); await saveDocument(document);