From a4b6a4f984d12dc4cb4264c3b6c8ff3b79e57c9d Mon Sep 17 00:00:00 2001 From: Manoj Date: Fri, 19 Sep 2025 00:11:48 +0530 Subject: [PATCH] fix: skip unchanged docs, and enhance entity extraction prompts --- .../app/services/documentDiffer.server.ts | 4 +- .../app/services/graphModels/document.ts | 72 +++++++++++ .../app/services/knowledgeGraph.server.ts | 6 +- apps/webapp/app/services/prompts/nodes.ts | 22 +++- .../webapp/app/services/prompts/statements.ts | 120 +++++++++++------- apps/webapp/app/services/search/rerank.ts | 2 +- .../app/trigger/ingest/ingest-document.ts | 12 ++ 7 files changed, 181 insertions(+), 57 deletions(-) diff --git a/apps/webapp/app/services/documentDiffer.server.ts b/apps/webapp/app/services/documentDiffer.server.ts index c970c70..e7f8c04 100644 --- a/apps/webapp/app/services/documentDiffer.server.ts +++ b/apps/webapp/app/services/documentDiffer.server.ts @@ -4,7 +4,7 @@ import type { DocumentNode } from "@core/types"; export interface DifferentialDecision { shouldUseDifferential: boolean; - strategy: "full_reingest" | "chunk_level_diff" | "new_document"; + strategy: "full_reingest" | "chunk_level_diff" | "new_document" | "skip_processing"; reason: string; changedChunkIndices: number[]; changePercentage: number; @@ -59,7 +59,7 @@ export class DocumentDifferentialService { if (existingDocument.contentHash === newChunkedDocument.contentHash) { return { shouldUseDifferential: false, - strategy: "full_reingest", // No changes detected + strategy: "skip_processing", // No changes detected reason: "Document content unchanged", changedChunkIndices: [], changePercentage: 0, diff --git a/apps/webapp/app/services/graphModels/document.ts b/apps/webapp/app/services/graphModels/document.ts index cdfbf38..3501b4d 100644 --- a/apps/webapp/app/services/graphModels/document.ts +++ b/apps/webapp/app/services/graphModels/document.ts @@ -248,3 +248,75 @@ export async function getDocumentVersions( }; }); } + +/** + * Delete a document and all its related 
episodes, statements, and entities efficiently + * Runs as a single Cypher query; entities still referenced by other statements are kept + */ +export async function deleteDocument(documentUuid: string): Promise<{ + documentsDeleted: number; + episodesDeleted: number; + statementsDeleted: number; + entitiesDeleted: number; +}> { + const query = ` + MATCH (d:Document {uuid: $documentUuid}) + + // Get all related data first + OPTIONAL MATCH (d)-[:CONTAINS_CHUNK]->(e:Episode) + OPTIONAL MATCH (e)-[:CONTAINS]->(s:Statement) + OPTIONAL MATCH (s)-[:REFERENCES]->(entity:Entity) + + // Find entities that will become orphaned ([null] keeps the row alive when the document has no entities) + WITH d, collect(DISTINCT e) as episodes, collect(DISTINCT s) as statements, collect(DISTINCT entity) as entities + UNWIND (CASE WHEN size(entities) = 0 THEN [null] ELSE entities END) as entity + OPTIONAL MATCH (entity)<-[:REFERENCES]-(otherStmt:Statement) + WHERE NOT otherStmt IN statements + + WITH d, episodes, statements, + collect(CASE WHEN otherStmt IS NULL THEN entity ELSE null END) as orphanedEntities + + // Delete statements (breaks references to entities) + FOREACH (stmt IN statements | DETACH DELETE stmt) + + // Delete orphaned entities only (filter nulls first) + WITH d, episodes, statements, [entity IN orphanedEntities WHERE entity IS NOT NULL] as validOrphanedEntities + FOREACH (entity IN validOrphanedEntities | DETACH DELETE entity) + + // Delete episodes + FOREACH (episode IN episodes | DETACH DELETE episode) + + // Delete document + DETACH DELETE d + + RETURN + 1 as documentsDeleted, + size(episodes) as episodesDeleted, + size(statements) as statementsDeleted, + size(validOrphanedEntities) as entitiesDeleted + `; + + try { + const result = await runQuery(query, { documentUuid }); + + if (result.length === 0) { + return { + documentsDeleted: 0, + episodesDeleted: 0, + statementsDeleted: 0, + entitiesDeleted: 0, + }; + } + + const record = result[0]; + return { + documentsDeleted: record.get("documentsDeleted") || 0, + episodesDeleted: record.get("episodesDeleted") || 0, + statementsDeleted: record.get("statementsDeleted") 
|| 0, + entitiesDeleted: record.get("entitiesDeleted") || 0, + }; + } catch (error) { + console.error("Error deleting document:", error); + throw error; + } +} diff --git a/apps/webapp/app/services/knowledgeGraph.server.ts b/apps/webapp/app/services/knowledgeGraph.server.ts index aee3f89..55ec284 100644 --- a/apps/webapp/app/services/knowledgeGraph.server.ts +++ b/apps/webapp/app/services/knowledgeGraph.server.ts @@ -384,8 +384,10 @@ export class KnowledgeGraphService { }; } - // Save triples in parallel for better performance - await Promise.all(updatedTriples.map((triple) => saveTriple(triple))); + // Process triples sequentially to avoid race conditions + for (const triple of updatedTriples) { + await saveTriple(triple); + } const saveTriplesTime = Date.now(); logger.log(`Saved triples in ${saveTriplesTime - updatedTriplesTime} ms`); diff --git a/apps/webapp/app/services/prompts/nodes.ts b/apps/webapp/app/services/prompts/nodes.ts index 42e5266..ea7f275 100644 --- a/apps/webapp/app/services/prompts/nodes.ts +++ b/apps/webapp/app/services/prompts/nodes.ts @@ -31,19 +31,33 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr - For pronouns that refer to named entities, extract them as separate Alias entities. - **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities. -2. **Type and Concept Entity Extraction**: +2. 
**IMPLICIT ACTOR EXTRACTION**: + - **EXPERIENCE AGENTS**: Extract the entity who performs actions, makes decisions, or has subjective experiences + - **PERSPECTIVE HOLDERS**: Extract entities behind opinions, preferences, memories, and evaluations + - **DOCUMENT ACTORS**: For personal content (journals, notes, reports), extract the implied author/creator + - **PRONOUN RESOLUTION**: Extract the entity represented by first-person pronouns in narrative content + - **ACTION SUBJECTS**: When actions are described without explicit subjects, infer and extract the acting entity + + **Detection Signals**: + - Action descriptions without explicit subjects + - Opinion/evaluation expressions + - Decision-making language + - Personal experience descriptions + - Memory/reflection statements + +3. **Type and Concept Entity Extraction**: - **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities. - **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities. - **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities. - **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships. -3. **Exclusions**: +4. **Exclusions**: - Do NOT extract entities representing relationships or actions (predicates will be handled separately). - **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements. - Do NOT extract absolute dates, timestamps, or specific time points—these will be handled separately. - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm"). -4. **Entity Name Extraction**: +5. 
**Entity Name Extraction**: - Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers - When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car" - When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities @@ -52,7 +66,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John") - **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" → "Space") -5. **Temporal and Relationship Context Extraction**: +6. **Temporal and Relationship Context Extraction**: - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years") - EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college") - EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members") diff --git a/apps/webapp/app/services/prompts/statements.ts b/apps/webapp/app/services/prompts/statements.ts index 5d956e8..db0a0d7 100644 --- a/apps/webapp/app/services/prompts/statements.ts +++ b/apps/webapp/app/services/prompts/statements.ts @@ -12,7 +12,65 @@ export const extractStatements = ( return [ { role: "system", - content: `You are a knowledge graph expert who extracts NEW factual statements from text as subject-predicate-object triples. + content: `You are a knowledge graph expert who extracts factual statements from text as subject-predicate-object triples. 
+ +## PHASE 1: FOUNDATIONAL RELATIONSHIPS (HIGHEST PRIORITY) +Extract the basic semantic backbone that answers: WHO, WHAT, WHERE, WHEN, WHY, HOW + +### 1A: ACTOR-ACTION RELATIONSHIPS +- Subject performs action: "Entity" "performed" "Action" +- Subject experiences state: "Entity" "experienced" "State" +- Subject has attribute: "Entity" "has" "Property" +- Subject creates/produces: "Entity" "created" "Object" + +### 1B: SPATIAL & HIERARCHICAL RELATIONSHIPS +- Location membership: "Entity" "located_in" "Location" +- Categorical membership: "Entity" "is_a" "Category" +- Hierarchical structure: "Entity" "part_of" "System" +- Containment: "Container" "contains" "Item" + +### 1C: TEMPORAL & SEQUENTIAL RELATIONSHIPS +- Duration facts: "Event" "lasted" "Duration" +- Sequence facts: "Event" "occurred_before" "Event" +- Temporal anchoring: "Event" "occurred_during" "Period" +- Timing: "Action" "happened_on" "Date" + +### 1D: SUBJECTIVE & EVALUATIVE RELATIONSHIPS +- Opinions: "Subject" "opinion_about" "Object" +- Preferences: "Subject" "prefers" "Object" +- Evaluations: "Subject" "rated" "Object" +- Desires: "Subject" "wants" "Object" + +## SYSTEMATIC EXTRACTION METHODOLOGY +For each entity, systematically check these common patterns: + +**Type/Category Patterns**: Entity → is_a → Type +**Ownership Patterns**: Actor → owns/controls → Resource +**Participation Patterns**: Actor → participates_in → Event +**Location Patterns**: Entity → located_in/part_of → Place +**Temporal Patterns**: Event → occurred_during → TimeFrame +**Rating/Measurement Patterns**: Subject → rated/measured → Object +**Reference Patterns**: Document → references → Entity +**Employment Patterns**: Person → works_for → Organization + +## RELATIONSHIP QUALITY HIERARCHY + +**ESSENTIAL (Extract Always)**: +- Categorical membership (is_a, type_of) +- Spatial relationships (located_in, part_of) +- Actor-action relationships (performed, experienced, created) +- Ownership/control relationships (owns, controls, 
manages) +- Employment relationships (works_for, employed_by) + +**VALUABLE (Extract When Present)**: +- Temporal sequences and durations +- Subjective opinions and evaluations +- Cross-references and citations +- Participation and attendance + +**CONTEXTUAL (Extract If Space Permits)**: +- Complex multi-hop inferences +- Implicit relationships requiring interpretation CRITICAL REQUIREMENT: - You MUST ONLY use entities from the AVAILABLE ENTITIES list as subjects and objects. @@ -30,54 +88,20 @@ RELATIONSHIP FORMATION RULES: 2. **PRIMARY-EXPANDED**: Only if the expanded entity is mentioned in the episode content 3. **EXPANDED-EXPANDED**: Avoid unless there's explicit connection in the episode -FOCUS: Create relationships that ADD VALUE to understanding the current episode, not just because entities are available. - -## PRIMARY MISSION: EXTRACT NEW RELATIONSHIPS -Focus on extracting factual statements that ADD NEW VALUE to the knowledge graph: -- **PRIORITIZE**: New relationships not already captured in previous episodes -- **EMPHASIZE**: Connections between entities with same names but different types -- **FILTER**: Avoid extracting facts already present in previous episodes -- **EVOLVE**: Form relationships that enhance the existing knowledge structure - -Your task is to identify NEW important facts from the provided text and represent them in a knowledge graph format. +Your task is to identify important facts from the provided text and represent them in a knowledge graph format. Follow these instructions: -1. **ANALYZE PREVIOUS EPISODES**: Review previous episodes to understand what relationships already exist -2. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects -3. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected -4. 
**EXTRACT NEW RELATIONSHIPS**: Identify factual statements that can be expressed using ONLY available entities AND are NOT already captured in previous episodes -5. For each NEW valid statement, provide: +1. **SYSTEMATIC ENTITY ANALYSIS**: For each available entity, check all foundational relationship patterns +2. **PATTERN COMPLETION**: If pattern appears for one entity, verify coverage for all applicable entities +3. **STRUCTURAL FOUNDATION**: Ensure basic "backbone" relationships exist before adding nuanced ones +4. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects +5. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected +6. For each valid statement, provide: - source: The subject entity (MUST be from AVAILABLE ENTITIES) - predicate: The relationship type (can be a descriptive phrase) - target: The object entity (MUST be from AVAILABLE ENTITIES) -EXTRACT NEW MEANINGFUL RELATIONSHIPS AND CHARACTERISTICS: -- Extract meaningful relationships between available entities that are NOT already captured in previous episodes -- Extract individual entity characteristics, roles, and properties as standalone facts -- Use predicates that accurately describe new relationships between entities -- Be creative but precise in identifying NEW relationships - focus on value-adding connections -- **HIGHEST PRIORITY**: Entities with identical names but different types MUST be connected with explicit relationship statements -- **MANDATORY**: When you find entities like "John (Person)" and "John (Company)", create explicit relationships such as "John" "owns" "John" or "John" "founded" "John" -- **ROLE/CHARACTERISTIC EXTRACTION**: Always extract roles, professions, titles, and key characteristics as separate statements -- Look for both explicit and implicit NEW relationships 
mentioned in the text -- **FILTER OUT**: Relationships already established in previous episodes unless they represent updates or changes -- Common relationship types include (but are not limited to): - * **Roles and professions** (e.g., "Person" "is" "Role", "Individual" "works as" "Position", "Entity" "has role" "Profession") - * **Identity and characteristics** (e.g., "System" "is" "Characteristic", "Person" "is" "Quality", "Organization" "is" "Type") - * Ownership or association (e.g., "Alice" "owns" "Restaurant") - * Participation or attendance (e.g., "Team" "participates in" "Tournament") - * Personal connections (e.g., "Sarah" "works with" "Michael") - * Aliases and alternative names (e.g., "Robert" "is also known as" "Bob") - * Locations and spatial relationships (e.g., "Office" "located in" "Building") - * Characteristics and properties (e.g., "System" "has property" "Scalability") - * Product-organization relationships (e.g., "Software" "developed by" "Company") - * Technical dependencies and usage (e.g., "Application" "uses" "Database") - * Hierarchical relationships (e.g., "Manager" "supervises" "Employee") - * Duration relationships (e.g., "Caroline" "has known" "friends" [duration: "4 years"]) - * Temporal sequence relationships (e.g., "Caroline" "met" "friends" [context: "since moving"]) - * Contextual support relationships (e.g., "friends" "supported" "Caroline" [context: "during breakup"]) - ## SAME-NAME ENTITY RELATIONSHIP FORMATION When entities share identical names but have different types, CREATE explicit relationship statements: - **Person-Organization**: "John (Person)" → "owns", "founded", "works for", or "leads" → "John (Company)" @@ -100,12 +124,12 @@ EXAMPLES of correct Duration/TemporalContext usage: * DO NOT CREATE: "Caroline" "relates to" "4 years" (Duration as object) * DO NOT CREATE: "since moving" "describes" "friendship" (TemporalContext as subject) -## PREVIOUS EPISODE FILTERING -Before creating any relationship statement: -- 
**CHECK**: Review previous episodes to see if this exact relationship already exists -- **SKIP**: Do not create statements that duplicate existing relationships -- **ENHANCE**: Only create statements if they add new information or represent updates -- **FOCUS**: Prioritize completely new connections not represented in the knowledge graph +## EXTRACTION COMPLETENESS MANDATE +- **EXTRACT OBVIOUS FACTS**: Basic relationships are STRUCTURAL FOUNDATIONS, not redundant noise +- **PRIORITIZE SIMPLE OVER COMPLEX**: "X is_in Y" is more valuable than "X contextually_relates_to Y" +- **QUANTITY OVER NOVELTY**: Comprehensive coverage beats selective "interesting" facts +- **SYSTEMATIC ENUMERATION**: If pattern exists for one entity, check ALL entities for same pattern +- Only skip exact duplicate statements, not similar relationship types CRITICAL TEMPORAL INFORMATION HANDLING: - For events with specific dates/times, ALWAYS capture temporal information in statement attributes diff --git a/apps/webapp/app/services/search/rerank.ts b/apps/webapp/app/services/search/rerank.ts index c9b59cd..1a45b68 100644 --- a/apps/webapp/app/services/search/rerank.ts +++ b/apps/webapp/app/services/search/rerank.ts @@ -512,7 +512,7 @@ export async function applyCohereReranking( cohereScore: result.relevanceScore, cohereRank: index + 1, })) - .filter((result) => result.cohereScore > 0.3); + .filter((result) => result.cohereScore >= 0.1); const responseTime = Date.now() - startTime; logger.info( diff --git a/apps/webapp/app/trigger/ingest/ingest-document.ts b/apps/webapp/app/trigger/ingest/ingest-document.ts index 5e6ef50..a7d7341 100644 --- a/apps/webapp/app/trigger/ingest/ingest-document.ts +++ b/apps/webapp/app/trigger/ingest/ingest-document.ts @@ -87,6 +87,18 @@ export const ingestDocumentTask = task({ documentSizeTokens: differentialDecision.documentSizeTokens, }); + // Early return for unchanged documents + if (differentialDecision.strategy === "skip_processing") { + logger.log("Document 
content unchanged, skipping processing"); + return { + success: true, + documentsProcessed: 1, + chunksProcessed: 0, + episodesCreated: 0, + entitiesExtracted: 0, + }; + } + // Step 3: Save the new document version await saveDocument(document);