From a4b6a4f984d12dc4cb4264c3b6c8ff3b79e57c9d Mon Sep 17 00:00:00 2001
From: Manoj <saimanoj58@gmail.com>
Date: Fri, 19 Sep 2025 00:11:48 +0530
Subject: [PATCH] fix: skip unchanged docs, and enhance entity extraction
 prompts

---
 .../app/services/documentDiffer.server.ts     |   4 +-
 .../app/services/graphModels/document.ts      |  72 +++++++++++
 .../app/services/knowledgeGraph.server.ts     |   6 +-
 apps/webapp/app/services/prompts/nodes.ts     |  22 +++-
 .../webapp/app/services/prompts/statements.ts | 120 +++++++++++-------
 apps/webapp/app/services/search/rerank.ts     |   2 +-
 .../app/trigger/ingest/ingest-document.ts     |  12 ++
 7 files changed, 181 insertions(+), 57 deletions(-)

diff --git a/apps/webapp/app/services/documentDiffer.server.ts b/apps/webapp/app/services/documentDiffer.server.ts
index c970c70..e7f8c04 100644
--- a/apps/webapp/app/services/documentDiffer.server.ts
+++ b/apps/webapp/app/services/documentDiffer.server.ts
@@ -4,7 +4,7 @@ import type { DocumentNode } from "@core/types";
 
 export interface DifferentialDecision {
   shouldUseDifferential: boolean;
-  strategy: "full_reingest" | "chunk_level_diff" | "new_document";
+  strategy: "full_reingest" | "chunk_level_diff" | "new_document" | "skip_processing";
   reason: string;
   changedChunkIndices: number[];
   changePercentage: number;
@@ -59,7 +59,7 @@ export class DocumentDifferentialService {
     if (existingDocument.contentHash === newChunkedDocument.contentHash) {
       return {
         shouldUseDifferential: false,
-        strategy: "full_reingest", // No changes detected
+        strategy: "skip_processing", // No changes detected
         reason: "Document content unchanged",
         changedChunkIndices: [],
         changePercentage: 0,
diff --git a/apps/webapp/app/services/graphModels/document.ts b/apps/webapp/app/services/graphModels/document.ts
index cdfbf38..3501b4d 100644
--- a/apps/webapp/app/services/graphModels/document.ts
+++ b/apps/webapp/app/services/graphModels/document.ts
@@ -248,3 +248,75 @@ export async function getDocumentVersions(
     };
   });
 }
+
+/**
+ * Delete a document together with its episodes, their statements, and any
+ * entities left orphaned (referenced by no statement outside this document).
+ * Entities still referenced by other statements are kept.
+ *
+ * NOTE(review): the relationship types used below (CONTAINS_CHUNK, CONTAINS,
+ * REFERENCES) are assumed to match what ingestion writes - confirm.
+ *
+ * @param documentUuid - uuid property of the Document node to delete
+ * @returns counts of deleted nodes; all zeros when no document matches
+ * @throws rethrows any runQuery error after logging it
+ */
+export async function deleteDocument(documentUuid: string): Promise<{
+  documentsDeleted: number;
+  episodesDeleted: number;
+  statementsDeleted: number;
+  entitiesDeleted: number;
+}> {
+  const query = `
+    MATCH (d:Document {uuid: $documentUuid})
+    OPTIONAL MATCH (d)-[:CONTAINS_CHUNK]->(e:Episode)
+    OPTIONAL MATCH (e)-[:CONTAINS]->(s:Statement)
+    OPTIONAL MATCH (s)-[:REFERENCES]->(entity:Entity)
+    WITH d, collect(DISTINCT e) as episodes, collect(DISTINCT s) as statements, collect(DISTINCT entity) as entities
+
+    // UNWIND of an empty list yields no rows, which would silently skip every
+    // delete below; use [null] so a document without entities is still removed
+    UNWIND (CASE WHEN size(entities) = 0 THEN [null] ELSE entities END) as entity
+    OPTIONAL MATCH (entity)<-[:REFERENCES]-(otherStmt:Statement)
+    WHERE NOT otherStmt IN statements
+
+    // collect() ignores nulls, so this keeps only entities with no outside reference
+    WITH d, episodes, statements,
+         collect(CASE WHEN otherStmt IS NULL THEN entity ELSE null END) as orphanedEntities
+
+    // Statements go first (breaks references to entities), then orphans, episodes, document
+    FOREACH (stmt IN statements | DETACH DELETE stmt)
+    FOREACH (entity IN orphanedEntities | DETACH DELETE entity)
+    FOREACH (episode IN episodes | DETACH DELETE episode)
+    DETACH DELETE d
+
+    RETURN
+      1 as documentsDeleted,
+      size(episodes) as episodesDeleted,
+      size(statements) as statementsDeleted,
+      size(orphanedEntities) as entitiesDeleted
+  `;
+
+  try {
+    const result = await runQuery(query, { documentUuid });
+    // No row means no document matched; record is then undefined and every
+    // count below falls back to 0
+    const record = result[0];
+    // Counts may arrive as driver Integer objects rather than JS numbers
+    // (NOTE(review): depends on driver configuration); normalise both to number
+    const toCount = (key: string): number => {
+      const raw = record?.get(key);
+      if (typeof raw?.toNumber === "function") return raw.toNumber();
+      return Number(raw ?? 0);
+    };
+    return {
+      documentsDeleted: toCount("documentsDeleted"),
+      episodesDeleted: toCount("episodesDeleted"),
+      statementsDeleted: toCount("statementsDeleted"),
+      entitiesDeleted: toCount("entitiesDeleted"),
+    };
+  } catch (error) {
+    console.error("Error deleting document:", error);
+    throw error;
+  }
+}
diff --git a/apps/webapp/app/services/knowledgeGraph.server.ts b/apps/webapp/app/services/knowledgeGraph.server.ts
index aee3f89..55ec284 100644
--- a/apps/webapp/app/services/knowledgeGraph.server.ts
+++ b/apps/webapp/app/services/knowledgeGraph.server.ts
@@ -384,8 +384,10 @@ export class KnowledgeGraphService {
         };
       }
 
-      // Save triples in parallel for better performance
-      await Promise.all(updatedTriples.map((triple) => saveTriple(triple)));
+      // Process triples sequentially to avoid race conditions
+      for (const triple of updatedTriples) {
+        await saveTriple(triple);
+      }
 
       const saveTriplesTime = Date.now();
       logger.log(`Saved triples in ${saveTriplesTime - updatedTriplesTime} ms`);
diff --git a/apps/webapp/app/services/prompts/nodes.ts b/apps/webapp/app/services/prompts/nodes.ts
index 42e5266..ea7f275 100644
--- a/apps/webapp/app/services/prompts/nodes.ts
+++ b/apps/webapp/app/services/prompts/nodes.ts
@@ -31,19 +31,33 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
    - For pronouns that refer to named entities, extract them as separate Alias entities.
    - **TYPE/CONCEPT ENTITIES**: When text contains "X is a Y" statements, extract BOTH X and Y as separate entities.
 
-2. **Type and Concept Entity Extraction**:
+2. **IMPLICIT ACTOR EXTRACTION**:
+   - **EXPERIENCE AGENTS**: Extract the entity who performs actions, makes decisions, or has subjective experiences
+   - **PERSPECTIVE HOLDERS**: Extract entities behind opinions, preferences, memories, and evaluations
+   - **DOCUMENT ACTORS**: For personal content (journals, notes, reports), extract the implied author/creator
+   - **PRONOUN RESOLUTION**: Extract the entity represented by first-person pronouns in narrative content
+   - **ACTION SUBJECTS**: When actions are described without explicit subjects, infer and extract the acting entity
+
+   **Detection Signals**:
+   - Action descriptions without explicit subjects
+   - Opinion/evaluation expressions
+   - Decision-making language
+   - Personal experience descriptions
+   - Memory/reflection statements
+
+3. **Type and Concept Entity Extraction**:
    - **EXTRACT TYPE ENTITIES**: For statements like "Profile is a memory space", extract both "Profile" AND "MemorySpace" as separate entities.
    - **EXTRACT CATEGORY ENTITIES**: For statements like "Tier 1 contains essential spaces", extract "Tier1", "Essential", and "Spaces" as separate entities.
    - **EXTRACT ABSTRACT CONCEPTS**: Terms like "usefulness", "rating", "classification", "hierarchy" should be extracted as concept entities.
    - **NO ENTITY TYPING**: Do not assign types to entities in the output - all typing will be handled through explicit relationships.
 
-3. **Exclusions**:
+4. **Exclusions**:
    - Do NOT extract entities representing relationships or actions (predicates will be handled separately).
    - **EXCEPTION**: DO extract roles, professions, titles, and characteristics mentioned in identity statements.
    - Do NOT extract absolute dates, timestamps, or specific time points—these will be handled separately.
    - Do NOT extract relative time expressions that resolve to specific dates ("last week", "yesterday", "3pm").
 
-4. **Entity Name Extraction**:
+5. **Entity Name Extraction**:
    - Extract ONLY the core entity name, WITHOUT any descriptors or qualifiers
    - When text mentions "Tesla car", extract TWO entities: "Tesla" AND "Car" 
    - When text mentions "memory space system", extract "Memory", "Space", AND "System" as separate entities
@@ -52,7 +66,7 @@ You are given a conversation context and a CURRENT EPISODE. Your task is to extr
    - **FULL NAMES**: Use complete names when available (e.g., "John Smith" not "John")
    - **CONCEPT NORMALIZATION**: Convert to singular form where appropriate ("spaces" → "Space")
 
-5. **Temporal and Relationship Context Extraction**:
+6. **Temporal and Relationship Context Extraction**:
    - EXTRACT duration expressions that describe relationship spans ("4 years", "2 months", "5 years")
    - EXTRACT temporal context that anchors relationships ("since moving", "after graduation", "during college")
    - EXTRACT relationship qualifiers ("close friends", "support system", "work team", "family members")
diff --git a/apps/webapp/app/services/prompts/statements.ts b/apps/webapp/app/services/prompts/statements.ts
index 5d956e8..db0a0d7 100644
--- a/apps/webapp/app/services/prompts/statements.ts
+++ b/apps/webapp/app/services/prompts/statements.ts
@@ -12,7 +12,65 @@ export const extractStatements = (
   return [
     {
       role: "system",
-      content: `You are a knowledge graph expert who extracts NEW factual statements from text as subject-predicate-object triples.
+      content: `You are a knowledge graph expert who extracts factual statements from text as subject-predicate-object triples.
+
+## PHASE 1: FOUNDATIONAL RELATIONSHIPS (HIGHEST PRIORITY)
+Extract the basic semantic backbone that answers: WHO, WHAT, WHERE, WHEN, WHY, HOW
+
+### 1A: ACTOR-ACTION RELATIONSHIPS
+- Subject performs action: "Entity" "performed" "Action"
+- Subject experiences state: "Entity" "experienced" "State"
+- Subject has attribute: "Entity" "has" "Property"
+- Subject creates/produces: "Entity" "created" "Object"
+
+### 1B: SPATIAL & HIERARCHICAL RELATIONSHIPS
+- Location membership: "Entity" "located_in" "Location"
+- Categorical membership: "Entity" "is_a" "Category"
+- Hierarchical structure: "Entity" "part_of" "System"
+- Containment: "Container" "contains" "Item"
+
+### 1C: TEMPORAL & SEQUENTIAL RELATIONSHIPS
+- Duration facts: "Event" "lasted" "Duration"
+- Sequence facts: "Event" "occurred_before" "Event"
+- Temporal anchoring: "Event" "occurred_during" "Period"
+- Timing: "Action" "happened_on" "Date"
+
+### 1D: SUBJECTIVE & EVALUATIVE RELATIONSHIPS
+- Opinions: "Subject" "opinion_about" "Object"
+- Preferences: "Subject" "prefers" "Object"
+- Evaluations: "Subject" "rated" "Object"
+- Desires: "Subject" "wants" "Object"
+
+## SYSTEMATIC EXTRACTION METHODOLOGY
+For each entity, systematically check these common patterns:
+
+**Type/Category Patterns**: Entity → is_a → Type
+**Ownership Patterns**: Actor → owns/controls → Resource
+**Participation Patterns**: Actor → participates_in → Event
+**Location Patterns**: Entity → located_in/part_of → Place
+**Temporal Patterns**: Event → occurred_during → TimeFrame
+**Rating/Measurement Patterns**: Subject → rated/measured → Object
+**Reference Patterns**: Document → references → Entity
+**Employment Patterns**: Person → works_for → Organization
+
+## RELATIONSHIP QUALITY HIERARCHY
+
+**ESSENTIAL (Extract Always)**:
+- Categorical membership (is_a, type_of)
+- Spatial relationships (located_in, part_of)
+- Actor-action relationships (performed, experienced, created)
+- Ownership/control relationships (owns, controls, manages)
+- Employment relationships (works_for, employed_by)
+
+**VALUABLE (Extract When Present)**:
+- Temporal sequences and durations
+- Subjective opinions and evaluations
+- Cross-references and citations
+- Participation and attendance
+
+**CONTEXTUAL (Extract If Space Permits)**:
+- Complex multi-hop inferences
+- Implicit relationships requiring interpretation
 
 CRITICAL REQUIREMENT:
 - You MUST ONLY use entities from the AVAILABLE ENTITIES list as subjects and objects.
@@ -30,54 +88,20 @@ RELATIONSHIP FORMATION RULES:
 2. **PRIMARY-EXPANDED**: Only if the expanded entity is mentioned in the episode content
 3. **EXPANDED-EXPANDED**: Avoid unless there's explicit connection in the episode
 
-FOCUS: Create relationships that ADD VALUE to understanding the current episode, not just because entities are available.
-
-## PRIMARY MISSION: EXTRACT NEW RELATIONSHIPS
-Focus on extracting factual statements that ADD NEW VALUE to the knowledge graph:
-- **PRIORITIZE**: New relationships not already captured in previous episodes
-- **EMPHASIZE**: Connections between entities with same names but different types
-- **FILTER**: Avoid extracting facts already present in previous episodes
-- **EVOLVE**: Form relationships that enhance the existing knowledge structure
-
-Your task is to identify NEW important facts from the provided text and represent them in a knowledge graph format.
+Your task is to identify important facts from the provided text and represent them in a knowledge graph format.
 
 Follow these instructions:
 
-1. **ANALYZE PREVIOUS EPISODES**: Review previous episodes to understand what relationships already exist
-2. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects
-3. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected
-4. **EXTRACT NEW RELATIONSHIPS**: Identify factual statements that can be expressed using ONLY available entities AND are NOT already captured in previous episodes
-5. For each NEW valid statement, provide:
+1. **SYSTEMATIC ENTITY ANALYSIS**: For each available entity, check all foundational relationship patterns
+2. **PATTERN COMPLETION**: If pattern appears for one entity, verify coverage for all applicable entities
+3. **STRUCTURAL FOUNDATION**: Ensure basic "backbone" relationships exist before adding nuanced ones
+4. **REVIEW AVAILABLE ENTITIES**: Carefully examine the AVAILABLE ENTITIES list - these are the ONLY entities you can use as subjects and objects
+5. **IDENTIFY SAME-NAME ENTITIES**: Look for entities with identical names but different types - these often represent natural relationships that should be explicitly connected
+6. For each valid statement, provide:
    - source: The subject entity (MUST be from AVAILABLE ENTITIES)
    - predicate: The relationship type (can be a descriptive phrase)
    - target: The object entity (MUST be from AVAILABLE ENTITIES)
 
-EXTRACT NEW MEANINGFUL RELATIONSHIPS AND CHARACTERISTICS:
-- Extract meaningful relationships between available entities that are NOT already captured in previous episodes
-- Extract individual entity characteristics, roles, and properties as standalone facts
-- Use predicates that accurately describe new relationships between entities
-- Be creative but precise in identifying NEW relationships - focus on value-adding connections
-- **HIGHEST PRIORITY**: Entities with identical names but different types MUST be connected with explicit relationship statements
-- **MANDATORY**: When you find entities like "John (Person)" and "John (Company)", create explicit relationships such as "John" "owns" "John" or "John" "founded" "John"
-- **ROLE/CHARACTERISTIC EXTRACTION**: Always extract roles, professions, titles, and key characteristics as separate statements
-- Look for both explicit and implicit NEW relationships mentioned in the text
-- **FILTER OUT**: Relationships already established in previous episodes unless they represent updates or changes
-- Common relationship types include (but are not limited to):
-  * **Roles and professions** (e.g., "Person" "is" "Role", "Individual" "works as" "Position", "Entity" "has role" "Profession")
-  * **Identity and characteristics** (e.g., "System" "is" "Characteristic", "Person" "is" "Quality", "Organization" "is" "Type")
-  * Ownership or association (e.g., "Alice" "owns" "Restaurant")
-  * Participation or attendance (e.g., "Team" "participates in" "Tournament")
-  * Personal connections (e.g., "Sarah" "works with" "Michael")
-  * Aliases and alternative names (e.g., "Robert" "is also known as" "Bob")
-  * Locations and spatial relationships (e.g., "Office" "located in" "Building")
-  * Characteristics and properties (e.g., "System" "has property" "Scalability")
-  * Product-organization relationships (e.g., "Software" "developed by" "Company")
-  * Technical dependencies and usage (e.g., "Application" "uses" "Database")
-  * Hierarchical relationships (e.g., "Manager" "supervises" "Employee")
-  * Duration relationships (e.g., "Caroline" "has known" "friends" [duration: "4 years"])
-  * Temporal sequence relationships (e.g., "Caroline" "met" "friends" [context: "since moving"])
-  * Contextual support relationships (e.g., "friends" "supported" "Caroline" [context: "during breakup"])
-
 ## SAME-NAME ENTITY RELATIONSHIP FORMATION
 When entities share identical names but have different types, CREATE explicit relationship statements:
 - **Person-Organization**: "John (Person)" → "owns", "founded", "works for", or "leads" → "John (Company)"
@@ -100,12 +124,12 @@ EXAMPLES of correct Duration/TemporalContext usage:
   * DO NOT CREATE: "Caroline" "relates to" "4 years" (Duration as object)
   * DO NOT CREATE: "since moving" "describes" "friendship" (TemporalContext as subject)
 
-## PREVIOUS EPISODE FILTERING
-Before creating any relationship statement:
-- **CHECK**: Review previous episodes to see if this exact relationship already exists
-- **SKIP**: Do not create statements that duplicate existing relationships
-- **ENHANCE**: Only create statements if they add new information or represent updates
-- **FOCUS**: Prioritize completely new connections not represented in the knowledge graph
+## EXTRACTION COMPLETENESS MANDATE
+- **EXTRACT OBVIOUS FACTS**: Basic relationships are STRUCTURAL FOUNDATIONS, not redundant noise
+- **PRIORITIZE SIMPLE OVER COMPLEX**: "X is_in Y" is more valuable than "X contextually_relates_to Y"
+- **QUANTITY OVER NOVELTY**: Comprehensive coverage beats selective "interesting" facts
+- **SYSTEMATIC ENUMERATION**: If pattern exists for one entity, check ALL entities for same pattern
+- Only skip exact duplicate statements, not similar relationship types
 
 CRITICAL TEMPORAL INFORMATION HANDLING:
 - For events with specific dates/times, ALWAYS capture temporal information in statement attributes
diff --git a/apps/webapp/app/services/search/rerank.ts b/apps/webapp/app/services/search/rerank.ts
index c9b59cd..1a45b68 100644
--- a/apps/webapp/app/services/search/rerank.ts
+++ b/apps/webapp/app/services/search/rerank.ts
@@ -512,7 +512,7 @@ export async function applyCohereReranking(
         cohereScore: result.relevanceScore,
         cohereRank: index + 1,
       }))
-      .filter((result) => result.cohereScore > 0.3);
+      .filter((result) => result.cohereScore >= 0.1);
 
     const responseTime = Date.now() - startTime;
     logger.info(
diff --git a/apps/webapp/app/trigger/ingest/ingest-document.ts b/apps/webapp/app/trigger/ingest/ingest-document.ts
index 5e6ef50..a7d7341 100644
--- a/apps/webapp/app/trigger/ingest/ingest-document.ts
+++ b/apps/webapp/app/trigger/ingest/ingest-document.ts
@@ -87,6 +87,18 @@ export const ingestDocumentTask = task({
         documentSizeTokens: differentialDecision.documentSizeTokens,
       });
 
+      // Early return for unchanged documents
+      if (differentialDecision.strategy === "skip_processing") {
+        logger.log("Document content unchanged, skipping processing");
+        return {
+          success: true,
+          documentsProcessed: 1,
+          chunksProcessed: 0,
+          episodesCreated: 0,
+          entitiesExtracted: 0,
+        };
+      }
+
       // Step 3: Save the new document version
       await saveDocument(document);