Feat: clustering fact statements

This commit is contained in:
Manoj K 2025-07-31 22:11:37 +05:30 committed by Harshith Mullapudi
parent 2a6acaf899
commit 29ca8b7ad3
8 changed files with 1170 additions and 2 deletions

View File

@@ -141,6 +141,9 @@ const initializeSchema = async () => {
await runQuery(
"CREATE CONSTRAINT statement_uuid IF NOT EXISTS FOR (n:Statement) REQUIRE n.uuid IS UNIQUE",
);
await runQuery(
"CREATE CONSTRAINT cluster_uuid IF NOT EXISTS FOR (n:Cluster) REQUIRE n.uuid IS UNIQUE",
);
// Create indexes for better query performance
await runQuery(
@@ -152,9 +155,18 @@ const initializeSchema = async () => {
await runQuery(
"CREATE INDEX statement_invalid_at IF NOT EXISTS FOR (n:Statement) ON (n.invalidAt)",
);
await runQuery(
"CREATE INDEX statement_cluster_id IF NOT EXISTS FOR (n:Statement) ON (n.clusterId)",
);
await runQuery(
"CREATE INDEX entity_name IF NOT EXISTS FOR (n:Entity) ON (n.name)",
);
await runQuery(
"CREATE INDEX cluster_user_id IF NOT EXISTS FOR (n:Cluster) ON (n.userId)",
);
await runQuery(
"CREATE INDEX cluster_aspect_type IF NOT EXISTS FOR (n:Cluster) ON (n.aspectType)",
);
// Create vector indexes for semantic search (if using Neo4j 5.0+)
await runQuery(`

View File

@@ -0,0 +1,40 @@
import { json, type LoaderFunctionArgs } from "@remix-run/node";
import { ClusteringService } from "~/services/clustering.server";
import { logger } from "~/services/logger.service";
import { requireUser } from "~/services/session.server";
const clusteringService = new ClusteringService();
export async function loader({ request, params }: LoaderFunctionArgs) {
try {
const user = await requireUser(request);
const { clusterId } = params;
if (!clusterId) {
return json(
{ success: false, error: "Cluster ID is required" },
{ status: 400 }
);
}
const statements = await clusteringService.getClusterStatements(clusterId, user.id);
return json({
success: true,
data: {
clusterId,
statements
}
});
} catch (error) {
logger.error("Error fetching cluster statements:", { error });
return json(
{
success: false,
error: error instanceof Error ? error.message : "Unknown error"
},
{ status: 500 }
);
}
}
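For reference, a minimal client call for this loader (a sketch; the route path below is an assumption, since the diff viewer omits the filename):

// Hypothetical fetch — adjust the URL to wherever this route file is mounted.
const res = await fetch(`/api/v1/clusters/${clusterId}/statements`);
const payload = await res.json();
if (payload.success) {
  // Each statement carries uuid, fact, subject, predicate, object, createdAt, validAt.
  console.table(payload.data.statements);
}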

View File

@@ -0,0 +1,29 @@
import { json, type LoaderFunctionArgs } from "@remix-run/node";
import { ClusteringService } from "~/services/clustering.server";
import { logger } from "~/services/logger.service";
import { requireUser } from "~/services/session.server";
const clusteringService = new ClusteringService();
export async function loader({ request }: LoaderFunctionArgs) {
try {
const user = await requireUser(request);
const driftMetrics = await clusteringService.detectClusterDrift(user.id);
return json({
success: true,
data: driftMetrics
});
} catch (error) {
logger.error("Error checking cluster drift:", { error });
return json(
{
success: false,
error: error instanceof Error ? error.message : "Unknown error"
},
{ status: 500 }
);
}
}

View File

@@ -0,0 +1,71 @@
import { json } from "@remix-run/node";
import { z } from "zod";
import { logger } from "~/services/logger.service";
import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
import { ClusteringService } from "~/services/clustering.server";
const clusteringService = new ClusteringService();
const { action, loader } = createActionApiRoute(
{
body: z.object({
mode: z.enum(['auto', 'incremental', 'complete']).optional().default('auto'),
forceComplete: z.boolean().optional().default(false)
}),
allowJWT: true,
authorization: {
action: "search",
},
corsStrategy: "all",
},
async ({ body, authentication, request }) => {
try {
if (request.method === "POST") {
let result;
switch (body.mode) {
case 'incremental':
result = await clusteringService.performIncrementalClustering(authentication.userId);
break;
case 'complete':
result = await clusteringService.performCompleteClustering(authentication.userId);
break;
case 'auto':
default:
result = await clusteringService.performClustering(authentication.userId, body.forceComplete);
break;
}
return json({
success: true,
data: result
});
} else if (request.method === "GET") {
const clusters = await clusteringService.getClusters(authentication.userId);
return json({
success: true,
data: clusters
});
}
return json(
{ success: false, error: "Method not allowed" },
{ status: 405 }
);
} catch (error) {
logger.error("Error in clustering action:", { error });
return json(
{
success: false,
error: error instanceof Error ? error.message : "Unknown error"
},
{ status: 500 }
);
}
},
);
export { action, loader };
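For reference, the POST body follows the zod schema above; a caller might trigger a run like so (a sketch — the endpoint URL and token handling are assumptions, as neither appears in this diff):

// Hypothetical client call; the route path is not shown in the diff.
const res = await fetch("/api/v1/clustering", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${apiToken}`, // the route sets allowJWT: true
  },
  body: JSON.stringify({ mode: "auto", forceComplete: false }),
});
const { success, data } = await res.json();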

View File

@@ -0,0 +1,989 @@
import { type CoreMessage } from "ai";
import { logger } from "./logger.service";
import { runQuery } from "~/lib/neo4j.server";
import { makeModelCall } from "~/lib/model.server";
import crypto from "crypto";
export interface ClusterNode {
uuid: string;
name: string;
aspectType: "thematic" | "social" | "activity";
description?: string;
size: number;
createdAt: Date;
userId: string;
cohesionScore?: number;
}
export interface StatementSimilarityEdge {
sourceStatementId: string;
targetStatementId: string;
weight: number;
sharedEntities: string[];
}
export interface DriftMetrics {
intraCohesion: number;
sizeImbalance: number;
newEntityConcentration: number;
shouldRecluster: boolean;
}
export class ClusteringService {
private readonly MIN_CLUSTER_SIZE = 10;
private readonly LEIDEN_GAMMA = 0.7;
private readonly LEIDEN_MAX_LEVELS = 5;
private readonly LEIDEN_TOLERANCE = 0.001;
private readonly COHESION_THRESHOLD = 0.6;
/**
* Create weighted edges between Statement nodes based on shared entities
* Can be run incrementally for new statements or as complete rebuild
*/
async createStatementSimilarityGraph(
userId: string,
incremental: boolean = false,
): Promise<void> {
logger.info(
`Creating statement similarity graph for clustering (${incremental ? "incremental" : "complete"})`,
);
const query = `
MATCH (s1:Statement)-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]->(e:Entity)<-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]-(s2:Statement)
WHERE s1.userId = $userId
AND s2.userId = $userId
AND s1.invalidAt IS NULL
AND s2.invalidAt IS NULL
AND id(s1) < id(s2)
WITH s1, s2, collect(DISTINCT e.uuid) as sharedEntities
WHERE size(sharedEntities) > 0
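// e.g. two statements sharing the entities {e1, e2} get an edge of weight 4 (2 shared entities * 2)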
MERGE (s1)-[r:SIMILAR_TO]-(s2)
SET r.weight = size(sharedEntities) * 2,
r.sharedEntities = sharedEntities,
r.createdAt = datetime()
RETURN count(r) as edgesCreated
`;
const result = await runQuery(query, { userId });
const edgesCreated = result[0]?.get("edgesCreated") || 0;
logger.info(
`${incremental ? "Updated" : "Created"} ${edgesCreated} similarity edges between statements`,
);
}
/**
* Execute Leiden algorithm for community detection on statement similarity graph
*/
async executeLeidenClustering(
userId: string,
incremental: boolean = false,
): Promise<void> {
logger.info(
`Executing Leiden clustering algorithm (${incremental ? "incremental" : "complete"})`,
);
// Create/update the similarity graph
await this.createStatementSimilarityGraph(userId, incremental);
const clusteringQuery = `
MATCH (source:Statement) WHERE source.userId = $userId
OPTIONAL MATCH (source)-[r:SIMILAR_TO]->(target:Statement)
WHERE target.userId = $userId
WITH gds.graph.project(
'statementSimilarity_' + $userId,
source,
target,
{
relationshipProperties: r { .weight }
},
{ undirectedRelationshipTypes: ['*'] }
) AS g
CALL gds.leiden.write(
g.graphName,
{
writeProperty: 'tempClusterId',
relationshipWeightProperty: 'weight',
gamma: $gamma,
maxLevels: $maxLevels,
tolerance: $tolerance
}
)
YIELD communityCount
CALL gds.graph.drop(g.graphName)
YIELD graphName as droppedGraphName
RETURN communityCount, g.nodeCount, g.relationshipCount
`;
const result = await runQuery(clusteringQuery, {
userId,
gamma: this.LEIDEN_GAMMA,
maxLevels: this.LEIDEN_MAX_LEVELS,
tolerance: this.LEIDEN_TOLERANCE,
});
const communityCount = result[0]?.get("communityCount") || 0;
logger.info(`Leiden clustering found ${communityCount} communities`);
// Filter clusters by minimum size and assign final cluster IDs
await this.filterAndAssignClusters(userId, incremental);
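// The SIMILAR_TO edges are only needed during clustering; drop them so they do not accumulate between runs.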
const removeRelationsQuery = `
MATCH (s1:Statement)-[r:SIMILAR_TO]-(s2:Statement)
WHERE s1.userId = $userId AND s2.userId = $userId
DELETE r`;
await runQuery(removeRelationsQuery, { userId });
}
/**
* Perform incremental clustering for new statements
*/
async performIncrementalClustering(userId: string): Promise<{
newStatementsProcessed: number;
newClustersCreated: number;
}> {
logger.info(`Starting incremental clustering for user ${userId}`);
try {
// Check if there are unclustered statements
const unclusteredQuery = `
MATCH (s:Statement)
WHERE s.userId = $userId AND s.clusterId IS NULL AND s.invalidAt IS NULL
RETURN count(s) as unclusteredCount
`;
const unclusteredResult = await runQuery(unclusteredQuery, { userId });
const unclusteredCount =
unclusteredResult[0]?.get("unclusteredCount") || 0;
if (unclusteredCount === 0) {
logger.info(
"No unclustered statements found, skipping incremental clustering",
);
return {
newStatementsProcessed: 0,
newClustersCreated: 0,
};
}
logger.info(`Found ${unclusteredCount} unclustered statements`);
let newClustersCreated = 0;
// Run incremental clustering on remaining statements
await this.executeLeidenClustering(userId, true);
await this.createClusterNodes(userId);
// Count new clusters created
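// Heuristic: clusters created within the last five minutes are attributed to this run.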
const newClustersQuery = `
MATCH (c:Cluster)
WHERE c.userId = $userId AND c.createdAt > datetime() - duration('PT5M')
RETURN count(c) as newClusters
`;
const newClustersResult = await runQuery(newClustersQuery, { userId });
newClustersCreated = newClustersResult[0]?.get("newClusters") || 0;
return {
newStatementsProcessed: unclusteredCount,
newClustersCreated,
};
} catch (error) {
logger.error("Error in incremental clustering:", { error });
throw error;
}
}
/**
* Perform complete clustering (for new users or full rebuilds)
*/
async performCompleteClustering(userId: string): Promise<{
clustersCreated: number;
statementsProcessed: number;
}> {
logger.info(`Starting complete clustering for user ${userId}`);
try {
// Clear any existing cluster assignments
await runQuery(
`
MATCH (s:Statement)
WHERE s.userId = $userId
REMOVE s.clusterId, s.tempClusterId
`,
{ userId },
);
// Clear statement-to-statement similarity relationships
await runQuery(
`
MATCH (s1:Statement)-[r:SIMILAR_TO]-(s2:Statement)
WHERE s1.userId = $userId AND s2.userId = $userId
DELETE r
`,
{ userId },
);
// Clear existing cluster nodes
await runQuery(
`
MATCH (c:Cluster)
WHERE c.userId = $userId
DETACH DELETE c
`,
{ userId },
);
// Execute complete clustering pipeline
await this.executeLeidenClustering(userId, false);
await this.createClusterNodes(userId);
// Get results
const resultsQuery = `
MATCH (c:Cluster) WHERE c.userId = $userId
WITH count(c) as clusters
MATCH (s:Statement) WHERE s.userId = $userId AND s.clusterId IS NOT NULL
RETURN clusters, count(s) as statementsProcessed
`;
const results = await runQuery(resultsQuery, { userId });
const clustersCreated = results[0]?.get("clusters") || 0;
const statementsProcessed = results[0]?.get("statementsProcessed") || 0;
logger.info(
`Complete clustering finished: ${clustersCreated} clusters, ${statementsProcessed} statements processed`,
);
return { clustersCreated, statementsProcessed };
} catch (error) {
logger.error("Error in complete clustering:", { error });
throw error;
}
}
/**
* Filter clusters by minimum size and assign final cluster IDs
*/
private async filterAndAssignClusters(
userId: string,
incremental: boolean = false,
): Promise<void> {
const filterQuery = `
// Step 1: Get all temp cluster groups and their total sizes
MATCH (s:Statement)
WHERE s.userId = $userId AND s.tempClusterId IS NOT NULL
WITH s.tempClusterId as tempId, collect(s) as allStatements
// Step 2: Filter by minimum size
WHERE size(allStatements) >= $minSize
// Step 3: Check if any statements already have a permanent clusterId
WITH tempId, allStatements,
[stmt IN allStatements WHERE stmt.clusterId IS NOT NULL] as existingClustered,
[stmt IN allStatements WHERE stmt.clusterId IS NULL] as newStatements
// Step 4: Determine the final cluster ID
WITH tempId, allStatements, existingClustered, newStatements,
CASE
WHEN size(existingClustered) > 0 THEN existingClustered[0].clusterId
ELSE toString(randomUUID())
END as finalClusterId
// Step 5: Assign cluster ID to new statements (handles empty arrays gracefully)
FOREACH (stmt IN newStatements |
SET stmt.clusterId = finalClusterId
REMOVE stmt.tempClusterId
)
// Step 6: Clean up temp IDs from existing statements
FOREACH (existingStmt IN existingClustered |
REMOVE existingStmt.tempClusterId
)
RETURN count(DISTINCT finalClusterId) as validClusters
`;
const result = await runQuery(filterQuery, {
userId,
minSize: this.MIN_CLUSTER_SIZE,
});
// Remove temp cluster IDs from statements that didn't meet minimum size
await runQuery(
`
MATCH (s:Statement)
WHERE s.userId = $userId AND s.tempClusterId IS NOT NULL
REMOVE s.tempClusterId
`,
{ userId },
);
const validClusters = result[0]?.get("validClusters") || 0;
if (incremental) {
await this.updateClusterEmbeddings(userId);
}
logger.info(
`${incremental ? "Updated" : "Created"} ${validClusters} valid clusters after size filtering`,
);
}
/**
* Create Cluster nodes with metadata (hybrid storage approach)
* Only creates cluster nodes for cluster IDs that don't already exist
*/
async createClusterNodes(userId: string): Promise<void> {
logger.info("Creating cluster metadata nodes for new clusters only");
const query = `
MATCH (s:Statement)
WHERE s.userId = $userId AND s.clusterId IS NOT NULL
WITH s.clusterId as clusterId, collect(s) as statements
// Only process cluster IDs that don't already have a Cluster node
WHERE NOT EXISTS {
MATCH (existing:Cluster {uuid: clusterId, userId: $userId})
}
// Get representative entities for naming
UNWIND statements as stmt
MATCH (stmt)-[:HAS_SUBJECT]->(subj:Entity)
MATCH (stmt)-[:HAS_PREDICATE]->(pred:Entity)
MATCH (stmt)-[:HAS_OBJECT]->(obj:Entity)
WITH clusterId, statements,
collect(DISTINCT subj.name) as subjects,
collect(DISTINCT pred.name) as predicates,
collect(DISTINCT obj.name) as objects
// Get top 10 most frequent entities of each type
WITH clusterId, statements,
apoc.coll.frequencies(subjects)[0..10] as topSubjects,
apoc.coll.frequencies(predicates)[0..10] as topPredicates,
apoc.coll.frequencies(objects)[0..10] as topObjects
// Calculate cluster embedding as average of statement embeddings
WITH clusterId, statements, topSubjects, topPredicates, topObjects,
[stmt IN statements WHERE stmt.factEmbedding IS NOT NULL | stmt.factEmbedding] as validEmbeddings
// Calculate average embedding (centroid)
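// Each component is divided by the embedding count inside the fold, so the running sum is already the mean.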
WITH clusterId, statements, topSubjects, topPredicates, topObjects, validEmbeddings,
CASE
WHEN size(validEmbeddings) > 0 THEN
reduce(avg = [i IN range(0, size(validEmbeddings[0])-1) | 0.0],
embedding IN validEmbeddings |
[i IN range(0, size(embedding)-1) | avg[i] + embedding[i] / size(validEmbeddings)])
ELSE null
END as clusterEmbedding
CREATE (c:Cluster {
uuid: clusterId,
size: size(statements),
createdAt: datetime(),
userId: $userId,
topSubjects: [item in topSubjects | item.item],
topPredicates: [item in topPredicates | item.item],
topObjects: [item in topObjects | item.item],
clusterEmbedding: clusterEmbedding,
embeddingCount: size(validEmbeddings),
needsNaming: true
})
RETURN count(c) as clustersCreated
`;
const result = await runQuery(query, { userId });
const clustersCreated = result[0]?.get("clustersCreated") || 0;
logger.info(`Created ${clustersCreated} new cluster metadata nodes`);
// Only generate names for new clusters (those with needsNaming = true)
if (clustersCreated > 0) {
await this.generateClusterNames(userId);
}
}
/**
* Calculate TF-IDF scores for a specific cluster
*
* Uses cluster-based document frequency (not statement-based) for optimal cluster naming:
* - TF: How often a term appears within this specific cluster
* - DF: How many clusters (not statements) contain this term
* - IDF: log(total_clusters / clusters_containing_term)
*
* This approach identifies terms that are frequent in THIS cluster but rare across OTHER clusters,
* making them highly distinctive for cluster naming and differentiation.
*
* Example: "SOL" appears in 100/100 statements in Cluster A, but only 1/10 total clusters
* - Cluster-based IDF: log(10/1) = high distinctiveness (good for naming)
* - Statement-based IDF: log(1000/100) = lower distinctiveness (less useful for naming)
*/
private async calculateClusterTFIDFForCluster(
userId: string,
targetClusterId: string,
): Promise<{
subjects: Array<{ term: string; score: number }>;
predicates: Array<{ term: string; score: number }>;
objects: Array<{ term: string; score: number }>;
} | null> {
// Get all clusters and their entity frequencies (needed for cluster-based IDF calculation)
// We need ALL clusters to calculate how rare each term is across the cluster space
const allClustersQuery = `
MATCH (s:Statement)
WHERE s.userId = $userId AND s.clusterId IS NOT NULL
MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
WITH s.clusterId as clusterId,
collect(DISTINCT subj.name) as subjects,
collect(DISTINCT pred.name) as predicates,
collect(DISTINCT obj.name) as objects
RETURN clusterId, subjects, predicates, objects
`;
const allClusters = await runQuery(allClustersQuery, {
userId,
});
// Build document frequency maps from all clusters
// DF = number of clusters that contain each term (not number of statements)
const subjectDF = new Map<string, number>();
const predicateDF = new Map<string, number>();
const objectDF = new Map<string, number>();
const totalClusters = allClusters.length;
// Calculate cluster-based document frequencies
// For each term, count how many different clusters it appears in
for (const record of allClusters) {
const subjects = (record.get("subjects") as string[]) || [];
const predicates = (record.get("predicates") as string[]) || [];
const objects = (record.get("objects") as string[]) || [];
// Count unique terms per cluster (each cluster contributes max 1 to DF for each term)
new Set(subjects).forEach((term) => {
subjectDF.set(term, (subjectDF.get(term) || 0) + 1);
});
new Set(predicates).forEach((term) => {
predicateDF.set(term, (predicateDF.get(term) || 0) + 1);
});
new Set(objects).forEach((term) => {
objectDF.set(term, (objectDF.get(term) || 0) + 1);
});
}
// Find the target cluster data for TF calculation
const targetCluster = allClusters.find(
(record) => record.get("clusterId") === targetClusterId,
);
if (!targetCluster) {
return null;
}
const subjects = (targetCluster.get("subjects") as string[]) || [];
const predicates = (targetCluster.get("predicates") as string[]) || [];
const objects = (targetCluster.get("objects") as string[]) || [];
// Calculate term frequencies within this specific cluster
// TF = how often each term appears in this cluster's statements
const subjectTF = new Map<string, number>();
const predicateTF = new Map<string, number>();
const objectTF = new Map<string, number>();
subjects.forEach((term) => {
subjectTF.set(term, (subjectTF.get(term) || 0) + 1);
});
predicates.forEach((term) => {
predicateTF.set(term, (predicateTF.get(term) || 0) + 1);
});
objects.forEach((term) => {
objectTF.set(term, (objectTF.get(term) || 0) + 1);
});
// Calculate TF-IDF scores using cluster-based document frequency
// Higher scores = terms frequent in THIS cluster but rare across OTHER clusters
const calculateTFIDF = (
tf: Map<string, number>,
df: Map<string, number>,
totalTerms: number,
) => {
return Array.from(tf.entries())
.map(([term, freq]) => {
// TF: Normalized frequency within this cluster
const termFreq = freq / totalTerms;
// DF: Number of clusters containing this term
const docFreq = df.get(term) || 1;
// IDF: Inverse document frequency (cluster-based)
// Higher when term appears in fewer clusters
const idf = Math.log(totalClusters / docFreq);
// TF-IDF: Final distinctiveness score
const tfidf = termFreq * idf;
return { term, score: tfidf };
})
.sort((a, b) => b.score - a.score)
.slice(0, 10); // Top 10 most distinctive terms
};
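// Illustrative arithmetic: a term appearing 5 times among 50 cluster terms (TF = 0.1)
// that occurs in 2 of 10 clusters (IDF = ln(10/2) ≈ 1.609) scores 0.1 * 1.609 ≈ 0.161.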
return {
subjects: calculateTFIDF(subjectTF, subjectDF, subjects.length),
predicates: calculateTFIDF(predicateTF, predicateDF, predicates.length),
objects: calculateTFIDF(objectTF, objectDF, objects.length),
};
}
/**
* Generate cluster names using LLM based on TF-IDF analysis
*/
private async generateClusterNames(userId: string): Promise<void> {
logger.info("Generating cluster names using TF-IDF analysis");
const getClustersQuery = `
MATCH (c:Cluster)
WHERE c.userId = $userId AND c.needsNaming = true
RETURN c.uuid as clusterId, c.size as size
`;
const clusters = await runQuery(getClustersQuery, { userId });
for (const record of clusters) {
const clusterId = record.get("clusterId");
const size = record.get("size");
// Calculate TF-IDF only for this specific cluster
const tfidfData = await this.calculateClusterTFIDFForCluster(
userId,
clusterId,
);
if (!tfidfData) {
logger.warn(`No TF-IDF data found for cluster ${clusterId}`);
continue;
}
const namingPrompt = this.createTFIDFClusterNamingPrompt({
...tfidfData,
size,
});
let responseText = "";
await makeModelCall(false, namingPrompt, (text) => {
responseText = text;
});
try {
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
if (outputMatch && outputMatch[1]) {
const response = JSON.parse(outputMatch[1].trim());
const updateQuery = `
MATCH (c:Cluster {uuid: $clusterId})
SET c.name = $name,
c.description = $description,
c.needsNaming = false
`;
await runQuery(updateQuery, {
clusterId,
name: response.name || `Cluster ${clusterId}`,
description: response.description || null,
});
}
} catch (error) {
logger.error(`Error naming cluster ${clusterId}:`, { error });
// Fallback naming
await runQuery(
`
MATCH (c:Cluster {uuid: $clusterId})
SET c.name = 'Cluster ' + substring($clusterId, 0, 8),
c.needsNaming = false
`,
{ clusterId },
);
}
}
}
/**
* Create prompt for unsupervised cluster naming using TF-IDF scores
*/
private createTFIDFClusterNamingPrompt(data: {
subjects: Array<{ term: string; score: number }>;
predicates: Array<{ term: string; score: number }>;
objects: Array<{ term: string; score: number }>;
size: number;
}): CoreMessage[] {
const formatTerms = (terms: Array<{ term: string; score: number }>) =>
terms.map((t) => `"${t.term}" (${t.score.toFixed(3)})`).join(", ");
return [
{
role: "system",
content: `You are an expert at analyzing semantic patterns and creating descriptive cluster names. You will receive TF-IDF scores showing the most distinctive terms for a cluster of knowledge graph statements.
TF-IDF Analysis:
- Higher scores = terms that are frequent in THIS cluster but rare in OTHER clusters
- These scores reveal what makes this cluster semantically unique
- Focus on the highest-scoring terms as they are the most distinctive
Knowledge Graph Context:
- Subjects: Who or what is performing actions
- Predicates: The relationships, actions, or connections
- Objects: Who or what is being acted upon or referenced
Naming Guidelines:
1. Create a 2-4 word descriptive name that captures the core semantic theme
2. Focus on the highest TF-IDF scoring terms - they reveal the cluster's uniqueness
3. Look for patterns across subjects, predicates, and objects together
4. Use natural language that a user would understand
5. Avoid generic terms - be specific based on the distinctive patterns
Return only a JSON object:
<output>
{
"name": "Descriptive cluster name",
"description": "Brief explanation of the semantic pattern based on TF-IDF analysis"
}
</output>`,
},
{
role: "user",
content: `Analyze this cluster of ${data.size} statements. The TF-IDF scores show what makes this cluster distinctive:
**Distinctive Subjects (TF-IDF):**
${formatTerms(data.subjects)}
**Distinctive Predicates (TF-IDF):**
${formatTerms(data.predicates)}
**Distinctive Objects (TF-IDF):**
${formatTerms(data.objects)}
Based on these distinctive patterns, what is the most accurate name for this semantic cluster?`,
},
];
}
/**
* Update cluster embeddings incrementally when new statements are added
*/
private async updateClusterEmbeddings(userId: string): Promise<void> {
logger.info("Updating cluster embeddings after new statements");
const updateQuery = `
MATCH (c:Cluster)
WHERE c.userId = $userId
MATCH (s:Statement {clusterId: c.uuid, userId: $userId})
WHERE s.factEmbedding IS NOT NULL
WITH c, collect(s.factEmbedding) as allEmbeddings
WHERE size(allEmbeddings) > 0
// Recalculate average embedding
WITH c, allEmbeddings,
reduce(avg = [i IN range(0, size(allEmbeddings[0])-1) | 0.0],
embedding IN allEmbeddings |
[i IN range(0, size(embedding)-1) | avg[i] + embedding[i] / size(allEmbeddings)]) as newEmbedding
SET c.clusterEmbedding = newEmbedding,
c.embeddingCount = size(allEmbeddings),
c.lastEmbeddingUpdate = datetime()
RETURN count(c) as updatedClusters
`;
const result = await runQuery(updateQuery, { userId });
const updatedClusters = result[0]?.get("updatedClusters") || 0;
logger.info(`Updated embeddings for ${updatedClusters} clusters`);
}
/**
* Detect cluster drift using embedding-based cohesion analysis
*/
async detectClusterDrift(userId: string): Promise<{
driftDetected: boolean;
lowCohesionClusters: string[];
avgCohesion: number;
reason: string;
}> {
logger.info("Detecting cluster drift using embedding cohesion analysis");
// First update cluster embeddings to ensure they're current
await this.updateClusterEmbeddings(userId);
// Calculate cohesion for all clusters
const cohesionQuery = `
MATCH (c:Cluster)
WHERE c.userId = $userId AND c.clusterEmbedding IS NOT NULL
MATCH (s:Statement {clusterId: c.uuid, userId: $userId})
WHERE s.factEmbedding IS NOT NULL
WITH c, collect(s.factEmbedding) as statementEmbeddings, c.clusterEmbedding as clusterEmbedding
WHERE size(statementEmbeddings) >= $minClusterSize
// Calculate average cosine similarity for this cluster
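// cosine(a, b) = (a · b) / (|a| * |b|); the reduce() calls below compute the dot product and both magnitudes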
UNWIND statementEmbeddings as stmtEmb
WITH c, stmtEmb, clusterEmbedding,
reduce(dot = 0.0, i IN range(0, size(stmtEmb)-1) | dot + stmtEmb[i] * clusterEmbedding[i]) as dotProduct,
sqrt(reduce(mag1 = 0.0, i IN range(0, size(stmtEmb)-1) | mag1 + stmtEmb[i] * stmtEmb[i])) as stmtMagnitude,
sqrt(reduce(mag2 = 0.0, i IN range(0, size(clusterEmbedding)-1) | mag2 + clusterEmbedding[i] * clusterEmbedding[i])) as clusterMagnitude
WITH c,
CASE
WHEN stmtMagnitude > 0 AND clusterMagnitude > 0
THEN dotProduct / (stmtMagnitude * clusterMagnitude)
ELSE 0.0
END as cosineSimilarity
WITH c, avg(cosineSimilarity) as clusterCohesion
// Update cluster with cohesion score
SET c.cohesionScore = clusterCohesion
RETURN c.uuid as clusterId, c.size as clusterSize, clusterCohesion
ORDER BY clusterCohesion ASC
`;
const cohesionResults = await runQuery(cohesionQuery, {
userId,
minClusterSize: this.MIN_CLUSTER_SIZE,
});
const clusterCohesions = cohesionResults.map((record) => ({
clusterId: record.get("clusterId"),
size: record.get("clusterSize"),
cohesion: record.get("clusterCohesion") || 0.0,
}));
const avgCohesion =
clusterCohesions.length > 0
? clusterCohesions.reduce((sum, c) => sum + c.cohesion, 0) /
clusterCohesions.length
: 0.0;
const lowCohesionClusters = clusterCohesions
.filter((c) => c.cohesion < this.COHESION_THRESHOLD)
.map((c) => c.clusterId);
const driftDetected =
lowCohesionClusters.length > 0 || avgCohesion < this.COHESION_THRESHOLD;
let reason = "";
if (lowCohesionClusters.length > 0) {
reason = `${lowCohesionClusters.length} clusters have low cohesion (< ${this.COHESION_THRESHOLD})`;
} else if (avgCohesion < this.COHESION_THRESHOLD) {
reason = `Overall average cohesion (${avgCohesion.toFixed(3)}) below threshold (${this.COHESION_THRESHOLD})`;
}
logger.info(
`Drift detection completed: ${driftDetected ? "DRIFT DETECTED" : "NO DRIFT"} - ${reason || "Clusters are cohesive"}`,
);
return {
driftDetected,
lowCohesionClusters,
avgCohesion,
reason: reason || "Clusters are cohesive",
};
}
/**
* Handle cluster evolution when drift is detected
*/
async evolveCluster(oldClusterId: string, userId: string): Promise<string> {
logger.info(`Evolving cluster ${oldClusterId}`);
const newClusterId = crypto.randomUUID();
// Create evolution relationship
const evolutionQuery = `
MATCH (oldCluster:Cluster {uuid: $oldClusterId})
CREATE (newCluster:Cluster {
uuid: $newClusterId,
createdAt: datetime(),
userId: $userId,
size: 0,
needsNaming: true
})
CREATE (oldCluster)-[:EVOLVED_TO {createdAt: datetime()}]->(newCluster)
RETURN newCluster.uuid as uuid
`;
await runQuery(evolutionQuery, {
oldClusterId,
newClusterId,
userId,
});
return newClusterId;
}
/**
* Main clustering orchestration method - intelligently chooses between incremental and complete clustering
*/
async performClustering(
userId: string,
forceComplete: boolean = false,
): Promise<{
clustersCreated: number;
statementsProcessed: number;
driftMetrics?: DriftMetrics;
approach: "incremental" | "complete";
}> {
logger.info(`Starting clustering process for user ${userId}`);
try {
// Check if user has any existing clusters
const existingClustersQuery = `
MATCH (c:Cluster)
WHERE c.userId = $userId
RETURN count(c) as existingClusters
`;
const existingResult = await runQuery(existingClustersQuery, { userId });
const existingClusters = existingResult[0]?.get("existingClusters") || 0;
// Check total statement count
// const totalStatementsQuery = `
// MATCH (s:Statement)
// WHERE s.userId = $userId AND s.invalidAt IS NULL
// RETURN count(s) as totalStatements
// `;
// const totalResult = await runQuery(totalStatementsQuery, { userId });
// const totalStatements = totalResult[0]?.get("totalStatements") || 0;
// Determine clustering approach
let useIncremental = existingClusters > 0 && !forceComplete;
let driftMetrics: DriftMetrics | undefined;
// if (
// !forceComplete &&
// existingClusters > 0 &&
// totalStatements >= this.MIN_CLUSTER_SIZE
// ) {
// // Check for drift to decide approach
// driftMetrics = await this.detectClusterDrift(userId);
// if (!driftMetrics.shouldRecluster) {
// useIncremental = true;
// logger.info("Using incremental clustering approach");
// } else {
// logger.info("Drift detected, using complete clustering approach");
// }
// } else if (totalStatements < this.MIN_CLUSTER_SIZE) {
// logger.info(
// `Insufficient statements (${totalStatements}) for clustering, minimum required: ${this.MIN_CLUSTER_SIZE}`,
// );
// return {
// clustersCreated: 0,
// statementsProcessed: 0,
// driftMetrics,
// approach: "incremental",
// };
// } else {
// logger.info("Using complete clustering approach (new user or forced)");
// }
// Execute appropriate clustering strategy
if (useIncremental) {
const incrementalResult =
await this.performIncrementalClustering(userId);
return {
clustersCreated: incrementalResult.newClustersCreated,
statementsProcessed: incrementalResult.newStatementsProcessed,
driftMetrics,
approach: "incremental",
};
} else {
const completeResult = await this.performCompleteClustering(userId);
return {
clustersCreated: completeResult.clustersCreated,
statementsProcessed: completeResult.statementsProcessed,
driftMetrics,
approach: "complete",
};
}
} catch (error) {
logger.error("Error in clustering process:", { error });
throw error;
}
}
/**
* Force complete reclustering (useful for maintenance or when drift is too high)
*/
async forceCompleteClustering(userId: string): Promise<{
clustersCreated: number;
statementsProcessed: number;
}> {
return await this.performCompleteClustering(userId);
}
/**
* Get cluster information for a user
*/
async getClusters(userId: string): Promise<ClusterNode[]> {
const query = `
MATCH (c:Cluster)
WHERE c.userId = $userId
RETURN c
ORDER BY c.size DESC
`;
const result = await runQuery(query, { userId });
return result.map((record) => {
const cluster = record.get("c").properties;
return {
uuid: cluster.uuid,
name: cluster.name || `Cluster ${cluster.uuid.substring(0, 8)}`,
aspectType: cluster.aspectType || "thematic",
description: cluster.description,
size: cluster.size || 0,
createdAt: new Date(cluster.createdAt),
userId: cluster.userId,
cohesionScore: cluster.cohesionScore,
};
});
}
/**
* Get statements in a specific cluster
*/
async getClusterStatements(
clusterId: string,
userId: string,
): Promise<any[]> {
const query = `
MATCH (s:Statement)
WHERE s.clusterId = $clusterId AND s.userId = $userId
MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
RETURN s, subj.name as subject, pred.name as predicate, obj.name as object
ORDER BY s.createdAt DESC
`;
const result = await runQuery(query, { clusterId, userId });
return result.map((record) => {
const statement = record.get("s").properties;
return {
uuid: statement.uuid,
fact: statement.fact,
subject: record.get("subject"),
predicate: record.get("predicate"),
object: record.get("object"),
createdAt: new Date(statement.createdAt),
validAt: new Date(statement.validAt),
};
});
}
}
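Worth noting: evolveCluster is defined above but never invoked in this commit. A periodic maintenance job could tie it to drift detection along these lines (a sketch under that assumption, not part of the commit):

import { ClusteringService } from "~/services/clustering.server";

const clustering = new ClusteringService();

// Hypothetical maintenance task: evolve low-cohesion clusters, then rebuild assignments.
export async function reclusterIfDrifting(userId: string): Promise<void> {
  const drift = await clustering.detectClusterDrift(userId);
  if (!drift.driftDetected) return;
  for (const clusterId of drift.lowCohesionClusters) {
    await clustering.evolveCluster(clusterId, userId);
  }
  await clustering.performCompleteClustering(userId);
}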

View File

@@ -8,6 +8,7 @@ import {
type Triple,
} from "@core/types";
import { logger } from "./logger.service";
import { ClusteringService } from "./clustering.server";
import crypto from "crypto";
import {
dedupeNodes,
@@ -53,6 +54,12 @@ import { type PrismaClient } from "@prisma/client";
const DEFAULT_EPISODE_WINDOW = 5;
export class KnowledgeGraphService {
private clusteringService: ClusteringService;
constructor() {
this.clusteringService = new ClusteringService();
}
async getEmbedding(text: string) {
return getEmbedding(text);
}
@@ -188,6 +195,26 @@ export class KnowledgeGraphService {
// Invalidate invalidated statements
await invalidateStatements({ statementIds: invalidatedStatements });
// Trigger incremental clustering process after successful ingestion
if (resolvedStatements.length > 0) {
try {
logger.info(
"Triggering incremental clustering process after episode ingestion",
);
const clusteringResult =
await this.clusteringService.performClustering(
params.userId,
false,
);
logger.info(
`Incremental clustering completed: ${clusteringResult.clustersCreated} clusters created, ${clusteringResult.statementsProcessed} statements processed`,
);
} catch (clusteringError) {
logger.error("Error in incremental clustering process:");
// Don't fail the entire ingestion if clustering fails
}
}
const endTime = Date.now();
const processingTimeMs = endTime - startTime;
logger.log(`Processing time: ${processingTimeMs} ms`);

Binary file not shown.

View File

@@ -50,8 +50,8 @@ services:
image: neo4j:5.25-community
environment:
- NEO4J_AUTH=${NEO4J_AUTH}
- NEO4J_dbms_security_procedures_unrestricted=gds.*
- NEO4J_dbms_security_procedures_allowlist=gds.*
- NEO4J_dbms_security_procedures_unrestricted=gds.*,apoc.*
- NEO4J_dbms_security_procedures_allowlist=gds.*,apoc.*
ports:
- "7474:7474"
- "7687:7687"