mirror of https://github.com/eliasstepanik/core.git (synced 2026-01-11 16:48:27 +00:00)
Feat: clustering fact statements
This commit is contained in:
parent 2a6acaf899
commit 29ca8b7ad3
@@ -141,6 +141,9 @@ const initializeSchema = async () => {
  await runQuery(
    "CREATE CONSTRAINT statement_uuid IF NOT EXISTS FOR (n:Statement) REQUIRE n.uuid IS UNIQUE",
  );
  await runQuery(
    "CREATE CONSTRAINT cluster_uuid IF NOT EXISTS FOR (n:Cluster) REQUIRE n.uuid IS UNIQUE",
  );

  // Create indexes for better query performance
  await runQuery(
@@ -152,9 +155,18 @@ const initializeSchema = async () => {
  await runQuery(
    "CREATE INDEX statement_invalid_at IF NOT EXISTS FOR (n:Statement) ON (n.invalidAt)",
  );
  await runQuery(
    "CREATE INDEX statement_cluster_id IF NOT EXISTS FOR (n:Statement) ON (n.clusterId)",
  );
  await runQuery(
    "CREATE INDEX entity_name IF NOT EXISTS FOR (n:Entity) ON (n.name)",
  );
  await runQuery(
    "CREATE INDEX cluster_user_id IF NOT EXISTS FOR (n:Cluster) ON (n.userId)",
  );
  await runQuery(
    "CREATE INDEX cluster_aspect_type IF NOT EXISTS FOR (n:Cluster) ON (n.aspectType)",
  );

  // Create vector indexes for semantic search (if using Neo4j 5.0+)
  await runQuery(`
40
apps/webapp/app/routes/api.v1.clusters.$clusterId.tsx
Normal file
@@ -0,0 +1,40 @@
import { json, type LoaderFunctionArgs } from "@remix-run/node";
import { ClusteringService } from "~/services/clustering.server";
import { logger } from "~/services/logger.service";
import { requireUser } from "~/services/session.server";

const clusteringService = new ClusteringService();

export async function loader({ request, params }: LoaderFunctionArgs) {
  try {
    const user = await requireUser(request);
    const { clusterId } = params;

    if (!clusterId) {
      return json(
        { success: false, error: "Cluster ID is required" },
        { status: 400 }
      );
    }

    const statements = await clusteringService.getClusterStatements(clusterId, user.id);

    return json({
      success: true,
      data: {
        clusterId,
        statements
      }
    });

  } catch (error) {
    logger.error("Error fetching cluster statements:", { error });
    return json(
      {
        success: false,
        error: error instanceof Error ? error.message : "Unknown error"
      },
      { status: 500 }
    );
  }
}
29
apps/webapp/app/routes/api.v1.clusters.drift.tsx
Normal file
@@ -0,0 +1,29 @@
import { json, type LoaderFunctionArgs } from "@remix-run/node";
import { ClusteringService } from "~/services/clustering.server";
import { logger } from "~/services/logger.service";
import { requireUser } from "~/services/session.server";

const clusteringService = new ClusteringService();

export async function loader({ request }: LoaderFunctionArgs) {
  try {
    const user = await requireUser(request);

    const driftMetrics = await clusteringService.detectClusterDrift(user.id);

    return json({
      success: true,
      data: driftMetrics
    });

  } catch (error) {
    logger.error("Error checking cluster drift:", { error });
    return json(
      {
        success: false,
        error: error instanceof Error ? error.message : "Unknown error"
      },
      { status: 500 }
    );
  }
}
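The data field here is exactly what ClusteringService.detectClusterDrift returns (defined later in clustering.server.ts). A sample response body, with made-up values for illustration only:

{
  "success": true,
  "data": {
    "driftDetected": true,
    "lowCohesionClusters": ["<cluster-uuid-1>", "<cluster-uuid-2>"],
    "avgCohesion": 0.54,
    "reason": "2 clusters have low cohesion (< 0.6)"
  }
}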
71
apps/webapp/app/routes/api.v1.clusters.tsx
Normal file
@@ -0,0 +1,71 @@
import { json, type ActionFunctionArgs, type LoaderFunctionArgs } from "@remix-run/node";
import { z } from "zod";
import { logger } from "~/services/logger.service";
import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
import { ClusteringService } from "~/services/clustering.server";

const clusteringService = new ClusteringService();

const { action, loader } = createActionApiRoute(
  {
    body: z.object({
      mode: z.enum(["auto", "incremental", "complete"]).optional().default("auto"),
      forceComplete: z.boolean().optional().default(false)
    }),
    allowJWT: true,
    authorization: {
      action: "search",
    },
    corsStrategy: "all",
  },
  async ({ body, authentication, request }) => {
    try {

      if (request.method === "POST") {
        let result;
        switch (body.mode) {
          case "incremental":
            result = await clusteringService.performIncrementalClustering(authentication.userId);
            break;
          case "complete":
            result = await clusteringService.performCompleteClustering(authentication.userId);
            break;
          case "auto":
          default:
            result = await clusteringService.performClustering(authentication.userId, body.forceComplete);
            break;
        }

        return json({
          success: true,
          data: result
        });
      } else if (request.method === "GET") {
        const clusters = await clusteringService.getClusters(authentication.userId);
        return json({
          success: true,
          data: clusters
        });
      }

      return json(
        { success: false, error: "Method not allowed" },
        { status: 405 }
      );

    } catch (error) {
      logger.error("Error in clustering action:", { error });
      return json(
        {
          success: false,
          error: error instanceof Error ? error.message : "Unknown error"
        },
        { status: 500 }
      );
    }
  },
);

export { action, loader };
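For orientation, a hypothetical client-side call sequence against these three routes. The paths follow the Remix flat-route file names above; authentication headers are omitted and would depend on how the app is deployed, so treat this strictly as a sketch:

// Kick off clustering, then read the results back.
const run = await fetch("/api/v1/clusters", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ mode: "auto", forceComplete: false }),
});
const { data: runResult } = await run.json(); // { clustersCreated, statementsProcessed, approach, ... }

// List clusters, inspect one, and check for drift.
const clusters = await (await fetch("/api/v1/clusters")).json();
const firstId = clusters.data[0]?.uuid;
const statements = await (await fetch(`/api/v1/clusters/${firstId}`)).json();
const drift = await (await fetch("/api/v1/clusters/drift")).json();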
989
apps/webapp/app/services/clustering.server.ts
Normal file
@@ -0,0 +1,989 @@
import { type CoreMessage } from "ai";
import { logger } from "./logger.service";
import { runQuery } from "~/lib/neo4j.server";
import { makeModelCall } from "~/lib/model.server";
import crypto from "crypto";

export interface ClusterNode {
  uuid: string;
  name: string;
  aspectType: "thematic" | "social" | "activity";
  description?: string;
  size: number;
  createdAt: Date;
  userId: string;
  cohesionScore?: number;
}

export interface StatementSimilarityEdge {
  sourceStatementId: string;
  targetStatementId: string;
  weight: number;
  sharedEntities: string[];
}

export interface DriftMetrics {
  intraCohesion: number;
  sizeImbalance: number;
  newEntityConcentration: number;
  shouldRecluster: boolean;
}

export class ClusteringService {
  private readonly MIN_CLUSTER_SIZE = 10;
  private readonly LEIDEN_GAMMA = 0.7;
  private readonly LEIDEN_MAX_LEVELS = 5;
  private readonly LEIDEN_TOLERANCE = 0.001;
  private readonly COHESION_THRESHOLD = 0.6;

  /**
   * Create weighted edges between Statement nodes based on shared entities
   * Can be run incrementally for new statements or as complete rebuild
   */
  async createStatementSimilarityGraph(
    userId: string,
    incremental: boolean = false,
  ): Promise<void> {
    logger.info(
      `Creating statement similarity graph for clustering (${incremental ? "incremental" : "complete"})`,
    );

    const query = `
      MATCH (s1:Statement)-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]->(e:Entity)<-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]-(s2:Statement)
      WHERE s1.userId = $userId
        AND s2.userId = $userId
        AND s1.invalidAt IS NULL
        AND s2.invalidAt IS NULL
        AND id(s1) < id(s2)
      WITH s1, s2, collect(DISTINCT e.uuid) as sharedEntities
      WHERE size(sharedEntities) > 0
      MERGE (s1)-[r:SIMILAR_TO]-(s2)
      SET r.weight = size(sharedEntities) * 2,
          r.sharedEntities = sharedEntities,
          r.createdAt = datetime()
      RETURN count(r) as edgesCreated
    `;
    const result = await runQuery(query, { userId });
    const edgesCreated = result[0]?.get("edgesCreated") || 0;

    logger.info(
      `${incremental ? "Updated" : "Created"} ${edgesCreated} similarity edges between statements`,
    );
  }
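As a worked example of the weighting above (illustration only, not part of the file): two active statements that share 3 distinct entities get a single undirected SIMILAR_TO edge with weight 3 * 2 = 6, while statements with no shared entity get no edge at all. The same rule as a hypothetical TypeScript helper:

// Mirrors the Cypher rule: weight = number of shared entity uuids * 2.
function similarityWeight(entitiesA: string[], entitiesB: string[]): number {
  const b = new Set(entitiesB);
  const shared = new Set(entitiesA.filter((uuid) => b.has(uuid)));
  return shared.size * 2; // 0 means no SIMILAR_TO edge would be created
}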

  /**
   * Execute Leiden algorithm for community detection on statement similarity graph
   */
  async executeLeidenClustering(
    userId: string,
    incremental: boolean = false,
  ): Promise<void> {
    logger.info(
      `Executing Leiden clustering algorithm (${incremental ? "incremental" : "complete"})`,
    );

    // Create/update the similarity graph
    await this.createStatementSimilarityGraph(userId, incremental);

    const clusteringQuery = `
      MATCH (source:Statement) WHERE source.userId = $userId
      OPTIONAL MATCH (source)-[r:SIMILAR_TO]->(target:Statement)
      WHERE target.userId = $userId
      WITH gds.graph.project(
        'statementSimilarity_' + $userId,
        source,
        target,
        {
          relationshipProperties: r { .weight }
        },
        { undirectedRelationshipTypes: ['*'] }
      ) AS g

      CALL gds.leiden.write(
        g.graphName,
        {
          writeProperty: 'tempClusterId',
          relationshipWeightProperty: 'weight',
          gamma: $gamma,
          maxLevels: $maxLevels,
          tolerance: $tolerance
        }
      )
      YIELD communityCount

      CALL gds.graph.drop(g.graphName)
      YIELD graphName as droppedGraphName

      RETURN communityCount, g.nodeCount, g.relationshipCount
    `;

    const result = await runQuery(clusteringQuery, {
      userId,
      gamma: this.LEIDEN_GAMMA,
      maxLevels: this.LEIDEN_MAX_LEVELS,
      tolerance: this.LEIDEN_TOLERANCE,
    });

    const communityCount = result[0]?.get("communityCount") || 0;
    logger.info(`Leiden clustering found ${communityCount} communities`);

    // Filter clusters by minimum size and assign final cluster IDs
    await this.filterAndAssignClusters(userId, incremental);

    const removeRelationsQuery = `
      MATCH (s1:Statement)-[r:SIMILAR_TO]-(s2:Statement)
      WHERE s1.userId = $userId AND s2.userId = $userId
      DELETE r`;

    await runQuery(removeRelationsQuery, { userId });
  }

  /**
   * Perform incremental clustering for new statements
   */
  async performIncrementalClustering(userId: string): Promise<{
    newStatementsProcessed: number;
    newClustersCreated: number;
  }> {
    logger.info(`Starting incremental clustering for user ${userId}`);

    try {
      // Check if there are unclustered statements
      const unclusteredQuery = `
        MATCH (s:Statement)
        WHERE s.userId = $userId AND s.clusterId IS NULL AND s.invalidAt IS NULL
        RETURN count(s) as unclusteredCount
      `;

      const unclusteredResult = await runQuery(unclusteredQuery, { userId });
      const unclusteredCount =
        unclusteredResult[0]?.get("unclusteredCount") || 0;

      if (unclusteredCount === 0) {
        logger.info(
          "No unclustered statements found, skipping incremental clustering",
        );
        return {
          newStatementsProcessed: 0,
          newClustersCreated: 0,
        };
      }

      logger.info(`Found ${unclusteredCount} unclustered statements`);

      let newClustersCreated = 0;
      // Run incremental clustering on remaining statements
      await this.executeLeidenClustering(userId, true);
      await this.createClusterNodes(userId);

      // Count new clusters created
      const newClustersQuery = `
        MATCH (c:Cluster)
        WHERE c.userId = $userId AND c.createdAt > datetime() - duration('PT5M')
        RETURN count(c) as newClusters
      `;
      const newClustersResult = await runQuery(newClustersQuery, { userId });
      newClustersCreated = newClustersResult[0]?.get("newClusters") || 0;

      return {
        newStatementsProcessed: unclusteredCount,
        newClustersCreated,
      };
    } catch (error) {
      logger.error("Error in incremental clustering:", { error });
      throw error;
    }
  }

  /**
   * Perform complete clustering (for new users or full rebuilds)
   */
  async performCompleteClustering(userId: string): Promise<{
    clustersCreated: number;
    statementsProcessed: number;
  }> {
    logger.info(`Starting complete clustering for user ${userId}`);

    try {
      // Clear any existing cluster assignments
      await runQuery(
        `
        MATCH (s:Statement)
        WHERE s.userId = $userId
        REMOVE s.clusterId, s.tempClusterId
        `,
        { userId },
      );

      // Clear statement-to-statement similarity relationships
      await runQuery(
        `
        MATCH (s1:Statement)-[r:SIMILAR_TO]-(s2:Statement)
        WHERE s1.userId = $userId AND s2.userId = $userId
        DELETE r
        `,
        { userId },
      );

      // Clear existing cluster nodes
      await runQuery(
        `
        MATCH (c:Cluster)
        WHERE c.userId = $userId
        DETACH DELETE c
        `,
        { userId },
      );

      // Execute complete clustering pipeline
      await this.executeLeidenClustering(userId, false);
      await this.createClusterNodes(userId);

      // Get results
      const resultsQuery = `
        MATCH (c:Cluster) WHERE c.userId = $userId
        WITH count(c) as clusters
        MATCH (s:Statement) WHERE s.userId = $userId AND s.clusterId IS NOT NULL
        RETURN clusters, count(s) as statementsProcessed
      `;

      const results = await runQuery(resultsQuery, { userId });
      const clustersCreated = results[0]?.get("clusters") || 0;
      const statementsProcessed = results[0]?.get("statementsProcessed") || 0;

      logger.info(
        `Complete clustering finished: ${clustersCreated} clusters, ${statementsProcessed} statements processed`,
      );

      return { clustersCreated, statementsProcessed };
    } catch (error) {
      logger.error("Error in complete clustering:", { error });
      throw error;
    }
  }

  /**
   * Filter clusters by minimum size and assign final cluster IDs
   */
  private async filterAndAssignClusters(
    userId: string,
    incremental: boolean = false,
  ): Promise<void> {
    const filterQuery = `
      // Step 1: Get all temp cluster groups and their total sizes
      MATCH (s:Statement)
      WHERE s.userId = $userId AND s.tempClusterId IS NOT NULL
      WITH s.tempClusterId as tempId, collect(s) as allStatements

      // Step 2: Filter by minimum size
      WHERE size(allStatements) >= $minSize

      // Step 3: Check if any statements already have a permanent clusterId
      WITH tempId, allStatements,
           [stmt IN allStatements WHERE stmt.clusterId IS NOT NULL] as existingClustered,
           [stmt IN allStatements WHERE stmt.clusterId IS NULL] as newStatements

      // Step 4: Determine the final cluster ID
      WITH tempId, allStatements, existingClustered, newStatements,
           CASE
             WHEN size(existingClustered) > 0 THEN existingClustered[0].clusterId
             ELSE toString(randomUUID())
           END as finalClusterId

      // Step 5: Assign cluster ID to new statements (handles empty arrays gracefully)
      FOREACH (stmt IN newStatements |
        SET stmt.clusterId = finalClusterId
        REMOVE stmt.tempClusterId
      )

      // Step 6: Clean up temp IDs from existing statements
      FOREACH (existingStmt IN existingClustered |
        REMOVE existingStmt.tempClusterId
      )

      RETURN count(DISTINCT finalClusterId) as validClusters
    `;

    const result = await runQuery(filterQuery, {
      userId,
      minSize: this.MIN_CLUSTER_SIZE,
    });

    // Remove temp cluster IDs from statements that didn't meet minimum size
    await runQuery(
      `
      MATCH (s:Statement)
      WHERE s.userId = $userId AND s.tempClusterId IS NOT NULL
      REMOVE s.tempClusterId
      `,
      { userId },
    );

    const validClusters = result[0]?.get("validClusters") || 0;

    if (incremental) {
      await this.updateClusterEmbeddings(userId);
    }
    logger.info(
      `${incremental ? "Updated" : "Created"} ${validClusters} valid clusters after size filtering`,
    );
  }
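In plain terms, the query keeps only communities with at least MIN_CLUSTER_SIZE statements, then picks each surviving community's permanent ID. A hypothetical TypeScript mirror of that CASE expression, for illustration only (relying on the crypto import already at the top of this file):

// Reuse an existing permanent clusterId if any member already has one; otherwise mint a new UUID.
function finalClusterIdFor(members: Array<{ clusterId: string | null }>): string {
  const existing = members.find((m) => m.clusterId !== null)?.clusterId;
  return existing ?? crypto.randomUUID();
}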

  /**
   * Create Cluster nodes with metadata (hybrid storage approach)
   * Only creates cluster nodes for cluster IDs that don't already exist
   */
  async createClusterNodes(userId: string): Promise<void> {
    logger.info("Creating cluster metadata nodes for new clusters only");

    const query = `
      MATCH (s:Statement)
      WHERE s.userId = $userId AND s.clusterId IS NOT NULL
      WITH s.clusterId as clusterId, collect(s) as statements

      // Only process cluster IDs that don't already have a Cluster node
      WHERE NOT EXISTS {
        MATCH (existing:Cluster {uuid: clusterId, userId: $userId})
      }

      // Get representative entities for naming
      UNWIND statements as stmt
      MATCH (stmt)-[:HAS_SUBJECT]->(subj:Entity)
      MATCH (stmt)-[:HAS_PREDICATE]->(pred:Entity)
      MATCH (stmt)-[:HAS_OBJECT]->(obj:Entity)

      WITH clusterId, statements,
           collect(DISTINCT subj.name) as subjects,
           collect(DISTINCT pred.name) as predicates,
           collect(DISTINCT obj.name) as objects

      // Get top 10 most frequent entities of each type
      WITH clusterId, statements,
           apoc.coll.frequencies(subjects)[0..10] as topSubjects,
           apoc.coll.frequencies(predicates)[0..10] as topPredicates,
           apoc.coll.frequencies(objects)[0..10] as topObjects

      // Calculate cluster embedding as average of statement embeddings
      WITH clusterId, statements, topSubjects, topPredicates, topObjects,
           [stmt IN statements WHERE stmt.factEmbedding IS NOT NULL | stmt.factEmbedding] as validEmbeddings

      // Calculate average embedding (centroid)
      WITH clusterId, statements, topSubjects, topPredicates, topObjects, validEmbeddings,
           CASE
             WHEN size(validEmbeddings) > 0 THEN
               reduce(avg = [i IN range(0, size(validEmbeddings[0])-1) | 0.0],
                 embedding IN validEmbeddings |
                 [i IN range(0, size(embedding)-1) | avg[i] + embedding[i] / size(validEmbeddings)])
             ELSE null
           END as clusterEmbedding

      CREATE (c:Cluster {
        uuid: clusterId,
        size: size(statements),
        createdAt: datetime(),
        userId: $userId,
        topSubjects: [item in topSubjects | item.item],
        topPredicates: [item in topPredicates | item.item],
        topObjects: [item in topObjects | item.item],
        clusterEmbedding: clusterEmbedding,
        embeddingCount: size(validEmbeddings),
        needsNaming: true
      })

      RETURN count(c) as clustersCreated
    `;

    const result = await runQuery(query, { userId });
    const clustersCreated = result[0]?.get("clustersCreated") || 0;

    logger.info(`Created ${clustersCreated} new cluster metadata nodes`);

    // Only generate names for new clusters (those with needsNaming = true)
    if (clustersCreated > 0) {
      await this.generateClusterNames(userId);
    }
  }
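The CASE/reduce block above stores the cluster embedding as the element-wise mean (centroid) of the members' factEmbedding vectors. A minimal TypeScript sketch of the same calculation, for illustration only (hypothetical helper, not part of this file):

// Element-wise mean of a list of equal-length vectors; null when there is nothing to average.
function centroid(embeddings: number[][]): number[] | null {
  if (embeddings.length === 0) return null;
  const sum = new Array<number>(embeddings[0].length).fill(0);
  for (const embedding of embeddings) {
    for (let i = 0; i < sum.length; i++) sum[i] += embedding[i];
  }
  return sum.map((value) => value / embeddings.length);
}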

  /**
   * Calculate TF-IDF scores for a specific cluster
   *
   * Uses cluster-based document frequency (not statement-based) for optimal cluster naming:
   * - TF: How often a term appears within this specific cluster
   * - DF: How many clusters (not statements) contain this term
   * - IDF: log(total_clusters / clusters_containing_term)
   *
   * This approach identifies terms that are frequent in THIS cluster but rare across OTHER clusters,
   * making them highly distinctive for cluster naming and differentiation.
   *
   * Example: "SOL" appears in 100/100 statements in Cluster A, but only 1/10 total clusters
   * - Cluster-based IDF: log(10/1) = high distinctiveness ✓ (good for naming)
   * - Statement-based IDF: log(1000/100) = lower distinctiveness (less useful for naming)
   */
  private async calculateClusterTFIDFForCluster(
    userId: string,
    targetClusterId: string,
  ): Promise<{
    subjects: Array<{ term: string; score: number }>;
    predicates: Array<{ term: string; score: number }>;
    objects: Array<{ term: string; score: number }>;
  } | null> {
    // Get all clusters and their entity frequencies (needed for cluster-based IDF calculation)
    // We need ALL clusters to calculate how rare each term is across the cluster space
    const allClustersQuery = `
      MATCH (s:Statement)
      WHERE s.userId = $userId AND s.clusterId IS NOT NULL
      MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
      MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
      MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
      WITH s.clusterId as clusterId,
           collect(DISTINCT subj.name) as subjects,
           collect(DISTINCT pred.name) as predicates,
           collect(DISTINCT obj.name) as objects
      RETURN clusterId, subjects, predicates, objects
    `;

    const allClusters = await runQuery(allClustersQuery, {
      userId,
    });

    // Build document frequency maps from all clusters
    // DF = number of clusters that contain each term (not number of statements)
    const subjectDF = new Map<string, number>();
    const predicateDF = new Map<string, number>();
    const objectDF = new Map<string, number>();
    const totalClusters = allClusters.length;

    // Calculate cluster-based document frequencies
    // For each term, count how many different clusters it appears in
    for (const record of allClusters) {
      const subjects = (record.get("subjects") as string[]) || [];
      const predicates = (record.get("predicates") as string[]) || [];
      const objects = (record.get("objects") as string[]) || [];

      // Count unique terms per cluster (each cluster contributes max 1 to DF for each term)
      new Set(subjects).forEach((term) => {
        subjectDF.set(term, (subjectDF.get(term) || 0) + 1);
      });
      new Set(predicates).forEach((term) => {
        predicateDF.set(term, (predicateDF.get(term) || 0) + 1);
      });
      new Set(objects).forEach((term) => {
        objectDF.set(term, (objectDF.get(term) || 0) + 1);
      });
    }

    // Find the target cluster data for TF calculation
    const targetCluster = allClusters.find(
      (record) => record.get("clusterId") === targetClusterId,
    );

    if (!targetCluster) {
      return null;
    }

    const subjects = (targetCluster.get("subjects") as string[]) || [];
    const predicates = (targetCluster.get("predicates") as string[]) || [];
    const objects = (targetCluster.get("objects") as string[]) || [];

    // Calculate term frequencies within this specific cluster
    // TF = how often each term appears in this cluster's statements
    const subjectTF = new Map<string, number>();
    const predicateTF = new Map<string, number>();
    const objectTF = new Map<string, number>();

    subjects.forEach((term) => {
      subjectTF.set(term, (subjectTF.get(term) || 0) + 1);
    });
    predicates.forEach((term) => {
      predicateTF.set(term, (predicateTF.get(term) || 0) + 1);
    });
    objects.forEach((term) => {
      objectTF.set(term, (objectTF.get(term) || 0) + 1);
    });

    // Calculate TF-IDF scores using cluster-based document frequency
    // Higher scores = terms frequent in THIS cluster but rare across OTHER clusters
    const calculateTFIDF = (
      tf: Map<string, number>,
      df: Map<string, number>,
      totalTerms: number,
    ) => {
      return Array.from(tf.entries())
        .map(([term, freq]) => {
          // TF: Normalized frequency within this cluster
          const termFreq = freq / totalTerms;

          // DF: Number of clusters containing this term
          const docFreq = df.get(term) || 1;

          // IDF: Inverse document frequency (cluster-based)
          // Higher when term appears in fewer clusters
          const idf = Math.log(totalClusters / docFreq);

          // TF-IDF: Final distinctiveness score
          const tfidf = termFreq * idf;

          return { term, score: tfidf };
        })
        .sort((a, b) => b.score - a.score)
        .slice(0, 10); // Top 10 most distinctive terms
    };

    return {
      subjects: calculateTFIDF(subjectTF, subjectDF, subjects.length),
      predicates: calculateTFIDF(predicateTF, predicateDF, predicates.length),
      objects: calculateTFIDF(objectTF, objectDF, objects.length),
    };
  }
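A quick worked example of the calculateTFIDF helper above, with made-up inputs (not from real data):

// totalClusters = 10, freq = 40, totalTerms = 200, docFreq = 2
// termFreq = 40 / 200 = 0.2
// idf      = Math.log(10 / 2) ≈ 1.609
// tfidf    = 0.2 * 1.609 ≈ 0.322   (frequent here, rare elsewhere -> distinctive)
// A term present in all 10 clusters gets idf = Math.log(10 / 10) = 0, so tfidf = 0 regardless of its frequency.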

  /**
   * Generate cluster names using LLM based on TF-IDF analysis
   */
  private async generateClusterNames(userId: string): Promise<void> {
    logger.info("Generating cluster names using TF-IDF analysis");

    const getClustersQuery = `
      MATCH (c:Cluster)
      WHERE c.userId = $userId AND c.needsNaming = true
      RETURN c.uuid as clusterId, c.size as size
    `;

    const clusters = await runQuery(getClustersQuery, { userId });

    for (const record of clusters) {
      const clusterId = record.get("clusterId");
      const size = record.get("size");

      // Calculate TF-IDF only for this specific cluster
      const tfidfData = await this.calculateClusterTFIDFForCluster(
        userId,
        clusterId,
      );
      if (!tfidfData) {
        logger.warn(`No TF-IDF data found for cluster ${clusterId}`);
        continue;
      }

      const namingPrompt = this.createTFIDFClusterNamingPrompt({
        ...tfidfData,
        size,
      });

      let responseText = "";
      await makeModelCall(false, namingPrompt, (text) => {
        responseText = text;
      });

      try {
        const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
        if (outputMatch && outputMatch[1]) {
          const response = JSON.parse(outputMatch[1].trim());

          const updateQuery = `
            MATCH (c:Cluster {uuid: $clusterId})
            SET c.name = $name,
                c.description = $description,
                c.needsNaming = false
          `;

          await runQuery(updateQuery, {
            clusterId,
            name: response.name || `Cluster ${clusterId}`,
            description: response.description || null,
          });
        }
      } catch (error) {
        logger.error(`Error naming cluster ${clusterId}:`, { error });

        // Fallback naming
        await runQuery(
          `
          MATCH (c:Cluster {uuid: $clusterId})
          SET c.name = 'Cluster ' + substring($clusterId, 0, 8),
              c.needsNaming = false
          `,
          { clusterId },
        );
      }
    }
  }

  /**
   * Create prompt for unsupervised cluster naming using TF-IDF scores
   */
  private createTFIDFClusterNamingPrompt(data: {
    subjects: Array<{ term: string; score: number }>;
    predicates: Array<{ term: string; score: number }>;
    objects: Array<{ term: string; score: number }>;
    size: number;
  }): CoreMessage[] {
    const formatTerms = (terms: Array<{ term: string; score: number }>) =>
      terms.map((t) => `"${t.term}" (${t.score.toFixed(3)})`).join(", ");

    return [
      {
        role: "system",
        content: `You are an expert at analyzing semantic patterns and creating descriptive cluster names. You will receive TF-IDF scores showing the most distinctive terms for a cluster of knowledge graph statements.

TF-IDF Analysis:
- Higher scores = terms that are frequent in THIS cluster but rare in OTHER clusters
- These scores reveal what makes this cluster semantically unique
- Focus on the highest-scoring terms as they are the most distinctive

Knowledge Graph Context:
- Subjects: Who or what is performing actions
- Predicates: The relationships, actions, or connections
- Objects: Who or what is being acted upon or referenced

Naming Guidelines:
1. Create a 2-4 word descriptive name that captures the core semantic theme
2. Focus on the highest TF-IDF scoring terms - they reveal the cluster's uniqueness
3. Look for patterns across subjects, predicates, and objects together
4. Use natural language that a user would understand
5. Avoid generic terms - be specific based on the distinctive patterns

Return only a JSON object:
<output>
{
  "name": "Descriptive cluster name",
  "description": "Brief explanation of the semantic pattern based on TF-IDF analysis"
}
</output>`,
      },
      {
        role: "user",
        content: `Analyze this cluster of ${data.size} statements. The TF-IDF scores show what makes this cluster distinctive:

**Distinctive Subjects (TF-IDF):**
${formatTerms(data.subjects)}

**Distinctive Predicates (TF-IDF):**
${formatTerms(data.predicates)}

**Distinctive Objects (TF-IDF):**
${formatTerms(data.objects)}

Based on these distinctive patterns, what is the most accurate name for this semantic cluster?`,
      },
    ];
  }

  /**
   * Update cluster embeddings incrementally when new statements are added
   */
  private async updateClusterEmbeddings(userId: string): Promise<void> {
    logger.info("Updating cluster embeddings after new statements");

    const updateQuery = `
      MATCH (c:Cluster)
      WHERE c.userId = $userId

      MATCH (s:Statement {clusterId: c.uuid, userId: $userId})
      WHERE s.factEmbedding IS NOT NULL

      WITH c, collect(s.factEmbedding) as allEmbeddings
      WHERE size(allEmbeddings) > 0

      // Recalculate average embedding
      WITH c, allEmbeddings,
           reduce(avg = [i IN range(0, size(allEmbeddings[0])-1) | 0.0],
             embedding IN allEmbeddings |
             [i IN range(0, size(embedding)-1) | avg[i] + embedding[i] / size(allEmbeddings)]) as newEmbedding

      SET c.clusterEmbedding = newEmbedding,
          c.embeddingCount = size(allEmbeddings),
          c.lastEmbeddingUpdate = datetime()

      RETURN count(c) as updatedClusters
    `;

    const result = await runQuery(updateQuery, { userId });
    const updatedClusters = result[0]?.get("updatedClusters") || 0;

    logger.info(`Updated embeddings for ${updatedClusters} clusters`);
  }

  /**
   * Detect cluster drift using embedding-based cohesion analysis
   */
  async detectClusterDrift(userId: string): Promise<{
    driftDetected: boolean;
    lowCohesionClusters: string[];
    avgCohesion: number;
    reason: string;
  }> {
    logger.info("Detecting cluster drift using embedding cohesion analysis");

    // First update cluster embeddings to ensure they're current
    await this.updateClusterEmbeddings(userId);

    // Calculate cohesion for all clusters
    const cohesionQuery = `
      MATCH (c:Cluster)
      WHERE c.userId = $userId AND c.clusterEmbedding IS NOT NULL

      MATCH (s:Statement {clusterId: c.uuid, userId: $userId})
      WHERE s.factEmbedding IS NOT NULL

      WITH c, collect(s.factEmbedding) as statementEmbeddings, c.clusterEmbedding as clusterEmbedding
      WHERE size(statementEmbeddings) >= $minClusterSize

      // Calculate average cosine similarity for this cluster
      UNWIND statementEmbeddings as stmtEmb
      WITH c, stmtEmb, clusterEmbedding,
           reduce(dot = 0.0, i IN range(0, size(stmtEmb)-1) | dot + stmtEmb[i] * clusterEmbedding[i]) as dotProduct,
           sqrt(reduce(mag1 = 0.0, i IN range(0, size(stmtEmb)-1) | mag1 + stmtEmb[i] * stmtEmb[i])) as stmtMagnitude,
           sqrt(reduce(mag2 = 0.0, i IN range(0, size(clusterEmbedding)-1) | mag2 + clusterEmbedding[i] * clusterEmbedding[i])) as clusterMagnitude

      WITH c,
           CASE
             WHEN stmtMagnitude > 0 AND clusterMagnitude > 0
             THEN dotProduct / (stmtMagnitude * clusterMagnitude)
             ELSE 0.0
           END as cosineSimilarity

      WITH c, avg(cosineSimilarity) as clusterCohesion

      // Update cluster with cohesion score
      SET c.cohesionScore = clusterCohesion

      RETURN c.uuid as clusterId, c.size as clusterSize, clusterCohesion
      ORDER BY clusterCohesion ASC
    `;

    const cohesionResults = await runQuery(cohesionQuery, {
      userId,
      minClusterSize: this.MIN_CLUSTER_SIZE,
    });

    const clusterCohesions = cohesionResults.map((record) => ({
      clusterId: record.get("clusterId"),
      size: record.get("clusterSize"),
      cohesion: record.get("clusterCohesion") || 0.0,
    }));

    const avgCohesion =
      clusterCohesions.length > 0
        ? clusterCohesions.reduce((sum, c) => sum + c.cohesion, 0) /
          clusterCohesions.length
        : 0.0;

    const lowCohesionClusters = clusterCohesions
      .filter((c) => c.cohesion < this.COHESION_THRESHOLD)
      .map((c) => c.clusterId);

    const driftDetected =
      lowCohesionClusters.length > 0 || avgCohesion < this.COHESION_THRESHOLD;

    let reason = "";
    if (lowCohesionClusters.length > 0) {
      reason = `${lowCohesionClusters.length} clusters have low cohesion (< ${this.COHESION_THRESHOLD})`;
    } else if (avgCohesion < this.COHESION_THRESHOLD) {
      reason = `Overall average cohesion (${avgCohesion.toFixed(3)}) below threshold (${this.COHESION_THRESHOLD})`;
    }

    logger.info(
      `Drift detection completed: ${driftDetected ? "DRIFT DETECTED" : "NO DRIFT"} - ${reason || "Clusters are cohesive"}`,
    );

    return {
      driftDetected,
      lowCohesionClusters,
      avgCohesion,
      reason: reason || "Clusters are cohesive",
    };
  }
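The reduce expressions in the query above compute plain cosine similarity between each statement embedding and the cluster centroid. For reference, the same formula as a small TypeScript helper (illustration only; the service keeps this logic in Cypher):

function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let magA = 0;
  let magB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    magA += a[i] * a[i];
    magB += b[i] * b[i];
  }
  // Mirrors the Cypher CASE: zero-magnitude vectors get similarity 0 instead of NaN.
  if (magA === 0 || magB === 0) return 0;
  return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}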

  /**
   * Handle cluster evolution when drift is detected
   */
  async evolveCluster(oldClusterId: string, userId: string): Promise<string> {
    logger.info(`Evolving cluster ${oldClusterId}`);

    const newClusterId = crypto.randomUUID();

    // Create evolution relationship
    const evolutionQuery = `
      MATCH (oldCluster:Cluster {uuid: $oldClusterId})
      CREATE (newCluster:Cluster {
        uuid: $newClusterId,
        createdAt: datetime(),
        userId: $userId,
        size: 0,
        needsNaming: true
      })
      CREATE (oldCluster)-[:EVOLVED_TO {createdAt: datetime()}]->(newCluster)
      RETURN newCluster.uuid as uuid
    `;

    await runQuery(evolutionQuery, {
      oldClusterId,
      newClusterId,
      userId,
    });

    return newClusterId;
  }

  /**
   * Main clustering orchestration method - intelligently chooses between incremental and complete clustering
   */
  async performClustering(
    userId: string,
    forceComplete: boolean = false,
  ): Promise<{
    clustersCreated: number;
    statementsProcessed: number;
    driftMetrics?: DriftMetrics;
    approach: "incremental" | "complete";
  }> {
    logger.info(`Starting clustering process for user ${userId}`);

    try {
      // Check if user has any existing clusters
      const existingClustersQuery = `
        MATCH (c:Cluster)
        WHERE c.userId = $userId
        RETURN count(c) as existingClusters
      `;
      const existingResult = await runQuery(existingClustersQuery, { userId });
      const existingClusters = existingResult[0]?.get("existingClusters") || 0;

      // Check total statement count
      // const totalStatementsQuery = `
      //   MATCH (s:Statement)
      //   WHERE s.userId = $userId AND s.invalidAt IS NULL
      //   RETURN count(s) as totalStatements
      // `;
      // const totalResult = await runQuery(totalStatementsQuery, { userId });
      // const totalStatements = totalResult[0]?.get("totalStatements") || 0;

      // Determine clustering approach
      let useIncremental = existingClusters > 0 && !forceComplete;
      let driftMetrics: DriftMetrics | undefined;

      // if (
      //   !forceComplete &&
      //   existingClusters > 0 &&
      //   totalStatements >= this.MIN_CLUSTER_SIZE
      // ) {
      //   // Check for drift to decide approach
      //   driftMetrics = await this.detectClusterDrift(userId);

      //   if (!driftMetrics.shouldRecluster) {
      //     useIncremental = true;
      //     logger.info("Using incremental clustering approach");
      //   } else {
      //     logger.info("Drift detected, using complete clustering approach");
      //   }
      // } else if (totalStatements < this.MIN_CLUSTER_SIZE) {
      //   logger.info(
      //     `Insufficient statements (${totalStatements}) for clustering, minimum required: ${this.MIN_CLUSTER_SIZE}`,
      //   );
      //   return {
      //     clustersCreated: 0,
      //     statementsProcessed: 0,
      //     driftMetrics,
      //     approach: "incremental",
      //   };
      // } else {
      //   logger.info("Using complete clustering approach (new user or forced)");
      // }

      // Execute appropriate clustering strategy
      if (useIncremental) {
        const incrementalResult =
          await this.performIncrementalClustering(userId);
        return {
          clustersCreated: incrementalResult.newClustersCreated,
          statementsProcessed: incrementalResult.newStatementsProcessed,
          driftMetrics,
          approach: "incremental",
        };
      } else {
        const completeResult = await this.performCompleteClustering(userId);
        return {
          clustersCreated: completeResult.clustersCreated,
          statementsProcessed: completeResult.statementsProcessed,
          driftMetrics,
          approach: "complete",
        };
      }
    } catch (error) {
      logger.error("Error in clustering process:", { error });
      throw error;
    }
  }

  /**
   * Force complete reclustering (useful for maintenance or when drift is too high)
   */
  async forceCompleteClustering(userId: string): Promise<{
    clustersCreated: number;
    statementsProcessed: number;
  }> {
    return await this.performCompleteClustering(userId);
  }

  /**
   * Get cluster information for a user
   */
  async getClusters(userId: string): Promise<ClusterNode[]> {
    const query = `
      MATCH (c:Cluster)
      WHERE c.userId = $userId
      RETURN c
      ORDER BY c.size DESC
    `;

    const result = await runQuery(query, { userId });

    return result.map((record) => {
      const cluster = record.get("c").properties;
      return {
        uuid: cluster.uuid,
        name: cluster.name || `Cluster ${cluster.uuid.substring(0, 8)}`,
        aspectType: cluster.aspectType || "thematic",
        description: cluster.description,
        size: cluster.size || 0,
        createdAt: new Date(cluster.createdAt),
        userId: cluster.userId,
        cohesionScore: cluster.cohesionScore,
      };
    });
  }

  /**
   * Get statements in a specific cluster
   */
  async getClusterStatements(
    clusterId: string,
    userId: string,
  ): Promise<any[]> {
    const query = `
      MATCH (s:Statement)
      WHERE s.clusterId = $clusterId AND s.userId = $userId
      MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
      MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
      MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
      RETURN s, subj.name as subject, pred.name as predicate, obj.name as object
      ORDER BY s.createdAt DESC
    `;

    const result = await runQuery(query, { clusterId, userId });

    return result.map((record) => {
      const statement = record.get("s").properties;
      return {
        uuid: statement.uuid,
        fact: statement.fact,
        subject: record.get("subject"),
        predicate: record.get("predicate"),
        object: record.get("object"),
        createdAt: new Date(statement.createdAt),
        validAt: new Date(statement.validAt),
      };
    });
  }
}
@@ -8,6 +8,7 @@ import {
  type Triple,
} from "@core/types";
import { logger } from "./logger.service";
import { ClusteringService } from "./clustering.server";
import crypto from "crypto";
import {
  dedupeNodes,
@@ -53,6 +54,12 @@ import { type PrismaClient } from "@prisma/client";
const DEFAULT_EPISODE_WINDOW = 5;

export class KnowledgeGraphService {
  private clusteringService: ClusteringService;

  constructor() {
    this.clusteringService = new ClusteringService();
  }

  async getEmbedding(text: string) {
    return getEmbedding(text);
  }
@@ -188,6 +195,26 @@ export class KnowledgeGraphService {
    // Invalidate invalidated statements
    await invalidateStatements({ statementIds: invalidatedStatements });

    // Trigger incremental clustering process after successful ingestion
    if (resolvedStatements.length > 0) {
      try {
        logger.info(
          "Triggering incremental clustering process after episode ingestion",
        );
        const clusteringResult =
          await this.clusteringService.performClustering(
            params.userId,
            false,
          );
        logger.info(
          `Incremental clustering completed: ${clusteringResult.clustersCreated} clusters created, ${clusteringResult.statementsProcessed} statements processed`,
        );
      } catch (clusteringError) {
        logger.error("Error in incremental clustering process:", {
          error: clusteringError,
        });
        // Don't fail the entire ingestion if clustering fails
      }
    }

    const endTime = Date.now();
    const processingTimeMs = endTime - startTime;
    logger.log(`Processing time: ${processingTimeMs} ms`);
BIN
apps/webapp/integrations/slack/main
Executable file
Binary file not shown.
@@ -50,8 +50,8 @@ services:
    image: neo4j:5.25-community
    environment:
      - NEO4J_AUTH=${NEO4J_AUTH}
      - NEO4J_dbms_security_procedures_unrestricted=gds.*
      - NEO4J_dbms_security_procedures_allowlist=gds.*
      - NEO4J_dbms_security_procedures_unrestricted=gds.*,apoc.*
      - NEO4J_dbms_security_procedures_allowlist=gds.*,apoc.*
    ports:
      - "7474:7474"
      - "7687:7687"