mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-21 16:48:30 +00:00
Feat: cluster drift
This commit is contained in:
parent
29ca8b7ad3
commit
6afd993da7
@ -74,7 +74,7 @@ export class ClusteringService {
|
|||||||
/**
|
/**
|
||||||
* Execute Leiden algorithm for community detection on statement similarity graph
|
* Execute Leiden algorithm for community detection on statement similarity graph
|
||||||
*/
|
*/
|
||||||
async executeLeidentClustering(
|
async executeLeidenClustering(
|
||||||
userId: string,
|
userId: string,
|
||||||
incremental: boolean = false,
|
incremental: boolean = false,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
@ -173,7 +173,7 @@ export class ClusteringService {
|
|||||||
|
|
||||||
let newClustersCreated = 0;
|
let newClustersCreated = 0;
|
||||||
// Run incremental clustering on remaining statements
|
// Run incremental clustering on remaining statements
|
||||||
await this.executeLeidentClustering(userId, true);
|
await this.executeLeidenClustering(userId, true);
|
||||||
await this.createClusterNodes(userId);
|
await this.createClusterNodes(userId);
|
||||||
|
|
||||||
// Count new clusters created
|
// Count new clusters created
|
||||||
@ -185,9 +185,21 @@ export class ClusteringService {
|
|||||||
const newClustersResult = await runQuery(newClustersQuery, { userId });
|
const newClustersResult = await runQuery(newClustersQuery, { userId });
|
||||||
newClustersCreated = newClustersResult[0]?.get("newClusters") || 0;
|
newClustersCreated = newClustersResult[0]?.get("newClusters") || 0;
|
||||||
|
|
||||||
|
const drift = await this.detectClusterDrift(userId);
|
||||||
|
const newClustersCreatedDrift = 0;
|
||||||
|
if (drift.driftDetected) {
|
||||||
|
logger.info("Cluster drift detected, evolving clusters");
|
||||||
|
const { newClustersCreated: newClustersCreatedDrift, splitClusters } =
|
||||||
|
await this.handleClusterDrift(userId);
|
||||||
|
|
||||||
|
if (splitClusters.length > 0) {
|
||||||
|
logger.info("Split clusters detected, evolving clusters");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
newStatementsProcessed: unclusteredCount,
|
newStatementsProcessed: unclusteredCount,
|
||||||
newClustersCreated,
|
newClustersCreated: newClustersCreated + newClustersCreatedDrift,
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error("Error in incremental clustering:", { error });
|
logger.error("Error in incremental clustering:", { error });
|
||||||
@ -236,7 +248,7 @@ export class ClusteringService {
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Execute complete clustering pipeline
|
// Execute complete clustering pipeline
|
||||||
await this.executeLeidentClustering(userId, false);
|
await this.executeLeidenClustering(userId, false);
|
||||||
await this.createClusterNodes(userId);
|
await this.createClusterNodes(userId);
|
||||||
|
|
||||||
// Get results
|
// Get results
|
||||||
@ -795,34 +807,295 @@ Based on these distinctive patterns, what is the most accurate name for this sem
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handle cluster evolution when drift is detected
|
* Handle cluster evolution when drift is detected by splitting low-cohesion clusters
|
||||||
*/
|
*/
|
||||||
async evolveCluster(oldClusterId: string, userId: string): Promise<string> {
|
async evolveCluster(oldClusterId: string, userId: string): Promise<string[]> {
|
||||||
logger.info(`Evolving cluster ${oldClusterId}`);
|
logger.info(`Splitting cluster ${oldClusterId} due to low cohesion`);
|
||||||
|
|
||||||
const newClusterId = crypto.randomUUID();
|
try {
|
||||||
|
// Step 1: Get all statements from the low-cohesion cluster
|
||||||
|
const statementsQuery = `
|
||||||
|
MATCH (s:Statement)
|
||||||
|
WHERE s.clusterId = $oldClusterId AND s.userId = $userId
|
||||||
|
RETURN collect(s.uuid) as statementIds, count(s) as statementCount
|
||||||
|
`;
|
||||||
|
const statementsResult = await runQuery(statementsQuery, {
|
||||||
|
oldClusterId,
|
||||||
|
userId,
|
||||||
|
});
|
||||||
|
const statementIds = statementsResult[0]?.get("statementIds") || [];
|
||||||
|
const statementCount = statementsResult[0]?.get("statementCount") || 0;
|
||||||
|
|
||||||
// Create evolution relationship
|
if (statementCount < this.MIN_CLUSTER_SIZE * 2) {
|
||||||
const evolutionQuery = `
|
logger.info(
|
||||||
MATCH (oldCluster:Cluster {uuid: $oldClusterId})
|
`Cluster ${oldClusterId} too small to split (${statementCount} statements)`,
|
||||||
CREATE (newCluster:Cluster {
|
);
|
||||||
uuid: $newClusterId,
|
return [oldClusterId]; // Return original cluster if too small to split
|
||||||
createdAt: datetime(),
|
}
|
||||||
userId: $userId,
|
|
||||||
size: 0,
|
|
||||||
needsNaming: true
|
|
||||||
})
|
|
||||||
CREATE (oldCluster)-[:EVOLVED_TO {createdAt: datetime()}]->(newCluster)
|
|
||||||
RETURN newCluster.uuid as uuid
|
|
||||||
`;
|
|
||||||
|
|
||||||
await runQuery(evolutionQuery, {
|
// Step 2: Create similarity edges within this cluster's statements
|
||||||
oldClusterId,
|
const similarityQuery = `
|
||||||
newClusterId,
|
MATCH (s1:Statement)-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]->(e:Entity)<-[:HAS_SUBJECT|HAS_PREDICATE|HAS_OBJECT]-(s2:Statement)
|
||||||
userId,
|
WHERE s1.clusterId = $oldClusterId AND s2.clusterId = $oldClusterId
|
||||||
});
|
AND s1.userId = $userId AND s2.userId = $userId
|
||||||
|
AND s1.invalidAt IS NULL AND s2.invalidAt IS NULL
|
||||||
|
AND id(s1) < id(s2)
|
||||||
|
WITH s1, s2, collect(DISTINCT e.uuid) as sharedEntities
|
||||||
|
WHERE size(sharedEntities) > 0
|
||||||
|
MERGE (s1)-[r:TEMP_SIMILAR_TO]-(s2)
|
||||||
|
SET r.weight = size(sharedEntities) * 2,
|
||||||
|
r.sharedEntities = sharedEntities
|
||||||
|
RETURN count(r) as edgesCreated
|
||||||
|
`;
|
||||||
|
await runQuery(similarityQuery, { oldClusterId, userId });
|
||||||
|
|
||||||
return newClusterId;
|
// Step 3: Run Leiden clustering on the cluster's statements
|
||||||
|
const leidenQuery = `
|
||||||
|
MATCH (source:Statement) WHERE source.userId = $userId
|
||||||
|
OPTIONAL MATCH (source)-[r:TEMP_SIMILAR_TO]->(target:Statement)
|
||||||
|
WHERE target.userId = $userId and target.clusterId = $oldClusterId
|
||||||
|
WITH gds.graph.project(
|
||||||
|
'cluster_split_' + $userId + '_' + $oldClusterId,
|
||||||
|
source,
|
||||||
|
target,
|
||||||
|
{
|
||||||
|
relationshipProperties: r { .weight }
|
||||||
|
},
|
||||||
|
{ undirectedRelationshipTypes: ['*'] }
|
||||||
|
) AS g
|
||||||
|
|
||||||
|
CALL gds.leiden.write(
|
||||||
|
g.graphName,
|
||||||
|
{
|
||||||
|
writeProperty: 'tempClusterId',
|
||||||
|
relationshipWeightProperty: 'weight',
|
||||||
|
gamma: $gamma,
|
||||||
|
maxLevels: $maxLevels,
|
||||||
|
tolerance: $tolerance,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
YIELD communityCount
|
||||||
|
|
||||||
|
CALL gds.graph.drop(g.graphName)
|
||||||
|
YIELD graphName as droppedGraphName
|
||||||
|
|
||||||
|
RETURN communityCount, g.nodeCount, g.relationshipCount
|
||||||
|
`;
|
||||||
|
|
||||||
|
const leidenResult = await runQuery(leidenQuery, {
|
||||||
|
oldClusterId,
|
||||||
|
userId,
|
||||||
|
gamma: this.LEIDEN_GAMMA,
|
||||||
|
maxLevels: this.LEIDEN_MAX_LEVELS,
|
||||||
|
tolerance: this.LEIDEN_TOLERANCE,
|
||||||
|
});
|
||||||
|
const subClusterCount = leidenResult[0]?.get("communityCount") || 1;
|
||||||
|
|
||||||
|
// Step 4: Create new cluster IDs for sub-clusters that meet minimum size
|
||||||
|
const newClusterIds: string[] = [];
|
||||||
|
const assignClustersQuery = `
|
||||||
|
MATCH (s:Statement)
|
||||||
|
WHERE s.clusterId = $oldClusterId AND s.userId = $userId AND s.tempClusterId IS NOT NULL
|
||||||
|
WITH s.tempClusterId as tempId, collect(s) as statements
|
||||||
|
WHERE size(statements) >= $minSize
|
||||||
|
|
||||||
|
WITH tempId, statements, randomUUID() as newClusterId
|
||||||
|
|
||||||
|
FOREACH (stmt IN statements |
|
||||||
|
SET stmt.clusterId = newClusterId
|
||||||
|
REMOVE stmt.tempClusterId
|
||||||
|
)
|
||||||
|
|
||||||
|
RETURN collect(newClusterId) as newClusterIds, count(DISTINCT newClusterId) as validSubClusters
|
||||||
|
`;
|
||||||
|
|
||||||
|
const assignResult = await runQuery(assignClustersQuery, {
|
||||||
|
oldClusterId,
|
||||||
|
userId,
|
||||||
|
minSize: this.MIN_CLUSTER_SIZE,
|
||||||
|
});
|
||||||
|
const validNewClusterIds = assignResult[0]?.get("newClusterIds") || [];
|
||||||
|
newClusterIds.push(...validNewClusterIds);
|
||||||
|
|
||||||
|
// Step 5: Handle statements that didn't make it into valid sub-clusters
|
||||||
|
const orphanQuery = `
|
||||||
|
MATCH (s:Statement)
|
||||||
|
WHERE s.clusterId = $oldClusterId AND s.userId = $userId
|
||||||
|
REMOVE s.tempClusterId
|
||||||
|
|
||||||
|
// If we have valid sub-clusters, assign orphans to the largest one
|
||||||
|
WITH count(s) as orphanCount
|
||||||
|
MATCH (s2:Statement)
|
||||||
|
WHERE s2.clusterId IN $newClusterIds AND s2.userId = $userId
|
||||||
|
WITH s2.clusterId as clusterId, count(s2) as clusterSize, orphanCount
|
||||||
|
ORDER BY clusterSize DESC
|
||||||
|
LIMIT 1
|
||||||
|
|
||||||
|
MATCH (orphan:Statement)
|
||||||
|
WHERE orphan.clusterId = $oldClusterId AND orphan.userId = $userId
|
||||||
|
SET orphan.clusterId = clusterId
|
||||||
|
|
||||||
|
RETURN count(orphan) as orphansReassigned
|
||||||
|
`;
|
||||||
|
|
||||||
|
if (newClusterIds.length > 0) {
|
||||||
|
await runQuery(orphanQuery, { oldClusterId, userId, newClusterIds });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 6: Create new Cluster nodes and evolution relationships
|
||||||
|
if (newClusterIds.length > 1) {
|
||||||
|
const createClustersQuery = `
|
||||||
|
MATCH (oldCluster:Cluster {uuid: $oldClusterId})
|
||||||
|
|
||||||
|
UNWIND $newClusterIds as newClusterId
|
||||||
|
|
||||||
|
MATCH (s:Statement {clusterId: newClusterId, userId: $userId})
|
||||||
|
WITH oldCluster, newClusterId, count(s) as statementCount
|
||||||
|
|
||||||
|
CREATE (newCluster:Cluster {
|
||||||
|
uuid: newClusterId,
|
||||||
|
createdAt: datetime(),
|
||||||
|
userId: $userId,
|
||||||
|
size: statementCount,
|
||||||
|
needsNaming: true,
|
||||||
|
aspectType: oldCluster.aspectType
|
||||||
|
})
|
||||||
|
|
||||||
|
CREATE (oldCluster)-[:SPLIT_INTO {
|
||||||
|
createdAt: datetime(),
|
||||||
|
reason: 'low_cohesion',
|
||||||
|
originalSize: $originalSize,
|
||||||
|
newSize: statementCount
|
||||||
|
}]->(newCluster)
|
||||||
|
|
||||||
|
RETURN count(newCluster) as clustersCreated
|
||||||
|
`;
|
||||||
|
|
||||||
|
await runQuery(createClustersQuery, {
|
||||||
|
oldClusterId,
|
||||||
|
newClusterIds,
|
||||||
|
originalSize: statementCount,
|
||||||
|
userId,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Mark old cluster as evolved
|
||||||
|
await runQuery(
|
||||||
|
`
|
||||||
|
MATCH (c:Cluster {uuid: $oldClusterId})
|
||||||
|
SET c.evolved = true, c.evolvedAt = datetime()
|
||||||
|
`,
|
||||||
|
{ oldClusterId },
|
||||||
|
);
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
`Successfully split cluster ${oldClusterId} into ${newClusterIds.length} sub-clusters`,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
logger.info(`Cluster ${oldClusterId} could not be meaningfully split`);
|
||||||
|
newClusterIds.push(oldClusterId); // Keep original if splitting didn't work
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 7: Clean up temporary relationships
|
||||||
|
await runQuery(
|
||||||
|
`
|
||||||
|
MATCH ()-[r:TEMP_SIMILAR_TO]-()
|
||||||
|
DELETE r
|
||||||
|
`,
|
||||||
|
{},
|
||||||
|
);
|
||||||
|
|
||||||
|
return newClusterIds;
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(`Error splitting cluster ${oldClusterId}:`, { error });
|
||||||
|
// Clean up on error
|
||||||
|
await runQuery(
|
||||||
|
`
|
||||||
|
MATCH ()-[r:TEMP_SIMILAR_TO]-()
|
||||||
|
DELETE r
|
||||||
|
|
||||||
|
MATCH (s:Statement)
|
||||||
|
WHERE s.clusterId = $oldClusterId AND s.userId = $userId
|
||||||
|
REMOVE s.tempClusterId
|
||||||
|
`,
|
||||||
|
{ oldClusterId, userId },
|
||||||
|
);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle drift by splitting low-cohesion clusters
|
||||||
|
*/
|
||||||
|
async handleClusterDrift(userId: string): Promise<{
|
||||||
|
clustersProcessed: number;
|
||||||
|
newClustersCreated: number;
|
||||||
|
splitClusters: string[];
|
||||||
|
}> {
|
||||||
|
logger.info(`Handling cluster drift for user ${userId}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Detect drift and get low-cohesion clusters
|
||||||
|
const driftMetrics = await this.detectClusterDrift(userId);
|
||||||
|
|
||||||
|
if (
|
||||||
|
!driftMetrics.driftDetected ||
|
||||||
|
driftMetrics.lowCohesionClusters.length === 0
|
||||||
|
) {
|
||||||
|
logger.info("No drift detected or no clusters need splitting");
|
||||||
|
return {
|
||||||
|
clustersProcessed: 0,
|
||||||
|
newClustersCreated: 0,
|
||||||
|
splitClusters: [],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
`Found ${driftMetrics.lowCohesionClusters.length} clusters with low cohesion`,
|
||||||
|
);
|
||||||
|
|
||||||
|
let totalNewClusters = 0;
|
||||||
|
const splitClusters: string[] = [];
|
||||||
|
|
||||||
|
// Process each low-cohesion cluster
|
||||||
|
for (const clusterId of driftMetrics.lowCohesionClusters) {
|
||||||
|
try {
|
||||||
|
const newClusterIds = await this.evolveCluster(clusterId, userId);
|
||||||
|
|
||||||
|
if (newClusterIds.length > 1) {
|
||||||
|
// Cluster was successfully split
|
||||||
|
totalNewClusters += newClusterIds.length;
|
||||||
|
splitClusters.push(clusterId);
|
||||||
|
logger.info(
|
||||||
|
`Split cluster ${clusterId} into ${newClusterIds.length} sub-clusters`,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
logger.info(`Cluster ${clusterId} could not be split meaningfully`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(`Failed to split cluster ${clusterId}:`, { error });
|
||||||
|
// Continue with other clusters even if one fails
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update cluster embeddings for new clusters
|
||||||
|
if (totalNewClusters > 0) {
|
||||||
|
await this.updateClusterEmbeddings(userId);
|
||||||
|
await this.generateClusterNames(userId);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
`Drift handling completed: ${splitClusters.length} clusters split, ${totalNewClusters} new clusters created`,
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
clustersProcessed: splitClusters.length,
|
||||||
|
newClustersCreated: totalNewClusters,
|
||||||
|
splitClusters,
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
logger.error("Error handling cluster drift:", { error });
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user