diff --git a/apps/webapp/app/lib/neo4j.server.ts b/apps/webapp/app/lib/neo4j.server.ts index 24f8561..ed21625 100644 --- a/apps/webapp/app/lib/neo4j.server.ts +++ b/apps/webapp/app/lib/neo4j.server.ts @@ -312,6 +312,18 @@ const initializeSchema = async () => { await runQuery( "CREATE INDEX entity_name IF NOT EXISTS FOR (n:Entity) ON (n.name)", ); + await runQuery( + "CREATE INDEX entity_uuid IF NOT EXISTS FOR (n:Entity) ON (n.uuid)", + ); + await runQuery( + "CREATE INDEX entity_type IF NOT EXISTS FOR (n:Entity) ON (n.type)", + ); + await runQuery( + "CREATE INDEX entity_user_id IF NOT EXISTS FOR (n:Entity) ON (n.userId)", + ); + await runQuery( + "CREATE INDEX statement_user_id IF NOT EXISTS FOR (n:Statement) ON (n.userId)", + ); await runQuery( "CREATE INDEX cluster_user_id IF NOT EXISTS FOR (n:Cluster) ON (n.userId)", ); @@ -322,17 +334,17 @@ const initializeSchema = async () => { // Create vector indexes for semantic search (if using Neo4j 5.0+) await runQuery(` CREATE VECTOR INDEX entity_embedding IF NOT EXISTS FOR (n:Entity) ON n.nameEmbedding - OPTIONS {indexConfig: {\`vector.dimensions\`: 1536, \`vector.similarity_function\`: 'cosine'}} + OPTIONS {indexConfig: {\`vector.dimensions\`: 1024, \`vector.similarity_function\`: 'cosine', \`vector.hnsw.ef_construction\`: 400, \`vector.hnsw.m\`: 32}} `); await runQuery(` CREATE VECTOR INDEX statement_embedding IF NOT EXISTS FOR (n:Statement) ON n.factEmbedding - OPTIONS {indexConfig: {\`vector.dimensions\`: 1536, \`vector.similarity_function\`: 'cosine'}} + OPTIONS {indexConfig: {\`vector.dimensions\`: 1024, \`vector.similarity_function\`: 'cosine', \`vector.hnsw.ef_construction\`: 400, \`vector.hnsw.m\`: 32}} `); await runQuery(` CREATE VECTOR INDEX episode_embedding IF NOT EXISTS FOR (n:Episode) ON n.contentEmbedding - OPTIONS {indexConfig: {\`vector.dimensions\`: 1536, \`vector.similarity_function\`: 'cosine'}} + OPTIONS {indexConfig: {\`vector.dimensions\`: 1024, \`vector.similarity_function\`: 'cosine', \`vector.hnsw.ef_construction\`: 400, \`vector.hnsw.m\`: 32}} `); // Create fulltext indexes for BM25 search @@ -348,7 +360,7 @@ const initializeSchema = async () => { await runQuery(` CREATE FULLTEXT INDEX entity_name_index IF NOT EXISTS - FOR (n:Entity) ON EACH [n.name, n.description] + FOR (n:Entity) ON EACH [n.name] OPTIONS { indexConfig: { \`fulltext.analyzer\`: 'english' diff --git a/apps/webapp/app/services/graphModels/entity.ts b/apps/webapp/app/services/graphModels/entity.ts index 3fa1fc8..a11fcd1 100644 --- a/apps/webapp/app/services/graphModels/entity.ts +++ b/apps/webapp/app/services/graphModels/entity.ts @@ -83,16 +83,15 @@ export async function findSimilarEntities(params: { userId: string; }): Promise { const query = ` - MATCH (entity:Entity) - WHERE entity.nameEmbedding IS NOT NULL - WITH entity, vector.similarity.cosine($queryEmbedding, entity.nameEmbedding) AS score + CALL db.index.vector.queryNodes('entity_embedding', $topK, $queryEmbedding) + YIELD node AS entity, score WHERE score >= $threshold AND entity.userId = $userId RETURN entity, score ORDER BY score DESC `; - const result = await runQuery(query, params); + const result = await runQuery(query, { ...params, topK: params.limit }); return result.map((record) => { const entity = record.get("entity").properties; @@ -118,9 +117,8 @@ export async function findSimilarEntitiesWithSameType(params: { userId: string; }): Promise { const query = ` - MATCH (entity:Entity) - WHERE entity.nameEmbedding IS NOT NULL - WITH entity, vector.similarity.cosine($queryEmbedding, entity.nameEmbedding) AS score + CALL db.index.vector.queryNodes('entity_embedding', $topK, $queryEmbedding) + YIELD node AS entity, score WHERE score >= $threshold AND entity.userId = $userId AND entity.type = $entityType @@ -128,7 +126,7 @@ export async function findSimilarEntitiesWithSameType(params: { ORDER BY score DESC `; - const result = await runQuery(query, params); + const result = await runQuery(query, { ...params, topK: params.limit }); return result.map((record) => { const entity = record.get("entity").properties; diff --git a/apps/webapp/app/services/graphModels/episode.ts b/apps/webapp/app/services/graphModels/episode.ts index 450e33a..634ae85 100644 --- a/apps/webapp/app/services/graphModels/episode.ts +++ b/apps/webapp/app/services/graphModels/episode.ts @@ -137,16 +137,10 @@ export async function searchEpisodesByEmbedding(params: { minSimilarity?: number; }) { const query = ` - MATCH (episode:Episode) + CALL db.index.vector.queryNodes('episode_embedding', $topK, $embedding) + YIELD node AS episode, score WHERE episode.userId = $userId - AND episode.contentEmbedding IS NOT NULL - WITH episode, - CASE - WHEN size(episode.contentEmbedding) = size($embedding) - THEN vector.similarity.cosine($embedding, episode.contentEmbedding) - ELSE 0 - END AS score - WHERE score >= $minSimilarity + AND score >= $minSimilarity RETURN episode, score ORDER BY score DESC`; @@ -154,6 +148,7 @@ export async function searchEpisodesByEmbedding(params: { embedding: params.embedding, minSimilarity: params.minSimilarity, userId: params.userId, + topK: 100, }); if (!result || result.length === 0) { @@ -283,15 +278,10 @@ export async function getRelatedEpisodesEntities(params: { minSimilarity?: number; }) { const query = ` - MATCH (episode:Episode {userId: $userId}) - WHERE episode.contentEmbedding IS NOT NULL - WITH episode, - CASE - WHEN size(episode.contentEmbedding) = size($embedding) - THEN vector.similarity.cosine($embedding, episode.contentEmbedding) - ELSE 0 - END AS score - WHERE score >= $minSimilarity + CALL db.index.vector.queryNodes('episode_embedding', $topK, $embedding) + YIELD node AS episode, score + WHERE episode.userId = $userId + AND score >= $minSimilarity OPTIONAL MATCH (episode)-[:HAS_PROVENANCE]->(stmt:Statement)-[:HAS_SUBJECT|HAS_OBJECT]->(entity:Entity) WHERE entity IS NOT NULL RETURN DISTINCT entity`; @@ -300,6 +290,7 @@ export async function getRelatedEpisodesEntities(params: { embedding: params.embedding, minSimilarity: params.minSimilarity, userId: params.userId, + topK: params.limit || 100, }); return result diff --git a/apps/webapp/app/services/graphModels/statement.ts b/apps/webapp/app/services/graphModels/statement.ts index e87dd8a..bbdf0cc 100644 --- a/apps/webapp/app/services/graphModels/statement.ts +++ b/apps/webapp/app/services/graphModels/statement.ts @@ -110,11 +110,10 @@ export async function findContradictoryStatements({ userId: string; }): Promise { const query = ` - MATCH (statement:Statement) + MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId}) + MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate) WHERE statement.userId = $userId AND statement.invalidAt IS NULL - MATCH (subject:Entity)<-[:HAS_SUBJECT]-(statement)-[:HAS_PREDICATE]->(predicate:Entity) - WHERE subject.uuid = $subjectId AND predicate.uuid = $predicateId RETURN statement `; @@ -157,18 +156,21 @@ export async function findStatementsWithSameSubjectObject({ userId: string; }): Promise { const query = ` - MATCH (statement:Statement) + MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId}) + MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object) + MATCH (statement)-[:HAS_PREDICATE]->(predicate:Entity) WHERE statement.userId = $userId AND statement.invalidAt IS NULL - MATCH (subject:Entity)<-[:HAS_SUBJECT]-(statement)-[:HAS_PREDICATE]->(predicate:Entity) - MATCH (statement)-[:HAS_OBJECT]->(object:Entity) - WHERE subject.uuid = $subjectId - AND object.uuid = $objectId ${excludePredicateId ? "AND predicate.uuid <> $excludePredicateId" : ""} RETURN statement `; - const params = { subjectId, objectId, userId, ...(excludePredicateId && { excludePredicateId }) }; + const params = { + subjectId, + objectId, + userId, + ...(excludePredicateId && { excludePredicateId }), + }; const result = await runQuery(query, params); if (!result || result.length === 0) { @@ -207,13 +209,12 @@ export async function findSimilarStatements({ userId: string; }): Promise { const query = ` - MATCH (statement:Statement) + CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding) + YIELD node AS statement, score WHERE statement.userId = $userId - AND statement.invalidAt IS NULL - AND statement.factEmbedding IS NOT NULL + AND statement.invalidAt IS NULL + AND score >= $threshold ${excludeIds.length > 0 ? "AND NOT statement.uuid IN $excludeIds" : ""} - WITH statement, vector.similarity.cosine($factEmbedding, statement.factEmbedding) AS score - WHERE score >= $threshold RETURN statement, score ORDER BY score DESC `; @@ -223,6 +224,7 @@ export async function findSimilarStatements({ threshold, excludeIds, userId, + topK: 100, }); if (!result || result.length === 0) { @@ -396,17 +398,11 @@ export async function searchStatementsByEmbedding(params: { minSimilarity?: number; }) { const query = ` - MATCH (statement:Statement) + CALL db.index.vector.queryNodes('statement_embedding', $topK, $embedding) + YIELD node AS statement, score WHERE statement.userId = $userId - AND statement.invalidAt IS NULL - AND statement.factEmbedding IS NOT NULL - WITH statement, - CASE - WHEN size(statement.factEmbedding) = size($embedding) - THEN vector.similarity.cosine($embedding, statement.factEmbedding) - ELSE 0 - END AS score - WHERE score >= $minSimilarity + AND statement.invalidAt IS NULL + AND score >= $minSimilarity RETURN statement, score ORDER BY score DESC `; @@ -416,6 +412,7 @@ export async function searchStatementsByEmbedding(params: { minSimilarity: params.minSimilarity, limit: params.limit, userId: params.userId, + topK: params.limit || 100, }); if (!result || result.length === 0) { diff --git a/apps/webapp/app/services/search/utils.ts b/apps/webapp/app/services/search/utils.ts index b5e108b..056433c 100644 --- a/apps/webapp/app/services/search/utils.ts +++ b/apps/webapp/app/services/search/utils.ts @@ -131,23 +131,23 @@ export async function performVectorSearch( // 1. Search for similar statements using Neo4j vector search with provenance count const cypher = ` - MATCH (s:Statement) - WHERE - (s.userId = $userId) - ${timeframeCondition} - ${spaceCondition} - WITH s, vector.similarity.cosine(s.factEmbedding, $embedding) AS score - WHERE score > 0.7 - OPTIONAL MATCH (episode:Episode)-[:HAS_PROVENANCE]->(s) - WITH s, score, count(episode) as provenanceCount - RETURN s, score, provenanceCount - ORDER BY score DESC - `; + CALL db.index.vector.queryNodes('statement_embedding', $topk, $embedding) + YIELD node AS s, score + WHERE s.userId = $userId + AND score >= 0.7 + ${timeframeCondition.replace("AND", "AND").replace("WHERE", "AND")} + ${spaceCondition} + OPTIONAL MATCH (episode:Episode)-[:HAS_PROVENANCE]->(s) + WITH s, score, count(episode) as provenanceCount + RETURN s, score, provenanceCount + ORDER BY score DESC + `; const params = { embedding: query, userId, validAt: options.endTime.toISOString(), + topk: options.limit || 100, ...(options.startTime && { startTime: options.startTime.toISOString() }), ...(options.spaceIds.length > 0 && { spaceIds: options.spaceIds }), }; @@ -281,15 +281,13 @@ export async function extractEntitiesFromQuery( try { // Use vector similarity to find relevant entities const cypher = ` - // Match entities using vector similarity on name embeddings - MATCH (e:Entity) - WHERE e.nameEmbedding IS NOT NULL - AND e.userId = $userId - WITH e, vector.similarity.cosine(e.nameEmbedding, $embedding) AS score - WHERE score > 0.7 + // Match entities using vector index on name embeddings + CALL db.index.vector.queryNodes('entity_embedding', 3, $embedding) + YIELD node AS e, score + WHERE e.userId = $userId + AND score > 0.7 RETURN e ORDER BY score DESC - LIMIT 3 `; const params = {