Mirror of https://github.com/eliasstepanik/core.git (synced 2026-01-11 09:58:28 +00:00)

Fix: episode normalization with high complexity model (#80)

This commit is contained in:
parent 46407b0fac
commit 92ca34a02f
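At a glance, the commit makes four kinds of changes (this mirror's diff view does not preserve file paths): the Neo4j statement-query helpers stop returning the factEmbedding vector and their return types narrow to Omit<StatementNode, "factEmbedding">[]; the ingestion result counts only statements created during the current episode instead of all resolved statements; episode normalization switches from the 'low' to the 'high' model-complexity tier, which is the headline fix; and the normalization prompts are consolidated while document ingestion is serialized via concurrencyLimit: 1.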
@@ -1,5 +1,5 @@
 import { runQuery } from "~/lib/neo4j.server";
-import { type EntityNode, type EpisodicNode } from "@core/types";
+import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
 
 export async function saveEpisode(episode: EpisodicNode): Promise<string> {
   const query = `
@@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
 export async function getEpisodeStatements(params: {
   episodeUuid: string;
   userId: string;
-}) {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
     WHERE stmt.invalidAt IS NULL
@@ -326,7 +326,6 @@ export async function getEpisodeStatements(params: {
     return {
       uuid: stmt.uuid,
       fact: stmt.fact,
-      factEmbedding: stmt.factEmbedding,
       createdAt: new Date(stmt.createdAt),
       validAt: new Date(stmt.validAt),
       invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,
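The same change repeats across every statement query in this commit: the heavy factEmbedding vector is dropped from the returned objects, and the return types are narrowed so callers can see that. A minimal TypeScript sketch of the idea (the fields shown are inferred from the mapped object above; the real StatementNode lives in @core/types):

// Illustrative shape only; the actual definition is in @core/types.
interface StatementNode {
  uuid: string;
  fact: string;
  factEmbedding: number[]; // large vector, expensive to serialize and ship
  createdAt: Date;
  validAt: Date;
  invalidAt: Date | null;
}

// Omit<> removes the embedding from the declared return type, matching
// queries whose result objects no longer include stmt.factEmbedding.
type StatementWithoutEmbedding = Omit<StatementNode, "factEmbedding">;

const example: StatementWithoutEmbedding = {
  uuid: "stmt-1",
  fact: "John lives in New York",
  createdAt: new Date(),
  validAt: new Date(),
  invalidAt: null,
};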
@@ -111,7 +111,7 @@ export async function findContradictoryStatements({
   subjectId: string;
   predicateId: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
@@ -131,7 +131,6 @@ export async function findContradictoryStatements({
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
       createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
   objectId: string;
   excludePredicateId?: string;
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
     MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
@@ -186,7 +185,6 @@ export async function findStatementsWithSameSubjectObject({
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
       createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
@@ -212,7 +210,7 @@ export async function findSimilarStatements({
   threshold?: number;
   excludeIds?: string[];
   userId: string;
-}): Promise<StatementNode[]> {
+}): Promise<Omit<StatementNode, "factEmbedding">[]> {
   const query = `
     CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
     YIELD node AS statement, score
@@ -242,7 +240,6 @@ export async function findSimilarStatements({
     return {
       uuid: statement.uuid,
       fact: statement.fact,
-      factEmbedding: statement.factEmbedding,
       createdAt: new Date(statement.createdAt),
       validAt: new Date(statement.validAt),
       invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
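For orientation, findSimilarStatements is backed by Neo4j's vector index. A hedged sketch of how such a helper is typically shaped; only the CALL/YIELD lines above come from the source, while runQuery's exact signature, the userId filter, and the threshold default are assumptions:

import { runQuery } from "~/lib/neo4j.server";

// Sketch: query the 'statement_embedding' vector index for the topK nearest
// statements, then filter to the caller's user. Threshold is illustrative.
async function querySimilarStatements(
  factEmbedding: number[],
  userId: string,
  topK = 10,
  threshold = 0.85,
) {
  const cypher = `
    CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
    YIELD node AS statement, score
    WHERE statement.userId = $userId AND score >= $threshold
    RETURN statement, score
  `;
  return runQuery(cypher, { topK, factEmbedding, userId, threshold });
}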
@@ -418,10 +418,15 @@ export class KnowledgeGraphService {
     const processingTimeMs = endTime - startTime;
     logger.log(`Processing time: ${processingTimeMs} ms`);
 
+    // Count only truly new statements (exclude duplicates)
+    const newStatementsCount = updatedTriples.filter(triple =>
+      triple.statement.createdAt >= episode.createdAt
+    ).length;
+
     return {
       episodeUuid: episode.uuid,
       // nodesCreated: hydratedNodes.length,
-      statementsCreated: resolvedStatements.length,
+      statementsCreated: newStatementsCount,
       processingTimeMs,
       tokenUsage: tokenMetrics,
     };
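The replaced count matters because statement resolution can map an extracted triple onto an already-existing (duplicate) statement. Such statements keep their original, earlier createdAt, so comparing against the episode's createdAt filters them out. A self-contained sketch of that predicate (type names are illustrative):

// Illustrative types; the repo's Triple/StatementNode are richer.
type TripleLike = { statement: { createdAt: Date } };

function countNewStatements(triples: TripleLike[], episodeCreatedAt: Date): number {
  // Duplicates resolved to pre-existing statements carry an earlier
  // createdAt, so the >= comparison excludes them from the count.
  return triples.filter((t) => t.statement.createdAt >= episodeCreatedAt).length;
}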
@@ -529,6 +534,7 @@ export class KnowledgeGraphService {
       referenceTime: episode.validAt.toISOString(),
     };
 
+    console.log("proprietary model", isProprietaryModel(undefined, 'high'));
     // Statement extraction requires HIGH complexity (causal reasoning, emotional context)
     // Choose between proprietary and OSS prompts based on model type
     const messages = isProprietaryModel(undefined, 'high')
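The added console.log reads like a debug trace for the routing predicate that the next lines rely on. A self-contained sketch of the proprietary-vs-OSS prompt selection pattern; every name and the predicate's internals here are illustrative, since the source only shows the call isProprietaryModel(undefined, 'high'):

type Complexity = "low" | "high";

// Stand-in for the repo's isProprietaryModel(model, complexity); the real
// implementation presumably inspects the model configured for each tier.
function isProprietaryModel(model: string | undefined, complexity: Complexity): boolean {
  const configured = model ?? (complexity === "high" ? "gpt-4o" : "local-oss-model");
  return !configured.startsWith("local-");
}

// Proprietary models get one prompt variant, OSS models another.
const messages = isProprietaryModel(undefined, "high")
  ? [{ role: "system", content: "proprietary statement-extraction prompt" }]
  : [{ role: "system", content: "OSS statement-extraction prompt" }];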
@@ -905,7 +911,7 @@ export class KnowledgeGraphService {
     }
 
     // Step 1: Collect all potential matches for all triples at once
-    const allPotentialMatches: Map<string, StatementNode[]> = new Map();
+    const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
     const allExistingTripleData: Map<string, Triple> = new Map();
 
     // For preparing the LLM context
@@ -915,7 +921,7 @@ export class KnowledgeGraphService {
     for (const triple of triples) {
       // Track IDs of statements we've already checked to avoid duplicates
       const checkedStatementIds: string[] = [];
-      let potentialMatches: StatementNode[] = [];
+      let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
 
       // Phase 1a: Find statements with exact subject-predicate match
       // Example: "John lives_in New York" vs "John lives_in San Francisco"
@@ -965,7 +971,7 @@ export class KnowledgeGraphService {
     }
 
     // Phase 3: Check related memories for contradictory statements
-    const previousEpisodesStatements: StatementNode[] = [];
+    const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
 
     await Promise.all(
       previousEpisodes.map(async (episode) => {
@@ -1264,7 +1270,7 @@ export class KnowledgeGraphService {
       tokenMetrics.low.output += usage.completionTokens;
       tokenMetrics.low.total += usage.totalTokens;
     }
-  }, undefined, 'low');
+  }, undefined, 'high');
   let normalizedEpisodeBody = "";
   const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
   if (outputMatch && outputMatch[1]) {
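This one-word change is the fix named in the commit title: episode normalization now requests the 'high' complexity tier instead of 'low'. (Note that the counters above still accumulate into tokenMetrics.low.) The context lines show the reply being parsed by pulling the <output> block; a runnable sketch of that step, with the sample text invented for illustration:

// The regex keeps only what the model wrapped in <output>...</output>,
// discarding any surrounding reasoning text.
const responseText = "some reasoning...<output>Normalized episode text.</output>";
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
const normalizedEpisodeBody = outputMatch && outputMatch[1] ? outputMatch[1] : "";
console.log(normalizedEpisodeBody); // "Normalized episode text."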
@@ -7,42 +7,25 @@ export const normalizePrompt = (
 
 Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
 
-<smart_enrichment_process>
-Evaluate the episode and apply enrichment ONLY where it adds significant value:
-
-1. PRIMARY FACTS - always preserve the core information from the episode
-2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp
-3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below)
-4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images
-5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges
-6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections
-
-ENRICHMENT DECISION MATRIX:
-- Clear, complete statement → minimal enrichment (just temporal + attribution)
-- Unclear references → resolve with context
-- Emotional support → preserve feeling, avoid historical dumping
-- New developments → connect to ongoing narrative
-- Visual content → extract specific details as primary facts
-</smart_enrichment_process>
-
-<context_usage_decision>
-When related memories/previous episodes are provided, evaluate if they improve understanding:
-
-USE CONTEXT when current episode has:
-- Unclear pronouns ("she", "it", "they" without clear antecedent)
-- Vague references ("the agency", "the event" without definition in current episode)
-- Continuation phrases ("following up", "as we discussed")
-- Incomplete information that context clarifies
-
-IGNORE CONTEXT when current episode is:
-- Clear and self-contained ("I got a job in New York")
-- Simple emotional responses ("Thanks, that's great!")
-- Generic encouragement ("You're doing awesome!")
-- Complete statements with all necessary information
-
-DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or
-resolves ambiguity.
-</context_usage_decision>
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
+
+<enrichment_strategy>
+1. PRIMARY FACTS - Always preserve the core information from the episode
+2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
+3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
+4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
+5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
+
+When to add context from related memories:
+- Unclear pronouns ("she", "it", "they") → resolve to specific entity
+- Vague references ("the agency", "the event") → add clarifying details
+- Continuation phrases ("following up", "as we discussed") → connect to previous topic
+
+When NOT to add context:
+- Clear, self-contained statements → no enrichment needed beyond temporal
+- Emotional responses → preserve tone, avoid over-contextualization
+- Already established topics → don't repeat details mentioned earlier in conversation
+</enrichment_strategy>
 
 <temporal_resolution>
 Using episode timestamp as anchor, convert ALL relative time references:
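Net effect of this hunk: the overlapping <smart_enrichment_process> and <context_usage_decision> sections collapse into one shorter <enrichment_strategy>, and an explicit completeness requirement (capture every distinct piece of information) moves to the top of the prompt, trading the old "enrich only when high value" framing for "never drop information, enrich selectively".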
@@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
 
 Transform this document content into enriched factual statements for knowledge graph storage.
 
+CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
+
 <document_processing_approach>
 Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:
 
@@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
 
 const documentIngestionQueue = queue({
   name: "document-ingestion-queue",
-  concurrencyLimit: 5,
+  concurrencyLimit: 1,
 });
 
 // Register the Document Ingestion Trigger.dev task
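Dropping concurrencyLimit from 5 to 1 serializes document ingestion, so runs can no longer interleave their graph writes. A minimal sketch of the pattern, assuming Trigger.dev's v3 SDK; in the repo the task itself lives in ./ingest, and the payload shape here is invented:

import { queue, task } from "@trigger.dev/sdk/v3";

// One run at a time: concurrent ingestion jobs wait in the queue instead
// of racing each other on the knowledge graph.
const documentIngestionQueue = queue({
  name: "document-ingestion-queue",
  concurrencyLimit: 1,
});

export const ingestTask = task({
  id: "document-ingestion",
  queue: documentIngestionQueue,
  run: async (payload: { documentId: string; userId: string }) => {
    // ...normalize the document, extract statements, write to Neo4j...
  },
});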