mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-10 23:48:26 +00:00
Fix: episode normalization with high complexity model
This commit is contained in:
parent
f539ad1ecd
commit
83b59b1c3b
@ -1,5 +1,5 @@
|
||||
import { runQuery } from "~/lib/neo4j.server";
|
||||
import { type EntityNode, type EpisodicNode } from "@core/types";
|
||||
import { StatementNode, type EntityNode, type EpisodicNode } from "@core/types";
|
||||
|
||||
export async function saveEpisode(episode: EpisodicNode): Promise<string> {
|
||||
const query = `
|
||||
@ -308,7 +308,7 @@ export async function getRelatedEpisodesEntities(params: {
|
||||
export async function getEpisodeStatements(params: {
|
||||
episodeUuid: string;
|
||||
userId: string;
|
||||
}) {
|
||||
}): Promise<Omit<StatementNode, "factEmbedding">[]> {
|
||||
const query = `
|
||||
MATCH (episode:Episode {uuid: $episodeUuid, userId: $userId})-[:HAS_PROVENANCE]->(stmt:Statement)
|
||||
WHERE stmt.invalidAt IS NULL
|
||||
@ -326,7 +326,6 @@ export async function getEpisodeStatements(params: {
|
||||
return {
|
||||
uuid: stmt.uuid,
|
||||
fact: stmt.fact,
|
||||
factEmbedding: stmt.factEmbedding,
|
||||
createdAt: new Date(stmt.createdAt),
|
||||
validAt: new Date(stmt.validAt),
|
||||
invalidAt: stmt.invalidAt ? new Date(stmt.invalidAt) : null,
|
||||
|
||||
@ -111,7 +111,7 @@ export async function findContradictoryStatements({
|
||||
subjectId: string;
|
||||
predicateId: string;
|
||||
userId: string;
|
||||
}): Promise<StatementNode[]> {
|
||||
}): Promise<Omit<StatementNode, "factEmbedding">[]> {
|
||||
const query = `
|
||||
MATCH (subject:Entity {uuid: $subjectId}), (predicate:Entity {uuid: $predicateId})
|
||||
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_PREDICATE]->(predicate)
|
||||
@ -131,7 +131,6 @@ export async function findContradictoryStatements({
|
||||
return {
|
||||
uuid: statement.uuid,
|
||||
fact: statement.fact,
|
||||
factEmbedding: statement.factEmbedding,
|
||||
createdAt: new Date(statement.createdAt),
|
||||
validAt: new Date(statement.validAt),
|
||||
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
|
||||
@ -158,7 +157,7 @@ export async function findStatementsWithSameSubjectObject({
|
||||
objectId: string;
|
||||
excludePredicateId?: string;
|
||||
userId: string;
|
||||
}): Promise<StatementNode[]> {
|
||||
}): Promise<Omit<StatementNode, "factEmbedding">[]> {
|
||||
const query = `
|
||||
MATCH (subject:Entity {uuid: $subjectId}), (object:Entity {uuid: $objectId})
|
||||
MATCH (subject)<-[:HAS_SUBJECT]-(statement:Statement)-[:HAS_OBJECT]->(object)
|
||||
@ -186,7 +185,6 @@ export async function findStatementsWithSameSubjectObject({
|
||||
return {
|
||||
uuid: statement.uuid,
|
||||
fact: statement.fact,
|
||||
factEmbedding: statement.factEmbedding,
|
||||
createdAt: new Date(statement.createdAt),
|
||||
validAt: new Date(statement.validAt),
|
||||
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
|
||||
@ -212,7 +210,7 @@ export async function findSimilarStatements({
|
||||
threshold?: number;
|
||||
excludeIds?: string[];
|
||||
userId: string;
|
||||
}): Promise<StatementNode[]> {
|
||||
}): Promise<Omit<StatementNode, "factEmbedding">[]> {
|
||||
const query = `
|
||||
CALL db.index.vector.queryNodes('statement_embedding', $topK, $factEmbedding)
|
||||
YIELD node AS statement, score
|
||||
@ -242,7 +240,6 @@ export async function findSimilarStatements({
|
||||
return {
|
||||
uuid: statement.uuid,
|
||||
fact: statement.fact,
|
||||
factEmbedding: statement.factEmbedding,
|
||||
createdAt: new Date(statement.createdAt),
|
||||
validAt: new Date(statement.validAt),
|
||||
invalidAt: statement.invalidAt ? new Date(statement.invalidAt) : null,
|
||||
|
||||
@ -418,10 +418,15 @@ export class KnowledgeGraphService {
|
||||
const processingTimeMs = endTime - startTime;
|
||||
logger.log(`Processing time: ${processingTimeMs} ms`);
|
||||
|
||||
// Count only truly new statements (exclude duplicates)
|
||||
const newStatementsCount = updatedTriples.filter(triple =>
|
||||
triple.statement.createdAt >= episode.createdAt
|
||||
).length;
|
||||
|
||||
return {
|
||||
episodeUuid: episode.uuid,
|
||||
// nodesCreated: hydratedNodes.length,
|
||||
statementsCreated: resolvedStatements.length,
|
||||
statementsCreated: newStatementsCount,
|
||||
processingTimeMs,
|
||||
tokenUsage: tokenMetrics,
|
||||
};
|
||||
@ -529,6 +534,7 @@ export class KnowledgeGraphService {
|
||||
referenceTime: episode.validAt.toISOString(),
|
||||
};
|
||||
|
||||
console.log("proprietary model", isProprietaryModel(undefined, 'high'));
|
||||
// Statement extraction requires HIGH complexity (causal reasoning, emotional context)
|
||||
// Choose between proprietary and OSS prompts based on model type
|
||||
const messages = isProprietaryModel(undefined, 'high')
|
||||
@ -905,7 +911,7 @@ export class KnowledgeGraphService {
|
||||
}
|
||||
|
||||
// Step 1: Collect all potential matches for all triples at once
|
||||
const allPotentialMatches: Map<string, StatementNode[]> = new Map();
|
||||
const allPotentialMatches: Map<string, Omit<StatementNode, "factEmbedding">[]> = new Map();
|
||||
const allExistingTripleData: Map<string, Triple> = new Map();
|
||||
|
||||
// For preparing the LLM context
|
||||
@ -915,7 +921,7 @@ export class KnowledgeGraphService {
|
||||
for (const triple of triples) {
|
||||
// Track IDs of statements we've already checked to avoid duplicates
|
||||
const checkedStatementIds: string[] = [];
|
||||
let potentialMatches: StatementNode[] = [];
|
||||
let potentialMatches: Omit<StatementNode, "factEmbedding">[] = [];
|
||||
|
||||
// Phase 1a: Find statements with exact subject-predicate match
|
||||
// Example: "John lives_in New York" vs "John lives_in San Francisco"
|
||||
@ -965,7 +971,7 @@ export class KnowledgeGraphService {
|
||||
}
|
||||
|
||||
// Phase 3: Check related memories for contradictory statements
|
||||
const previousEpisodesStatements: StatementNode[] = [];
|
||||
const previousEpisodesStatements: Omit<StatementNode, "factEmbedding">[] = [];
|
||||
|
||||
await Promise.all(
|
||||
previousEpisodes.map(async (episode) => {
|
||||
@ -1264,7 +1270,7 @@ export class KnowledgeGraphService {
|
||||
tokenMetrics.low.output += usage.completionTokens;
|
||||
tokenMetrics.low.total += usage.totalTokens;
|
||||
}
|
||||
}, undefined, 'low');
|
||||
}, undefined, 'high');
|
||||
let normalizedEpisodeBody = "";
|
||||
const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
|
||||
if (outputMatch && outputMatch[1]) {
|
||||
|
||||
@ -7,42 +7,25 @@ export const normalizePrompt = (
|
||||
|
||||
Create ONE enriched sentence that transforms the episode into a contextually-rich memory using SELECTIVE enrichment.
|
||||
|
||||
<smart_enrichment_process>
|
||||
Evaluate the episode and apply enrichment ONLY where it adds significant value:
|
||||
CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the episode. Every separate fact, preference, request, clarification, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
|
||||
|
||||
1. PRIMARY FACTS - always preserve the core information from the episode
|
||||
2. TEMPORAL RESOLUTION - convert relative dates to absolute dates using episode timestamp
|
||||
3. STRATEGIC ENRICHMENT - add context only for HIGH VALUE cases (see guidelines below)
|
||||
4. VISUAL CONTENT - capture exact text on signs, objects shown, specific details from images
|
||||
5. EMOTIONAL PRESERVATION - maintain the tone and feeling of emotional exchanges
|
||||
6. IDENTITY PRESERVATION - preserve definitional and possessive relationships that establish entity connections
|
||||
<enrichment_strategy>
|
||||
1. PRIMARY FACTS - Always preserve the core information from the episode
|
||||
2. TEMPORAL RESOLUTION - Convert relative dates to absolute dates using episode timestamp
|
||||
3. CONTEXT ENRICHMENT - Add context ONLY when it clarifies unclear references
|
||||
4. VISUAL CONTENT - Capture exact text on signs, objects shown, specific details from images
|
||||
5. EMOTIONAL PRESERVATION - Maintain the tone and feeling of emotional exchanges
|
||||
|
||||
ENRICHMENT DECISION MATRIX:
|
||||
- Clear, complete statement → minimal enrichment (just temporal + attribution)
|
||||
- Unclear references → resolve with context
|
||||
- Emotional support → preserve feeling, avoid historical dumping
|
||||
- New developments → connect to ongoing narrative
|
||||
- Visual content → extract specific details as primary facts
|
||||
</smart_enrichment_process>
|
||||
When to add context from related memories:
|
||||
- Unclear pronouns ("she", "it", "they") → resolve to specific entity
|
||||
- Vague references ("the agency", "the event") → add clarifying details
|
||||
- Continuation phrases ("following up", "as we discussed") → connect to previous topic
|
||||
|
||||
<context_usage_decision>
|
||||
When related memories/previous episodes are provided, evaluate if they improve understanding:
|
||||
|
||||
USE CONTEXT when current episode has:
|
||||
- Unclear pronouns ("she", "it", "they" without clear antecedent)
|
||||
- Vague references ("the agency", "the event" without definition in current episode)
|
||||
- Continuation phrases ("following up", "as we discussed")
|
||||
- Incomplete information that context clarifies
|
||||
|
||||
IGNORE CONTEXT when current episode is:
|
||||
- Clear and self-contained ("I got a job in New York")
|
||||
- Simple emotional responses ("Thanks, that's great!")
|
||||
- Generic encouragement ("You're doing awesome!")
|
||||
- Complete statements with all necessary information
|
||||
|
||||
DECISION RULE: If the current episode can be understood perfectly without context, don't use it. Only use context when it genuinely clarifies or
|
||||
resolves ambiguity.
|
||||
</context_usage_decision>
|
||||
When NOT to add context:
|
||||
- Clear, self-contained statements → no enrichment needed beyond temporal
|
||||
- Emotional responses → preserve tone, avoid over-contextualization
|
||||
- Already established topics → don't repeat details mentioned earlier in conversation
|
||||
</enrichment_strategy>
|
||||
|
||||
<temporal_resolution>
|
||||
Using episode timestamp as anchor, convert ALL relative time references:
|
||||
@ -270,6 +253,8 @@ export const normalizeDocumentPrompt = (
|
||||
|
||||
Transform this document content into enriched factual statements for knowledge graph storage.
|
||||
|
||||
CRITICAL: CAPTURE ALL DISTINCT PIECES OF INFORMATION from the document. Every separate fact, specification, procedure, data point, or detail mentioned must be preserved in your enriched output. Missing information is unacceptable.
|
||||
|
||||
<document_processing_approach>
|
||||
Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ import { ingestTask } from "./ingest";
|
||||
|
||||
const documentIngestionQueue = queue({
|
||||
name: "document-ingestion-queue",
|
||||
concurrencyLimit: 5,
|
||||
concurrencyLimit: 1,
|
||||
});
|
||||
|
||||
// Register the Document Ingestion Trigger.dev task
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user