mirror of https://github.com/eliasstepanik/core.git
synced 2026-01-23 00:08:29 +00:00

Feat: add documents to the kg

commit 5b0dd7d4a7
parent 084ad5be85
@@ -1,8 +1,10 @@
 // lib/ingest.queue.ts
 import { IngestionStatus } from "@core/database";
+import { EpisodeType } from "@core/types";
 import { type z } from "zod";
 import { prisma } from "~/db.server";
 import { type IngestBodyRequest, ingestTask } from "~/trigger/ingest/ingest";
+import { ingestDocumentTask } from "~/trigger/ingest/ingest-document";

 export const addToQueue = async (
   body: z.infer<typeof IngestBodyRequest>,
@@ -35,16 +37,38 @@ export const addToQueue = async (
     },
   });

-  const handler = await ingestTask.trigger(
-    { body, userId, workspaceId: user.Workspace.id, queueId: queuePersist.id },
-    {
-      queue: "ingestion-queue",
-      concurrencyKey: userId,
-      tags: [user.id, queuePersist.id],
-    },
-  );
+  let handler;
+  if (body.type === EpisodeType.DOCUMENT) {
+    handler = await ingestDocumentTask.trigger(
+      {
+        body,
+        userId,
+        workspaceId: user.Workspace.id,
+        queueId: queuePersist.id,
+      },
+      {
+        queue: "document-ingestion-queue",
+        concurrencyKey: userId,
+        tags: [user.id, queuePersist.id],
+      },
+    );
+  } else if (body.type === EpisodeType.CONVERSATION) {
+    handler = await ingestTask.trigger(
+      {
+        body,
+        userId,
+        workspaceId: user.Workspace.id,
+        queueId: queuePersist.id,
+      },
+      {
+        queue: "ingestion-queue",
+        concurrencyKey: userId,
+        tags: [user.id, queuePersist.id],
+      },
+    );
+  }

-  return { id: handler.id, token: handler.publicAccessToken };
+  return { id: handler?.id, token: handler?.publicAccessToken };
 };

 export { IngestBodyRequest };
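Illustrative usage (not part of the diff): with the routing above, a caller that wants the new document path submits a body whose type is DOCUMENT. The wrapper function and the "upload" source label below are hypothetical; the field names follow the IngestBodyRequest schema extended later in this commit, and addToQueue is imported from ~/lib/ingest.server as the route files in this commit do.

import { EpisodeType } from "@core/types";
import { addToQueue } from "~/lib/ingest.server";

// Hypothetical caller: enqueue a whole document for chunked ingestion.
async function ingestDocument(userId: string, title: string, text: string) {
  return addToQueue(
    {
      episodeBody: text,
      referenceTime: new Date().toISOString(),
      source: "upload", // placeholder source label
      type: EpisodeType.DOCUMENT,
      documentTitle: title,
    },
    userId,
  );
}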
@@ -355,6 +355,12 @@ const initializeSchema = async () => {
   await runQuery(
     "CREATE INDEX entity_user_uuid IF NOT EXISTS FOR (n:Entity) ON (n.userId, n.uuid)",
   );
+  await runQuery(
+    "CREATE INDEX episode_user_uuid IF NOT EXISTS FOR (n:Episode) ON (n.userId, n.uuid)",
+  );
+  await runQuery(
+    "CREATE INDEX episode_user_id IF NOT EXISTS FOR (n:Episode) ON (n.userId)",
+  );

   // Create vector indexes for semantic search (if using Neo4j 5.0+)
   await runQuery(`
@@ -6,6 +6,7 @@ import { addToQueue } from "~/lib/ingest.server";
 import { prisma } from "~/db.server";
 import { logger } from "~/services/logger.service";
 import { triggerWebhookDelivery } from "~/trigger/webhooks/webhook-delivery";
+import { EpisodeTypeEnum } from "@core/types";

 const ActivityCreateSchema = z.object({
   text: z.string().min(1, "Text is required"),
@@ -56,6 +57,7 @@ const { action, loader } = createActionApiRoute(
       episodeBody: body.text,
       referenceTime: new Date().toISOString(),
       source: body.source,
+      type: EpisodeTypeEnum.CONVERSATION,
     };

     const queueResponse = await addToQueue(
@@ -9,6 +9,7 @@ import { addToQueue } from "~/lib/ingest.server";
 import { SearchService } from "~/services/search.server";
 import { handleTransport } from "~/utils/mcp";
 import { SpaceService } from "~/services/space.server";
+import { EpisodeTypeEnum } from "@core/types";

 // Map to store transports by session ID with cleanup tracking
 const transports: {
@@ -124,6 +125,7 @@ const handleMCPRequest = async (
       episodeBody: args.message,
       referenceTime: new Date().toISOString(),
       source,
+      type: EpisodeTypeEnum.CONVERSATION,
     },
     userId,
   );
@@ -11,6 +11,7 @@ import { SpacePattern } from "~/services/spacePattern.server";
 import { addToQueue } from "~/lib/ingest.server";
 import { redirect } from "@remix-run/node";
 import { SpaceService } from "~/services/space.server";
+import { EpisodeTypeEnum } from "@core/types";

 export async function loader({ request, params }: LoaderFunctionArgs) {
   const workspace = await requireWorkpace(request);
@@ -68,6 +69,7 @@ export async function action({ request, params }: ActionFunctionArgs) {
       },
       source: space.name,
       spaceId: space.id,
+      type: EpisodeTypeEnum.CONVERSATION,
     },
     userId,
   );
@@ -26,6 +26,7 @@ import { updateUser } from "~/models/user.server";
 import { Copy, Check } from "lucide-react";
 import { addToQueue } from "~/lib/ingest.server";
 import { cn } from "~/lib/utils";
+import { EpisodeTypeEnum } from "@core/types";

 const ONBOARDING_STEP_COOKIE = "onboardingStep";
 const onboardingStepCookie = createCookie(ONBOARDING_STEP_COOKIE, {
@@ -75,6 +76,7 @@ export async function action({ request }: ActionFunctionArgs) {
       source: "Core",
       episodeBody: aboutUser,
       referenceTime: new Date().toISOString(),
+      type: EpisodeTypeEnum.CONVERSATION,
     },
     userId,
   );
apps/webapp/app/services/documentChunker.server.ts (new file, 262 lines)
@@ -0,0 +1,262 @@
import { encode } from "gpt-tokenizer";
import crypto from "crypto";

export interface DocumentChunk {
  content: string;
  chunkIndex: number;
  title?: string;
  context?: string;
  startPosition: number;
  endPosition: number;
}

export interface ChunkedDocument {
  documentId: string;
  title: string;
  originalContent: string;
  chunks: DocumentChunk[];
  totalChunks: number;
}

/**
 * Document chunking service that splits large documents into semantic chunks
 * Targets 10-15k tokens per chunk with natural paragraph boundaries
 */
export class DocumentChunker {
  private readonly TARGET_CHUNK_SIZE = 12500; // Middle of 10-15k range
  private readonly MIN_CHUNK_SIZE = 10000;
  private readonly MAX_CHUNK_SIZE = 15000;
  private readonly MIN_PARAGRAPH_SIZE = 100; // Minimum tokens for a paragraph to be considered

  /**
   * Chunk a document into semantic sections with natural boundaries
   */
  async chunkDocument(
    originalContent: string,
    title: string,
  ): Promise<ChunkedDocument> {
    const documentId = crypto.randomUUID();

    // First, split by major section headers (markdown style)
    const majorSections = this.splitByMajorSections(originalContent);

    const chunks: DocumentChunk[] = [];
    let currentChunk = "";
    let currentChunkStart = 0;
    let chunkIndex = 0;

    for (const section of majorSections) {
      const sectionTokens = encode(section.content).length;
      const currentChunkTokens = encode(currentChunk).length;

      // If adding this section would exceed max size, finalize current chunk
      if (currentChunkTokens > 0 && currentChunkTokens + sectionTokens > this.MAX_CHUNK_SIZE) {
        if (currentChunkTokens >= this.MIN_CHUNK_SIZE) {
          chunks.push(this.createChunk(
            currentChunk,
            chunkIndex,
            currentChunkStart,
            currentChunkStart + currentChunk.length,
            section.title
          ));
          chunkIndex++;
          currentChunk = "";
          currentChunkStart = section.startPosition;
        }
      }

      // Add section to current chunk
      if (currentChunk) {
        currentChunk += "\n\n" + section.content;
      } else {
        currentChunk = section.content;
        currentChunkStart = section.startPosition;
      }

      // If current chunk is large enough and we have a natural break, consider chunking
      const updatedChunkTokens = encode(currentChunk).length;
      if (updatedChunkTokens >= this.TARGET_CHUNK_SIZE) {
        // Try to find a good breaking point within the section
        const paragraphs = this.splitIntoParagraphs(section.content);
        if (paragraphs.length > 1) {
          // Split at paragraph boundary if beneficial
          const optimalSplit = this.findOptimalParagraphSplit(currentChunk);
          if (optimalSplit) {
            chunks.push(this.createChunk(
              optimalSplit.beforeSplit,
              chunkIndex,
              currentChunkStart,
              currentChunkStart + optimalSplit.beforeSplit.length,
              section.title
            ));
            chunkIndex++;
            currentChunk = optimalSplit.afterSplit;
            currentChunkStart = currentChunkStart + optimalSplit.beforeSplit.length;
          }
        }
      }
    }

    // Add remaining content as final chunk
    if (currentChunk.trim() && encode(currentChunk).length >= this.MIN_PARAGRAPH_SIZE) {
      chunks.push(this.createChunk(
        currentChunk,
        chunkIndex,
        currentChunkStart,
        originalContent.length
      ));
    }

    return {
      documentId,
      title,
      originalContent,
      chunks,
      totalChunks: chunks.length,
    };
  }

  private splitByMajorSections(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    // Split by markdown headers (# ## ### etc.) or common document patterns
    const headerRegex = /^(#{1,6}\s+.*$|={3,}$|-{3,}$)/gm;
    const matches = Array.from(content.matchAll(headerRegex));

    if (matches.length === 0) {
      // No headers found, treat as single section
      sections.push({
        content: content.trim(),
        startPosition: 0,
        endPosition: content.length,
      });
      return sections;
    }

    let lastIndex = 0;

    for (let i = 0; i < matches.length; i++) {
      const match = matches[i];
      const nextMatch = matches[i + 1];

      const sectionStart = lastIndex;
      const sectionEnd = nextMatch ? nextMatch.index! : content.length;

      const sectionContent = content.slice(sectionStart, sectionEnd).trim();

      if (sectionContent) {
        sections.push({
          content: sectionContent,
          title: this.extractSectionTitle(match[0]),
          startPosition: sectionStart,
          endPosition: sectionEnd,
        });
      }

      lastIndex = match.index! + match[0].length;
    }

    return sections;
  }

  private extractSectionTitle(header: string): string | undefined {
    // Extract title from markdown header
    const markdownMatch = header.match(/^#{1,6}\s+(.+)$/);
    if (markdownMatch) {
      return markdownMatch[1].trim();
    }
    return undefined;
  }

  private splitIntoParagraphs(content: string): string[] {
    // Split by double newlines (paragraph breaks) and filter out empty strings
    return content
      .split(/\n\s*\n/)
      .map(p => p.trim())
      .filter(p => p.length > 0);
  }

  private findOptimalParagraphSplit(content: string): {
    beforeSplit: string;
    afterSplit: string;
  } | null {
    const paragraphs = this.splitIntoParagraphs(content);
    if (paragraphs.length < 2) return null;

    let bestSplitIndex = -1;
    let bestScore = 0;

    // Find the split that gets us closest to target size
    for (let i = 1; i < paragraphs.length; i++) {
      const beforeSplit = paragraphs.slice(0, i).join("\n\n");
      const afterSplit = paragraphs.slice(i).join("\n\n");

      const beforeTokens = encode(beforeSplit).length;
      const afterTokens = encode(afterSplit).length;

      // Score based on how close we get to target, avoiding too small chunks
      if (beforeTokens >= this.MIN_CHUNK_SIZE && afterTokens >= this.MIN_PARAGRAPH_SIZE) {
        const beforeDistance = Math.abs(beforeTokens - this.TARGET_CHUNK_SIZE);
        const score = 1 / (1 + beforeDistance); // Higher score for closer to target

        if (score > bestScore) {
          bestScore = score;
          bestSplitIndex = i;
        }
      }
    }

    if (bestSplitIndex > 0) {
      return {
        beforeSplit: paragraphs.slice(0, bestSplitIndex).join("\n\n"),
        afterSplit: paragraphs.slice(bestSplitIndex).join("\n\n"),
      };
    }

    return null;
  }

  private createChunk(
    content: string,
    chunkIndex: number,
    startPosition: number,
    endPosition: number,
    title?: string
  ): DocumentChunk {
    // Generate a concise context/title if not provided
    const context = title || this.generateChunkContext(content);

    return {
      content: content.trim(),
      chunkIndex,
      title: context,
      context: `Chunk ${chunkIndex + 1}${context ? `: ${context}` : ""}`,
      startPosition,
      endPosition,
    };
  }

  private generateChunkContext(content: string): string {
    // Extract first meaningful line as context (avoiding markdown syntax)
    const lines = content.split('\n').map(line => line.trim()).filter(Boolean);

    for (const line of lines.slice(0, 3)) {
      // Skip markdown headers and find first substantial content
      if (!line.match(/^#{1,6}\s/) && !line.match(/^[=-]{3,}$/) && line.length > 10) {
        return line.substring(0, 100) + (line.length > 100 ? "..." : "");
      }
    }

    return "Document content";
  }
}
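Usage sketch (not from the commit, shown for orientation): how a caller such as the ingest-document task further down drives the chunker. The local names and the "Deployment guide" title are illustrative; the API is exactly the one defined above.

import { DocumentChunker } from "~/services/documentChunker.server";

// Hypothetical caller: chunk a markdown string before queueing episodes.
async function chunkForIngestion(markdownText: string) {
  const chunker = new DocumentChunker();
  const chunked = await chunker.chunkDocument(markdownText, "Deployment guide");

  // Each chunk targets roughly 10-15k tokens and records its offsets in the
  // original document; chunkIndex is what later ends up on the Episode data.
  return chunked.chunks.map((c) => ({ index: c.chunkIndex, title: c.title }));
}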
apps/webapp/app/services/graphModels/document.ts (new file, 151 lines)
@@ -0,0 +1,151 @@
import { runQuery } from "~/lib/neo4j.server";
import type { DocumentNode } from "@core/types";

export async function saveDocument(document: DocumentNode): Promise<string> {
  const query = `
    MERGE (d:Document {uuid: $uuid})
    ON CREATE SET
      d.title = $title,
      d.originalContent = $originalContent,
      d.metadata = $metadata,
      d.source = $source,
      d.userId = $userId,
      d.createdAt = $createdAt,
      d.validAt = $validAt,
      d.totalChunks = $totalChunks,
      d.documentId = $documentId,
      d.sessionId = $sessionId
    ON MATCH SET
      d.title = $title,
      d.originalContent = $originalContent,
      d.metadata = $metadata,
      d.source = $source,
      d.validAt = $validAt,
      d.totalChunks = $totalChunks,
      d.documentId = $documentId,
      d.sessionId = $sessionId
    RETURN d.uuid as uuid
  `;

  const params = {
    uuid: document.uuid,
    title: document.title,
    originalContent: document.originalContent,
    metadata: JSON.stringify(document.metadata || {}),
    source: document.source,
    userId: document.userId || null,
    createdAt: document.createdAt.toISOString(),
    validAt: document.validAt.toISOString(),
    totalChunks: document.totalChunks || 0,
    documentId: document.documentId || null,
    sessionId: document.sessionId || null,
  };

  const result = await runQuery(query, params);
  return result[0].get("uuid");
}

export async function linkEpisodeToDocument(
  episodeUuid: string,
  documentUuid: string,
  chunkIndex: number,
): Promise<void> {
  const query = `
    MATCH (e:Episode {uuid: $episodeUuid})
    MATCH (d:Document {uuid: $documentUuid})
    MERGE (d)-[r:CONTAINS_CHUNK {chunkIndex: $chunkIndex}]->(e)
    SET e.documentId = $documentUuid,
        e.chunkIndex = $chunkIndex
    RETURN r
  `;

  const params = {
    episodeUuid,
    documentUuid,
    chunkIndex,
  };

  await runQuery(query, params);
}

export async function getDocument(
  documentUuid: string,
): Promise<DocumentNode | null> {
  const query = `
    MATCH (d:Document {uuid: $uuid})
    RETURN d
  `;

  const params = { uuid: documentUuid };
  const result = await runQuery(query, params);

  if (result.length === 0) return null;

  const record = result[0];
  const documentNode = record.get("d");

  return {
    uuid: documentNode.properties.uuid,
    title: documentNode.properties.title,
    originalContent: documentNode.properties.originalContent,
    metadata: JSON.parse(documentNode.properties.metadata || "{}"),
    source: documentNode.properties.source,
    userId: documentNode.properties.userId,
    createdAt: new Date(documentNode.properties.createdAt),
    validAt: new Date(documentNode.properties.validAt),
    totalChunks: documentNode.properties.totalChunks,
  };
}

export async function getDocumentEpisodes(documentUuid: string): Promise<
  Array<{
    episodeUuid: string;
    chunkIndex: number;
    content: string;
  }>
> {
  const query = `
    MATCH (d:Document {uuid: $uuid})-[r:CONTAINS_CHUNK]->(e:Episode)
    RETURN e.uuid as episodeUuid, r.chunkIndex as chunkIndex, e.content as content
    ORDER BY r.chunkIndex ASC
  `;

  const params = { uuid: documentUuid };
  const result = await runQuery(query, params);

  return result.map((record) => ({
    episodeUuid: record.get("episodeUuid"),
    chunkIndex: record.get("chunkIndex"),
    content: record.get("content"),
  }));
}

export async function getUserDocuments(
  userId: string,
  limit: number = 50,
): Promise<DocumentNode[]> {
  const query = `
    MATCH (d:Document {userId: $userId})
    RETURN d
    ORDER BY d.createdAt DESC
    LIMIT $limit
  `;

  const params = { userId, limit };
  const result = await runQuery(query, params);

  return result.map((record) => {
    const documentNode = record.get("d");
    return {
      uuid: documentNode.properties.uuid,
      title: documentNode.properties.title,
      originalContent: documentNode.properties.originalContent,
      metadata: JSON.parse(documentNode.properties.metadata || "{}"),
      source: documentNode.properties.source,
      userId: documentNode.properties.userId,
      createdAt: new Date(documentNode.properties.createdAt),
      validAt: new Date(documentNode.properties.validAt),
      totalChunks: documentNode.properties.totalChunks,
    };
  });
}
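Sketch only, not part of the diff: persisting a Document node and tying one chunk's episode back to it with the helpers above. The surrounding function, the placeholder title/source, and where episodeUuid comes from are assumptions; the DocumentNode fields mirror the interface added to @core/types in this commit.

import crypto from "crypto";
import { saveDocument, linkEpisodeToDocument } from "~/services/graphModels/document";

// Hypothetical helper: create a Document node and attach an already-ingested chunk.
async function attachChunk(userId: string, fullText: string, episodeUuid: string) {
  const documentUuid = await saveDocument({
    uuid: crypto.randomUUID(),
    title: "API reference", // placeholder title
    originalContent: fullText,
    metadata: {},
    source: "upload", // placeholder source label
    userId,
    createdAt: new Date(),
    validAt: new Date(),
    totalChunks: 1,
  });

  await linkEpisodeToDocument(episodeUuid, documentUuid, 0); // chunkIndex 0
}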
@@ -1,5 +1,5 @@
 import { runQuery } from "~/lib/neo4j.server";
-import type { EntityNode, EpisodicNode } from "@core/types";
+import { type EntityNode, EpisodeType, type EpisodicNode } from "@core/types";

 export async function saveEpisode(episode: EpisodicNode): Promise<string> {
   const query = `
@@ -82,6 +82,8 @@ export async function getRecentEpisodes(params: {
   userId: string;
   source?: string;
   sessionId?: string;
+  type?: EpisodeType;
+  documentId?: string;
 }): Promise<EpisodicNode[]> {
   let filters = `WHERE e.validAt <= $referenceTime
     AND e.userId = $userId`;
@@ -90,10 +92,14 @@ export async function getRecentEpisodes(params: {
     filters += `\nAND e.source = $source`;
   }

-  if (params.sessionId) {
+  if (params.type === EpisodeType.CONVERSATION && params.sessionId) {
     filters += `\nAND e.sessionId = $sessionId`;
   }

+  if (params.type === EpisodeType.DOCUMENT && params.documentId) {
+    filters += `\nAND e.documentId = $documentId`;
+  }
+
   const query = `
     MATCH (e:Episode)
     ${filters}
@@ -6,6 +6,8 @@ import {
   type EpisodicNode,
   type StatementNode,
   type Triple,
+  EpisodeTypeEnum,
+  type EpisodeType,
 } from "@core/types";
 import { logger } from "./logger.service";
 import { ClusteringService } from "./clustering.server";
@@ -48,7 +50,7 @@ import {
   getNodeTypesString,
   isPresetType,
 } from "~/utils/presets/nodes";
-import { normalizePrompt } from "./prompts";
+import { normalizePrompt, normalizeDocumentPrompt } from "./prompts";
 import { type PrismaClient } from "@prisma/client";

 // Default number of previous episodes to retrieve for context
@@ -90,6 +92,8 @@ export class KnowledgeGraphService {
       userId: params.userId,
       source: params.source,
       sessionId: params.sessionId,
+      type: params.type,
+      documentId: params.documentId,
     });

     // Format session context from previous episodes
@@ -110,6 +114,7 @@ export class KnowledgeGraphService {
       prisma,
       new Date(params.referenceTime),
       sessionContext,
+      params.type,
     );

     const normalizedTime = Date.now() - startTime;
@@ -251,9 +256,9 @@ export class KnowledgeGraphService {
     logger.log(`Saved triples in ${saveTriplesTime - updatedTriplesTime} ms`);

     // Invalidate invalidated statements
     await invalidateStatements({
       statementIds: invalidatedStatements,
-      invalidatedBy: episode.uuid
+      invalidatedBy: episode.uuid,
     });

     const endTime = Date.now();
@@ -1146,6 +1151,7 @@ export class KnowledgeGraphService {
     prisma: PrismaClient,
     episodeTimestamp?: Date,
     sessionContext?: string,
+    contentType?: EpisodeType,
   ) {
     let appEnumValues: Apps[] = [];
     if (Apps[source.toUpperCase() as keyof typeof Apps]) {
@@ -1171,7 +1177,12 @@ export class KnowledgeGraphService {
       episodeTimestamp?.toISOString() || new Date().toISOString(),
       sessionContext,
     };
-    const messages = normalizePrompt(context);
+
+    // Route to appropriate normalization prompt based on content type
+    const messages =
+      contentType === EpisodeTypeEnum.DOCUMENT
+        ? normalizeDocumentPrompt(context)
+        : normalizePrompt(context);
     let responseText = "";
     await makeModelCall(false, messages, (text) => {
       responseText = text;
@@ -262,3 +262,139 @@ ${context.relatedMemories}
     { role: "user", content: userPrompt },
   ];
 };
+
+export const normalizeDocumentPrompt = (
+  context: Record<string, any>,
+): CoreMessage[] => {
+  const sysPrompt = `You are C.O.R.E. (Contextual Observation & Recall Engine), a document memory processing system.
+
+Transform this document content into enriched factual statements for knowledge graph storage.
+
+<document_processing_approach>
+Focus on STRUCTURED CONTENT EXTRACTION optimized for documents:
+
+1. FACTUAL PRESERVATION - Extract concrete facts, data, and information
+2. STRUCTURAL AWARENESS - Preserve document hierarchy, lists, tables, code blocks
+3. CROSS-REFERENCE HANDLING - Maintain internal document references and connections
+4. TECHNICAL CONTENT - Handle specialized terminology, code, formulas, diagrams
+5. CONTEXTUAL CHUNKING - This content is part of a larger document, maintain coherence
+
+DOCUMENT-SPECIFIC ENRICHMENT:
+- Preserve technical accuracy and specialized vocabulary
+- Extract structured data (lists, tables, procedures, specifications)
+- Maintain hierarchical relationships (sections, subsections, bullet points)
+- Handle code blocks, formulas, and technical diagrams
+- Capture cross-references and internal document links
+- Preserve authorship, citations, and source attributions
+</document_processing_approach>
+
+<document_content_types>
+Handle various document formats:
+- Technical documentation and specifications
+- Research papers and academic content
+- Code documentation and API references
+- Business documents and reports
+- Notes and knowledge base articles
+- Structured content (wikis, blogs, guides)
+</document_content_types>
+
+<temporal_resolution>
+For document content, convert relative time references using document timestamp:
+- Publication dates, modification dates, version information
+- Time-sensitive information within the document content
+- Historical context and chronological information
+</temporal_resolution>
+
+<entity_types>
+${context.entityTypes}
+</entity_types>
+
+<ingestion_rules>
+${
+  context.ingestionRules
+    ? `Apply these rules for content from ${context.source}:
+${context.ingestionRules}
+
+CRITICAL: If content does NOT satisfy these rules, respond with "NOTHING_TO_REMEMBER" regardless of other criteria.`
+    : "No specific ingestion rules defined for this source."
+}
+</ingestion_rules>
+
+<document_quality_control>
+RETURN "NOTHING_TO_REMEMBER" if content consists ONLY of:
+- Navigation elements or UI text
+- Copyright notices and boilerplate
+- Empty sections or placeholder text
+- Pure formatting markup without content
+- Table of contents without substance
+- Repetitive headers without content
+
+STORE IN MEMORY for document content containing:
+- Factual information and data
+- Technical specifications and procedures
+- Structured knowledge and explanations
+- Code examples and implementations
+- Research findings and conclusions
+- Process descriptions and workflows
+- Reference information and definitions
+- Analysis, insights, and documented decisions
+</document_quality_control>
+
+<document_enrichment_examples>
+TECHNICAL CONTENT:
+- Original: "The API returns a 200 status code on success"
+- Enriched: "On June 15, 2024, the REST API documentation specifies that successful requests return HTTP status code 200."
+
+STRUCTURED CONTENT:
+- Original: "Step 1: Initialize the database\nStep 2: Run migrations"
+- Enriched: "On June 15, 2024, the deployment guide outlines a two-step process: first initialize the database, then run migrations."
+
+CROSS-REFERENCE:
+- Original: "As mentioned in Section 3, the algorithm complexity is O(n)"
+- Enriched: "On June 15, 2024, the algorithm analysis document confirms O(n) time complexity, referencing the detailed explanation in Section 3."
+</document_enrichment_examples>
+
+CRITICAL OUTPUT FORMAT REQUIREMENT:
+You MUST wrap your response in <output> tags. This is MANDATORY - no exceptions.
+
+If the document content should be stored in memory:
+<output>
+{{your_enriched_statement_here}}
+</output>
+
+If there is nothing worth remembering:
+<output>
+NOTHING_TO_REMEMBER
+</output>
+
+ALWAYS include opening <output> and closing </output> tags around your entire response.
+`;
+
+  const userPrompt = `
+<DOCUMENT_CONTENT>
+${context.episodeContent}
+</DOCUMENT_CONTENT>
+
+<SOURCE>
+${context.source}
+</SOURCE>
+
+<DOCUMENT_TIMESTAMP>
+${context.episodeTimestamp || "Not provided"}
+</DOCUMENT_TIMESTAMP>
+
+<DOCUMENT_SESSION_CONTEXT>
+${context.sessionContext || "No previous chunks in this document session"}
+</DOCUMENT_SESSION_CONTEXT>
+
+<RELATED_MEMORIES>
+${context.relatedMemories}
+</RELATED_MEMORIES>
+
+`;
+
+  return [
+    { role: "system", content: sysPrompt },
+    { role: "user", content: userPrompt },
+  ];
+};
apps/webapp/app/trigger/ingest/ingest-document.ts (new file, 154 lines)
@@ -0,0 +1,154 @@
import { queue, task } from "@trigger.dev/sdk";
import { type z } from "zod";
import crypto from "crypto";

import { IngestionStatus } from "@core/database";
import { EpisodeTypeEnum, type DocumentNode } from "@core/types";
import { logger } from "~/services/logger.service";
import { DocumentChunker } from "~/services/documentChunker.server";
import { saveDocument } from "~/services/graphModels/document";
import { type IngestBodyRequest } from "~/lib/ingest.server";
import { prisma } from "../utils/prisma";
import { ingestTask } from "./ingest";

const documentIngestionQueue = queue({
  name: "document-ingestion-queue",
  concurrencyLimit: 5,
});

// Register the Document Ingestion Trigger.dev task
export const ingestDocumentTask = task({
  id: "ingest-document",
  queue: documentIngestionQueue,
  machine: "medium-2x",
  run: async (payload: {
    body: z.infer<typeof IngestBodyRequest>;
    userId: string;
    workspaceId: string;
    queueId: string;
  }) => {
    const startTime = Date.now();

    try {
      logger.log(`Processing document for user ${payload.userId}`, {
        documentTitle: payload.body.documentTitle,
        contentLength: payload.body.episodeBody.length,
      });

      await prisma.ingestionQueue.update({
        where: { id: payload.queueId },
        data: {
          status: IngestionStatus.PROCESSING,
        },
      });

      const documentBody = payload.body as any;

      // Step 1: Create document node
      const document: DocumentNode = {
        uuid: crypto.randomUUID(),
        title: documentBody.documentTitle || "Untitled Document",
        originalContent: documentBody.episodeBody,
        metadata: documentBody.metadata || {},
        source: documentBody.source,
        userId: payload.userId,
        createdAt: new Date(),
        validAt: new Date(documentBody.referenceTime),
        totalChunks: 0,
        documentId: documentBody.documentId,
        sessionId: documentBody.sessionId,
      };

      await saveDocument(document);

      // Step 2: Chunk the document
      const documentChunker = new DocumentChunker();
      const chunkedDocument = await documentChunker.chunkDocument(
        documentBody.episodeBody,
        documentBody.documentTitle,
      );

      logger.log(
        `Document chunked into ${chunkedDocument.chunks.length} chunks`,
      );

      // Step 3: Queue each chunk as a separate episode
      for (const chunk of chunkedDocument.chunks) {
        const chunkEpisodeData = {
          episodeBody: chunk.content,
          referenceTime: documentBody.referenceTime,
          metadata: documentBody.metadata,
          source: documentBody.source,
          spaceId: documentBody.spaceId,
          sessionId: documentBody.sessionId,
          type: EpisodeTypeEnum.DOCUMENT,
          documentTitle: documentBody.documentTitle,
          documentId: documentBody.documentId,
          chunkIndex: chunk.chunkIndex,
        };

        const episodeHandler = await ingestTask.trigger(
          {
            body: chunkEpisodeData,
            userId: payload.userId,
            workspaceId: payload.workspaceId,
            queueId: payload.queueId,
          },
          {
            queue: "ingestion-queue",
            concurrencyKey: payload.userId,
            tags: [payload.userId, payload.queueId],
          },
        );

        if (episodeHandler.id) {
          logger.log(
            `Queued chunk ${chunk.chunkIndex + 1}/${chunkedDocument.chunks.length} for processing`,
            {
              handlerId: episodeHandler.id,
              chunkSize: chunk.content.length,
            },
          );
        }
      }

      await prisma.ingestionQueue.update({
        where: { id: payload.queueId },
        data: {
          output: {
            documentUuid: document.uuid,
            totalChunks: chunkedDocument.chunks.length,
            episodes: [],
          },
          status: IngestionStatus.PROCESSING,
        },
      });

      const processingTimeMs = Date.now() - startTime;

      logger.log(
        `Document chunking processing completed in ${processingTimeMs}ms`,
        {
          documentUuid: document.uuid,
          totalChunks: chunkedDocument.chunks.length,
        },
      );

      return { success: true };
    } catch (err: any) {
      await prisma.ingestionQueue.update({
        where: { id: payload.queueId },
        data: {
          error: err.message,
          status: IngestionStatus.FAILED,
        },
      });

      logger.error(
        `Error processing document for user ${payload.userId}:`,
        err,
      );
      return { success: false, error: err.message };
    }
  },
});
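The task above only fans the chunks out; completion is tracked on the ingestion queue row. A small restatement of that bookkeeping (illustrative, not part of the commit; the type and function names are hypothetical, the shape follows the output object written above and read back in the ingest task hunks below):

// Shape of the queue row's output as initialized above; each chunk's ingest run
// appends its episode details, and the run that makes the lengths match marks
// the queue entry COMPLETED (see the ingest task changes below).
type DocumentQueueOutput = {
  documentUuid: string;
  totalChunks: number;
  episodes: Array<{ episodeUuid?: string }>;
};

function isDocumentFullyIngested(output: DocumentQueueOutput): boolean {
  return output.episodes.length === output.totalChunks;
}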
@@ -6,6 +6,7 @@ import { IngestionStatus } from "@core/database";
 import { logger } from "~/services/logger.service";
 import { triggerSpaceAssignment } from "../spaces/space-assignment";
 import { prisma } from "../utils/prisma";
+import { EpisodeType } from "@core/types";

 export const IngestBodyRequest = z.object({
   episodeBody: z.string(),
@@ -14,6 +15,11 @@ export const IngestBodyRequest = z.object({
   source: z.string(),
   spaceId: z.string().optional(),
   sessionId: z.string().optional(),
+  type: z
+    .enum([EpisodeType.CONVERSATION, EpisodeType.DOCUMENT])
+    .default(EpisodeType.CONVERSATION),
+  documentTitle: z.string().optional(),
+  documentId: z.string().optional(),
 });

 const ingestionQueue = queue({
@@ -35,7 +41,7 @@ export const ingestTask = task({
     try {
       logger.log(`Processing job for user ${payload.userId}`);

-      await prisma.ingestionQueue.update({
+      const ingestionQueue = await prisma.ingestionQueue.update({
         where: { id: payload.queueId },
         data: {
           status: IngestionStatus.PROCESSING,
@@ -54,11 +60,32 @@ export const ingestTask = task({
         prisma,
       );

+      let finalOutput = episodeDetails;
+      let episodeUuids: string[] = episodeDetails.episodeUuid
+        ? [episodeDetails.episodeUuid]
+        : [];
+      let currentStatus: IngestionStatus = IngestionStatus.COMPLETED;
+      if (episodeBody.type === EpisodeType.DOCUMENT) {
+        const currentOutput = ingestionQueue.output as any;
+        currentOutput.episodes.push(episodeDetails);
+        episodeUuids = currentOutput.episodes.map(
+          (episode: any) => episode.episodeUuid,
+        );
+
+        finalOutput = {
+          ...currentOutput,
+        };
+
+        if (currentOutput.episodes.length !== currentOutput.totalChunks) {
+          currentStatus = IngestionStatus.PROCESSING;
+        }
+      }
+
       await prisma.ingestionQueue.update({
         where: { id: payload.queueId },
         data: {
-          output: episodeDetails,
-          status: IngestionStatus.COMPLETED,
+          output: finalOutput,
+          status: currentStatus,
         },
       });

@@ -69,12 +96,15 @@ export const ingestTask = task({
           workspaceId: payload.workspaceId,
           episodeId: episodeDetails?.episodeUuid,
         });
-        if (episodeDetails.episodeUuid) {
+        if (
+          episodeDetails.episodeUuid &&
+          currentStatus === IngestionStatus.COMPLETED
+        ) {
           await triggerSpaceAssignment({
             userId: payload.userId,
             workspaceId: payload.workspaceId,
             mode: "episode",
-            episodeId: episodeDetails.episodeUuid,
+            episodeIds: episodeUuids,
           });
         }
       } catch (assignmentError) {
@@ -25,7 +25,7 @@ interface SpaceAssignmentPayload {
   workspaceId: string;
   mode: "new_space" | "episode";
   newSpaceId?: string; // For new_space mode
-  episodeId?: string; // For daily_batch mode (default: 1)
+  episodeIds?: string[]; // For daily_batch mode (default: 1)
   batchSize?: number; // Processing batch size
 }

@@ -181,7 +181,7 @@ export const spaceAssignmentTask = task({
       workspaceId,
       mode,
       newSpaceId,
-      episodeId,
+      episodeIds,
       batchSize = mode === "new_space"
         ? CONFIG.newSpaceMode.batchSize
         : CONFIG.episodeMode.batchSize,
@@ -191,7 +191,7 @@ export const spaceAssignmentTask = task({
       userId,
       mode,
       newSpaceId,
-      episodeId,
+      episodeIds,
       batchSize,
     });

@@ -213,7 +213,7 @@ export const spaceAssignmentTask = task({
     // 2. Get statements to analyze based on mode
     const statements = await getStatementsToAnalyze(userId, mode, {
       newSpaceId,
-      episodeId,
+      episodeIds,
     });

     if (statements.length === 0) {
@@ -454,7 +454,7 @@ export const spaceAssignmentTask = task({
 async function getStatementsToAnalyze(
   userId: string,
   mode: "new_space" | "episode",
-  options: { newSpaceId?: string; episodeId?: string },
+  options: { newSpaceId?: string; episodeIds?: string[] },
 ): Promise<StatementData[]> {
   let query: string;
   let params: any = { userId };
@@ -471,16 +471,19 @@ async function getStatementsToAnalyze(
       ORDER BY s.createdAt DESC
     `;
   } else {
+    // Optimized query: Use UNWIND for better performance with IN clause
+    // and combine entity lookups in single pattern
     query = `
-      MATCH (e:Episode {uuid: $episodeId, userId: $userId})-[:HAS_PROVENANCE]->(s:Statement)
+      UNWIND $episodeIds AS episodeId
+      MATCH (e:Episode {uuid: episodeId, userId: $userId})-[:HAS_PROVENANCE]->(s:Statement)
       WHERE s.invalidAt IS NULL
-      MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
-      MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
-      MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
+      MATCH (s)-[:HAS_SUBJECT]->(subj:Entity),
+            (s)-[:HAS_PREDICATE]->(pred:Entity),
+            (s)-[:HAS_OBJECT]->(obj:Entity)
       RETURN s, subj.name as subject, pred.name as predicate, obj.name as object
       ORDER BY s.createdAt DESC
     `;
-    params.episodeId = options.episodeId;
+    params.episodeIds = options.episodeIds;
   }

   const result = await runQuery(query, params);
@@ -1,4 +1,4 @@
-import { type Message } from "@core/types";
+import { EpisodeTypeEnum, type Message } from "@core/types";
 import { addToQueue } from "./queue";
 import { triggerWebhookDelivery } from "../webhooks/webhook-delivery";
 import { logger } from "@trigger.dev/sdk";
@@ -149,6 +149,7 @@ export const createActivities = async ({
       episodeBody: message.data.text,
       referenceTime: new Date().toISOString(),
       source: integrationAccount?.integrationDefinition.slug,
+      type: EpisodeTypeEnum.CONVERSATION,
     };

     const queueResponse = await addToQueue(
@@ -1,3 +1,4 @@
+import { EpisodeTypeEnum } from "@core/types";
 import { addToQueue } from "~/lib/ingest.server";
 import { logger } from "~/services/logger.service";
 import { SearchService } from "~/services/search.server";
@@ -115,6 +116,7 @@ async function handleMemoryIngest(args: any) {
       episodeBody: args.message,
       referenceTime: new Date().toISOString(),
       source: args.source,
+      type: EpisodeTypeEnum.CONVERSATION,
     },
     args.userId,
   );
@@ -97,6 +97,7 @@
     "execa": "^9.6.0",
     "express": "^4.18.1",
     "fast-sort": "^3.4.0",
+    "gpt-tokenizer": "^3.0.1",
     "graphology": "^0.26.0",
     "graphology-layout-force": "^0.2.4",
     "graphology-layout-forceatlas2": "^0.10.1",
@@ -174,10 +175,10 @@
     "prettier-plugin-tailwindcss": "^0.6.11",
     "tailwind-scrollbar": "^4.0.2",
     "tailwindcss": "4.1.7",
+    "tsx": "4.20.4",
     "typescript": "5.8.3",
     "vite": "^6.0.0",
-    "vite-tsconfig-paths": "^4.2.1",
-    "tsx": "4.20.4"
+    "vite-tsconfig-paths": "^4.2.1"
   },
   "engines": {
     "node": ">=20.0.0"
@@ -16,7 +16,9 @@ async function init() {
   const build = viteDevServer
     ? () => viteDevServer.ssrLoadModule("virtual:remix/server-build")
     : await import("./build/server/index.js");
-  const module = build.entry?.module;
+  const module = viteDevServer
+    ? (await build()).entry.module
+    : build.entry?.module;
   remixHandler = createRequestHandler({ build });
   const app = express();
   app.use(compression());
@@ -1,6 +1,19 @@
-export enum EpisodeType {
-  Conversation = "CONVERSATION",
-  Text = "TEXT",
+/**
+ * Interface for document node in the reified knowledge graph
+ * Documents are parent containers for episodic chunks
+ */
+export interface DocumentNode {
+  uuid: string;
+  title: string;
+  originalContent: string;
+  metadata: Record<string, any>;
+  source: string;
+  userId: string;
+  createdAt: Date;
+  validAt: Date;
+  totalChunks: number;
+  documentId?: string;
+  sessionId?: string;
 }

 /**
@@ -21,6 +34,8 @@ export interface EpisodicNode {
   space?: string;
   sessionId?: string;
   recallCount?: number;
+  documentId?: string;
+  chunkIndex?: number; // Index of this chunk within the document
 }

 /**
@@ -72,14 +87,31 @@ export interface Triple {
   provenance: EpisodicNode;
 }

+export enum EpisodeTypeEnum {
+  CONVERSATION = "CONVERSATION",
+  DOCUMENT = "DOCUMENT",
+}
+
+export const EpisodeType = {
+  CONVERSATION: "CONVERSATION",
+  DOCUMENT: "DOCUMENT",
+};
+
+export type EpisodeType = (typeof EpisodeType)[keyof typeof EpisodeType];
+
 export type AddEpisodeParams = {
   episodeBody: string;
   referenceTime: Date;
-  metadata: Record<string, any>;
+  metadata?: Record<string, any>;
   source: string;
   userId: string;
   spaceId?: string;
   sessionId?: string;
+  type?: EpisodeType;
+  documentTitle?: string;
+  documentId?: string;
+  chunkIndex?: number;
+  chunkContext?: string;
 };

 export type AddEpisodeResult = {
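A small sketch (not from the commit) of how the new EpisodeType const map reads at call sites. The helper name is hypothetical; the queue names come from the addToQueue and ingest-document hunks above, and the values behind EpisodeType are the plain strings "CONVERSATION" and "DOCUMENT" shared with EpisodeTypeEnum.

import { EpisodeType } from "@core/types";

// Hypothetical helper: pick a trigger queue from an episode type,
// mirroring the branching done in addToQueue earlier in this diff.
function queueNameFor(type: EpisodeType): string {
  return type === EpisodeType.DOCUMENT
    ? "document-ingestion-queue"
    : "ingestion-queue";
}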
pnpm-lock.yaml (generated, 8 lines changed)
@@ -529,6 +529,9 @@ importers:
       fast-sort:
         specifier: ^3.4.0
         version: 3.4.1
+      gpt-tokenizer:
+        specifier: ^3.0.1
+        version: 3.0.1
       graphology:
         specifier: ^0.26.0
         version: 0.26.0(graphology-types@0.24.8)
@@ -7634,6 +7637,9 @@ packages:
     resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==}
     engines: {node: '>= 0.4'}

+  gpt-tokenizer@3.0.1:
+    resolution: {integrity: sha512-5jdaspBq/w4sWw322SvQj1Fku+CN4OAfYZeeEg8U7CWtxBz+zkxZ3h0YOHD43ee+nZYZ5Ud70HRN0ANcdIj4qg==}
+
   graceful-fs@4.2.11:
     resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}

@@ -20045,6 +20051,8 @@ snapshots:

   gopd@1.2.0: {}

+  gpt-tokenizer@3.0.1: {}
+
   graceful-fs@4.2.11: {}

   gradient-string@2.0.2: