core/apps/webapp/app/jobs/spaces/space-identification.logic.ts
Harshith Mullapudi c869096be8
Feat: Space v3
* feat: space v3

* feat: connected space creation

* fix:

* fix: session_id for memory ingestion

* chore: simplify gitignore patterns for agent directories

---------

Co-authored-by: Manoj <saimanoj58@gmail.com>
2025-10-30 12:30:56 +05:30

230 lines
6.8 KiB
TypeScript

/**
* Space Identification Logic
*
* Uses LLM to identify appropriate spaces for topics discovered by BERT analysis
*/
import { makeModelCall } from "~/lib/model.server";
import { getAllSpacesForUser } from "~/services/graphModels/space";
import { getEpisode } from "~/services/graphModels/episode";
import { logger } from "~/services/logger.service";
import type { SpaceNode } from "@core/types";
export interface TopicData {
keywords: string[];
episodeIds: string[];
}
export interface SpaceProposal {
name: string;
intent: string;
confidence: number;
reason: string;
topics: string[]; // Array of topic IDs
}
interface IdentifySpacesParams {
userId: string;
topics: Record<string, TopicData>;
}
/**
* Identify spaces for topics using LLM analysis
* Takes top 10 keywords and top 5 episodes per topic
*/
export async function identifySpacesForTopics(
params: IdentifySpacesParams,
): Promise<SpaceProposal[]> {
const { userId, topics } = params;
// Get existing spaces for the user
const existingSpaces = await getAllSpacesForUser(userId);
// Prepare topic data with top 10 keywords and top 5 episodes
const topicsForAnalysis = await Promise.all(
Object.entries(topics).map(async ([topicId, topicData]) => {
// Take top 10 keywords
const topKeywords = topicData.keywords.slice(0, 10);
// Take top 5 episodes and fetch their content
const topEpisodeIds = topicData.episodeIds.slice(0, 5);
const episodes = await Promise.all(
topEpisodeIds.map((id) => getEpisode(id)),
);
return {
topicId,
keywords: topKeywords,
episodes: episodes
.filter((e) => e !== null)
.map((e) => ({
content: e!.content.substring(0, 500), // Limit to 500 chars per episode
})),
episodeCount: topicData.episodeIds.length,
};
}),
);
// Build the prompt
const prompt = buildSpaceIdentificationPrompt(
existingSpaces,
topicsForAnalysis,
);
logger.info("Identifying spaces for topics", {
userId,
topicCount: Object.keys(topics).length,
existingSpaceCount: existingSpaces.length,
});
// Call LLM with structured output
let responseText = "";
await makeModelCall(
false, // not streaming
[{ role: "user", content: prompt }],
(text) => {
responseText = text;
},
{
temperature: 0.7,
},
"high", // Use high complexity for space identification
);
// Parse the response
const proposals = parseSpaceProposals(responseText);
logger.info("Space identification completed", {
userId,
proposalCount: proposals.length,
});
return proposals;
}
/**
* Build the prompt for space identification
*/
function buildSpaceIdentificationPrompt(
existingSpaces: SpaceNode[],
topics: Array<{
topicId: string;
keywords: string[];
episodes: Array<{ content: string }>;
episodeCount: number;
}>,
): string {
const existingSpacesSection =
existingSpaces.length > 0
? `## Existing Spaces
The user currently has these spaces:
${existingSpaces.map((s) => `- **${s.name}**: ${s.description || "No description"} (${s.contextCount || 0} episodes)`).join("\n")}
When identifying new spaces, consider if topics fit into existing spaces or if new spaces are needed.`
: `## Existing Spaces
The user currently has no spaces defined. This is a fresh start for space organization.`;
const topicsSection = `## Topics Discovered
BERT topic modeling has identified ${topics.length} distinct topics from the user's episodes. Each topic represents a cluster of semantically related content.
${topics
.map(
(t, idx) => `### Topic ${idx + 1} (ID: ${t.topicId})
**Episode Count**: ${t.episodeCount}
**Top Keywords**: ${t.keywords.join(", ")}
**Sample Episodes** (showing ${t.episodes.length} of ${t.episodeCount}):
${t.episodes.map((e, i) => `${i + 1}. ${e.content}`).join("\n")}
`,
)
.join("\n")}`;
return `You are a knowledge organization expert. Your task is to analyze discovered topics and identify appropriate "spaces" (thematic containers) for organizing episodic memories.
${existingSpacesSection}
${topicsSection}
## Task
Analyze the topics above and identify spaces that would help organize this content meaningfully. For each space:
1. **Consider existing spaces first**: If topics clearly belong to existing spaces, assign them there
2. **Create new spaces when needed**: If topics represent distinct themes not covered by existing spaces
3. **Group related topics**: Multiple topics can be assigned to the same space if they share a theme
4. **Aim for 20-50 episodes per space**: This is the sweet spot for space cohesion
5. **Focus on user intent**: What would help the user find and understand this content later?
## Output Format
Return your analysis as a JSON array of space proposals. Each proposal should have:
\`\`\`json
[
{
"name": "Space name (use existing space name if assigning to existing space)",
"intent": "Clear description of what this space represents",
"confidence": 0.85,
"reason": "Brief explanation of why these topics belong together",
"topics": ["topic-id-1", "topic-id-2"]
}
]
\`\`\`
**Important Guidelines**:
- **confidence**: 0.0-1.0 scale indicating how confident you are this is a good grouping
- **topics**: Array of topic IDs (use the exact IDs from above like "0", "1", "-1", etc.)
- **name**: For existing spaces, use the EXACT name. For new spaces, create a clear, concise name
- Only propose spaces with confidence >= 0.6
- Each topic should only appear in ONE space proposal
- Topic "-1" is the outlier topic (noise) - only include if it genuinely fits a theme
Return ONLY the JSON array, no additional text.`;
}
/**
* Parse space proposals from LLM response
*/
function parseSpaceProposals(responseText: string): SpaceProposal[] {
try {
// Extract JSON from markdown code blocks if present
const jsonMatch = responseText.match(/```(?:json)?\s*(\[[\s\S]*?\])\s*```/);
const jsonText = jsonMatch ? jsonMatch[1] : responseText;
const proposals = JSON.parse(jsonText.trim());
if (!Array.isArray(proposals)) {
throw new Error("Response is not an array");
}
// Validate and filter proposals
return proposals
.filter((p) => {
return (
p.name &&
p.intent &&
typeof p.confidence === "number" &&
p.confidence >= 0.6 &&
Array.isArray(p.topics) &&
p.topics.length > 0
);
})
.map((p) => ({
name: p.name.trim(),
intent: p.intent.trim(),
confidence: p.confidence,
reason: (p.reason || "").trim(),
topics: p.topics.map((t: any) => String(t)),
}));
} catch (error) {
logger.error("Failed to parse space proposals", {
error,
responseText: responseText.substring(0, 500),
});
return [];
}
}