core/apps/webapp/app/jobs/spaces/space-identification.logic.ts

/**
 * Space Identification Logic
 *
 * Uses LLM to identify appropriate spaces for topics discovered by BERT analysis
 */

import { makeModelCall } from "~/lib/model.server";
import { getAllSpacesForUser } from "~/services/graphModels/space";
import { getEpisode } from "~/services/graphModels/episode";
import { logger } from "~/services/logger.service";
import type { SpaceNode } from "@core/types";

/**
 * Data describing one discovered topic, as supplied to space identification.
 */
export interface TopicData {
  /**
   * Keywords for the topic. Only the first 10 are included in the LLM prompt
   * (presumably ordered most-relevant first — confirm with the producer).
   */
  keywords: string[];
  /**
   * IDs of the episodes assigned to the topic. The first 5 are fetched as
   * content samples; the full array length is reported as the episode count.
   */
  episodeIds: string[];
}

/**
 * A space suggested by the LLM for grouping one or more topics.
 */
export interface SpaceProposal {
  /**
   * Space name. The prompt asks the model to reuse the exact name of an
   * existing space when assigning topics to it.
   */
  name: string;
  /** Description of what the space represents. */
  intent: string;
  /**
   * Model-reported confidence. The prompt requests a 0.0-1.0 scale and
   * proposals below 0.6 are dropped while parsing the response.
   */
  confidence: number;
  /** Model's explanation for the grouping; empty string when omitted. */
  reason: string;
  topics: string[]; // Array of topic IDs
}

/**
 * Arguments accepted by identifySpacesForTopics.
 */
interface IdentifySpacesParams {
  /** User whose existing spaces are loaded as context for the prompt. */
  userId: string;
  /** Discovered topics keyed by topic ID. */
  topics: Record<string, TopicData>;
}

/**
 * Identify spaces for topics using LLM analysis.
 *
 * For every topic, the first 10 keywords and the content of the first 5
 * episodes (each truncated to 500 characters) are placed in a prompt together
 * with the user's existing spaces. The model's reply is then parsed into
 * space proposals.
 *
 * @param params - The user ID and the discovered topics keyed by topic ID.
 * @returns The proposals that survive response parsing; an empty array when
 *   no topics were supplied or the response could not be parsed.
 */
export async function identifySpacesForTopics(
  params: IdentifySpacesParams,
): Promise<SpaceProposal[]> {
  const { userId, topics } = params;
  const topicEntries = Object.entries(topics);

  // Without topics there is nothing to assign: any proposal the model came up
  // with would reference topic IDs that do not exist, so skip the space lookup
  // and the (expensive) model call altogether.
  if (topicEntries.length === 0) {
    logger.info("No topics provided, skipping space identification", {
      userId,
    });
    return [];
  }

  // Per-topic limits that keep the prompt size bounded.
  const maxKeywordsPerTopic = 10;
  const maxEpisodesPerTopic = 5;
  const maxCharsPerEpisode = 500;

  // Get existing spaces for the user
  const existingSpaces = await getAllSpacesForUser(userId);

  // Prepare topic data with the top keywords and sample episode content
  const topicsForAnalysis = await Promise.all(
    topicEntries.map(async ([topicId, topicData]) => {
      const sampleEpisodeIds = topicData.episodeIds.slice(
        0,
        maxEpisodesPerTopic,
      );
      const episodes = await Promise.all(
        sampleEpisodeIds.map((id) => getEpisode(id)),
      );

      return {
        topicId,
        keywords: topicData.keywords.slice(0, maxKeywordsPerTopic),
        // Drop episodes that could not be loaded; flatMap narrows the type so
        // no non-null assertion is required.
        episodes: episodes.flatMap((episode) =>
          episode == null
            ? []
            : [{ content: episode.content.substring(0, maxCharsPerEpisode) }],
        ),
        // Report the full cluster size, not just the number of samples shown.
        episodeCount: topicData.episodeIds.length,
      };
    }),
  );

  // Build the prompt
  const prompt = buildSpaceIdentificationPrompt(
    existingSpaces,
    topicsForAnalysis,
  );

  logger.info("Identifying spaces for topics", {
    userId,
    topicCount: topicEntries.length,
    existingSpaceCount: existingSpaces.length,
  });

  // Call the LLM; the callback receives the complete (non-streamed) response.
  let responseText = "";
  await makeModelCall(
    false, // not streaming
    [{ role: "user", content: prompt }],
    (text) => {
      responseText = text;
    },
    {
      temperature: 0.7,
    },
    "high", // Use high complexity for space identification
  );

  // Parse the response
  const proposals = parseSpaceProposals(responseText);

  logger.info("Space identification completed", {
    userId,
    proposalCount: proposals.length,
  });

  return proposals;
}

/**
 * Assemble the LLM prompt used for space identification.
 *
 * The prompt consists of four parts separated by blank lines: a role
 * introduction, the user's existing spaces, the discovered topics with their
 * keywords and sample episodes, and the task/output-format instructions.
 *
 * @param existingSpaces - Spaces the user already has (may be empty).
 * @param topics - Prepared topic summaries to present to the model.
 * @returns The complete prompt text.
 */
function buildSpaceIdentificationPrompt(
  existingSpaces: SpaceNode[],
  topics: Array<{
    topicId: string;
    keywords: string[];
    episodes: Array<{ content: string }>;
    episodeCount: number;
  }>,
): string {
  // --- Existing spaces -----------------------------------------------------
  const spaceBullets = existingSpaces.map((space) => {
    const description = space.description || "No description";
    const episodeTotal = space.contextCount || 0;
    return `- **${space.name}**: ${description} (${episodeTotal} episodes)`;
  });

  const existingSpacesSection =
    spaceBullets.length === 0
      ? [
          "## Existing Spaces",
          "",
          "The user currently has no spaces defined. This is a fresh start for space organization.",
        ].join("\n")
      : [
          "## Existing Spaces",
          "",
          "The user currently has these spaces:",
          ...spaceBullets,
          "",
          "When identifying new spaces, consider if topics fit into existing spaces or if new spaces are needed.",
        ].join("\n");

  // --- Discovered topics ---------------------------------------------------
  const topicBlocks = topics.map((topic, position) => {
    const numberedSamples = topic.episodes
      .map((episode, sampleIndex) => `${sampleIndex + 1}. ${episode.content}`)
      .join("\n");

    // The trailing empty entry keeps the newline that terminates every block.
    return [
      `### Topic ${position + 1} (ID: ${topic.topicId})`,
      `**Episode Count**: ${topic.episodeCount}`,
      `**Top Keywords**: ${topic.keywords.join(", ")}`,
      "",
      `**Sample Episodes** (showing ${topic.episodes.length} of ${topic.episodeCount}):`,
      numberedSamples,
      "",
    ].join("\n");
  });

  const topicsSection = [
    "## Topics Discovered",
    "",
    `BERT topic modeling has identified ${topics.length} distinct topics from the user's episodes. Each topic represents a cluster of semantically related content.`,
    "",
    topicBlocks.join("\n"),
  ].join("\n");

  // --- Fixed framing text --------------------------------------------------
  const introduction = `You are a knowledge organization expert. Your task is to analyze discovered topics and identify appropriate "spaces" (thematic containers) for organizing episodic memories.`;

  const instructions = `## Task

Analyze the topics above and identify spaces that would help organize this content meaningfully. For each space:

1. **Consider existing spaces first**: If topics clearly belong to existing spaces, assign them there
2. **Create new spaces when needed**: If topics represent distinct themes not covered by existing spaces
3. **Group related topics**: Multiple topics can be assigned to the same space if they share a theme
4. **Aim for 20-50 episodes per space**: This is the sweet spot for space cohesion
5. **Focus on user intent**: What would help the user find and understand this content later?

## Output Format

Return your analysis as a JSON array of space proposals. Each proposal should have:

\`\`\`json
[
  {
    "name": "Space name (use existing space name if assigning to existing space)",
    "intent": "Clear description of what this space represents",
    "confidence": 0.85,
    "reason": "Brief explanation of why these topics belong together",
    "topics": ["topic-id-1", "topic-id-2"]
  }
]
\`\`\`

**Important Guidelines**:
- **confidence**: 0.0-1.0 scale indicating how confident you are this is a good grouping
- **topics**: Array of topic IDs (use the exact IDs from above like "0", "1", "-1", etc.)
- **name**: For existing spaces, use the EXACT name. For new spaces, create a clear, concise name
- Only propose spaces with confidence >= 0.6
- Each topic should only appear in ONE space proposal
- Topic "-1" is the outlier topic (noise) - only include if it genuinely fits a theme

Return ONLY the JSON array, no additional text.`;

  return [introduction, existingSpacesSection, topicsSection, instructions].join(
    "\n\n",
  );
}

/**
 * Parse space proposals from LLM response.
 *
 * The JSON array is taken from a markdown code fence when one is present,
 * otherwise from the outermost `[` ... `]` span of the text. Every element is
 * validated on its own, so one malformed entry does not discard the valid
 * proposals that accompany it.
 *
 * @param responseText - Raw text returned by the model.
 * @returns The valid proposals (trimmed strings, topic IDs coerced to
 *   strings). Returns an empty array, after logging an error, when the text
 *   is not parseable JSON or the parsed value is not an array.
 */
function parseSpaceProposals(responseText: string): SpaceProposal[] {
  try {
    const parsed: unknown = JSON.parse(extractJsonArrayText(responseText));

    if (!Array.isArray(parsed)) {
      throw new Error("Response is not an array");
    }

    const proposals: SpaceProposal[] = [];
    for (const candidate of parsed) {
      const proposal = toSpaceProposal(candidate);
      if (proposal !== null) {
        proposals.push(proposal);
      }
    }
    return proposals;
  } catch (error) {
    logger.error("Failed to parse space proposals", {
      error,
      responseText: responseText.substring(0, 500),
    });
    return [];
  }
}

/**
 * Locate the text of the JSON array inside a model response.
 */
function extractJsonArrayText(responseText: string): string {
  // Prefer the contents of a markdown code fence if the model used one.
  const fencedBody = responseText.match(
    /```(?:json)?\s*(\[[\s\S]*?\])\s*```/,
  )?.[1];
  if (fencedBody !== undefined) {
    return fencedBody.trim();
  }

  // Otherwise tolerate prose around the array by cutting from the first "["
  // to the last "]". If no such span exists, hand the text to JSON.parse
  // unchanged so the caller reports the failure.
  const trimmed = responseText.trim();
  const start = trimmed.indexOf("[");
  const end = trimmed.lastIndexOf("]");
  return start !== -1 && end > start ? trimmed.slice(start, end + 1) : trimmed;
}

/**
 * Validate one parsed element and normalize it; returns null when invalid.
 */
function toSpaceProposal(candidate: unknown): SpaceProposal | null {
  if (typeof candidate !== "object" || candidate === null) {
    return null;
  }

  const { name, intent, confidence, reason, topics } = candidate as Record<
    string,
    unknown
  >;

  if (typeof name !== "string" || typeof intent !== "string") {
    return null;
  }
  const trimmedName = name.trim();
  const trimmedIntent = intent.trim();
  if (trimmedName === "" || trimmedIntent === "") {
    return null;
  }

  // Same threshold the prompt gives the model; the negated comparison also
  // rejects NaN.
  const minConfidence = 0.6;
  if (typeof confidence !== "number" || !(confidence >= minConfidence)) {
    return null;
  }

  if (!Array.isArray(topics)) {
    return null;
  }
  // Topic IDs may arrive as numbers (e.g. -1); anything else (objects, null)
  // cannot be a meaningful ID and is dropped instead of being stringified.
  const topicIds = topics
    .filter(
      (topic: unknown): topic is string | number =>
        typeof topic === "string" || typeof topic === "number",
    )
    .map((topic) => String(topic));
  if (topicIds.length === 0) {
    return null;
  }

  return {
    name: trimmedName,
    intent: trimmedIntent,
    confidence,
    reason: typeof reason === "string" ? reason.trim() : "",
    topics: topicIds,
  };
}