// core/apps/webapp/app/services/documentChunker.server.ts
import { encode } from "gpt-tokenizer";
import crypto from "crypto";

export interface DocumentChunk {
  content: string;
  chunkIndex: number;
  title?: string;
  context?: string;
  startPosition: number;
  endPosition: number;
  contentHash: string; // Hash for change detection
}

export interface ChunkedDocument {
  documentId: string;
  title: string;
  originalContent: string;
  chunks: DocumentChunk[];
  totalChunks: number;
  contentHash: string; // Hash of the entire document
  chunkHashes: string[]; // Array of chunk hashes for change detection
}

/**
 * Document chunking service that splits large documents into semantic chunks.
 * Targets roughly 1k tokens per chunk (500-1,500 by default) for better entity
 * extraction, preferring natural section and paragraph boundaries.
 */
export class DocumentChunker {
  private readonly TARGET_CHUNK_SIZE: number; // Kept small for better entity extraction
  private readonly MIN_CHUNK_SIZE: number;
  private readonly MAX_CHUNK_SIZE: number;
  private readonly MIN_PARAGRAPH_SIZE: number; // Minimum tokens for a paragraph to be considered

  constructor(
    targetChunkSize: number = 1000,
    minChunkSize: number = 500,
    maxChunkSize: number = 1500,
    minParagraphSize: number = 100,
  ) {
    this.TARGET_CHUNK_SIZE = targetChunkSize;
    this.MIN_CHUNK_SIZE = minChunkSize;
    this.MAX_CHUNK_SIZE = maxChunkSize;
    this.MIN_PARAGRAPH_SIZE = minParagraphSize;
  }

  /**
   * Chunk a document into semantic sections with natural boundaries
   */
  async chunkDocument(
    originalContent: string,
    title: string,
  ): Promise<ChunkedDocument> {
    const documentId = crypto.randomUUID();
    const contentHash = this.generateContentHash(originalContent);

    // First, split by major section headers (markdown style)
    const majorSections = this.splitByMajorSections(originalContent);

    const chunks: DocumentChunk[] = [];
    let currentChunk = "";
    let currentChunkStart = 0;
    let currentChunkTitle: string | undefined;
    let chunkIndex = 0;

    for (const section of majorSections) {
      const sectionTokens = encode(section.content).length;
      const currentChunkTokens = encode(currentChunk).length;

      // If adding this section would exceed max size, finalize current chunk
      if (
        currentChunkTokens > 0 &&
        currentChunkTokens + sectionTokens > this.MAX_CHUNK_SIZE
      ) {
        if (currentChunkTokens >= this.MIN_CHUNK_SIZE) {
          chunks.push(
            this.createChunk(
              currentChunk,
              chunkIndex,
              currentChunkStart,
              currentChunkStart + currentChunk.length,
              currentChunkTitle, // title of the content in this chunk, not the incoming section
            ),
          );
          chunkIndex++;
          currentChunk = "";
          currentChunkTitle = undefined;
        }
      }

      // Add section to current chunk
      if (currentChunk) {
        currentChunk += "\n\n" + section.content;
      } else {
        currentChunk = section.content;
        currentChunkStart = section.startPosition;
        currentChunkTitle = section.title;
      }

      // If current chunk is large enough and we have a natural break, consider chunking
      const updatedChunkTokens = encode(currentChunk).length;
      if (updatedChunkTokens >= this.TARGET_CHUNK_SIZE) {
        // Try to find a good breaking point within the section
        const paragraphs = this.splitIntoParagraphs(section.content);
        if (paragraphs.length > 1) {
          // Split at paragraph boundary if beneficial
          const optimalSplit = this.findOptimalParagraphSplit(currentChunk);
          if (optimalSplit) {
            chunks.push(
              this.createChunk(
                optimalSplit.beforeSplit,
                chunkIndex,
                currentChunkStart,
                currentChunkStart + optimalSplit.beforeSplit.length,
                currentChunkTitle,
              ),
            );
            chunkIndex++;
            currentChunk = optimalSplit.afterSplit;
            currentChunkStart += optimalSplit.beforeSplit.length;
            currentChunkTitle = section.title;
          }
        }
      }
    }

    // Add remaining content as final chunk
    if (
      currentChunk.trim() &&
      encode(currentChunk).length >= this.MIN_PARAGRAPH_SIZE
    ) {
      chunks.push(
        this.createChunk(
          currentChunk,
          chunkIndex,
          currentChunkStart,
          originalContent.length,
          currentChunkTitle,
        ),
      );
    }

    // Generate chunk hashes array
    const chunkHashes = chunks.map((chunk) => chunk.contentHash);

    return {
      documentId,
      title,
      originalContent,
      chunks,
      totalChunks: chunks.length,
      contentHash,
      chunkHashes,
    };
  }
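
  // Illustrative sketch of the result shape (hypothetical input; exact chunk
  // boundaries depend on gpt-tokenizer counts and header positions):
  //
  //   const chunker = new DocumentChunker();
  //   const { chunks, chunkHashes, totalChunks } = await chunker.chunkDocument(
  //     "## Setup\n...long text...\n\n## Usage\n...long text...",
  //     "README",
  //   );
  //   // chunks[0].context -> "Chunk 1: Setup" (when "Setup" is the detected title)
  //   // chunkHashes[i] === chunks[i].contentHash, totalChunks === chunks.length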

  private splitByMajorSections(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    // Detect headers from multiple formats
    const headerMatches = this.findAllHeaders(content);

    if (headerMatches.length === 0) {
      // No headers found, try to split by natural boundaries
      return this.splitByNaturalBoundaries(content);
    }

    let lastIndex = 0;
    for (let i = 0; i < headerMatches.length; i++) {
      const match = headerMatches[i];
      const nextMatch = headerMatches[i + 1];
      const sectionStart = lastIndex;
      const sectionEnd = nextMatch ? nextMatch.startIndex : content.length;
      const sectionContent = content.slice(sectionStart, sectionEnd).trim();

      if (sectionContent) {
        sections.push({
          content: sectionContent,
          title: match.title,
          startPosition: sectionStart,
          endPosition: sectionEnd,
        });
      }

      // Advance to the start of the next header so sections never overlap
      // (advancing only past the current header line would duplicate each
      // section body in the section that follows it)
      lastIndex = sectionEnd;
    }

    return sections;
  }

  private findAllHeaders(content: string): Array<{
    title: string;
    startIndex: number;
    endIndex: number;
    level: number;
  }> {
    const headers: Array<{
      title: string;
      startIndex: number;
      endIndex: number;
      level: number;
    }> = [];

    // Markdown headers (# ## ### etc.)
    const markdownRegex = /^(#{1,6})\s+(.+)$/gm;
    let match;
    while ((match = markdownRegex.exec(content)) !== null) {
      headers.push({
        title: match[2].trim(),
        startIndex: match.index,
        endIndex: match.index + match[0].length,
        level: match[1].length,
      });
    }

    // HTML headers (<h1>, <h2>, etc.); the backreference ensures the closing
    // tag matches the opening level
    const htmlRegex = /<h([1-6])[^>]*>(.*?)<\/h\1>/gi;
    while ((match = htmlRegex.exec(content)) !== null) {
      const textContent = match[2].replace(/<[^>]*>/g, "").trim();
      if (textContent) {
        headers.push({
          title: textContent,
          startIndex: match.index,
          endIndex: match.index + match[0].length,
          level: parseInt(match[1], 10),
        });
      }
    }

    // Underlined headers (Setext-style)
    const setextRegex = /^(.+)\n(={3,}|-{3,})$/gm;
    while ((match = setextRegex.exec(content)) !== null) {
      const level = match[2].startsWith("=") ? 1 : 2;
      headers.push({
        title: match[1].trim(),
        startIndex: match.index,
        endIndex: match.index + match[0].length,
        level,
      });
    }

    // Sort by position in document
    return headers.sort((a, b) => a.startIndex - b.startIndex);
  }
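
  // Example of the three header styles detected above (hypothetical input):
  //
  //   "# Markdown header"       -> { level: 1, title: "Markdown header" }
  //   "<h2>HTML header</h2>"    -> { level: 2, title: "HTML header" }
  //   "Setext header\n========" -> { level: 1, title: "Setext header" }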

  private splitByNaturalBoundaries(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    // Look for natural boundaries: double line breaks, HTML block elements, etc.
    const boundaryPatterns = [
      /\n\s*\n\s*\n/g, // Triple line breaks (strong boundary)
      /<\/(?:div|section|article|main|p)>\s*<(?:div|section|article|main|p)/gi, // HTML block boundaries
      /\n\s*[-=*]{4,}\s*\n/g, // Horizontal rules
    ];

    let boundaries: number[] = [0];
    for (const pattern of boundaryPatterns) {
      let match;
      while ((match = pattern.exec(content)) !== null) {
        boundaries.push(match.index);
      }
    }
    boundaries.push(content.length);
    boundaries = [...new Set(boundaries)].sort((a, b) => a - b);

    // If no natural boundaries found, split by token count
    if (boundaries.length <= 2) {
      return this.splitByTokenCount(content);
    }

    for (let i = 0; i < boundaries.length - 1; i++) {
      const start = boundaries[i];
      const end = boundaries[i + 1];
      const sectionContent = content.slice(start, end).trim();
      if (
        sectionContent &&
        encode(sectionContent).length >= this.MIN_PARAGRAPH_SIZE
      ) {
        sections.push({
          content: sectionContent,
          startPosition: start,
          endPosition: end,
        });
      }
    }

    return sections.length > 0 ? sections : this.splitByTokenCount(content);
  }
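
  // Sketch of the fallback above (hypothetical input): in plain text like
  //
  //   "Part one...\n\n\nPart two...\n-----\nPart three..."
  //
  // the triple line break and the horizontal rule each contribute a boundary,
  // yielding three candidate sections; any section below MIN_PARAGRAPH_SIZE
  // tokens is dropped rather than emitted.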

  private splitByTokenCount(content: string): Array<{
    content: string;
    title?: string;
    startPosition: number;
    endPosition: number;
  }> {
    const sections: Array<{
      content: string;
      title?: string;
      startPosition: number;
      endPosition: number;
    }> = [];

    const totalTokens = encode(content).length;
    const numSections = Math.ceil(totalTokens / this.TARGET_CHUNK_SIZE);
    const charsPerSection = Math.ceil(content.length / numSections);

    // Track the real cursor so that nudging a cut forward to a word boundary
    // never duplicates text at the start of the next section
    let cursor = 0;
    for (let i = 0; i < numSections && cursor < content.length; i++) {
      const start = cursor;
      const end = Math.min(start + charsPerSection, content.length);

      // Try to break at word boundaries
      let actualEnd = end;
      if (end < content.length) {
        const nextSpace = content.indexOf(" ", end);
        const nextNewline = content.indexOf("\n", end);
        const nextBoundary = Math.min(
          nextSpace === -1 ? Infinity : nextSpace,
          nextNewline === -1 ? Infinity : nextNewline,
        );
        if (nextBoundary !== Infinity && nextBoundary - end < 100) {
          actualEnd = nextBoundary;
        }
      }

      const sectionContent = content.slice(start, actualEnd).trim();
      if (sectionContent) {
        sections.push({
          content: sectionContent,
          startPosition: start,
          endPosition: actualEnd,
        });
      }
      cursor = actualEnd;
    }

    return sections;
  }
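
  // Worked example for the arithmetic above: a 4,500-token, 18,000-character
  // document with the default 1,000-token target gives
  //   numSections     = ceil(4500 / 1000) = 5
  //   charsPerSection = ceil(18000 / 5)   = 3600
  // and each cut is nudged forward (by up to 100 characters) to the next space
  // or newline, with the following section starting where the previous ended.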

  private splitIntoParagraphs(content: string): string[] {
    // Handle HTML paragraphs first
    if (
      content.includes("<p") ||
      content.includes("<div") ||
      content.includes("<section")
    ) {
      return this.splitHtmlParagraphs(content);
    }

    // Split by double newlines (paragraph breaks) for text/markdown
    return content
      .split(/\n\s*\n/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0);
  }

  private splitHtmlParagraphs(content: string): string[] {
    const paragraphs: string[] = [];

    // Split by HTML block elements; the \b keeps "p" from matching the start
    // of "pre" (and "li" the start of "link")
    const blockElements = [
      "p",
      "div",
      "section",
      "article",
      "li",
      "blockquote",
      "pre",
    ];
    const blockRegex = new RegExp(
      `<(${blockElements.join("|")})\\b[^>]*>.*?</\\1>`,
      "gis",
    );

    let lastIndex = 0;
    let match;
    while ((match = blockRegex.exec(content)) !== null) {
      // Add content before this block element
      if (match.index > lastIndex) {
        const beforeContent = content.slice(lastIndex, match.index).trim();
        if (beforeContent) {
          paragraphs.push(beforeContent);
        }
      }

      // Add the block element content (strip tags for text content)
      const blockContent = match[0].replace(/<[^>]*>/g, " ").trim();
      if (blockContent) {
        paragraphs.push(blockContent);
      }

      lastIndex = match.index + match[0].length;
    }

    // Add remaining content
    if (lastIndex < content.length) {
      const remainingContent = content.slice(lastIndex).trim();
      if (remainingContent) {
        // Clean up remaining HTML and split by newlines
        const cleaned = remainingContent.replace(/<[^>]*>/g, " ").trim();
        if (cleaned) {
          paragraphs.push(
            ...cleaned.split(/\n\s*\n/).filter((p) => p.trim().length > 0),
          );
        }
      }
    }

    return paragraphs.length > 0
      ? paragraphs
      : this.splitTextParagraphs(content);
  }

  private splitTextParagraphs(content: string): string[] {
    return content
      .split(/\n\s*\n/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0);
  }

  private findOptimalParagraphSplit(content: string): {
    beforeSplit: string;
    afterSplit: string;
  } | null {
    const paragraphs = this.splitIntoParagraphs(content);
    if (paragraphs.length < 2) return null;

    let bestSplitIndex = -1;
    let bestScore = 0;

    // Find the split that gets us closest to target size
    for (let i = 1; i < paragraphs.length; i++) {
      const beforeSplit = paragraphs.slice(0, i).join("\n\n");
      const afterSplit = paragraphs.slice(i).join("\n\n");
      const beforeTokens = encode(beforeSplit).length;
      const afterTokens = encode(afterSplit).length;

      // Score based on how close we get to target, avoiding too-small chunks
      if (
        beforeTokens >= this.MIN_CHUNK_SIZE &&
        afterTokens >= this.MIN_PARAGRAPH_SIZE
      ) {
        const beforeDistance = Math.abs(beforeTokens - this.TARGET_CHUNK_SIZE);
        const score = 1 / (1 + beforeDistance); // Higher score for closer to target
        if (score > bestScore) {
          bestScore = score;
          bestSplitIndex = i;
        }
      }
    }

    if (bestSplitIndex > 0) {
      return {
        beforeSplit: paragraphs.slice(0, bestSplitIndex).join("\n\n"),
        afterSplit: paragraphs.slice(bestSplitIndex).join("\n\n"),
      };
    }

    return null;
  }
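
  // Scoring sketch with the default 1,000-token target: a split leaving a
  // 900-token prefix scores 1 / (1 + |900 - 1000|) = 1/101 (about 0.0099),
  // while an exact 1,000-token prefix scores 1 / (1 + 0) = 1.0; the candidate
  // whose prefix lands closest to TARGET_CHUNK_SIZE wins.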

  private createChunk(
    content: string,
    chunkIndex: number,
    startPosition: number,
    endPosition: number,
    title?: string,
  ): DocumentChunk {
    // Generate a concise context/title if not provided
    const context = title || this.generateChunkContext(content);
    const contentHash = this.generateContentHash(content.trim());

    return {
      content: content.trim(),
      chunkIndex,
      title: context,
      context: `Chunk ${chunkIndex + 1}${context ? `: ${context}` : ""}`,
      startPosition,
      endPosition,
      contentHash,
    };
  }

  private generateChunkContext(content: string): string {
    // Clean content from HTML tags and markup
    const cleanContent = content
      .replace(/<[^>]*>/g, " ") // Remove HTML tags
      .replace(/#{1,6}\s+/g, "") // Remove markdown headers
      .replace(/[=-]{3,}/g, "") // Remove underline headers
      .replace(/\s+/g, " ") // Normalize whitespace
      .trim();

    if (!cleanContent) {
      return "Document content";
    }

    // Find first substantial sentence or line
    const sentences = cleanContent
      .split(/[.!?]+/)
      .map((s) => s.trim())
      .filter(Boolean);
    for (const sentence of sentences.slice(0, 2)) {
      if (sentence.length > 20) {
        return (
          sentence.substring(0, 100) + (sentence.length > 100 ? "..." : "")
        );
      }
    }

    // Fallback to first meaningful chunk
    const words = cleanContent.split(/\s+/).slice(0, 15).join(" ");
    return words.substring(0, 100) + (words.length > 100 ? "..." : "");
  }

  /**
   * Generate content hash for change detection
   */
  private generateContentHash(content: string): string {
    return crypto
      .createHash("sha256")
      .update(content, "utf8")
      .digest("hex")
      .substring(0, 16);
  }
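
  // The hash is the first 16 hex characters (64 bits) of a SHA-256 digest;
  // for example, hashing the empty string yields "e3b0c44298fc1c14". That is
  // ample for change detection, though not for adversarial collision resistance.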

  /**
   * Compare chunk hashes to detect changes
   */
  static compareChunkHashes(
    oldHashes: string[],
    newHashes: string[],
  ): {
    changedIndices: number[];
    changePercentage: number;
  } {
    const maxLength = Math.max(oldHashes.length, newHashes.length);
    const changedIndices: number[] = [];

    for (let i = 0; i < maxLength; i++) {
      const oldHash = oldHashes[i];
      const newHash = newHashes[i];
      // Mark as changed if hash is different or chunk added/removed
      if (oldHash !== newHash) {
        changedIndices.push(i);
      }
    }

    const changePercentage =
      maxLength > 0 ? (changedIndices.length / maxLength) * 100 : 0;

    return {
      changedIndices,
      changePercentage,
    };
  }
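
  // Worked example: compareChunkHashes(["a1", "b2", "c3"], ["a1", "x9", "c3", "d4"])
  // walks max(3, 4) = 4 indices; index 1 differs and index 3 exists only in
  // newHashes, so it returns { changedIndices: [1, 3], changePercentage: 50 }.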

  /**
   * Calculate document size in tokens for threshold decisions
   */
  static getDocumentSizeInTokens(content: string): number {
    return encode(content).length;
  }
}
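
// Usage sketch (illustrative only; `markdownSource` and `previousHashes` are
// hypothetical stand-ins for whatever ingestion pipeline calls this service):
//
//   const chunker = new DocumentChunker();
//   const chunked = await chunker.chunkDocument(markdownSource, "API Guide");
//
//   // On re-ingest, diff against the hashes persisted from the previous run
//   // to decide which chunks need re-processing:
//   const { changedIndices, changePercentage } =
//     DocumentChunker.compareChunkHashes(previousHashes, chunked.chunkHashes);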