core/apps/webapp/app/trigger/extension/summary.ts

import { metadata, task } from "@trigger.dev/sdk";
import { type CoreMessage } from "ai";
import * as cheerio from "cheerio";
import { z } from "zod";
import { makeModelCall } from "~/lib/model.server";

export type PageType = "text" | "video";

export const ExtensionSummaryBodyRequest = z.object({
  html: z.string().min(1, "HTML content is required"),
  url: z.string().url("Valid URL is required"),
  title: z.string().optional(),
});

interface ContentExtractionResult {
  pageType: PageType;
  title: string;
  content: string;
  metadata: {
    url: string;
    wordCount: number;
  };
  supported: boolean;
}

/**
 * Detect if page contains video content
 */
function isVideoPage(url: string, $: cheerio.CheerioAPI): boolean {
  const hostname = new URL(url).hostname.toLowerCase();

  // Known video platforms
  if (
    hostname.includes("youtube.com") ||
    hostname.includes("youtu.be") ||
    hostname.includes("vimeo.com") ||
    hostname.includes("twitch.tv") ||
    hostname.includes("tiktok.com")
  ) {
    return true;
  }

  // Generic video content detection
  const videoElements = $("video").length;
  const videoPlayers = $(
    '.video-player, [class*="video-player"], [data-testid*="video"]',
  ).length;

  // If there are multiple video indicators, likely a video-focused page
  return videoElements > 0 || videoPlayers > 2;
}

/**
 * Extract all text content from any webpage
 */
function extractTextContent(
  $: cheerio.CheerioAPI,
  url: string,
): ContentExtractionResult {
  // Extract title from multiple possible locations
  const title =
    $("title").text() ||
    $('meta[property="og:title"]').attr("content") ||
    $('meta[name="title"]').attr("content") ||
    $("h1").first().text() ||
    "Untitled Page";

  // Check if this is primarily a video page
  const isVideo = isVideoPage(url, $);
  const pageType: PageType = isVideo ? "video" : "text";

  let content = "";

  if (isVideo) {
    // For video pages, try to get description/transcript text
    content =
      $("#description, .video-description, .description").text() ||
      $('meta[name="description"]').attr("content") ||
      $('[class*="transcript"], [class*="caption"]').text() ||
      "Video content detected - text summarization not available";
  } else {
    // Simple universal text extraction
    // Remove non-content elements
    $("script, style, noscript, nav, header, footer").remove();

    // Get all text content
    const allText = $("body").text();

    // Split into sentences and filter for meaningful content
    const sentences = allText
      .split(/[.!?]+/)
      .map((s) => s.trim())
      .filter((s) => s.length > 20) // Keep sentences with substance
      .filter(
        (s) =>
          !/^(click|menu|button|nav|home|search|login|signup|subscribe)$/i.test(
            s.toLowerCase(),
          ),
      ) // Remove UI text
      .filter((s) => s.split(" ").length > 3); // Keep sentences with multiple words

    content = sentences.join(". ").slice(0, 10000);
  }

  // Clean up whitespace and normalize text
  content = content.replace(/\s+/g, " ").trim();

  const wordCount = content
    .split(/\s+/)
    .filter((word) => word.length > 0).length;
  const supported = !isVideo && content.length > 50;

  return {
    pageType,
    title: title.trim(),
    content: content.slice(0, 10000), // Limit content size for processing
    metadata: {
      url,
      wordCount,
    },
    supported,
  };
}

/**
 * Generate summary using LLM
 */
async function generateSummary(title: string, content: string) {
  const messages: CoreMessage[] = [
    {
      role: "system",
      content: `You are a helpful assistant that creates concise summaries of web content in HTML format.

Create a clear, informative summary that captures the key points and main ideas from the provided content. The summary should:
- Focus on the most important information and key takeaways
- Be concise but comprehensive
- Maintain the original context and meaning
- Be useful for someone who wants to quickly understand the content
- Format the summary in clean HTML using appropriate tags like <h1>, <h2>, <p>, <ul>, <li> to structure the information

IMPORTANT: Return ONLY the HTML content without any markdown code blocks or formatting. Do not wrap the response in \`\`\`html or any other markdown syntax. Return the raw HTML directly.

Extract the essential information while preserving important details, facts, or insights.`,
    },
    {
      role: "user",
      content: `Title: ${title}
Content: ${content}

Please provide a concise summary of this content in HTML format.`,
    },
  ];

  return await makeModelCall(
    true,
    messages,
    () => {}, // onFinish callback
    { temperature: 0.3 },
  );
}

export const extensionSummary = task({
  id: "extensionSummary",
  maxDuration: 3000,
  run: async (body: z.infer<typeof ExtensionSummaryBodyRequest>) => {
    try {
      const $ = cheerio.load(body.html);

      // Extract content from any webpage
      const extraction = extractTextContent($, body.url);

      // Override title if provided
      if (body.title) {
        extraction.title = body.title;
      }

      let summary = "";

      if (extraction.supported && extraction.content.length > 0) {
        // Generate summary for text content
        const response = (await generateSummary(
          extraction.title,
          extraction.content,
        )) as any;

        const stream = await metadata.stream("messages", response.textStream);

        let finalText: string = "";
        for await (const chunk of stream) {
          finalText = finalText + chunk;
        }

        summary = finalText;
      } else {
        // Handle unsupported content types
        if (extraction.pageType === "video") {
          summary =
            "Video content detected. Text summarization not available for video-focused pages.";
        } else {
          summary =
            "Unable to extract sufficient text content for summarization.";
        }
      }

      const response = {
        success: true,
        pageType: extraction.pageType,
        title: extraction.title,
        summary,
        content: extraction.content.slice(0, 1000), // Return first 1000 chars of content
        supported: extraction.supported,
        metadata: extraction.metadata,
      };

      return response;
    } catch (error) {
      console.error("Error processing extension summary request:", error);

      return {
        success: false,
        error: "Failed to process page content",
        pageType: "text" as PageType,
        title: body.title || "Error",
        summary: "Unable to process this page content.",
        content: "",
        supported: false,
        metadata: {
          url: body.url,
          wordCount: 0,
        },
      };
    }
  },
});