2025-08-24 12:22:12 +05:30

235 lines
6.7 KiB
TypeScript

import { metadata, task } from "@trigger.dev/sdk";
import { type CoreMessage } from "ai";
import * as cheerio from "cheerio";
import { z } from "zod";
import { makeModelCall } from "~/lib/model.server";
/** Coarse classification of a page: regular text content or a video-focused page. */
export type PageType = "text" | "video";
/**
 * Zod schema for the request body of the extension summary task:
 * the page's HTML, the page's URL, and an optional title.
 */
export const ExtensionSummaryBodyRequest = z.object({
// Page HTML; must be a non-empty string.
html: z.string().min(1, "HTML content is required"),
// Page URL; must pass zod's `url()` validation.
url: z.string().url("Valid URL is required"),
// Optional title; when non-empty, the task uses it instead of the extracted title.
title: z.string().optional(),
});
/**
 * Result of extracting summarizable content from a parsed page,
 * as produced by `extractTextContent`.
 */
interface ContentExtractionResult {
// Whether the page was classified as regular text or as a video page.
pageType: PageType;
// Page title (trimmed); "Untitled Page" when no title source is found.
title: string;
// Extracted text with whitespace collapsed, capped at 10000 characters.
content: string;
metadata: {
// The URL the content was extracted from, passed through unchanged.
url: string;
// Number of whitespace-separated words in the extracted content.
wordCount: number;
};
// True only for non-video pages that yielded more than 50 characters of content.
supported: boolean;
}
/** Registrable domains of platforms whose pages are always treated as video pages. */
const VIDEO_PLATFORM_DOMAINS: readonly string[] = [
  "youtube.com",
  "youtu.be",
  "vimeo.com",
  "twitch.tv",
  "tiktok.com",
];

/**
 * Detect whether a page is primarily video content.
 *
 * A page counts as a video page when either:
 * - its hostname is one of the known video platforms or a subdomain of one
 *   (e.g. `www.youtube.com`, `player.vimeo.com`), or
 * - the DOM contains at least one `<video>` element, or more than two
 *   elements that look like video players.
 *
 * The hostname is matched on domain boundaries rather than by substring, so
 * unrelated hosts that merely contain a platform name (for example
 * `notyoutube.com.example.org`) are not misclassified.
 *
 * @param url - Absolute URL of the page.
 * @param $ - Cheerio instance loaded with the page HTML.
 * @returns `true` when the page is considered video-focused.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
function isVideoPage(url: string, $: cheerio.CheerioAPI): boolean {
  // Drop a trailing root dot ("youtube.com.") so fully-qualified names still match.
  const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, "");

  const isKnownPlatform = VIDEO_PLATFORM_DOMAINS.some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
  if (isKnownPlatform) {
    return true;
  }

  // Generic video content detection.
  const videoElements = $("video").length;
  const videoPlayers = $(
    '.video-player, [class*="video-player"], [data-testid*="video"]',
  ).length;

  // A single <video> element is enough; player-like containers need more than
  // two matches because those selectors are much looser.
  return videoElements > 0 || videoPlayers > 2;
}
/**
 * Extract a title and summarizable text from a parsed page.
 *
 * Title resolution order: the document's first `<title>` element, the
 * `og:title` meta tag, the `title` meta tag, the first `<h1>`, and finally
 * the literal "Untitled Page". Blank (whitespace-only) candidates are skipped.
 *
 * For video pages (see `isVideoPage`) the content is the first non-blank of:
 * description elements, the description meta tag, transcript/caption
 * elements, or a fixed placeholder message. For all other pages the body
 * text is split into sentence-like fragments and only fragments longer than
 * 20 characters with more than 3 words are kept; this also discards short UI
 * labels such as "Menu" or "Login".
 *
 * Side effect: on the text path, `script`, `style`, `noscript`, `nav`,
 * `header` and `footer` elements are removed from `$`.
 *
 * @param $ - Cheerio instance loaded with the page HTML (mutated, see above).
 * @param url - Absolute URL of the page.
 * @returns The extraction result; `supported` is true only for non-video
 *   pages that yielded more than 50 characters of content.
 * @throws TypeError if `url` cannot be parsed (propagated from `isVideoPage`).
 */
function extractTextContent(
  $: cheerio.CheerioAPI,
  url: string,
): ContentExtractionResult {
  const MAX_CONTENT_LENGTH = 10000;

  // Collapse every run of whitespace (including newlines and tabs) to one space.
  const collapseWhitespace = (text: string): string =>
    text.replace(/\s+/g, " ").trim();

  // Return the first candidate that still has text after trimming, so that a
  // whitespace-only match does not shadow later, better sources.
  const firstNonBlank = (
    ...candidates: Array<string | undefined>
  ): string | undefined =>
    candidates
      .map((candidate) => candidate?.trim())
      .find((candidate): candidate is string => Boolean(candidate));

  // `.text()` concatenates the text of every matched element; restrict to the
  // first <title> so inline SVG <title> elements do not leak into the title.
  const title =
    firstNonBlank(
      $("title").first().text(),
      $('meta[property="og:title"]').attr("content"),
      $('meta[name="title"]').attr("content"),
      $("h1").first().text(),
    ) ?? "Untitled Page";

  // Classify before any elements are removed from the document.
  const isVideo = isVideoPage(url, $);
  const pageType: PageType = isVideo ? "video" : "text";

  let rawContent: string;
  if (isVideo) {
    // For video pages, try to get description/transcript text.
    rawContent =
      firstNonBlank(
        $("#description, .video-description, .description").text(),
        $('meta[name="description"]').attr("content"),
        $('[class*="transcript"], [class*="caption"]').text(),
      ) ?? "Video content detected - text summarization not available";
  } else {
    // Remove non-content elements, then work from the remaining body text.
    $("script, style, noscript, nav, header, footer").remove();

    const sentences = $("body")
      .text()
      .split(/[.!?]+/)
      // Normalize first so the word count below is not fooled by newlines or
      // tabs separating words.
      .map(collapseWhitespace)
      .filter(
        (sentence) => sentence.length > 20 && sentence.split(" ").length > 3,
      );

    rawContent = sentences.join(". ");
  }

  // Normalize and cap the size once, so `content` and `wordCount` always
  // describe the same text.
  const content = collapseWhitespace(rawContent)
    .slice(0, MAX_CONTENT_LENGTH)
    .trim();

  const wordCount = content
    .split(/\s+/)
    .filter((word) => word.length > 0).length;

  const supported = !isVideo && content.length > 50;

  return {
    pageType,
    title,
    content,
    metadata: {
      url,
      wordCount,
    },
    supported,
  };
}
/**
 * Request an HTML-formatted summary of a page from the model.
 *
 * Builds a two-message conversation (system instructions plus a user message
 * carrying the title and content) and forwards it to `makeModelCall` with a
 * temperature of 0.3 and a no-op completion callback.
 *
 * @param title - Page title included in the user message.
 * @param content - Extracted page text to summarize.
 * @returns Whatever `makeModelCall` resolves to.
 */
async function generateSummary(title: string, content: string) {
  const systemPrompt = `You are a helpful assistant that creates concise summaries of web content in HTML format.
Create a clear, informative summary that captures the key points and main ideas from the provided content. The summary should:
- Focus on the most important information and key takeaways
- Be concise but comprehensive
- Maintain the original context and meaning
- Be useful for someone who wants to quickly understand the content
- Format the summary in clean HTML using appropriate tags like <h1>, <h2>, <p>, <ul>, <li> to structure the information
IMPORTANT: Return ONLY the HTML content without any markdown code blocks or formatting. Do not wrap the response in \`\`\`html or any other markdown syntax. Return the raw HTML directly.
Extract the essential information while preserving important details, facts, or insights.`;

  const userPrompt = `Title: ${title}
Content: ${content}
Please provide a concise summary of this content in HTML format.`;

  const conversation: CoreMessage[] = [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt },
  ];

  // Nothing to do when the model finishes; the caller consumes the result.
  const ignoreFinish = () => {};

  // NOTE(review): the leading `true` presumably enables streaming, since the
  // task reads `textStream` from the result; confirm against makeModelCall.
  const modelResponse = await makeModelCall(true, conversation, ignoreFinish, {
    temperature: 0.3,
  });
  return modelResponse;
}
/**
 * Task that summarizes a web page sent by the extension.
 *
 * Parses the HTML, extracts its text, and, when the page is supported,
 * streams a model-generated summary under the "messages" metadata stream
 * while accumulating it into the returned `summary`. Unsupported pages get
 * a fixed explanatory summary instead. Any error is logged and converted
 * into a `success: false` result rather than being rethrown.
 */
export const extensionSummary = task({
  id: "extensionSummary",
  maxDuration: 3000,
  run: async (body: z.infer<typeof ExtensionSummaryBodyRequest>) => {
    try {
      const extraction = extractTextContent(cheerio.load(body.html), body.url);

      // A caller-supplied title takes precedence over the extracted one.
      if (body.title) {
        extraction.title = body.title;
      }

      const canSummarize =
        extraction.supported && extraction.content.length > 0;

      let summary = "";
      if (!canSummarize) {
        // Explain why no summary was generated.
        summary =
          extraction.pageType === "video"
            ? "Video content detected. Text summarization not available for video-focused pages."
            : "Unable to extract sufficient text content for summarization.";
      } else {
        const modelResponse = (await generateSummary(
          extraction.title,
          extraction.content,
        )) as any;

        // Forward the model output to the "messages" stream and collect the
        // same chunks locally to build the final summary text.
        const chunks = await metadata.stream(
          "messages",
          modelResponse.textStream,
        );
        let collected: string = "";
        for await (const piece of chunks) {
          collected = collected + piece;
        }
        summary = collected;
      }

      return {
        success: true,
        pageType: extraction.pageType,
        title: extraction.title,
        summary,
        content: extraction.content.slice(0, 1000), // Only a 1000-character preview is returned
        supported: extraction.supported,
        metadata: extraction.metadata,
      };
    } catch (error) {
      console.error("Error processing extension summary request:", error);

      const fallbackPageType: PageType = "text";
      return {
        success: false,
        error: "Failed to process page content",
        pageType: fallbackPageType,
        title: body.title || "Error",
        summary: "Unable to process this page content.",
        content: "",
        supported: false,
        metadata: {
          url: body.url,
          wordCount: 0,
        },
      };
    }
  },
});