mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-11 17:38:27 +00:00
* feat: Episode ingestion update Benchmarking CORE * Feat: Spaces in knowledge graph * fix: remove daily assignment * Feat: add spaces * Feat: spaces --------- Co-authored-by: Manoj K <saimanoj58@gmail.com>
266 lines
7.8 KiB
JavaScript
266 lines
7.8 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
const axios = require("axios");
|
|
|
|
/**
 * LOCOMO Session Summary Ingestion Script
 *
 * Ingests LOCOMO session summaries - comprehensive and available for all conversations.
 * More efficient than full conversations while preserving all key information.
 *
 * Each session summary is POSTed as one episode to `/api/v1/add`. Progress is
 * persisted in `session_ingestion_status.json` next to this script so that
 * later runs can skip conversations that were already ingested.
 */
class LocomoSessionIngester {
  /**
   * @param {string} [baseUrl=process.env.BASE_URL] - Base URL used for every API request.
   * @param {object} [options]
   * @param {number} [options.requestDelayMs=100] - Pause (ms) after each successful request.
   */
  constructor(baseUrl = process.env.BASE_URL, { requestDelayMs = 100 } = {}) {
    this.baseUrl = baseUrl;
    this.headers = {
      Authorization: `Bearer ${process.env.API_KEY}`,
    };
    this.statusFile = path.join(__dirname, "session_ingestion_status.json");
    this.requestDelayMs = requestDelayMs;

    // Create axios instance with default config
    this.axios = axios.create({
      baseURL: this.baseUrl,
      headers: this.headers,
      timeout: 10000,
    });
  }

  /**
   * POSTs `data` as JSON to `endpoint` and returns the response body.
   *
   * @param {string} endpoint - Path relative to the configured base URL.
   * @param {object} data - JSON-serialisable request body.
   * @returns {Promise<*>} The `data` field of the axios response.
   * @throws {Error} With a message describing an HTTP error status, a missing
   *   response, or a request set-up failure. The original error is attached as `cause`.
   */
  async makeRequest(endpoint, data) {
    try {
      const response = await this.axios.post(endpoint, data, {
        headers: {
          "Content-Type": "application/json",
        },
      });
      return response.data;
    } catch (error) {
      // Keep the original error reachable for debugging; runtimes that do not
      // support the options argument simply ignore it.
      const wrap = (message) => new Error(message, { cause: error });

      if (error.response) {
        throw wrap(`HTTP ${error.response.status}: ${JSON.stringify(error.response.data)}`);
      } else if (error.request) {
        throw wrap(`No response received: ${error.message}`);
      } else {
        throw wrap(`Request error: ${error.message}`);
      }
    }
  }

  /**
   * Reads the persisted ingestion status.
   *
   * A missing, unreadable or malformed status file yields an empty status, and
   * the returned object always has a `conversations` object and a `timestamp`.
   *
   * @returns {{conversations: Object<string, object>, timestamp: (string|null)}}
   */
  loadIngestionStatus() {
    const isRecord = (value) =>
      value !== null && typeof value === "object" && !Array.isArray(value);

    try {
      if (fs.existsSync(this.statusFile)) {
        const parsed = JSON.parse(fs.readFileSync(this.statusFile, "utf8"));

        if (isRecord(parsed)) {
          // Normalise the shape so callers can index `conversations` safely.
          return Object.assign({}, parsed, {
            conversations: isRecord(parsed.conversations) ? parsed.conversations : {},
            timestamp: parsed.timestamp === undefined ? null : parsed.timestamp,
          });
        }
        console.warn("Could not load ingestion status:", "unexpected status file format");
      }
    } catch (error) {
      console.warn("Could not load ingestion status:", error.message);
    }
    return { conversations: {}, timestamp: null };
  }

  /**
   * Writes `status` to the status file as pretty-printed JSON.
   *
   * @param {object} status - Status object as returned by `loadIngestionStatus`.
   */
  saveIngestionStatus(status) {
    fs.writeFileSync(this.statusFile, JSON.stringify(status, null, 2));
  }

  /**
   * Reads and parses the LOCOMO dataset from `data/locomo10.json` next to this script.
   *
   * @returns {Array<object>} The conversations contained in the dataset.
   * @throws {Error} If the file cannot be read or parsed, or is not a JSON array.
   */
  loadConversations() {
    const dataPath = path.join(__dirname, "data", "locomo10.json");
    const conversations = JSON.parse(fs.readFileSync(dataPath, "utf8"));

    if (!Array.isArray(conversations)) {
      throw new Error(`Expected ${dataPath} to contain a JSON array of conversations`);
    }
    return conversations;
  }

  /**
   * Turns the `session_summary` entries of one conversation into episodes.
   *
   * @param {object} conversation - Dataset entry with `session_summary` (keyed by
   *   `session_<n>_summary`) and `conversation` (holding `speaker_a` / `speaker_b`).
   * @param {string} conversationId - Identifier stored in every episode's metadata.
   * @returns {Array<{content: string, metadata: object}>} One episode per summary entry.
   * @throws {TypeError} If `session_summary` or `conversation` is missing.
   */
  formatSessionSummaryForIngestion(conversation, conversationId) {
    const sessionSummary = conversation ? conversation.session_summary : undefined;
    const conv = conversation ? conversation.conversation : undefined;

    if (sessionSummary === null || typeof sessionSummary !== "object") {
      throw new TypeError(`Conversation ${conversationId} has no session_summary object`);
    }
    if (conv === null || typeof conv !== "object") {
      throw new TypeError(`Conversation ${conversationId} has no conversation object`);
    }

    const { speaker_a: speakerA, speaker_b: speakerB } = conv;

    // Process each session summary
    return Object.entries(sessionSummary).map(([sessionKey, summary]) => {
      const sessionNumber = sessionKey.replace("session_", "").replace("_summary", "");

      return {
        content: `Session ${sessionNumber} Summary: ${summary}`,
        metadata: {
          conversationId,
          // Explicit radix so the key is always read as a decimal number.
          sessionNumber: Number.parseInt(sessionNumber, 10),
          speakerA,
          speakerB,
          source: "locomo_sessions",
          type: "session_summary",
        },
      };
    });
  }

  /**
   * Ingests every session summary of one conversation and records the outcome.
   *
   * A conversation is skipped when the status file already has an entry for it,
   * unless that entry is explicitly marked `ingested: false` (a previous run had
   * errors) or `forceReingest` is set. When resuming after errors, sessions listed
   * in the entry's `completedSessions` are not sent again.
   *
   * @param {object} conversation - Dataset entry, see `formatSessionSummaryForIngestion`.
   * @param {string} conversationId - Key under which the outcome is stored.
   * @param {boolean} [forceReingest=false] - Re-send everything regardless of stored status.
   * @returns {Promise<boolean>} `false` when skipped, `true` when ingestion was attempted.
   */
  async ingestConversation(conversation, conversationId, forceReingest = false) {
    const status = this.loadIngestionStatus();
    const previous = status.conversations[conversationId];

    if (previous && previous.ingested !== false && !forceReingest) {
      console.log(`Conversation ${conversationId} already ingested, skipping...`);
      return false;
    }

    console.log(`Ingesting session summaries for conversation ${conversationId}...`);

    const episodes = this.formatSessionSummaryForIngestion(conversation, conversationId);

    // Sessions that a previous, partially failed run already delivered.
    const canResume = !forceReingest && previous && Array.isArray(previous.completedSessions);
    const completed = new Set(
      canResume
        ? previous.completedSessions.filter(
            (i) => Number.isInteger(i) && i >= 0 && i < episodes.length
          )
        : []
    );
    let errorCount = 0;

    console.log(` Total sessions to ingest: ${episodes.length}`);
    if (completed.size > 0) {
      console.log(` Resuming: ${completed.size} sessions already ingested`);
    }

    for (const [index, episode] of episodes.entries()) {
      if (completed.has(index)) {
        continue;
      }

      try {
        const payload = {
          episodeBody: episode.content,
          // Stagger reference times by one second per session index.
          referenceTime: new Date(Date.now() + index * 1000).toISOString(),
          source: "locomo_sessions",
        };

        await this.makeRequest("/api/v1/add", payload);
        completed.add(index);

        // Progress indicator
        if ((index + 1) % 10 === 0) {
          console.log(` Ingested ${index + 1}/${episodes.length} sessions`);
        }

        // Small delay
        await new Promise((resolve) => setTimeout(resolve, this.requestDelayMs));
      } catch (error) {
        console.error(` Error ingesting session ${index}:`, error.message);
        errorCount++;
      }
    }

    const successCount = completed.size;
    const finishedAt = new Date().toISOString();

    // Update status. Only mark the conversation as ingested when nothing failed,
    // so that a later run retries the missing sessions instead of skipping them.
    status.conversations[conversationId] = {
      ingested: errorCount === 0,
      timestamp: finishedAt,
      totalEpisodes: episodes.length,
      successCount,
      errorCount,
      completedSessions: [...completed].sort((a, b) => a - b),
    };
    status.timestamp = finishedAt;
    this.saveIngestionStatus(status);

    console.log(` Completed: ${successCount} success, ${errorCount} errors`);
    return true;
  }

  /**
   * Ingests the session summaries of every conversation in the dataset.
   *
   * Sends one test episode first and aborts (after logging) when that request
   * fails. Errors in a single conversation are logged and do not stop the run.
   *
   * @param {boolean} [forceReingest=false] - Passed through to `ingestConversation`.
   * @returns {Promise<void>}
   * @throws {Error} If the dataset cannot be loaded.
   */
  async ingestAll(forceReingest = false) {
    console.log("Starting LOCOMO session summary ingestion...");

    if (forceReingest) {
      console.log("Force re-ingestion enabled");
    }

    // Load LOCOMO dataset
    const conversations = this.loadConversations();

    console.log(`Loaded ${conversations.length} conversations`);

    let ingestedCount = 0;
    let skippedCount = 0;

    // Test connection first
    try {
      console.log("Testing connection...");
      await this.makeRequest("/api/v1/add", {
        episodeBody: "Session ingestion test",
        referenceTime: new Date().toISOString(),
        source: "test",
      });
      console.log("Connection test successful");
    } catch (error) {
      console.error("Connection test failed:", error.message);
      return;
    }

    // Ingest all conversations
    for (const [i, conversation] of conversations.entries()) {
      const conversationId = `locomo_sessions_${i + 1}`;

      try {
        const wasIngested = await this.ingestConversation(
          conversation,
          conversationId,
          forceReingest
        );

        if (wasIngested) {
          ingestedCount++;
        } else {
          skippedCount++;
        }
      } catch (error) {
        console.error(`Error with conversation ${conversationId}:`, error.message);
      }
    }

    this.printSummary(ingestedCount, skippedCount);
  }

  /**
   * Logs the counters of the current run plus totals from the status file.
   *
   * @param {number} ingestedCount - Conversations processed in this run.
   * @param {number} skippedCount - Conversations skipped in this run.
   */
  printSummary(ingestedCount, skippedCount) {
    console.log("\n=== SESSION SUMMARY INGESTION ===");
    console.log(`Conversations processed: ${ingestedCount}`);
    console.log(`Conversations skipped: ${skippedCount}`);

    const status = this.loadIngestionStatus();
    const totals = { sessions: 0, success: 0, errors: 0 };
    for (const conv of Object.values(status.conversations)) {
      totals.sessions += conv.totalEpisodes || 0;
      totals.success += conv.successCount || 0;
      totals.errors += conv.errorCount || 0;
    }

    // Avoid dividing by zero when nothing has been attempted yet.
    const attempts = totals.success + totals.errors || 1;

    console.log(`Total sessions ingested: ${totals.success}/${totals.sessions}`);
    console.log(`Success rate: ${((totals.success / attempts) * 100).toFixed(1)}%`);

    console.log("\nReady for evaluation phase!");
    console.log("Benefits: Fast ingestion, comprehensive summaries, all conversations covered");
  }

  /**
   * Summarises the status file.
   *
   * @returns {{conversations: number, sessions: number, lastIngestion: (string|null)}}
   *   Number of recorded conversations, sum of their `successCount`, and the
   *   timestamp of the last status update.
   */
  getStatus() {
    const status = this.loadIngestionStatus();
    const entries = Object.values(status.conversations);
    const totalSessions = entries.reduce((sum, conv) => sum + (conv.successCount || 0), 0);

    return {
      conversations: entries.length,
      sessions: totalSessions,
      lastIngestion: status.timestamp,
    };
  }
}
|
|
|
|
// Command line interface
//   --status  print a summary of the status file and exit
//   --force   re-ingest conversations even if the status file lists them
if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  const forceReingest = cliArgs.includes("--force");
  const showStatus = cliArgs.includes("--status");

  const ingester = new LocomoSessionIngester();

  if (showStatus) {
    const { conversations, sessions, lastIngestion } = ingester.getStatus();
    console.log("LOCOMO Session Ingestion Status:");
    console.log(` Conversations: ${conversations}`);
    console.log(` Sessions: ${sessions}`);
    console.log(` Last ingestion: ${lastIngestion || "Never"}`);
  } else {
    ingester.ingestAll(forceReingest).catch((error) => {
      console.error(error);
      // Report the failure to the calling shell instead of exiting with status 0.
      process.exitCode = 1;
    });
  }
}

module.exports = LocomoSessionIngester;
|