mirror of
https://github.com/eliasstepanik/core.git
synced 2026-01-12 01:18:27 +00:00
424 lines
14 KiB
JavaScript
Executable File
424 lines
14 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
const axios = require("axios");
|
|
|
|
/**
|
|
* LOCOMO Q&A Evaluation Script
|
|
* Evaluates question answering against ingested LOCOMO conversations
|
|
* Assumes conversations are already ingested via ingest_conversations.js
|
|
*/
|
|
|
|
/**
 * Drives the LOCOMO question-answering evaluation against a running API server.
 *
 * For every question the evaluator requests an answer from `/api/v1/qa`, has
 * `/api/v1/evaluate` grade it against the expected answer, and aggregates the
 * grades into overall and per-category statistics. Results are written to
 * `evaluation_results.json` next to this script.
 */
class LocomoEvaluator {
  /**
   * @param {string} [baseUrl] - Root URL of the API server.
   * @param {string} [apiToken] - Bearer token sent with every request. Defaults
   *   to the `LOCOMO_API_TOKEN` environment variable, then to the legacy
   *   built-in token.
   */
  constructor(
    baseUrl = "http://localhost:3033",
    // SECURITY: the literal fallback is kept only so existing invocations keep
    // working. Prefer supplying the token via LOCOMO_API_TOKEN (or the argument)
    // and rotate/remove the committed value.
    apiToken = process.env.LOCOMO_API_TOKEN || "rc_pat_kbc76ykt3gd81r6ctyeh8as5jryihbeqqvnsi2wt"
  ) {
    this.baseUrl = baseUrl;
    this.headers = {
      Authorization: `Bearer ${apiToken}`,
    };
    this.results = [];
    // Entry for the conversation currently being evaluated (null when idle), so
    // that intermediate saves can include partially completed work.
    this.inProgress = null;

    // Create axios instance with default config
    this.axios = axios.create({
      baseURL: this.baseUrl,
      headers: this.headers,
    });
  }

  /**
   * POSTs a JSON payload to the API and returns the response body.
   *
   * @param {string} endpoint - Path relative to the base URL.
   * @param {object} data - Request payload.
   * @returns {Promise<*>} The `data` field of the axios response.
   * @throws {Error} With a message describing whether the server answered with
   *   an error status, never answered, or the request could not be sent. The
   *   underlying axios error is attached as `cause`.
   */
  async makeRequest(endpoint, data) {
    try {
      const response = await this.axios.post(endpoint, data, {
        headers: {
          "Content-Type": "application/json",
        },
      });
      return response.data;
    } catch (error) {
      let message;
      if (error.response) {
        message = `HTTP ${error.response.status}: ${JSON.stringify(error.response.data)}`;
      } else if (error.request) {
        message = `No response received: ${error.message}`;
      } else {
        message = `Request error: ${error.message}`;
      }
      throw new Error(message, { cause: error });
    }
  }

  /**
   * Queries the search endpoint. Best-effort: failures are logged and an empty
   * result set is returned instead of throwing.
   *
   * @param {string} question - Search query.
   * @param {?string} [conversationId] - Accepted for interface compatibility;
   *   not sent to the API.
   * @returns {Promise<*>} The search response, or `{ results: [] }` on failure.
   */
  async searchMemory(question, conversationId = null) {
    try {
      return await this.makeRequest("/api/v1/search", {
        query: question,
      });
    } catch (error) {
      console.error("Search error:", error.message);
      return { results: [] };
    }
  }

  /**
   * Asks the Q&A endpoint to answer a question. Best-effort: on failure a
   * response-shaped object carrying an error marker answer is returned.
   *
   * @param {string} question
   * @returns {Promise<*>} The Q&A response, or a fallback object whose
   *   `generated_answer` is "Error: Could not generate answer".
   */
  async answerQuestion(question) {
    try {
      return await this.makeRequest("/api/v1/qa", {
        question: question,
      });
    } catch (error) {
      console.error("Q&A API error:", error.message);
      return {
        question: question,
        generated_answer: "Error: Could not generate answer",
      };
    }
  }

  /**
   * Has the evaluate endpoint grade a generated answer against the expected one.
   *
   * @param {string} question
   * @param {*} standardAnswer - Expected (gold) answer.
   * @param {string} generatedAnswer
   * @returns {Promise<{label: *, reasoning: *, matchRatio: *, evaluationMethod: *}>}
   * @throws {Error} When the request fails (see `makeRequest`).
   */
  async evaluateAnswer(question, standardAnswer, generatedAnswer) {
    const response = await this.makeRequest("/api/v1/evaluate", {
      question,
      standard_answer: standardAnswer,
      generated_answer: generatedAnswer,
    });

    return {
      label: response.label,
      reasoning: response.reasoning,
      matchRatio: response.matchRatio,
      evaluationMethod: response.method,
    };
  }

  /**
   * Generates and grades the answer for a single question.
   *
   * @param {string} question
   * @param {*} expectedAnswer
   * @param {*} evidence - Copied into the result unchanged.
   * @param {string} conversationId
   * @param {*} category - Copied into the result unchanged.
   * @returns {Promise<object>} Result record for this question.
   * @throws {Error} When grading fails (answer generation failures are absorbed
   *   by `answerQuestion`).
   */
  async evaluateQuestion(question, expectedAnswer, evidence, conversationId, category) {
    const qaResponse = await this.answerQuestion(question);
    const generatedAnswer = qaResponse.generated_answer || "";

    const evaluation = await this.evaluateAnswer(question, expectedAnswer, generatedAnswer);

    return {
      question,
      expectedAnswer,
      evidence,
      category,
      conversationId,
      generatedAnswer: generatedAnswer,
      evaluationResult: evaluation.label,
      evaluationReasoning: evaluation.reasoning,
      matchRatio: evaluation.matchRatio,
      evaluationMethod: evaluation.evaluationMethod,
    };
  }

  /**
   * Builds the placeholder result recorded when a question could not be graded.
   *
   * @param {object} qa - Dataset entry for the question.
   * @param {*} expectedAnswer - Expected answer resolved for this entry.
   * @param {string} conversationId
   * @param {string} message - Error message of the failure.
   * @returns {object} Result record flagged with `evaluationResult: "ERROR"`.
   */
  _failedResult(qa, expectedAnswer, conversationId, message) {
    return {
      question: qa.question,
      // String() tolerates a missing answer instead of throwing on `.toString()`.
      expectedAnswer: String(expectedAnswer ?? ""),
      evidence: qa.evidence,
      category: qa.category,
      conversationId,
      error: message,
      generatedAnswer: "Error: Evaluation failed",
      evaluationResult: "ERROR",
      evaluationReasoning: `Evaluation failed: ${message}`,
      matchRatio: 0,
      evaluationMethod: "error",
    };
  }

  /**
   * Evaluates every question of one conversation in concurrent batches and
   * saves intermediate results after each batch.
   *
   * @param {{qa: object[]}} conversation - Dataset conversation; each `qa` entry
   *   supplies `question`, `evidence`, `category` and `answer` (falling back to
   *   `adversarial_answer` when `answer` is absent).
   * @param {string} conversationId
   * @returns {Promise<object[]>} One result record per question, in dataset order.
   */
  async evaluateConversation(conversation, conversationId) {
    console.log(`Evaluating conversation ${conversationId}...`);

    const batchSize = 15; // Process 15 questions concurrently
    const qaResults = [];
    const totalQuestions = conversation.qa.length;
    const totalBatches = Math.ceil(totalQuestions / batchSize);
    let processed = 0;

    console.log(`Processing ${totalQuestions} questions in batches of ${batchSize}...`);

    // Publish the partial results so the per-batch saveResults() calls below
    // actually contain this conversation; `totalQuestions` tracks how many
    // questions have been processed so far.
    this.inProgress = { conversationId, results: qaResults, totalQuestions: 0, inProgress: true };

    try {
      for (let i = 0; i < totalQuestions; i += batchSize) {
        const batch = conversation.qa.slice(i, i + batchSize);

        console.log(
          `Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches} (questions ${i + 1}-${Math.min(i + batchSize, totalQuestions)})`
        );

        // Each task resolves to a result record (real or placeholder) and never
        // rejects, so Promise.all keeps dataset order without failing the batch.
        const batchResults = await Promise.all(
          batch.map(async (qa, batchIndex) => {
            const questionIndex = i + batchIndex;
            // Use the same expected answer for grading and for the error
            // placeholder; `??` keeps legitimate falsy answers such as 0.
            const expectedAnswer = qa.answer ?? qa.adversarial_answer;
            console.log(qa.question);
            try {
              return await this.evaluateQuestion(
                qa.question,
                expectedAnswer,
                qa.evidence,
                conversationId,
                qa.category
              );
            } catch (error) {
              console.error(`Error evaluating question ${questionIndex + 1}:`, error.message);
              return this._failedResult(qa, expectedAnswer, conversationId, error.message);
            }
          })
        );

        qaResults.push(...batchResults);
        processed += batch.length;
        this.inProgress.totalQuestions = processed;
        console.log(` Completed ${processed}/${totalQuestions} questions`);

        // Save results periodically (every batch or ~15 questions)
        console.log(`Saving intermediate results...`);
        this.saveResults();
      }
    } finally {
      // The caller records the finished conversation; avoid reporting it twice.
      this.inProgress = null;
    }

    console.log(`Completed evaluation of ${totalQuestions} questions`);
    return qaResults;
  }

  /**
   * Loads `locomo10.json` from the script directory, evaluates conversations,
   * then saves and prints the results.
   *
   * @param {object} [options]
   * @param {number} [options.maxConversations=1] - How many conversations (from
   *   the start of the dataset) to evaluate. Defaults to 1, matching the
   *   previous hard-coded behaviour; pass `Infinity` to evaluate all.
   * @returns {Promise<void>}
   * @throws {Error} When the dataset cannot be read/parsed or is not an array.
   */
  async runEvaluation({ maxConversations = 1 } = {}) {
    console.log("Starting LOCOMO Q&A evaluation...");

    // Load LOCOMO dataset
    const dataPath = path.join(__dirname, "locomo10.json");
    const conversations = JSON.parse(fs.readFileSync(dataPath, "utf8"));
    if (!Array.isArray(conversations)) {
      throw new Error(`Expected ${dataPath} to contain a JSON array of conversations`);
    }

    console.log(`Loaded ${conversations.length} conversations for evaluation`);

    const limit = Math.min(conversations.length, maxConversations);
    for (let i = 0; i < limit; i++) {
      const conversation = conversations[i];
      const conversationId = `locomo_${i + 1}`;

      try {
        const results = await this.evaluateConversation(conversation, conversationId);
        this.results.push({
          conversationId,
          results,
          totalQuestions: conversation.qa.length,
        });
      } catch (error) {
        // A failing conversation must not abort the remaining ones.
        console.error(`Error evaluating conversation ${conversationId}:`, error.message);
      }
    }

    // Save and summarize results
    this.saveResults();
    this.printDetailedSummary();
  }

  /**
   * Lists the conversations to report on: all finished ones plus, if present,
   * the partially evaluated conversation.
   *
   * @returns {object[]}
   */
  _reportableConversations() {
    return this.inProgress ? [...this.results, this.inProgress] : this.results;
  }

  /**
   * Writes the timestamped summary and per-conversation results to
   * `evaluation_results.json` in the script directory (synchronously).
   */
  saveResults() {
    const resultsPath = path.join(__dirname, "evaluation_results.json");
    const output = {
      timestamp: new Date().toISOString(),
      summary: this.calculateSummaryStats(),
      conversations: this._reportableConversations(),
    };

    fs.writeFileSync(resultsPath, JSON.stringify(output, null, 2));
    console.log(`\nResults saved to ${resultsPath}`);
  }

  /**
   * Formats `part / whole` as a percentage string with one decimal.
   *
   * @param {number} part
   * @param {number} whole
   * @returns {string} e.g. "66.7"; "0.0" when `whole` is not positive.
   */
  _percent(part, whole) {
    return whole > 0 ? ((part / whole) * 100).toFixed(1) : "0.0";
  }

  /**
   * Tells whether a result carries a real generated answer rather than one of
   * the error marker strings produced by this class.
   *
   * @param {object} result
   * @returns {boolean}
   */
  _hasGeneratedAnswer(result) {
    const failureMarkers = ["Error: Could not generate answer", "Error: Evaluation failed"];
    return Boolean(result.generatedAnswer) && !failureMarkers.includes(result.generatedAnswer);
  }

  /**
   * Aggregates overall and per-category counts and rates over the reportable
   * conversations.
   *
   * `hasContext` and `answerInContext` are counted when present on a result;
   * nothing in this class sets them, so those counts stay 0 unless results come
   * from elsewhere.
   *
   * @returns {object} Counts, percentage strings (one decimal, "0.0" when the
   *   denominator is 0) and `categoryBreakdown` keyed by category.
   */
  calculateSummaryStats() {
    const totals = {
      withContext: 0,
      withAnswer: 0,
      withGenerated: 0,
      correct: 0,
      wrong: 0,
      errors: 0,
    };
    const categoryStats = {};
    let totalQuestions = 0;

    for (const conv of this._reportableConversations()) {
      totalQuestions += conv.totalQuestions;

      for (const result of conv.results) {
        const cat = result.category || "unknown";
        if (!categoryStats[cat]) {
          categoryStats[cat] = {
            total: 0,
            withContext: 0,
            withAnswer: 0,
            withGenerated: 0,
            correct: 0,
            wrong: 0,
            errors: 0,
          };
        }
        const bucket = categoryStats[cat];
        bucket.total++;

        // Overall and per-category counters share one set of predicates so the
        // two views can never disagree.
        const flags = {
          withContext: Boolean(result.hasContext),
          withAnswer: Boolean(result.answerInContext),
          withGenerated: this._hasGeneratedAnswer(result),
          correct: result.evaluationResult === "CORRECT",
          wrong: result.evaluationResult === "WRONG",
          errors: result.evaluationResult === "ERROR",
        };
        for (const [key, isSet] of Object.entries(flags)) {
          if (isSet) {
            bucket[key]++;
            totals[key]++;
          }
        }
      }
    }

    return {
      totalQuestions,
      questionsWithContext: totals.withContext,
      questionsWithAnswerInContext: totals.withAnswer,
      contextRetrievalRate: this._percent(totals.withContext, totalQuestions),
      answerFoundRate: this._percent(totals.withAnswer, totalQuestions),
      questionsWithGeneratedAnswers: totals.withGenerated,
      correctAnswers: totals.correct,
      wrongAnswers: totals.wrong,
      errorAnswers: totals.errors,
      qaSuccessRate: this._percent(totals.withGenerated, totalQuestions),
      answerAccuracyRate: this._percent(totals.correct, totals.withGenerated),
      categoryBreakdown: categoryStats,
    };
  }

  /**
   * Prints the overall statistics, the per-category breakdown, averages and the
   * evaluation-method distribution to the console.
   */
  printDetailedSummary() {
    const stats = this.calculateSummaryStats();

    console.log("\n=== LOCOMO EVALUATION RESULTS ===");
    console.log(`Total conversations: ${this.results.length}`);
    console.log(`Total questions: ${stats.totalQuestions}`);
    console.log(
      `Questions with retrieved context: ${stats.questionsWithContext}/${stats.totalQuestions} (${stats.contextRetrievalRate}%)`
    );
    console.log(
      `Questions with answer in context: ${stats.questionsWithAnswerInContext}/${stats.totalQuestions} (${stats.answerFoundRate}%)`
    );

    console.log("\n=== Q&A EVALUATION RESULTS ===");
    console.log(
      `Questions with generated answers: ${stats.questionsWithGeneratedAnswers}/${stats.totalQuestions} (${stats.qaSuccessRate}%)`
    );
    console.log(
      `Correct answers: ${stats.correctAnswers}/${stats.questionsWithGeneratedAnswers} (${stats.answerAccuracyRate}%)`
    );
    console.log(`Wrong answers: ${stats.wrongAnswers}/${stats.questionsWithGeneratedAnswers}`);
    if (stats.errorAnswers > 0) {
      console.log(`Evaluation errors: ${stats.errorAnswers}/${stats.totalQuestions}`);
    }

    console.log("\n=== CATEGORY BREAKDOWN ===");
    for (const [category, catStats] of Object.entries(stats.categoryBreakdown)) {
      const retrievalRate = this._percent(catStats.withAnswer, catStats.total);
      const qaRate = this._percent(catStats.withGenerated, catStats.total);
      const accuracyRate = this._percent(catStats.correct, catStats.withGenerated);

      console.log(`Category ${category}:`);
      console.log(` Total questions: ${catStats.total}`);
      console.log(
        ` Context retrieval: ${catStats.withAnswer}/${catStats.total} (${retrievalRate}%)`
      );
      console.log(` Generated answers: ${catStats.withGenerated}/${catStats.total} (${qaRate}%)`);
      console.log(
        ` Answer accuracy: ${catStats.correct}/${catStats.withGenerated} (${accuracyRate}%)`
      );
      if (catStats.errors > 0) {
        console.log(` Evaluation errors: ${catStats.errors}/${catStats.total}`);
      }
    }

    console.log("\n=== PERFORMANCE INSIGHTS ===");
    const allResults = this.results.flatMap((conv) => conv.results);
    // Guard the denominator and treat missing numeric fields as 0 so the
    // averages never print NaN.
    const average = (pick) =>
      stats.totalQuestions > 0
        ? allResults.reduce((sum, r) => sum + (pick(r) || 0), 0) / stats.totalQuestions
        : 0;

    const avgContextLength = average((r) => r.contextLength);
    console.log(`Average context length: ${avgContextLength.toFixed(0)} characters`);

    const avgMatchRatio = average((r) => r.matchRatio);
    console.log(`Average answer match ratio: ${avgMatchRatio.toFixed(3)}`);

    // Show evaluation method breakdown
    const evaluationMethods = {};
    for (const result of allResults) {
      const method = result.evaluationMethod || "unknown";
      evaluationMethods[method] = (evaluationMethods[method] || 0) + 1;
    }

    console.log("\n=== EVALUATION SUMMARY ===");
    console.log(
      "This evaluation measures both retrieval performance and answer generation accuracy."
    );
    console.log("Generated answers are evaluated against gold standard answers.");

    console.log("\n=== EVALUATION METHODS USED ===");
    for (const [method, count] of Object.entries(evaluationMethods)) {
      const percentage = this._percent(count, stats.totalQuestions);
      console.log(`${method}: ${count}/${stats.totalQuestions} (${percentage}%)`);
    }
  }
}
|
|
|
|
// Command line interface
|
|
// Run the evaluation only when this file is executed directly, not when it is
// required as a module.
if (require.main === module) {
  const evaluator = new LocomoEvaluator();
  evaluator.runEvaluation().catch((error) => {
    console.error(error);
    // Report the failure to the calling shell/CI instead of exiting with 0.
    process.exitCode = 1;
  });
}

module.exports = LocomoEvaluator;
|