core/benchmarks/evaluate_qa.js
2025-08-23 16:05:41 +05:30

424 lines
14 KiB
JavaScript
Executable File

#!/usr/bin/env node
const fs = require("fs");
const path = require("path");
const axios = require("axios");
/**
 * LOCOMO Q&A Evaluation Script
 * Evaluates question answering against ingested LOCOMO conversations.
 * Assumes conversations are already ingested via ingest_conversations.js.
 *
 * Flow per question: POST the question to /api/v1/qa, POST the generated
 * answer together with the dataset answer to /api/v1/evaluate, then
 * aggregate the returned labels into summary statistics.
 */
class LocomoEvaluator {
  /**
   * @param {string} [baseUrl] - Base URL of the API under test.
   * @param {string} [apiToken] - Bearer token. Defaults to the
   *   LOCOMO_API_TOKEN environment variable, then to the legacy built-in token.
   */
  constructor(baseUrl = "http://localhost:3033", apiToken = process.env.LOCOMO_API_TOKEN) {
    this.baseUrl = baseUrl;
    // FIXME(security): the fallback below is a credential committed to source
    // control. It is kept only so existing invocations keep working; rotate it
    // and supply the token via LOCOMO_API_TOKEN (or the constructor) instead.
    const token = apiToken || "rc_pat_kbc76ykt3gd81r6ctyeh8as5jryihbeqqvnsi2wt";
    this.headers = {
      Authorization: `Bearer ${token}`,
    };
    /** Completed conversations: `{ conversationId, results, totalQuestions }`. */
    this.results = [];
    // Shared axios instance so every request carries the base URL and auth header.
    this.axios = axios.create({
      baseURL: this.baseUrl,
      headers: this.headers,
    });
  }

  /**
   * Formats `part / whole` as a percentage string with one decimal.
   * Returns "0.0" when `whole` is 0 so reports never contain "NaN".
   *
   * @param {number} part
   * @param {number} whole
   * @returns {string}
   */
  static percent(part, whole) {
    return whole > 0 ? ((part / whole) * 100).toFixed(1) : "0.0";
  }

  /**
   * Tells whether a stored answer is a real generated answer rather than one
   * of the two error placeholders this class writes.
   *
   * @param {string} answer
   * @returns {boolean}
   */
  static isGeneratedAnswer(answer) {
    return (
      Boolean(answer) &&
      answer !== "Error: Could not generate answer" &&
      answer !== "Error: Evaluation failed"
    );
  }

  /**
   * Builds the placeholder result stored when evaluating a question threw.
   * Never throws itself, even when the dataset entry has neither `answer`
   * nor `adversarial_answer`.
   *
   * @param {object} qa - Dataset question entry.
   * @param {string} conversationId
   * @param {string} message - Error message of the failure.
   * @returns {object} Result record labelled "ERROR".
   */
  static buildErrorResult(qa, conversationId, message) {
    return {
      question: qa.question,
      expectedAnswer: String(qa.answer ?? qa.adversarial_answer ?? ""),
      evidence: qa.evidence,
      category: qa.category,
      conversationId,
      error: message,
      generatedAnswer: "Error: Evaluation failed",
      evaluationResult: "ERROR",
      evaluationReasoning: `Evaluation failed: ${message}`,
      matchRatio: 0,
      evaluationMethod: "error",
    };
  }

  /**
   * POSTs a JSON payload and returns the response body.
   *
   * @param {string} endpoint - Path relative to the base URL.
   * @param {object} data - JSON payload.
   * @returns {Promise<any>} The response body (`response.data`).
   * @throws {Error} With a message describing whether the server answered
   *   with an error status, never answered, or the request could not be built.
   */
  async makeRequest(endpoint, data) {
    try {
      const response = await this.axios.post(endpoint, data, {
        headers: {
          "Content-Type": "application/json",
        },
      });
      return response.data;
    } catch (error) {
      if (error.response) {
        throw new Error(`HTTP ${error.response.status}: ${JSON.stringify(error.response.data)}`, {
          cause: error,
        });
      }
      if (error.request) {
        throw new Error(`No response received: ${error.message}`, { cause: error });
      }
      throw new Error(`Request error: ${error.message}`, { cause: error });
    }
  }

  /**
   * Queries /api/v1/search. Best-effort: failures are logged and an empty
   * result set is returned.
   *
   * @param {string} question
   * @param {?string} [conversationId] - Accepted for API compatibility; not sent.
   * @returns {Promise<object>}
   */
  async searchMemory(question, conversationId = null) {
    try {
      return await this.makeRequest("/api/v1/search", {
        query: question,
      });
    } catch (error) {
      console.error("Search error:", error.message);
      return { results: [] };
    }
  }

  /**
   * Asks /api/v1/qa to answer a question. Best-effort: on failure the error
   * is logged and a placeholder answer is returned instead of throwing.
   *
   * @param {string} question
   * @returns {Promise<object>} Response body; `generated_answer` is read by callers.
   */
  async answerQuestion(question) {
    try {
      return await this.makeRequest("/api/v1/qa", { question });
    } catch (error) {
      console.error("Q&A API error:", error.message);
      return {
        question,
        generated_answer: "Error: Could not generate answer",
      };
    }
  }

  /**
   * Sends a generated answer to /api/v1/evaluate for grading.
   *
   * @param {string} question
   * @param {*} standardAnswer - Reference answer from the dataset.
   * @param {string} generatedAnswer
   * @returns {Promise<{label: *, reasoning: *, matchRatio: *, evaluationMethod: *}>}
   * @throws {Error} When the request fails (see makeRequest).
   */
  async evaluateAnswer(question, standardAnswer, generatedAnswer) {
    const response = await this.makeRequest("/api/v1/evaluate", {
      question,
      standard_answer: standardAnswer,
      generated_answer: generatedAnswer,
    });
    return {
      label: response.label,
      reasoning: response.reasoning,
      matchRatio: response.matchRatio,
      evaluationMethod: response.method,
    };
  }

  /**
   * Generates an answer for one question and grades it.
   *
   * @param {string} question
   * @param {*} expectedAnswer
   * @param {*} evidence - Passed through to the result record.
   * @param {string} conversationId
   * @param {*} category - Passed through to the result record.
   * @returns {Promise<object>} Result record for this question.
   * @throws {Error} When the grading request fails.
   */
  async evaluateQuestion(question, expectedAnswer, evidence, conversationId, category) {
    const qaResponse = await this.answerQuestion(question);
    const generatedAnswer = qaResponse.generated_answer || "";
    const evaluation = await this.evaluateAnswer(question, expectedAnswer, generatedAnswer);
    return {
      question,
      expectedAnswer,
      evidence,
      category,
      conversationId,
      generatedAnswer,
      evaluationResult: evaluation.label,
      evaluationReasoning: evaluation.reasoning,
      matchRatio: evaluation.matchRatio,
      evaluationMethod: evaluation.evaluationMethod,
    };
  }

  /**
   * Evaluates every question of one conversation in concurrent batches.
   * After each batch the results gathered so far are checkpointed through
   * saveResults, so an interrupted run still leaves usable output.
   *
   * @param {{qa: object[]}} conversation - Dataset conversation entry.
   * @param {string} conversationId
   * @param {number} [batchSize=15] - Questions evaluated concurrently.
   * @returns {Promise<object[]>} One result record per question, in dataset order.
   * @throws {RangeError} When batchSize is not a positive integer.
   */
  async evaluateConversation(conversation, conversationId, batchSize = 15) {
    // A non-positive step would make the batching loop below spin forever.
    if (!Number.isInteger(batchSize) || batchSize < 1) {
      throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
    }
    console.log(`Evaluating conversation ${conversationId}...`);
    const qaResults = [];
    const totalQuestions = conversation.qa.length;
    const totalBatches = Math.ceil(totalQuestions / batchSize);
    let processed = 0;
    console.log(`Processing ${totalQuestions} questions in batches of ${batchSize}...`);

    for (let start = 0; start < totalQuestions; start += batchSize) {
      const batch = conversation.qa.slice(start, start + batchSize);
      console.log(
        `Processing batch ${Math.floor(start / batchSize) + 1}/${totalBatches} (questions ${start + 1}-${Math.min(start + batchSize, totalQuestions)})`
      );

      // Each task catches its own failure, so one bad question cannot sink the batch.
      const settledBatch = await Promise.allSettled(
        batch.map(async (qa, offset) => {
          const questionIndex = start + offset;
          console.log(qa.question);
          try {
            const result = await this.evaluateQuestion(
              qa.question,
              qa.answer,
              qa.evidence,
              conversationId,
              qa.category
            );
            return { result, index: questionIndex };
          } catch (error) {
            const message = error?.message ?? String(error);
            console.error(`Error evaluating question ${questionIndex + 1}:`, message);
            return { error: message, index: questionIndex, qa };
          }
        })
      );

      for (const settled of settledBatch) {
        if (settled.status !== "fulfilled") {
          console.error(`Batch promise rejected:`, settled.reason);
          continue;
        }
        const { result, error, qa } = settled.value;
        qaResults.push(
          result || LocomoEvaluator.buildErrorResult(qa, conversationId, error)
        );
      }

      processed += batch.length;
      console.log(` Completed ${processed}/${totalQuestions} questions`);
      // Checkpoint. The conversation is not in this.results until the caller
      // stores it, so hand the partial results to saveResults explicitly.
      console.log(`Saving intermediate results...`);
      this.saveResults({
        conversationId,
        results: qaResults,
        totalQuestions: processed,
        partial: true,
      });
    }

    console.log(`Completed evaluation of ${totalQuestions} questions`);
    return qaResults;
  }

  /**
   * Loads locomo10.json from this script's directory, evaluates conversations,
   * then saves and prints the summary. A failing conversation is logged and skipped.
   *
   * @param {object} [options]
   * @param {number} [options.maxConversations=1] - How many conversations to
   *   evaluate, counted from the start of the dataset. The default of 1 keeps
   *   the script's existing behaviour; pass `Infinity` to evaluate all.
   * @returns {Promise<void>}
   */
  async runEvaluation({ maxConversations = 1 } = {}) {
    console.log("Starting LOCOMO Q&A evaluation...");
    const dataPath = path.join(__dirname, "locomo10.json");
    const conversations = JSON.parse(fs.readFileSync(dataPath, "utf8"));
    console.log(`Loaded ${conversations.length} conversations for evaluation`);

    const limit = Math.min(conversations.length, maxConversations);
    if (limit < conversations.length) {
      console.log(`Evaluating the first ${limit} conversation(s) only`);
    }

    for (let i = 0; i < limit; i++) {
      const conversation = conversations[i];
      const conversationId = `locomo_${i + 1}`;
      try {
        const results = await this.evaluateConversation(conversation, conversationId);
        this.results.push({
          conversationId,
          results,
          totalQuestions: conversation.qa.length,
        });
      } catch (error) {
        console.error(`Error evaluating conversation ${conversationId}:`, error.message);
      }
    }

    this.saveResults();
    this.printDetailedSummary();
  }

  /**
   * Writes evaluation_results.json next to this script.
   *
   * @param {?object} [inProgress] - Optional entry for a conversation that is
   *   still being evaluated; it is appended after the completed conversations
   *   in the written file without being added to `this.results`.
   * @returns {void}
   */
  saveResults(inProgress = null) {
    const resultsPath = path.join(__dirname, "evaluation_results.json");
    const conversations = inProgress ? [...this.results, inProgress] : this.results;
    const output = {
      timestamp: new Date().toISOString(),
      summary: this.calculateSummaryStats(conversations),
      conversations,
    };
    fs.writeFileSync(resultsPath, JSON.stringify(output, null, 2));
    console.log(`\nResults saved to ${resultsPath}`);
  }

  /**
   * Aggregates result records into overall and per-category statistics.
   * Rates are strings with one decimal; they are "0.0" when the denominator is 0.
   *
   * @param {object[]} [conversations=this.results] - Entries shaped like
   *   `{ results: object[], totalQuestions: number }`.
   * @returns {object} Summary statistics including `categoryBreakdown`.
   */
  calculateSummaryStats(conversations = this.results) {
    const { percent, isGeneratedAnswer } = LocomoEvaluator;
    let totalQuestions = 0;
    let questionsWithContext = 0;
    let questionsWithAnswerInContext = 0;
    let questionsWithGeneratedAnswers = 0;
    let correctAnswers = 0;
    let wrongAnswers = 0;
    let errorAnswers = 0;
    const categoryStats = {};

    for (const conv of conversations) {
      totalQuestions += conv.totalQuestions;
      for (const result of conv.results) {
        const cat = result.category || "unknown";
        if (!categoryStats[cat]) {
          categoryStats[cat] = {
            total: 0,
            withContext: 0,
            withAnswer: 0,
            withGenerated: 0,
            correct: 0,
            wrong: 0,
            errors: 0,
          };
        }
        const bucket = categoryStats[cat];
        bucket.total++;

        // hasContext / answerInContext are not set by evaluateQuestion in this
        // file; they are counted only when a record already carries them.
        if (result.hasContext) {
          questionsWithContext++;
          bucket.withContext++;
        }
        if (result.answerInContext) {
          questionsWithAnswerInContext++;
          bucket.withAnswer++;
        }
        // Same rule for the overall and the per-category counter, so both
        // placeholders are excluded everywhere.
        if (isGeneratedAnswer(result.generatedAnswer)) {
          questionsWithGeneratedAnswers++;
          bucket.withGenerated++;
        }
        if (result.evaluationResult === "CORRECT") {
          correctAnswers++;
          bucket.correct++;
        } else if (result.evaluationResult === "WRONG") {
          wrongAnswers++;
          bucket.wrong++;
        } else if (result.evaluationResult === "ERROR") {
          errorAnswers++;
          bucket.errors++;
        }
      }
    }

    return {
      totalQuestions,
      questionsWithContext,
      questionsWithAnswerInContext,
      contextRetrievalRate: percent(questionsWithContext, totalQuestions),
      answerFoundRate: percent(questionsWithAnswerInContext, totalQuestions),
      questionsWithGeneratedAnswers,
      correctAnswers,
      wrongAnswers,
      errorAnswers,
      qaSuccessRate: percent(questionsWithGeneratedAnswers, totalQuestions),
      answerAccuracyRate: percent(correctAnswers, questionsWithGeneratedAnswers),
      categoryBreakdown: categoryStats,
    };
  }

  /**
   * Prints the overall, per-category and per-evaluation-method summary of
   * `this.results` to the console.
   *
   * @returns {void}
   */
  printDetailedSummary() {
    const { percent } = LocomoEvaluator;
    const stats = this.calculateSummaryStats();
    const allResults = this.results.flatMap((conv) => conv.results);

    console.log("\n=== LOCOMO EVALUATION RESULTS ===");
    console.log(`Total conversations: ${this.results.length}`);
    console.log(`Total questions: ${stats.totalQuestions}`);
    console.log(
      `Questions with retrieved context: ${stats.questionsWithContext}/${stats.totalQuestions} (${stats.contextRetrievalRate}%)`
    );
    console.log(
      `Questions with answer in context: ${stats.questionsWithAnswerInContext}/${stats.totalQuestions} (${stats.answerFoundRate}%)`
    );

    console.log("\n=== Q&A EVALUATION RESULTS ===");
    console.log(
      `Questions with generated answers: ${stats.questionsWithGeneratedAnswers}/${stats.totalQuestions} (${stats.qaSuccessRate}%)`
    );
    console.log(
      `Correct answers: ${stats.correctAnswers}/${stats.questionsWithGeneratedAnswers} (${stats.answerAccuracyRate}%)`
    );
    console.log(`Wrong answers: ${stats.wrongAnswers}/${stats.questionsWithGeneratedAnswers}`);
    if (stats.errorAnswers > 0) {
      console.log(`Evaluation errors: ${stats.errorAnswers}/${stats.totalQuestions}`);
    }

    console.log("\n=== CATEGORY BREAKDOWN ===");
    for (const [category, catStats] of Object.entries(stats.categoryBreakdown)) {
      // NOTE(review): this line is labelled "Context retrieval" but reports
      // withAnswer, not withContext — kept as-is; confirm which was intended.
      const retrievalRate = percent(catStats.withAnswer, catStats.total);
      const qaRate = percent(catStats.withGenerated, catStats.total);
      const accuracyRate = percent(catStats.correct, catStats.withGenerated);
      console.log(`Category ${category}:`);
      console.log(` Total questions: ${catStats.total}`);
      console.log(
        ` Context retrieval: ${catStats.withAnswer}/${catStats.total} (${retrievalRate}%)`
      );
      console.log(` Generated answers: ${catStats.withGenerated}/${catStats.total} (${qaRate}%)`);
      console.log(
        ` Answer accuracy: ${catStats.correct}/${catStats.withGenerated} (${accuracyRate}%)`
      );
      if (catStats.errors > 0) {
        console.log(` Evaluation errors: ${catStats.errors}/${catStats.total}`);
      }
    }

    console.log("\n=== PERFORMANCE INSIGHTS ===");
    // Missing fields count as 0 and an empty run averages to 0, so neither
    // figure can print as NaN.
    const average = (total) => (stats.totalQuestions > 0 ? total / stats.totalQuestions : 0);
    const avgContextLength = average(
      allResults.reduce((sum, r) => sum + (r.contextLength || 0), 0)
    );
    console.log(`Average context length: ${avgContextLength.toFixed(0)} characters`);
    const avgMatchRatio = average(allResults.reduce((sum, r) => sum + (r.matchRatio || 0), 0));
    console.log(`Average answer match ratio: ${avgMatchRatio.toFixed(3)}`);

    // Count how many results each evaluation method produced.
    const evaluationMethods = {};
    for (const result of allResults) {
      const method = result.evaluationMethod || "unknown";
      evaluationMethods[method] = (evaluationMethods[method] || 0) + 1;
    }

    console.log("\n=== EVALUATION SUMMARY ===");
    console.log(
      "This evaluation measures both retrieval performance and answer generation accuracy."
    );
    console.log("Generated answers are evaluated against gold standard answers.");
    console.log("\n=== EVALUATION METHODS USED ===");
    for (const [method, count] of Object.entries(evaluationMethods)) {
      const percentage = percent(count, stats.totalQuestions);
      console.log(`${method}: ${count}/${stats.totalQuestions} (${percentage}%)`);
    }
  }
}
// Command line interface: run the evaluation only when this file is executed
// directly, not when it is required as a module.
if (require.main === module) {
  const evaluator = new LocomoEvaluator();
  evaluator.runEvaluation().catch((error) => {
    console.error(error);
    // Report the failure to the shell/CI; logging alone would still exit with 0.
    process.exitCode = 1;
  });
}
module.exports = LocomoEvaluator;