Mirror of https://github.com/eliasstepanik/core.git (synced 2026-01-11 10:08:27 +00:00)
chore: move api evaluate, qa to core-benchmark
commit 06ced5ab8b (parent 89c37a0360)
@@ -1,121 +0,0 @@
import { z } from "zod";
import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
import { makeModelCall } from "~/lib/model.server";
import { json } from "@remix-run/node";

export const EvaluateBodyRequest = z.object({
  question: z.string(),
  standard_answer: z.string(),
  generated_answer: z.string(),
});

const { action, loader } = createActionApiRoute(
  {
    body: EvaluateBodyRequest,
    allowJWT: true,
    authorization: {
      action: "search", // Using the same permission as search
    },
    corsStrategy: "all",
  },
  async ({ body, authentication: _ }) => {
    const { question, standard_answer, generated_answer } = body;

    const evaluationPrompt = `Your task is to label an answer to a question as 'CORRECT' or 'WRONG'. You will be given the following data:
(1) a question (posed by one user to another user),
(2) a 'gold' (ground truth) answer,
(3) a generated answer
which you will score as CORRECT/WRONG.

The point of the question is to ask about something one user should know about the other user based on their prior conversations.
The gold answer will usually be a concise and short answer that includes the referenced topic, for example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace
The generated answer might be much longer, but you should be generous with your grading - as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

For time-related questions, the gold answer will be a specific date, month, year, etc. The generated answer might be much longer or use relative time references (like "last Tuesday" or "next month"), but you should be generous with your grading - as long as it refers to the same date or time period as the gold answer, it should be counted as CORRECT. Even if the format differs (e.g., "May 7th" vs "7 May"), consider it CORRECT if it's the same date.

Now it's time for the real question:
Question: ${question}
Gold answer: ${standard_answer}
Generated answer: ${generated_answer}

First, provide a short (one sentence) explanation of your reasoning, then finish with CORRECT or WRONG.
Do NOT include both CORRECT and WRONG in your response, or it will break the evaluation script.

Just return the label CORRECT or WRONG in JSON format with the key "label".`;

    try {
      // Use the LLM to evaluate the answer
      const llmResponse = (await makeModelCall(
        false, // Don't stream
        [{ role: "user", content: evaluationPrompt }],
        (_text: string, _model: string) => {
          // onFinish callback - we can log model usage here if needed
        },
      )) as string;

      // Parse the LLM response to extract the label
      const response = llmResponse.trim();
      let label = "WRONG";
      let reasoning = response;

      // Try to parse as JSON first
      try {
        const jsonResponse = JSON.parse(response);
        if (
          jsonResponse.label &&
          (jsonResponse.label === "CORRECT" || jsonResponse.label === "WRONG")
        ) {
          label = jsonResponse.label;
          reasoning = jsonResponse.reasoning || response;
        }
      } catch (jsonError) {
        // If not JSON, look for CORRECT/WRONG in the text
        if (response.includes("CORRECT") && !response.includes("WRONG")) {
          label = "CORRECT";
        } else if (response.includes("WRONG") && !response.includes("CORRECT")) {
          label = "WRONG";
        }
        // Extract reasoning (everything before the final CORRECT/WRONG)
        const parts = response.split(/(CORRECT|WRONG)$/);
        if (parts.length > 1) {
          reasoning = parts[0].trim();
        }
      }

      // Calculate match ratio for additional metrics
      const generatedLower = generated_answer.toLowerCase();
      const standardLower = standard_answer.toString().toLowerCase();
      const standardWords = standardLower.split(/\s+/).filter((word) => word.length > 2);
      const matchingWords = standardWords.filter((word) => generatedLower.includes(word));
      const matchRatio = standardWords.length > 0 ? matchingWords.length / standardWords.length : 0;

      return json({
        label: label,
        reasoning: reasoning,
        matchRatio: matchRatio,
        method: "llm",
      });
    } catch (error) {
      console.error("Error in LLM evaluation:", error);

      // Fallback to heuristic evaluation
      const generatedLower = generated_answer.toLowerCase();
      const standardLower = standard_answer.toString().toLowerCase();

      const standardWords = standardLower.split(/\s+/).filter((word) => word.length > 2);
      const matchingWords = standardWords.filter((word) => generatedLower.includes(word));
      const matchRatio = standardWords.length > 0 ? matchingWords.length / standardWords.length : 0;

      const isCorrect = matchRatio > 0.3; // Correct if at least 30% of important words match

      return json({
        label: isCorrect ? "CORRECT" : "WRONG",
        reasoning: `Generated answer ${isCorrect ? "contains" : "does not contain"} sufficient matching content with the gold standard (${matchRatio.toFixed(2)} match ratio)`,
        matchRatio: matchRatio,
        method: "heuristic_fallback",
      });
    }
  },
);

export { action, loader };
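
For context, a minimal client-side sketch of how a benchmark script might call this evaluate route. The route path (/api/evaluate here) and the bearer-token header are assumptions for illustration; only the request body shape (EvaluateBodyRequest) and the response fields (label, reasoning, matchRatio, method) come from the code above.

// Hypothetical usage sketch; the URL path and auth header are assumed, not taken from this diff.
type EvaluateRequest = {
  question: string;
  standard_answer: string;
  generated_answer: string;
};

type EvaluateResponse = {
  label: "CORRECT" | "WRONG";
  reasoning: string;
  matchRatio: number;
  method: "llm" | "heuristic_fallback";
};

async function evaluateAnswer(
  baseUrl: string,
  token: string,
  body: EvaluateRequest,
): Promise<EvaluateResponse> {
  // POST the question, gold answer, and generated answer to the evaluate endpoint
  const res = await fetch(`${baseUrl}/api/evaluate`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${token}`, // JWT is accepted (allowJWT: true)
    },
    body: JSON.stringify(body),
  });
  if (!res.ok) {
    throw new Error(`Evaluate request failed with status ${res.status}`);
  }
  return (await res.json()) as EvaluateResponse;
}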
@@ -1,139 +0,0 @@
import { z } from "zod";
import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
import { SearchService } from "~/services/search.server";
import { makeModelCall } from "~/lib/model.server";
import { json } from "@remix-run/node";

export const QABodyRequest = z.object({
  question: z.string(),
  startTime: z.string().optional(),
  endTime: z.string().optional(),
  spaceId: z.string().optional(),
  limit: z.number().optional(),
  maxBfsDepth: z.number().optional(),
  includeInvalidated: z.boolean().optional(),
  entityTypes: z.array(z.string()).optional(),
  scoreThreshold: z.number().optional(),
  minResults: z.number().optional(),
});

const searchService = new SearchService();
const { action, loader } = createActionApiRoute(
  {
    body: QABodyRequest,
    allowJWT: true,
    authorization: {
      action: "search",
    },
    corsStrategy: "all",
  },
  async ({ body, authentication }) => {
    // First, search for relevant information
    const searchResults = await searchService.search(
      body.question,
      authentication.userId,
      {
        startTime: body.startTime ? new Date(body.startTime) : undefined,
        endTime: body.endTime ? new Date(body.endTime) : undefined,
        limit: body.limit || 20, // Get more results for better context
        maxBfsDepth: body.maxBfsDepth,
        includeInvalidated: body.includeInvalidated,
        entityTypes: body.entityTypes,
        scoreThreshold: body.scoreThreshold,
        minResults: body.minResults,
      },
    );

    // Combine episodes and facts into context
    let context = [...searchResults.episodes].join("\n\n");

    searchResults.facts.forEach((fact) => {
      context += `\n\nfact: ${fact.fact}\n validAt: ${fact.validAt}`;
    });

    // console.log("Context:", context);

    if (!context.trim()) {
      return json({
        question: body.question,
        generated_answer: "I couldn't find any relevant information to answer this question.",
      });
    }

    // Generate answer using LLM
    const prompt = `You are an analytical AI that reasons deeply about context before answering questions. Your task is to:

1. FIRST: Look for direct, explicit answers in the context
2. ANALYZE the context thoroughly for relevant information
3. IDENTIFY patterns, connections, and implications
4. REASON about what the context suggests or implies
5. ANSWER based on direct evidence OR analysis

<reasoning>
- Scan through ALL episodes and facts completely before answering
- Look for every explicit statement that relates to the question
- NEVER stop after finding the first answer - continue scanning for more
- When asking "what did X show Y", look for ALL items X showed Y on that date
- Collect multiple items, events, or details that answer the same question
- If not found directly, identify all context elements related to the question
- Look for patterns, themes, and implicit information in the context
- Consider what the context suggests beyond explicit statements
- Note any contradictions or missing information that affects the answer
- Pay close attention to temporal information and dates (validAt timestamps)
- For time-sensitive questions, prioritize more recent information
- Consider the chronological sequence of events when relevant
- CRITICAL: Ensure completeness by including ALL relevant items found
- If you find 2+ items for the same question, mention them all in your answer
- Be precise with details (specific types, colors, descriptions when available)
- Draw logical conclusions based on available evidence
- Don't give reasoning in the output
</reasoning>

Follow this output format. Don't wrap the JSON in \`\`\`json fences.
<output>
{"answer": "Your direct, short (max 2 sentences) answer based on your analysis"}
</output>
`;

    const userPrompt = `<context>
${context}
</context>

<question>
Question: ${body.question}
</question>
`;
    let responseText = "";
    let generated_answer = "";
    try {
      await makeModelCall(
        false, // Don't stream
        [
          { role: "system", content: prompt },
          { role: "user", content: userPrompt },
        ],
        (text) => {
          responseText = text;
        },
      );

      const outputMatch = responseText.match(/<output>([\s\S]*?)<\/output>/);
      if (outputMatch && outputMatch[1]) {
        try {
          const parsedOutput = JSON.parse(outputMatch[1].trim());
          generated_answer = parsedOutput.answer || "No answer provided";
        } catch (jsonError) {
          console.error("Error parsing JSON output:", jsonError);
          generated_answer = outputMatch[1].trim();
        }
      }
    } catch (error) {
      console.error("Error generating answer:", error);
      generated_answer = "I encountered an error while generating an answer to this question.";
    }

    return json({
      question: body.question,
      generated_answer,
    });
  },
);

export { action, loader };
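
Likewise, a hedged sketch of driving the QA route from a benchmark harness so its output can be fed to the evaluate route. The /api/qa path and auth header are again assumptions; the request fields mirror QABodyRequest and the response carries the { question, generated_answer } shape returned above.

// Hypothetical usage sketch; the URL path and auth header are assumed, not taken from this diff.
type QARequest = {
  question: string;
  startTime?: string;
  endTime?: string;
  spaceId?: string;
  limit?: number;
  maxBfsDepth?: number;
  includeInvalidated?: boolean;
  entityTypes?: string[];
  scoreThreshold?: number;
  minResults?: number;
};

type QAResponse = {
  question: string;
  generated_answer: string;
};

async function askQuestion(
  baseUrl: string,
  token: string,
  body: QARequest,
): Promise<QAResponse> {
  // POST the question (plus optional search filters) to the QA endpoint
  const res = await fetch(`${baseUrl}/api/qa`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${token}`,
    },
    body: JSON.stringify(body),
  });
  if (!res.ok) {
    throw new Error(`QA request failed with status ${res.status}`);
  }
  return (await res.json()) as QAResponse;
}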