feat: update eval and ego questions

This commit is contained in:
Han Xiao
2025-02-07 11:48:40 +08:00
parent ef34881f59
commit 1168c753ce
4 changed files with 89 additions and 27 deletions

View File

@@ -224,7 +224,7 @@ flowchart TD
## Evaluation
I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation.
I kept the evaluation simple: an LLM-as-a-judge setup over a collection of [ego questions](./src/evals/ego-questions.json). These are questions about Jina AI whose answers I know with 100% certainty but LLMs do not.
I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.

View File

@@ -667,10 +667,10 @@ You decided to think out of the box or cut from a completely different angle.`);
object = result.object;
totalTokens = result.totalTokens;
}
context.tokenTracker.trackUsage('agent', totalTokens);
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
thisStep = object as StepAction;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.tokenTracker.trackUsage('agent', totalTokens);
console.log(thisStep)
return {result: thisStep, context};
}

View File

@@ -25,6 +25,63 @@ interface EvaluationResult {
actual_answer: string;
}
/** Summary statistics for one evaluation run, as produced by calculateStats. */
interface EvaluationStats {
/** Name of the evaluated model, copied from the caller-supplied model name. */
model_name: string;
/** Share of results that passed, expressed as a percentage (pass count / result count * 100). */
pass_rate: number;
/** Arithmetic mean of total_steps across all results. */
avg_steps: number;
/** Largest total_steps value among the results. */
max_steps: number;
/** Smallest total_steps value among the results. */
min_steps: number;
/** Median of total_steps across all results. */
median_steps: number;
/** Arithmetic mean of total_tokens across all results. */
avg_tokens: number;
/** Median of total_tokens across all results. */
median_tokens: number;
/** Largest total_tokens value among the results. */
max_tokens: number;
/** Smallest total_tokens value among the results. */
min_tokens: number;
}
/**
 * Returns the median of a list of numbers without mutating the input.
 *
 * For an odd count this is the middle value of the ascending order; for an
 * even count it is the mean of the two middle values. An empty list yields NaN.
 *
 * @param numbers - Values to summarize; the array itself is left untouched.
 * @returns The median value, or NaN when `numbers` is empty.
 */
function calculateMedian(numbers: number[]): number {
  // Work on a copy so the caller's array keeps its original order.
  const ordered = numbers.slice().sort((left, right) => left - right);
  const count = ordered.length;
  const upperMid = Math.trunc(count / 2);
  const hasSingleMiddle = count % 2 !== 0;

  return hasSingleMiddle
    ? ordered[upperMid]
    : (ordered[upperMid - 1] + ordered[upperMid]) / 2;
}
/**
 * Aggregates per-question evaluation results into summary statistics.
 *
 * @param results - Evaluation results; `pass`, `total_steps` and `total_tokens`
 *   are read from each entry.
 * @param modelName - Copied verbatim into `model_name`.
 * @returns Pass rate as a percentage plus the average, median, maximum and
 *   minimum of steps and tokens. When `results` is empty every numeric field
 *   is 0 instead of NaN / ±Infinity.
 */
function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats {
  // Without this guard an empty run divides by zero (NaN) and takes the
  // max/min of nothing (-Infinity/Infinity); those values print as "NaN%"
  // and are written as null by JSON.stringify.
  if (results.length === 0) {
    return {
      model_name: modelName,
      pass_rate: 0,
      avg_steps: 0,
      max_steps: 0,
      min_steps: 0,
      median_steps: 0,
      avg_tokens: 0,
      median_tokens: 0,
      max_tokens: 0,
      min_tokens: 0
    };
  }

  const steps = results.map(r => r.total_steps);
  const tokens = results.map(r => r.total_tokens);
  const passCount = results.filter(r => r.pass).length;

  const mean = (values: number[]): number =>
    values.reduce((a, b) => a + b, 0) / values.length;
  // Fold instead of Math.max(...values) / Math.min(...values): spreading a very
  // large array can exceed the engine's argument-count limit. The arrays are
  // non-empty here, so reduce without an initial value is safe.
  const largest = (values: number[]): number => values.reduce((a, b) => Math.max(a, b));
  const smallest = (values: number[]): number => values.reduce((a, b) => Math.min(a, b));

  return {
    model_name: modelName,
    pass_rate: (passCount / results.length) * 100,
    avg_steps: mean(steps),
    max_steps: largest(steps),
    min_steps: smallest(steps),
    median_steps: calculateMedian(steps),
    avg_tokens: mean(tokens),
    median_tokens: calculateMedian(tokens),
    max_tokens: largest(tokens),
    min_tokens: smallest(tokens)
  };
}
/**
 * Writes a human-readable summary of the evaluation statistics to the console.
 *
 * Pass rate, averages and medians are rounded to whole numbers with
 * `toFixed(0)`; maxima and minima are printed as-is. Each report line is
 * emitted with its own `console.log` call.
 *
 * @param stats - The statistics to print.
 */
function printStats(stats: EvaluationStats): void {
  const whole = (value: number): string => value.toFixed(0);

  const reportLines: string[] = [
    '\n=== Evaluation Statistics ===',
    `Model: ${stats.model_name}`,
    `Pass Rate: ${whole(stats.pass_rate)}%`,
    `Average Steps: ${whole(stats.avg_steps)}`,
    `Maximum Steps: ${stats.max_steps}`,
    `Minimum Steps: ${stats.min_steps}`,
    `Median Steps: ${whole(stats.median_steps)}`,
    `Average Tokens: ${whole(stats.avg_tokens)}`,
    `Median Tokens: ${whole(stats.median_tokens)}`,
    `Maximum Tokens: ${stats.max_tokens}`,
    `Minimum Tokens: ${stats.min_tokens}`,
    '===========================\n'
  ];

  for (const line of reportLines) {
    console.log(line);
  }
}
async function getCurrentGitCommit(): Promise<string> {
try {
const {stdout} = await execAsync('git rev-parse --short HEAD');
@@ -72,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise<void> {
const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
const results: EvaluationResult[] = [];
const gitCommit = await getCurrentGitCommit();
const outputFile = `eval-${gitCommit}.json`;
const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown';
const outputFile = `eval-${gitCommit}-${modelName}.json`;
// Process each question
for (let i = 0; i < questions.length; i++) {
const {question, answer: expectedAnswer} = questions[i];
@@ -83,7 +142,7 @@ async function batchEvaluate(inputFile: string): Promise<void> {
const {
result: response,
context
} = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
} = await getResponse(question, 0) as { result: AnswerAction; context: TrackerContext };
const actualAnswer = response.answer;
// Evaluate the response
@@ -114,12 +173,19 @@ async function batchEvaluate(inputFile: string): Promise<void> {
actual_answer: 'Error occurred'
});
}
// Save results
await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
console.log(`\nEvaluation results saved to ${outputFile}`);
}
// Calculate and print statistics
const stats = calculateStats(results, modelName);
printStats(stats);
// Save results
await fs.writeFile(outputFile, JSON.stringify({
results,
statistics: stats
}, null, 2));
console.log(`\nEvaluation results saved to ${outputFile}`);
}
// Run batch evaluation if this is the main module

View File

@@ -1,7 +1,7 @@
[
{
"question": "what is jina ai ceo's twitter account",
"answer": "@hxiao"
"question": "what did jina ai ceo say about deepseek that went viral and become a meme?",
"answer": "a side project"
},
{
"question": "when was jina ai founded?",
@@ -24,28 +24,28 @@
"answer": "30"
},
{
"question": "how much rate limit for r.jina.ai api without an api key?",
"answer": "20 RPM (requests per minute)"
"question": "when was jina reader released?",
"answer": "April 2024"
},
{
"question": "How many offices do Jina AI have and where are they?",
"answer": "four: sunnyvale, berlin, beijing, shenzhen"
},
{
"question": "Does jina reranker v2 support multilingual?",
"answer": "Yes"
"question": "what jina-colbert-v2 improves over jina-colbert-v1?",
"answer": "v2 add multilingual support"
},
{
"question": "who are the authors of jina-clip-v2 paper?",
"answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao"
},
{
"question": "what can you find in common between fashion-mnist and bert-as-service?",
"answer": "Both are made by Han Xiao"
"question": "who is the common author of fashion-mnist and node-deepresearch?",
"answer": "Han Xiao"
},
{
"question": "Which countries are the investors of Jina AI from?",
"answer": "USA and China, but no German investors"
"answer": "USA and China only, no German investors"
},
{
"question": "what is the grounding api endpoint of jina ai?",
@@ -56,24 +56,20 @@
"answer": "jina-embeddings-v2-base-en and jina-clip-v1"
},
{
"question": "How much is the 2024 yearbook that jina ai published?",
"answer": "$35 USD"
"question": "Can I purchase the 2024 yearbook that jina ai published today?",
"answer": "No it is sold out."
},
{
"question": "Any meme or crypto coin that announced by jina ai?",
"answer": "No."
"question": "How many free tokens do you get from a new jina api key?",
"answer": "1 million."
},
{
"question": "Who is the legal signatory of Jina AI gmbh?",
"answer": "Jiao Liu"
},
{
"question": "does node-deepresearch project support local LLMs?",
"answer": "Yes."
},
{
"question": "what is the name of the jina ai's mascot?",
"answer": "Jina"
"question": "which llm provider does node-deepresearch project support?",
"answer": "Gemini, Openai and some local LLMs"
},
{
"question": "what is the name of the jina ai's mascot?",