feat: update eval and ego questions

2026-03-22 15:39:06 +08:00 · 2025-02-07 11:48:40 +08:00
parent ef34881f59
commit 1168c753ce
4 changed files with 89 additions and 27 deletions
--- a/README.md
+++ b/README.md
@@ -224,7 +224,7 @@ flowchart TD

 ## Evaluation

-I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation.
+I kept the evaluation simple: I use LLM-as-a-judge and collected some [ego questions](./src/evals/ego-questions.json) for evaluation. These are questions about Jina AI whose answers I know with 100% certainty but LLMs do not.

 I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.

--- a/src/agent.ts
+++ b/src/agent.ts
@@ -667,10 +667,10 @@ You decided to think out of the box or cut from a completely different angle.`);
      object = result.object;
      totalTokens = result.totalTokens;
    }
-    context.tokenTracker.trackUsage('agent', totalTokens);
-
    await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
    thisStep = object as StepAction;
+    context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
+    context.tokenTracker.trackUsage('agent', totalTokens);
    console.log(thisStep)
    return {result: thisStep, context};
  }
--- a/src/evals/batch-evals.ts
+++ b/src/evals/batch-evals.ts
@@ -25,6 +25,63 @@ interface EvaluationResult {
  actual_answer: string;
 }

+interface EvaluationStats {
+  model_name: string;
+  pass_rate: number;
+  avg_steps: number;
+  max_steps: number;
+  min_steps: number;
+  median_steps: number;
+  avg_tokens: number;
+  median_tokens: number;
+  max_tokens: number;
+  min_tokens: number;
+}
+
+function calculateMedian(numbers: number[]): number {
+  const sorted = [...numbers].sort((a, b) => a - b);
+  const middle = Math.floor(sorted.length / 2);
+
+  if (sorted.length % 2 === 0) {
+    return (sorted[middle - 1] + sorted[middle]) / 2;
+  }
+  return sorted[middle];
+}
+
+function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats {
+  const steps = results.map(r => r.total_steps);
+  const tokens = results.map(r => r.total_tokens);
+  const passCount = results.filter(r => r.pass).length;
+
+  return {
+    model_name: modelName,
+    pass_rate: (passCount / results.length) * 100,
+    avg_steps: steps.reduce((a, b) => a + b, 0) / steps.length,
+    max_steps: Math.max(...steps),
+    min_steps: Math.min(...steps),
+    median_steps: calculateMedian(steps),
+    avg_tokens: tokens.reduce((a, b) => a + b, 0) / tokens.length,
+    median_tokens: calculateMedian(tokens),
+    max_tokens: Math.max(...tokens),
+    min_tokens: Math.min(...tokens)
+  };
+}
+
+function printStats(stats: EvaluationStats): void {
+  console.log('\n=== Evaluation Statistics ===');
+  console.log(`Model: ${stats.model_name}`);
+  console.log(`Pass Rate: ${stats.pass_rate.toFixed(0)}%`);
+  console.log(`Average Steps: ${stats.avg_steps.toFixed(0)}`);
+  console.log(`Maximum Steps: ${stats.max_steps}`);
+  console.log(`Minimum Steps: ${stats.min_steps}`);
+  console.log(`Median Steps: ${stats.median_steps.toFixed(0)}`);
+  console.log(`Average Tokens: ${stats.avg_tokens.toFixed(0)}`);
+  console.log(`Median Tokens: ${stats.median_tokens.toFixed(0)}`);
+  console.log(`Maximum Tokens: ${stats.max_tokens}`);
+  console.log(`Minimum Tokens: ${stats.min_tokens}`);
+  console.log('===========================\n');
+}
+
 async function getCurrentGitCommit(): Promise<string> {
  try {
    const {stdout} = await execAsync('git rev-parse --short HEAD');
@@ -72,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise<void> {
  const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
  const results: EvaluationResult[] = [];
  const gitCommit = await getCurrentGitCommit();
-  const outputFile = `eval-${gitCommit}.json`;
+  const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown';
+  const outputFile = `eval-${gitCommit}-${modelName}.json`;
+
  // Process each question
  for (let i = 0; i < questions.length; i++) {
    const {question, answer: expectedAnswer} = questions[i];
@@ -83,7 +142,7 @@ async function batchEvaluate(inputFile: string): Promise<void> {
      const {
        result: response,
        context
-      } = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
+      } = await getResponse(question, 0) as { result: AnswerAction; context: TrackerContext };
      const actualAnswer = response.answer;

      // Evaluate the response
@@ -114,12 +173,19 @@ async function batchEvaluate(inputFile: string): Promise<void> {
        actual_answer: 'Error occurred'
      });
    }
-    // Save results
-    await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
-    console.log(`\nEvaluation results saved to ${outputFile}`);
  }

+  // Calculate and print statistics
+  const stats = calculateStats(results, modelName);
+  printStats(stats);

+  // Save results
+  await fs.writeFile(outputFile, JSON.stringify({
+    results,
+    statistics: stats
+  }, null, 2));
+
+  console.log(`\nEvaluation results saved to ${outputFile}`);
 }

 // Run batch evaluation if this is the main module
--- a/src/evals/ego-questions.json
+++ b/src/evals/ego-questions.json
@@ -1,7 +1,7 @@
 [
  {
-    "question": "what is jina ai ceo's twitter account",
-    "answer": "@hxiao"
+    "question": "what did jina ai ceo say about deepseek that went viral and became a meme?",
+    "answer": "a side project"
  },
  {
    "question": "when was jina ai founded?",
@@ -24,28 +24,28 @@
    "answer": "30"
  },
  {
-    "question": "how much rate limit for r.jina.ai api without an api key?",
-    "answer": "20 RPM (requests per minute)"
+    "question": "when was jina reader released?",
+    "answer": "April 2024"
  },
  {
    "question": "How many offices do Jina AI have and where are they?",
    "answer": "four: sunnyvale, berlin, beijing, shenzhen"
  },
  {
-    "question": "Does jina reranker v2 support multilingual?",
-    "answer": "Yes"
+    "question": "what does jina-colbert-v2 improve over jina-colbert-v1?",
+    "answer": "v2 adds multilingual support"
  },
  {
    "question": "who are the authors of jina-clip-v2 paper?",
    "answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao"
  },
  {
-    "question": "what can you find in common between fashion-mnist and bert-as-service?",
-    "answer": "Both are made by Han Xiao"
+    "question": "who is the common author of fashion-mnist and node-deepresearch?",
+    "answer": "Han Xiao"
  },
  {
    "question": "Which countries are the investors of Jina AI from?",
-    "answer": "USA and China, but no German investors"
+    "answer": "USA and China only, no German investors"
  },
  {
    "question": "what is the grounding api endpoint of jina ai?",
@@ -56,24 +56,20 @@
    "answer": "jina-embeddings-v2-base-en and jina-clip-v1"
  },
  {
-    "question": "How much is the 2024 yearbook that jina ai published?",
-    "answer": "$35 USD"
+    "question": "Can I purchase the 2024 yearbook that jina ai published today?",
+    "answer": "No, it is sold out."
  },
  {
-    "question": "Any meme or crypto coin that announced by jina ai?",
-    "answer": "No."
+    "question": "How many free tokens do you get from a new jina api key?",
+    "answer": "1 million."
  },
  {
    "question": "Who is the legal signatory of Jina AI gmbh?",
    "answer": "Jiao Liu"
  },
  {
-    "question": "does node-deepresearch project support local LLMs?",
-    "answer": "Yes."
-  },
-  {
-    "question": "what is the name of the jina ai's mascot?",
-    "answer": "Jina"
+    "question": "which llm provider does node-deepresearch project support?",
+    "answer": "Gemini, Openai and some local LLMs"
  },
  {
    "question": "what is the name of the jina ai's mascot?",