chore: update readme

Han Xiao
2025-02-06 23:33:38 +08:00
parent c76ab3415c
commit d6811fc2eb
4 changed files with 147 additions and 4 deletions

README.md

@@ -217,3 +217,13 @@ flowchart TD
BeastMode --> FinalAnswer[Generate final answer] --> End
```
## Evaluation
I kept the evaluation simple: LLM-as-a-judge over a small set of ego questions (i.e. questions about Jina AI whose answers I know with 100% certainty).
I mainly look at three things: total steps, total tokens, and the correctness of the final answer.
```bash
npm run eval ./src/evals/ego-questions.json
```
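
The input file is a JSON array of question–answer pairs, matching the `Question` interface in `src/evals/batch-evals.ts`. A minimal sketch of the expected shape (the sample entries below are illustrative, not taken from the shipped `ego-questions.json`):

```json
[
  {
    "question": "Who is the founder and CEO of Jina AI?",
    "answer": "Han Xiao"
  },
  {
    "question": "In which city is Jina AI headquartered?",
    "answer": "Berlin"
  }
]
```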

package.json

@@ -17,6 +17,7 @@
"lint": "eslint . --ext .ts",
"lint:fix": "eslint . --ext .ts --fix",
"serve": "ts-node src/server.ts",
"eval": "ts-node src/evals/batch-evals.ts",
"test": "jest",
"test:watch": "jest --watch"
},

src/config.ts

@@ -84,16 +84,16 @@ const defaultOpenAIConfig: ModelConfig = {
export const modelConfigs: Record<LLMProvider, ToolConfigs> = {
  gemini: {
    dedup: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'dedup'),
-   evaluator: validateModelConfig({ ...defaultGeminiConfig }, 'evaluator'),
-   errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig }, 'errorAnalyzer'),
+   evaluator: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'evaluator'),
+   errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'errorAnalyzer'),
    queryRewriter: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'queryRewriter'),
    agent: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agent'),
    agentBeastMode: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agentBeastMode')
  },
  openai: {
    dedup: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'dedup'),
-   evaluator: validateModelConfig({ ...defaultOpenAIConfig }, 'evaluator'),
-   errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig }, 'errorAnalyzer'),
+   evaluator: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'evaluator'),
+   errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'errorAnalyzer'),
    queryRewriter: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'queryRewriter'),
    agent: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agent'),
    agentBeastMode: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agentBeastMode')

src/evals/batch-evals.ts (new file, 132 lines)

@@ -0,0 +1,132 @@
import fs from 'fs/promises';
import { exec } from 'child_process';
import { promisify } from 'util';
import { getResponse } from '../agent';
import { generateObject } from 'ai';
import { getModel, getMaxTokens } from '../config';
import { z } from 'zod';
import { AnswerAction, TrackerContext } from '../types';

const execAsync = promisify(exec);

interface Question {
  question: string;
  answer: string;
}

interface EvaluationResult {
  pass: boolean;
  reason: string;
  total_steps: number;
  total_tokens: number;
  question: string;
  expected_answer: string;
  actual_answer: string;
}

async function getCurrentGitCommit(): Promise<string> {
  try {
    const { stdout } = await execAsync('git rev-parse --short HEAD');
    return stdout.trim();
  } catch (error) {
    console.error('Error getting git commit:', error);
    return 'unknown';
  }
}
async function evaluateAnswer(expectedAnswer: string, actualAnswer: string): Promise<{ pass: boolean; reason: string }> {
  const prompt = `You are a deterministic evaluator with zero temperature. Compare the following expected answer with the actual answer and determine if they convey the same information.
Expected answer: ${expectedAnswer}
Actual answer: ${actualAnswer}
Minor wording differences are acceptable as long as the core information of the expected answer is preserved in the actual answer.`;

  const schema = z.object({
    pass: z.boolean().describe('Whether the actual answer matches the expected answer'),
    reason: z.string().describe('Detailed explanation of why the evaluation passed or failed')
  });

  try {
    const result = await generateObject({
      model: getModel('evaluator'),
      schema,
      prompt,
      maxTokens: getMaxTokens('evaluator'),
      temperature: 0 // Setting temperature to 0 for deterministic output
    });
    return result.object;
  } catch (error) {
    console.error('Evaluation failed:', error);
    return {
      pass: false,
      reason: `Evaluation error: ${error}`
    };
  }
}
async function batchEvaluate(inputFile: string): Promise<void> {
  // Read and parse input file
  const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
  const results: EvaluationResult[] = [];

  // Process each question
  for (let i = 0; i < questions.length; i++) {
    const { question, answer: expectedAnswer } = questions[i];
    console.log(`\nProcessing question ${i + 1}/${questions.length}: ${question}`);

    try {
      // Get response using the agent
      const { result: response, context } = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
      const actualAnswer = response.answer;

      // Evaluate the response
      const evaluation = await evaluateAnswer(expectedAnswer, actualAnswer);

      // Record results
      results.push({
        pass: evaluation.pass,
        reason: evaluation.reason,
        total_steps: context.actionTracker.getState().totalStep,
        total_tokens: context.tokenTracker.getTotalUsage(),
        question,
        expected_answer: expectedAnswer,
        actual_answer: actualAnswer
      });

      console.log(`Evaluation: ${evaluation.pass ? 'PASS' : 'FAIL'}`);
      console.log(`Reason: ${evaluation.reason}`);
    } catch (error) {
      console.error(`Error processing question: ${question}`, error);
      results.push({
        pass: false,
        reason: `Error: ${error}`,
        total_steps: 0,
        total_tokens: 0,
        question,
        expected_answer: expectedAnswer,
        actual_answer: 'Error occurred'
      });
    }
  }

  // Save results
  const gitCommit = await getCurrentGitCommit();
  const outputFile = `eval-${gitCommit}.json`;
  await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
  console.log(`\nEvaluation results saved to ${outputFile}`);
}
// Run batch evaluation if this is the main module
if (require.main === module) {
  const inputFile = process.argv[2];
  if (!inputFile) {
    console.error('Please provide an input file path');
    process.exit(1);
  }
  batchEvaluate(inputFile).catch(console.error);
}

export { batchEvaluate };
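
For reference, each entry in the resulting `eval-<commit>.json` follows the `EvaluationResult` interface above. A hypothetical output entry (all values are illustrative, not from a real run):

```json
[
  {
    "pass": true,
    "reason": "The actual answer conveys the same founder information as the expected answer.",
    "total_steps": 7,
    "total_tokens": 21543,
    "question": "Who is the founder and CEO of Jina AI?",
    "expected_answer": "Han Xiao",
    "actual_answer": "Jina AI was founded by Han Xiao, who serves as its CEO."
  }
]
```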