From d6811fc2eb4bbf43cccc928fec120d51f6d4f7b5 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Feb 2025 23:33:38 +0800 Subject: [PATCH] chore: update readme --- README.md | 10 +++ package.json | 1 + src/config.ts | 8 +-- src/evals/batch-evals.ts | 132 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 src/evals/batch-evals.ts diff --git a/README.md b/README.md index 9fc1636..4573c70 100644 --- a/README.md +++ b/README.md @@ -217,3 +217,13 @@ flowchart TD BeastMode --> FinalAnswer[Generate final answer] --> End ``` + +## Evaluation + +I kept the evaluation simple: an LLM-as-a-judge setup run over a small collection of "ego questions" (i.e. questions about Jina AI to which I know the answer with 100% certainty). + +I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer. + +```bash +npm run eval ./src/evals/ego-questions.json +``` \ No newline at end of file diff --git a/package.json b/package.json index 9339d6f..02aec38 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "lint": "eslint . --ext .ts", "lint:fix": "eslint . 
--ext .ts --fix", "serve": "ts-node src/server.ts", + "eval": "ts-node src/evals/batch-evals.ts", "test": "jest", "test:watch": "jest --watch" }, diff --git a/src/config.ts b/src/config.ts index 25a87d3..b6de571 100644 --- a/src/config.ts +++ b/src/config.ts @@ -84,16 +84,16 @@ const defaultOpenAIConfig: ModelConfig = { export const modelConfigs: Record = { gemini: { dedup: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'dedup'), - evaluator: validateModelConfig({ ...defaultGeminiConfig }, 'evaluator'), - errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig }, 'errorAnalyzer'), + evaluator: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'evaluator'), + errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'errorAnalyzer'), queryRewriter: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'queryRewriter'), agent: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agent'), agentBeastMode: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agentBeastMode') }, openai: { dedup: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'dedup'), - evaluator: validateModelConfig({ ...defaultOpenAIConfig }, 'evaluator'), - errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig }, 'errorAnalyzer'), + evaluator: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'evaluator'), + errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'errorAnalyzer'), queryRewriter: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'queryRewriter'), agent: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agent'), agentBeastMode: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agentBeastMode') diff --git a/src/evals/batch-evals.ts b/src/evals/batch-evals.ts new file mode 100644 index 0000000..02560b7 --- /dev/null +++ b/src/evals/batch-evals.ts @@ -0,0 +1,132 @@ +import fs from 
'fs/promises';
+import { exec } from 'child_process';
+import { promisify } from 'util';
+import { getResponse } from '../agent';
+import { generateObject } from 'ai';
+import { getModel, getMaxTokens } from '../config';
+import { z } from 'zod';
+import {AnswerAction, TrackerContext} from "../types";
+
+const execAsync = promisify(exec);
+
+/** One input-file entry: a question and the answer the agent is expected to give. */
+interface Question {
+  question: string;
+  answer: string;
+}
+
+/** One row of the JSON report written by batchEvaluate. */
+interface EvaluationResult {
+  pass: boolean;
+  reason: string;
+  total_steps: number;
+  total_tokens: number;
+  question: string;
+  expected_answer: string;
+  actual_answer: string;
+}
+
+/**
+ * Resolves to the abbreviated hash of HEAD (`git rev-parse --short HEAD`).
+ * Never rejects: if the git command fails, the error is logged and 'unknown' is returned.
+ */
+async function getCurrentGitCommit(): Promise<string> {
+  try {
+    const { stdout } = await execAsync('git rev-parse --short HEAD');
+    return stdout.trim();
+  } catch (error) {
+    console.error('Error getting git commit:', error);
+    return 'unknown';
+  }
+}
+
+/**
+ * Asks the 'evaluator' model whether actualAnswer conveys the same information as expectedAnswer.
+ * Never rejects: a failed model call is logged and returned as pass=false with the error as reason.
+ *
+ * @param expectedAnswer Reference answer taken from the input file.
+ * @param actualAnswer Answer produced by the agent.
+ * @returns The model's verdict and its explanation.
+ */
+async function evaluateAnswer(expectedAnswer: string, actualAnswer: string): Promise<{ pass: boolean; reason: string }> {
+  const prompt = `You are a deterministic evaluator with zero temperature. Compare the following expected answer with the actual answer and determine if they convey the same information.
+
+Expected answer: ${expectedAnswer}
+Actual answer: ${actualAnswer}
+
+Minor wording differences are acceptable as long as the core information of the expected answer is preserved in the actual answer.`;
+
+  const schema = z.object({
+    pass: z.boolean().describe('Whether the actual answer matches the expected answer'),
+    reason: z.string().describe('Detailed explanation of why the evaluation passed or failed')
+  });
+
+  try {
+    const result = await generateObject({
+      model: getModel('evaluator'),
+      schema,
+      prompt,
+      maxTokens: getMaxTokens('evaluator'),
+      temperature: 0 // Setting temperature to 0 for deterministic output
+    });
+
+    return result.object;
+  } catch (error) {
+    console.error('Evaluation failed:', error);
+    return { pass: false, reason: `Evaluation error: ${error}` };
+  }
+}
+
+/**
+ * Runs the agent on every question in inputFile, judges each answer with evaluateAnswer, and
+ * writes all rows to `eval-<commit>.json` (a relative path, so it lands in the working directory).
+ * A question whose run throws is recorded as a failed row with zero steps and tokens, and the
+ * batch continues with the next question.
+ *
+ * @param inputFile Path to a JSON file holding an array of { question, answer } string pairs.
+ * @throws If the file cannot be read or parsed, does not have that shape, or the report cannot be written.
+ */
+async function batchEvaluate(inputFile: string): Promise<void> {
+  // Validate up front so a malformed file fails here instead of midway through the batch.
+  const raw: unknown = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
+  const questions: Question[] = z.array(z.object({ question: z.string(), answer: z.string() })).parse(raw);
+  const results: EvaluationResult[] = [];
+
+  for (let i = 0; i < questions.length; i++) {
+    const { question, answer: expectedAnswer } = questions[i];
+    console.log(`\nProcessing question ${i + 1}/${questions.length}: ${question}`);
+
+    try {
+      const { result: response, context } = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
+      const actualAnswer = response.answer;
+      const evaluation = await evaluateAnswer(expectedAnswer, actualAnswer);
+
+      results.push({
+        pass: evaluation.pass,
+        reason: evaluation.reason,
+        total_steps: context.actionTracker.getState().totalStep,
+        total_tokens: context.tokenTracker.getTotalUsage(),
+        question,
+        expected_answer: expectedAnswer,
+        actual_answer: actualAnswer
+      });
+
+      console.log(`Evaluation: ${evaluation.pass ? 'PASS' : 'FAIL'}`);
+      console.log(`Reason: ${evaluation.reason}`);
+    } catch (error) {
+      // Keep the row so the report still has one entry per question.
+      console.error(`Error processing question: ${question}`, error);
+      results.push({
+        pass: false,
+        reason: `Error: ${error}`,
+        total_steps: 0,
+        total_tokens: 0,
+        question,
+        expected_answer: expectedAnswer,
+        actual_answer: 'Error occurred'
+      });
+    }
+  }
+
+  // The commit hash in the file name ties the report to the code that produced it.
+  const gitCommit = await getCurrentGitCommit();
+  const outputFile = `eval-${gitCommit}.json`;
+  await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
+  console.log(`\nEvaluation results saved to ${outputFile}`);
+}
+
+// Run batch evaluation if this is the main module
+if (require.main === module) {
+  const inputFile = process.argv[2];
+  if (!inputFile) {
+    console.error('Please provide an input file path');
+    process.exit(1);
+  }
+
+  batchEvaluate(inputFile).catch((error: unknown) => {
+    console.error(error);
+    // Signal failure to the caller (e.g. `npm run eval`) instead of exiting with status 0.
+    process.exitCode = 1;
+  });
+}
+
+export { batchEvaluate };
\ No newline at end of file