diff --git a/README.md b/README.md index 71bd23f..95107a7 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ flowchart TD ## Evaluation -I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation. +I kept the evaluation simple: LLM-as-a-judge, plus a collection of [ego questions](./src/evals/ego-questions.json) for evaluation. These are questions about Jina AI whose answers I know with 100% certainty but LLMs do not. I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer. diff --git a/src/agent.ts b/src/agent.ts index 8b1608c..854356f 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -667,10 +667,10 @@ You decided to think out of the box or cut from a completely different angle.`); object = result.object; totalTokens = result.totalTokens; } - context.tokenTracker.trackUsage('agent', totalTokens); - await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); thisStep = object as StepAction; + context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts}); + context.tokenTracker.trackUsage('agent', totalTokens); console.log(thisStep) return {result: thisStep, context}; } diff --git a/src/evals/batch-evals.ts b/src/evals/batch-evals.ts index 4680469..db31155 100644 --- a/src/evals/batch-evals.ts +++ b/src/evals/batch-evals.ts @@ -25,6 +25,63 @@ interface EvaluationResult { actual_answer: string; } +interface EvaluationStats { + model_name: string; + pass_rate: number; + avg_steps: number; + max_steps: number; + min_steps: number; + median_steps: number; + avg_tokens: number; + median_tokens: number; + max_tokens: number; + min_tokens: number; +} + +function calculateMedian(numbers: number[]): number { + const sorted = [...numbers].sort((a, b) => a - b); + const middle = Math.floor(sorted.length / 2); + + if (sorted.length % 2 === 0) { + return (sorted[middle - 1] + sorted[middle]) / 2; + } + return 
sorted[middle]; +} + +function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats { + const steps = results.map(r => r.total_steps); + const tokens = results.map(r => r.total_tokens); + const passCount = results.filter(r => r.pass).length; + + return { + model_name: modelName, + pass_rate: (passCount / results.length) * 100, + avg_steps: steps.reduce((a, b) => a + b, 0) / steps.length, + max_steps: Math.max(...steps), + min_steps: Math.min(...steps), + median_steps: calculateMedian(steps), + avg_tokens: tokens.reduce((a, b) => a + b, 0) / tokens.length, + median_tokens: calculateMedian(tokens), + max_tokens: Math.max(...tokens), + min_tokens: Math.min(...tokens) + }; +} + +function printStats(stats: EvaluationStats): void { + console.log('\n=== Evaluation Statistics ==='); + console.log(`Model: ${stats.model_name}`); + console.log(`Pass Rate: ${stats.pass_rate.toFixed(0)}%`); + console.log(`Average Steps: ${stats.avg_steps.toFixed(0)}`); + console.log(`Maximum Steps: ${stats.max_steps}`); + console.log(`Minimum Steps: ${stats.min_steps}`); + console.log(`Median Steps: ${stats.median_steps.toFixed(0)}`); + console.log(`Average Tokens: ${stats.avg_tokens.toFixed(0)}`); + console.log(`Median Tokens: ${stats.median_tokens.toFixed(0)}`); + console.log(`Maximum Tokens: ${stats.max_tokens}`); + console.log(`Minimum Tokens: ${stats.min_tokens}`); + console.log('===========================\n'); +} + async function getCurrentGitCommit(): Promise { try { const {stdout} = await execAsync('git rev-parse --short HEAD'); @@ -72,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise { const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8')); const results: EvaluationResult[] = []; const gitCommit = await getCurrentGitCommit(); - const outputFile = `eval-${gitCommit}.json`; + const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown'; + const outputFile = `eval-${gitCommit}-${modelName}.json`; + // Process each 
question for (let i = 0; i < questions.length; i++) { const {question, answer: expectedAnswer} = questions[i]; @@ -83,7 +142,7 @@ async function batchEvaluate(inputFile: string): Promise { const { result: response, context - } = await getResponse(question) as { result: AnswerAction; context: TrackerContext }; + } = await getResponse(question, 0) as { result: AnswerAction; context: TrackerContext }; const actualAnswer = response.answer; // Evaluate the response @@ -114,12 +173,19 @@ async function batchEvaluate(inputFile: string): Promise { actual_answer: 'Error occurred' }); } - // Save results - await fs.writeFile(outputFile, JSON.stringify(results, null, 2)); - console.log(`\nEvaluation results saved to ${outputFile}`); } + // Calculate and print statistics + const stats = calculateStats(results, modelName); + printStats(stats); + // Save results + await fs.writeFile(outputFile, JSON.stringify({ + results, + statistics: stats + }, null, 2)); + + console.log(`\nEvaluation results saved to ${outputFile}`); } // Run batch evaluation if this is the main module diff --git a/src/evals/ego-questions.json b/src/evals/ego-questions.json index 8d37a23..f05d677 100644 --- a/src/evals/ego-questions.json +++ b/src/evals/ego-questions.json @@ -1,7 +1,7 @@ [ { - "question": "what is jina ai ceo's twitter account", - "answer": "@hxiao" + "question": "what did jina ai ceo say about deepseek that went viral and become a meme?", + "answer": "a side project" }, { "question": "when was jina ai founded?", @@ -24,28 +24,28 @@ "answer": "30" }, { - "question": "how much rate limit for r.jina.ai api without an api key?", - "answer": "20 RPM (requests per minute)" + "question": "when was jina reader released?", + "answer": "April 2024" }, { "question": "How many offices do Jina AI have and where are they?", "answer": "four: sunnyvale, berlin, beijing, shenzhen" }, { - "question": "Does jina reranker v2 support multilingual?", - "answer": "Yes" + "question": "what jina-colbert-v2 improves 
over jina-colbert-v1?", + "answer": "v2 add multilingual support" }, { "question": "who are the authors of jina-clip-v2 paper?", "answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao" }, { - "question": "what can you find in common between fashion-mnist and bert-as-service?", - "answer": "Both are made by Han Xiao" + "question": "who is the common author of fashion-mnist and node-deepresearch?", + "answer": "Han Xiao" }, { "question": "Which countries are the investors of Jina AI from?", - "answer": "USA and China, but no German investors" + "answer": "USA and China only, no German investors" }, { "question": "what is the grounding api endpoint of jina ai?", @@ -56,24 +56,20 @@ "answer": "jina-embeddings-v2-base-en and jina-clip-v1" }, { - "question": "How much is the 2024 yearbook that jina ai published?", - "answer": "$35 USD" + "question": "Can I purchase the 2024 yearbook that jina ai published today?", + "answer": "No it is sold out." }, { - "question": "Any meme or crypto coin that announced by jina ai?", - "answer": "No." + "question": "How many free tokens do you get from a new jina api key?", + "answer": "1 million." }, { "question": "Who is the legal signatory of Jina AI gmbh?", "answer": "Jiao Liu" }, { - "question": "does node-deepresearch project support local LLMs?", - "answer": "Yes." - }, - { - "question": "what is the name of the jina ai's mascot?", - "answer": "Jina" + "question": "which llm provider does node-deepresearch project support?", + "answer": "Gemini, Openai and some local LLMs" }, { "question": "what is the name of the jina ai's mascot?",