mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 15:39:06 +08:00
feat: update eval and ego questions
This commit is contained in:
@@ -224,7 +224,7 @@ flowchart TD
|
||||
|
||||
## Evaluation
|
||||
|
||||
I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation.
|
||||
I kept the evaluation simple: LLM-as-a-judge over a collection of [ego questions](./src/evals/ego-questions.json). These are questions about Jina AI whose answers I know with 100% certainty but LLMs do not.
|
||||
|
||||
I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.
|
||||
|
||||
|
||||
@@ -667,10 +667,10 @@ You decided to think out of the box or cut from a completely different angle.`);
|
||||
object = result.object;
|
||||
totalTokens = result.totalTokens;
|
||||
}
|
||||
context.tokenTracker.trackUsage('agent', totalTokens);
|
||||
|
||||
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
thisStep = object as StepAction;
|
||||
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
||||
context.tokenTracker.trackUsage('agent', totalTokens);
|
||||
console.log(thisStep)
|
||||
return {result: thisStep, context};
|
||||
}
|
||||
|
||||
@@ -25,6 +25,63 @@ interface EvaluationResult {
|
||||
actual_answer: string;
|
||||
}
|
||||
|
||||
/**
 * Summary statistics for one batch evaluation run.
 *
 * The members are grouped by what they measure: identification, correctness,
 * step counts, then token counts.
 */
interface EvaluationStats {
  /** Label of the model the run was executed with. */
  model_name: string;

  /** Pass rate of the run. */
  pass_rate: number;

  // Step counts across the evaluated questions.
  avg_steps: number;
  median_steps: number;
  max_steps: number;
  min_steps: number;

  // Token counts across the evaluated questions.
  avg_tokens: number;
  median_tokens: number;
  max_tokens: number;
  min_tokens: number;
}
|
||||
|
||||
/**
 * Returns the median of a list of numbers.
 *
 * The input array is not modified; a sorted copy is used. For an even number
 * of values the result is the mean of the two central values. An empty list
 * yields `NaN`.
 *
 * @param numbers - Values to take the median of, in any order.
 * @returns The median value.
 */
function calculateMedian(numbers: number[]): number {
  const ordered = numbers.slice().sort((x, y) => x - y);
  const mid = Math.trunc(ordered.length / 2);
  const hasSingleCentre = ordered.length % 2 !== 0;

  return hasSingleCentre
    ? ordered[mid]
    : (ordered[mid - 1] + ordered[mid]) / 2;
}
|
||||
|
||||
/**
 * Aggregates per-question evaluation results into summary statistics.
 *
 * @param results - Per-question outcomes; `pass`, `total_steps` and
 *   `total_tokens` are read from each entry.
 * @param modelName - Label copied verbatim into `model_name`.
 * @returns Pass rate as a percentage (0-100) plus the mean, median, maximum
 *   and minimum of steps and tokens. Every numeric field is 0 when `results`
 *   is empty.
 */
function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats {
  // An empty batch would otherwise produce 0/0 = NaN for the ratios and
  // -Infinity / Infinity from Math.max() / Math.min() with no arguments.
  // Those values print as "NaN%" and are serialised as null by
  // JSON.stringify, so report zeros instead.
  if (results.length === 0) {
    return {
      model_name: modelName,
      pass_rate: 0,
      avg_steps: 0,
      max_steps: 0,
      min_steps: 0,
      median_steps: 0,
      avg_tokens: 0,
      median_tokens: 0,
      max_tokens: 0,
      min_tokens: 0
    };
  }

  const steps = results.map(r => r.total_steps);
  const tokens = results.map(r => r.total_tokens);
  const passCount = results.filter(r => r.pass).length;
  const mean = (values: number[]): number =>
    values.reduce((a, b) => a + b, 0) / values.length;

  return {
    model_name: modelName,
    pass_rate: (passCount / results.length) * 100,
    avg_steps: mean(steps),
    max_steps: Math.max(...steps),
    min_steps: Math.min(...steps),
    median_steps: calculateMedian(steps),
    avg_tokens: mean(tokens),
    median_tokens: calculateMedian(tokens),
    max_tokens: Math.max(...tokens),
    min_tokens: Math.min(...tokens)
  };
}
|
||||
|
||||
/**
 * Writes a human-readable summary of the evaluation statistics to the console.
 *
 * Pass rate, averages and medians are rounded to whole numbers with
 * `toFixed(0)`; maxima and minima are printed as they are. Each report line
 * is emitted with its own `console.log` call.
 *
 * @param stats - The statistics to print.
 */
function printStats(stats: EvaluationStats): void {
  const whole = (value: number): string => value.toFixed(0);

  const report: string[] = [
    '\n=== Evaluation Statistics ===',
    `Model: ${stats.model_name}`,
    `Pass Rate: ${whole(stats.pass_rate)}%`,
    `Average Steps: ${whole(stats.avg_steps)}`,
    `Maximum Steps: ${stats.max_steps}`,
    `Minimum Steps: ${stats.min_steps}`,
    `Median Steps: ${whole(stats.median_steps)}`,
    `Average Tokens: ${whole(stats.avg_tokens)}`,
    `Median Tokens: ${whole(stats.median_tokens)}`,
    `Maximum Tokens: ${stats.max_tokens}`,
    `Minimum Tokens: ${stats.min_tokens}`,
    '===========================\n',
  ];

  for (const line of report) {
    console.log(line);
  }
}
|
||||
|
||||
async function getCurrentGitCommit(): Promise<string> {
|
||||
try {
|
||||
const {stdout} = await execAsync('git rev-parse --short HEAD');
|
||||
@@ -72,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise<void> {
|
||||
const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
|
||||
const results: EvaluationResult[] = [];
|
||||
const gitCommit = await getCurrentGitCommit();
|
||||
const outputFile = `eval-${gitCommit}.json`;
|
||||
const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown';
|
||||
const outputFile = `eval-${gitCommit}-${modelName}.json`;
|
||||
|
||||
// Process each question
|
||||
for (let i = 0; i < questions.length; i++) {
|
||||
const {question, answer: expectedAnswer} = questions[i];
|
||||
@@ -83,7 +142,7 @@ async function batchEvaluate(inputFile: string): Promise<void> {
|
||||
const {
|
||||
result: response,
|
||||
context
|
||||
} = await getResponse(question) as { result: AnswerAction; context: TrackerContext };
|
||||
} = await getResponse(question, 0) as { result: AnswerAction; context: TrackerContext };
|
||||
const actualAnswer = response.answer;
|
||||
|
||||
// Evaluate the response
|
||||
@@ -114,12 +173,19 @@ async function batchEvaluate(inputFile: string): Promise<void> {
|
||||
actual_answer: 'Error occurred'
|
||||
});
|
||||
}
|
||||
// Save results
|
||||
await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
|
||||
console.log(`\nEvaluation results saved to ${outputFile}`);
|
||||
}
|
||||
|
||||
// Calculate and print statistics
|
||||
const stats = calculateStats(results, modelName);
|
||||
printStats(stats);
|
||||
|
||||
// Save results
|
||||
await fs.writeFile(outputFile, JSON.stringify({
|
||||
results,
|
||||
statistics: stats
|
||||
}, null, 2));
|
||||
|
||||
console.log(`\nEvaluation results saved to ${outputFile}`);
|
||||
}
|
||||
|
||||
// Run batch evaluation if this is the main module
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[
|
||||
{
|
||||
"question": "what is jina ai ceo's twitter account",
|
||||
"answer": "@hxiao"
|
||||
"question": "what did jina ai ceo say about deepseek that went viral and became a meme?",
|
||||
"answer": "a side project"
|
||||
},
|
||||
{
|
||||
"question": "when was jina ai founded?",
|
||||
@@ -24,28 +24,28 @@
|
||||
"answer": "30"
|
||||
},
|
||||
{
|
||||
"question": "how much rate limit for r.jina.ai api without an api key?",
|
||||
"answer": "20 RPM (requests per minute)"
|
||||
"question": "when was jina reader released?",
|
||||
"answer": "April 2024"
|
||||
},
|
||||
{
|
||||
"question": "How many offices does Jina AI have and where are they?",
|
||||
"answer": "four: sunnyvale, berlin, beijing, shenzhen"
|
||||
},
|
||||
{
|
||||
"question": "Does jina reranker v2 support multilingual?",
|
||||
"answer": "Yes"
|
||||
"question": "what jina-colbert-v2 improves over jina-colbert-v1?",
|
||||
"answer": "v2 add multilingual support"
|
||||
},
|
||||
{
|
||||
"question": "who are the authors of jina-clip-v2 paper?",
|
||||
"answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao"
|
||||
},
|
||||
{
|
||||
"question": "what can you find in common between fashion-mnist and bert-as-service?",
|
||||
"answer": "Both are made by Han Xiao"
|
||||
"question": "who is the common author of fashion-mnist and node-deepresearch?",
|
||||
"answer": "Han Xiao"
|
||||
},
|
||||
{
|
||||
"question": "Which countries are the investors of Jina AI from?",
|
||||
"answer": "USA and China, but no German investors"
|
||||
"answer": "USA and China only, no German investors"
|
||||
},
|
||||
{
|
||||
"question": "what is the grounding api endpoint of jina ai?",
|
||||
@@ -56,24 +56,20 @@
|
||||
"answer": "jina-embeddings-v2-base-en and jina-clip-v1"
|
||||
},
|
||||
{
|
||||
"question": "How much is the 2024 yearbook that jina ai published?",
|
||||
"answer": "$35 USD"
|
||||
"question": "Can I purchase the 2024 yearbook that jina ai published today?",
|
||||
"answer": "No it is sold out."
|
||||
},
|
||||
{
|
||||
"question": "Any meme or crypto coin that announced by jina ai?",
|
||||
"answer": "No."
|
||||
"question": "How many free tokens do you get from a new jina api key?",
|
||||
"answer": "1 million."
|
||||
},
|
||||
{
|
||||
"question": "Who is the legal signatory of Jina AI gmbh?",
|
||||
"answer": "Jiao Liu"
|
||||
},
|
||||
{
|
||||
"question": "does node-deepresearch project support local LLMs?",
|
||||
"answer": "Yes."
|
||||
},
|
||||
{
|
||||
"question": "what is the name of the jina ai's mascot?",
|
||||
"answer": "Jina"
|
||||
"question": "which llm provider does node-deepresearch project support?",
|
||||
"answer": "Gemini, Openai and some local LLMs"
|
||||
},
|
||||
{
|
||||
"question": "what is the name of the jina ai's mascot?",
|
||||
|
||||
Reference in New Issue
Block a user