Paul Ascenzi 2025-02-07 09:10:09 -05:00
commit 825419d0b9
21 changed files with 495 additions and 313 deletions

1
.gitignore vendored
View File

@@ -5,6 +5,7 @@ knowledge.json
prompt-*.txt
queries.json
questions.json
eval-*.json
# Logs
logs

Dockerfile
View File

@@ -28,7 +28,8 @@ COPY package*.json ./
# Install production dependencies only
RUN npm install --production --ignore-scripts
# Copy built files from the build stage
# Copy config.json and built files from builder
COPY --from=builder /app/config.json ./
COPY --from=builder /app/dist ./dist
# Set environment variables (Recommended to set at runtime, avoid hardcoding)
@@ -41,4 +42,4 @@ ENV BRAVE_API_KEY=${BRAVE_API_KEY}
EXPOSE 3000
# Set startup command
CMD ["node", "./dist/server.js"]
CMD ["node", "./dist/server.js"]

README.md
View File

@@ -224,10 +224,28 @@ flowchart TD
## Evaluation
I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation.
I kept the evaluation simple: LLM-as-a-judge over some [ego questions](./src/evals/ego-questions.json). These are questions about Jina AI whose answers I know with 100% certainty, but LLMs do not.
I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer.
```bash
npm run eval ./src/evals/ego-questions.json
```
Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set.
Plain `gemini-2.0-flash` can be run by setting `tokenBudget` to zero, which skips the while-loop and answers the question directly.
It should not be surprising that plain `gemini-2.0-flash` has a 0% pass rate, as I intentionally filtered out the questions that LLMs can already answer.
| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch #5e80ed4 |
|--------|------------------|-------------------------------------------------|
| Pass Rate | 0% | 60% |
| Average Steps | 1 | 5 |
| Maximum Steps | 1 | 13 |
| Minimum Steps | 1 | 2 |
| Median Steps | 1 | 3 |
| Average Tokens | 428 | 59,408 |
| Median Tokens | 434 | 16,001 |
| Maximum Tokens | 463 | 347,222 |
| Minimum Tokens | 374 | 5,594 |
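For reference, here's a minimal sketch of how the two columns above can be reproduced programmatically. Only the exported `getResponse(question, tokenBudget)` entry point from `src/agent.ts` is real; the comparison harness itself is hypothetical.

```typescript
import { getResponse } from './src/agent';

// Hypothetical comparison harness: a zero token budget skips the
// search-read-reason loop and answers directly (the plain baseline),
// while the default budget runs the full agent loop.
async function compare(question: string) {
  const plain = await getResponse(question, 0);         // plain gemini-2.0-flash
  const agent = await getResponse(question, 1_000_000); // + node-deepresearch
  console.log('plain:', plain.result);
  console.log('agent:', agent.result);
}
```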

59
config.json Normal file
View File

@@ -0,0 +1,59 @@
{
"env": {
"https_proxy": "",
"OPENAI_BASE_URL": "",
"GEMINI_API_KEY": "",
"OPENAI_API_KEY": "",
"JINA_API_KEY": "",
"BRAVE_API_KEY": "",
"DEFAULT_MODEL_NAME": ""
},
"defaults": {
"search_provider": "jina",
"llm_provider": "gemini",
"step_sleep": 1000
},
"providers": {
"gemini": {
"createClient": "createGoogleGenerativeAI"
},
"openai": {
"createClient": "createOpenAI",
"clientConfig": {
"compatibility": "strict"
}
}
},
"models": {
"gemini": {
"default": {
"model": "gemini-2.0-flash",
"temperature": 0,
"maxTokens": 8000
},
"tools": {
"dedup": { "temperature": 0.1 },
"evaluator": {},
"errorAnalyzer": {},
"queryRewriter": { "temperature": 0.1 },
"agent": { "temperature": 0.7 },
"agentBeastMode": { "temperature": 0.7 }
}
},
"openai": {
"default": {
"model": "gpt-4o-mini",
"temperature": 0,
"maxTokens": 8000
},
"tools": {
"dedup": { "temperature": 0.1 },
"evaluator": {},
"errorAnalyzer": {},
"queryRewriter": { "temperature": 0.1 },
"agent": { "temperature": 0.7 },
"agentBeastMode": { "temperature": 0.7 }
}
}
}
}
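How these values are resolved at runtime is defined by the `src/config.ts` changes later in this commit: entries in the `env` block are overridden by `process.env`, and per-tool entries override the provider-wide `default`. Below is a minimal sketch of that resolution; `resolveToolConfig` is a hypothetical stand-in for the real `getToolConfig`.

```typescript
import configJson from './config.json';

type Provider = 'gemini' | 'openai';
type ToolName = keyof typeof configJson.models.gemini.tools;

// Sketch of the merge order: tool-level overrides fall back to the
// provider default, and DEFAULT_MODEL_NAME from the environment
// overrides the model name from config.json.
function resolveToolConfig(provider: Provider, toolName: ToolName) {
  const { default: defaults, tools } = configJson.models[provider];
  const overrides = tools[toolName] as { temperature?: number; maxTokens?: number };
  return {
    model: process.env.DEFAULT_MODEL_NAME || defaults.model,
    temperature: overrides.temperature ?? defaults.temperature,
    maxTokens: overrides.maxTokens ?? defaults.maxTokens,
  };
}

// resolveToolConfig('gemini', 'dedup')
// -> { model: 'gemini-2.0-flash', temperature: 0.1, maxTokens: 8000 }
```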

package.json
View File

@@ -18,7 +18,7 @@
"lint:fix": "eslint . --ext .ts --fix",
"serve": "ts-node src/server.ts",
"eval": "ts-node src/evals/batch-evals.ts",
"test": "jest",
"test": "jest --testTimeout=30000",
"test:watch": "jest --watch"
},
"keywords": [],

src/__tests__/agent.test.ts
View File

@@ -1,11 +1,15 @@
import { getResponse } from '../agent';
describe('getResponse', () => {
afterEach(() => {
jest.useRealTimers();
});
it('should handle search action', async () => {
const result = await getResponse('What is TypeScript?', 1000);
const result = await getResponse('What is TypeScript?', 10000);
expect(result.result.action).toBeDefined();
expect(result.context).toBeDefined();
expect(result.context.tokenTracker).toBeDefined();
expect(result.context.actionTracker).toBeDefined();
});
}, 30000);
});

src/__tests__/cli.test.ts
View File

@@ -1,40 +0,0 @@
import { exec } from 'child_process';
import { promisify } from 'util';
const execAsync = promisify(exec);
// Mock environment variables
process.env.GEMINI_API_KEY = 'test-key';
process.env.JINA_API_KEY = 'test-key';
jest.mock('../agent', () => ({
getResponse: jest.fn().mockResolvedValue({
result: {
action: 'answer',
answer: 'Test answer',
references: []
}
})
}));
describe('CLI', () => {
test('shows version', async () => {
const { stdout } = await execAsync('ts-node src/cli.ts --version');
expect(stdout.trim()).toMatch(/\d+\.\d+\.\d+/);
});
test('shows help', async () => {
const { stdout } = await execAsync('ts-node src/cli.ts --help');
expect(stdout).toContain('deepresearch');
expect(stdout).toContain('AI-powered research assistant');
});
test('handles invalid token budget', async () => {
try {
await execAsync('ts-node src/cli.ts -t invalid "test query"');
fail('Should have thrown');
} catch (error) {
expect((error as { stderr: string }).stderr).toContain('Invalid token budget: must be a number');
}
});
});

src/agent.ts
View File

@@ -7,14 +7,14 @@ import fs from 'fs/promises';
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
import {braveSearch} from "./tools/brave-search";
import {rewriteQuery} from "./tools/query-rewriter";
import {dedupQueries} from "./tools/dedup";
import {dedupQueries} from "./tools/jina-dedup";
import {evaluateAnswer} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction} from "./types";
import {TrackerContext} from "./types";
import {jinaSearch} from "./tools/jinaSearch";
import {search} from "./tools/jina-search";
async function sleep(ms: number) {
const seconds = Math.ceil(ms / 1000);
@@ -32,7 +32,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
if (allowSearch) {
actions.push("search");
properties.searchQuery = z.string().max(30)
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand. Existing queries must be avoided").optional();
}
if (allowAnswer) {
@@ -75,6 +75,7 @@ function getPrompt(
question: string,
context?: string[],
allQuestions?: string[],
allKeywords?: string[],
allowReflect: boolean = true,
allowAnswer: boolean = true,
allowRead: boolean = true,
@@ -190,11 +191,18 @@ ${urlList}
}
if (allowSearch) {
actionSections.push(`
<action-search>
- Query external sources using a public search engine
- Focus on solving one specific aspect of the question
- Only give keywords search query, not full sentences
${allKeywords?.length ? `
- Avoid the searched queries below as they do not give any useful information, you need to think out of the box and propose queries from a completely different angle:
<bad-queries>
${allKeywords.join('\n')}
</bad-queries>
`.trim() : ''}
- Propose some unique new queries that might help you find the answer to the question
- Focus on solving one specific aspect of the original question
- Only use keywords, not full sentences
</action-search>
`);
}
@@ -249,7 +257,11 @@ Critical Requirements:
- Exclude all non-JSON text, markdown, or explanations
- Maintain strict JSON syntax`);
return sections.join('\n\n');
return removeExtraLineBreaks(sections.join('\n\n'));
}
const removeExtraLineBreaks = (text: string) => {
return text.replace(/\n{2,}/gm, '\n\n');
}
const allContext: StepAction[] = []; // all steps in the current session, including those leads to wrong results
@@ -314,6 +326,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
currentQuestion,
diaryContext,
allQuestions,
allKeywords,
allowReflect,
allowAnswer,
allowRead,
@@ -497,7 +510,7 @@ But then you realized you have asked them before. You decided to think out of
switch (SEARCH_PROVIDER) {
case 'jina':
// use jinaSearch
results = {results: (await jinaSearch(query, context.tokenTracker)).response?.data || []};
results = {results: (await search(query, context.tokenTracker)).response?.data || []};
break;
case 'duck':
results = await duckSearch(query, {safeSearch: SafeSearchType.STRICT});
@@ -640,6 +653,7 @@ You decided to think out of the box or cut from a completely different angle.`);
question,
diaryContext,
allQuestions,
allKeywords,
false,
false,
false,
@ -652,7 +666,7 @@ You decided to think out of the box or cut from a completely different angle.`);
const model = getModel('agentBeastMode');
let object;
let totalTokens = 0;
let totalTokens;
try {
const result = await generateObject({
model,
@@ -667,10 +681,10 @@ You decided to think out of the box or cut from a completely different angle.`);
object = result.object;
totalTokens = result.totalTokens;
}
context.tokenTracker.trackUsage('agent', totalTokens);
await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
thisStep = object as StepAction;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.tokenTracker.trackUsage('agent', totalTokens);
console.log(thisStep)
return {result: thisStep, context};
}

src/config.ts
View File

@@ -1,50 +1,48 @@
import dotenv from 'dotenv';
import { ProxyAgent, setGlobalDispatcher } from 'undici';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
import {createOpenAI, OpenAIProviderSettings} from '@ai-sdk/openai';
export type LLMProvider = 'openai' | 'gemini';
export type ToolName = keyof ToolConfigs;
function isValidProvider(provider: string): provider is LLMProvider {
return provider === 'openai' || provider === 'gemini';
}
function validateModelConfig(config: ModelConfig, toolName: string): ModelConfig {
if (typeof config.model !== 'string' || config.model.length === 0) {
throw new Error(`Invalid model name for ${toolName}`);
}
if (typeof config.temperature !== 'number' || config.temperature < 0 || config.temperature > 1) {
throw new Error(`Invalid temperature for ${toolName}`);
}
if (typeof config.maxTokens !== 'number' || config.maxTokens <= 0) {
throw new Error(`Invalid maxTokens for ${toolName}`);
}
return config;
}
export interface ModelConfig {
model: string;
temperature: number;
maxTokens: number;
}
export interface ToolConfigs {
dedup: ModelConfig;
evaluator: ModelConfig;
errorAnalyzer: ModelConfig;
queryRewriter: ModelConfig;
agent: ModelConfig;
agentBeastMode: ModelConfig;
}
import { createOpenAI, OpenAIProviderSettings } from '@ai-sdk/openai';
import configJson from '../config.json';
// Load environment variables
dotenv.config();
// Setup the proxy globally if present
if (process.env.https_proxy) {
// Types
export type LLMProvider = 'openai' | 'gemini';
export type ToolName = keyof typeof configJson.models.gemini.tools;
// Type definitions for our config structure
type EnvConfig = typeof configJson.env;
interface ProviderConfigBase {
createClient: string;
}
interface OpenAIProviderConfig extends ProviderConfigBase {
clientConfig: {
compatibility: "strict" | "compatible";
};
}
interface GeminiProviderConfig extends ProviderConfigBase {}
type ProviderConfig = {
openai: OpenAIProviderConfig;
gemini: GeminiProviderConfig;
};
// Environment setup
const env: EnvConfig = { ...configJson.env };
(Object.keys(env) as (keyof EnvConfig)[]).forEach(key => {
if (process.env[key]) {
env[key] = process.env[key] || env[key];
}
});
// Setup proxy if present
if (env.https_proxy) {
try {
const proxyUrl = new URL(process.env.https_proxy).toString();
const proxyUrl = new URL(env.https_proxy).toString();
const dispatcher = new ProxyAgent({ uri: proxyUrl });
setGlobalDispatcher(dispatcher);
} catch (error) {
@@ -52,79 +50,73 @@ if (process.env.https_proxy) {
}
}
export const OPENAI_BASE_URL = process.env.OPENAI_BASE_URL;
export const GEMINI_API_KEY = process.env.GEMINI_API_KEY as string;
export const OPENAI_API_KEY = process.env.OPENAI_API_KEY as string;
export const JINA_API_KEY = process.env.JINA_API_KEY as string;
export const BRAVE_API_KEY = process.env.BRAVE_API_KEY as string;
export const SEARCH_PROVIDER: 'brave' | 'jina' | 'duck' = 'jina';
// Export environment variables
export const OPENAI_BASE_URL = env.OPENAI_BASE_URL;
export const GEMINI_API_KEY = env.GEMINI_API_KEY;
export const OPENAI_API_KEY = env.OPENAI_API_KEY;
export const JINA_API_KEY = env.JINA_API_KEY;
export const BRAVE_API_KEY = env.BRAVE_API_KEY;
export const SEARCH_PROVIDER = configJson.defaults.search_provider;
export const STEP_SLEEP = configJson.defaults.step_sleep;
// Determine LLM provider
export const LLM_PROVIDER: LLMProvider = (() => {
const provider = process.env.LLM_PROVIDER || 'gemini';
const provider = process.env.LLM_PROVIDER || configJson.defaults.llm_provider;
if (!isValidProvider(provider)) {
throw new Error(`Invalid LLM provider: ${provider}`);
}
return provider;
})();
const DEFAULT_GEMINI_MODEL = process.env.DEFAULT_MODEL_NAME || 'gemini-2.0-flash';
const DEFAULT_OPENAI_MODEL = process.env.DEFAULT_MODEL_NAME || 'gpt-4o-mini';
function isValidProvider(provider: string): provider is LLMProvider {
return provider === 'openai' || provider === 'gemini';
}
const defaultGeminiConfig: ModelConfig = {
model: DEFAULT_GEMINI_MODEL,
temperature: 0,
maxTokens: 8000
};
interface ToolConfig {
model: string;
temperature: number;
maxTokens: number;
}
const defaultOpenAIConfig: ModelConfig = {
model: DEFAULT_OPENAI_MODEL,
temperature: 0,
maxTokens: 8000
};
interface ToolOverrides {
temperature?: number;
maxTokens?: number;
}
export const modelConfigs: Record<LLMProvider, ToolConfigs> = {
gemini: {
dedup: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'dedup'),
evaluator: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'evaluator'),
errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'errorAnalyzer'),
queryRewriter: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'queryRewriter'),
agent: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agent'),
agentBeastMode: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agentBeastMode')
},
openai: {
dedup: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'dedup'),
evaluator: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'evaluator'),
errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'errorAnalyzer'),
queryRewriter: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'queryRewriter'),
agent: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agent'),
agentBeastMode: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agentBeastMode')
}
};
// Get tool configuration
export function getToolConfig(toolName: ToolName): ToolConfig {
const providerConfig = configJson.models[LLM_PROVIDER];
const defaultConfig = providerConfig.default;
const toolOverrides = providerConfig.tools[toolName] as ToolOverrides;
export function getToolConfig(toolName: ToolName): ModelConfig {
if (!modelConfigs[LLM_PROVIDER][toolName]) {
throw new Error(`Invalid tool name: ${toolName}`);
}
return modelConfigs[LLM_PROVIDER][toolName];
return {
model: process.env.DEFAULT_MODEL_NAME || defaultConfig.model,
temperature: toolOverrides.temperature ?? defaultConfig.temperature,
maxTokens: toolOverrides.maxTokens ?? defaultConfig.maxTokens
};
}
export function getMaxTokens(toolName: ToolName): number {
return getToolConfig(toolName).maxTokens;
}
// Get model instance
export function getModel(toolName: ToolName) {
const config = getToolConfig(toolName);
const providerConfig = configJson.providers[LLM_PROVIDER] as ProviderConfig[typeof LLM_PROVIDER];
if (LLM_PROVIDER === 'openai') {
if (!OPENAI_API_KEY) {
throw new Error('OPENAI_API_KEY not found');
}
const opt: OpenAIProviderSettings = {
apiKey: OPENAI_API_KEY,
compatibility: 'strict'
}
compatibility: (providerConfig as OpenAIProviderConfig).clientConfig.compatibility
};
if (OPENAI_BASE_URL) {
opt.baseURL = OPENAI_BASE_URL
opt.baseURL = OPENAI_BASE_URL;
}
return createOpenAI(opt)(config.model);
@@ -133,19 +125,36 @@ export function getModel(toolName: ToolName) {
if (!GEMINI_API_KEY) {
throw new Error('GEMINI_API_KEY not found');
}
return createGoogleGenerativeAI({ apiKey: GEMINI_API_KEY })(config.model);
}
export const STEP_SLEEP = 1000;
// Validate required environment variables
if (LLM_PROVIDER === 'gemini' && !GEMINI_API_KEY) throw new Error("GEMINI_API_KEY not found");
if (LLM_PROVIDER === 'openai' && !OPENAI_API_KEY) throw new Error("OPENAI_API_KEY not found");
if (!JINA_API_KEY) throw new Error("JINA_API_KEY not found");
console.log('LLM Provider:', LLM_PROVIDER)
if (LLM_PROVIDER === 'openai') {
console.log('OPENAI_BASE_URL', OPENAI_BASE_URL)
console.log('Default Model', DEFAULT_OPENAI_MODEL)
} else {
console.log('Default Model', DEFAULT_GEMINI_MODEL)
}
// Log all configurations
const configSummary = {
provider: {
name: LLM_PROVIDER,
model: LLM_PROVIDER === 'openai'
? configJson.models.openai.default.model
: configJson.models.gemini.default.model,
...(LLM_PROVIDER === 'openai' && { baseUrl: OPENAI_BASE_URL })
},
search: {
provider: SEARCH_PROVIDER
},
tools: Object.fromEntries(
Object.keys(configJson.models[LLM_PROVIDER].tools).map(name => [
name,
getToolConfig(name as ToolName)
])
),
defaults: {
stepSleep: STEP_SLEEP
}
};
console.log('Configuration Summary:', JSON.stringify(configSummary, null, 2));

src/evals/batch-evals.ts
View File

@@ -3,9 +3,10 @@ import {exec} from 'child_process';
import {promisify} from 'util';
import {getResponse} from '../agent';
import {generateObject} from 'ai';
import {getModel, getMaxTokens} from '../config';
import {GEMINI_API_KEY} from '../config';
import {z} from 'zod';
import {AnswerAction, TrackerContext} from "../types";
import {createGoogleGenerativeAI} from "@ai-sdk/google";
const execAsync = promisify(exec);
@@ -24,6 +25,63 @@ interface EvaluationResult {
actual_answer: string;
}
interface EvaluationStats {
model_name: string;
pass_rate: number;
avg_steps: number;
max_steps: number;
min_steps: number;
median_steps: number;
avg_tokens: number;
median_tokens: number;
max_tokens: number;
min_tokens: number;
}
function calculateMedian(numbers: number[]): number {
const sorted = [...numbers].sort((a, b) => a - b);
const middle = Math.floor(sorted.length / 2);
if (sorted.length % 2 === 0) {
return (sorted[middle - 1] + sorted[middle]) / 2;
}
return sorted[middle];
}
function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats {
const steps = results.map(r => r.total_steps);
const tokens = results.map(r => r.total_tokens);
const passCount = results.filter(r => r.pass).length;
return {
model_name: modelName,
pass_rate: (passCount / results.length) * 100,
avg_steps: steps.reduce((a, b) => a + b, 0) / steps.length,
max_steps: Math.max(...steps),
min_steps: Math.min(...steps),
median_steps: calculateMedian(steps),
avg_tokens: tokens.reduce((a, b) => a + b, 0) / tokens.length,
median_tokens: calculateMedian(tokens),
max_tokens: Math.max(...tokens),
min_tokens: Math.min(...tokens)
};
}
function printStats(stats: EvaluationStats): void {
console.log('\n=== Evaluation Statistics ===');
console.log(`Model: ${stats.model_name}`);
console.log(`Pass Rate: ${stats.pass_rate.toFixed(0)}%`);
console.log(`Average Steps: ${stats.avg_steps.toFixed(0)}`);
console.log(`Maximum Steps: ${stats.max_steps}`);
console.log(`Minimum Steps: ${stats.min_steps}`);
console.log(`Median Steps: ${stats.median_steps.toFixed(0)}`);
console.log(`Average Tokens: ${stats.avg_tokens.toFixed(0)}`);
console.log(`Median Tokens: ${stats.median_tokens.toFixed(0)}`);
console.log(`Maximum Tokens: ${stats.max_tokens}`);
console.log(`Minimum Tokens: ${stats.min_tokens}`);
console.log('===========================\n');
}
async function getCurrentGitCommit(): Promise<string> {
try {
const {stdout} = await execAsync('git rev-parse --short HEAD');
@@ -49,10 +107,10 @@ Minor wording differences are acceptable as long as the core information of the
try {
const result = await generateObject({
model: getModel('evaluator'),
model: createGoogleGenerativeAI({ apiKey: GEMINI_API_KEY })('gemini-2.0-flash'), // fix to gemini-2.0-flash for evaluation
schema,
prompt,
maxTokens: getMaxTokens('evaluator'),
maxTokens: 1000,
temperature: 0 // Setting temperature to 0 for deterministic output
});
@@ -71,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise<void> {
const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8'));
const results: EvaluationResult[] = [];
const gitCommit = await getCurrentGitCommit();
const outputFile = `eval-${gitCommit}.json`;
const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown';
const outputFile = `eval-${gitCommit}-${modelName}.json`;
// Process each question
for (let i = 0; i < questions.length; i++) {
const {question, answer: expectedAnswer} = questions[i];
@@ -113,12 +173,19 @@
actual_answer: 'Error occurred'
});
}
// Save results
await fs.writeFile(outputFile, JSON.stringify(results, null, 2));
console.log(`\nEvaluation results saved to ${outputFile}`);
}
// Calculate and print statistics
const stats = calculateStats(results, modelName);
printStats(stats);
// Save results
await fs.writeFile(outputFile, JSON.stringify({
results,
statistics: stats
}, null, 2));
console.log(`\nEvaluation results saved to ${outputFile}`);
}
// Run batch evaluation if this is the main module

src/evals/ego-questions.json
View File

@@ -1,7 +1,7 @@
[
{
"question": "what is jina ai ceo's twitter account",
"answer": "hxiao"
"question": "what did jina ai ceo say about deepseek that went viral and become a meme?",
"answer": "a side project"
},
{
"question": "when was jina ai founded?",
@@ -12,7 +12,7 @@
"answer": "ReaderLM-2.0"
},
{
"question": "what is the lastest blog post that jina ai published?",
"question": "what is the latest blog post that jina ai published?",
"answer": "A Practical Guide to Deploying Search Foundation Models in Production"
},
{
@@ -24,19 +24,59 @@
"answer": "30"
},
{
"question": "how much rate limit for r.jina.ai api without an api key?",
"answer": "20 RPM (requests per minute)"
"question": "when was jina reader released?",
"answer": "April 2024"
},
{
"question": "How many offices do Jina AI have and where are they?",
"answer": "four: sunnyvale, berlin, beijing, shenzhen"
},
{
"question": "Does jina reranker v2 support multilingual?",
"answer": "Yes"
"question": "what exactly jina-colbert-v2 improves over jina-colbert-v1?",
"answer": "v2 add multilingual support"
},
{
"question": "who are the authors of jina-clip-v2 paper?",
"answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao"
},
{
"question": "who created the node-deepresearch project?",
"answer": "Han Xiao / jina ai"
},
{
"question": "Which countries are the investors of Jina AI from?",
"answer": "USA and China only, no German investors"
},
{
"question": "what is the grounding api endpoint of jina ai?",
"answer": "g.jina.ai"
},
{
"question": "which of the following models do not support Matryoshka representation? jina-embeddings-v3, jina-embeddings-v2-base-en, jina-clip-v2, jina-clip-v1",
"answer": "jina-embeddings-v2-base-en and jina-clip-v1"
},
{
"question": "Can I purchase the 2024 yearbook that jina ai published today?",
"answer": "No it is sold out."
},
{
"question": "How many free tokens do you get from a new jina api key?",
"answer": "1 million."
},
{
"question": "Who is the legal signatory of Jina AI gmbh?",
"answer": "Jiao Liu"
},
{
"question": "what is the key idea behind node-deepresearch project?",
"answer": "It keeps searching, reading webpages, reasoning until an answer is found."
},
{
"question": "what is the name of the jina ai's mascot?",
"answer": "No, Jina AI does not have a mascot."
},
{
"question": "Does late chunking work with cls pooling?",
"answer": "No. late chunking only works with mean pooling."
}
]

src/tools/__tests__/brave-search.test.ts
View File

@@ -1,12 +0,0 @@
import { braveSearch } from '../brave-search';
describe('braveSearch', () => {
it('should return search results', async () => {
const { response } = await braveSearch('test query');
expect(response.web.results).toBeDefined();
expect(response.web.results.length).toBeGreaterThan(0);
expect(response.web.results[0]).toHaveProperty('title');
expect(response.web.results[0]).toHaveProperty('url');
expect(response.web.results[0]).toHaveProperty('description');
});
});

src/tools/__tests__/dedup.test.ts
View File

@@ -1,37 +0,0 @@
import { dedupQueries } from '../dedup';
import { LLMProvider } from '../../config';
describe('dedupQueries', () => {
const providers: Array<LLMProvider> = ['openai', 'gemini'];
const originalEnv = process.env;
beforeEach(() => {
jest.resetModules();
process.env = { ...originalEnv };
});
afterEach(() => {
process.env = originalEnv;
});
providers.forEach(provider => {
describe(`with ${provider} provider`, () => {
beforeEach(() => {
process.env.LLM_PROVIDER = provider;
});
it('should remove duplicate queries', async () => {
jest.setTimeout(10000);
const queries = ['typescript tutorial', 'typescript tutorial', 'javascript basics'];
const { unique_queries } = await dedupQueries(queries, []);
expect(unique_queries).toHaveLength(2);
expect(unique_queries).toContain('javascript basics');
});
it('should handle empty input', async () => {
const { unique_queries } = await dedupQueries([], []);
expect(unique_queries).toHaveLength(0);
});
});
});
});

src/tools/__tests__/error-analyzer.test.ts
View File

@@ -25,7 +25,7 @@ describe('analyzeSteps', () => {
expect(response).toHaveProperty('recap');
expect(response).toHaveProperty('blame');
expect(response).toHaveProperty('improvement');
});
}, 30000);
});
});
});

src/tools/__tests__/evaluator.test.ts
View File

@@ -32,25 +32,6 @@ describe('evaluateAnswer', () => {
expect(response).toHaveProperty('pass');
expect(response).toHaveProperty('think');
expect(response.type).toBe('definitive');
expect(response.pass).toBe(true);
});
it('should evaluate answer freshness', async () => {
const tokenTracker = new TokenTracker();
const { response } = await evaluateAnswer(
'What is the latest version of Node.js?',
'The latest version of Node.js is 14.0.0, released in April 2020.',
['freshness'],
tokenTracker
);
expect(response).toHaveProperty('pass');
expect(response).toHaveProperty('think');
expect(response.type).toBe('freshness');
expect(response.freshness_analysis).toBeDefined();
expect(response.freshness_analysis?.likely_outdated).toBe(true);
expect(response.freshness_analysis?.dates_mentioned).toContain('2020-04');
expect(response.freshness_analysis?.current_time).toBeDefined();
expect(response.pass).toBe(false);
});
it('should evaluate answer plurality', async () => {
@@ -64,38 +45,7 @@ describe('evaluateAnswer', () => {
expect(response).toHaveProperty('pass');
expect(response).toHaveProperty('think');
expect(response.type).toBe('plurality');
expect(response.plurality_analysis).toBeDefined();
expect(response.plurality_analysis?.expects_multiple).toBe(true);
expect(response.plurality_analysis?.provides_multiple).toBe(false);
expect(response.plurality_analysis?.count_expected).toBe(3);
expect(response.plurality_analysis?.count_provided).toBe(1);
expect(response.pass).toBe(false);
});
it('should evaluate in order and stop at first failure', async () => {
const tokenTracker = new TokenTracker();
const { response } = await evaluateAnswer(
'List the latest Node.js versions.',
'I am not sure about the Node.js versions.',
['definitive', 'freshness', 'plurality'],
tokenTracker
);
expect(response.type).toBe('definitive');
expect(response.pass).toBe(false);
expect(response.freshness_analysis).toBeUndefined();
expect(response.plurality_analysis).toBeUndefined();
});
it('should track token usage', async () => {
const tokenTracker = new TokenTracker();
const spy = jest.spyOn(tokenTracker, 'trackUsage');
await evaluateAnswer(
'What is TypeScript?',
'TypeScript is a strongly typed programming language that builds on JavaScript.',
['definitive', 'freshness', 'plurality'],
tokenTracker
);
expect(spy).toHaveBeenCalledWith('evaluator', expect.any(Number));
});
});
});

src/tools/__tests__/query-rewriter.test.ts
View File

@@ -1,34 +0,0 @@
import { rewriteQuery } from '../query-rewriter';
import { LLMProvider } from '../../config';
describe('rewriteQuery', () => {
const providers: Array<LLMProvider> = ['openai', 'gemini'];
const originalEnv = process.env;
beforeEach(() => {
jest.resetModules();
process.env = { ...originalEnv };
});
afterEach(() => {
process.env = originalEnv;
});
providers.forEach(provider => {
describe(`with ${provider} provider`, () => {
beforeEach(() => {
process.env.LLM_PROVIDER = provider;
});
it('should rewrite search query', async () => {
const { queries } = await rewriteQuery({
action: 'search',
searchQuery: 'how does typescript work',
think: 'Understanding TypeScript basics'
});
expect(Array.isArray(queries)).toBe(true);
expect(queries.length).toBeGreaterThan(0);
});
});
});
});

src/tools/__tests__/jina-search.test.ts
View File

@@ -1,10 +1,10 @@
import { jinaSearch } from '../jinaSearch';
import { search } from '../jina-search';
import { TokenTracker } from '../../utils/token-tracker';
describe('search', () => {
it.skip('should perform search with Jina API (skipped due to insufficient balance)', async () => {
const tokenTracker = new TokenTracker();
const { response } = await jinaSearch('TypeScript programming', tokenTracker);
const { response } = await search('TypeScript programming', tokenTracker);
expect(response).toBeDefined();
expect(response.data).toBeDefined();
if (response.data === null) {
@@ -15,7 +15,7 @@ describe('search', () => {
}, 15000);
it('should handle empty query', async () => {
await expect(jinaSearch('')).rejects.toThrow();
await expect(search('')).rejects.toThrow();
}, 15000);
beforeEach(() => {

src/tools/evaluator.ts
View File

@@ -285,14 +285,13 @@ export async function evaluateAnswer(
break;
}
} catch (error) {
console.error(`Error in ${evaluationType} evaluation:`, error);
const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
(tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
if (!errorResult.object.pass) {
return { response: errorResult.object };
}
// Always return from catch block to prevent undefined result
return { response: errorResult.object };
}
}
// Only reach this point if all evaluations pass
return { response: result!.object };
}

143
src/tools/jina-dedup.ts Normal file
View File

@@ -0,0 +1,143 @@
import axios from 'axios';
import { TokenTracker } from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity
// Types for Jina API
interface JinaEmbeddingRequest {
model: string;
input: string[];
}
interface JinaEmbeddingResponse {
model: string;
object: string;
usage: {
total_tokens: number;
prompt_tokens: number;
};
data: Array<{
object: string;
index: number;
embedding: number[];
}>;
}
// Compute cosine similarity between two vectors
function cosineSimilarity(vecA: number[], vecB: number[]): number {
const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
const normA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
const normB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
return dotProduct / (normA * normB);
}
// Get embeddings for all queries in one batch
async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
if (!JINA_API_KEY) {
throw new Error('JINA_API_KEY is not set');
}
const request: JinaEmbeddingRequest = {
model: 'jina-embeddings-v3',
input: queries
};
try {
const response = await axios.post<JinaEmbeddingResponse>(
JINA_API_URL,
request,
{
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`
}
}
);
// Sort embeddings by index to maintain original order
const embeddings = response.data.data
.sort((a, b) => a.index - b.index)
.map(item => item.embedding);
return {
embeddings,
tokens: response.data.usage.total_tokens
};
} catch (error) {
console.error('Error getting embeddings from Jina:', error);
throw error;
}
}
export async function dedupQueries(
newQueries: string[],
existingQueries: string[],
tracker?: TokenTracker
): Promise<{ unique_queries: string[], tokens: number }> {
try {
// Quick return for single new query with no existing queries
if (newQueries.length === 1 && existingQueries.length === 0) {
console.log('Dedup (quick return):', newQueries);
return {
unique_queries: newQueries,
tokens: 0 // No tokens used since we didn't call the API
};
}
// Get embeddings for all queries in one batch
const allQueries = [...newQueries, ...existingQueries];
const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries);
// Split embeddings back into new and existing
const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
const existingEmbeddings = allEmbeddings.slice(newQueries.length);
const uniqueQueries: string[] = [];
const usedIndices = new Set<number>();
// Compare each new query against existing queries and already accepted queries
for (let i = 0; i < newQueries.length; i++) {
let isUnique = true;
// Check against existing queries
for (let j = 0; j < existingQueries.length; j++) {
const similarity = cosineSimilarity(newEmbeddings[i], existingEmbeddings[j]);
if (similarity >= SIMILARITY_THRESHOLD) {
isUnique = false;
break;
}
}
// Check against already accepted queries
if (isUnique) {
for (const usedIndex of usedIndices) {
const similarity = cosineSimilarity(newEmbeddings[i], newEmbeddings[usedIndex]);
if (similarity >= SIMILARITY_THRESHOLD) {
isUnique = false;
break;
}
}
}
// Add to unique queries if passed all checks
if (isUnique) {
uniqueQueries.push(newQueries[i]);
usedIndices.add(i);
}
}
// Track token usage from the API
(tracker || new TokenTracker()).trackUsage('dedup', tokens);
console.log('Dedup:', uniqueQueries);
return {
unique_queries: uniqueQueries,
tokens
};
} catch (error) {
console.error('Error in deduplication analysis:', error);
throw error;
}
}
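For illustration, a usage sketch of the new embedding-based dedup; the query strings are made up.

```typescript
import { dedupQueries } from './src/tools/jina-dedup';

async function demo() {
  // A new candidate is dropped when its cosine similarity to an existing
  // query (or to an already-accepted candidate) reaches the 0.93 threshold.
  const { unique_queries, tokens } = await dedupQueries(
    ['jina ai founding year', 'typescript generics tutorial'], // new candidates
    ['when was jina ai founded']                               // already-searched queries
  );
  console.log(unique_queries, tokens); // likely ['typescript generics tutorial']
}
```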

src/tools/jina-search.ts
View File

@@ -3,7 +3,7 @@ import { TokenTracker } from "../utils/token-tracker";
import { SearchResponse } from '../types';
import { JINA_API_KEY } from "../config";
export function jinaSearch(query: string, tracker?: TokenTracker): Promise<{ response: SearchResponse, tokens: number }> {
export function search(query: string, tracker?: TokenTracker): Promise<{ response: SearchResponse, tokens: number }> {
return new Promise((resolve, reject) => {
if (!query.trim()) {
reject(new Error('Query cannot be empty'));

src/tools/query-rewriter.ts
View File

@@ -18,7 +18,7 @@ const responseSchema = z.object({
function getPrompt(action: SearchAction): string {
return `You are an expert Information Retrieval Assistant. Transform user queries into precise keyword combinations with strategic reasoning and appropriate search operators.
return `You are an expert Information Retrieval query optimizer. Optimize user queries into precise keyword combinations with strategic reasoning and appropriate search operators.
<rules>
1. Generate search queries that directly include appropriate operators
@@ -61,7 +61,7 @@ Input Query: How to fix a leaking kitchen faucet?
<think>
This is a how-to query seeking practical solutions. User likely wants step-by-step guidance and visual demonstrations for DIY repair. We'll target both video tutorials and written guides.
</think>
Queries: [
Output Queries: [
"kitchen faucet leak repair",
"faucet drip fix site:youtube.com",
"how to repair faucet "
@@ -71,7 +71,7 @@ Input Query: What are healthy breakfast options for type 2 diabetes?
<think>
This is a health-specific informational query. User needs authoritative medical advice combined with practical meal suggestions. Splitting into medical guidelines and recipes will provide comprehensive coverage.
</think>
Queries: [
Output Queries: [
"what to eat for type 2 diabetes",
"type 2 diabetes breakfast guidelines",
"diabetic breakfast recipes"
@@ -81,7 +81,7 @@ Input Query: Latest AWS Lambda features for serverless applications
<think>
This is a product research query focused on recent updates. User wants current information about specific technology features, likely for implementation purposes. We'll target official docs and community insights.
</think>
Queries: [
Output Queries: [
"aws lambda features site:aws.amazon.com intitle:2025",
"new features lambda serverless"
]