diff --git a/.gitignore b/.gitignore index 598fa38..56ef11f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ knowledge.json prompt-*.txt queries.json questions.json +eval-*.json # Logs logs diff --git a/Dockerfile b/Dockerfile index 60084d3..707194e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,8 @@ COPY package*.json ./ # Install production dependencies only RUN npm install --production --ignore-scripts -# Copy built files from the build stage +# Copy config.json and built files from builder +COPY --from=builder /app/config.json ./ COPY --from=builder /app/dist ./dist # Set environment variables (Recommended to set at runtime, avoid hardcoding) @@ -41,4 +42,4 @@ ENV BRAVE_API_KEY=${BRAVE_API_KEY} EXPOSE 3000 # Set startup command -CMD ["node", "./dist/server.js"] \ No newline at end of file +CMD ["node", "./dist/server.js"] diff --git a/README.md b/README.md index 71bd23f..f08fbb5 100644 --- a/README.md +++ b/README.md @@ -224,10 +224,28 @@ flowchart TD ## Evaluation -I kept the evaluation simple, LLM-as-a-judge and collect some ego questions (i.e. questions about Jina AI that I know 100% the answer) for evaluation. +I kept the evaluation simple, LLM-as-a-judge and collect some [ego questions](./src/evals/ego-questions.json) for evaluation. These are the questions about Jina AI that I know 100% the answer but LLMs do not. I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer. ```bash npm run eval ./src/evals/ego-questions.json -``` \ No newline at end of file +``` + +Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set. + +Plain `gemini-2.0-flash` can be run by setting `tokenBudget` to zero, skipping the while-loop and directly answering the question. + +It should not be surprised that plain `gemini-2.0-flash` has a 0% pass rate, as I intentionally filtered out the questions that LLMs can answer. + +| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch (#5e80ed4) | +|--------|------------------|-------------------------------------------------| +| Pass Rate | 0% | 60% | +| Average Steps | 1 | 5 | +| Maximum Steps | 1 | 13 | +| Minimum Steps | 1 | 2 | +| Median Steps | 1 | 3 | +| Average Tokens | 428 | 59,408 | +| Median Tokens | 434 | 16,001 | +| Maximum Tokens | 463 | 347,222 | +| Minimum Tokens | 374 | 5,594 | \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..04be083 --- /dev/null +++ b/config.json @@ -0,0 +1,59 @@ +{ + "env": { + "https_proxy": "", + "OPENAI_BASE_URL": "", + "GEMINI_API_KEY": "", + "OPENAI_API_KEY": "", + "JINA_API_KEY": "", + "BRAVE_API_KEY": "", + "DEFAULT_MODEL_NAME": "" + }, + "defaults": { + "search_provider": "jina", + "llm_provider": "gemini", + "step_sleep": 1000 + }, + "providers": { + "gemini": { + "createClient": "createGoogleGenerativeAI" + }, + "openai": { + "createClient": "createOpenAI", + "clientConfig": { + "compatibility": "strict" + } + } + }, + "models": { + "gemini": { + "default": { + "model": "gemini-2.0-flash", + "temperature": 0, + "maxTokens": 8000 + }, + "tools": { + "dedup": { "temperature": 0.1 }, + "evaluator": {}, + "errorAnalyzer": {}, + "queryRewriter": { "temperature": 0.1 }, + "agent": { "temperature": 0.7 }, + "agentBeastMode": { "temperature": 0.7 } + } + }, + "openai": { + "default": { + "model": "gpt-4o-mini", + "temperature": 0, + "maxTokens": 8000 + }, + "tools": { + "dedup": { "temperature": 0.1 }, + "evaluator": {}, + "errorAnalyzer": {}, + "queryRewriter": { "temperature": 0.1 }, + "agent": { "temperature": 0.7 }, + "agentBeastMode": { "temperature": 0.7 } + } + } + } +} diff --git a/package.json b/package.json index 02aec38..7681517 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "lint:fix": "eslint . --ext .ts --fix", "serve": "ts-node src/server.ts", "eval": "ts-node src/evals/batch-evals.ts", - "test": "jest", + "test": "jest --testTimeout=30000", "test:watch": "jest --watch" }, "keywords": [], diff --git a/src/__tests__/agent.test.ts b/src/__tests__/agent.test.ts index b6e86fd..35ff465 100644 --- a/src/__tests__/agent.test.ts +++ b/src/__tests__/agent.test.ts @@ -1,11 +1,15 @@ import { getResponse } from '../agent'; describe('getResponse', () => { + afterEach(() => { + jest.useRealTimers(); + }); + it('should handle search action', async () => { - const result = await getResponse('What is TypeScript?', 1000); + const result = await getResponse('What is TypeScript?', 10000); expect(result.result.action).toBeDefined(); expect(result.context).toBeDefined(); expect(result.context.tokenTracker).toBeDefined(); expect(result.context.actionTracker).toBeDefined(); - }); + }, 30000); }); diff --git a/src/__tests__/cli.test.ts b/src/__tests__/cli.test.ts deleted file mode 100644 index 7051679..0000000 --- a/src/__tests__/cli.test.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { exec } from 'child_process'; -import { promisify } from 'util'; - -const execAsync = promisify(exec); - -// Mock environment variables -process.env.GEMINI_API_KEY = 'test-key'; -process.env.JINA_API_KEY = 'test-key'; - -jest.mock('../agent', () => ({ - getResponse: jest.fn().mockResolvedValue({ - result: { - action: 'answer', - answer: 'Test answer', - references: [] - } - }) -})); - -describe('CLI', () => { - test('shows version', async () => { - const { stdout } = await execAsync('ts-node src/cli.ts --version'); - expect(stdout.trim()).toMatch(/\d+\.\d+\.\d+/); - }); - - test('shows help', async () => { - const { stdout } = await execAsync('ts-node src/cli.ts --help'); - expect(stdout).toContain('deepresearch'); - expect(stdout).toContain('AI-powered research assistant'); - }); - - test('handles invalid token budget', async () => { - try { - await execAsync('ts-node src/cli.ts -t invalid "test query"'); - fail('Should have thrown'); - } catch (error) { - expect((error as { stderr: string }).stderr).toContain('Invalid token budget: must be a number'); - } - }); -}); diff --git a/src/agent.ts b/src/agent.ts index 2e8d968..e9ba197 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -7,14 +7,14 @@ import fs from 'fs/promises'; import {SafeSearchType, search as duckSearch} from "duck-duck-scrape"; import {braveSearch} from "./tools/brave-search"; import {rewriteQuery} from "./tools/query-rewriter"; -import {dedupQueries} from "./tools/dedup"; +import {dedupQueries} from "./tools/jina-dedup"; import {evaluateAnswer} from "./tools/evaluator"; import {analyzeSteps} from "./tools/error-analyzer"; import {TokenTracker} from "./utils/token-tracker"; import {ActionTracker} from "./utils/action-tracker"; import {StepAction, AnswerAction} from "./types"; import {TrackerContext} from "./types"; -import {jinaSearch} from "./tools/jinaSearch"; +import {search} from "./tools/jina-search"; async function sleep(ms: number) { const seconds = Math.ceil(ms / 1000); @@ -32,7 +32,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole if (allowSearch) { actions.push("search"); properties.searchQuery = z.string().max(30) - .describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional(); + .describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand. Existing queries must be avoided").optional(); } if (allowAnswer) { @@ -75,6 +75,7 @@ function getPrompt( question: string, context?: string[], allQuestions?: string[], + allKeywords?: string[], allowReflect: boolean = true, allowAnswer: boolean = true, allowRead: boolean = true, @@ -190,11 +191,18 @@ ${urlList} } if (allowSearch) { + actionSections.push(` -- Query external sources using a public search engine -- Focus on solving one specific aspect of the question -- Only give keywords search query, not full sentences +${allKeywords?.length ? ` +- Avoid the searched queries below as they do not give any useful information, you need to think out of the box and propose queries from a completely different angle: + +${allKeywords.join('\n')} + +`.trim() : ''} +- Propose some unique new queries that might help you find the answer to the question +- Focus on solving one specific aspect of the original question +- Only use keywords, not full sentences `); } @@ -249,7 +257,11 @@ Critical Requirements: - Exclude all non-JSON text, markdown, or explanations - Maintain strict JSON syntax`); - return sections.join('\n\n'); + return removeExtraLineBreaks(sections.join('\n\n')); +} + +const removeExtraLineBreaks = (text: string) => { + return text.replace(/\n{2,}/gm, '\n\n'); } const allContext: StepAction[] = []; // all steps in the current session, including those leads to wrong results @@ -314,6 +326,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ currentQuestion, diaryContext, allQuestions, + allKeywords, allowReflect, allowAnswer, allowRead, @@ -497,7 +510,7 @@ But then you realized you have asked them before. You decided to to think out of switch (SEARCH_PROVIDER) { case 'jina': // use jinaSearch - results = {results: (await jinaSearch(query, context.tokenTracker)).response?.data || []}; + results = {results: (await search(query, context.tokenTracker)).response?.data || []}; break; case 'duck': results = await duckSearch(query, {safeSearch: SafeSearchType.STRICT}); @@ -640,6 +653,7 @@ You decided to think out of the box or cut from a completely different angle.`); question, diaryContext, allQuestions, + allKeywords, false, false, false, @@ -652,7 +666,7 @@ You decided to think out of the box or cut from a completely different angle.`); const model = getModel('agentBeastMode'); let object; - let totalTokens = 0; + let totalTokens; try { const result = await generateObject({ model, @@ -667,10 +681,10 @@ You decided to think out of the box or cut from a completely different angle.`); object = result.object; totalTokens = result.totalTokens; } - context.tokenTracker.trackUsage('agent', totalTokens); - await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); thisStep = object as StepAction; + context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts}); + context.tokenTracker.trackUsage('agent', totalTokens); console.log(thisStep) return {result: thisStep, context}; } diff --git a/src/config.ts b/src/config.ts index 7a5a6b2..f1f2b40 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,50 +1,48 @@ import dotenv from 'dotenv'; import { ProxyAgent, setGlobalDispatcher } from 'undici'; import { createGoogleGenerativeAI } from '@ai-sdk/google'; -import {createOpenAI, OpenAIProviderSettings} from '@ai-sdk/openai'; - -export type LLMProvider = 'openai' | 'gemini'; -export type ToolName = keyof ToolConfigs; - -function isValidProvider(provider: string): provider is LLMProvider { - return provider === 'openai' || provider === 'gemini'; -} - -function validateModelConfig(config: ModelConfig, toolName: string): ModelConfig { - if (typeof config.model !== 'string' || config.model.length === 0) { - throw new Error(`Invalid model name for ${toolName}`); - } - if (typeof config.temperature !== 'number' || config.temperature < 0 || config.temperature > 1) { - throw new Error(`Invalid temperature for ${toolName}`); - } - if (typeof config.maxTokens !== 'number' || config.maxTokens <= 0) { - throw new Error(`Invalid maxTokens for ${toolName}`); - } - return config; -} - -export interface ModelConfig { - model: string; - temperature: number; - maxTokens: number; -} - -export interface ToolConfigs { - dedup: ModelConfig; - evaluator: ModelConfig; - errorAnalyzer: ModelConfig; - queryRewriter: ModelConfig; - agent: ModelConfig; - agentBeastMode: ModelConfig; -} - +import { createOpenAI, OpenAIProviderSettings } from '@ai-sdk/openai'; +import configJson from '../config.json'; +// Load environment variables dotenv.config(); -// Setup the proxy globally if present -if (process.env.https_proxy) { +// Types +export type LLMProvider = 'openai' | 'gemini'; +export type ToolName = keyof typeof configJson.models.gemini.tools; + +// Type definitions for our config structure +type EnvConfig = typeof configJson.env; + +interface ProviderConfigBase { + createClient: string; +} + +interface OpenAIProviderConfig extends ProviderConfigBase { + clientConfig: { + compatibility: "strict" | "compatible"; + }; +} + +interface GeminiProviderConfig extends ProviderConfigBase {} + +type ProviderConfig = { + openai: OpenAIProviderConfig; + gemini: GeminiProviderConfig; +}; + +// Environment setup +const env: EnvConfig = { ...configJson.env }; +(Object.keys(env) as (keyof EnvConfig)[]).forEach(key => { + if (process.env[key]) { + env[key] = process.env[key] || env[key]; + } +}); + +// Setup proxy if present +if (env.https_proxy) { try { - const proxyUrl = new URL(process.env.https_proxy).toString(); + const proxyUrl = new URL(env.https_proxy).toString(); const dispatcher = new ProxyAgent({ uri: proxyUrl }); setGlobalDispatcher(dispatcher); } catch (error) { @@ -52,79 +50,73 @@ if (process.env.https_proxy) { } } -export const OPENAI_BASE_URL = process.env.OPENAI_BASE_URL; -export const GEMINI_API_KEY = process.env.GEMINI_API_KEY as string; -export const OPENAI_API_KEY = process.env.OPENAI_API_KEY as string; -export const JINA_API_KEY = process.env.JINA_API_KEY as string; -export const BRAVE_API_KEY = process.env.BRAVE_API_KEY as string; -export const SEARCH_PROVIDER: 'brave' | 'jina' | 'duck' = 'jina'; +// Export environment variables +export const OPENAI_BASE_URL = env.OPENAI_BASE_URL; +export const GEMINI_API_KEY = env.GEMINI_API_KEY; +export const OPENAI_API_KEY = env.OPENAI_API_KEY; +export const JINA_API_KEY = env.JINA_API_KEY; +export const BRAVE_API_KEY = env.BRAVE_API_KEY; +export const SEARCH_PROVIDER = configJson.defaults.search_provider; +export const STEP_SLEEP = configJson.defaults.step_sleep; + +// Determine LLM provider export const LLM_PROVIDER: LLMProvider = (() => { - const provider = process.env.LLM_PROVIDER || 'gemini'; + const provider = process.env.LLM_PROVIDER || configJson.defaults.llm_provider; if (!isValidProvider(provider)) { throw new Error(`Invalid LLM provider: ${provider}`); } return provider; })(); -const DEFAULT_GEMINI_MODEL = process.env.DEFAULT_MODEL_NAME || 'gemini-2.0-flash'; -const DEFAULT_OPENAI_MODEL = process.env.DEFAULT_MODEL_NAME || 'gpt-4o-mini'; +function isValidProvider(provider: string): provider is LLMProvider { + return provider === 'openai' || provider === 'gemini'; +} -const defaultGeminiConfig: ModelConfig = { - model: DEFAULT_GEMINI_MODEL, - temperature: 0, - maxTokens: 8000 -}; +interface ToolConfig { + model: string; + temperature: number; + maxTokens: number; +} -const defaultOpenAIConfig: ModelConfig = { - model: DEFAULT_OPENAI_MODEL, - temperature: 0, - maxTokens: 8000 -}; +interface ToolOverrides { + temperature?: number; + maxTokens?: number; +} -export const modelConfigs: Record = { - gemini: { - dedup: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'dedup'), - evaluator: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'evaluator'), - errorAnalyzer: validateModelConfig({ ...defaultGeminiConfig, temperature: 0 }, 'errorAnalyzer'), - queryRewriter: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.1 }, 'queryRewriter'), - agent: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agent'), - agentBeastMode: validateModelConfig({ ...defaultGeminiConfig, temperature: 0.7 }, 'agentBeastMode') - }, - openai: { - dedup: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'dedup'), - evaluator: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'evaluator'), - errorAnalyzer: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0 }, 'errorAnalyzer'), - queryRewriter: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.1 }, 'queryRewriter'), - agent: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agent'), - agentBeastMode: validateModelConfig({ ...defaultOpenAIConfig, temperature: 0.7 }, 'agentBeastMode') - } -}; +// Get tool configuration +export function getToolConfig(toolName: ToolName): ToolConfig { + const providerConfig = configJson.models[LLM_PROVIDER]; + const defaultConfig = providerConfig.default; + const toolOverrides = providerConfig.tools[toolName] as ToolOverrides; -export function getToolConfig(toolName: ToolName): ModelConfig { - if (!modelConfigs[LLM_PROVIDER][toolName]) { - throw new Error(`Invalid tool name: ${toolName}`); - } - return modelConfigs[LLM_PROVIDER][toolName]; + return { + model: process.env.DEFAULT_MODEL_NAME || defaultConfig.model, + temperature: toolOverrides.temperature ?? defaultConfig.temperature, + maxTokens: toolOverrides.maxTokens ?? defaultConfig.maxTokens + }; } export function getMaxTokens(toolName: ToolName): number { return getToolConfig(toolName).maxTokens; } - +// Get model instance export function getModel(toolName: ToolName) { const config = getToolConfig(toolName); + const providerConfig = configJson.providers[LLM_PROVIDER] as ProviderConfig[typeof LLM_PROVIDER]; if (LLM_PROVIDER === 'openai') { if (!OPENAI_API_KEY) { throw new Error('OPENAI_API_KEY not found'); } + const opt: OpenAIProviderSettings = { apiKey: OPENAI_API_KEY, - compatibility: 'strict' - } + compatibility: (providerConfig as OpenAIProviderConfig).clientConfig.compatibility + }; + if (OPENAI_BASE_URL) { - opt.baseURL = OPENAI_BASE_URL + opt.baseURL = OPENAI_BASE_URL; } return createOpenAI(opt)(config.model); @@ -133,19 +125,36 @@ export function getModel(toolName: ToolName) { if (!GEMINI_API_KEY) { throw new Error('GEMINI_API_KEY not found'); } + return createGoogleGenerativeAI({ apiKey: GEMINI_API_KEY })(config.model); } -export const STEP_SLEEP = 1000; - +// Validate required environment variables if (LLM_PROVIDER === 'gemini' && !GEMINI_API_KEY) throw new Error("GEMINI_API_KEY not found"); if (LLM_PROVIDER === 'openai' && !OPENAI_API_KEY) throw new Error("OPENAI_API_KEY not found"); if (!JINA_API_KEY) throw new Error("JINA_API_KEY not found"); -console.log('LLM Provider:', LLM_PROVIDER) -if (LLM_PROVIDER === 'openai') { - console.log('OPENAI_BASE_URL', OPENAI_BASE_URL) - console.log('Default Model', DEFAULT_OPENAI_MODEL) -} else { - console.log('Default Model', DEFAULT_GEMINI_MODEL) -} \ No newline at end of file +// Log all configurations +const configSummary = { + provider: { + name: LLM_PROVIDER, + model: LLM_PROVIDER === 'openai' + ? configJson.models.openai.default.model + : configJson.models.gemini.default.model, + ...(LLM_PROVIDER === 'openai' && { baseUrl: OPENAI_BASE_URL }) + }, + search: { + provider: SEARCH_PROVIDER + }, + tools: Object.fromEntries( + Object.keys(configJson.models[LLM_PROVIDER].tools).map(name => [ + name, + getToolConfig(name as ToolName) + ]) + ), + defaults: { + stepSleep: STEP_SLEEP + } +}; + +console.log('Configuration Summary:', JSON.stringify(configSummary, null, 2)); diff --git a/src/evals/batch-evals.ts b/src/evals/batch-evals.ts index 9f4c219..526b027 100644 --- a/src/evals/batch-evals.ts +++ b/src/evals/batch-evals.ts @@ -3,9 +3,10 @@ import {exec} from 'child_process'; import {promisify} from 'util'; import {getResponse} from '../agent'; import {generateObject} from 'ai'; -import {getModel, getMaxTokens} from '../config'; +import {GEMINI_API_KEY} from '../config'; import {z} from 'zod'; import {AnswerAction, TrackerContext} from "../types"; +import {createGoogleGenerativeAI} from "@ai-sdk/google"; const execAsync = promisify(exec); @@ -24,6 +25,63 @@ interface EvaluationResult { actual_answer: string; } +interface EvaluationStats { + model_name: string; + pass_rate: number; + avg_steps: number; + max_steps: number; + min_steps: number; + median_steps: number; + avg_tokens: number; + median_tokens: number; + max_tokens: number; + min_tokens: number; +} + +function calculateMedian(numbers: number[]): number { + const sorted = [...numbers].sort((a, b) => a - b); + const middle = Math.floor(sorted.length / 2); + + if (sorted.length % 2 === 0) { + return (sorted[middle - 1] + sorted[middle]) / 2; + } + return sorted[middle]; +} + +function calculateStats(results: EvaluationResult[], modelName: string): EvaluationStats { + const steps = results.map(r => r.total_steps); + const tokens = results.map(r => r.total_tokens); + const passCount = results.filter(r => r.pass).length; + + return { + model_name: modelName, + pass_rate: (passCount / results.length) * 100, + avg_steps: steps.reduce((a, b) => a + b, 0) / steps.length, + max_steps: Math.max(...steps), + min_steps: Math.min(...steps), + median_steps: calculateMedian(steps), + avg_tokens: tokens.reduce((a, b) => a + b, 0) / tokens.length, + median_tokens: calculateMedian(tokens), + max_tokens: Math.max(...tokens), + min_tokens: Math.min(...tokens) + }; +} + +function printStats(stats: EvaluationStats): void { + console.log('\n=== Evaluation Statistics ==='); + console.log(`Model: ${stats.model_name}`); + console.log(`Pass Rate: ${stats.pass_rate.toFixed(0)}%`); + console.log(`Average Steps: ${stats.avg_steps.toFixed(0)}`); + console.log(`Maximum Steps: ${stats.max_steps}`); + console.log(`Minimum Steps: ${stats.min_steps}`); + console.log(`Median Steps: ${stats.median_steps.toFixed(0)}`); + console.log(`Average Tokens: ${stats.avg_tokens.toFixed(0)}`); + console.log(`Median Tokens: ${stats.median_tokens.toFixed(0)}`); + console.log(`Maximum Tokens: ${stats.max_tokens}`); + console.log(`Minimum Tokens: ${stats.min_tokens}`); + console.log('===========================\n'); +} + async function getCurrentGitCommit(): Promise { try { const {stdout} = await execAsync('git rev-parse --short HEAD'); @@ -49,10 +107,10 @@ Minor wording differences are acceptable as long as the core information of the try { const result = await generateObject({ - model: getModel('evaluator'), + model: createGoogleGenerativeAI({ apiKey: GEMINI_API_KEY })('gemini-2.0-flash'), // fix to gemini-2.0-flash for evaluation schema, prompt, - maxTokens: getMaxTokens('evaluator'), + maxTokens: 1000, temperature: 0 // Setting temperature to 0 for deterministic output }); @@ -71,7 +129,9 @@ async function batchEvaluate(inputFile: string): Promise { const questions: Question[] = JSON.parse(await fs.readFile(inputFile, 'utf-8')); const results: EvaluationResult[] = []; const gitCommit = await getCurrentGitCommit(); - const outputFile = `eval-${gitCommit}.json`; + const modelName = process.env.DEFAULT_MODEL_NAME || 'unknown'; + const outputFile = `eval-${gitCommit}-${modelName}.json`; + // Process each question for (let i = 0; i < questions.length; i++) { const {question, answer: expectedAnswer} = questions[i]; @@ -113,12 +173,19 @@ async function batchEvaluate(inputFile: string): Promise { actual_answer: 'Error occurred' }); } - // Save results - await fs.writeFile(outputFile, JSON.stringify(results, null, 2)); - console.log(`\nEvaluation results saved to ${outputFile}`); } + // Calculate and print statistics + const stats = calculateStats(results, modelName); + printStats(stats); + // Save results + await fs.writeFile(outputFile, JSON.stringify({ + results, + statistics: stats + }, null, 2)); + + console.log(`\nEvaluation results saved to ${outputFile}`); } // Run batch evaluation if this is the main module diff --git a/src/evals/ego-questions.json b/src/evals/ego-questions.json index ed597f4..4acb549 100644 --- a/src/evals/ego-questions.json +++ b/src/evals/ego-questions.json @@ -1,7 +1,7 @@ [ { - "question": "what is jina ai ceo's twitter account", - "answer": "hxiao" + "question": "what did jina ai ceo say about deepseek that went viral and become a meme?", + "answer": "a side project" }, { "question": "when was jina ai founded?", @@ -12,7 +12,7 @@ "answer": "ReaderLM-2.0" }, { - "question": "what is the lastest blog post that jina ai published?", + "question": "what is the latest blog post that jina ai published?", "answer": "A Practical Guide to Deploying Search Foundation Models in Production" }, { @@ -24,19 +24,59 @@ "answer": "30" }, { - "question": "how much rate limit for r.jina.ai api without an api key?", - "answer": "20 RPM (requests per minute)" + "question": "when was jina reader released?", + "answer": "April 2024" }, { "question": "How many offices do Jina AI have and where are they?", "answer": "four: sunnyvale, berlin, beijing, shenzhen" }, { - "question": "Does jina reranker v2 support multilingual?", - "answer": "Yes" + "question": "what exactly jina-colbert-v2 improves over jina-colbert-v1?", + "answer": "v2 add multilingual support" }, { "question": "who are the authors of jina-clip-v2 paper?", "answer": "Andreas Koukounas, Georgios Mastrapas, Bo Wang, Mohammad Kalim Akram, Sedigheh Eslami, Michael Günther, Isabelle Mohr, Saba Sturua, Scott Martens, Nan Wang, Han Xiao" + }, + { + "question": "who created the node-deepresearch project?", + "answer": "Han Xiao / jina ai" + }, + { + "question": "Which countries are the investors of Jina AI from?", + "answer": "USA and China only, no German investors" + }, + { + "question": "what is the grounding api endpoint of jina ai?", + "answer": "g.jina.ai" + }, + { + "question": "which of the following models do not support Matryoshka representation? jina-embeddings-v3, jina-embeddings-v2-base-en, jina-clip-v2, jina-clip-v1", + "answer": "jina-embeddings-v2-base-en and jina-clip-v1" + }, + { + "question": "Can I purchase the 2024 yearbook that jina ai published today?", + "answer": "No it is sold out." + }, + { + "question": "How many free tokens do you get from a new jina api key?", + "answer": "1 million." + }, + { + "question": "Who is the legal signatory of Jina AI gmbh?", + "answer": "Jiao Liu" + }, + { + "question": "what is the key idea behind node-deepresearch project?", + "answer": "It keeps searching, reading webpages, reasoning until an answer is found." + }, + { + "question": "what is the name of the jina ai's mascot?", + "answer": "No, Jina AI does not have a mascot." + }, + { + "question": "Does late chunking work with cls pooling?", + "answer": "No. late chunking only works with mean pooling." } ] \ No newline at end of file diff --git a/src/tools/__tests__/brave-search.test.ts b/src/tools/__tests__/brave-search.test.ts deleted file mode 100644 index 455387f..0000000 --- a/src/tools/__tests__/brave-search.test.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { braveSearch } from '../brave-search'; - -describe('braveSearch', () => { - it('should return search results', async () => { - const { response } = await braveSearch('test query'); - expect(response.web.results).toBeDefined(); - expect(response.web.results.length).toBeGreaterThan(0); - expect(response.web.results[0]).toHaveProperty('title'); - expect(response.web.results[0]).toHaveProperty('url'); - expect(response.web.results[0]).toHaveProperty('description'); - }); -}); diff --git a/src/tools/__tests__/dedup.test.ts b/src/tools/__tests__/dedup.test.ts deleted file mode 100644 index 58c97d4..0000000 --- a/src/tools/__tests__/dedup.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { dedupQueries } from '../dedup'; -import { LLMProvider } from '../../config'; - -describe('dedupQueries', () => { - const providers: Array = ['openai', 'gemini']; - const originalEnv = process.env; - - beforeEach(() => { - jest.resetModules(); - process.env = { ...originalEnv }; - }); - - afterEach(() => { - process.env = originalEnv; - }); - - providers.forEach(provider => { - describe(`with ${provider} provider`, () => { - beforeEach(() => { - process.env.LLM_PROVIDER = provider; - }); - - it('should remove duplicate queries', async () => { - jest.setTimeout(10000); - const queries = ['typescript tutorial', 'typescript tutorial', 'javascript basics']; - const { unique_queries } = await dedupQueries(queries, []); - expect(unique_queries).toHaveLength(2); - expect(unique_queries).toContain('javascript basics'); - }); - - it('should handle empty input', async () => { - const { unique_queries } = await dedupQueries([], []); - expect(unique_queries).toHaveLength(0); - }); - }); - }); -}); diff --git a/src/tools/__tests__/error-analyzer.test.ts b/src/tools/__tests__/error-analyzer.test.ts index 19af41d..fc64c90 100644 --- a/src/tools/__tests__/error-analyzer.test.ts +++ b/src/tools/__tests__/error-analyzer.test.ts @@ -25,7 +25,7 @@ describe('analyzeSteps', () => { expect(response).toHaveProperty('recap'); expect(response).toHaveProperty('blame'); expect(response).toHaveProperty('improvement'); - }); + }, 30000); }); }); }); diff --git a/src/tools/__tests__/evaluator.test.ts b/src/tools/__tests__/evaluator.test.ts index de36532..b330ee8 100644 --- a/src/tools/__tests__/evaluator.test.ts +++ b/src/tools/__tests__/evaluator.test.ts @@ -32,25 +32,6 @@ describe('evaluateAnswer', () => { expect(response).toHaveProperty('pass'); expect(response).toHaveProperty('think'); expect(response.type).toBe('definitive'); - expect(response.pass).toBe(true); - }); - - it('should evaluate answer freshness', async () => { - const tokenTracker = new TokenTracker(); - const { response } = await evaluateAnswer( - 'What is the latest version of Node.js?', - 'The latest version of Node.js is 14.0.0, released in April 2020.', - ['freshness'], - tokenTracker - ); - expect(response).toHaveProperty('pass'); - expect(response).toHaveProperty('think'); - expect(response.type).toBe('freshness'); - expect(response.freshness_analysis).toBeDefined(); - expect(response.freshness_analysis?.likely_outdated).toBe(true); - expect(response.freshness_analysis?.dates_mentioned).toContain('2020-04'); - expect(response.freshness_analysis?.current_time).toBeDefined(); - expect(response.pass).toBe(false); }); it('should evaluate answer plurality', async () => { @@ -64,38 +45,7 @@ describe('evaluateAnswer', () => { expect(response).toHaveProperty('pass'); expect(response).toHaveProperty('think'); expect(response.type).toBe('plurality'); - expect(response.plurality_analysis).toBeDefined(); expect(response.plurality_analysis?.expects_multiple).toBe(true); - expect(response.plurality_analysis?.provides_multiple).toBe(false); - expect(response.plurality_analysis?.count_expected).toBe(3); - expect(response.plurality_analysis?.count_provided).toBe(1); - expect(response.pass).toBe(false); - }); - - it('should evaluate in order and stop at first failure', async () => { - const tokenTracker = new TokenTracker(); - const { response } = await evaluateAnswer( - 'List the latest Node.js versions.', - 'I am not sure about the Node.js versions.', - ['definitive', 'freshness', 'plurality'], - tokenTracker - ); - expect(response.type).toBe('definitive'); - expect(response.pass).toBe(false); - expect(response.freshness_analysis).toBeUndefined(); - expect(response.plurality_analysis).toBeUndefined(); - }); - - it('should track token usage', async () => { - const tokenTracker = new TokenTracker(); - const spy = jest.spyOn(tokenTracker, 'trackUsage'); - await evaluateAnswer( - 'What is TypeScript?', - 'TypeScript is a strongly typed programming language that builds on JavaScript.', - ['definitive', 'freshness', 'plurality'], - tokenTracker - ); - expect(spy).toHaveBeenCalledWith('evaluator', expect.any(Number)); }); }); }); diff --git a/src/tools/__tests__/query-rewriter.test.ts b/src/tools/__tests__/query-rewriter.test.ts deleted file mode 100644 index 51d9995..0000000 --- a/src/tools/__tests__/query-rewriter.test.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { rewriteQuery } from '../query-rewriter'; -import { LLMProvider } from '../../config'; - -describe('rewriteQuery', () => { - const providers: Array = ['openai', 'gemini']; - const originalEnv = process.env; - - beforeEach(() => { - jest.resetModules(); - process.env = { ...originalEnv }; - }); - - afterEach(() => { - process.env = originalEnv; - }); - - providers.forEach(provider => { - describe(`with ${provider} provider`, () => { - beforeEach(() => { - process.env.LLM_PROVIDER = provider; - }); - - it('should rewrite search query', async () => { - const { queries } = await rewriteQuery({ - action: 'search', - searchQuery: 'how does typescript work', - think: 'Understanding TypeScript basics' - }); - expect(Array.isArray(queries)).toBe(true); - expect(queries.length).toBeGreaterThan(0); - }); - }); - }); -}); diff --git a/src/tools/__tests__/search.test.ts b/src/tools/__tests__/search.test.ts index aa4235d..05137af 100644 --- a/src/tools/__tests__/search.test.ts +++ b/src/tools/__tests__/search.test.ts @@ -1,10 +1,10 @@ -import { jinaSearch } from '../jinaSearch'; +import { search } from '../jina-search'; import { TokenTracker } from '../../utils/token-tracker'; describe('search', () => { it.skip('should perform search with Jina API (skipped due to insufficient balance)', async () => { const tokenTracker = new TokenTracker(); - const { response } = await jinaSearch('TypeScript programming', tokenTracker); + const { response } = await search('TypeScript programming', tokenTracker); expect(response).toBeDefined(); expect(response.data).toBeDefined(); if (response.data === null) { @@ -15,7 +15,7 @@ describe('search', () => { }, 15000); it('should handle empty query', async () => { - await expect(jinaSearch('')).rejects.toThrow(); + await expect(search('')).rejects.toThrow(); }, 15000); beforeEach(() => { diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index 269d82d..b3d8070 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -285,14 +285,13 @@ export async function evaluateAnswer( break; } } catch (error) { - console.error(`Error in ${evaluationType} evaluation:`, error); const errorResult = await handleGenerateObjectError(error); (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0); - if (!errorResult.object.pass) { - return { response: errorResult.object }; - } + // Always return from catch block to prevent undefined result + return { response: errorResult.object }; } } + // Only reach this point if all evaluations pass return { response: result!.object }; } \ No newline at end of file diff --git a/src/tools/jina-dedup.ts b/src/tools/jina-dedup.ts new file mode 100644 index 0000000..b0ed7c2 --- /dev/null +++ b/src/tools/jina-dedup.ts @@ -0,0 +1,143 @@ +import axios from 'axios'; +import { TokenTracker } from "../utils/token-tracker"; +import {JINA_API_KEY} from "../config"; + +const JINA_API_URL = 'https://api.jina.ai/v1/embeddings'; +const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity + +// Types for Jina API +interface JinaEmbeddingRequest { + model: string; + input: string[]; +} + +interface JinaEmbeddingResponse { + model: string; + object: string; + usage: { + total_tokens: number; + prompt_tokens: number; + }; + data: Array<{ + object: string; + index: number; + embedding: number[]; + }>; +} + + +// Compute cosine similarity between two vectors +function cosineSimilarity(vecA: number[], vecB: number[]): number { + const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0); + const normA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)); + const normB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)); + return dotProduct / (normA * normB); +} + +// Get embeddings for all queries in one batch +async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> { + if (!JINA_API_KEY) { + throw new Error('JINA_API_KEY is not set'); + } + + const request: JinaEmbeddingRequest = { + model: 'jina-embeddings-v3', + input: queries + }; + + try { + const response = await axios.post( + JINA_API_URL, + request, + { + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${JINA_API_KEY}` + } + } + ); + + // Sort embeddings by index to maintain original order + const embeddings = response.data.data + .sort((a, b) => a.index - b.index) + .map(item => item.embedding); + + return { + embeddings, + tokens: response.data.usage.total_tokens + }; + } catch (error) { + console.error('Error getting embeddings from Jina:', error); + throw error; + } +} + +export async function dedupQueries( + newQueries: string[], + existingQueries: string[], + tracker?: TokenTracker +): Promise<{ unique_queries: string[], tokens: number }> { + try { + // Quick return for single new query with no existing queries + if (newQueries.length === 1 && existingQueries.length === 0) { + console.log('Dedup (quick return):', newQueries); + return { + unique_queries: newQueries, + tokens: 0 // No tokens used since we didn't call the API + }; + } + + // Get embeddings for all queries in one batch + const allQueries = [...newQueries, ...existingQueries]; + const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries); + + // Split embeddings back into new and existing + const newEmbeddings = allEmbeddings.slice(0, newQueries.length); + const existingEmbeddings = allEmbeddings.slice(newQueries.length); + + const uniqueQueries: string[] = []; + const usedIndices = new Set(); + + // Compare each new query against existing queries and already accepted queries + for (let i = 0; i < newQueries.length; i++) { + let isUnique = true; + + // Check against existing queries + for (let j = 0; j < existingQueries.length; j++) { + const similarity = cosineSimilarity(newEmbeddings[i], existingEmbeddings[j]); + if (similarity >= SIMILARITY_THRESHOLD) { + isUnique = false; + break; + } + } + + // Check against already accepted queries + if (isUnique) { + for (const usedIndex of usedIndices) { + const similarity = cosineSimilarity(newEmbeddings[i], newEmbeddings[usedIndex]); + if (similarity >= SIMILARITY_THRESHOLD) { + isUnique = false; + break; + } + } + } + + // Add to unique queries if passed all checks + if (isUnique) { + uniqueQueries.push(newQueries[i]); + usedIndices.add(i); + } + } + + // Track token usage from the API + (tracker || new TokenTracker()).trackUsage('dedup', tokens); + console.log('Dedup:', uniqueQueries); + return { + unique_queries: uniqueQueries, + tokens + }; + } catch (error) { + console.error('Error in deduplication analysis:', error); + throw error; + } +} diff --git a/src/tools/jinaSearch.ts b/src/tools/jina-search.ts similarity index 95% rename from src/tools/jinaSearch.ts rename to src/tools/jina-search.ts index fdcc570..3e259cc 100644 --- a/src/tools/jinaSearch.ts +++ b/src/tools/jina-search.ts @@ -3,7 +3,7 @@ import { TokenTracker } from "../utils/token-tracker"; import { SearchResponse } from '../types'; import { JINA_API_KEY } from "../config"; -export function jinaSearch(query: string, tracker?: TokenTracker): Promise<{ response: SearchResponse, tokens: number }> { +export function search(query: string, tracker?: TokenTracker): Promise<{ response: SearchResponse, tokens: number }> { return new Promise((resolve, reject) => { if (!query.trim()) { reject(new Error('Query cannot be empty')); diff --git a/src/tools/query-rewriter.ts b/src/tools/query-rewriter.ts index ba244ea..70a7ed0 100644 --- a/src/tools/query-rewriter.ts +++ b/src/tools/query-rewriter.ts @@ -18,7 +18,7 @@ const responseSchema = z.object({ function getPrompt(action: SearchAction): string { - return `You are an expert Information Retrieval Assistant. Transform user queries into precise keyword combinations with strategic reasoning and appropriate search operators. + return `You are an expert Information Retrieval query optimizer. Optimize user queries into precise keyword combinations with strategic reasoning and appropriate search operators. 1. Generate search queries that directly include appropriate operators @@ -61,7 +61,7 @@ Input Query: How to fix a leaking kitchen faucet? This is a how-to query seeking practical solutions. User likely wants step-by-step guidance and visual demonstrations for DIY repair. We'll target both video tutorials and written guides. -Queries: [ +Output Queries: [ "kitchen faucet leak repair", "faucet drip fix site:youtube.com", "how to repair faucet " @@ -71,7 +71,7 @@ Input Query: What are healthy breakfast options for type 2 diabetes? This is a health-specific informational query. User needs authoritative medical advice combined with practical meal suggestions. Splitting into medical guidelines and recipes will provide comprehensive coverage. -Queries: [ +Output Queries: [ "what to eat for type 2 diabetes", "type 2 diabetes breakfast guidelines", "diabetic breakfast recipes" @@ -81,7 +81,7 @@ Input Query: Latest AWS Lambda features for serverless applications This is a product research query focused on recent updates. User wants current information about specific technology features, likely for implementation purposes. We'll target official docs and community insights. -Queries: [ +Output Queries: [ "aws lambda features site:aws.amazon.com intitle:2025", "new features lambda serverless" ]