diff --git a/README.md b/README.md index d810ce9..da1fde7 100644 --- a/README.md +++ b/README.md @@ -289,7 +289,7 @@ I kept the evaluation simple, LLM-as-a-judge and collect some [ego questions](./ I mainly look at 3 things: total steps, total tokens, and the correctness of the final answer. ```bash -npm run eval ./src/evals/ego-questions +npm run eval ./src/evals/ego-questions.json ``` Here's the table comparing plain `gemini-2.0-flash` and `gemini-2.0-flash + node-deepresearch` on the ego set. diff --git a/package-lock.json b/package-lock.json index 352fb8c..ba3dfd0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,7 +20,8 @@ "express": "^4.21.2", "node-fetch": "^3.3.2", "undici": "^7.3.0", - "zod": "^3.22.4" + "zod": "^3.22.4", + "zod-to-json-schema": "^3.24.1" }, "devDependencies": { "@types/commander": "^2.12.0", diff --git a/package.json b/package.json index e24ab48..93e6d89 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,8 @@ "express": "^4.21.2", "node-fetch": "^3.3.2", "undici": "^7.3.0", - "zod": "^3.22.4" + "zod": "^3.22.4", + "zod-to-json-schema": "^3.24.1" }, "devDependencies": { "@types/commander": "^2.12.0", diff --git a/src/agent.ts b/src/agent.ts index d9ea676..173c73b 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -1,6 +1,6 @@ -import {z} from 'zod'; +import {z, ZodObject} from 'zod'; import {generateObject} from 'ai'; -import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP, LLM_PROVIDER} from "./config"; +import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP} from "./config"; import {readUrl} from "./tools/read"; import {handleGenerateObjectError} from './utils/error-handling'; import fs from 'fs/promises'; @@ -8,14 +8,15 @@ import {SafeSearchType, search as duckSearch} from "duck-duck-scrape"; import {braveSearch} from "./tools/brave-search"; import {rewriteQuery} from "./tools/query-rewriter"; import {dedupQueries} from "./tools/jina-dedup"; -import {evaluateAnswer} from "./tools/evaluator"; +import 
{evaluateAnswer, evaluateQuestion} from "./tools/evaluator"; import {analyzeSteps} from "./tools/error-analyzer"; import {TokenTracker} from "./utils/token-tracker"; import {ActionTracker} from "./utils/action-tracker"; import {StepAction, AnswerAction} from "./types"; import {TrackerContext} from "./types"; import {search} from "./tools/jina-search"; -import {grounding} from "./tools/grounding"; +// import {grounding} from "./tools/grounding"; +import { zodToJsonSchema } from "zod-to-json-schema"; async function sleep(ms: number) { const seconds = Math.ceil(ms / 1000); @@ -43,7 +44,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole properties.references = z.array( z.object({ exactQuote: z.string().describe("Exact relevant quote from the document"), - url: z.string().describe("URL of the document; must be directly from the context") + url: z.string().describe("source URL; must be directly from the context") }).required() ).describe("Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document").optional(); } @@ -291,6 +292,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ let step = 0; let totalStep = 0; let badAttempts = 0; + let schema: ZodObject = getSchema(true, true, true, true) const gaps: string[] = [question]; // All questions to be answered including the orginal question const allQuestions = [question]; const allKeywords = []; @@ -307,6 +309,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ const allURLs: Record = {}; const visitedURLs: string[] = []; + const evaluationMetrics: Record = {}; while (context.tokenTracker.getTotalUsage() < tokenBudget && badAttempts <= maxBadAttempts) { // add 1s delay to avoid rate limiting await sleep(STEP_SLEEP); @@ -317,6 +320,10 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ console.log('Gaps:', gaps); allowReflect 
= allowReflect && (gaps.length <= 1); const currentQuestion = gaps.length > 0 ? gaps.shift()! : question; + if (!evaluationMetrics[currentQuestion]) { + evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker) + } + // update all urls with buildURLMap allowRead = allowRead && (Object.keys(allURLs).length > 0); allowSearch = allowSearch && (Object.keys(allURLs).length < 50); // disable search when too many urls already @@ -336,14 +343,14 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ allURLs, false ); - + schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch) const model = getModel('agent'); let object; let totalTokens = 0; try { const result = await generateObject({ model, - schema: getSchema(allowReflect, allowRead, allowAnswer, allowSearch), + schema, prompt, maxTokens: getMaxTokens('agent') }); @@ -384,7 +391,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ }); const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, - ['definitive', 'freshness', 'plurality'], context.tokenTracker); + evaluationMetrics[currentQuestion], context.tokenTracker); if (currentQuestion === question) { if (evaluation.pass) { @@ -437,6 +444,13 @@ ${evaluation.think} evaluation: evaluation.think, ...errorAnalysis }); + + if (errorAnalysis.questionsToAnswer) { + gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); + allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); + gaps.push(question); // always keep the original question in the gaps + } + badAttempts++; allowAnswer = false; // disable answer action in the immediate next step diaryContext = []; @@ -504,7 +518,7 @@ But then you realized you have asked them before. 
You decided to to think out of keywordsQueries = dedupedQueries; if (keywordsQueries.length > 0) { - let googleGrounded = ''; + // let googleGrounded = ''; const searchResults = []; for (const query of keywordsQueries) { console.log(`Search query: ${query}`); @@ -515,9 +529,9 @@ But then you realized you have asked them before. You decided to to think out of case 'jina': // use jinaSearch results = {results: (await search(query, context.tokenTracker)).response?.data || []}; - if (LLM_PROVIDER === 'gemini') { - googleGrounded = await grounding(query, context.tokenTracker); - } + // if (LLM_PROVIDER === 'gemini') { + // googleGrounded = await grounding(query, context.tokenTracker); + // } break; case 'duck': results = await duckSearch(query, {safeSearch: SafeSearchType.STRICT}); @@ -556,7 +570,8 @@ But then you realized you have asked them before. You decided to to think out of allKnowledge.push({ question: `What do Internet say about ${thisStep.searchQuery}?`, - answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')), + answer: removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')), + // answer: googleGrounded + removeHTMLtags(searchResults.map(r => r.results.map(r => r.description).join('; ')).join('; ')), // flatten into one url list, and take unique urls references: searchResults.map(r => r.results.map(r => r.url)).flat().filter((v, i, a) => a.indexOf(v) === i), type: 'side-info' @@ -645,10 +660,10 @@ You decided to think out of the box or cut from a completely different angle.`); } } - await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); } - await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, 
allKnowledge], totalStep); if (isAnswered) { return {result: thisStep, context}; } else { @@ -671,13 +686,14 @@ You decided to think out of the box or cut from a completely different angle.`); true ); + schema = getSchema(false, false, true, false); const model = getModel('agentBeastMode'); let object; let totalTokens; try { const result = await generateObject({ model, - schema: getSchema(false, false, allowAnswer, false), + schema: schema, prompt, maxTokens: getMaxTokens('agentBeastMode') }); @@ -688,7 +704,7 @@ You decided to think out of the box or cut from a completely different angle.`); object = result.object; totalTokens = result.totalTokens; } - await storeContext(prompt, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(prompt, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); thisStep = object as StepAction; context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts}); context.tokenTracker.trackUsage('agent', totalTokens); @@ -697,9 +713,15 @@ You decided to think out of the box or cut from a completely different angle.`); } } -async function storeContext(prompt: string, memory: any[][], step: number) { +async function storeContext(prompt: string, schema: any, memory: any[][], step: number) { try { - await fs.writeFile(`prompt-${step}.txt`, prompt); + await fs.writeFile(`prompt-${step}.txt`, ` +Prompt: +${prompt} + +JSONSchema: +${JSON.stringify(zodToJsonSchema(schema), null, 2)} +`); const [context, keywords, questions, knowledge] = memory; await fs.writeFile('context.json', JSON.stringify(context, null, 2)); await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2)); diff --git a/src/evals/ego-questions b/src/evals/ego-questions.json similarity index 100% rename from src/evals/ego-questions rename to src/evals/ego-questions.json diff --git a/src/server.ts b/src/server.ts index 8c6cf1d..58a2956 100644 --- a/src/server.ts +++ b/src/server.ts @@ -36,6 +36,19 @@ 
interface QueryRequest extends Request { }; } +function buildMdFromAnswer(answer: AnswerAction) { + let refStr = ''; + if (answer.references?.length > 0) { + refStr = ` + +## References +${answer.references.map((ref, i) => ` +${i + 1}. [${ref.exactQuote}](${ref.url})`).join('')}`; + } + return `${answer.answer.replace(/\(REF_(\d+)\)/g, (_, num) => `[^${num}]`)}${refStr}`; +} + + // OpenAI-compatible chat completions endpoint app.post('/v1/chat/completions', (async (req: Request, res: Response) => { // Check authentication only if secret is set @@ -175,7 +188,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => { system_fingerprint: 'fp_' + requestId, choices: [{ index: 0, - delta: { content: '\n\n' }, + delta: { content: `\n\n` }, logprobs: null, finish_reason: null }] @@ -191,7 +204,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => { system_fingerprint: 'fp_' + requestId, choices: [{ index: 0, - delta: { content: result.action === 'answer' ? (result as AnswerAction).answer : result.think }, + delta: { content: result.action === 'answer' ? buildMdFromAnswer(result) : result.think }, logprobs: null, finish_reason: 'stop' }] @@ -210,7 +223,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => { index: 0, message: { role: 'assistant', - content: result.action === 'answer' ? (result as AnswerAction).answer : result.think + content: result.action === 'answer' ? 
buildMdFromAnswer(result): result.think }, logprobs: null, finish_reason: 'stop' diff --git a/src/tools/error-analyzer.ts b/src/tools/error-analyzer.ts index 7656e44..61c2715 100644 --- a/src/tools/error-analyzer.ts +++ b/src/tools/error-analyzer.ts @@ -10,7 +10,11 @@ const model = getModel('errorAnalyzer'); const responseSchema = z.object({ recap: z.string().describe('Recap of the actions taken and the steps conducted'), blame: z.string().describe('Which action or the step was the root cause of the answer rejection'), - improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.') + improvement: z.string().describe('Suggested key improvement for the next iteration, do not use bullet points, be concise and hot-take vibe.'), + questionsToAnswer: z.array( + z.string().describe("each question must be a single line, concise and clear. not composite or compound, less than 20 words.") + ).max(2) + .describe("List of most important reflect questions to fill the knowledge gaps"), }); @@ -93,7 +97,12 @@ The answer is not definitive and fails to provide the requested information. La "blame": "The root cause of failure was getting stuck in a repetitive search pattern without adapting the strategy. Steps 4-5 repeated the same search, and step 6 deviated to less reliable entertainment sources instead of exploring business journals, news articles, or professional databases. Additionally, the process didn't attempt to triangulate age through indirect information like education history or career milestones.", - "improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. 
Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements." + "improvement": "1. Avoid repeating identical searches and implement a strategy to track previously searched terms. 2. When direct age/birthdate searches fail, try indirect approaches like: searching for earliest career mentions, finding university graduation years, or identifying first company founding dates. 3. Focus on high-quality business sources and avoid entertainment websites for professional information. 4. Consider using industry event appearances or conference presentations where age-related context might be mentioned. 5. If exact age cannot be determined, provide an estimated range based on career timeline and professional achievements.", + + "questionsToAnswer": [ + "What alternative professional databases or news archives could provide reliable biographical information?", + "How can we use education history or career milestones to estimate age range?" 
+ ] } diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index b3d8070..1e39dd7 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -231,6 +231,107 @@ Question: ${JSON.stringify(question)} Answer: ${JSON.stringify(answer)}`; } + +const questionEvaluationSchema = z.object({ + needsFreshness: z.boolean().describe('Whether the question requires freshness check'), + needsPlurality: z.boolean().describe('Whether the question requires plurality check'), + reasoning: z.string().describe('Explanation of why these checks are needed or not needed') +}); + +function getQuestionEvaluationPrompt(question: string): string { + return `You are an evaluator that determines if a question requires freshness and/or plurality checks in addition to the required definitiveness check. + + +1. freshness - Checks if the answer needs to be current and up-to-date +2. plurality - Checks if the answer needs to provide multiple items or a specific count +Note: Definitiveness check is always applied regardless of the question type + + + +1. Freshness Evaluation: + - Required for questions about current state, recent events, or time-sensitive information + - Required for: prices, versions, leadership positions, status updates + - Look for terms: "current", "latest", "recent", "now", "today", "new" + - Consider company positions, product versions, market data time-sensitive + +2. Plurality Evaluation: + - Required when question asks for multiple items or specific counts + - Check for: numbers ("5 examples"), plural nouns, list requests + - Look for: "all", "list", "enumerate", "examples", plural forms + - Required when question implies completeness ("all the reasons", "every factor") + +3. 
Ordering Rules: + - Always include definitive check in the order + - Prioritize freshness for "current/latest" queries as outdated info invalidates other aspects + - Prioritize plurality for explicit numbered requests when freshness isn't critical + - Default order is: definitive -> freshness -> plurality + + + +Question: "What is the current CEO of OpenAI?" +Evaluation: { + "needsFreshness": true, + "needsPlurality": false, + "reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position." +} + +Question: "List all the AI companies in Berlin" +Evaluation: { + "needsFreshness": false, + "needsPlurality": true, + "reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive." +} + +Question: "What are the top 5 latest AI models released by OpenAI?" +Evaluation: { + "needsFreshness": true, + "needsPlurality": true, + "reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items." +} + +Question: "Who created Python?" +Evaluation: { + "needsFreshness": false, + "needsPlurality": false, + "reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed." 
+} + + +Now evaluate this question: +Question: ${JSON.stringify(question)}`; +} + +export async function evaluateQuestion( + question: string, + tracker?: TokenTracker +): Promise { + try { + const result = await generateObject({ + model: getModel('evaluator'), + schema: questionEvaluationSchema, + prompt: getQuestionEvaluationPrompt(question), + maxTokens: getMaxTokens('evaluator') + }); + + (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); + console.log('Question Evaluation:', result.object); + + // Always include definitive in types + const types: EvaluationType[] = ['definitive']; + if (result.object.needsFreshness) types.push('freshness'); + if (result.object.needsPlurality) types.push('plurality'); + + console.log('Question Metrics:', types) + + // Always evaluate definitive first, then freshness (if needed), then plurality (if needed) + return types; + } catch (error) { + // Default to all evaluations in standard order if evaluation fails + console.error('Question evaluation failed:', error); + return ['definitive', 'freshness', 'plurality']; + } +} + export async function evaluateAnswer( question: string, answer: string, diff --git a/src/types.ts b/src/types.ts index 2409cb9..9f0fec0 100644 --- a/src/types.ts +++ b/src/types.ts @@ -97,8 +97,6 @@ export interface ReadResponse { } - - export type EvaluationResponse = { pass: boolean; think: string; @@ -121,6 +119,7 @@ export type ErrorAnalysisResponse = { recap: string; blame: string; improvement: string; + questionsToAnswer: string[]; }; export interface SearchResult { @@ -214,8 +213,8 @@ export interface ChatCompletionChunk { } // Tracker Types -import { TokenTracker } from './utils/token-tracker'; -import { ActionTracker } from './utils/action-tracker'; +import {TokenTracker} from './utils/token-tracker'; +import {ActionTracker} from './utils/action-tracker'; export interface TrackerContext { tokenTracker: TokenTracker;