diff --git a/src/agent.ts b/src/agent.ts index 8aa18c8..2e8d968 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole if (allowSearch) { actions.push("search"); - properties.searchQuery = z.string() + properties.searchQuery = z.string().max(30) .describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional(); } @@ -356,39 +356,24 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ // execute the step and action if (thisStep.action === 'answer') { + if (step === 1) { + // LLM is so confident and answer immediately, skip all evaluations + isAnswered = true; + break + } + updateContext({ totalStep, question: currentQuestion, ...thisStep, }); - const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker); - + const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, + ['definitive', 'freshness', 'plurality'], context.tokenTracker); if (currentQuestion === question) { - if (badAttempts >= maxBadAttempts) { - // EXIT POINT OF THE PROGRAM!!!! + if (evaluation.pass) { diaryContext.push(` -At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question: - -Original question: -${currentQuestion} - -Your answer: -${thisStep.answer} - -The evaluator thinks your answer is good because: -${evaluation.reasoning} - -Your journey ends here. -`); - isAnswered = false; - break - } - if (evaluation.is_definitive) { - if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) { - // EXIT POINT OF THE PROGRAM!!!! - diaryContext.push(` At step ${step}, you took **answer** action and finally found the answer to the original question: Original question: @@ -398,31 +383,18 @@ Your answer: ${thisStep.answer} The evaluator thinks your answer is good because: -${evaluation.reasoning} +${evaluation.think} Your journey ends here. You have successfully answered the original question. Congratulations! 🎉 `); - isAnswered = true; + isAnswered = true; + break + } else { + if (badAttempts >= maxBadAttempts) { + isAnswered = false; break } else { diaryContext.push(` -At step ${step}, you took **answer** action and finally found the answer to the original question: - -Original question: -${currentQuestion} - -Your answer: -${thisStep.answer} - -Unfortunately, you did not provide any references to support your answer. -You need to find more URL references to support your answer.`); - } - - isAnswered = true; - break - - } else { - diaryContext.push(` At step ${step}, you took **answer** action but evaluator thinks it is not a good answer: Original question: @@ -432,23 +404,31 @@ Your answer: ${thisStep.answer} The evaluator thinks your answer is bad because: -${evaluation.reasoning} +${evaluation.think} `); - // store the bad context and reset the diary context - const {response: errorAnalysis} = await analyzeSteps(diaryContext); + // store the bad context and reset the diary context + const {response: errorAnalysis} = await analyzeSteps(diaryContext); - badContext.push({ - question: currentQuestion, - answer: thisStep.answer, - evaluation: evaluation.reasoning, - ...errorAnalysis - }); - badAttempts++; - allowAnswer = false; // disable answer action in the immediate next step - diaryContext = []; - step = 0; + allKnowledge.push({ + question: currentQuestion, + answer: thisStep.answer, + references: thisStep.references, + type: 'qa' + }); + + badContext.push({ + question: currentQuestion, + answer: thisStep.answer, + evaluation: evaluation.think, + ...errorAnalysis + }); + badAttempts++; + allowAnswer = false; // disable answer action in the immediate next step + diaryContext = []; + step = 0; + } } - } else if (evaluation.is_definitive) { + } else if (evaluation.pass) { diaryContext.push(` At step ${step}, you took **answer** action. You found a good answer to the sub-question: @@ -459,7 +439,7 @@ Your answer: ${thisStep.answer} The evaluator thinks your answer is good because: -${evaluation.reasoning} +${evaluation.think} Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going. `); diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index a23b276..269d82d 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling'; const model = getModel('evaluator'); -const responseSchema = z.object({ - is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'), - reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive') +type EvaluationType = 'definitive' | 'freshness' | 'plurality'; + +const baseSchema = { + pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'), + think: z.string().describe('Explanation the thought process why the answer does not pass the evaluation criteria') +}; + +const definitiveSchema = z.object({ + ...baseSchema, + type: z.literal('definitive') }); -function getPrompt(question: string, answer: string): string { +const freshnessSchema = z.object({ + ...baseSchema, + type: z.literal('freshness'), + freshness_analysis: z.object({ + likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'), + dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'), + current_time: z.string().describe('Current system time when evaluation was performed'), + max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated') + }) +}); + +const pluralitySchema = z.object({ + ...baseSchema, + type: z.literal('plurality'), + plurality_analysis: z.object({ + expects_multiple: z.boolean().describe('Whether the question asks for multiple items'), + provides_multiple: z.boolean().describe('Whether the answer provides multiple items'), + count_expected: z.number().optional().describe('Number of items expected if specified in question'), + count_provided: z.number().describe('Number of items provided in answer') + }) +}); + +function getDefinitivePrompt(question: string, answer: string): string { return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not. @@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive 5. Non-answers that suggest alternatives - Question: "What are the system requirements for running Python 3.9?" Answer: "I'm not entirely sure, but I think you need a computer with some RAM." Evaluation: { - "is_definitive": false, - "reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive." + "pass": false, + "think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive." } Question: "What are the system requirements for running Python 3.9?" Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux." Evaluation: { - "is_definitive": true, - "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity." + "pass": true, + "think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity." } Question: "Who will be the president of the United States in 2032?" Answer: "I cannot predict the future, it depends on the election results." Evaluation: { - "is_definitive": false, - "reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive." + "pass": false, + "think": "The answer contains a statement of inability to predict the future, making it non-definitive." } Question: "Who is the sales director at Company X?" Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com" Evaluation: { - "is_definitive": false, - "reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question." + "pass": false, + "think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question." } Question: "what is the twitter account of jina ai's founder?" Answer: "The provided text does not contain the Twitter account of Jina AI's founder." Evaluation: { - "is_definitive": false, - "reasoning": "The answer indicates a lack of information rather than providing a definitive response." + "pass": false, + "think": "The answer indicates a lack of information rather than providing a definitive response." } + Now evaluate this pair: Question: ${JSON.stringify(question)} Answer: ${JSON.stringify(answer)}`; } -export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> { - try { - const prompt = getPrompt(question, answer); - let object; - let totalTokens = 0; +function getFreshnessPrompt(question: string, answer: string, currentTime: string): string { + return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time. + + +1. Date Analysis: + - Extract all dates mentioned in the answer + - Compare against current system time: ${currentTime} + - Consider content outdated if: + * It refers to a "latest" or "current" state from more than 30 days ago + * It mentions specific dates/events that have been superseded + * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago + - For product versions, releases, or announcements, max age is 30 days + - For company positions, leadership, or general facts, max age is 60 days + +2. Context Hints: + - Words indicating recency: "latest", "current", "newest", "just released", "recently" + - Time-sensitive terms: "CEO", "price", "version", "release" + - Future dates should be ignored in outdated calculation + + + +Question: "What is Jina AI's latest embedding model?" +Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024." +Current Time: "2024-10-06T00:00:00Z" +Evaluation: { + "pass": false, + "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information", + "freshness_analysis": { + "likely_outdated": true, + "dates_mentioned": ["2024-03-15"], + "current_time": "2024-10-06T00:00:00Z", + "max_age_days": 30 + } +} + +Question: "Who is OpenAI's CEO?" +Answer: "Sam Altman is the CEO of OpenAI as of December 2023." +Current Time: "2024-02-06T00:00:00Z" +Evaluation: { + "pass": true, + "think": "The answer is about company leadership and is within the 60-day threshold for such information", + "freshness_analysis": { + "likely_outdated": false, + "dates_mentioned": ["2023-12"], + "current_time": "2024-02-06T00:00:00Z", + "max_age_days": 60 + } +} + + +Now evaluate this pair: +Question: ${JSON.stringify(question)} +Answer: ${JSON.stringify(answer)}`; +} + +function getPluralityPrompt(question: string, answer: string): string { + return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question. + + +1. Question Analysis: + - Check if question asks for multiple items using indicators like: + * Plural nouns: "companies", "people", "names" + * Quantifiers: "all", "many", "several", "various", "multiple" + * List requests: "list", "enumerate", "name all", "give me all" + * Numbers: "5 examples", "top 10" + - Otherwise skip the analysis and return pass to true + +2. Answer Analysis: + - Count distinct items provided in the answer + - Check if answer uses limiting words like "only", "just", "single" + - Identify if answer acknowledges there are more items but only provides some + +3. Definitiveness Rules: + - If question asks for multiple items but answer provides only one → NOT definitive + - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive + - If answer clearly states it's providing a partial list → NOT definitive + - If question asks for "all" or "every" but answer seems incomplete → NOT definitive + + + +Question: "Who works in Jina AI's sales team?" +Answer: "John Smith is a sales representative at Jina AI." +Evaluation: { + "pass": true, + "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.", + "plurality_analysis": { + "expects_multiple": false, + "provides_multiple": false, + "count_provided": 1 + } +} + +Question: "List all the salespeople who work at Jina AI" +Answer: "John Smith is a sales representative at Jina AI." +Evaluation: { + "pass": false, + "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.", + "plurality_analysis": { + "expects_multiple": true, + "provides_multiple": false, + "count_provided": 1 + } +} + +Question: "Name the top 3 products sold by Jina AI" +Answer: "Jina AI's product lineup includes DocArray and Jina." +Evaluation: { + "pass": false, + "think": "The question asks for top 3 products but only 2 are provided.", + "plurality_analysis": { + "expects_multiple": true, + "provides_multiple": true, + "count_expected": 3, + "count_provided": 2 + } +} + +Question: "List as many AI companies in Berlin as you can find" +Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples." +Evaluation: { + "pass": false, + "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.", + "plurality_analysis": { + "expects_multiple": true, + "provides_multiple": true, + "count_provided": 5 + } +} + + +Now evaluate this pair: +Question: ${JSON.stringify(question)} +Answer: ${JSON.stringify(answer)}`; +} + +export async function evaluateAnswer( + question: string, + answer: string, + evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'], + tracker?: TokenTracker +): Promise<{ response: EvaluationResponse }> { + let result; + + for (const evaluationType of evaluationOrder) { try { - const result = await generateObject({ - model, - schema: responseSchema, - prompt, - maxTokens: getMaxTokens('evaluator') - }); - object = result.object; - totalTokens = result.usage?.totalTokens || 0; + switch (evaluationType) { + case 'definitive': + result = await generateObject({ + model, + schema: definitiveSchema, + prompt: getDefinitivePrompt(question, answer), + maxTokens: getMaxTokens('evaluator') + }); + (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); + console.log('Evaluation:', result.object); + if (!result.object.pass) { + return { response: result.object }; + } + break; + + case 'freshness': + result = await generateObject({ + model, + schema: freshnessSchema, + prompt: getFreshnessPrompt(question, answer, new Date().toISOString()), + maxTokens: getMaxTokens('evaluator') + }); + (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); + console.log('Evaluation:', result.object); + if (!result.object.pass) { + return { response: result.object }; + } + break; + + case 'plurality': + result = await generateObject({ + model, + schema: pluralitySchema, + prompt: getPluralityPrompt(question, answer), + maxTokens: getMaxTokens('evaluator') + }); + (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); + console.log('Evaluation:', result.object); + if (!result.object.pass) { + return { response: result.object }; + } + break; + } } catch (error) { - const result = await handleGenerateObjectError(error); - object = result.object; - totalTokens = result.totalTokens; + console.error(`Error in ${evaluationType} evaluation:`, error); + const errorResult = await handleGenerateObjectError(error); + (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0); + if (!errorResult.object.pass) { + return { response: errorResult.object }; + } } - console.log('Evaluation:', { - definitive: object.is_definitive, - reason: object.reasoning - }); - (tracker || new TokenTracker()).trackUsage('evaluator', totalTokens); - return { response: object, tokens: totalTokens }; - } catch (error) { - console.error('Error in answer evaluation:', error); - throw error; - } -} - -// Example usage -async function main() { - const question = process.argv[2] || ''; - const answer = process.argv[3] || ''; - - if (!question || !answer) { - console.error('Please provide both question and answer as command line arguments'); - process.exit(1); } - try { - await evaluateAnswer(question, answer); - } catch (error) { - console.error('Failed to evaluate answer:', error); - } -} - -if (require.main === module) { - main().catch(console.error); + return { response: result!.object }; } \ No newline at end of file diff --git a/src/types.ts b/src/types.ts index 0e7db89..a93ab5a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,18 +1,3 @@ -import { z } from 'zod'; - -export const ThinkSchema = z.string().describe('Strategic reasoning about the process'); - -export const QuerySchema = z.string() - .max(30) - .describe('Search query, must be less than 30 characters'); - -export const URLSchema = z.string().url(); - -export const ReferenceSchema = z.object({ - exactQuote: z.string().describe('Exact relevant quote from the document'), - url: URLSchema.describe('URL of the document') -}); - // Action Types type BaseAction = { action: "search" | "answer" | "reflect" | "visit"; @@ -96,9 +81,12 @@ export interface ReadResponse { readableMessage?: string; } + + + export type EvaluationResponse = { - is_definitive: boolean; - reasoning: string; + pass: boolean; + think: string; }; export type ErrorAnalysisResponse = {