diff --git a/src/agent.ts b/src/agent.ts index 819a882..dc66d75 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator"; import {analyzeSteps} from "./tools/error-analyzer"; import {TokenTracker} from "./utils/token-tracker"; import {ActionTracker} from "./utils/action-tracker"; -import {StepAction, AnswerAction, KnowledgeItem} from "./types"; +import {StepAction, AnswerAction, KnowledgeItem, EvaluationCriteria} from "./types"; import {TrackerContext} from "./types"; import {search} from "./tools/jina-search"; // import {grounding} from "./tools/grounding"; @@ -24,7 +24,7 @@ async function sleep(ms: number) { return new Promise(resolve => setTimeout(resolve, ms)); } -function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean) { +function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean, languageStyle: string = 'same language as the question') { const actions: string[] = []; const properties: Record = { action: z.enum(['placeholder']), // Will update later with actual actions @@ -40,7 +40,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole if (allowAnswer) { actions.push("answer"); properties.answer = z.string() - .describe("Required when action='answer'. Must be the final answer in natural language").optional(); + .describe(`Required when action='answer'. 
Must in ${languageStyle}`).optional(); properties.references = z.array( z.object({ exactQuote: z.string().describe("Exact relevant quote from the document"), @@ -85,7 +85,8 @@ function getPrompt( badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[], knowledge?: KnowledgeItem[], allURLs?: Record, - beastMode?: boolean + beastMode?: boolean, + languageStyle?: string ): string { const sections: string[] = []; const actionSections: string[] = []; @@ -216,11 +217,11 @@ ${allKeywords.join('\n')} if (allowAnswer) { actionSections.push(` -- If is a simple greeting, chit-chat, or general knowledge, provide the answer directly. -- Must provide "references" and each must specify "exactQuote" and "url" -- In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references -- Responses must be definitive (no ambiguity, uncertainty, or disclaimers) -- Provide final response only when 100% certain${allowReflect ? '\n- If doubts remain, use instead' : ''} +- If is a simple greeting, chit-chat, or general knowledge, provide the answer directly; +- Must provide "references" and each must specify "exactQuote" and "url"; +- In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references; +- Responses must be definitive (no ambiguity, uncertainty, or disclaimers) and in the style of ${languageStyle}; +- Provide final response only when 100% certain;${allowReflect ? 
'\n- If doubts remain, use instead' : ''} `); } @@ -299,8 +300,9 @@ export async function getResponse(question: string, let totalStep = 0; let badAttempts = 0; let schema: ZodObject = getSchema(true, true, true, true) - const gaps: string[] = [question.trim()]; // All questions to be answered including the orginal question - const allQuestions = [question.trim()]; + question = question.trim() + const gaps: string[] = [question]; // All questions to be answered including the original question + const allQuestions = [question]; const allKeywords = []; const allKnowledge: KnowledgeItem[] = []; // knowledge are intermedidate questions that are answered // iterate over historyMessages @@ -329,7 +331,7 @@ export async function getResponse(question: string, const allURLs: Record = {}; const visitedURLs: string[] = []; - const evaluationMetrics: Record = {}; + const evaluationMetrics: Record = {}; while (context.tokenTracker.getTotalUsage().totalTokens < tokenBudget && badAttempts <= maxBadAttempts) { // add 1s delay to avoid rate limiting await sleep(STEP_SLEEP); @@ -339,7 +341,7 @@ export async function getResponse(question: string, console.log(`Step ${totalStep} / Budget used ${budgetPercentage}%`); console.log('Gaps:', gaps); allowReflect = allowReflect && (gaps.length <= 1); - const currentQuestion = gaps.length > 0 ? gaps.shift()! : question.trim(); + const currentQuestion = gaps.length > 0 ? gaps.shift()! 
: question if (!evaluationMetrics[currentQuestion]) { evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker) } @@ -361,9 +363,11 @@ export async function getResponse(question: string, badContext, allKnowledge, allURLs, - false + false, + evaluationMetrics[currentQuestion].languageStyle ); - schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch) + schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch, + evaluationMetrics[currentQuestion].languageStyle) const generator = new ObjectGeneratorSafe(context.tokenTracker); const result = await generator.generateObject({ model: 'agent', @@ -401,7 +405,7 @@ export async function getResponse(question: string, const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep, evaluationMetrics[currentQuestion], context.tokenTracker); - if (currentQuestion.trim() === question.trim()) { + if (currentQuestion.trim() === question) { if (evaluation.pass) { diaryContext.push(` At step ${step}, you took **answer** action and finally found the answer to the original question: @@ -458,7 +462,7 @@ ${evaluation.think} // reranker? maybe gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); - gaps.push(question.trim()); // always keep the original question in the gaps + gaps.push(question); // always keep the original question in the gaps } badAttempts++; @@ -505,7 +509,7 @@ You will now figure out the answers to these sub-questions and see if they can h `); gaps.push(...newGapQuestions.slice(0, 2)); allQuestions.push(...newGapQuestions.slice(0, 2)); - gaps.push(question.trim()); // always keep the original question in the gaps + gaps.push(question); // always keep the original question in the gaps } else { diaryContext.push(` At step ${step}, you took **reflect** and think about the knowledge gaps. 
You tried to break down the question "${currentQuestion}" into gap-questions like this: ${oldQuestions.join(', ')} @@ -697,10 +701,12 @@ You decided to think out of the box or cut from a completely different angle.`); badContext, allKnowledge, allURLs, - true + true, + evaluationMetrics[question]?.languageStyle || 'same language as the question' ); - schema = getSchema(false, false, true, false); + schema = getSchema(false, false, true, false, + evaluationMetrics[question]?.languageStyle || 'same language as the question'); const generator = new ObjectGeneratorSafe(context.tokenTracker); const result = await generator.generateObject({ model: 'agentBeastMode', @@ -721,7 +727,15 @@ You decided to think out of the box or cut from a completely different angle.`); async function storeContext(prompt: string, schema: any, memory: any[][], step: number) { if ((process as any).asyncLocalContext?.available?.()) { const [context, keywords, questions, knowledge] = memory; - (process as any).asyncLocalContext.ctx.promptContext = { prompt, schema, context, keywords, questions, knowledge, step }; + (process as any).asyncLocalContext.ctx.promptContext = { + prompt, + schema, + context, + keywords, + questions, + knowledge, + step + }; return; } diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index 33d5d81..fdc786a 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -1,12 +1,11 @@ import {z} from 'zod'; import {GenerateObjectResult} from 'ai'; import {TokenTracker} from "../utils/token-tracker"; -import {AnswerAction, EvaluationResponse} from '../types'; +import {AnswerAction, EvaluationCriteria, EvaluationResponse, EvaluationType} from '../types'; import {readUrl, removeAllLineBreaks} from "./read"; import {ObjectGeneratorSafe} from "../utils/safe-generator"; -type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution'; const baseSchema = { pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the 
evaluator'), @@ -301,7 +300,8 @@ Answer: ${JSON.stringify(answer)}`; const questionEvaluationSchema = z.object({ needsFreshness: z.boolean().describe('Whether the question requires freshness check'), needsPlurality: z.boolean().describe('Whether the question requires plurality check'), - reasoning: z.string().describe('Explanation of why these checks are needed or not needed') + reasoning: z.string().describe('Explanation of why these checks are needed or not needed'), + languageStyle: z.string().describe('The language being used and the overall vibe/mood of the question'), }); function getQuestionEvaluationPrompt(question: string): string { @@ -310,6 +310,7 @@ function getQuestionEvaluationPrompt(question: string): string { 1. freshness - Checks if the question is time-sensitive or requires very recent information 2. plurality - Checks if the question asks for multiple items or a specific count or enumeration +3. language style - Identifies both the language used and the overall vibe of the question @@ -326,42 +327,54 @@ If question is a simple greeting, chit-chat, or general knowledge, provide the a - Check for: numbers ("5 examples"), plural nouns, list requests - Look for: "all", "list", "enumerate", "examples", plural forms - Required when question implies completeness ("all the reasons", "every factor") + +3. Language Style Analysis: + Combine both language and emotional vibe in a descriptive phrase, considering: + - Language: The primary language or mix of languages used + - Emotional tone: panic, excitement, frustration, curiosity, etc. + - Formality level: academic, casual, professional, etc. + - Domain context: technical, academic, social, etc. -Question: "Hello, how are you?" +Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭" Evaluation: { - "needsFreshness": false, - "needsPlurality": false, - "reasoning": "Simple greeting, no additional checks needed." 
+ "needsFreshness": false, + "needsPlurality": true, + "reasoning": "Multiple eigenvalues needed but no time-sensitive information required", + "languageStyle": "panicked student English with math jargon" } -Question: "What is the current CEO of OpenAI?" +Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦‍♂️ #MonacoGP" Evaluation: { - "needsFreshness": true, - "needsPlurality": false, - "reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position." + "needsFreshness": true, + "needsPlurality": true, + "reasoning": "Refers to recent race event and requires analysis of multiple strategic decisions", + "languageStyle": "frustrated fan English with F1 terminology" } -Question: "List all the AI companies in Berlin" +Question: "肖老师您好,请您介绍一下最近量子计算领域的三个重大突破,特别是它们在密码学领域的应用价值吗?🤔" Evaluation: { - "needsFreshness": false, - "needsPlurality": true, - "reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive." + "needsFreshness": true, + "needsPlurality": true, + "reasoning": "Asks for recent breakthroughs (freshness) and specifically requests three examples (plurality)", + "languageStyle": "formal technical Chinese with academic undertones" } -Question: "What are the top 5 latest AI models released by OpenAI?" +Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤" Evaluation: { - "needsFreshness": true, - "needsPlurality": true, - "reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items." + "needsFreshness": false, + "needsPlurality": true, + "reasoning": "Requires comprehensive debugging analysis of multiple potential issues", + "languageStyle": "frustrated German-English tech slang" } -Question: "Who created Python?" 
+Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis." Evaluation: { - "needsFreshness": false, - "needsPlurality": false, - "reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed." + "needsFreshness": true, + "needsPlurality": true, + "reasoning": "Requires analysis of current impacts (freshness) across multiple dimensions: sociopolitical, cultural, and linguistic (plurality)", + "languageStyle": "formal academic English with sociological terminology" } @@ -374,7 +387,7 @@ const TOOL_NAME = 'evaluator'; export async function evaluateQuestion( question: string, tracker?: TokenTracker -): Promise { +): Promise { try { const generator = new ObjectGeneratorSafe(tracker); @@ -394,12 +407,12 @@ export async function evaluateQuestion( console.log('Question Metrics:', types); // Always evaluate definitive first, then freshness (if needed), then plurality (if needed) - return types; + return {types, languageStyle: result.object.languageStyle}; } catch (error) { console.error('Error in question evaluation:', error); // Default to all evaluation types in case of error - return ['definitive', 'freshness', 'plurality']; + return {types: ['definitive', 'freshness', 'plurality'], languageStyle: 'plain English'}; } } @@ -430,17 +443,17 @@ async function performEvaluation( export async function evaluateAnswer( question: string, action: AnswerAction, - evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'], + evaluationCri: EvaluationCriteria, tracker?: TokenTracker ): Promise<{ response: EvaluationResponse }> { let result; // Only add attribution if we have valid references if (action.references && action.references.length > 0) { - evaluationOrder = ['attribution', ...evaluationOrder]; + evaluationCri.types = ['attribution', 
...evaluationCri.types]; } - for (const evaluationType of evaluationOrder) { + for (const evaluationType of evaluationCri.types) { switch (evaluationType) { case 'attribution': { // Safely handle references and ensure we have content diff --git a/src/types.ts b/src/types.ts index f3f9f0d..12ac169 100644 --- a/src/types.ts +++ b/src/types.ts @@ -45,6 +45,11 @@ export type VisitAction = BaseAction & { export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction; +export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution'; +export type EvaluationCriteria = { + types: EvaluationType[]; + languageStyle: string; +}; // Following Vercel AI SDK's token counting interface export interface TokenUsage {