feat: improved evaluators

2026-03-22 07:29:35 +08:00 · 2025-02-06 21:36:32 +08:00
parent a5e5627823
commit 906424f015
3 changed files with 282 additions and 136 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
  if (allowSearch) {
    actions.push("search");
-    properties.searchQuery = z.string()
+    properties.searchQuery = z.string().max(30)
      .describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
  }
@@ -356,39 +356,24 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
    // execute the step and action
    if (thisStep.action === 'answer') {
      if (step === 1) {
        // LLM is so confident and answer immediately, skip all evaluations
        isAnswered = true;
        break
      }
      updateContext({
        totalStep,
        question: currentQuestion,
        ...thisStep,
      });
-      const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
+      const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
-
+        ['definitive', 'freshness', 'plurality'], context.tokenTracker);
      if (currentQuestion === question) {
-        if (badAttempts >= maxBadAttempts) {
+        if (evaluation.pass) {
          // EXIT POINT OF THE PROGRAM!!!!
          diaryContext.push(`
 At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
 Original question: 
 ${currentQuestion}
 Your answer: 
 ${thisStep.answer}
 The evaluator thinks your answer is good because: 
 ${evaluation.reasoning}
 Your journey ends here.
 `);
          isAnswered = false;
          break
        }
        if (evaluation.is_definitive) {
          if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
            // EXIT POINT OF THE PROGRAM!!!!
            diaryContext.push(`
 At step ${step}, you took **answer** action and finally found the answer to the original question:
 Original question: 
@@ -398,31 +383,18 @@ Your answer:
 ${thisStep.answer}
 The evaluator thinks your answer is good because: 
-${evaluation.reasoning}
+${evaluation.think}
 Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
 `);
-            isAnswered = true;
+          isAnswered = true;
          break
        } else {
          if (badAttempts >= maxBadAttempts) {
            isAnswered = false;
            break
          } else {
            diaryContext.push(`
 At step ${step}, you took **answer** action and finally found the answer to the original question:
 Original question: 
 ${currentQuestion}
 Your answer: 
 ${thisStep.answer}
 Unfortunately, you did not provide any references to support your answer. 
 You need to find more URL references to support your answer.`);
          }
          isAnswered = true;
          break
        } else {
          diaryContext.push(`
 At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
 Original question: 
@@ -432,23 +404,31 @@ Your answer:
 ${thisStep.answer}
 The evaluator thinks your answer is bad because: 
-${evaluation.reasoning}
+${evaluation.think}
 `);
-          // store the bad context and reset the diary context
+            // store the bad context and reset the diary context
-          const {response: errorAnalysis} = await analyzeSteps(diaryContext);
+            const {response: errorAnalysis} = await analyzeSteps(diaryContext);
-          badContext.push({
+            allKnowledge.push({
-            question: currentQuestion,
+              question: currentQuestion,
-            answer: thisStep.answer,
+              answer: thisStep.answer,
-            evaluation: evaluation.reasoning,
+              references: thisStep.references,
-            ...errorAnalysis
+              type: 'qa'
-          });
+            });
-          badAttempts++;
+
-          allowAnswer = false;  // disable answer action in the immediate next step
+            badContext.push({
-          diaryContext = [];
+              question: currentQuestion,
-          step = 0;
+              answer: thisStep.answer,
              evaluation: evaluation.think,
              ...errorAnalysis
            });
            badAttempts++;
            allowAnswer = false;  // disable answer action in the immediate next step
            diaryContext = [];
            step = 0;
          }
        }
-      } else if (evaluation.is_definitive) {
+      } else if (evaluation.pass) {
        diaryContext.push(`
 At step ${step}, you took **answer** action. You found a good answer to the sub-question:
@@ -459,7 +439,7 @@ Your answer:
 ${thisStep.answer}
 The evaluator thinks your answer is good because: 
-${evaluation.reasoning}
+${evaluation.think}
 Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
 `);
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';
 // Model handle for the 'evaluator' tool; it is the `model` passed to the generateObject calls in evaluateAnswer.
 const model = getModel('evaluator');
-const responseSchema = z.object({
+type EvaluationType = 'definitive' | 'freshness' | 'plurality';
-  is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
+
-  reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
+const baseSchema = {
  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
  think: z.string().describe('Explanation the thought process why the answer does not pass the evaluation criteria')
 };
 /**
  * Result shape for the 'definitive' evaluation: the shared baseSchema fields
  * (`pass`, `think`) plus a literal `type` tag fixed to 'definitive'.
  */
 const definitiveSchema = z.object({
  ...baseSchema,
  type: z.literal('definitive')
 });
-function getPrompt(question: string, answer: string): string {
+const freshnessSchema = z.object({
  ...baseSchema,
  type: z.literal('freshness'),
  freshness_analysis: z.object({
    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
    current_time: z.string().describe('Current system time when evaluation was performed'),
    max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
  })
 });
 /**
  * Result shape for the 'plurality' evaluation: the shared baseSchema fields,
  * a literal `type` tag fixed to 'plurality', and a nested `plurality_analysis`
  * object. Within that object only `count_expected` is optional; the other
  * three fields are required.
  */
 const pluralitySchema = z.object({
  ...baseSchema,
  type: z.literal('plurality'),
  plurality_analysis: z.object({
    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
    count_expected: z.number().optional().describe('Number of items expected if specified in question'),
    count_provided: z.number().describe('Number of items provided in answer')
  })
 });
 function getDefinitivePrompt(question: string, answer: string): string {
  return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
 <rules>
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
  5. Non-answers that suggest alternatives
 </rules>
 <examples>
 Question: "What are the system requirements for running Python 3.9?"
 Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
 Evaluation: {
-  "is_definitive": false,
+  "pass": false,
-  "reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
+  "think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
 }
 Question: "What are the system requirements for running Python 3.9?"
 Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
 Evaluation: {
-  "is_definitive": true,
+  "pass": true,
-  "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
+  "think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
 }
 Question: "Who will be the president of the United States in 2032?"
 Answer: "I cannot predict the future, it depends on the election results."
 Evaluation: {
-  "is_definitive": false,
+  "pass": false,
-  "reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
+  "think": "The answer contains a statement of inability to predict the future, making it non-definitive."
 }
 Question: "Who is the sales director at Company X?"
 Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
 Evaluation: {
-  "is_definitive": false,
+  "pass": false,
-  "reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
+  "think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
 }
 Question: "what is the twitter account of jina ai's founder?"
 Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
 Evaluation: {
-  "is_definitive": false,
+  "pass": false,
-  "reasoning": "The answer indicates a lack of information rather than providing a definitive response."
+  "think": "The answer indicates a lack of information rather than providing a definitive response."
 }
 </examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
 }
-export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
+function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
-  try {
+  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
-    const prompt = getPrompt(question, answer);
+
-    let object;
+<rules>
-    let totalTokens = 0;
+1. Date Analysis:
   - Extract all dates mentioned in the answer
   - Compare against current system time: ${currentTime}
   - Consider content outdated if:
     * It refers to a "latest" or "current" state from more than 30 days ago
     * It mentions specific dates/events that have been superseded
     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
   - For product versions, releases, or announcements, max age is 30 days
   - For company positions, leadership, or general facts, max age is 60 days
 2. Context Hints:
   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
   - Time-sensitive terms: "CEO", "price", "version", "release"
   - Future dates should be ignored in outdated calculation
 </rules>
 <examples>
 Question: "What is Jina AI's latest embedding model?"
 Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
 Current Time: "2024-10-06T00:00:00Z"
 Evaluation: {
  "pass": false,
  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
  "freshness_analysis": {
    "likely_outdated": true,
    "dates_mentioned": ["2024-03-15"],
    "current_time": "2024-10-06T00:00:00Z",
    "max_age_days": 30
  }
 }
 Question: "Who is OpenAI's CEO?"
 Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
 Current Time: "2024-02-06T00:00:00Z"
 Evaluation: {
  "pass": true,
  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
  "freshness_analysis": {
    "likely_outdated": false,
    "dates_mentioned": ["2023-12"],
    "current_time": "2024-02-06T00:00:00Z",
    "max_age_days": 60
  }
 }
 </examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
 }
 /**
  * Builds the prompt text for the plurality evaluation.
  *
  * The returned string contains a <rules> section, an <examples> section with
  * four worked question/answer/evaluation samples, and finally the pair to
  * evaluate. Both inputs are embedded through JSON.stringify, so they appear
  * in the prompt as quoted, escaped JSON string literals.
  *
  * NOTE(review): rule 3 is titled "Definitiveness Rules" and phrases failures
  * as "NOT definitive", while the examples report failures through
  * `"pass": false` — confirm the wording is intentional for this evaluator.
  *
  * @param question - The question that was asked.
  * @param answer - The candidate answer to be judged against the question.
  * @returns The complete prompt string.
  */
 function getPluralityPrompt(question: string, answer: string): string {
  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
 <rules>
 1. Question Analysis:
   - Check if question asks for multiple items using indicators like:
     * Plural nouns: "companies", "people", "names"
     * Quantifiers: "all", "many", "several", "various", "multiple"
     * List requests: "list", "enumerate", "name all", "give me all"
     * Numbers: "5 examples", "top 10"
   - Otherwise skip the analysis and return pass to true
 2. Answer Analysis:
   - Count distinct items provided in the answer
   - Check if answer uses limiting words like "only", "just", "single"
   - Identify if answer acknowledges there are more items but only provides some
 3. Definitiveness Rules:
   - If question asks for multiple items but answer provides only one → NOT definitive
   - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
   - If answer clearly states it's providing a partial list → NOT definitive
   - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
 </rules>
 <examples>
 Question: "Who works in Jina AI's sales team?"
 Answer: "John Smith is a sales representative at Jina AI."
 Evaluation: {
  "pass": true,
  "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
  "plurality_analysis": {
    "expects_multiple": false,
    "provides_multiple": false,
    "count_provided": 1
  }
 }
 Question: "List all the salespeople who work at Jina AI"
 Answer: "John Smith is a sales representative at Jina AI."
 Evaluation: {
  "pass": false,
  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": false,
    "count_provided": 1
  }
 }
 Question: "Name the top 3 products sold by Jina AI"
 Answer: "Jina AI's product lineup includes DocArray and Jina."
 Evaluation: {
  "pass": false,
  "think": "The question asks for top 3 products but only 2 are provided.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_expected": 3,
    "count_provided": 2
  }
 }
 Question: "List as many AI companies in Berlin as you can find"
 Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
 Evaluation: {
  "pass": false,
  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_provided": 5
  }
 }
 </examples>
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
 }
 export async function evaluateAnswer(
  question: string,
  answer: string,
  evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
  tracker?: TokenTracker
 ): Promise<{ response: EvaluationResponse }> {
  let result;
  for (const evaluationType of evaluationOrder) {
    try {
-      const result = await generateObject({
+      switch (evaluationType) {
-        model,
+        case 'definitive':
-        schema: responseSchema,
+          result = await generateObject({
-        prompt,
+            model,
-        maxTokens: getMaxTokens('evaluator')
+            schema: definitiveSchema,
-      });
+            prompt: getDefinitivePrompt(question, answer),
-      object = result.object;
+            maxTokens: getMaxTokens('evaluator')
-      totalTokens = result.usage?.totalTokens || 0;
+          });
          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
          console.log('Evaluation:', result.object);
          if (!result.object.pass) {
            return { response: result.object };
          }
          break;
        case 'freshness':
          result = await generateObject({
            model,
            schema: freshnessSchema,
            prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
            maxTokens: getMaxTokens('evaluator')
          });
          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
          console.log('Evaluation:', result.object);
          if (!result.object.pass) {
            return { response: result.object };
          }
          break;
        case 'plurality':
          result = await generateObject({
            model,
            schema: pluralitySchema,
            prompt: getPluralityPrompt(question, answer),
            maxTokens: getMaxTokens('evaluator')
          });
          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
          console.log('Evaluation:', result.object);
          if (!result.object.pass) {
            return { response: result.object };
          }
          break;
      }
    } catch (error) {
-      const result = await handleGenerateObjectError<EvaluationResponse>(error);
+      console.error(`Error in ${evaluationType} evaluation:`, error);
-      object = result.object;
+      const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
-      totalTokens = result.totalTokens;
+      (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
      if (!errorResult.object.pass) {
        return { response: errorResult.object };
      }
    }
    console.log('Evaluation:', {
      definitive: object.is_definitive,
      reason: object.reasoning
    });
    (tracker || new TokenTracker()).trackUsage('evaluator', totalTokens);
    return { response: object, tokens: totalTokens };
  } catch (error) {
    console.error('Error in answer evaluation:', error);
    throw error;
  }
 }
 // Example usage
 async function main() {
  const question = process.argv[2] || '';
  const answer = process.argv[3] || '';
  if (!question || !answer) {
    console.error('Please provide both question and answer as command line arguments');
    process.exit(1);
  }
-  try {
+  return { response: result!.object };
    await evaluateAnswer(question, answer);
  } catch (error) {
    console.error('Failed to evaluate answer:', error);
  }
 }
 // Run the CLI example only when this file is the entry script, not when it is imported by another module.
 if (require.main === module) {
  // Any rejection that escapes main() is printed through console.error.
  main().catch(console.error);
 }
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,18 +1,3 @@
 import { z } from 'zod';
 /** Plain string schema whose description labels it as strategic reasoning about the process. */
 export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
 /**
  * String schema for a search query, capped with `.max(30)`.
  *
  * NOTE(review): `.max(30)` sets an upper bound of 30 characters, while the
  * description text says "less than 30" — confirm which limit is intended.
  */
 export const QuerySchema = z.string()
  .max(30)
  .describe('Search query, must be less than 30 characters');
 /** String schema with zod's `.url()` check applied. */
 export const URLSchema = z.string().url();
 /**
  * A single citation: a quote taken from a document together with that
  * document's URL. The `url` field reuses URLSchema, so it carries the same
  * URL check.
  */
 export const ReferenceSchema = z.object({
  exactQuote: z.string().describe('Exact relevant quote from the document'),
  url: URLSchema.describe('URL of the document')
 });
 // Action Types
 type BaseAction = {
  action: "search" | "answer" | "reflect" | "visit";
@@ -96,9 +81,12 @@ export interface ReadResponse {
  readableMessage?: string;
 }
 export type EvaluationResponse = {
-  is_definitive: boolean;
+  pass: boolean;
-  reasoning: string;
+  think: string;
 };
 export type ErrorAnalysisResponse = {