feat: improved evaluators

2026-03-22 07:29:35 +08:00 · 2025-02-06 21:36:32 +08:00
parent a5e5627823
commit 906424f015
3 changed files with 282 additions and 136 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole

  if (allowSearch) {
    actions.push("search");
-    properties.searchQuery = z.string()
+    properties.searchQuery = z.string().max(30)
      .describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
  }

@@ -356,38 +356,23 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_

    // execute the step and action
    if (thisStep.action === 'answer') {
+      if (step === 1) {
+        // The LLM is confident enough to answer immediately, so skip all evaluations
+        isAnswered = true;
+        break
+      }
+
      updateContext({
        totalStep,
        question: currentQuestion,
        ...thisStep,
      });

-      const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
-
+      const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
+        ['definitive', 'freshness', 'plurality'], context.tokenTracker);

      if (currentQuestion === question) {
-        if (badAttempts >= maxBadAttempts) {
-          // EXIT POINT OF THE PROGRAM!!!!
-          diaryContext.push(`
-At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
-
-Original question: 
-${currentQuestion}
-
-Your answer: 
-${thisStep.answer}
-
-The evaluator thinks your answer is good because: 
-${evaluation.reasoning}
-
-Your journey ends here.
-`);
-          isAnswered = false;
-          break
-        }
-        if (evaluation.is_definitive) {
-          if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
-            // EXIT POINT OF THE PROGRAM!!!!
+        if (evaluation.pass) {
          diaryContext.push(`
 At step ${step}, you took **answer** action and finally found the answer to the original question:

@@ -398,29 +383,16 @@ Your answer:
 ${thisStep.answer}

 The evaluator thinks your answer is good because: 
-${evaluation.reasoning}
+${evaluation.think}

 Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
 `);
          isAnswered = true;
          break
        } else {
-            diaryContext.push(`
-At step ${step}, you took **answer** action and finally found the answer to the original question:
-
-Original question: 
-${currentQuestion}
-
-Your answer: 
-${thisStep.answer}
-
-Unfortunately, you did not provide any references to support your answer. 
-You need to find more URL references to support your answer.`);
-          }
-
-          isAnswered = true;
+          if (badAttempts >= maxBadAttempts) {
+            isAnswered = false;
            break
-
          } else {
            diaryContext.push(`
 At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
@@ -432,15 +404,22 @@ Your answer:
 ${thisStep.answer}

 The evaluator thinks your answer is bad because: 
-${evaluation.reasoning}
+${evaluation.think}
 `);
            // store the bad context and reset the diary context
            const {response: errorAnalysis} = await analyzeSteps(diaryContext);

+            allKnowledge.push({
+              question: currentQuestion,
+              answer: thisStep.answer,
+              references: thisStep.references,
+              type: 'qa'
+            });
+
            badContext.push({
              question: currentQuestion,
              answer: thisStep.answer,
-            evaluation: evaluation.reasoning,
+              evaluation: evaluation.think,
              ...errorAnalysis
            });
            badAttempts++;
@@ -448,7 +427,8 @@ ${evaluation.reasoning}
            diaryContext = [];
            step = 0;
          }
-      } else if (evaluation.is_definitive) {
+        }
+      } else if (evaluation.pass) {
        diaryContext.push(`
 At step ${step}, you took **answer** action. You found a good answer to the sub-question:

@@ -459,7 +439,7 @@ Your answer:
 ${thisStep.answer}

 The evaluator thinks your answer is good because: 
-${evaluation.reasoning}
+${evaluation.think}

 Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
 `);
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';

 const model = getModel('evaluator');

-const responseSchema = z.object({
-  is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
-  reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
+type EvaluationType = 'definitive' | 'freshness' | 'plurality';
+
+const baseSchema = {
+  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
+  think: z.string().describe('Explanation the thought process why the answer does not pass the evaluation criteria')
+};
+
+const definitiveSchema = z.object({
+  ...baseSchema,
+  type: z.literal('definitive')
 });

-function getPrompt(question: string, answer: string): string {
+const freshnessSchema = z.object({
+  ...baseSchema,
+  type: z.literal('freshness'),
+  freshness_analysis: z.object({
+    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
+    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
+    current_time: z.string().describe('Current system time when evaluation was performed'),
+    max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
+  })
+});
+
+const pluralitySchema = z.object({
+  ...baseSchema,
+  type: z.literal('plurality'),
+  plurality_analysis: z.object({
+    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
+    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
+    count_expected: z.number().optional().describe('Number of items expected if specified in question'),
+    count_provided: z.number().describe('Number of items provided in answer')
+  })
+});
+
+function getDefinitivePrompt(question: string, answer: string): string {
  return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.

 <rules>
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
  5. Non-answers that suggest alternatives
 </rules>

-
 <examples>
 Question: "What are the system requirements for running Python 3.9?"
 Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
 Evaluation: {
-  "is_definitive": false,
-  "reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
+  "pass": false,
+  "think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
 }

 Question: "What are the system requirements for running Python 3.9?"
 Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
 Evaluation: {
-  "is_definitive": true,
-  "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
+  "pass": true,
+  "think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
 }

 Question: "Who will be the president of the United States in 2032?"
 Answer: "I cannot predict the future, it depends on the election results."
 Evaluation: {
-  "is_definitive": false,
-  "reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
+  "pass": false,
+  "think": "The answer contains a statement of inability to predict the future, making it non-definitive."
 }

 Question: "Who is the sales director at Company X?"
 Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
 Evaluation: {
-  "is_definitive": false,
-  "reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
+  "pass": false,
+  "think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
 }

 Question: "what is the twitter account of jina ai's founder?"
 Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
 Evaluation: {
-  "is_definitive": false,
-  "reasoning": "The answer indicates a lack of information rather than providing a definitive response."
+  "pass": false,
+  "think": "The answer indicates a lack of information rather than providing a definitive response."
 }
 </examples>
+
 Now evaluate this pair:
 Question: ${JSON.stringify(question)}
 Answer: ${JSON.stringify(answer)}`;
 }

-export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
+function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
+  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
+
+<rules>
+1. Date Analysis:
+   - Extract all dates mentioned in the answer
+   - Compare against current system time: ${currentTime}
+   - Consider content outdated if:
+     * It refers to a "latest" or "current" state from more than 30 days ago
+     * It mentions specific dates/events that have been superseded
+     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
+   - For product versions, releases, or announcements, max age is 30 days
+   - For company positions, leadership, or general facts, max age is 60 days
+
+2. Context Hints:
+   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
+   - Time-sensitive terms: "CEO", "price", "version", "release"
+   - Future dates should be ignored in outdated calculation
+</rules>
+
+<examples>
+Question: "What is Jina AI's latest embedding model?"
+Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
+Current Time: "2024-10-06T00:00:00Z"
+Evaluation: {
+  "pass": false,
+  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
+  "freshness_analysis": {
+    "likely_outdated": true,
+    "dates_mentioned": ["2024-03-15"],
+    "current_time": "2024-10-06T00:00:00Z",
+    "max_age_days": 30
+  }
+}
+
+Question: "Who is OpenAI's CEO?"
+Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
+Current Time: "2024-02-06T00:00:00Z"
+Evaluation: {
+  "pass": true,
+  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
+  "freshness_analysis": {
+    "likely_outdated": false,
+    "dates_mentioned": ["2023-12"],
+    "current_time": "2024-02-06T00:00:00Z",
+    "max_age_days": 60
+  }
+}
+</examples>
+
+Now evaluate this pair:
+Question: ${JSON.stringify(question)}
+Answer: ${JSON.stringify(answer)}`;
+}
+
+function getPluralityPrompt(question: string, answer: string): string {
+  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
+
+<rules>
+1. Question Analysis:
+   - Check if question asks for multiple items using indicators like:
+     * Plural nouns: "companies", "people", "names"
+     * Quantifiers: "all", "many", "several", "various", "multiple"
+     * List requests: "list", "enumerate", "name all", "give me all"
+     * Numbers: "5 examples", "top 10"
+   - Otherwise skip the analysis and return pass to true
+
+2. Answer Analysis:
+   - Count distinct items provided in the answer
+   - Check if answer uses limiting words like "only", "just", "single"
+   - Identify if answer acknowledges there are more items but only provides some
+
+3. Definitiveness Rules:
+   - If question asks for multiple items but answer provides only one → NOT definitive
+   - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
+   - If answer clearly states it's providing a partial list → NOT definitive
+   - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
+</rules>
+
+<examples>
+Question: "Who works in Jina AI's sales team?"
+Answer: "John Smith is a sales representative at Jina AI."
+Evaluation: {
+  "pass": true,
+  "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
+  "plurality_analysis": {
+    "expects_multiple": false,
+    "provides_multiple": false,
+    "count_provided": 1
+  }
+}
+
+Question: "List all the salespeople who work at Jina AI"
+Answer: "John Smith is a sales representative at Jina AI."
+Evaluation: {
+  "pass": false,
+  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
+  "plurality_analysis": {
+    "expects_multiple": true,
+    "provides_multiple": false,
+    "count_provided": 1
+  }
+}
+
+Question: "Name the top 3 products sold by Jina AI"
+Answer: "Jina AI's product lineup includes DocArray and Jina."
+Evaluation: {
+  "pass": false,
+  "think": "The question asks for top 3 products but only 2 are provided.",
+  "plurality_analysis": {
+    "expects_multiple": true,
+    "provides_multiple": true,
+    "count_expected": 3,
+    "count_provided": 2
+  }
+}
+
+Question: "List as many AI companies in Berlin as you can find"
+Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
+Evaluation: {
+  "pass": false,
+  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
+  "plurality_analysis": {
+    "expects_multiple": true,
+    "provides_multiple": true,
+    "count_provided": 5
+  }
+}
+</examples>
+
+Now evaluate this pair:
+Question: ${JSON.stringify(question)}
+Answer: ${JSON.stringify(answer)}`;
+}
+
+export async function evaluateAnswer(
+  question: string,
+  answer: string,
+  evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
+  tracker?: TokenTracker
+): Promise<{ response: EvaluationResponse }> {
+  let result;
+
+  for (const evaluationType of evaluationOrder) {
    try {
-    const prompt = getPrompt(question, answer);
-    let object;
-    let totalTokens = 0;
-    try {
-      const result = await generateObject({
+      switch (evaluationType) {
+        case 'definitive':
+          result = await generateObject({
            model,
-        schema: responseSchema,
-        prompt,
+            schema: definitiveSchema,
+            prompt: getDefinitivePrompt(question, answer),
            maxTokens: getMaxTokens('evaluator')
          });
-      object = result.object;
-      totalTokens = result.usage?.totalTokens || 0;
-    } catch (error) {
-      const result = await handleGenerateObjectError<EvaluationResponse>(error);
-      object = result.object;
-      totalTokens = result.totalTokens;
+          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+          console.log('Evaluation:', result.object);
+          if (!result.object.pass) {
+            return { response: result.object };
          }
-    console.log('Evaluation:', {
-      definitive: object.is_definitive,
-      reason: object.reasoning
+          break;
+
+        case 'freshness':
+          result = await generateObject({
+            model,
+            schema: freshnessSchema,
+            prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
+            maxTokens: getMaxTokens('evaluator')
          });
-    (tracker || new TokenTracker()).trackUsage('evaluator', totalTokens);
-    return { response: object, tokens: totalTokens };
+          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+          console.log('Evaluation:', result.object);
+          if (!result.object.pass) {
+            return { response: result.object };
+          }
+          break;
+
+        case 'plurality':
+          result = await generateObject({
+            model,
+            schema: pluralitySchema,
+            prompt: getPluralityPrompt(question, answer),
+            maxTokens: getMaxTokens('evaluator')
+          });
+          (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+          console.log('Evaluation:', result.object);
+          if (!result.object.pass) {
+            return { response: result.object };
+          }
+          break;
+      }
    } catch (error) {
-    console.error('Error in answer evaluation:', error);
-    throw error;
+      console.error(`Error in ${evaluationType} evaluation:`, error);
+      const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
+      (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
+      if (!errorResult.object.pass) {
+        return { response: errorResult.object };
+      }
    }
-}
-
-// Example usage
-async function main() {
-  const question = process.argv[2] || '';
-  const answer = process.argv[3] || '';
-
-  if (!question || !answer) {
-    console.error('Please provide both question and answer as command line arguments');
-    process.exit(1);
  }

-  try {
-    await evaluateAnswer(question, answer);
-  } catch (error) {
-    console.error('Failed to evaluate answer:', error);
-  }
-}
-
-if (require.main === module) {
-  main().catch(console.error);
+  return { response: result!.object };
 }
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,18 +1,3 @@
-import { z } from 'zod';
-
-export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
-
-export const QuerySchema = z.string()
-  .max(30)
-  .describe('Search query, must be less than 30 characters');
-
-export const URLSchema = z.string().url();
-
-export const ReferenceSchema = z.object({
-  exactQuote: z.string().describe('Exact relevant quote from the document'),
-  url: URLSchema.describe('URL of the document')
-});
-
 // Action Types
 type BaseAction = {
  action: "search" | "answer" | "reflect" | "visit";
@@ -96,9 +81,12 @@ export interface ReadResponse {
  readableMessage?: string;
 }

+
+
+
 export type EvaluationResponse = {
-  is_definitive: boolean;
-  reasoning: string;
+  pass: boolean;
+  think: string;
 };

 export type ErrorAnalysisResponse = {