fix: eval logic

This commit is contained in:
Han Xiao 2025-03-20 10:49:57 +08:00
parent 8fa9aaa151
commit 4523f10283
4 changed files with 47 additions and 50 deletions

View File

@ -17,7 +17,7 @@ import {
SearchResult,
EvaluationType,
BoostedSearchSnippet,
SearchSnippet, EvaluationResponse, Reference, SERPQuery
SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType
} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
@ -353,10 +353,13 @@ async function executeSearchQueries(
};
}
/**
 * Reports whether a check of the given evaluation type is still present
 * in the list of repeat-evaluation checks.
 */
function includesEval(allChecks: RepeatEvaluationType[], evalType: EvaluationType): boolean {
  for (const check of allChecks) {
    if (check.type === evalType) {
      return true;
    }
  }
  return false;
}
export async function getResponse(question?: string,
tokenBudget: number = 1_000_000,
maxBadAttempts: number = 3,
maxBadAttempts: number = 2,
existingContext?: Partial<TrackerContext>,
messages?: Array<CoreMessage>,
numReturnedURLs: number = 100,
@ -367,7 +370,6 @@ export async function getResponse(question?: string,
let step = 0;
let totalStep = 0;
let badAttempts = 0;
question = question?.trim() as string;
// remove incoming system messages to avoid override
@ -409,19 +411,18 @@ export async function getResponse(question?: string,
let allowReflect = true;
let allowCoding = false;
let system = '';
let maxStrictEvals = Math.max(1, Math.min(3, maxBadAttempts - 1));
let msgWithKnowledge: CoreMessage[] = [];
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
const allURLs: Record<string, SearchSnippet> = {};
const visitedURLs: string[] = [];
const badURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationType[]> = {};
const evaluationMetrics: Record<string, RepeatEvaluationType[]> = {};
// reserve the 10% final budget for the beast mode
const regularBudget = tokenBudget * 0.85;
const finalAnswerPIP: string[] = [];
let trivialQuestion = false;
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget && badAttempts <= maxBadAttempts) {
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
// add 1s delay to avoid rate limiting
step++;
totalStep++;
@ -438,14 +439,19 @@ export async function getResponse(question?: string,
if (currentQuestion.trim() === question && totalStep === 1) {
// only add evaluation for initial question, once at step 1
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
(await evaluateQuestion(currentQuestion, context, SchemaGen)).map(e => {
return {
type: e,
numEvalsRequired: maxBadAttempts
} as RepeatEvaluationType
})
// force strict eval for the original question, at last, only once.
evaluationMetrics[currentQuestion].push('strict')
evaluationMetrics[currentQuestion].push({type: 'strict', numEvalsRequired: maxBadAttempts});
} else if (currentQuestion.trim() !== question) {
evaluationMetrics[currentQuestion] = []
}
if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
if (totalStep === 1 && includesEval(evaluationMetrics[currentQuestion], 'freshness')) {
// if it detects freshness, avoid direct answer at step 1
allowAnswer = false;
allowReflect = false;
@ -501,7 +507,7 @@ export async function getResponse(question?: string,
console.log(`${currentQuestion}: ${thisStep.action} <- [${actionsStr}]`);
console.log(thisStep)
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.actionTracker.trackAction({totalStep, thisStep, gaps});
// reset allow* to true
allowAnswer = true;
@ -554,7 +560,7 @@ export async function getResponse(question?: string,
evaluation = await evaluateAnswer(
currentQuestion,
thisStep,
evaluationMetrics[currentQuestion],
evaluationMetrics[currentQuestion].map(e => e.type),
context,
allKnowledge,
SchemaGen
@ -583,20 +589,25 @@ Your journey ends here. You have successfully answered the original question. Co
thisStep.isFinal = true;
break
} else {
// lower numEvalsRequired for the failed evaluation and if numEvalsRequired is 0, remove it from the evaluation metrics
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].map(e => {
if (e.type === evaluation.type) {
e.numEvalsRequired--;
}
return e;
}).filter(e => e.numEvalsRequired > 0);
if (evaluation.type === 'strict' && evaluation.improvement_plan) {
finalAnswerPIP.push(evaluation.improvement_plan);
maxStrictEvals--;
if (maxStrictEvals <= 0) {
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
}
}
if (badAttempts >= maxBadAttempts) {
if (evaluationMetrics[currentQuestion].length === 0) {
// failed so many times, give up, route to beast mode
thisStep.isFinal = false;
break
} else {
diaryContext.push(`
}
diaryContext.push(`
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
Original question:
@ -608,11 +619,11 @@ ${thisStep.answer}
The evaluator thinks your answer is bad because:
${evaluation.think}
`);
// store the bad context and reset the diary context
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
// store the bad context and reset the diary context
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
allKnowledge.push({
question: `
allKnowledge.push({
question: `
Why is the following answer bad for the question? Please reflect
<question>
@ -623,7 +634,7 @@ ${currentQuestion}
${thisStep.answer}
</answer>
`,
answer: `
answer: `
${evaluation.think}
${errorAnalysis.recap}
@ -632,14 +643,12 @@ ${errorAnalysis.blame}
${errorAnalysis.improvement}
`,
type: 'qa',
})
type: 'qa',
})
badAttempts++;
allowAnswer = false; // disable answer action in the immediate next step
diaryContext = [];
step = 0;
}
allowAnswer = false; // disable answer action in the immediate next step
diaryContext = [];
step = 0;
}
} else if (evaluation.pass) {
// solved a gap question
@ -913,7 +922,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
} as AnswerAction;
await updateReferences(thisStep, allURLs);
(thisStep as AnswerAction).isFinal = true;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.actionTracker.trackAction({totalStep, thisStep, gaps});
}
if (!trivialQuestion) {

View File

@ -634,26 +634,13 @@ export async function evaluateAnswer(
for (const evaluationType of evaluationTypes) {
let prompt: { system: string; user: string } | undefined
switch (evaluationType) {
// case 'attribution': {
// if (allKnowledge.length === 0) {
// return {
// pass: false,
// think: `The knowledge is completely empty and the answer can not be derived from it. Need to found some other references and URLs`,
// type: 'attribution',
// };
// }
// prompt = getAttributionPrompt(question, action.answer, allKnowledge);
// break;
// }
case 'definitive':
prompt = getDefinitivePrompt(question, action.answer);
break;
case 'freshness':
prompt = getFreshnessPrompt(question, action, new Date().toISOString());
break;
case 'plurality':
prompt = getPluralityPrompt(question, action.answer);
break;
@ -681,6 +668,6 @@ export async function evaluateAnswer(
}
}
return result?.object as EvaluationResponse;
return result?.object as EvaluationResponse;
}

View File

@ -63,6 +63,10 @@ export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitActi
export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution' | 'completeness' | 'strict';
/**
 * An evaluation criterion paired with a retry budget. In the main loop,
 * numEvalsRequired is decremented each time the answer fails this check,
 * and the check is dropped once it reaches zero.
 */
export type RepeatEvaluationType = {
// Which evaluation criterion to apply (e.g. 'definitive', 'freshness', 'strict').
type: EvaluationType;
// Remaining failed evaluations tolerated before this check is removed.
numEvalsRequired: number;
}
// Following Vercel AI SDK's token counting interface
export interface TokenUsage {

View File

@ -5,7 +5,6 @@ import {getI18nText} from "./text-tools";
// Snapshot of the tracker's progress, emitted/stored by ActionTracker.
// NOTE(review): this is the pre-change shape — the surrounding diff shows
// `badAttempts` being removed from this interface by this commit.
interface ActionState {
// The most recent step action taken (answer/search/reflect/visit).
thisStep: StepAction;
// Open gap questions still to be resolved.
gaps: string[];
// Count of failed answer attempts (removed by this commit — see diff context).
badAttempts: number;
// Monotonic counter of steps taken across all questions.
totalStep: number;
}
@ -14,7 +13,6 @@ export class ActionTracker extends EventEmitter {
private state: ActionState = {
thisStep: {action: 'answer', answer: '', references: [], think: ''},
gaps: [],
badAttempts: 0,
totalStep: 0
};
@ -39,7 +37,6 @@ export class ActionTracker extends EventEmitter {
this.state = {
thisStep: {action: 'answer', answer: '', references: [], think: ''},
gaps: [],
badAttempts: 0,
totalStep: 0
};
}