fix: strict evaluator

2026-03-22 15:39:06 +08:00 · 2025-03-14 11:57:02 +08:00
parent de640b7b29
commit b0c07162dd
9 changed files with 208 additions and 58 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -308,17 +308,17 @@ export async function getResponse(question?: string,
    //   evaluationMetrics[currentQuestion] =
    //     await evaluateQuestion(currentQuestion, context, SchemaGen)
    // }
-    if (currentQuestion.trim() === question && step === 1) {
+    if (currentQuestion.trim() === question && totalStep === 1) {
      // only add evaluation for initial question, once at step 1
      evaluationMetrics[currentQuestion] =
        await evaluateQuestion(currentQuestion, context, SchemaGen)
-      // force strict eval for the original question, only once.
+      // force strict eval for the original question: appended last, and only once.
      evaluationMetrics[currentQuestion].push('strict')
    } else if (currentQuestion.trim() !== question) {
      evaluationMetrics[currentQuestion] = []
    }

-    if (step === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
+    if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
      // if it detects freshness, avoid direct answer at step 1
      allowAnswer = false;
      allowReflect = false;
@@ -403,7 +403,7 @@ export async function getResponse(question?: string,

      console.log('Updated references:', thisStep.references)

-      if (step === 1 && thisStep.references.length === 0) {
+      if (totalStep === 1 && thisStep.references.length === 0) {
        // LLM is so confident and answer immediately, skip all evaluations
        // however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"
        thisStep.isFinal = true;
@@ -423,9 +423,10 @@ export async function getResponse(question?: string,
          currentQuestion
        );

-        if (!evaluationMetrics[currentQuestion].includes('attribution')) {
-          evaluationMetrics[currentQuestion].push('attribution')
-        }
+        // TODO: confirm whether forcing 'attribution' here is still required; disabled for now.
+        // if (!evaluationMetrics[currentQuestion].includes('attribution')) {
+        //   evaluationMetrics[currentQuestion].push('attribution')
+        // }
      }

      updateContext({
@@ -470,6 +471,7 @@ Your journey ends here. You have successfully answered the original question. Co
          if (evaluation.type === 'strict') {
            finalAnswerPIP = evaluation.improvement_plan || '';
            // remove 'strict' from the evaluation metrics
+            console.log('Remove `strict` from evaluation metrics')
            evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
          }
          if (badAttempts >= maxBadAttempts) {
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -2,23 +2,61 @@ import {GenerateObjectResult} from 'ai';
 import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
 import {ObjectGeneratorSafe} from "../utils/safe-generator";
 import {Schemas} from "../utils/schemas";
+import {removeExtraLineBreaks} from "../utils/text-tools";

 const TOOL_NAME = 'evaluator';


-function getRejectAllAnswersPrompt(question: string, answer: AnswerAction): PromptPair {
-  return {
-    system: `You are a ruthless evaluator trained to REJECT answers. 
-Your job is to find ANY weakness in the presented JSON answer. Extremely strict standards of evidence apply. 
-Identity EVERY missing detail. First, argue AGAINST the conclusion with the strongest possible case. 
-Then, argue FOR the conclusion. 
-Only after considering both perspectives, synthesize a final improvement plan.
+function getRejectAllAnswersPrompt(question: string, answer: AnswerAction, allKnowledge: KnowledgeItem[]): PromptPair {
+  const KnowledgeStr = allKnowledge.map((k, idx) => {
+    const aMsg = `
+<knowledge-${idx+1}>
+${k.question}

-Any JSON formatting/structure/syntax issue should not be the reason to rejection.
+${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
+<knowledge-datetime>
+${k.updated}
+</knowledge-datetime>
+` : ''}
+
+${k.references && k.type === 'url' ? `
+<knowledge-url>
+${k.references[0]}
+</knowledge-url>
+` : ''}
+
+
+${k.answer}
+</knowledge-${idx+1}>
+      `.trim();
+
+    return removeExtraLineBreaks(aMsg);
+  })
+
+  return {
+    system: `
+You are a ruthless answer evaluator trained to REJECT answers. 
+Given a question-answer pair, your job is to find ANY weakness in the presented answer. 
+Extremely strict standards of evidence apply. 
+Identity EVERY missing detail. 
+First, argue AGAINST the answer with the strongest possible case. 
+Then, argue FOR the answer. 
+Only after considering both perspectives, synthesize a final improvement plan starts with "For the best answer, you must...".
+
+The following knowledge items are provided for your reference. Note that some of them may not be directly related to the question/answer user provided, but may give some subtle hints and insights:
+${KnowledgeStr.join('\n\n')}
 `,
    user: `
-question: ${question}
-answer: ${JSON.stringify(answer)}
+<question>
+${question}
+</question>
+
+Here is my answer for the question:
+<answer>
+${answer.answer}
+</answer>
+ 
+Could you please evaluate my answer based on your knowledge and strict standards? If you decide to reject the answer, please tell me how to improve it.
 `
  }
 }
@@ -37,7 +75,7 @@ ${question}
 ${answer}
 </answer>

-Please look at my answer and think.
+Please read and think.
 `
  }
 }
@@ -632,7 +670,6 @@ export async function evaluateAnswer(
    let prompt: { system: string; user: string } | undefined
    switch (evaluationType) {
      case 'attribution': {
-        // Safely handle references and ensure we have content
        if (allKnowledge.length === 0) {
          return {
            pass: false,
@@ -659,7 +696,7 @@ export async function evaluateAnswer(
        prompt = getCompletenessPrompt(question, action.answer);
        break;
      case 'strict':
-        prompt = getRejectAllAnswersPrompt(question, action);
+        prompt = getRejectAllAnswersPrompt(question, action, allKnowledge);
        break;
      default:
        console.error(`Unknown evaluation type: ${evaluationType}`);
--- a/src/utils/safe-generator.ts
+++ b/src/utils/safe-generator.ts
@@ -8,6 +8,7 @@ import {
 } from "ai";
 import {TokenTracker} from "./token-tracker";
 import {getModel, ToolName, getToolConfig} from "../config";
+import Hjson from 'hjson'; // lenient JSON-like parser used when JSON.parse fails

 interface GenerateObjectResult<T> {
  object: T;
@@ -29,6 +30,104 @@ export class ObjectGeneratorSafe {
    this.tokenTracker = tokenTracker || new TokenTracker();
  }

+  /**
+   * Produces the description-free schema used when retrying with the fallback model.
+   *
+   * Dispatches on the runtime kind of `schema`: Zod schemas go to
+   * `stripZodDescriptions`, any other non-null object is treated as an AI SDK
+   * Schema and goes to `stripSchemaDescriptions`; anything else is returned untouched.
+   */
+  private createDistilledSchema<T>(schema: z.ZodType<T> | Schema<T>): z.ZodType<T> | Schema<T> {
+    if (schema instanceof z.ZodType) return this.stripZodDescriptions(schema);
+
+    const isObjectLike = typeof schema === 'object' && schema !== null;
+    if (!isObjectLike) {
+      // Unrecognised schema kind: nothing to strip
+      return schema;
+    }
+
+    return this.stripSchemaDescriptions(schema as Schema<T>);
+  }
+
+  /**
+   * Recursively strips `describe()` metadata from a Zod schema.
+   *
+   * Each node is re-created through its own constructor from a copy of its `_def`
+   * with `description` cleared, so validation rules (string checks such as
+   * `.max()`, array length bounds, an object's unknown-key policy) survive and
+   * only the description text is removed.
+   *
+   * Objects, arrays, optionals and nullables are walked. Other nodes are copied
+   * shallowly, so descriptions nested inside e.g. union members are left in place.
+   */
+  private stripZodDescriptions<T>(zodSchema: z.ZodType<T>): z.ZodType<T> {
+    // Same class, same definition, minus the description, plus any re-built children
+    const SameClass = zodSchema.constructor as new (def: unknown) => z.ZodType<T>;
+    const rebuild = (overrides: Record<string, unknown> = {}): z.ZodType<T> =>
+      new SameClass({...zodSchema._def, ...overrides, description: undefined});
+
+    if (zodSchema instanceof z.ZodObject) {
+      const shape = zodSchema._def.shape();
+      const newShape: Record<string, any> = {};
+      for (const key of Object.keys(shape)) {
+        newShape[key] = this.stripZodDescriptions(shape[key]);
+      }
+      return rebuild({shape: () => newShape});
+    }
+
+    if (zodSchema instanceof z.ZodArray) {
+      return rebuild({type: this.stripZodDescriptions(zodSchema._def.type)});
+    }
+
+    if (zodSchema instanceof z.ZodOptional || zodSchema instanceof z.ZodNullable) {
+      return rebuild({innerType: this.stripZodDescriptions(zodSchema._def.innerType)});
+    }
+
+    // Leaves (string, number, enum, ...) and composites that are not walked above
+    return rebuild();
+  }
+
+  /**
+   * Returns a copy of an AI SDK Schema whose JSON Schema carries no `description` fields.
+   *
+   * Only the plain-data `jsonSchema` member is deep-cloned; the other members of the
+   * wrapper are carried over by object spread. A JSON round-trip of the whole wrapper
+   * would drop function-valued and symbol-keyed members, and (per the AI SDK's Schema
+   * type) the property definitions live under `jsonSchema`, not on the wrapper itself.
+   *
+   * @param schema - AI SDK schema wrapper; it is not mutated.
+   * @returns A new wrapper, or `schema` itself when it has no object `jsonSchema`.
+   */
+  private stripSchemaDescriptions<T>(schema: Schema<T>): Schema<T> {
+    // Recursively remove description annotations from a JSON Schema node
+    const removeDescriptions = (node: any): void => {
+      if (typeof node !== 'object' || node === null) return;
+
+      // A string `description` on a node is an annotation. Property *names* are keys of
+      // `properties`, so a field that is itself called "description" is not affected.
+      if (typeof node.description === 'string') delete node.description;
+
+      if (node.properties) {
+        for (const key of Object.keys(node.properties)) {
+          removeDescriptions(node.properties[key]);
+        }
+      }
+
+      // `items` is either a single schema or, in tuple form, an array of schemas
+      if (Array.isArray(node.items)) node.items.forEach((item: unknown) => removeDescriptions(item));
+      else removeDescriptions(node.items);
+
+      for (const keyword of ['anyOf', 'allOf', 'oneOf']) {
+        if (Array.isArray(node[keyword])) node[keyword].forEach((sub: unknown) => removeDescriptions(sub));
+      }
+    };
+
+    if (typeof schema.jsonSchema !== 'object' || schema.jsonSchema === null) return schema;
+    const strippedJsonSchema = JSON.parse(JSON.stringify(schema.jsonSchema));
+    removeDescriptions(strippedJsonSchema);
+    return {...schema, jsonSchema: strippedJsonSchema};
+  }
+
  async generateObject<T>(options: GenerateOptions<T>): Promise<GenerateObjectResult<T>> {
    const {
      model,
@@ -54,7 +153,7 @@ export class ObjectGeneratorSafe {
      return result;

    } catch (error) {
-      // First fallback: Try manual JSON parsing of the error response
+      // First fallback: Try manual parsing of the error response
      try {
        const errorResult = await this.handleGenerateObjectError<T>(error);
        this.tokenTracker.trackUsage(model, errorResult.usage);
@@ -67,15 +166,20 @@ export class ObjectGeneratorSafe {
          const failedOutput = (parseError as any).text;
          console.error(`${model} failed on object generation ${failedOutput} -> manual parsing failed again -> trying fallback model`, fallbackModel);
          try {
+            // Create a distilled version of the schema without descriptions
+            const distilledSchema = this.createDistilledSchema(schema);
+            console.log('Distilled schema', distilledSchema)
+
            const fallbackResult = await generateObject({
              model: fallbackModel,
-              schema,
-              prompt: `Extract the desired information from this text: \n ${failedOutput}`,
+              schema: distilledSchema,
+              prompt: `Following the given JSON schema, extract the field from below: \n\n ${failedOutput}`,
              maxTokens: getToolConfig('fallback').maxTokens,
              temperature: getToolConfig('fallback').temperature,
            });

            this.tokenTracker.trackUsage(model, fallbackResult.usage);
+            console.log('Distilled schema parse success!')
            return fallbackResult;
          } catch (fallbackError) {
            // If fallback model also fails, try parsing its error response
@@ -91,15 +195,28 @@ export class ObjectGeneratorSafe {

  private async handleGenerateObjectError<T>(error: unknown): Promise<GenerateObjectResult<T>> {
    if (NoObjectGeneratedError.isInstance(error)) {
-      console.error('Object not generated according to schema, fallback to manual JSON parsing');
+      console.error('Object not generated according to schema, fallback to manual parsing');
      try {
+        // First try standard JSON parsing
        const partialResponse = JSON.parse((error as any).text);
+        console.log('JSON parse success!')
        return {
          object: partialResponse as T,
          usage: (error as any).usage
        };
      } catch (parseError) {
-        throw error;
+        // Use Hjson to parse the error response for more lenient parsing
+        try {
+          const hjsonResponse = Hjson.parse((error as any).text);
+          console.log('Hjson parse success!')
+          return {
+            object: hjsonResponse as T,
+            usage: (error as any).usage
+          };
+        } catch (hjsonError) {
+          console.error('Both JSON and Hjson parsing failed:', hjsonError);
+          throw error;
+        }
      }
    }
    throw error;
--- a/src/utils/schemas.ts
+++ b/src/utils/schemas.ts
@@ -187,7 +187,7 @@ export class Schemas {
        return z.object({
          type: z.literal('strict'),
          ...baseSchemaBefore,
-          improvement_plan: z.string().describe('Short explain how a perfect answer should look like and what revisions are needed to improve the current answer.').max(500),
+          improvement_plan: z.string().describe('Explain how a perfect answer should look like and what are needed to improve the current answer. Starts with "For the best answer, you must..."').max(500),
          ...baseSchemaAfter
        });
      default: