fix: strict evaluator

This commit is contained in:
Han Xiao 2025-03-14 11:57:02 +08:00
parent de640b7b29
commit b0c07162dd
9 changed files with 208 additions and 58 deletions

View File

@ -1,6 +1,6 @@
# DeepResearch
[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Blog](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Stable API](https://jina.ai/deepsearch) | [Blog](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
@ -32,11 +32,11 @@ flowchart LR
Whether you like this implementation or not, I highly recommend you read the DeepSearch/DeepResearch implementation guide I wrote, which gives a gentle introduction to this topic.
- [English](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
- [中文微信公众号](https://mp.weixin.qq.com/s/-pPhHDi2nz8hp5R3Lm_mww)
- [English Part I](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch), [Part II](https://jina.ai/news/snippet-selection-and-url-ranking-in-deepsearch-deepresearch)
- [中文微信公众号 第一讲](https://mp.weixin.qq.com/s/-pPhHDi2nz8hp5R3Lm_mww), [第二讲](https://mp.weixin.qq.com/s/apnorBj4TZs3-Mo23xUReQ)
- [日本語: DeepSearch/DeepResearch 実装の実践ガイド](https://jina.ai/ja/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
## Test it yourself
## Test it Yourself
We host an online deployment of this **exact** codebase, which allows you to do a vibe check, or use it as a daily productivity tool.

View File

@ -41,7 +41,7 @@
"queryRewriter": { "temperature": 0.1 },
"agent": { "temperature": 0.7 },
"agentBeastMode": { "temperature": 0.7 },
"fallback": { "temperature": 0 }
"fallback": {"maxTokens": 8000, "model": "gemini-2.0-flash-lite"}
}
},
"openai": {

View File

@ -47,7 +47,7 @@
"queryRewriter": {"maxTokens": 2000},
"agent": { },
"agentBeastMode": { },
"fallback": {"maxTokens": 4000}
"fallback": {"maxTokens": 8000, "model": "gemini-2.0-flash-lite"}
}
},
"openai": {

44
package-lock.json generated
View File

@ -11,8 +11,6 @@
"dependencies": {
"@ai-sdk/google": "^1.0.0",
"@ai-sdk/openai": "^1.1.9",
"@dmitryrechkin/json-schema-to-zod": "^1.0.0",
"add": "^2.0.6",
"ai": "^4.1.26",
"axios": "^1.7.9",
"commander": "^13.1.0",
@ -20,7 +18,7 @@
"dotenv": "^16.4.7",
"duck-duck-scrape": "^2.2.7",
"express": "^4.21.2",
"json-schema-to-zod": "^2.6.0",
"hjson": "^3.2.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4",
@ -30,6 +28,7 @@
"@types/commander": "^2.12.0",
"@types/cors": "^2.8.17",
"@types/express": "^5.0.0",
"@types/hjson": "^2.4.6",
"@types/jest": "^29.5.14",
"@types/node": "^22.10.10",
"@types/node-fetch": "^2.6.12",
@ -765,14 +764,6 @@
"node": ">=12"
}
},
"node_modules/@dmitryrechkin/json-schema-to-zod": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@dmitryrechkin/json-schema-to-zod/-/json-schema-to-zod-1.0.0.tgz",
"integrity": "sha512-avV26RC8CRzhnL6AvQsURlkd071SXlcPURxiYFsRLpsMoDDXBBGJDIsNQTvYmevq31WHYdwGCKGgQKC0YIjDGg==",
"dependencies": {
"zod": "^3.23.8"
}
},
"node_modules/@eslint-community/eslint-utils": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.1.tgz",
@ -1665,6 +1656,13 @@
"@types/node": "*"
}
},
"node_modules/@types/hjson": {
"version": "2.4.6",
"resolved": "https://registry.npmjs.org/@types/hjson/-/hjson-2.4.6.tgz",
"integrity": "sha512-tEQ4hlyKfsb9WWeueUY5eRnU2eK+KdE0eofSpQ05v9Aah4VvWwIRIid/ZN1zZZ0TfeVTRDgabKKqKZXEkfD3Sw==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/http-errors": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@ -2079,12 +2077,6 @@
"node": ">=0.4.0"
}
},
"node_modules/add": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/add/-/add-2.0.6.tgz",
"integrity": "sha512-j5QzrmsokwWWp6kUcJQySpbG+xfOBqqKnup3OIk1pz+kB/80SLorZ9V8zHFLO92Lcd+hbvq8bT+zOGoPkmBV0Q==",
"license": "MIT"
},
"node_modules/agent-base": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.3.tgz",
@ -4249,6 +4241,15 @@
"node": ">=8"
}
},
"node_modules/hjson": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/hjson/-/hjson-3.2.2.tgz",
"integrity": "sha512-MkUeB0cTIlppeSsndgESkfFD21T2nXPRaBStLtf3cAYA2bVEFdXlodZB0TukwZiobPD1Ksax5DK4RTZeaXCI3Q==",
"license": "MIT",
"bin": {
"hjson": "bin/hjson"
}
},
"node_modules/html-entities": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz",
@ -5274,15 +5275,6 @@
"integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
"license": "(AFL-2.1 OR BSD-3-Clause)"
},
"node_modules/json-schema-to-zod": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/json-schema-to-zod/-/json-schema-to-zod-2.6.0.tgz",
"integrity": "sha512-6sFZqOzHZeON8g2ZW5HJ114Hb/FffNCjWh8dgulJaKFkUqKCEWZAzF4+g07SQpfBZF7HXemwedtdLypZzmnVpQ==",
"license": "ISC",
"bin": {
"json-schema-to-zod": "dist/cjs/cli.js"
}
},
"node_modules/json-schema-traverse": {
"version": "0.4.1",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",

View File

@ -35,6 +35,7 @@
"dotenv": "^16.4.7",
"duck-duck-scrape": "^2.2.7",
"express": "^4.21.2",
"hjson": "^3.2.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4",
@ -44,6 +45,7 @@
"@types/commander": "^2.12.0",
"@types/cors": "^2.8.17",
"@types/express": "^5.0.0",
"@types/hjson": "^2.4.6",
"@types/jest": "^29.5.14",
"@types/node": "^22.10.10",
"@types/node-fetch": "^2.6.12",

View File

@ -308,17 +308,17 @@ export async function getResponse(question?: string,
// evaluationMetrics[currentQuestion] =
// await evaluateQuestion(currentQuestion, context, SchemaGen)
// }
if (currentQuestion.trim() === question && step === 1) {
if (currentQuestion.trim() === question && totalStep === 1) {
// only add evaluation for initial question, once at step 1
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
// force strict eval for the original question, only once.
// force strict eval for the original question, at last, only once.
evaluationMetrics[currentQuestion].push('strict')
} else if (currentQuestion.trim() !== question) {
evaluationMetrics[currentQuestion] = []
}
if (step === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
// if it detects freshness, avoid direct answer at step 1
allowAnswer = false;
allowReflect = false;
@ -403,7 +403,7 @@ export async function getResponse(question?: string,
console.log('Updated references:', thisStep.references)
if (step === 1 && thisStep.references.length === 0) {
if (totalStep === 1 && thisStep.references.length === 0) {
// LLM is so confident and answer immediately, skip all evaluations
// however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"
thisStep.isFinal = true;
@ -423,9 +423,10 @@ export async function getResponse(question?: string,
currentQuestion
);
if (!evaluationMetrics[currentQuestion].includes('attribution')) {
evaluationMetrics[currentQuestion].push('attribution')
}
// is this really required???
// if (!evaluationMetrics[currentQuestion].includes('attribution')) {
// evaluationMetrics[currentQuestion].push('attribution')
// }
}
updateContext({
@ -470,6 +471,7 @@ Your journey ends here. You have successfully answered the original question. Co
if (evaluation.type === 'strict') {
finalAnswerPIP = evaluation.improvement_plan || '';
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
}
if (badAttempts >= maxBadAttempts) {

View File

@ -2,23 +2,61 @@ import {GenerateObjectResult} from 'ai';
import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
import {removeExtraLineBreaks} from "../utils/text-tools";
const TOOL_NAME = 'evaluator';
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction): PromptPair {
return {
system: `You are a ruthless evaluator trained to REJECT answers.
Your job is to find ANY weakness in the presented JSON answer. Extremely strict standards of evidence apply.
Identity EVERY missing detail. First, argue AGAINST the conclusion with the strongest possible case.
Then, argue FOR the conclusion.
Only after considering both perspectives, synthesize a final improvement plan.
/**
 * Builds the system/user prompt pair for the 'strict' evaluation pass, which
 * instructs the model to aggressively look for weaknesses in the answer.
 *
 * @param question     the original user question being answered
 * @param answer       the AnswerAction whose `answer` text is under review
 * @param allKnowledge accumulated knowledge items; rendered into the system
 *                     prompt so the evaluator can ground its critique
 * @returns a PromptPair with the evaluator persona (system) and the
 *          question/answer to judge (user)
 */
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction, allKnowledge: KnowledgeItem[]): PromptPair {
  // Render each knowledge item as an XML-ish snippet; datetime and URL are
  // only attached for item types where they are meaningful.
  const KnowledgeStr = allKnowledge.map((k, idx) => {
    const aMsg = `
<knowledge-${idx + 1}>
${k.question}
${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
<knowledge-datetime>
${k.updated}
</knowledge-datetime>
` : ''}
${k.references && k.type === 'url' ? `
<knowledge-url>
${k.references[0]}
</knowledge-url>
` : ''}
${k.answer}
</knowledge-${idx + 1}>
`.trim();
    return removeExtraLineBreaks(aMsg);
  });

  return {
    system: `
You are a ruthless answer evaluator trained to REJECT answers.
Given a question-answer pair, your job is to find ANY weakness in the presented answer.
Extremely strict standards of evidence apply.
Identify EVERY missing detail.
First, argue AGAINST the answer with the strongest possible case.
Then, argue FOR the answer.
Only after considering both perspectives, synthesize a final improvement plan that starts with "For the best answer, you must...".

The following knowledge items are provided for your reference. Note that some of them may not be directly related to the question/answer user provided, but may give some subtle hints and insights:
${KnowledgeStr.join('\n\n')}
`,
    user: `
<question>
${question}
</question>

Here is my answer for the question:
<answer>
${answer.answer}
</answer>

Could you please evaluate my answer based on your knowledge and strict standards? If you decide to reject the answer, please tell me how to improve it.
`
  };
}
@ -37,7 +75,7 @@ ${question}
${answer}
</answer>
Please look at my answer and think.
Please read and think.
`
}
}
@ -632,7 +670,6 @@ export async function evaluateAnswer(
let prompt: { system: string; user: string } | undefined
switch (evaluationType) {
case 'attribution': {
// Safely handle references and ensure we have content
if (allKnowledge.length === 0) {
return {
pass: false,
@ -659,7 +696,7 @@ export async function evaluateAnswer(
prompt = getCompletenessPrompt(question, action.answer);
break;
case 'strict':
prompt = getRejectAllAnswersPrompt(question, action);
prompt = getRejectAllAnswersPrompt(question, action, allKnowledge);
break;
default:
console.error(`Unknown evaluation type: ${evaluationType}`);

View File

@ -8,6 +8,7 @@ import {
} from "ai";
import {TokenTracker} from "./token-tracker";
import {getModel, ToolName, getToolConfig} from "../config";
import Hjson from 'hjson'; // Import Hjson library
interface GenerateObjectResult<T> {
object: T;
@ -29,6 +30,104 @@ export class ObjectGeneratorSafe {
this.tokenTracker = tokenTracker || new TokenTracker();
}
/**
 * Returns a copy of the given schema with all field descriptions removed.
 * A description-free schema gives the fallback model less noise to deal
 * with when re-parsing a failed generation.
 */
private createDistilledSchema<T>(schema: z.ZodType<T> | Schema<T>): z.ZodType<T> | Schema<T> {
  // Zod schemas carry descriptions via describe(); strip them recursively.
  if (schema instanceof z.ZodType) {
    return this.stripZodDescriptions(schema);
  }

  // AI SDK Schema objects are JSON-schema based; scrub their descriptions.
  if (schema !== null && typeof schema === 'object') {
    return this.stripSchemaDescriptions(schema as Schema<T>);
  }

  // Unrecognized schema kind: hand it back unchanged.
  return schema;
}
/**
 * Recursively strips describe() metadata from a Zod schema tree.
 *
 * @param zodSchema the schema to distill
 * @returns an equivalent schema with descriptions removed where supported;
 *          unhandled node kinds are returned as-is
 */
private stripZodDescriptions<T>(zodSchema: z.ZodType<T>): z.ZodType<T> {
  if (zodSchema instanceof z.ZodObject) {
    const shape = zodSchema._def.shape();
    const newShape: Record<string, any> = {};

    for (const key in shape) {
      if (Object.prototype.hasOwnProperty.call(shape, key)) {
        // Recursively strip descriptions from nested schemas
        newShape[key] = this.stripZodDescriptions(shape[key]);
      }
    }

    // Rebuilding with z.object() also drops the object's own description.
    return z.object(newShape) as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodArray) {
    return z.array(this.stripZodDescriptions(zodSchema._def.type)) as unknown as z.ZodType<T>;
  }

  // Fix: unwrap optional/nullable wrappers so descriptions on the inner
  // schema are stripped too; previously these fell through untouched.
  if (zodSchema instanceof z.ZodOptional) {
    return this.stripZodDescriptions(zodSchema.unwrap()).optional() as unknown as z.ZodType<T>;
  }
  if (zodSchema instanceof z.ZodNullable) {
    return this.stripZodDescriptions(zodSchema.unwrap()).nullable() as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodString) {
    // A bare z.string() drops describe() metadata; note it also drops
    // length constraints (e.g. max()), keeping the fallback parse lenient.
    return z.string() as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodUnion || zodSchema instanceof z.ZodIntersection) {
    // NOTE(review): union/intersection members are left as-is, so their
    // descriptions survive; handling them needs per-member rebuilding.
    return zodSchema;
  }

  // For other primitive types or complex types we're not handling
  // specifically, return as is.
  return zodSchema;
}
/**
 * Strips descriptions from AI SDK Schema objects (plain JSON-schema trees).
 *
 * @param schema the Schema to distill
 * @returns a deep-cloned Schema with every `description` keyword removed
 */
private stripSchemaDescriptions<T>(schema: Schema<T>): Schema<T> {
  // Deep clone so the caller's schema is never mutated.
  // NOTE(review): a JSON round-trip drops any non-serializable members the
  // AI SDK may attach to Schema objects — confirm generateObject still
  // accepts the clone.
  const clonedSchema = JSON.parse(JSON.stringify(schema));

  // Walk the JSON-schema structure and delete `description` on every schema
  // node. Fix vs. the original: the root description, tuple-form `items`,
  // `additionalProperties` and `$defs` are now covered too. We recurse only
  // through known schema keywords, so a *property named* "description"
  // under `properties` is never deleted.
  const removeDescriptions = (node: any): void => {
    if (typeof node !== 'object' || node === null) return;

    delete node.description;

    if (node.properties) {
      for (const key of Object.keys(node.properties)) {
        removeDescriptions(node.properties[key]);
      }
    }

    if (node.items) {
      // `items` may be a single schema or (tuple form) an array of schemas.
      if (Array.isArray(node.items)) {
        node.items.forEach(removeDescriptions);
      } else {
        removeDescriptions(node.items);
      }
    }

    if (typeof node.additionalProperties === 'object') {
      removeDescriptions(node.additionalProperties);
    }

    if (node.$defs) {
      for (const key of Object.keys(node.$defs)) {
        removeDescriptions(node.$defs[key]);
      }
    }

    // Handle combinator nodes that nest further schemas
    if (Array.isArray(node.anyOf)) node.anyOf.forEach(removeDescriptions);
    if (Array.isArray(node.allOf)) node.allOf.forEach(removeDescriptions);
    if (Array.isArray(node.oneOf)) node.oneOf.forEach(removeDescriptions);
  };

  removeDescriptions(clonedSchema);
  return clonedSchema;
}
async generateObject<T>(options: GenerateOptions<T>): Promise<GenerateObjectResult<T>> {
const {
model,
@ -54,7 +153,7 @@ export class ObjectGeneratorSafe {
return result;
} catch (error) {
// First fallback: Try manual JSON parsing of the error response
// First fallback: Try manual parsing of the error response
try {
const errorResult = await this.handleGenerateObjectError<T>(error);
this.tokenTracker.trackUsage(model, errorResult.usage);
@ -67,15 +166,20 @@ export class ObjectGeneratorSafe {
const failedOutput = (parseError as any).text;
console.error(`${model} failed on object generation ${failedOutput} -> manual parsing failed again -> trying fallback model`, fallbackModel);
try {
// Create a distilled version of the schema without descriptions
const distilledSchema = this.createDistilledSchema(schema);
console.log('Distilled schema', distilledSchema)
const fallbackResult = await generateObject({
model: fallbackModel,
schema,
prompt: `Extract the desired information from this text: \n ${failedOutput}`,
schema: distilledSchema,
prompt: `Following the given JSON schema, extract the field from below: \n\n ${failedOutput}`,
maxTokens: getToolConfig('fallback').maxTokens,
temperature: getToolConfig('fallback').temperature,
});
this.tokenTracker.trackUsage(model, fallbackResult.usage);
console.log('Distilled schema parse success!')
return fallbackResult;
} catch (fallbackError) {
// If fallback model also fails, try parsing its error response
@ -91,15 +195,28 @@ export class ObjectGeneratorSafe {
private async handleGenerateObjectError<T>(error: unknown): Promise<GenerateObjectResult<T>> {
if (NoObjectGeneratedError.isInstance(error)) {
console.error('Object not generated according to schema, fallback to manual JSON parsing');
console.error('Object not generated according to schema, fallback to manual parsing');
try {
// First try standard JSON parsing
const partialResponse = JSON.parse((error as any).text);
console.log('JSON parse success!')
return {
object: partialResponse as T,
usage: (error as any).usage
};
} catch (parseError) {
throw error;
// Use Hjson to parse the error response for more lenient parsing
try {
const hjsonResponse = Hjson.parse((error as any).text);
console.log('Hjson parse success!')
return {
object: hjsonResponse as T,
usage: (error as any).usage
};
} catch (hjsonError) {
console.error('Both JSON and Hjson parsing failed:', hjsonError);
throw error;
}
}
}
throw error;

View File

@ -187,7 +187,7 @@ export class Schemas {
return z.object({
type: z.literal('strict'),
...baseSchemaBefore,
improvement_plan: z.string().describe('Short explain how a perfect answer should look like and what revisions are needed to improve the current answer.').max(500),
improvement_plan: z.string().describe('Explain how a perfect answer should look like and what are needed to improve the current answer. Starts with "For the best answer, you must..."').max(500),
...baseSchemaAfter
});
default: