fix: strict evaluator

This commit is contained in:
Han Xiao 2025-03-14 11:57:02 +08:00
parent de640b7b29
commit b0c07162dd
9 changed files with 208 additions and 58 deletions

View File

@ -1,6 +1,6 @@
# DeepResearch
[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Blog](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Stable API](https://jina.ai/deepsearch) | [Blog](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query.
@ -32,11 +32,11 @@ flowchart LR
Whether you like this implementation or not, I highly recommend you read the DeepSearch/DeepResearch implementation guide I wrote, which gives a gentle introduction to this topic.
- [English](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
- [中文微信公众号](https://mp.weixin.qq.com/s/-pPhHDi2nz8hp5R3Lm_mww)
- [English Part I](https://jina.ai/news/a-practical-guide-to-implementing-deepsearch-deepresearch), [Part II](https://jina.ai/news/snippet-selection-and-url-ranking-in-deepsearch-deepresearch)
- [中文微信公众号 第一讲](https://mp.weixin.qq.com/s/-pPhHDi2nz8hp5R3Lm_mww), [第二讲](https://mp.weixin.qq.com/s/apnorBj4TZs3-Mo23xUReQ)
- [日本語: DeepSearch/DeepResearch 実装の実践ガイド](https://jina.ai/ja/news/a-practical-guide-to-implementing-deepsearch-deepresearch)
## Test it yourself
## Test it Yourself
We host an online deployment of this **exact** codebase, which allows you to do a vibe check, or use it as a daily productivity tool.

View File

@ -41,7 +41,7 @@
"queryRewriter": { "temperature": 0.1 },
"agent": { "temperature": 0.7 },
"agentBeastMode": { "temperature": 0.7 },
"fallback": { "temperature": 0 }
"fallback": {"maxTokens": 8000, "model": "gemini-2.0-flash-lite"}
}
},
"openai": {

View File

@ -47,7 +47,7 @@
"queryRewriter": {"maxTokens": 2000},
"agent": { },
"agentBeastMode": { },
"fallback": {"maxTokens": 4000}
"fallback": {"maxTokens": 8000, "model": "gemini-2.0-flash-lite"}
}
},
"openai": {

44
package-lock.json generated
View File

@ -11,8 +11,6 @@
"dependencies": {
"@ai-sdk/google": "^1.0.0",
"@ai-sdk/openai": "^1.1.9",
"@dmitryrechkin/json-schema-to-zod": "^1.0.0",
"add": "^2.0.6",
"ai": "^4.1.26",
"axios": "^1.7.9",
"commander": "^13.1.0",
@ -20,7 +18,7 @@
"dotenv": "^16.4.7",
"duck-duck-scrape": "^2.2.7",
"express": "^4.21.2",
"json-schema-to-zod": "^2.6.0",
"hjson": "^3.2.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4",
@ -30,6 +28,7 @@
"@types/commander": "^2.12.0",
"@types/cors": "^2.8.17",
"@types/express": "^5.0.0",
"@types/hjson": "^2.4.6",
"@types/jest": "^29.5.14",
"@types/node": "^22.10.10",
"@types/node-fetch": "^2.6.12",
@ -765,14 +764,6 @@
"node": ">=12"
}
},
"node_modules/@dmitryrechkin/json-schema-to-zod": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@dmitryrechkin/json-schema-to-zod/-/json-schema-to-zod-1.0.0.tgz",
"integrity": "sha512-avV26RC8CRzhnL6AvQsURlkd071SXlcPURxiYFsRLpsMoDDXBBGJDIsNQTvYmevq31WHYdwGCKGgQKC0YIjDGg==",
"dependencies": {
"zod": "^3.23.8"
}
},
"node_modules/@eslint-community/eslint-utils": {
"version": "4.4.1",
"resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.1.tgz",
@ -1665,6 +1656,13 @@
"@types/node": "*"
}
},
"node_modules/@types/hjson": {
"version": "2.4.6",
"resolved": "https://registry.npmjs.org/@types/hjson/-/hjson-2.4.6.tgz",
"integrity": "sha512-tEQ4hlyKfsb9WWeueUY5eRnU2eK+KdE0eofSpQ05v9Aah4VvWwIRIid/ZN1zZZ0TfeVTRDgabKKqKZXEkfD3Sw==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/http-errors": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@ -2079,12 +2077,6 @@
"node": ">=0.4.0"
}
},
"node_modules/add": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/add/-/add-2.0.6.tgz",
"integrity": "sha512-j5QzrmsokwWWp6kUcJQySpbG+xfOBqqKnup3OIk1pz+kB/80SLorZ9V8zHFLO92Lcd+hbvq8bT+zOGoPkmBV0Q==",
"license": "MIT"
},
"node_modules/agent-base": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.3.tgz",
@ -4249,6 +4241,15 @@
"node": ">=8"
}
},
"node_modules/hjson": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/hjson/-/hjson-3.2.2.tgz",
"integrity": "sha512-MkUeB0cTIlppeSsndgESkfFD21T2nXPRaBStLtf3cAYA2bVEFdXlodZB0TukwZiobPD1Ksax5DK4RTZeaXCI3Q==",
"license": "MIT",
"bin": {
"hjson": "bin/hjson"
}
},
"node_modules/html-entities": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz",
@ -5274,15 +5275,6 @@
"integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
"license": "(AFL-2.1 OR BSD-3-Clause)"
},
"node_modules/json-schema-to-zod": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/json-schema-to-zod/-/json-schema-to-zod-2.6.0.tgz",
"integrity": "sha512-6sFZqOzHZeON8g2ZW5HJ114Hb/FffNCjWh8dgulJaKFkUqKCEWZAzF4+g07SQpfBZF7HXemwedtdLypZzmnVpQ==",
"license": "ISC",
"bin": {
"json-schema-to-zod": "dist/cjs/cli.js"
}
},
"node_modules/json-schema-traverse": {
"version": "0.4.1",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",

View File

@ -35,6 +35,7 @@
"dotenv": "^16.4.7",
"duck-duck-scrape": "^2.2.7",
"express": "^4.21.2",
"hjson": "^3.2.2",
"node-fetch": "^3.3.2",
"undici": "^7.3.0",
"zod": "^3.22.4",
@ -44,6 +45,7 @@
"@types/commander": "^2.12.0",
"@types/cors": "^2.8.17",
"@types/express": "^5.0.0",
"@types/hjson": "^2.4.6",
"@types/jest": "^29.5.14",
"@types/node": "^22.10.10",
"@types/node-fetch": "^2.6.12",

View File

@ -308,17 +308,17 @@ export async function getResponse(question?: string,
// evaluationMetrics[currentQuestion] =
// await evaluateQuestion(currentQuestion, context, SchemaGen)
// }
if (currentQuestion.trim() === question && step === 1) {
if (currentQuestion.trim() === question && totalStep === 1) {
// only add evaluation for initial question, once at step 1
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
// force strict eval for the original question, only once.
// force strict eval for the original question, at last, only once.
evaluationMetrics[currentQuestion].push('strict')
} else if (currentQuestion.trim() !== question) {
evaluationMetrics[currentQuestion] = []
}
if (step === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
// if it detects freshness, avoid direct answer at step 1
allowAnswer = false;
allowReflect = false;
@ -403,7 +403,7 @@ export async function getResponse(question?: string,
console.log('Updated references:', thisStep.references)
if (step === 1 && thisStep.references.length === 0) {
if (totalStep === 1 && thisStep.references.length === 0) {
// LLM is so confident and answer immediately, skip all evaluations
// however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"
thisStep.isFinal = true;
@ -423,9 +423,10 @@ export async function getResponse(question?: string,
currentQuestion
);
if (!evaluationMetrics[currentQuestion].includes('attribution')) {
evaluationMetrics[currentQuestion].push('attribution')
}
// is this really required???
// if (!evaluationMetrics[currentQuestion].includes('attribution')) {
// evaluationMetrics[currentQuestion].push('attribution')
// }
}
updateContext({
@ -470,6 +471,7 @@ Your journey ends here. You have successfully answered the original question. Co
if (evaluation.type === 'strict') {
finalAnswerPIP = evaluation.improvement_plan || '';
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
}
if (badAttempts >= maxBadAttempts) {

View File

@ -2,23 +2,61 @@ import {GenerateObjectResult} from 'ai';
import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
import {removeExtraLineBreaks} from "../utils/text-tools";
const TOOL_NAME = 'evaluator';
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction): PromptPair {
return {
system: `You are a ruthless evaluator trained to REJECT answers.
Your job is to find ANY weakness in the presented JSON answer. Extremely strict standards of evidence apply.
Identity EVERY missing detail. First, argue AGAINST the conclusion with the strongest possible case.
Then, argue FOR the conclusion.
Only after considering both perspectives, synthesize a final improvement plan.
/**
 * Builds the system/user prompt pair for the 'strict' evaluation pass, which
 * instructs the model to aggressively look for weaknesses in the answer.
 *
 * @param question     the original user question being answered
 * @param answer       the AnswerAction whose `answer` text is under review
 * @param allKnowledge accumulated knowledge items; rendered into the system
 *                     prompt so the evaluator can ground its critique
 * @returns a PromptPair with the evaluator persona (system) and the
 *          question/answer to judge (user)
 */
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction, allKnowledge: KnowledgeItem[]): PromptPair {
  // Render each knowledge item as an XML-ish snippet; datetime and URL are
  // only attached for item types where they are meaningful.
  const KnowledgeStr = allKnowledge.map((k, idx) => {
    const aMsg = `
<knowledge-${idx + 1}>
${k.question}
${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
<knowledge-datetime>
${k.updated}
</knowledge-datetime>
` : ''}
${k.references && k.type === 'url' ? `
<knowledge-url>
${k.references[0]}
</knowledge-url>
` : ''}
${k.answer}
</knowledge-${idx + 1}>
`.trim();
    return removeExtraLineBreaks(aMsg);
  });

  return {
    system: `
You are a ruthless answer evaluator trained to REJECT answers.
Given a question-answer pair, your job is to find ANY weakness in the presented answer.
Extremely strict standards of evidence apply.
Identify EVERY missing detail.
First, argue AGAINST the answer with the strongest possible case.
Then, argue FOR the answer.
Only after considering both perspectives, synthesize a final improvement plan that starts with "For the best answer, you must...".

The following knowledge items are provided for your reference. Note that some of them may not be directly related to the question/answer user provided, but may give some subtle hints and insights:
${KnowledgeStr.join('\n\n')}
`,
    user: `
<question>
${question}
</question>

Here is my answer for the question:
<answer>
${answer.answer}
</answer>

Could you please evaluate my answer based on your knowledge and strict standards? If you decide to reject the answer, please tell me how to improve it.
`
  };
}
@ -37,7 +75,7 @@ ${question}
${answer}
</answer>
Please look at my answer and think.
Please read and think.
`
}
}
@ -632,7 +670,6 @@ export async function evaluateAnswer(
let prompt: { system: string; user: string } | undefined
switch (evaluationType) {
case 'attribution': {
// Safely handle references and ensure we have content
if (allKnowledge.length === 0) {
return {
pass: false,
@ -659,7 +696,7 @@ export async function evaluateAnswer(
prompt = getCompletenessPrompt(question, action.answer);
break;
case 'strict':
prompt = getRejectAllAnswersPrompt(question, action);
prompt = getRejectAllAnswersPrompt(question, action, allKnowledge);
break;
default:
console.error(`Unknown evaluation type: ${evaluationType}`);

View File

@ -8,6 +8,7 @@ import {
} from "ai";
import {TokenTracker} from "./token-tracker";
import {getModel, ToolName, getToolConfig} from "../config";
import Hjson from 'hjson'; // Import Hjson library
interface GenerateObjectResult<T> {
object: T;
@ -29,6 +30,104 @@ export class ObjectGeneratorSafe {
this.tokenTracker = tokenTracker || new TokenTracker();
}
/**
 * Returns a copy of the given schema with all field descriptions removed.
 * A description-free schema gives the fallback model less noise to deal
 * with when re-parsing a failed generation.
 */
private createDistilledSchema<T>(schema: z.ZodType<T> | Schema<T>): z.ZodType<T> | Schema<T> {
  // Zod schemas carry descriptions via describe(); strip them recursively.
  if (schema instanceof z.ZodType) {
    return this.stripZodDescriptions(schema);
  }

  // AI SDK Schema objects are JSON-schema based; scrub their descriptions.
  if (schema !== null && typeof schema === 'object') {
    return this.stripSchemaDescriptions(schema as Schema<T>);
  }

  // Unrecognized schema kind: hand it back unchanged.
  return schema;
}
/**
 * Recursively strips describe() metadata from a Zod schema tree.
 *
 * @param zodSchema the schema to distill
 * @returns an equivalent schema with descriptions removed where supported;
 *          unhandled node kinds are returned as-is
 */
private stripZodDescriptions<T>(zodSchema: z.ZodType<T>): z.ZodType<T> {
  if (zodSchema instanceof z.ZodObject) {
    const shape = zodSchema._def.shape();
    const newShape: Record<string, any> = {};

    for (const key in shape) {
      if (Object.prototype.hasOwnProperty.call(shape, key)) {
        // Recursively strip descriptions from nested schemas
        newShape[key] = this.stripZodDescriptions(shape[key]);
      }
    }

    // Rebuilding with z.object() also drops the object's own description.
    return z.object(newShape) as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodArray) {
    return z.array(this.stripZodDescriptions(zodSchema._def.type)) as unknown as z.ZodType<T>;
  }

  // Fix: unwrap optional/nullable wrappers so descriptions on the inner
  // schema are stripped too; previously these fell through untouched.
  if (zodSchema instanceof z.ZodOptional) {
    return this.stripZodDescriptions(zodSchema.unwrap()).optional() as unknown as z.ZodType<T>;
  }
  if (zodSchema instanceof z.ZodNullable) {
    return this.stripZodDescriptions(zodSchema.unwrap()).nullable() as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodString) {
    // A bare z.string() drops describe() metadata; note it also drops
    // length constraints (e.g. max()), keeping the fallback parse lenient.
    return z.string() as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodUnion || zodSchema instanceof z.ZodIntersection) {
    // NOTE(review): union/intersection members are left as-is, so their
    // descriptions survive; handling them needs per-member rebuilding.
    return zodSchema;
  }

  // For other primitive types or complex types we're not handling
  // specifically, return as is.
  return zodSchema;
}
/**
 * Strips descriptions from AI SDK Schema objects (plain JSON-schema trees).
 *
 * @param schema the Schema to distill
 * @returns a deep-cloned Schema with every `description` keyword removed
 */
private stripSchemaDescriptions<T>(schema: Schema<T>): Schema<T> {
  // Deep clone so the caller's schema is never mutated.
  // NOTE(review): a JSON round-trip drops any non-serializable members the
  // AI SDK may attach to Schema objects — confirm generateObject still
  // accepts the clone.
  const clonedSchema = JSON.parse(JSON.stringify(schema));

  // Walk the JSON-schema structure and delete `description` on every schema
  // node. Fix vs. the original: the root description, tuple-form `items`,
  // `additionalProperties` and `$defs` are now covered too. We recurse only
  // through known schema keywords, so a *property named* "description"
  // under `properties` is never deleted.
  const removeDescriptions = (node: any): void => {
    if (typeof node !== 'object' || node === null) return;

    delete node.description;

    if (node.properties) {
      for (const key of Object.keys(node.properties)) {
        removeDescriptions(node.properties[key]);
      }
    }

    if (node.items) {
      // `items` may be a single schema or (tuple form) an array of schemas.
      if (Array.isArray(node.items)) {
        node.items.forEach(removeDescriptions);
      } else {
        removeDescriptions(node.items);
      }
    }

    if (typeof node.additionalProperties === 'object') {
      removeDescriptions(node.additionalProperties);
    }

    if (node.$defs) {
      for (const key of Object.keys(node.$defs)) {
        removeDescriptions(node.$defs[key]);
      }
    }

    // Handle combinator nodes that nest further schemas
    if (Array.isArray(node.anyOf)) node.anyOf.forEach(removeDescriptions);
    if (Array.isArray(node.allOf)) node.allOf.forEach(removeDescriptions);
    if (Array.isArray(node.oneOf)) node.oneOf.forEach(removeDescriptions);
  };

  removeDescriptions(clonedSchema);
  return clonedSchema;
}
async generateObject<T>(options: GenerateOptions<T>): Promise<GenerateObjectResult<T>> {
const {
model,
@ -54,7 +153,7 @@ export class ObjectGeneratorSafe {
return result;
} catch (error) {
// First fallback: Try manual JSON parsing of the error response
// First fallback: Try manual parsing of the error response
try {
const errorResult = await this.handleGenerateObjectError<T>(error);
this.tokenTracker.trackUsage(model, errorResult.usage);
@ -67,15 +166,20 @@ export class ObjectGeneratorSafe {
const failedOutput = (parseError as any).text;
console.error(`${model} failed on object generation ${failedOutput} -> manual parsing failed again -> trying fallback model`, fallbackModel);
try {
// Create a distilled version of the schema without descriptions
const distilledSchema = this.createDistilledSchema(schema);
console.log('Distilled schema', distilledSchema)
const fallbackResult = await generateObject({
model: fallbackModel,
schema,
prompt: `Extract the desired information from this text: \n ${failedOutput}`,
schema: distilledSchema,
prompt: `Following the given JSON schema, extract the field from below: \n\n ${failedOutput}`,
maxTokens: getToolConfig('fallback').maxTokens,
temperature: getToolConfig('fallback').temperature,
});
this.tokenTracker.trackUsage(model, fallbackResult.usage);
console.log('Distilled schema parse success!')
return fallbackResult;
} catch (fallbackError) {
// If fallback model also fails, try parsing its error response
@ -91,15 +195,28 @@ export class ObjectGeneratorSafe {
private async handleGenerateObjectError<T>(error: unknown): Promise<GenerateObjectResult<T>> {
if (NoObjectGeneratedError.isInstance(error)) {
console.error('Object not generated according to schema, fallback to manual JSON parsing');
console.error('Object not generated according to schema, fallback to manual parsing');
try {
// First try standard JSON parsing
const partialResponse = JSON.parse((error as any).text);
console.log('JSON parse success!')
return {
object: partialResponse as T,
usage: (error as any).usage
};
} catch (parseError) {
throw error;
// Use Hjson to parse the error response for more lenient parsing
try {
const hjsonResponse = Hjson.parse((error as any).text);
console.log('Hjson parse success!')
return {
object: hjsonResponse as T,
usage: (error as any).usage
};
} catch (hjsonError) {
console.error('Both JSON and Hjson parsing failed:', hjsonError);
throw error;
}
}
}
throw error;

View File

@ -187,7 +187,7 @@ export class Schemas {
return z.object({
type: z.literal('strict'),
...baseSchemaBefore,
improvement_plan: z.string().describe('Short explain how a perfect answer should look like and what revisions are needed to improve the current answer.').max(500),
improvement_plan: z.string().describe('Explain how a perfect answer should look like and what are needed to improve the current answer. Starts with "For the best answer, you must..."').max(500),
...baseSchemaAfter
});
default: