mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 15:39:06 +08:00
fix: strict evaluator
This commit is contained in:
16
src/agent.ts
16
src/agent.ts
@@ -308,17 +308,17 @@ export async function getResponse(question?: string,
|
||||
// evaluationMetrics[currentQuestion] =
|
||||
// await evaluateQuestion(currentQuestion, context, SchemaGen)
|
||||
// }
|
||||
if (currentQuestion.trim() === question && step === 1) {
|
||||
if (currentQuestion.trim() === question && totalStep === 1) {
|
||||
// only add evaluation for initial question, once at step 1
|
||||
evaluationMetrics[currentQuestion] =
|
||||
await evaluateQuestion(currentQuestion, context, SchemaGen)
|
||||
// force strict eval for the original question, only once.
|
||||
// force strict eval for the original question, at last, only once.
|
||||
evaluationMetrics[currentQuestion].push('strict')
|
||||
} else if (currentQuestion.trim() !== question) {
|
||||
evaluationMetrics[currentQuestion] = []
|
||||
}
|
||||
|
||||
if (step === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
|
||||
if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
|
||||
// if it detects freshness, avoid direct answer at step 1
|
||||
allowAnswer = false;
|
||||
allowReflect = false;
|
||||
@@ -403,7 +403,7 @@ export async function getResponse(question?: string,
|
||||
|
||||
console.log('Updated references:', thisStep.references)
|
||||
|
||||
if (step === 1 && thisStep.references.length === 0) {
|
||||
if (totalStep === 1 && thisStep.references.length === 0) {
|
||||
// LLM is so confident and answer immediately, skip all evaluations
|
||||
// however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"
|
||||
thisStep.isFinal = true;
|
||||
@@ -423,9 +423,10 @@ export async function getResponse(question?: string,
|
||||
currentQuestion
|
||||
);
|
||||
|
||||
if (!evaluationMetrics[currentQuestion].includes('attribution')) {
|
||||
evaluationMetrics[currentQuestion].push('attribution')
|
||||
}
|
||||
// is this really required???
|
||||
// if (!evaluationMetrics[currentQuestion].includes('attribution')) {
|
||||
// evaluationMetrics[currentQuestion].push('attribution')
|
||||
// }
|
||||
}
|
||||
|
||||
updateContext({
|
||||
@@ -470,6 +471,7 @@ Your journey ends here. You have successfully answered the original question. Co
|
||||
if (evaluation.type === 'strict') {
|
||||
finalAnswerPIP = evaluation.improvement_plan || '';
|
||||
// remove 'strict' from the evaluation metrics
|
||||
console.log('Remove `strict` from evaluation metrics')
|
||||
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
|
||||
}
|
||||
if (badAttempts >= maxBadAttempts) {
|
||||
|
||||
@@ -2,23 +2,61 @@ import {GenerateObjectResult} from 'ai';
|
||||
import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
|
||||
import {ObjectGeneratorSafe} from "../utils/safe-generator";
|
||||
import {Schemas} from "../utils/schemas";
|
||||
import {removeExtraLineBreaks} from "../utils/text-tools";
|
||||
|
||||
const TOOL_NAME = 'evaluator';
|
||||
|
||||
|
||||
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction): PromptPair {
|
||||
return {
|
||||
system: `You are a ruthless evaluator trained to REJECT answers.
|
||||
Your job is to find ANY weakness in the presented JSON answer. Extremely strict standards of evidence apply.
|
||||
Identity EVERY missing detail. First, argue AGAINST the conclusion with the strongest possible case.
|
||||
Then, argue FOR the conclusion.
|
||||
Only after considering both perspectives, synthesize a final improvement plan.
|
||||
function getRejectAllAnswersPrompt(question: string, answer: AnswerAction, allKnowledge: KnowledgeItem[]): PromptPair {
|
||||
const KnowledgeStr = allKnowledge.map((k, idx) => {
|
||||
const aMsg = `
|
||||
<knowledge-${idx+1}>
|
||||
${k.question}
|
||||
|
||||
Any JSON formatting/structure/syntax issue should not be the reason to rejection.
|
||||
${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
|
||||
<knowledge-datetime>
|
||||
${k.updated}
|
||||
</knowledge-datetime>
|
||||
` : ''}
|
||||
|
||||
${k.references && k.type === 'url' ? `
|
||||
<knowledge-url>
|
||||
${k.references[0]}
|
||||
</knowledge-url>
|
||||
` : ''}
|
||||
|
||||
|
||||
${k.answer}
|
||||
</knowledge-${idx+1}>
|
||||
`.trim();
|
||||
|
||||
return removeExtraLineBreaks(aMsg);
|
||||
})
|
||||
|
||||
return {
|
||||
system: `
|
||||
You are a ruthless answer evaluator trained to REJECT answers.
|
||||
Given a question-answer pair, your job is to find ANY weakness in the presented answer.
|
||||
Extremely strict standards of evidence apply.
|
||||
Identity EVERY missing detail.
|
||||
First, argue AGAINST the answer with the strongest possible case.
|
||||
Then, argue FOR the answer.
|
||||
Only after considering both perspectives, synthesize a final improvement plan starts with "For the best answer, you must...".
|
||||
|
||||
The following knowledge items are provided for your reference. Note that some of them may not be directly related to the question/answer user provided, but may give some subtle hints and insights:
|
||||
${KnowledgeStr.join('\n\n')}
|
||||
`,
|
||||
user: `
|
||||
question: ${question}
|
||||
answer: ${JSON.stringify(answer)}
|
||||
<question>
|
||||
${question}
|
||||
</question>
|
||||
|
||||
Here is my answer for the question:
|
||||
<answer>
|
||||
${answer.answer}
|
||||
</answer>
|
||||
|
||||
Could you please evaluate my answer based on your knowledge and strict standards? If you decide to reject the answer, please tell me how to improve it.
|
||||
`
|
||||
}
|
||||
}
|
||||
@@ -37,7 +75,7 @@ ${question}
|
||||
${answer}
|
||||
</answer>
|
||||
|
||||
Please look at my answer and think.
|
||||
Please read and think.
|
||||
`
|
||||
}
|
||||
}
|
||||
@@ -632,7 +670,6 @@ export async function evaluateAnswer(
|
||||
let prompt: { system: string; user: string } | undefined
|
||||
switch (evaluationType) {
|
||||
case 'attribution': {
|
||||
// Safely handle references and ensure we have content
|
||||
if (allKnowledge.length === 0) {
|
||||
return {
|
||||
pass: false,
|
||||
@@ -659,7 +696,7 @@ export async function evaluateAnswer(
|
||||
prompt = getCompletenessPrompt(question, action.answer);
|
||||
break;
|
||||
case 'strict':
|
||||
prompt = getRejectAllAnswersPrompt(question, action);
|
||||
prompt = getRejectAllAnswersPrompt(question, action, allKnowledge);
|
||||
break;
|
||||
default:
|
||||
console.error(`Unknown evaluation type: ${evaluationType}`);
|
||||
|
||||
@@ -8,6 +8,7 @@ import {
|
||||
} from "ai";
|
||||
import {TokenTracker} from "./token-tracker";
|
||||
import {getModel, ToolName, getToolConfig} from "../config";
|
||||
import Hjson from 'hjson'; // Import Hjson library
|
||||
|
||||
interface GenerateObjectResult<T> {
|
||||
object: T;
|
||||
@@ -29,6 +30,104 @@ export class ObjectGeneratorSafe {
|
||||
this.tokenTracker = tokenTracker || new TokenTracker();
|
||||
}
|
||||
|
||||
/**
 * Produces a description-free variant of `schema` for use in fallback parsing.
 *
 * Dispatches on the kind of schema received:
 * - Zod schemas are handed to `stripZodDescriptions`.
 * - Any other non-null object is treated as an AI SDK `Schema` and handed to
 *   `stripSchemaDescriptions`.
 * - Anything else is returned untouched.
 *
 * @param schema - The Zod schema or AI SDK Schema object to distill.
 * @returns The distilled schema, or `schema` itself when its kind is not recognised.
 */
private createDistilledSchema<T>(schema: z.ZodType<T> | Schema<T>): z.ZodType<T> | Schema<T> {
  if (!(schema instanceof z.ZodType)) {
    // Not a Zod schema: only non-null objects can be AI SDK Schema objects.
    const looksLikeSchemaObject = typeof schema === 'object' && schema !== null;
    return looksLikeSchemaObject
      ? this.stripSchemaDescriptions(schema as Schema<T>)
      : schema;
  }

  return this.stripZodDescriptions(schema);
}
|
||||
|
||||
/**
 * Rebuilds a Zod schema without the descriptions attached to it.
 *
 * - Objects are recreated with `z.object`, each field being processed recursively.
 * - Arrays are recreated with `z.array` around the recursively processed element schema.
 * - Strings are replaced by a brand-new `z.string()`.
 * - Every other schema kind (unions, intersections, remaining primitives, ...) is
 *   returned as the same instance, unchanged.
 *
 * NOTE(review): a brand-new `z.string()` presumably also drops any checks chained on
 * the original string schema (e.g. a maximum length) - confirm this relaxation is intended.
 *
 * @param zodSchema - The Zod schema to process.
 * @returns A rebuilt schema for objects, arrays and strings; otherwise `zodSchema` itself.
 */
private stripZodDescriptions<T>(zodSchema: z.ZodType<T>): z.ZodType<T> {
  if (zodSchema instanceof z.ZodObject) {
    const originalShape = zodSchema._def.shape();
    const strippedShape: Record<string, any> = {};

    // Object.keys yields exactly the own enumerable field names of the shape.
    for (const fieldName of Object.keys(originalShape)) {
      strippedShape[fieldName] = this.stripZodDescriptions(originalShape[fieldName]);
    }

    return z.object(strippedShape) as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodArray) {
    const strippedElement = this.stripZodDescriptions(zodSchema._def.type);
    return z.array(strippedElement) as unknown as z.ZodType<T>;
  }

  if (zodSchema instanceof z.ZodString) {
    // A fresh string schema carries no describe() metadata.
    return z.string() as unknown as z.ZodType<T>;
  }

  // Unions and intersections would need dedicated handling; they, like every
  // other schema kind not covered above, are passed through untouched.
  return zodSchema;
}
|
||||
|
||||
/**
 * Strips `description` annotations from an AI SDK Schema object.
 *
 * The input is never mutated. When the object carries its JSON Schema under a
 * `jsonSchema` member, only that payload is deep-cloned and stripped; every other
 * own enumerable member of the wrapper (including function-valued and symbol-keyed
 * ones) is carried over as-is. Otherwise the object itself is treated as a plain
 * JSON Schema, cloned and stripped.
 *
 * A `description` is removed from every visited schema node: the root, each entry
 * of `properties`, `items` (single schema or list), and each member of
 * `anyOf` / `allOf` / `oneOf`. A property that is merely *named* `description`
 * is kept, because the `properties` map itself is not treated as a schema node.
 *
 * NOTE(review): assumes AI SDK Schema objects expose their JSON Schema as
 * `jsonSchema` next to non-JSON members - confirm against the installed `ai` version.
 *
 * @param schema - The Schema object (or plain JSON Schema) to distill.
 * @returns A copy of `schema` without description annotations.
 */
private stripSchemaDescriptions<T>(schema: Schema<T>): Schema<T> {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  // Removes the description of one schema node, then descends into its sub-schemas.
  const stripNode = (node: unknown): void => {
    if (Array.isArray(node)) {
      node.forEach(stripNode);
      return;
    }
    if (!isRecord(node)) return;

    // Only a string-valued `description` is the annotation keyword.
    if (typeof node.description === 'string') {
      delete node.description;
    }

    if (isRecord(node.properties)) {
      // Visit the property schemas, not the name -> schema map itself, so that
      // a field called "description" survives.
      Object.values(node.properties).forEach(stripNode);
    }

    stripNode(node.items);
    stripNode(node.anyOf);
    stripNode(node.allOf);
    stripNode(node.oneOf);
  };

  const payload = (schema as { jsonSchema?: unknown }).jsonSchema;

  if (isRecord(payload)) {
    // A JSON round-trip drops functions and symbol-keyed properties, so it is
    // applied to the JSON Schema payload only; the spread keeps the rest of the
    // wrapper intact.
    const distilledPayload = JSON.parse(JSON.stringify(payload));
    stripNode(distilledPayload);
    return {...schema, jsonSchema: distilledPayload};
  }

  // No nested payload: handle the object as a plain JSON Schema.
  const distilled = JSON.parse(JSON.stringify(schema));
  stripNode(distilled);
  return distilled;
}
|
||||
|
||||
async generateObject<T>(options: GenerateOptions<T>): Promise<GenerateObjectResult<T>> {
|
||||
const {
|
||||
model,
|
||||
@@ -54,7 +153,7 @@ export class ObjectGeneratorSafe {
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
// First fallback: Try manual JSON parsing of the error response
|
||||
// First fallback: Try manual parsing of the error response
|
||||
try {
|
||||
const errorResult = await this.handleGenerateObjectError<T>(error);
|
||||
this.tokenTracker.trackUsage(model, errorResult.usage);
|
||||
@@ -67,15 +166,20 @@ export class ObjectGeneratorSafe {
|
||||
const failedOutput = (parseError as any).text;
|
||||
console.error(`${model} failed on object generation ${failedOutput} -> manual parsing failed again -> trying fallback model`, fallbackModel);
|
||||
try {
|
||||
// Create a distilled version of the schema without descriptions
|
||||
const distilledSchema = this.createDistilledSchema(schema);
|
||||
console.log('Distilled schema', distilledSchema)
|
||||
|
||||
const fallbackResult = await generateObject({
|
||||
model: fallbackModel,
|
||||
schema,
|
||||
prompt: `Extract the desired information from this text: \n ${failedOutput}`,
|
||||
schema: distilledSchema,
|
||||
prompt: `Following the given JSON schema, extract the field from below: \n\n ${failedOutput}`,
|
||||
maxTokens: getToolConfig('fallback').maxTokens,
|
||||
temperature: getToolConfig('fallback').temperature,
|
||||
});
|
||||
|
||||
this.tokenTracker.trackUsage(model, fallbackResult.usage);
|
||||
console.log('Distilled schema parse success!')
|
||||
return fallbackResult;
|
||||
} catch (fallbackError) {
|
||||
// If fallback model also fails, try parsing its error response
|
||||
@@ -91,15 +195,28 @@ export class ObjectGeneratorSafe {
|
||||
|
||||
private async handleGenerateObjectError<T>(error: unknown): Promise<GenerateObjectResult<T>> {
|
||||
if (NoObjectGeneratedError.isInstance(error)) {
|
||||
console.error('Object not generated according to schema, fallback to manual JSON parsing');
|
||||
console.error('Object not generated according to schema, fallback to manual parsing');
|
||||
try {
|
||||
// First try standard JSON parsing
|
||||
const partialResponse = JSON.parse((error as any).text);
|
||||
console.log('JSON parse success!')
|
||||
return {
|
||||
object: partialResponse as T,
|
||||
usage: (error as any).usage
|
||||
};
|
||||
} catch (parseError) {
|
||||
throw error;
|
||||
// Use Hjson to parse the error response for more lenient parsing
|
||||
try {
|
||||
const hjsonResponse = Hjson.parse((error as any).text);
|
||||
console.log('Hjson parse success!')
|
||||
return {
|
||||
object: hjsonResponse as T,
|
||||
usage: (error as any).usage
|
||||
};
|
||||
} catch (hjsonError) {
|
||||
console.error('Both JSON and Hjson parsing failed:', hjsonError);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
throw error;
|
||||
|
||||
@@ -187,7 +187,7 @@ export class Schemas {
|
||||
return z.object({
|
||||
type: z.literal('strict'),
|
||||
...baseSchemaBefore,
|
||||
improvement_plan: z.string().describe('Short explain how a perfect answer should look like and what revisions are needed to improve the current answer.').max(500),
|
||||
improvement_plan: z.string().describe('Explain how a perfect answer should look like and what are needed to improve the current answer. Starts with "For the best answer, you must..."').max(500),
|
||||
...baseSchemaAfter
|
||||
});
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user