fix: eval logic

This commit is contained in:
Han Xiao 2025-03-20 10:49:57 +08:00
parent 8fa9aaa151
commit 4523f10283
4 changed files with 47 additions and 50 deletions

View File

@ -17,7 +17,7 @@ import {
SearchResult,
EvaluationType,
BoostedSearchSnippet,
SearchSnippet, EvaluationResponse, Reference, SERPQuery
SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType
} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
@ -353,10 +353,13 @@ async function executeSearchQueries(
};
}
/**
 * Reports whether a check of the given evaluation type is still present
 * in the list of repeat-evaluation checks.
 */
function includesEval(allChecks: RepeatEvaluationType[], evalType: EvaluationType): boolean {
  for (const check of allChecks) {
    if (check.type === evalType) {
      return true;
    }
  }
  return false;
}
export async function getResponse(question?: string,
tokenBudget: number = 1_000_000,
maxBadAttempts: number = 3,
maxBadAttempts: number = 2,
existingContext?: Partial<TrackerContext>,
messages?: Array<CoreMessage>,
numReturnedURLs: number = 100,
@ -367,7 +370,6 @@ export async function getResponse(question?: string,
let step = 0;
let totalStep = 0;
let badAttempts = 0;
question = question?.trim() as string;
// remove incoming system messages to avoid override
@ -409,19 +411,18 @@ export async function getResponse(question?: string,
let allowReflect = true;
let allowCoding = false;
let system = '';
let maxStrictEvals = Math.max(1, Math.min(3, maxBadAttempts - 1));
let msgWithKnowledge: CoreMessage[] = [];
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
const allURLs: Record<string, SearchSnippet> = {};
const visitedURLs: string[] = [];
const badURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationType[]> = {};
const evaluationMetrics: Record<string, RepeatEvaluationType[]> = {};
// reserve the 10% final budget for the beast mode
const regularBudget = tokenBudget * 0.85;
const finalAnswerPIP: string[] = [];
let trivialQuestion = false;
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget && badAttempts <= maxBadAttempts) {
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
// add 1s delay to avoid rate limiting
step++;
totalStep++;
@ -438,14 +439,19 @@ export async function getResponse(question?: string,
if (currentQuestion.trim() === question && totalStep === 1) {
// only add evaluation for initial question, once at step 1
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
(await evaluateQuestion(currentQuestion, context, SchemaGen)).map(e => {
return {
type: e,
numEvalsRequired: maxBadAttempts
} as RepeatEvaluationType
})
// force strict eval for the original question, at last, only once.
evaluationMetrics[currentQuestion].push('strict')
evaluationMetrics[currentQuestion].push({type: 'strict', numEvalsRequired: maxBadAttempts});
} else if (currentQuestion.trim() !== question) {
evaluationMetrics[currentQuestion] = []
}
if (totalStep === 1 && evaluationMetrics[currentQuestion].includes('freshness')) {
if (totalStep === 1 && includesEval(evaluationMetrics[currentQuestion], 'freshness')) {
// if it detects freshness, avoid direct answer at step 1
allowAnswer = false;
allowReflect = false;
@ -501,7 +507,7 @@ export async function getResponse(question?: string,
console.log(`${currentQuestion}: ${thisStep.action} <- [${actionsStr}]`);
console.log(thisStep)
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.actionTracker.trackAction({totalStep, thisStep, gaps});
// reset allow* to true
allowAnswer = true;
@ -554,7 +560,7 @@ export async function getResponse(question?: string,
evaluation = await evaluateAnswer(
currentQuestion,
thisStep,
evaluationMetrics[currentQuestion],
evaluationMetrics[currentQuestion].map(e => e.type),
context,
allKnowledge,
SchemaGen
@ -583,20 +589,25 @@ Your journey ends here. You have successfully answered the original question. Co
thisStep.isFinal = true;
break
} else {
// lower numEvalsRequired for the failed evaluation and if numEvalsRequired is 0, remove it from the evaluation metrics
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].map(e => {
if (e.type === evaluation.type) {
e.numEvalsRequired--;
}
return e;
}).filter(e => e.numEvalsRequired > 0);
if (evaluation.type === 'strict' && evaluation.improvement_plan) {
finalAnswerPIP.push(evaluation.improvement_plan);
maxStrictEvals--;
if (maxStrictEvals <= 0) {
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
}
}
if (badAttempts >= maxBadAttempts) {
if (evaluationMetrics[currentQuestion].length === 0) {
// failed so many times, give up, route to beast mode
thisStep.isFinal = false;
break
} else {
diaryContext.push(`
}
diaryContext.push(`
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
Original question:
@ -608,11 +619,11 @@ ${thisStep.answer}
The evaluator thinks your answer is bad because:
${evaluation.think}
`);
// store the bad context and reset the diary context
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
// store the bad context and reset the diary context
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
allKnowledge.push({
question: `
allKnowledge.push({
question: `
Why is the following answer bad for the question? Please reflect
<question>
@ -623,7 +634,7 @@ ${currentQuestion}
${thisStep.answer}
</answer>
`,
answer: `
answer: `
${evaluation.think}
${errorAnalysis.recap}
@ -632,14 +643,12 @@ ${errorAnalysis.blame}
${errorAnalysis.improvement}
`,
type: 'qa',
})
type: 'qa',
})
badAttempts++;
allowAnswer = false; // disable answer action in the immediate next step
diaryContext = [];
step = 0;
}
allowAnswer = false; // disable answer action in the immediate next step
diaryContext = [];
step = 0;
}
} else if (evaluation.pass) {
// solved a gap question
@ -913,7 +922,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
} as AnswerAction;
await updateReferences(thisStep, allURLs);
(thisStep as AnswerAction).isFinal = true;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
context.actionTracker.trackAction({totalStep, thisStep, gaps});
}
if (!trivialQuestion) {

View File

@ -634,26 +634,13 @@ export async function evaluateAnswer(
for (const evaluationType of evaluationTypes) {
let prompt: { system: string; user: string } | undefined
switch (evaluationType) {
// case 'attribution': {
// if (allKnowledge.length === 0) {
// return {
// pass: false,
// think: `The knowledge is completely empty and the answer can not be derived from it. Need to found some other references and URLs`,
// type: 'attribution',
// };
// }
// prompt = getAttributionPrompt(question, action.answer, allKnowledge);
// break;
// }
case 'definitive':
prompt = getDefinitivePrompt(question, action.answer);
break;
case 'freshness':
prompt = getFreshnessPrompt(question, action, new Date().toISOString());
break;
case 'plurality':
prompt = getPluralityPrompt(question, action.answer);
break;
@ -681,6 +668,6 @@ export async function evaluateAnswer(
}
}
return result?.object as EvaluationResponse;
return result?.object as EvaluationResponse;
}

View File

@ -63,6 +63,10 @@ export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitActi
export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution' | 'completeness' | 'strict';
/**
 * An evaluation criterion paired with a retry budget. In the main loop,
 * numEvalsRequired is decremented each time the answer fails this check,
 * and the check is dropped once it reaches zero.
 */
export type RepeatEvaluationType = {
// Which evaluation criterion to apply (e.g. 'definitive', 'freshness', 'strict').
type: EvaluationType;
// Remaining failed evaluations tolerated before this check is removed.
numEvalsRequired: number;
}
// Following Vercel AI SDK's token counting interface
export interface TokenUsage {

View File

@ -5,7 +5,6 @@ import {getI18nText} from "./text-tools";
// Snapshot of the tracker's progress, emitted/stored by ActionTracker.
// NOTE(review): this is the pre-change shape — the surrounding diff shows
// `badAttempts` being removed from this interface by this commit.
interface ActionState {
// The most recent step action taken (answer/search/reflect/visit).
thisStep: StepAction;
// Open gap questions still to be resolved.
gaps: string[];
// Count of failed answer attempts (removed by this commit — see diff context).
badAttempts: number;
// Monotonic counter of steps taken across all questions.
totalStep: number;
}
@ -14,7 +13,6 @@ export class ActionTracker extends EventEmitter {
private state: ActionState = {
thisStep: {action: 'answer', answer: '', references: [], think: ''},
gaps: [],
badAttempts: 0,
totalStep: 0
};
@ -39,7 +37,6 @@ export class ActionTracker extends EventEmitter {
this.state = {
thisStep: {action: 'answer', answer: '', references: [], think: ''},
gaps: [],
badAttempts: 0,
totalStep: 0
};
}