feat: improved evaluators

This commit is contained in:
Han Xiao
2025-02-06 21:36:32 +08:00
parent a5e5627823
commit 906424f015
3 changed files with 282 additions and 136 deletions

View File

@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
if (allowSearch) {
actions.push("search");
properties.searchQuery = z.string()
properties.searchQuery = z.string().max(30)
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
}
@@ -356,38 +356,23 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
// execute the step and action
if (thisStep.action === 'answer') {
if (step === 1) {
// LLM is so confident and answer immediately, skip all evaluations
isAnswered = true;
break
}
updateContext({
totalStep,
question: currentQuestion,
...thisStep,
});
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
['definitive', 'freshness', 'plurality'], context.tokenTracker);
if (currentQuestion === question) {
if (badAttempts >= maxBadAttempts) {
// EXIT POINT OF THE PROGRAM!!!!
diaryContext.push(`
At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
Original question:
${currentQuestion}
Your answer:
${thisStep.answer}
The evaluator thinks your answer is good because:
${evaluation.reasoning}
Your journey ends here.
`);
isAnswered = false;
break
}
if (evaluation.is_definitive) {
if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
// EXIT POINT OF THE PROGRAM!!!!
if (evaluation.pass) {
diaryContext.push(`
At step ${step}, you took **answer** action and finally found the answer to the original question:
@@ -398,29 +383,16 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is good because:
${evaluation.reasoning}
${evaluation.think}
Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
`);
isAnswered = true;
break
} else {
diaryContext.push(`
At step ${step}, you took **answer** action and finally found the answer to the original question:
Original question:
${currentQuestion}
Your answer:
${thisStep.answer}
Unfortunately, you did not provide any references to support your answer.
You need to find more URL references to support your answer.`);
}
isAnswered = true;
if (badAttempts >= maxBadAttempts) {
isAnswered = false;
break
} else {
diaryContext.push(`
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
@@ -432,15 +404,22 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is bad because:
${evaluation.reasoning}
${evaluation.think}
`);
// store the bad context and reset the diary context
const {response: errorAnalysis} = await analyzeSteps(diaryContext);
allKnowledge.push({
question: currentQuestion,
answer: thisStep.answer,
references: thisStep.references,
type: 'qa'
});
badContext.push({
question: currentQuestion,
answer: thisStep.answer,
evaluation: evaluation.reasoning,
evaluation: evaluation.think,
...errorAnalysis
});
badAttempts++;
@@ -448,7 +427,8 @@ ${evaluation.reasoning}
diaryContext = [];
step = 0;
}
} else if (evaluation.is_definitive) {
}
} else if (evaluation.pass) {
diaryContext.push(`
At step ${step}, you took **answer** action. You found a good answer to the sub-question:
@@ -459,7 +439,7 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is good because:
${evaluation.reasoning}
${evaluation.think}
Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
`);

View File

@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';
const model = getModel('evaluator');
const responseSchema = z.object({
is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
type EvaluationType = 'definitive' | 'freshness' | 'plurality';
/**
 * Fields shared by every evaluator result schema. The object is spread into
 * each per-criterion `z.object(...)`, so all evaluators report the same
 * verdict/explanation pair.
 */
const baseSchema = {
  // Overall verdict for the criterion being checked.
  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
  // Explanation for either verdict: the few-shot examples fill `think` for
  // passing answers as well, so the description must not be failure-only.
  think: z.string().describe('Explanation of the thought process behind why the answer passes or does not pass the evaluation criteria')
};
/**
 * Result shape for the 'definitive' check: the shared verdict fields from
 * `baseSchema` followed by a literal `type` tag.
 */
const definitiveSchema = z.object(baseSchema).extend({
  type: z.literal('definitive')
});
function getPrompt(question: string, answer: string): string {
// Detail fields the freshness evaluator reports alongside its verdict.
const freshnessAnalysisSchema = z.object({
  likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
  dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
  current_time: z.string().describe('Current system time when evaluation was performed'),
  max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
});

/**
 * Result shape for the 'freshness' check: the shared verdict fields, a
 * literal `type` tag, and the nested `freshness_analysis` details.
 */
const freshnessSchema = z.object(baseSchema).extend({
  type: z.literal('freshness'),
  freshness_analysis: freshnessAnalysisSchema
});
// Detail fields the plurality evaluator reports alongside its verdict.
const pluralityAnalysisSchema = z.object({
  expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
  provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
  count_expected: z.number().optional().describe('Number of items expected if specified in question'),
  count_provided: z.number().describe('Number of items provided in answer')
});

/**
 * Result shape for the 'plurality' check: the shared verdict fields, a
 * literal `type` tag, and the nested `plurality_analysis` details.
 */
const pluralitySchema = z.object(baseSchema).extend({
  type: z.literal('plurality'),
  plurality_analysis: pluralityAnalysisSchema
});
function getDefinitivePrompt(question: string, answer: string): string {
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
<rules>
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
5. Non-answers that suggest alternatives
</rules>
<examples>
Question: "What are the system requirements for running Python 3.9?"
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
Evaluation: {
"is_definitive": false,
"reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
"pass": false,
"think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
}
Question: "What are the system requirements for running Python 3.9?"
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
Evaluation: {
"is_definitive": true,
"reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
"pass": true,
"think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
}
Question: "Who will be the president of the United States in 2032?"
Answer: "I cannot predict the future, it depends on the election results."
Evaluation: {
"is_definitive": false,
"reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
"pass": false,
"think": "The answer contains a statement of inability to predict the future, making it non-definitive."
}
Question: "Who is the sales director at Company X?"
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
Evaluation: {
"is_definitive": false,
"reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
"pass": false,
"think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
}
Question: "what is the twitter account of jina ai's founder?"
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
Evaluation: {
"is_definitive": false,
"reasoning": "The answer indicates a lack of information rather than providing a definitive response."
"pass": false,
"think": "The answer indicates a lack of information rather than providing a definitive response."
}
</examples>
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}
export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
/**
 * Builds the prompt for the freshness evaluator, which judges whether an
 * answer is likely outdated relative to a reference time.
 *
 * The final pair repeats the `Current Time:` line so that it has the same
 * layout as the few-shot examples above it. The second example uses a
 * reference time that keeps any date in December 2023 inside the 60-day
 * window, so the demonstrated verdict actually follows from the stated rule.
 *
 * @param question - The question being answered; embedded JSON-encoded.
 * @param answer - The candidate answer; embedded JSON-encoded.
 * @param currentTime - Reference "now" that dates are compared against. It is
 *   interpolated verbatim into the rules and JSON-encoded in the final pair;
 *   the examples use ISO-8601 timestamps.
 * @returns The complete prompt text.
 */
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
<rules>
1. Date Analysis:
- Extract all dates mentioned in the answer
- Compare against current system time: ${currentTime}
- Consider content outdated if:
* It refers to a "latest" or "current" state from more than 30 days ago
* It mentions specific dates/events that have been superseded
* It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
- For product versions, releases, or announcements, max age is 30 days
- For company positions, leadership, or general facts, max age is 60 days
2. Context Hints:
- Words indicating recency: "latest", "current", "newest", "just released", "recently"
- Time-sensitive terms: "CEO", "price", "version", "release"
- Future dates should be ignored in outdated calculation
</rules>
<examples>
Question: "What is Jina AI's latest embedding model?"
Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
Current Time: "2024-10-06T00:00:00Z"
Evaluation: {
"pass": false,
"think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
"freshness_analysis": {
"likely_outdated": true,
"dates_mentioned": ["2024-03-15"],
"current_time": "2024-10-06T00:00:00Z",
"max_age_days": 30
}
}
Question: "Who is OpenAI's CEO?"
Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
Current Time: "2024-01-15T00:00:00Z"
Evaluation: {
"pass": true,
"think": "The answer is about company leadership and is within the 60-day threshold for such information",
"freshness_analysis": {
"likely_outdated": false,
"dates_mentioned": ["2023-12"],
"current_time": "2024-01-15T00:00:00Z",
"max_age_days": 60
}
}
</examples>
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}
Current Time: ${JSON.stringify(currentTime)}`;
}
/**
 * Builds the prompt for the plurality evaluator, which judges whether an
 * answer supplies as many items as the question asks for.
 *
 * The rules are phrased in terms of the `pass` field this evaluator emits,
 * rather than "definitive" wording carried over from the definitiveness
 * evaluator, so the instructions map directly onto the output shown in the
 * examples.
 *
 * @param question - The question being answered; embedded JSON-encoded.
 * @param answer - The candidate answer; embedded JSON-encoded.
 * @returns The complete prompt text.
 */
function getPluralityPrompt(question: string, answer: string): string {
  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
<rules>
1. Question Analysis:
- Check if question asks for multiple items using indicators like:
* Plural nouns: "companies", "people", "names"
* Quantifiers: "all", "many", "several", "various", "multiple"
* List requests: "list", "enumerate", "name all", "give me all"
* Numbers: "5 examples", "top 10"
- Otherwise skip the analysis and set pass to true
2. Answer Analysis:
- Count distinct items provided in the answer
- Check if answer uses limiting words like "only", "just", "single"
- Identify if answer acknowledges there are more items but only provides some
3. Pass/Fail Rules:
- If question asks for multiple items but answer provides only one → pass is false
- If question asks for specific number (e.g., "top 5") but answer provides fewer → pass is false
- If answer clearly states it's providing a partial list → pass is false
- If question asks for "all" or "every" but answer seems incomplete → pass is false
</rules>
<examples>
Question: "Who works in Jina AI's sales team?"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
"pass": true,
"think": "The question doesn't specifically ask for multiple team members, so a single name is a sufficient answer.",
"plurality_analysis": {
"expects_multiple": false,
"provides_multiple": false,
"count_provided": 1
}
}
Question: "List all the salespeople who work at Jina AI"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
"pass": false,
"think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
"plurality_analysis": {
"expects_multiple": true,
"provides_multiple": false,
"count_provided": 1
}
}
Question: "Name the top 3 products sold by Jina AI"
Answer: "Jina AI's product lineup includes DocArray and Jina."
Evaluation: {
"pass": false,
"think": "The question asks for top 3 products but only 2 are provided.",
"plurality_analysis": {
"expects_multiple": true,
"provides_multiple": true,
"count_expected": 3,
"count_provided": 2
}
}
Question: "List as many AI companies in Berlin as you can find"
Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
Evaluation: {
"pass": false,
"think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
"plurality_analysis": {
"expects_multiple": true,
"provides_multiple": true,
"count_provided": 5
}
}
</examples>
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}
export async function evaluateAnswer(
question: string,
answer: string,
evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
tracker?: TokenTracker
): Promise<{ response: EvaluationResponse }> {
let result;
for (const evaluationType of evaluationOrder) {
try {
const prompt = getPrompt(question, answer);
let object;
let totalTokens = 0;
try {
const result = await generateObject({
switch (evaluationType) {
case 'definitive':
result = await generateObject({
model,
schema: responseSchema,
prompt,
schema: definitiveSchema,
prompt: getDefinitivePrompt(question, answer),
maxTokens: getMaxTokens('evaluator')
});
object = result.object;
totalTokens = result.usage?.totalTokens || 0;
} catch (error) {
const result = await handleGenerateObjectError<EvaluationResponse>(error);
object = result.object;
totalTokens = result.totalTokens;
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
}
console.log('Evaluation:', {
definitive: object.is_definitive,
reason: object.reasoning
break;
case 'freshness':
result = await generateObject({
model,
schema: freshnessSchema,
prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
maxTokens: getMaxTokens('evaluator')
});
(tracker || new TokenTracker()).trackUsage('evaluator', totalTokens);
return { response: object, tokens: totalTokens };
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
}
break;
case 'plurality':
result = await generateObject({
model,
schema: pluralitySchema,
prompt: getPluralityPrompt(question, answer),
maxTokens: getMaxTokens('evaluator')
});
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
}
break;
}
} catch (error) {
console.error('Error in answer evaluation:', error);
throw error;
console.error(`Error in ${evaluationType} evaluation:`, error);
const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
(tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
if (!errorResult.object.pass) {
return { response: errorResult.object };
}
}
}
// Example usage
async function main() {
const question = process.argv[2] || '';
const answer = process.argv[3] || '';
if (!question || !answer) {
console.error('Please provide both question and answer as command line arguments');
process.exit(1);
}
try {
await evaluateAnswer(question, answer);
} catch (error) {
console.error('Failed to evaluate answer:', error);
}
}
if (require.main === module) {
main().catch(console.error);
return { response: result!.object };
}

View File

@@ -1,18 +1,3 @@
import { z } from 'zod';
export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
export const QuerySchema = z.string()
.max(30)
.describe('Search query, must be less than 30 characters');
export const URLSchema = z.string().url();
export const ReferenceSchema = z.object({
exactQuote: z.string().describe('Exact relevant quote from the document'),
url: URLSchema.describe('URL of the document')
});
// Action Types
type BaseAction = {
action: "search" | "answer" | "reflect" | "visit";
@@ -96,9 +81,12 @@ export interface ReadResponse {
readableMessage?: string;
}
export type EvaluationResponse = {
is_definitive: boolean;
reasoning: string;
pass: boolean;
think: string;
};
export type ErrorAnalysisResponse = {