mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
feat: improved evaluators
This commit is contained in:
70
src/agent.ts
70
src/agent.ts
@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
|
||||
|
||||
if (allowSearch) {
|
||||
actions.push("search");
|
||||
properties.searchQuery = z.string()
|
||||
properties.searchQuery = z.string().max(30)
|
||||
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
|
||||
}
|
||||
|
||||
@@ -356,38 +356,23 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
||||
|
||||
// execute the step and action
|
||||
if (thisStep.action === 'answer') {
|
||||
if (step === 1) {
|
||||
// LLM is so confident and answer immediately, skip all evaluations
|
||||
isAnswered = true;
|
||||
break
|
||||
}
|
||||
|
||||
updateContext({
|
||||
totalStep,
|
||||
question: currentQuestion,
|
||||
...thisStep,
|
||||
});
|
||||
|
||||
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
|
||||
|
||||
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
|
||||
['definitive', 'freshness', 'plurality'], context.tokenTracker);
|
||||
|
||||
if (currentQuestion === question) {
|
||||
if (badAttempts >= maxBadAttempts) {
|
||||
// EXIT POINT OF THE PROGRAM!!!!
|
||||
diaryContext.push(`
|
||||
At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
|
||||
|
||||
Original question:
|
||||
${currentQuestion}
|
||||
|
||||
Your answer:
|
||||
${thisStep.answer}
|
||||
|
||||
The evaluator thinks your answer is good because:
|
||||
${evaluation.reasoning}
|
||||
|
||||
Your journey ends here.
|
||||
`);
|
||||
isAnswered = false;
|
||||
break
|
||||
}
|
||||
if (evaluation.is_definitive) {
|
||||
if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
|
||||
// EXIT POINT OF THE PROGRAM!!!!
|
||||
if (evaluation.pass) {
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took **answer** action and finally found the answer to the original question:
|
||||
|
||||
@@ -398,29 +383,16 @@ Your answer:
|
||||
${thisStep.answer}
|
||||
|
||||
The evaluator thinks your answer is good because:
|
||||
${evaluation.reasoning}
|
||||
${evaluation.think}
|
||||
|
||||
Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
|
||||
`);
|
||||
isAnswered = true;
|
||||
break
|
||||
} else {
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took **answer** action and finally found the answer to the original question:
|
||||
|
||||
Original question:
|
||||
${currentQuestion}
|
||||
|
||||
Your answer:
|
||||
${thisStep.answer}
|
||||
|
||||
Unfortunately, you did not provide any references to support your answer.
|
||||
You need to find more URL references to support your answer.`);
|
||||
}
|
||||
|
||||
isAnswered = true;
|
||||
if (badAttempts >= maxBadAttempts) {
|
||||
isAnswered = false;
|
||||
break
|
||||
|
||||
} else {
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
|
||||
@@ -432,15 +404,22 @@ Your answer:
|
||||
${thisStep.answer}
|
||||
|
||||
The evaluator thinks your answer is bad because:
|
||||
${evaluation.reasoning}
|
||||
${evaluation.think}
|
||||
`);
|
||||
// store the bad context and reset the diary context
|
||||
const {response: errorAnalysis} = await analyzeSteps(diaryContext);
|
||||
|
||||
allKnowledge.push({
|
||||
question: currentQuestion,
|
||||
answer: thisStep.answer,
|
||||
references: thisStep.references,
|
||||
type: 'qa'
|
||||
});
|
||||
|
||||
badContext.push({
|
||||
question: currentQuestion,
|
||||
answer: thisStep.answer,
|
||||
evaluation: evaluation.reasoning,
|
||||
evaluation: evaluation.think,
|
||||
...errorAnalysis
|
||||
});
|
||||
badAttempts++;
|
||||
@@ -448,7 +427,8 @@ ${evaluation.reasoning}
|
||||
diaryContext = [];
|
||||
step = 0;
|
||||
}
|
||||
} else if (evaluation.is_definitive) {
|
||||
}
|
||||
} else if (evaluation.pass) {
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took **answer** action. You found a good answer to the sub-question:
|
||||
|
||||
@@ -459,7 +439,7 @@ Your answer:
|
||||
${thisStep.answer}
|
||||
|
||||
The evaluator thinks your answer is good because:
|
||||
${evaluation.reasoning}
|
||||
${evaluation.think}
|
||||
|
||||
Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
|
||||
`);
|
||||
|
||||
@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';
|
||||
|
||||
const model = getModel('evaluator');
|
||||
|
||||
const responseSchema = z.object({
|
||||
is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
|
||||
reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
|
||||
type EvaluationType = 'definitive' | 'freshness' | 'plurality';
|
||||
|
||||
/**
 * Zod fields shared by every evaluator schema in this file.
 *
 * `pass` is the verdict and `think` is the evaluator's explanation. The
 * explanation is requested for passing and failing answers alike, which is
 * how the few-shot examples in the evaluator prompts use it.
 */
const baseSchema = {
  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
  think: z.string().describe('Explanation of the thought process behind why the answer passes or does not pass the evaluation criteria')
};
|
||||
|
||||
/**
 * Output schema for the "definitive" evaluator: the shared verdict fields
 * plus a literal `type` tag identifying the evaluator.
 */
const definitiveSchema = z.object(baseSchema).extend({
  type: z.literal('definitive')
});
|
||||
|
||||
function getPrompt(question: string, answer: string): string {
|
||||
/**
 * Output schema for the "freshness" evaluator: the shared verdict fields, a
 * literal `type` tag, and a nested `freshness_analysis` object describing the
 * dates found in the answer and whether the content looks outdated.
 */
const freshnessSchema = z.object(baseSchema).extend({
  type: z.literal('freshness'),
  freshness_analysis: z.object({
    likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
    dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
    current_time: z.string().describe('Current system time when evaluation was performed'),
    max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
  })
});
|
||||
|
||||
/**
 * Output schema for the "plurality" evaluator: the shared verdict fields, a
 * literal `type` tag, and a nested `plurality_analysis` object comparing how
 * many items the question expects with how many the answer provides.
 */
const pluralitySchema = z.object(baseSchema).extend({
  type: z.literal('plurality'),
  plurality_analysis: z.object({
    expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
    provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
    count_expected: z.number().optional().describe('Number of items expected if specified in question'),
    count_provided: z.number().describe('Number of items provided in answer')
  })
});
|
||||
|
||||
function getDefinitivePrompt(question: string, answer: string): string {
|
||||
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
|
||||
|
||||
<rules>
|
||||
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
|
||||
5. Non-answers that suggest alternatives
|
||||
</rules>
|
||||
|
||||
|
||||
<examples>
|
||||
Question: "What are the system requirements for running Python 3.9?"
|
||||
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
|
||||
Evaluation: {
|
||||
"is_definitive": false,
|
||||
"reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
|
||||
"pass": false,
|
||||
"think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
|
||||
}
|
||||
|
||||
Question: "What are the system requirements for running Python 3.9?"
|
||||
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
|
||||
Evaluation: {
|
||||
"is_definitive": true,
|
||||
"reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
|
||||
"pass": true,
|
||||
"think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
|
||||
}
|
||||
|
||||
Question: "Who will be the president of the United States in 2032?"
|
||||
Answer: "I cannot predict the future, it depends on the election results."
|
||||
Evaluation: {
|
||||
"is_definitive": false,
|
||||
"reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
|
||||
"pass": false,
|
||||
"think": "The answer contains a statement of inability to predict the future, making it non-definitive."
|
||||
}
|
||||
|
||||
Question: "Who is the sales director at Company X?"
|
||||
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
|
||||
Evaluation: {
|
||||
"is_definitive": false,
|
||||
"reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
|
||||
"pass": false,
|
||||
"think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
|
||||
}
|
||||
|
||||
Question: "what is the twitter account of jina ai's founder?"
|
||||
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
|
||||
Evaluation: {
|
||||
"is_definitive": false,
|
||||
"reasoning": "The answer indicates a lack of information rather than providing a definitive response."
|
||||
"pass": false,
|
||||
"think": "The answer indicates a lack of information rather than providing a definitive response."
|
||||
}
|
||||
</examples>
|
||||
|
||||
Now evaluate this pair:
|
||||
Question: ${JSON.stringify(question)}
|
||||
Answer: ${JSON.stringify(answer)}`;
|
||||
}
|
||||
|
||||
export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
|
||||
/**
 * Builds the prompt for the freshness evaluator.
 *
 * The prompt contains the dating rules, two worked examples, and finally the
 * question/answer pair to judge. The pair is followed by a `Current Time:`
 * line so that it has the same Question / Answer / Current Time layout as the
 * examples; the time is also interpolated into the rules.
 *
 * @param question - The question that was asked; embedded JSON-encoded.
 * @param answer - The answer to evaluate; embedded JSON-encoded.
 * @param currentTime - Timestamp the evaluator should treat as "now"
 *   (the caller in this file passes an ISO-8601 string).
 * @returns The complete prompt text.
 */
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
  return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.

<rules>
1. Date Analysis:
   - Extract all dates mentioned in the answer
   - Compare against current system time: ${currentTime}
   - Consider content outdated if:
     * It refers to a "latest" or "current" state from more than 30 days ago
     * It mentions specific dates/events that have been superseded
     * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
   - For product versions, releases, or announcements, max age is 30 days
   - For company positions, leadership, or general facts, max age is 60 days

2. Context Hints:
   - Words indicating recency: "latest", "current", "newest", "just released", "recently"
   - Time-sensitive terms: "CEO", "price", "version", "release"
   - Future dates should be ignored in outdated calculation
</rules>

<examples>
Question: "What is Jina AI's latest embedding model?"
Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
Current Time: "2024-10-06T00:00:00Z"
Evaluation: {
  "pass": false,
  "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
  "freshness_analysis": {
    "likely_outdated": true,
    "dates_mentioned": ["2024-03-15"],
    "current_time": "2024-10-06T00:00:00Z",
    "max_age_days": 30
  }
}

Question: "Who is OpenAI's CEO?"
Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
Current Time: "2024-02-06T00:00:00Z"
Evaluation: {
  "pass": true,
  "think": "The answer is about company leadership and is within the 60-day threshold for such information",
  "freshness_analysis": {
    "likely_outdated": false,
    "dates_mentioned": ["2023-12"],
    "current_time": "2024-02-06T00:00:00Z",
    "max_age_days": 60
  }
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}
Current Time: ${JSON.stringify(currentTime)}`;
}
|
||||
|
||||
/**
 * Builds the prompt for the plurality evaluator, which checks whether an
 * answer supplies as many items as the question asks for.
 *
 * The rules are phrased in terms of the `pass` field, which is what the
 * worked examples in this prompt report, rather than in terms of
 * "definitiveness", which is a separate evaluator.
 *
 * @param question - The question that was asked; embedded JSON-encoded.
 * @param answer - The answer to evaluate; embedded JSON-encoded.
 * @returns The complete prompt text.
 */
function getPluralityPrompt(question: string, answer: string): string {
  return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.

<rules>
1. Question Analysis:
   - Check if question asks for multiple items using indicators like:
     * Plural nouns: "companies", "people", "names"
     * Quantifiers: "all", "many", "several", "various", "multiple"
     * List requests: "list", "enumerate", "name all", "give me all"
     * Numbers: "5 examples", "top 10"
   - Otherwise skip the analysis and set pass to true

2. Answer Analysis:
   - Count distinct items provided in the answer
   - Check if answer uses limiting words like "only", "just", "single"
   - Identify if answer acknowledges there are more items but only provides some

3. Pass/Fail Rules:
   - If question asks for multiple items but answer provides only one → pass is false
   - If question asks for specific number (e.g., "top 5") but answer provides fewer → pass is false
   - If answer clearly states it's providing a partial list → pass is false
   - If question asks for "all" or "every" but answer seems incomplete → pass is false
</rules>

<examples>
Question: "Who works in Jina AI's sales team?"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
  "pass": true,
  "think": "The question doesn't specifically ask for multiple team members, so a single name is an acceptable answer.",
  "plurality_analysis": {
    "expects_multiple": false,
    "provides_multiple": false,
    "count_provided": 1
  }
}

Question: "List all the salespeople who work at Jina AI"
Answer: "John Smith is a sales representative at Jina AI."
Evaluation: {
  "pass": false,
  "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": false,
    "count_provided": 1
  }
}

Question: "Name the top 3 products sold by Jina AI"
Answer: "Jina AI's product lineup includes DocArray and Jina."
Evaluation: {
  "pass": false,
  "think": "The question asks for top 3 products but only 2 are provided.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_expected": 3,
    "count_provided": 2
  }
}

Question: "List as many AI companies in Berlin as you can find"
Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
Evaluation: {
  "pass": false,
  "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
  "plurality_analysis": {
    "expects_multiple": true,
    "provides_multiple": true,
    "count_provided": 5
  }
}
</examples>

Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}
|
||||
|
||||
/**
 * Issues the model call for a single evaluation type, pairing that type's
 * schema with its prompt.
 */
async function generateEvaluation(evaluationType: EvaluationType, question: string, answer: string) {
  switch (evaluationType) {
    case 'definitive':
      return generateObject({
        model,
        schema: definitiveSchema,
        prompt: getDefinitivePrompt(question, answer),
        maxTokens: getMaxTokens('evaluator')
      });
    case 'freshness':
      return generateObject({
        model,
        schema: freshnessSchema,
        prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
        maxTokens: getMaxTokens('evaluator')
      });
    case 'plurality':
      return generateObject({
        model,
        schema: pluralitySchema,
        prompt: getPluralityPrompt(question, answer),
        maxTokens: getMaxTokens('evaluator')
      });
  }
}

/**
 * Evaluates an answer with a sequence of evaluators, stopping at the first
 * one that does not pass.
 *
 * Each evaluation type in `evaluationOrder` is run in turn. Token usage is
 * reported to `tracker` (or to a throwaway `TokenTracker` when none is given)
 * and the evaluator's object is logged. If a model call throws, the error is
 * handed to `handleGenerateObjectError` and the object it returns is treated
 * as that evaluator's verdict.
 *
 * @param question - The question that was asked.
 * @param answer - The answer to evaluate.
 * @param evaluationOrder - Evaluation types to run, in order.
 * @param tracker - Optional token tracker to report usage to.
 * @returns The first failing verdict, or the verdict of the last evaluator
 *   when all of them pass.
 * @throws Error when `evaluationOrder` is empty, since there is no verdict to
 *   return. Anything `handleGenerateObjectError` throws also propagates.
 */
export async function evaluateAnswer(
  question: string,
  answer: string,
  evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
  tracker?: TokenTracker
): Promise<{ response: EvaluationResponse }> {
  // Verdict of the most recent evaluator, including one recovered from an error.
  let lastResponse: EvaluationResponse | undefined;

  for (const evaluationType of evaluationOrder) {
    let response: EvaluationResponse;
    try {
      const result = await generateEvaluation(evaluationType, question, answer);
      (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
      console.log('Evaluation:', result.object);
      response = result.object;
    } catch (error) {
      console.error(`Error in ${evaluationType} evaluation:`, error);
      const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
      (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
      response = errorResult.object;
    }

    // Fail fast: later evaluators are skipped once one does not pass.
    if (!response.pass) {
      return { response };
    }
    lastResponse = response;
  }

  if (!lastResponse) {
    // Previously this surfaced as a TypeError from dereferencing an unset result.
    throw new Error('evaluateAnswer requires at least one evaluation type in evaluationOrder');
  }
  return { response: lastResponse };
}
|
||||
22
src/types.ts
22
src/types.ts
@@ -1,18 +1,3 @@
|
||||
import { z } from 'zod';
|
||||
|
||||
export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
|
||||
|
||||
export const QuerySchema = z.string()
|
||||
.max(30)
|
||||
.describe('Search query, must be less than 30 characters');
|
||||
|
||||
export const URLSchema = z.string().url();
|
||||
|
||||
export const ReferenceSchema = z.object({
|
||||
exactQuote: z.string().describe('Exact relevant quote from the document'),
|
||||
url: URLSchema.describe('URL of the document')
|
||||
});
|
||||
|
||||
// Action Types
|
||||
type BaseAction = {
|
||||
action: "search" | "answer" | "reflect" | "visit";
|
||||
@@ -96,9 +81,12 @@ export interface ReadResponse {
|
||||
readableMessage?: string;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
 * Verdict produced by an answer evaluator.
 *
 * These are the two fields every evaluator schema shares; individual
 * evaluators may attach additional analysis fields to the object they return.
 */
export type EvaluationResponse = {
  /** Whether the answer passed the evaluation. */
  pass: boolean;
  /** The evaluator's explanation for its verdict. */
  think: string;
};
|
||||
|
||||
export type ErrorAnalysisResponse = {
|
||||
|
||||
Reference in New Issue
Block a user