mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
feat: improved evaluators
This commit is contained in:
98
src/agent.ts
98
src/agent.ts
@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
|
|||||||
|
|
||||||
if (allowSearch) {
|
if (allowSearch) {
|
||||||
actions.push("search");
|
actions.push("search");
|
||||||
properties.searchQuery = z.string()
|
properties.searchQuery = z.string().max(30)
|
||||||
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
|
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -356,39 +356,24 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
|
|||||||
|
|
||||||
// execute the step and action
|
// execute the step and action
|
||||||
if (thisStep.action === 'answer') {
|
if (thisStep.action === 'answer') {
|
||||||
|
if (step === 1) {
|
||||||
|
// LLM is so confident and answer immediately, skip all evaluations
|
||||||
|
isAnswered = true;
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
updateContext({
|
updateContext({
|
||||||
totalStep,
|
totalStep,
|
||||||
question: currentQuestion,
|
question: currentQuestion,
|
||||||
...thisStep,
|
...thisStep,
|
||||||
});
|
});
|
||||||
|
|
||||||
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
|
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
|
||||||
|
['definitive', 'freshness', 'plurality'], context.tokenTracker);
|
||||||
|
|
||||||
if (currentQuestion === question) {
|
if (currentQuestion === question) {
|
||||||
if (badAttempts >= maxBadAttempts) {
|
if (evaluation.pass) {
|
||||||
// EXIT POINT OF THE PROGRAM!!!!
|
|
||||||
diaryContext.push(`
|
diaryContext.push(`
|
||||||
At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
|
|
||||||
|
|
||||||
Original question:
|
|
||||||
${currentQuestion}
|
|
||||||
|
|
||||||
Your answer:
|
|
||||||
${thisStep.answer}
|
|
||||||
|
|
||||||
The evaluator thinks your answer is good because:
|
|
||||||
${evaluation.reasoning}
|
|
||||||
|
|
||||||
Your journey ends here.
|
|
||||||
`);
|
|
||||||
isAnswered = false;
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if (evaluation.is_definitive) {
|
|
||||||
if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
|
|
||||||
// EXIT POINT OF THE PROGRAM!!!!
|
|
||||||
diaryContext.push(`
|
|
||||||
At step ${step}, you took **answer** action and finally found the answer to the original question:
|
At step ${step}, you took **answer** action and finally found the answer to the original question:
|
||||||
|
|
||||||
Original question:
|
Original question:
|
||||||
@@ -398,31 +383,18 @@ Your answer:
|
|||||||
${thisStep.answer}
|
${thisStep.answer}
|
||||||
|
|
||||||
The evaluator thinks your answer is good because:
|
The evaluator thinks your answer is good because:
|
||||||
${evaluation.reasoning}
|
${evaluation.think}
|
||||||
|
|
||||||
Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
|
Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
|
||||||
`);
|
`);
|
||||||
isAnswered = true;
|
isAnswered = true;
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
if (badAttempts >= maxBadAttempts) {
|
||||||
|
isAnswered = false;
|
||||||
break
|
break
|
||||||
} else {
|
} else {
|
||||||
diaryContext.push(`
|
diaryContext.push(`
|
||||||
At step ${step}, you took **answer** action and finally found the answer to the original question:
|
|
||||||
|
|
||||||
Original question:
|
|
||||||
${currentQuestion}
|
|
||||||
|
|
||||||
Your answer:
|
|
||||||
${thisStep.answer}
|
|
||||||
|
|
||||||
Unfortunately, you did not provide any references to support your answer.
|
|
||||||
You need to find more URL references to support your answer.`);
|
|
||||||
}
|
|
||||||
|
|
||||||
isAnswered = true;
|
|
||||||
break
|
|
||||||
|
|
||||||
} else {
|
|
||||||
diaryContext.push(`
|
|
||||||
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
|
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
|
||||||
|
|
||||||
Original question:
|
Original question:
|
||||||
@@ -432,23 +404,31 @@ Your answer:
|
|||||||
${thisStep.answer}
|
${thisStep.answer}
|
||||||
|
|
||||||
The evaluator thinks your answer is bad because:
|
The evaluator thinks your answer is bad because:
|
||||||
${evaluation.reasoning}
|
${evaluation.think}
|
||||||
`);
|
`);
|
||||||
// store the bad context and reset the diary context
|
// store the bad context and reset the diary context
|
||||||
const {response: errorAnalysis} = await analyzeSteps(diaryContext);
|
const {response: errorAnalysis} = await analyzeSteps(diaryContext);
|
||||||
|
|
||||||
badContext.push({
|
allKnowledge.push({
|
||||||
question: currentQuestion,
|
question: currentQuestion,
|
||||||
answer: thisStep.answer,
|
answer: thisStep.answer,
|
||||||
evaluation: evaluation.reasoning,
|
references: thisStep.references,
|
||||||
...errorAnalysis
|
type: 'qa'
|
||||||
});
|
});
|
||||||
badAttempts++;
|
|
||||||
allowAnswer = false; // disable answer action in the immediate next step
|
badContext.push({
|
||||||
diaryContext = [];
|
question: currentQuestion,
|
||||||
step = 0;
|
answer: thisStep.answer,
|
||||||
|
evaluation: evaluation.think,
|
||||||
|
...errorAnalysis
|
||||||
|
});
|
||||||
|
badAttempts++;
|
||||||
|
allowAnswer = false; // disable answer action in the immediate next step
|
||||||
|
diaryContext = [];
|
||||||
|
step = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (evaluation.is_definitive) {
|
} else if (evaluation.pass) {
|
||||||
diaryContext.push(`
|
diaryContext.push(`
|
||||||
At step ${step}, you took **answer** action. You found a good answer to the sub-question:
|
At step ${step}, you took **answer** action. You found a good answer to the sub-question:
|
||||||
|
|
||||||
@@ -459,7 +439,7 @@ Your answer:
|
|||||||
${thisStep.answer}
|
${thisStep.answer}
|
||||||
|
|
||||||
The evaluator thinks your answer is good because:
|
The evaluator thinks your answer is good because:
|
||||||
${evaluation.reasoning}
|
${evaluation.think}
|
||||||
|
|
||||||
Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
|
Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
|
||||||
`);
|
`);
|
||||||
|
|||||||
@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';
|
|||||||
|
|
||||||
const model = getModel('evaluator');
|
const model = getModel('evaluator');
|
||||||
|
|
||||||
const responseSchema = z.object({
|
type EvaluationType = 'definitive' | 'freshness' | 'plurality';
|
||||||
is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
|
|
||||||
reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
|
const baseSchema = {
|
||||||
|
pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
|
||||||
|
think: z.string().describe('Explanation the thought process why the answer does not pass the evaluation criteria')
|
||||||
|
};
|
||||||
|
|
||||||
|
const definitiveSchema = z.object({
|
||||||
|
...baseSchema,
|
||||||
|
type: z.literal('definitive')
|
||||||
});
|
});
|
||||||
|
|
||||||
function getPrompt(question: string, answer: string): string {
|
const freshnessSchema = z.object({
|
||||||
|
...baseSchema,
|
||||||
|
type: z.literal('freshness'),
|
||||||
|
freshness_analysis: z.object({
|
||||||
|
likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
|
||||||
|
dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
|
||||||
|
current_time: z.string().describe('Current system time when evaluation was performed'),
|
||||||
|
max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
const pluralitySchema = z.object({
|
||||||
|
...baseSchema,
|
||||||
|
type: z.literal('plurality'),
|
||||||
|
plurality_analysis: z.object({
|
||||||
|
expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
|
||||||
|
provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
|
||||||
|
count_expected: z.number().optional().describe('Number of items expected if specified in question'),
|
||||||
|
count_provided: z.number().describe('Number of items provided in answer')
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
function getDefinitivePrompt(question: string, answer: string): string {
|
||||||
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
|
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
|
||||||
|
|
||||||
<rules>
|
<rules>
|
||||||
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
|
|||||||
5. Non-answers that suggest alternatives
|
5. Non-answers that suggest alternatives
|
||||||
</rules>
|
</rules>
|
||||||
|
|
||||||
|
|
||||||
<examples>
|
<examples>
|
||||||
Question: "What are the system requirements for running Python 3.9?"
|
Question: "What are the system requirements for running Python 3.9?"
|
||||||
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
|
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
|
||||||
Evaluation: {
|
Evaluation: {
|
||||||
"is_definitive": false,
|
"pass": false,
|
||||||
"reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
|
"think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
|
||||||
}
|
}
|
||||||
|
|
||||||
Question: "What are the system requirements for running Python 3.9?"
|
Question: "What are the system requirements for running Python 3.9?"
|
||||||
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
|
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
|
||||||
Evaluation: {
|
Evaluation: {
|
||||||
"is_definitive": true,
|
"pass": true,
|
||||||
"reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
|
"think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
|
||||||
}
|
}
|
||||||
|
|
||||||
Question: "Who will be the president of the United States in 2032?"
|
Question: "Who will be the president of the United States in 2032?"
|
||||||
Answer: "I cannot predict the future, it depends on the election results."
|
Answer: "I cannot predict the future, it depends on the election results."
|
||||||
Evaluation: {
|
Evaluation: {
|
||||||
"is_definitive": false,
|
"pass": false,
|
||||||
"reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
|
"think": "The answer contains a statement of inability to predict the future, making it non-definitive."
|
||||||
}
|
}
|
||||||
|
|
||||||
Question: "Who is the sales director at Company X?"
|
Question: "Who is the sales director at Company X?"
|
||||||
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
|
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
|
||||||
Evaluation: {
|
Evaluation: {
|
||||||
"is_definitive": false,
|
"pass": false,
|
||||||
"reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
|
"think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
|
||||||
}
|
}
|
||||||
|
|
||||||
Question: "what is the twitter account of jina ai's founder?"
|
Question: "what is the twitter account of jina ai's founder?"
|
||||||
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
|
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
|
||||||
Evaluation: {
|
Evaluation: {
|
||||||
"is_definitive": false,
|
"pass": false,
|
||||||
"reasoning": "The answer indicates a lack of information rather than providing a definitive response."
|
"think": "The answer indicates a lack of information rather than providing a definitive response."
|
||||||
}
|
}
|
||||||
</examples>
|
</examples>
|
||||||
|
|
||||||
Now evaluate this pair:
|
Now evaluate this pair:
|
||||||
Question: ${JSON.stringify(question)}
|
Question: ${JSON.stringify(question)}
|
||||||
Answer: ${JSON.stringify(answer)}`;
|
Answer: ${JSON.stringify(answer)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
|
function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
|
||||||
try {
|
return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
|
||||||
const prompt = getPrompt(question, answer);
|
|
||||||
let object;
|
<rules>
|
||||||
let totalTokens = 0;
|
1. Date Analysis:
|
||||||
|
- Extract all dates mentioned in the answer
|
||||||
|
- Compare against current system time: ${currentTime}
|
||||||
|
- Consider content outdated if:
|
||||||
|
* It refers to a "latest" or "current" state from more than 30 days ago
|
||||||
|
* It mentions specific dates/events that have been superseded
|
||||||
|
* It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
|
||||||
|
- For product versions, releases, or announcements, max age is 30 days
|
||||||
|
- For company positions, leadership, or general facts, max age is 60 days
|
||||||
|
|
||||||
|
2. Context Hints:
|
||||||
|
- Words indicating recency: "latest", "current", "newest", "just released", "recently"
|
||||||
|
- Time-sensitive terms: "CEO", "price", "version", "release"
|
||||||
|
- Future dates should be ignored in outdated calculation
|
||||||
|
</rules>
|
||||||
|
|
||||||
|
<examples>
|
||||||
|
Question: "What is Jina AI's latest embedding model?"
|
||||||
|
Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
|
||||||
|
Current Time: "2024-10-06T00:00:00Z"
|
||||||
|
Evaluation: {
|
||||||
|
"pass": false,
|
||||||
|
"think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
|
||||||
|
"freshness_analysis": {
|
||||||
|
"likely_outdated": true,
|
||||||
|
"dates_mentioned": ["2024-03-15"],
|
||||||
|
"current_time": "2024-10-06T00:00:00Z",
|
||||||
|
"max_age_days": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Question: "Who is OpenAI's CEO?"
|
||||||
|
Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
|
||||||
|
Current Time: "2024-02-06T00:00:00Z"
|
||||||
|
Evaluation: {
|
||||||
|
"pass": true,
|
||||||
|
"think": "The answer is about company leadership and is within the 60-day threshold for such information",
|
||||||
|
"freshness_analysis": {
|
||||||
|
"likely_outdated": false,
|
||||||
|
"dates_mentioned": ["2023-12"],
|
||||||
|
"current_time": "2024-02-06T00:00:00Z",
|
||||||
|
"max_age_days": 60
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</examples>
|
||||||
|
|
||||||
|
Now evaluate this pair:
|
||||||
|
Question: ${JSON.stringify(question)}
|
||||||
|
Answer: ${JSON.stringify(answer)}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getPluralityPrompt(question: string, answer: string): string {
|
||||||
|
return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
|
||||||
|
|
||||||
|
<rules>
|
||||||
|
1. Question Analysis:
|
||||||
|
- Check if question asks for multiple items using indicators like:
|
||||||
|
* Plural nouns: "companies", "people", "names"
|
||||||
|
* Quantifiers: "all", "many", "several", "various", "multiple"
|
||||||
|
* List requests: "list", "enumerate", "name all", "give me all"
|
||||||
|
* Numbers: "5 examples", "top 10"
|
||||||
|
- Otherwise skip the analysis and return pass to true
|
||||||
|
|
||||||
|
2. Answer Analysis:
|
||||||
|
- Count distinct items provided in the answer
|
||||||
|
- Check if answer uses limiting words like "only", "just", "single"
|
||||||
|
- Identify if answer acknowledges there are more items but only provides some
|
||||||
|
|
||||||
|
3. Definitiveness Rules:
|
||||||
|
- If question asks for multiple items but answer provides only one → NOT definitive
|
||||||
|
- If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
|
||||||
|
- If answer clearly states it's providing a partial list → NOT definitive
|
||||||
|
- If question asks for "all" or "every" but answer seems incomplete → NOT definitive
|
||||||
|
</rules>
|
||||||
|
|
||||||
|
<examples>
|
||||||
|
Question: "Who works in Jina AI's sales team?"
|
||||||
|
Answer: "John Smith is a sales representative at Jina AI."
|
||||||
|
Evaluation: {
|
||||||
|
"pass": true,
|
||||||
|
"think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
|
||||||
|
"plurality_analysis": {
|
||||||
|
"expects_multiple": false,
|
||||||
|
"provides_multiple": false,
|
||||||
|
"count_provided": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Question: "List all the salespeople who work at Jina AI"
|
||||||
|
Answer: "John Smith is a sales representative at Jina AI."
|
||||||
|
Evaluation: {
|
||||||
|
"pass": false,
|
||||||
|
"think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
|
||||||
|
"plurality_analysis": {
|
||||||
|
"expects_multiple": true,
|
||||||
|
"provides_multiple": false,
|
||||||
|
"count_provided": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Question: "Name the top 3 products sold by Jina AI"
|
||||||
|
Answer: "Jina AI's product lineup includes DocArray and Jina."
|
||||||
|
Evaluation: {
|
||||||
|
"pass": false,
|
||||||
|
"think": "The question asks for top 3 products but only 2 are provided.",
|
||||||
|
"plurality_analysis": {
|
||||||
|
"expects_multiple": true,
|
||||||
|
"provides_multiple": true,
|
||||||
|
"count_expected": 3,
|
||||||
|
"count_provided": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Question: "List as many AI companies in Berlin as you can find"
|
||||||
|
Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
|
||||||
|
Evaluation: {
|
||||||
|
"pass": false,
|
||||||
|
"think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
|
||||||
|
"plurality_analysis": {
|
||||||
|
"expects_multiple": true,
|
||||||
|
"provides_multiple": true,
|
||||||
|
"count_provided": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</examples>
|
||||||
|
|
||||||
|
Now evaluate this pair:
|
||||||
|
Question: ${JSON.stringify(question)}
|
||||||
|
Answer: ${JSON.stringify(answer)}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function evaluateAnswer(
|
||||||
|
question: string,
|
||||||
|
answer: string,
|
||||||
|
evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
|
||||||
|
tracker?: TokenTracker
|
||||||
|
): Promise<{ response: EvaluationResponse }> {
|
||||||
|
let result;
|
||||||
|
|
||||||
|
for (const evaluationType of evaluationOrder) {
|
||||||
try {
|
try {
|
||||||
const result = await generateObject({
|
switch (evaluationType) {
|
||||||
model,
|
case 'definitive':
|
||||||
schema: responseSchema,
|
result = await generateObject({
|
||||||
prompt,
|
model,
|
||||||
maxTokens: getMaxTokens('evaluator')
|
schema: definitiveSchema,
|
||||||
});
|
prompt: getDefinitivePrompt(question, answer),
|
||||||
object = result.object;
|
maxTokens: getMaxTokens('evaluator')
|
||||||
totalTokens = result.usage?.totalTokens || 0;
|
});
|
||||||
|
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
|
||||||
|
console.log('Evaluation:', result.object);
|
||||||
|
if (!result.object.pass) {
|
||||||
|
return { response: result.object };
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'freshness':
|
||||||
|
result = await generateObject({
|
||||||
|
model,
|
||||||
|
schema: freshnessSchema,
|
||||||
|
prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
|
||||||
|
maxTokens: getMaxTokens('evaluator')
|
||||||
|
});
|
||||||
|
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
|
||||||
|
console.log('Evaluation:', result.object);
|
||||||
|
if (!result.object.pass) {
|
||||||
|
return { response: result.object };
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'plurality':
|
||||||
|
result = await generateObject({
|
||||||
|
model,
|
||||||
|
schema: pluralitySchema,
|
||||||
|
prompt: getPluralityPrompt(question, answer),
|
||||||
|
maxTokens: getMaxTokens('evaluator')
|
||||||
|
});
|
||||||
|
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
|
||||||
|
console.log('Evaluation:', result.object);
|
||||||
|
if (!result.object.pass) {
|
||||||
|
return { response: result.object };
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const result = await handleGenerateObjectError<EvaluationResponse>(error);
|
console.error(`Error in ${evaluationType} evaluation:`, error);
|
||||||
object = result.object;
|
const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
|
||||||
totalTokens = result.totalTokens;
|
(tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
|
||||||
|
if (!errorResult.object.pass) {
|
||||||
|
return { response: errorResult.object };
|
||||||
|
}
|
||||||
}
|
}
|
||||||
console.log('Evaluation:', {
|
|
||||||
definitive: object.is_definitive,
|
|
||||||
reason: object.reasoning
|
|
||||||
});
|
|
||||||
(tracker || new TokenTracker()).trackUsage('evaluator', totalTokens);
|
|
||||||
return { response: object, tokens: totalTokens };
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error in answer evaluation:', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Example usage
|
|
||||||
async function main() {
|
|
||||||
const question = process.argv[2] || '';
|
|
||||||
const answer = process.argv[3] || '';
|
|
||||||
|
|
||||||
if (!question || !answer) {
|
|
||||||
console.error('Please provide both question and answer as command line arguments');
|
|
||||||
process.exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
return { response: result!.object };
|
||||||
await evaluateAnswer(question, answer);
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to evaluate answer:', error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (require.main === module) {
|
|
||||||
main().catch(console.error);
|
|
||||||
}
|
}
|
||||||
22
src/types.ts
22
src/types.ts
@@ -1,18 +1,3 @@
|
|||||||
import { z } from 'zod';
|
|
||||||
|
|
||||||
export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
|
|
||||||
|
|
||||||
export const QuerySchema = z.string()
|
|
||||||
.max(30)
|
|
||||||
.describe('Search query, must be less than 30 characters');
|
|
||||||
|
|
||||||
export const URLSchema = z.string().url();
|
|
||||||
|
|
||||||
export const ReferenceSchema = z.object({
|
|
||||||
exactQuote: z.string().describe('Exact relevant quote from the document'),
|
|
||||||
url: URLSchema.describe('URL of the document')
|
|
||||||
});
|
|
||||||
|
|
||||||
// Action Types
|
// Action Types
|
||||||
type BaseAction = {
|
type BaseAction = {
|
||||||
action: "search" | "answer" | "reflect" | "visit";
|
action: "search" | "answer" | "reflect" | "visit";
|
||||||
@@ -96,9 +81,12 @@ export interface ReadResponse {
|
|||||||
readableMessage?: string;
|
readableMessage?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export type EvaluationResponse = {
|
export type EvaluationResponse = {
|
||||||
is_definitive: boolean;
|
pass: boolean;
|
||||||
reasoning: string;
|
think: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ErrorAnalysisResponse = {
|
export type ErrorAnalysisResponse = {
|
||||||
|
|||||||
Reference in New Issue
Block a user