diff --git a/src/agent.ts b/src/agent.ts
index 8aa18c8..2e8d968 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -31,7 +31,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
if (allowSearch) {
actions.push("search");
- properties.searchQuery = z.string()
+ properties.searchQuery = z.string().max(30)
.describe("Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.").optional();
}
@@ -356,39 +356,24 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
// execute the step and action
if (thisStep.action === 'answer') {
+ if (step === 1) {
+ // The LLM is confident enough to answer immediately, so skip all evaluations
+ isAnswered = true;
+ break
+ }
+
updateContext({
totalStep,
question: currentQuestion,
...thisStep,
});
- const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, context.tokenTracker);
-
+ const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
+ ['definitive', 'freshness', 'plurality'], context.tokenTracker);
if (currentQuestion === question) {
- if (badAttempts >= maxBadAttempts) {
- // EXIT POINT OF THE PROGRAM!!!!
+ if (evaluation.pass) {
diaryContext.push(`
-At step ${step} and ${badAttempts} attempts, you took **answer** action and found an answer, not a perfect one but good enough to answer the original question:
-
-Original question:
-${currentQuestion}
-
-Your answer:
-${thisStep.answer}
-
-The evaluator thinks your answer is good because:
-${evaluation.reasoning}
-
-Your journey ends here.
-`);
- isAnswered = false;
- break
- }
- if (evaluation.is_definitive) {
- if (thisStep.references?.length > 0 || Object.keys(allURLs).length === 0) {
- // EXIT POINT OF THE PROGRAM!!!!
- diaryContext.push(`
At step ${step}, you took **answer** action and finally found the answer to the original question:
Original question:
@@ -398,31 +383,18 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is good because:
-${evaluation.reasoning}
+${evaluation.think}
Your journey ends here. You have successfully answered the original question. Congratulations! 🎉
`);
- isAnswered = true;
+ isAnswered = true;
+ break
+ } else {
+ if (badAttempts >= maxBadAttempts) {
+ isAnswered = false;
break
} else {
diaryContext.push(`
-At step ${step}, you took **answer** action and finally found the answer to the original question:
-
-Original question:
-${currentQuestion}
-
-Your answer:
-${thisStep.answer}
-
-Unfortunately, you did not provide any references to support your answer.
-You need to find more URL references to support your answer.`);
- }
-
- isAnswered = true;
- break
-
- } else {
- diaryContext.push(`
At step ${step}, you took **answer** action but evaluator thinks it is not a good answer:
Original question:
@@ -432,23 +404,31 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is bad because:
-${evaluation.reasoning}
+${evaluation.think}
`);
- // store the bad context and reset the diary context
- const {response: errorAnalysis} = await analyzeSteps(diaryContext);
+ // store the bad context and reset the diary context
+ const {response: errorAnalysis} = await analyzeSteps(diaryContext);
- badContext.push({
- question: currentQuestion,
- answer: thisStep.answer,
- evaluation: evaluation.reasoning,
- ...errorAnalysis
- });
- badAttempts++;
- allowAnswer = false; // disable answer action in the immediate next step
- diaryContext = [];
- step = 0;
+ allKnowledge.push({
+ question: currentQuestion,
+ answer: thisStep.answer,
+ references: thisStep.references,
+ type: 'qa'
+ });
+
+ badContext.push({
+ question: currentQuestion,
+ answer: thisStep.answer,
+ evaluation: evaluation.think,
+ ...errorAnalysis
+ });
+ badAttempts++;
+ allowAnswer = false; // disable answer action in the immediate next step
+ diaryContext = [];
+ step = 0;
+ }
}
- } else if (evaluation.is_definitive) {
+ } else if (evaluation.pass) {
diaryContext.push(`
At step ${step}, you took **answer** action. You found a good answer to the sub-question:
@@ -459,7 +439,7 @@ Your answer:
${thisStep.answer}
The evaluator thinks your answer is good because:
-${evaluation.reasoning}
+${evaluation.think}
Although you solved a sub-question, you still need to find the answer to the original question. You need to keep going.
`);
diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
index a23b276..269d82d 100644
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -7,12 +7,41 @@ import { handleGenerateObjectError } from '../utils/error-handling';
const model = getModel('evaluator');
-const responseSchema = z.object({
- is_definitive: z.boolean().describe('Whether the answer provides a definitive response without uncertainty or negative statements'),
- reasoning: z.string().describe('Explanation of why the answer is or isn\'t definitive')
+type EvaluationType = 'definitive' | 'freshness' | 'plurality';
+
+const baseSchema = {
+ pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
+ think: z.string().describe('Explanation of the thought process behind why the answer does or does not pass the evaluation criteria')
+};
+
+const definitiveSchema = z.object({
+ ...baseSchema,
+ type: z.literal('definitive')
});
-function getPrompt(question: string, answer: string): string {
+const freshnessSchema = z.object({
+ ...baseSchema,
+ type: z.literal('freshness'),
+ freshness_analysis: z.object({
+ likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'),
+ dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'),
+ current_time: z.string().describe('Current system time when evaluation was performed'),
+ max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated')
+ })
+});
+
+const pluralitySchema = z.object({
+ ...baseSchema,
+ type: z.literal('plurality'),
+ plurality_analysis: z.object({
+ expects_multiple: z.boolean().describe('Whether the question asks for multiple items'),
+ provides_multiple: z.boolean().describe('Whether the answer provides multiple items'),
+ count_expected: z.number().optional().describe('Number of items expected if specified in question'),
+ count_provided: z.number().describe('Number of items provided in answer')
+ })
+});
+
+function getDefinitivePrompt(question: string, answer: string): string {
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
@@ -25,96 +54,245 @@ Definitiveness is the king! The following types of responses are NOT definitive
5. Non-answers that suggest alternatives
-
Question: "What are the system requirements for running Python 3.9?"
Answer: "I'm not entirely sure, but I think you need a computer with some RAM."
Evaluation: {
- "is_definitive": false,
- "reasoning": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
+ "pass": false,
+ "think": "The answer contains uncertainty markers like 'not entirely sure' and 'I think', making it non-definitive."
}
Question: "What are the system requirements for running Python 3.9?"
Answer: "Python 3.9 requires Windows 7 or later, macOS 10.11 or later, or Linux."
Evaluation: {
- "is_definitive": true,
- "reasoning": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
+ "pass": true,
+ "think": "The answer makes clear, definitive statements without uncertainty markers or ambiguity."
}
Question: "Who will be the president of the United States in 2032?"
Answer: "I cannot predict the future, it depends on the election results."
Evaluation: {
- "is_definitive": false,
- "reasoning": "The answer contains a statement of inability to predict the future, making it non-definitive."
+ "pass": false,
+ "think": "The answer contains a statement of inability to predict the future, making it non-definitive."
}
Question: "Who is the sales director at Company X?"
Answer: "I cannot provide the name of the sales director, but you can contact their sales team at sales@companyx.com"
Evaluation: {
- "is_definitive": false,
- "reasoning": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
+ "pass": false,
+ "think": "The answer starts with 'I cannot provide' and redirects to an alternative contact method instead of answering the original question."
}
Question: "what is the twitter account of jina ai's founder?"
Answer: "The provided text does not contain the Twitter account of Jina AI's founder."
Evaluation: {
- "is_definitive": false,
- "reasoning": "The answer indicates a lack of information rather than providing a definitive response."
+ "pass": false,
+ "think": "The answer indicates a lack of information rather than providing a definitive response."
}
+
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}`;
}
-export async function evaluateAnswer(question: string, answer: string, tracker?: TokenTracker): Promise<{ response: EvaluationResponse, tokens: number }> {
- try {
- const prompt = getPrompt(question, answer);
- let object;
- let totalTokens = 0;
+function getFreshnessPrompt(question: string, answer: string, currentTime: string): string {
+ return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time.
+
+
+1. Date Analysis:
+ - Extract all dates mentioned in the answer
+ - Compare against current system time: ${currentTime}
+ - Consider content outdated if:
+ * It refers to a "latest" or "current" state from more than 30 days ago
+ * It mentions specific dates/events that have been superseded
+ * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago
+ - For product versions, releases, or announcements, max age is 30 days
+ - For company positions, leadership, or general facts, max age is 60 days
+
+2. Context Hints:
+ - Words indicating recency: "latest", "current", "newest", "just released", "recently"
+ - Time-sensitive terms: "CEO", "price", "version", "release"
+ - Future dates should be ignored in outdated calculation
+
+
+
+Question: "What is Jina AI's latest embedding model?"
+Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024."
+Current Time: "2024-10-06T00:00:00Z"
+Evaluation: {
+ "pass": false,
+ "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information",
+ "freshness_analysis": {
+ "likely_outdated": true,
+ "dates_mentioned": ["2024-03-15"],
+ "current_time": "2024-10-06T00:00:00Z",
+ "max_age_days": 30
+ }
+}
+
+Question: "Who is OpenAI's CEO?"
+Answer: "Sam Altman is the CEO of OpenAI as of December 2023."
+Current Time: "2024-02-06T00:00:00Z"
+Evaluation: {
+ "pass": true,
+ "think": "The answer is about company leadership and is within the 60-day threshold for such information",
+ "freshness_analysis": {
+ "likely_outdated": false,
+ "dates_mentioned": ["2023-12"],
+ "current_time": "2024-02-06T00:00:00Z",
+ "max_age_days": 60
+ }
+}
+
+
+Now evaluate this pair:
+Question: ${JSON.stringify(question)}
+Answer: ${JSON.stringify(answer)}`;
+}
+
+function getPluralityPrompt(question: string, answer: string): string {
+ return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question.
+
+
+1. Question Analysis:
+ - Check if question asks for multiple items using indicators like:
+ * Plural nouns: "companies", "people", "names"
+ * Quantifiers: "all", "many", "several", "various", "multiple"
+ * List requests: "list", "enumerate", "name all", "give me all"
+ * Numbers: "5 examples", "top 10"
+ - Otherwise, skip the analysis and set pass to true
+
+2. Answer Analysis:
+ - Count distinct items provided in the answer
+ - Check if answer uses limiting words like "only", "just", "single"
+ - Identify if answer acknowledges there are more items but only provides some
+
+3. Definitiveness Rules:
+ - If question asks for multiple items but answer provides only one → NOT definitive
+ - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive
+ - If answer clearly states it's providing a partial list → NOT definitive
+ - If question asks for "all" or "every" but answer seems incomplete → NOT definitive
+
+
+
+Question: "Who works in Jina AI's sales team?"
+Answer: "John Smith is a sales representative at Jina AI."
+Evaluation: {
+ "pass": true,
+ "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.",
+ "plurality_analysis": {
+ "expects_multiple": false,
+ "provides_multiple": false,
+ "count_provided": 1
+ }
+}
+
+Question: "List all the salespeople who work at Jina AI"
+Answer: "John Smith is a sales representative at Jina AI."
+Evaluation: {
+ "pass": false,
+ "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.",
+ "plurality_analysis": {
+ "expects_multiple": true,
+ "provides_multiple": false,
+ "count_provided": 1
+ }
+}
+
+Question: "Name the top 3 products sold by Jina AI"
+Answer: "Jina AI's product lineup includes DocArray and Jina."
+Evaluation: {
+ "pass": false,
+ "think": "The question asks for top 3 products but only 2 are provided.",
+ "plurality_analysis": {
+ "expects_multiple": true,
+ "provides_multiple": true,
+ "count_expected": 3,
+ "count_provided": 2
+ }
+}
+
+Question: "List as many AI companies in Berlin as you can find"
+Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples."
+Evaluation: {
+ "pass": false,
+ "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.",
+ "plurality_analysis": {
+ "expects_multiple": true,
+ "provides_multiple": true,
+ "count_provided": 5
+ }
+}
+
+
+Now evaluate this pair:
+Question: ${JSON.stringify(question)}
+Answer: ${JSON.stringify(answer)}`;
+}
+
+export async function evaluateAnswer(
+ question: string,
+ answer: string,
+ evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
+ tracker?: TokenTracker
+): Promise<{ response: EvaluationResponse }> {
+ let result;
+
+ for (const evaluationType of evaluationOrder) {
try {
- const result = await generateObject({
- model,
- schema: responseSchema,
- prompt,
- maxTokens: getMaxTokens('evaluator')
- });
- object = result.object;
- totalTokens = result.usage?.totalTokens || 0;
+ switch (evaluationType) {
+ case 'definitive':
+ result = await generateObject({
+ model,
+ schema: definitiveSchema,
+ prompt: getDefinitivePrompt(question, answer),
+ maxTokens: getMaxTokens('evaluator')
+ });
+ (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+ console.log('Evaluation:', result.object);
+ if (!result.object.pass) {
+ return { response: result.object };
+ }
+ break;
+
+ case 'freshness':
+ result = await generateObject({
+ model,
+ schema: freshnessSchema,
+ prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
+ maxTokens: getMaxTokens('evaluator')
+ });
+ (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+ console.log('Evaluation:', result.object);
+ if (!result.object.pass) {
+ return { response: result.object };
+ }
+ break;
+
+ case 'plurality':
+ result = await generateObject({
+ model,
+ schema: pluralitySchema,
+ prompt: getPluralityPrompt(question, answer),
+ maxTokens: getMaxTokens('evaluator')
+ });
+ (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
+ console.log('Evaluation:', result.object);
+ if (!result.object.pass) {
+ return { response: result.object };
+ }
+ break;
+ }
} catch (error) {
- const result = await handleGenerateObjectError(error);
- object = result.object;
- totalTokens = result.totalTokens;
+ console.error(`Error in ${evaluationType} evaluation:`, error);
+ const errorResult = await handleGenerateObjectError(error);
+ (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
+ if (!errorResult.object.pass) {
+ return { response: errorResult.object };
+ }
}
- console.log('Evaluation:', {
- definitive: object.is_definitive,
- reason: object.reasoning
- });
- (tracker || new TokenTracker()).trackUsage('evaluator', totalTokens);
- return { response: object, tokens: totalTokens };
- } catch (error) {
- console.error('Error in answer evaluation:', error);
- throw error;
- }
-}
-
-// Example usage
-async function main() {
- const question = process.argv[2] || '';
- const answer = process.argv[3] || '';
-
- if (!question || !answer) {
- console.error('Please provide both question and answer as command line arguments');
- process.exit(1);
}
- try {
- await evaluateAnswer(question, answer);
- } catch (error) {
- console.error('Failed to evaluate answer:', error);
- }
-}
-
-if (require.main === module) {
- main().catch(console.error);
+ return { response: result!.object };
}
\ No newline at end of file
diff --git a/src/types.ts b/src/types.ts
index 0e7db89..a93ab5a 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,18 +1,3 @@
-import { z } from 'zod';
-
-export const ThinkSchema = z.string().describe('Strategic reasoning about the process');
-
-export const QuerySchema = z.string()
- .max(30)
- .describe('Search query, must be less than 30 characters');
-
-export const URLSchema = z.string().url();
-
-export const ReferenceSchema = z.object({
- exactQuote: z.string().describe('Exact relevant quote from the document'),
- url: URLSchema.describe('URL of the document')
-});
-
// Action Types
type BaseAction = {
action: "search" | "answer" | "reflect" | "visit";
@@ -96,9 +81,12 @@ export interface ReadResponse {
readableMessage?: string;
}
+
+
+
export type EvaluationResponse = {
- is_definitive: boolean;
- reasoning: string;
+ pass: boolean;
+ think: string;
};
export type ErrorAnalysisResponse = {