diff --git a/src/agent.ts b/src/agent.ts index 043788b..155112f 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -1,7 +1,7 @@ import {z, ZodObject} from 'zod'; import {generateObject} from 'ai'; import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP} from "./config"; -import {readUrl} from "./tools/read"; +import {readUrl, removeAllLineBreaks} from "./tools/read"; import {handleGenerateObjectError} from './utils/error-handling'; import fs from 'fs/promises'; import {SafeSearchType, search as duckSearch} from "duck-duck-scrape"; @@ -273,9 +273,7 @@ function updateContext(step: any) { allContext.push(step) } -function removeAllLineBreaks(text: string) { - return text.replace(/(\r\n|\n|\r)/gm, " "); -} + function removeHTMLtags(text: string) { return text.replace(/<[^>]*>?/gm, ''); @@ -390,7 +388,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_ ...thisStep, }); - const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer, + const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep, evaluationMetrics[currentQuestion], context.tokenTracker); if (currentQuestion === question) { diff --git a/src/tools/brave-search.ts b/src/tools/brave-search.ts index d605e09..0e9895d 100644 --- a/src/tools/brave-search.ts +++ b/src/tools/brave-search.ts @@ -7,7 +7,7 @@ export async function braveSearch(query: string): Promise<{ response: BraveSearc const response = await axios.get('https://api.search.brave.com/res/v1/web/search', { params: { q: query, - count: 5, + count: 10, safesearch: 'off' }, headers: { diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index 1e39dd7..c652f62 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -1,13 +1,14 @@ -import { z } from 'zod'; -import { generateObject } from 'ai'; -import { getModel, getMaxTokens } from "../config"; -import { TokenTracker } from "../utils/token-tracker"; -import { EvaluationResponse } from '../types'; -import { 
handleGenerateObjectError } from '../utils/error-handling'; +import {z} from 'zod'; +import {generateObject} from 'ai'; +import {getModel, getMaxTokens} from "../config"; +import {TokenTracker} from "../utils/token-tracker"; +import {AnswerAction, EvaluationResponse} from '../types'; +import {handleGenerateObjectError} from '../utils/error-handling'; +import {readUrl, removeAllLineBreaks} from "./read"; const model = getModel('evaluator'); -type EvaluationType = 'definitive' | 'freshness' | 'plurality'; +type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution'; const baseSchema = { pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'), @@ -41,6 +42,73 @@ const pluralitySchema = z.object({ }) }); +const attributionSchema = z.object({ + ...baseSchema, + type: z.literal('attribution'), + attribution_analysis: z.object({ + sources_provided: z.boolean().describe('Whether the answer provides source references'), + sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'), + quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content') + }) +}); + +function getAttributionPrompt(question: string, answer: string, sourceContent: string): string { + return `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided sources. + + +1. Source Verification: + - Check if answer claims are supported by the provided source content + - Verify that quotes are accurate and in proper context + - Ensure numerical data and statistics match the source + - Flag any claims that go beyond what the sources support + +2. Attribution Analysis: + - Check if answer properly references its sources + - Verify that important claims have clear source attribution + - Ensure quotes are properly marked and cited + - Check for any unsupported generalizations + +3. 
Accuracy Requirements: + - Direct quotes must match source exactly + - Paraphrasing must maintain original meaning + - Statistics and numbers must be precise + - Context must be preserved + + + +Question: "What are Jina AI's main products?" +Answer: "According to Jina AI's website, their main products are DocArray and Jina Framework." +Source Content: "Jina AI's flagship products include DocArray, Jina Framework, and JCloud, offering a complete ecosystem for neural search applications." +Evaluation: { + "pass": false, + "think": "The answer omits JCloud which is mentioned as a main product in the source. The information provided is incomplete and potentially misleading as it fails to mention a significant product from the company's ecosystem.", + "attribution_analysis": { + "sources_provided": true, + "sources_verified": false, + "quotes_accurate": false + } +} + +Question: "When was Python first released?" +Answer: "Python was first released in 1991 by Guido van Rossum." +Source Content: "Python was first released in 1991 by Guido van Rossum while working at CWI." +Evaluation: { + "pass": true, + "think": "The answer accurately reflects the core information from the source about Python's release date and creator, though it omits the additional context about CWI which isn't essential to the question.", + "attribution_analysis": { + "sources_provided": true, + "sources_verified": true, + "quotes_accurate": true + } +} + + +Now evaluate this pair: +Question: ${JSON.stringify(question)} +Answer: ${JSON.stringify(answer)} +Source Content: ${JSON.stringify(sourceContent)}`; +} + function getDefinitivePrompt(question: string, answer: string): string { return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not. 
@@ -332,67 +400,161 @@ export async function evaluateQuestion( } } + +// Helper function to handle common evaluation logic +async function performEvaluation( + evaluationType: EvaluationType, + params: { + model: any; + schema: z.ZodType; + prompt: string; + maxTokens: number; + }, + tracker?: TokenTracker +): Promise<GenerateObjectResult> { + const result = await generateObject({ + model: params.model, + schema: params.schema, + prompt: params.prompt, + maxTokens: params.maxTokens + }); + + (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); + console.log(`${evaluationType} Evaluation:`, result.object); + + return result; +} + +interface GenerateObjectResult { + object: EvaluationResponse; + usage?: { + totalTokens: number; + }; +} + +// Main evaluation function export async function evaluateAnswer( question: string, - answer: string, + action: AnswerAction, evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'], tracker?: TokenTracker ): Promise<{ response: EvaluationResponse }> { - let result; + let result: GenerateObjectResult; + + // Only add attribution if we have valid references + if (action.references && action.references.length > 0) { + evaluationOrder = ['attribution', ...evaluationOrder]; + } for (const evaluationType of evaluationOrder) { try { switch (evaluationType) { - case 'definitive': - result = await generateObject({ - model, - schema: definitiveSchema, - prompt: getDefinitivePrompt(question, answer), - maxTokens: getMaxTokens('evaluator') - }); - (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); - console.log('Evaluation:', result.object); - if (!result.object.pass) { - return { response: result.object }; + case 'attribution': { + // Safely handle references and ensure we have content + const urls = action.references?.map(ref => ref.url) ?? 
[]; + const uniqueURLs = [...new Set(urls)]; + const allKnowledge = await fetchSourceContent(uniqueURLs, tracker); + + if (!allKnowledge.trim()) { + return { + response: { + pass: false, + think: "The answer does not provide any valid attribution references that could be verified. No accessible source content was found to validate the claims made in the answer.", + type: 'attribution', + } + }; } + + result = await performEvaluation( + 'attribution', + { + model, + schema: attributionSchema, + prompt: getAttributionPrompt(question, action.answer, allKnowledge), + maxTokens: getMaxTokens('evaluator') + }, + tracker + ); + break; + } + + case 'definitive': + result = await performEvaluation( + 'definitive', + { + model, + schema: definitiveSchema, + prompt: getDefinitivePrompt(question, action.answer), + maxTokens: getMaxTokens('evaluator') + }, + tracker + ); break; case 'freshness': - result = await generateObject({ - model, - schema: freshnessSchema, - prompt: getFreshnessPrompt(question, answer, new Date().toISOString()), - maxTokens: getMaxTokens('evaluator') - }); - (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); - console.log('Evaluation:', result.object); - if (!result.object.pass) { - return { response: result.object }; - } + result = await performEvaluation( + 'freshness', + { + model, + schema: freshnessSchema, + prompt: getFreshnessPrompt(question, action.answer, new Date().toISOString()), + maxTokens: getMaxTokens('evaluator') + }, + tracker + ); break; case 'plurality': - result = await generateObject({ - model, - schema: pluralitySchema, - prompt: getPluralityPrompt(question, answer), - maxTokens: getMaxTokens('evaluator') - }); - (tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0); - console.log('Evaluation:', result.object); - if (!result.object.pass) { - return { response: result.object }; - } + result = await performEvaluation( + 'plurality', + { + model, + schema: 
pluralitySchema, + prompt: getPluralityPrompt(question, action.answer), + maxTokens: getMaxTokens('evaluator') + }, + tracker + ); break; } + + if (!result?.object.pass) { + return {response: result.object}; + } } catch (error) { const errorResult = await handleGenerateObjectError(error); (tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0); - // Always return from catch block to prevent undefined result - return { response: errorResult.object }; + return {response: errorResult.object}; } } - // Only reach this point if all evaluations pass - return { response: result!.object }; + return {response: result!.object}; +} + +// Helper function to fetch and combine source content +async function fetchSourceContent(urls: string[], tracker?: TokenTracker): Promise<string> { + if (!urls.length) return ''; + + try { + const results = await Promise.all( + urls.map(async (url) => { + try { + const {response} = await readUrl(url, tracker); + const content = response?.data?.content || ''; + return removeAllLineBreaks(content); + } catch (error) { + console.error('Error reading URL:', error); + return ''; + } + }) + ); + + // Filter out empty results and join with proper separation + return results + .filter(content => content.trim()) + .join('\n\n'); + } catch (error) { + console.error('Error fetching source content:', error); + return ''; + } } \ No newline at end of file diff --git a/src/tools/read.ts b/src/tools/read.ts index 2f9c28b..5ccaaf4 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -91,4 +91,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response req.write(data); req.end(); }); +} + +export function removeAllLineBreaks(text: string) { + return text.replace(/(\r\n|\n|\r)/gm, " "); } \ No newline at end of file diff --git a/src/types.ts b/src/types.ts index ff829b3..b1369bb 100644 --- a/src/types.ts +++ b/src/types.ts @@ -100,7 +100,7 @@ export interface ReadResponse { export type EvaluationResponse = {
pass: boolean; think: string; - type?: 'definitive' | 'freshness' | 'plurality'; + type?: 'definitive' | 'freshness' | 'plurality' | 'attribution'; freshness_analysis?: { likely_outdated: boolean; dates_mentioned: string[];