chore: update readme

This commit is contained in:
Han Xiao
2025-02-11 11:04:31 +08:00
parent 30d529082e
commit 188f1bb640
5 changed files with 217 additions and 53 deletions

View File

@@ -1,7 +1,7 @@
import {z, ZodObject} from 'zod';
import {generateObject} from 'ai';
import {getModel, getMaxTokens, SEARCH_PROVIDER, STEP_SLEEP} from "./config";
import {readUrl} from "./tools/read";
import {readUrl, removeAllLineBreaks} from "./tools/read";
import {handleGenerateObjectError} from './utils/error-handling';
import fs from 'fs/promises';
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
@@ -273,9 +273,7 @@ function updateContext(step: any) {
allContext.push(step)
}
function removeAllLineBreaks(text: string) {
return text.replace(/(\r\n|\n|\r)/gm, " ");
}
function removeHTMLtags(text: string) {
return text.replace(/<[^>]*>?/gm, '');
@@ -390,7 +388,7 @@ export async function getResponse(question: string, tokenBudget: number = 1_000_
...thisStep,
});
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep.answer,
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion], context.tokenTracker);
if (currentQuestion === question) {

View File

@@ -7,7 +7,7 @@ export async function braveSearch(query: string): Promise<{ response: BraveSearc
const response = await axios.get<BraveSearchResponse>('https://api.search.brave.com/res/v1/web/search', {
params: {
q: query,
count: 5,
count: 10,
safesearch: 'off'
},
headers: {

View File

@@ -1,13 +1,14 @@
import { z } from 'zod';
import { generateObject } from 'ai';
import { getModel, getMaxTokens } from "../config";
import { TokenTracker } from "../utils/token-tracker";
import { EvaluationResponse } from '../types';
import { handleGenerateObjectError } from '../utils/error-handling';
import {z} from 'zod';
import {generateObject} from 'ai';
import {getModel, getMaxTokens} from "../config";
import {TokenTracker} from "../utils/token-tracker";
import {AnswerAction, EvaluationResponse} from '../types';
import {handleGenerateObjectError} from '../utils/error-handling';
import {readUrl, removeAllLineBreaks} from "./read";
const model = getModel('evaluator');
type EvaluationType = 'definitive' | 'freshness' | 'plurality';
type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution';
const baseSchema = {
pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
@@ -41,6 +42,73 @@ const pluralitySchema = z.object({
})
});
const attributionSchema = z.object({
...baseSchema,
type: z.literal('attribution'),
attribution_analysis: z.object({
sources_provided: z.boolean().describe('Whether the answer provides source references'),
sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'),
quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content')
})
});
function getAttributionPrompt(question: string, answer: string, sourceContent: string): string {
return `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided sources.
<rules>
1. Source Verification:
- Check if answer claims are supported by the provided source content
- Verify that quotes are accurate and in proper context
- Ensure numerical data and statistics match the source
- Flag any claims that go beyond what the sources support
2. Attribution Analysis:
- Check if answer properly references its sources
- Verify that important claims have clear source attribution
- Ensure quotes are properly marked and cited
- Check for any unsupported generalizations
3. Accuracy Requirements:
- Direct quotes must match source exactly
- Paraphrasing must maintain original meaning
- Statistics and numbers must be precise
- Context must be preserved
</rules>
<examples>
Question: "What are Jina AI's main products?"
Answer: "According to Jina AI's website, their main products are DocArray and Jina Framework."
Source Content: "Jina AI's flagship products include DocArray, Jina Framework, and JCloud, offering a complete ecosystem for neural search applications."
Evaluation: {
"pass": false,
"think": "The answer omits JCloud which is mentioned as a main product in the source. The information provided is incomplete and potentially misleading as it fails to mention a significant product from the company's ecosystem.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": false,
"quotes_accurate": false
}
}
Question: "When was Python first released?"
Answer: "Python was first released in 1991 by Guido van Rossum."
Source Content: "Python was first released in 1991 by Guido van Rossum while working at CWI."
Evaluation: {
"pass": true,
"think": "The answer accurately reflects the core information from the source about Python's release date and creator, though it omits the additional context about CWI which isn't essential to the question.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
}
</examples>
Now evaluate this pair:
Question: ${JSON.stringify(question)}
Answer: ${JSON.stringify(answer)}
Source Content: ${JSON.stringify(sourceContent)}`;
}
function getDefinitivePrompt(question: string, answer: string): string {
return `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
@@ -332,67 +400,161 @@ export async function evaluateQuestion(
}
}
// Helper function to handle common evaluation logic
async function performEvaluation(
evaluationType: EvaluationType,
params: {
model: any;
schema: z.ZodType<any>;
prompt: string;
maxTokens: number;
},
tracker?: TokenTracker
): Promise<GenerateObjectResult> {
const result = await generateObject({
model: params.model,
schema: params.schema,
prompt: params.prompt,
maxTokens: params.maxTokens
});
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log(`${evaluationType} Evaluation:`, result.object);
return result;
}
interface GenerateObjectResult {
object: EvaluationResponse;
usage?: {
totalTokens: number;
};
}
// Main evaluation function
export async function evaluateAnswer(
question: string,
answer: string,
action: AnswerAction,
evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
tracker?: TokenTracker
): Promise<{ response: EvaluationResponse }> {
let result;
let result: GenerateObjectResult;
// Only add attribution if we have valid references
if (action.references && action.references.length > 0) {
evaluationOrder = ['attribution', ...evaluationOrder];
}
for (const evaluationType of evaluationOrder) {
try {
switch (evaluationType) {
case 'definitive':
result = await generateObject({
model,
schema: definitiveSchema,
prompt: getDefinitivePrompt(question, answer),
maxTokens: getMaxTokens('evaluator')
});
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
case 'attribution': {
// Safely handle references and ensure we have content
const urls = action.references?.map(ref => ref.url) ?? [];
const uniqueURLs = [...new Set(urls)];
const allKnowledge = await fetchSourceContent(uniqueURLs, tracker);
if (!allKnowledge.trim()) {
return {
response: {
pass: false,
think: "The answer does not provide any valid attribution references that could be verified. No accessible source content was found to validate the claims made in the answer.",
type: 'attribution',
}
};
}
result = await performEvaluation(
'attribution',
{
model,
schema: attributionSchema,
prompt: getAttributionPrompt(question, action.answer, allKnowledge),
maxTokens: getMaxTokens('evaluator')
},
tracker
);
break;
}
case 'definitive':
result = await performEvaluation(
'definitive',
{
model,
schema: definitiveSchema,
prompt: getDefinitivePrompt(question, action.answer),
maxTokens: getMaxTokens('evaluator')
},
tracker
);
break;
case 'freshness':
result = await generateObject({
model,
schema: freshnessSchema,
prompt: getFreshnessPrompt(question, answer, new Date().toISOString()),
maxTokens: getMaxTokens('evaluator')
});
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
}
result = await performEvaluation(
'freshness',
{
model,
schema: freshnessSchema,
prompt: getFreshnessPrompt(question, action.answer, new Date().toISOString()),
maxTokens: getMaxTokens('evaluator')
},
tracker
);
break;
case 'plurality':
result = await generateObject({
model,
schema: pluralitySchema,
prompt: getPluralityPrompt(question, answer),
maxTokens: getMaxTokens('evaluator')
});
(tracker || new TokenTracker()).trackUsage('evaluator', result.usage?.totalTokens || 0);
console.log('Evaluation:', result.object);
if (!result.object.pass) {
return { response: result.object };
}
result = await performEvaluation(
'plurality',
{
model,
schema: pluralitySchema,
prompt: getPluralityPrompt(question, action.answer),
maxTokens: getMaxTokens('evaluator')
},
tracker
);
break;
}
if (!result?.object.pass) {
return {response: result.object};
}
} catch (error) {
const errorResult = await handleGenerateObjectError<EvaluationResponse>(error);
(tracker || new TokenTracker()).trackUsage('evaluator', errorResult.totalTokens || 0);
// Always return from catch block to prevent undefined result
return { response: errorResult.object };
return {response: errorResult.object};
}
}
// Only reach this point if all evaluations pass
return { response: result!.object };
return {response: result!.object};
}
// Helper function to fetch and combine source content
async function fetchSourceContent(urls: string[], tracker?: TokenTracker): Promise<string> {
if (!urls.length) return '';
try {
const results = await Promise.all(
urls.map(async (url) => {
try {
const {response} = await readUrl(url, tracker);
const content = response?.data?.content || '';
return removeAllLineBreaks(content);
} catch (error) {
console.error('Error reading URL:', error);
return '';
}
})
);
// Filter out empty results and join with proper separation
return results
.filter(content => content.trim())
.join('\n\n');
} catch (error) {
console.error('Error fetching source content:', error);
return '';
}
}

View File

@@ -91,4 +91,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
req.write(data);
req.end();
});
}
export function removeAllLineBreaks(text: string) {
return text.replace(/(\r\n|\n|\r)/gm, " ");
}

View File

@@ -100,7 +100,7 @@ export interface ReadResponse {
export type EvaluationResponse = {
pass: boolean;
think: string;
type?: 'definitive' | 'freshness' | 'plurality';
type?: 'definitive' | 'freshness' | 'plurality' | 'attribution';
freshness_analysis?: {
likely_outdated: boolean;
dates_mentioned: string[];