fix: eval

This commit is contained in:
Han Xiao 2025-03-11 17:56:39 +08:00
parent df45670bc9
commit c30043e119
3 changed files with 76 additions and 29 deletions

View File

@ -427,13 +427,68 @@ export async function getResponse(question?: string,
break
}
if (thisStep.references.length > 0) {
const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
const uniqueNewURLs = [...new Set(urls)];
if (uniqueNewURLs.length > 0) {
context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueNewURLs.join(', ')});
const urlResults = await Promise.all(
uniqueNewURLs.map(async url => {
try {
const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response;
const guessedTime = await getLastModified(url);
console.log('Guessed time for', url, guessedTime)
// Early return if no valid data
if (!data?.url || !data?.content) {
throw new Error('No content found');
}
allKnowledge.push({
question: `What do expert say about "${data.title}"?`,
answer: removeAllLineBreaks(data.content),
references: [data.url],
type: 'url',
updated: guessedTime
});
data.links?.forEach(link => {
const r: SearchSnippet = {
title: link[0],
url: normalizeUrl(link[1]),
description: link[0],
}
// in-page link has lower initial weight comparing to search links
if (r.url && r.url.startsWith('http')) {
addToAllURLs(r, allURLs, 0.1);
}
})
return {url, result: response};
} catch (error) {
console.error('Error reading URL:', error);
return null;
} finally {
visitedURLs.push(url);
}
})
).then(results => results.filter(Boolean));
const success = urlResults.length > 0;
if (success) {
// skip the rest, knowledge updated, answer again
continue
}
}
}
updateContext({
totalStep,
question: currentQuestion,
...thisStep,
});
console.log(currentQuestion, evaluationMetrics[currentQuestion])
let evaluation: EvaluationResponse = {pass: true, think: ''};
if (evaluationMetrics[currentQuestion].length > 0) {
@ -441,7 +496,7 @@ export async function getResponse(question?: string,
evaluation = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion],
context,
visitedURLs,
allKnowledge,
SchemaGen
) || evaluation;
}
@ -532,8 +587,7 @@ Although you solved a sub-question, you still need to find the answer to the ori
updated: new Date().toISOString()
});
}
}
else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
} else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
thisStep.questionsToAnswer = chooseK((await dedupQueries(thisStep.questionsToAnswer, allQuestions, context.tokenTracker)).unique_queries, MAX_REFLECT_PER_STEP);
const newGapQuestions = thisStep.questionsToAnswer
if (newGapQuestions.length > 0) {
@ -565,8 +619,7 @@ But then you realized you have asked them before. You decided to to think out of
});
}
allowReflect = false;
}
else if (thisStep.action === 'search' && thisStep.searchRequests) {
} else if (thisStep.action === 'search' && thisStep.searchRequests) {
// dedup search requests
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
@ -676,8 +729,7 @@ You decided to think out of the box or cut from a completely different angle.
});
}
allowSearch = false;
}
else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
// normalize URLs
thisStep.URLTargets = thisStep.URLTargets
.filter(url => url.startsWith('http'))
@ -766,8 +818,7 @@ You decided to think out of the box or cut from a completely different angle.`);
});
}
allowRead = false;
}
else if (thisStep.action === 'coding' && thisStep.codingIssue) {
} else if (thisStep.action === 'coding' && thisStep.codingIssue) {
const sandbox = new CodeSandbox({allContext, visitedURLs, allURLs, allKnowledge}, context, SchemaGen);
try {
const result = await sandbox.solve(thisStep.codingIssue);

View File

@ -1,5 +1,5 @@
import {GenerateObjectResult} from 'ai';
import {AnswerAction, EvaluationResponse, EvaluationType, PromptPair, TrackerContext} from '../types';
import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
import {readUrl} from "./read";
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
@ -25,13 +25,19 @@ answer: ${JSON.stringify(answer)}
}
}
function getAttributionPrompt(question: string, answer: string, sourceContent: string): PromptPair {
function getAttributionPrompt(question: string, answer: string, allKnowledge: KnowledgeItem[]): PromptPair {
return {
system: `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided context.`,
user: `
Context: ${sourceContent}
Question: ${question}
Answer: ${answer}
<context>
${JSON.stringify(allKnowledge)}
</context>
<question>
${question}
</question>
<answer>
${answer}
</answer>
Please look at my answer and think.
`
@ -618,31 +624,21 @@ export async function evaluateAnswer(
action: AnswerAction,
evaluationTypes: EvaluationType[],
trackers: TrackerContext,
visitedURLs: string[] = [],
allKnowledge: KnowledgeItem[],
schemaGen: Schemas
): Promise<EvaluationResponse> {
let result;
// Only add attribution if we have valid references
const urls = action.references?.filter(ref => ref.url.startsWith('http') && !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
const uniqueNewURLs = [...new Set(urls)];
if (uniqueNewURLs.length > 0) {
evaluationTypes = ['attribution', ...evaluationTypes];
}
for (const evaluationType of evaluationTypes) {
let prompt: { system: string; user: string } | undefined
switch (evaluationType) {
case 'attribution': {
// Safely handle references and ensure we have content
const allKnowledge = await fetchSourceContent(uniqueNewURLs, trackers, schemaGen);
visitedURLs.push(...uniqueNewURLs);
if (allKnowledge.trim().length === 0) {
if (allKnowledge.length === 0) {
return {
pass: false,
think: `The answer does provide URL references ${JSON.stringify(uniqueNewURLs)}, but the content could not be fetched or is empty. Need to found some other references and URLs`,
think: `The knowledge is completely empty and the answer can not be derived from it. Need to found some other references and URLs`,
type: 'attribution',
};
}

View File

@ -352,7 +352,7 @@ export async function getLastModified(url: string): Promise<string | undefined>
const data = await response.json();
// Return the bestGuess date if available
if (data.bestGuess) {
if (data.bestGuess && data.confidence >= 70) {
return data.bestGuess;
}