feat: add timestamp to the knowledge (#64)

This commit is contained in:
Han Xiao
2025-02-13 20:05:54 +08:00
committed by GitHub
parent 507bc38546
commit 3b76e0b4d8
3 changed files with 82 additions and 50 deletions

View File

@@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer"; import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker"; import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker"; import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction, KnowledgeItem} from "./types"; import {StepAction, AnswerAction, KnowledgeItem, EvaluationCriteria} from "./types";
import {TrackerContext} from "./types"; import {TrackerContext} from "./types";
import {search} from "./tools/jina-search"; import {search} from "./tools/jina-search";
// import {grounding} from "./tools/grounding"; // import {grounding} from "./tools/grounding";
@@ -24,7 +24,7 @@ async function sleep(ms: number) {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }
function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean) { function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean, languageStyle: string = 'same language as the question') {
const actions: string[] = []; const actions: string[] = [];
const properties: Record<string, z.ZodTypeAny> = { const properties: Record<string, z.ZodTypeAny> = {
action: z.enum(['placeholder']), // Will update later with actual actions action: z.enum(['placeholder']), // Will update later with actual actions
@@ -40,7 +40,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
if (allowAnswer) { if (allowAnswer) {
actions.push("answer"); actions.push("answer");
properties.answer = z.string() properties.answer = z.string()
.describe("Required when action='answer'. Must be the final answer in natural language").optional(); .describe(`Required when action='answer'. Must in ${languageStyle}`).optional();
properties.references = z.array( properties.references = z.array(
z.object({ z.object({
exactQuote: z.string().describe("Exact relevant quote from the document"), exactQuote: z.string().describe("Exact relevant quote from the document"),
@@ -85,7 +85,8 @@ function getPrompt(
badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[], badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[],
knowledge?: KnowledgeItem[], knowledge?: KnowledgeItem[],
allURLs?: Record<string, string>, allURLs?: Record<string, string>,
beastMode?: boolean beastMode?: boolean,
languageStyle?: string
): string { ): string {
const sections: string[] = []; const sections: string[] = [];
const actionSections: string[] = []; const actionSections: string[] = [];
@@ -216,11 +217,11 @@ ${allKeywords.join('\n')}
if (allowAnswer) { if (allowAnswer) {
actionSections.push(` actionSections.push(`
<action-answer> <action-answer>
- If <question> is a simple greeting, chit-chat, or general knowledge, provide the answer directly. - If <question> is a simple greeting, chit-chat, or general knowledge, provide the answer directly;
- Must provide "references" and each must specify "exactQuote" and "url" - Must provide "references" and each must specify "exactQuote" and "url";
- In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references - In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references;
- Responses must be definitive (no ambiguity, uncertainty, or disclaimers) - Responses must be definitive (no ambiguity, uncertainty, or disclaimers) and in the style of ${languageStyle};
- Provide final response only when 100% certain${allowReflect ? '\n- If doubts remain, use <action-reflect> instead' : ''} - Provide final response only when 100% certain;${allowReflect ? '\n- If doubts remain, use <action-reflect> instead' : ''}
</action-answer> </action-answer>
`); `);
} }
@@ -299,8 +300,9 @@ export async function getResponse(question: string,
let totalStep = 0; let totalStep = 0;
let badAttempts = 0; let badAttempts = 0;
let schema: ZodObject<any> = getSchema(true, true, true, true) let schema: ZodObject<any> = getSchema(true, true, true, true)
const gaps: string[] = [question.trim()]; // All questions to be answered including the original question question = question.trim()
const gaps: string[] = [question]; // All questions to be answered including the original question
const allQuestions = [question];
const allKeywords = []; const allKeywords = [];
const allKnowledge: KnowledgeItem[] = []; // knowledge are intermediate questions that are answered const allKnowledge: KnowledgeItem[] = []; // knowledge are intermediate questions that are answered
// iterate over historyMessages // iterate over historyMessages
@@ -329,7 +331,7 @@ export async function getResponse(question: string,
const allURLs: Record<string, string> = {}; const allURLs: Record<string, string> = {};
const visitedURLs: string[] = []; const visitedURLs: string[] = [];
const evaluationMetrics: Record<string, any[]> = {}; const evaluationMetrics: Record<string, EvaluationCriteria> = {};
while (context.tokenTracker.getTotalUsage().totalTokens < tokenBudget && badAttempts <= maxBadAttempts) { while (context.tokenTracker.getTotalUsage().totalTokens < tokenBudget && badAttempts <= maxBadAttempts) {
// add 1s delay to avoid rate limiting // add 1s delay to avoid rate limiting
await sleep(STEP_SLEEP); await sleep(STEP_SLEEP);
@@ -339,7 +341,7 @@ export async function getResponse(question: string,
console.log(`Step ${totalStep} / Budget used ${budgetPercentage}%`); console.log(`Step ${totalStep} / Budget used ${budgetPercentage}%`);
console.log('Gaps:', gaps); console.log('Gaps:', gaps);
allowReflect = allowReflect && (gaps.length <= 1); allowReflect = allowReflect && (gaps.length <= 1);
const currentQuestion = gaps.length > 0 ? gaps.shift()! : question.trim(); const currentQuestion = gaps.length > 0 ? gaps.shift()! : question
if (!evaluationMetrics[currentQuestion]) { if (!evaluationMetrics[currentQuestion]) {
evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker) evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker)
} }
@@ -361,9 +363,11 @@ export async function getResponse(question: string,
badContext, badContext,
allKnowledge, allKnowledge,
allURLs, allURLs,
false false,
evaluationMetrics[currentQuestion].languageStyle
); );
schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch) schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch,
evaluationMetrics[currentQuestion].languageStyle)
const generator = new ObjectGeneratorSafe(context.tokenTracker); const generator = new ObjectGeneratorSafe(context.tokenTracker);
const result = await generator.generateObject({ const result = await generator.generateObject({
model: 'agent', model: 'agent',
@@ -401,7 +405,7 @@ export async function getResponse(question: string,
const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep, const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion], context.tokenTracker); evaluationMetrics[currentQuestion], context.tokenTracker);
if (currentQuestion.trim() === question.trim()) { if (currentQuestion.trim() === question) {
if (evaluation.pass) { if (evaluation.pass) {
diaryContext.push(` diaryContext.push(`
At step ${step}, you took **answer** action and finally found the answer to the original question: At step ${step}, you took **answer** action and finally found the answer to the original question:
@@ -458,7 +462,7 @@ ${evaluation.think}
// reranker? maybe // reranker? maybe
gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2)); allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
gaps.push(question.trim()); // always keep the original question in the gaps gaps.push(question); // always keep the original question in the gaps
} }
badAttempts++; badAttempts++;
@@ -505,7 +509,7 @@ You will now figure out the answers to these sub-questions and see if they can h
`); `);
gaps.push(...newGapQuestions.slice(0, 2)); gaps.push(...newGapQuestions.slice(0, 2));
allQuestions.push(...newGapQuestions.slice(0, 2)); allQuestions.push(...newGapQuestions.slice(0, 2));
gaps.push(question.trim()); // always keep the original question in the gaps gaps.push(question); // always keep the original question in the gaps
} else { } else {
diaryContext.push(` diaryContext.push(`
At step ${step}, you took **reflect** and think about the knowledge gaps. You tried to break down the question "${currentQuestion}" into gap-questions like this: ${oldQuestions.join(', ')} At step ${step}, you took **reflect** and think about the knowledge gaps. You tried to break down the question "${currentQuestion}" into gap-questions like this: ${oldQuestions.join(', ')}
@@ -697,10 +701,12 @@ You decided to think out of the box or cut from a completely different angle.`);
badContext, badContext,
allKnowledge, allKnowledge,
allURLs, allURLs,
true true,
evaluationMetrics[question]?.languageStyle || 'same language as the question'
); );
schema = getSchema(false, false, true, false); schema = getSchema(false, false, true, false,
evaluationMetrics[question]?.languageStyle || 'same language as the question');
const generator = new ObjectGeneratorSafe(context.tokenTracker); const generator = new ObjectGeneratorSafe(context.tokenTracker);
const result = await generator.generateObject({ const result = await generator.generateObject({
model: 'agentBeastMode', model: 'agentBeastMode',
@@ -721,7 +727,15 @@ You decided to think out of the box or cut from a completely different angle.`);
async function storeContext(prompt: string, schema: any, memory: any[][], step: number) { async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
if ((process as any).asyncLocalContext?.available?.()) { if ((process as any).asyncLocalContext?.available?.()) {
const [context, keywords, questions, knowledge] = memory; const [context, keywords, questions, knowledge] = memory;
(process as any).asyncLocalContext.ctx.promptContext = { prompt, schema, context, keywords, questions, knowledge, step }; (process as any).asyncLocalContext.ctx.promptContext = {
prompt,
schema,
context,
keywords,
questions,
knowledge,
step
};
return; return;
} }

View File

@@ -1,12 +1,11 @@
import {z} from 'zod'; import {z} from 'zod';
import {GenerateObjectResult} from 'ai'; import {GenerateObjectResult} from 'ai';
import {TokenTracker} from "../utils/token-tracker"; import {TokenTracker} from "../utils/token-tracker";
import {AnswerAction, EvaluationResponse} from '../types'; import {AnswerAction, EvaluationCriteria, EvaluationResponse, EvaluationType} from '../types';
import {readUrl, removeAllLineBreaks} from "./read"; import {readUrl, removeAllLineBreaks} from "./read";
import {ObjectGeneratorSafe} from "../utils/safe-generator"; import {ObjectGeneratorSafe} from "../utils/safe-generator";
type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution';
const baseSchema = { const baseSchema = {
pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'), pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
@@ -301,7 +300,8 @@ Answer: ${JSON.stringify(answer)}`;
const questionEvaluationSchema = z.object({ const questionEvaluationSchema = z.object({
needsFreshness: z.boolean().describe('Whether the question requires freshness check'), needsFreshness: z.boolean().describe('Whether the question requires freshness check'),
needsPlurality: z.boolean().describe('Whether the question requires plurality check'), needsPlurality: z.boolean().describe('Whether the question requires plurality check'),
reasoning: z.string().describe('Explanation of why these checks are needed or not needed') reasoning: z.string().describe('Explanation of why these checks are needed or not needed'),
languageStyle: z.string().describe('The language being used and the overall vibe/mood of the question'),
}); });
function getQuestionEvaluationPrompt(question: string): string { function getQuestionEvaluationPrompt(question: string): string {
@@ -310,6 +310,7 @@ function getQuestionEvaluationPrompt(question: string): string {
<evaluation_types> <evaluation_types>
1. freshness - Checks if the question is time-sensitive or requires very recent information 1. freshness - Checks if the question is time-sensitive or requires very recent information
2. plurality - Checks if the question asks for multiple items or a specific count or enumeration 2. plurality - Checks if the question asks for multiple items or a specific count or enumeration
3. language style - Identifies both the language used and the overall vibe of the question
</evaluation_types> </evaluation_types>
<rules> <rules>
@@ -326,42 +327,54 @@ If question is a simple greeting, chit-chat, or general knowledge, provide the a
- Check for: numbers ("5 examples"), plural nouns, list requests - Check for: numbers ("5 examples"), plural nouns, list requests
- Look for: "all", "list", "enumerate", "examples", plural forms - Look for: "all", "list", "enumerate", "examples", plural forms
- Required when question implies completeness ("all the reasons", "every factor") - Required when question implies completeness ("all the reasons", "every factor")
3. Language Style Analysis:
Combine both language and emotional vibe in a descriptive phrase, considering:
- Language: The primary language or mix of languages used
- Emotional tone: panic, excitement, frustration, curiosity, etc.
- Formality level: academic, casual, professional, etc.
- Domain context: technical, academic, social, etc.
</rules> </rules>
<examples> <examples>
Question: "Hello, how are you?" Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭"
Evaluation: { Evaluation: {
"needsFreshness": false, "needsFreshness": false,
"needsPlurality": false, "needsPlurality": true,
"reasoning": "Simple greeting, no additional checks needed." "reasoning": "Multiple eigenvalues needed but no time-sensitive information required",
"languageStyle": "panicked student English with math jargon"
} }
Question: "What is the current CEO of OpenAI?" Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦‍♂️ #MonacoGP"
Evaluation: { Evaluation: {
"needsFreshness": true, "needsFreshness": true,
"needsPlurality": false, "needsPlurality": true,
"reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position." "reasoning": "Refers to recent race event and requires analysis of multiple strategic decisions",
"languageStyle": "frustrated fan English with F1 terminology"
} }
Question: "List all the AI companies in Berlin" Question: "肖老师您好,请您介绍一下最近量子计算领域的三个重大突破,特别是它们在密码学领域的应用价值吗?🤔"
Evaluation: { Evaluation: {
"needsFreshness": false, "needsFreshness": true,
"needsPlurality": true, "needsPlurality": true,
"reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive." "reasoning": "Asks for recent breakthroughs (freshness) and specifically requests three examples (plurality)",
"languageStyle": "formal technical Chinese with academic undertones"
} }
Question: "What are the top 5 latest AI models released by OpenAI?" Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤"
Evaluation: { Evaluation: {
"needsFreshness": true, "needsFreshness": false,
"needsPlurality": true, "needsPlurality": true,
"reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items." "reasoning": "Requires comprehensive debugging analysis of multiple potential issues",
"languageStyle": "frustrated German-English tech slang"
} }
Question: "Who created Python?" Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis."
Evaluation: { Evaluation: {
"needsFreshness": false, "needsFreshness": true,
"needsPlurality": false, "needsPlurality": true,
"reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed." "reasoning": "Requires analysis of current impacts (freshness) across multiple dimensions: sociopolitical, cultural, and linguistic (plurality)",
"languageStyle": "formal academic English with sociological terminology"
} }
</examples> </examples>
@@ -374,7 +387,7 @@ const TOOL_NAME = 'evaluator';
export async function evaluateQuestion( export async function evaluateQuestion(
question: string, question: string,
tracker?: TokenTracker tracker?: TokenTracker
): Promise<EvaluationType[]> { ): Promise<EvaluationCriteria> {
try { try {
const generator = new ObjectGeneratorSafe(tracker); const generator = new ObjectGeneratorSafe(tracker);
@@ -394,12 +407,12 @@ export async function evaluateQuestion(
console.log('Question Metrics:', types); console.log('Question Metrics:', types);
// Always evaluate definitive first, then freshness (if needed), then plurality (if needed) // Always evaluate definitive first, then freshness (if needed), then plurality (if needed)
return types; return {types, languageStyle: result.object.languageStyle};
} catch (error) { } catch (error) {
console.error('Error in question evaluation:', error); console.error('Error in question evaluation:', error);
// Default to all evaluation types in case of error // Default to all evaluation types in case of error
return ['definitive', 'freshness', 'plurality']; return {types: ['definitive', 'freshness', 'plurality'], languageStyle: 'plain English'};
} }
} }
@@ -430,17 +443,17 @@ async function performEvaluation<T>(
export async function evaluateAnswer( export async function evaluateAnswer(
question: string, question: string,
action: AnswerAction, action: AnswerAction,
evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'], evaluationCri: EvaluationCriteria,
tracker?: TokenTracker tracker?: TokenTracker
): Promise<{ response: EvaluationResponse }> { ): Promise<{ response: EvaluationResponse }> {
let result; let result;
// Only add attribution if we have valid references // Only add attribution if we have valid references
if (action.references && action.references.length > 0) { if (action.references && action.references.length > 0) {
evaluationOrder = ['attribution', ...evaluationOrder]; evaluationCri.types = ['attribution', ...evaluationCri.types];
} }
for (const evaluationType of evaluationOrder) { for (const evaluationType of evaluationCri.types) {
switch (evaluationType) { switch (evaluationType) {
case 'attribution': { case 'attribution': {
// Safely handle references and ensure we have content // Safely handle references and ensure we have content

View File

@@ -45,6 +45,11 @@ export type VisitAction = BaseAction & {
export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction; export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction;
/**
 * Name of a single check that can be applied to a candidate answer.
 * The evaluator switches on these literals when iterating the checks to run.
 */
export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution';
/**
 * Per-question evaluation settings, as returned by `evaluateQuestion` and
 * consumed by `evaluateAnswer`.
 *
 * NOTE(review): `evaluateAnswer` prepends 'attribution' by reassigning `types`
 * on the object it receives. When the same criteria object is cached and passed
 * again for the same question, that entry can accumulate — confirm whether
 * callers should pass a copy.
 */
export type EvaluationCriteria = {
// Checks to run for the question; iterated in array order by the evaluator.
types: EvaluationType[];
// Free-text description of the question's language and overall vibe/mood;
// interpolated into the answer prompt and the answer schema description.
languageStyle: string;
};
// Following Vercel AI SDK's token counting interface // Following Vercel AI SDK's token counting interface
export interface TokenUsage { export interface TokenUsage {