feat: add timestamp to the knowledge (#64)

2026-03-22 07:29:35 +08:00 · 2025-02-13 20:05:54 +08:00
parent 507bc38546
commit 3b76e0b4d8
3 changed files with 82 additions and 50 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
 import {analyzeSteps} from "./tools/error-analyzer";
 import {TokenTracker} from "./utils/token-tracker";
 import {ActionTracker} from "./utils/action-tracker";
-import {StepAction, AnswerAction, KnowledgeItem} from "./types";
+import {StepAction, AnswerAction, KnowledgeItem, EvaluationCriteria} from "./types";
 import {TrackerContext} from "./types";
 import {search} from "./tools/jina-search";
 // import {grounding} from "./tools/grounding";
@@ -24,7 +24,7 @@ async function sleep(ms: number) {
  return new Promise(resolve => setTimeout(resolve, ms));
 }
-function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean) {
+function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boolean, allowSearch: boolean, languageStyle: string = 'same language as the question') {
  const actions: string[] = [];
  const properties: Record<string, z.ZodTypeAny> = {
    action: z.enum(['placeholder']), // Will update later with actual actions
@@ -40,7 +40,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
  if (allowAnswer) {
    actions.push("answer");
    properties.answer = z.string()
-      .describe("Required when action='answer'. Must be the final answer in natural language").optional();
+      .describe(`Required when action='answer'. Must in ${languageStyle}`).optional();
    properties.references = z.array(
      z.object({
        exactQuote: z.string().describe("Exact relevant quote from the document"),
@@ -85,7 +85,8 @@ function getPrompt(
  badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[],
  knowledge?: KnowledgeItem[],
  allURLs?: Record<string, string>,
-  beastMode?: boolean
+  beastMode?: boolean,
  languageStyle?: string
 ): string {
  const sections: string[] = [];
  const actionSections: string[] = [];
@@ -216,11 +217,11 @@ ${allKeywords.join('\n')}
  if (allowAnswer) {
    actionSections.push(`
 <action-answer>
- If <question> is a simple greeting, chit-chat, or general knowledge, provide the answer directly.
+- If <question> is a simple greeting, chit-chat, or general knowledge, provide the answer directly;
- Must provide "references" and each must specify "exactQuote" and "url" 
+- Must provide "references" and each must specify "exactQuote" and "url";
- In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references
+- In the answer, use markdown footnote syntax like [^1], [^2] to refer to the references;
- Responses must be definitive (no ambiguity, uncertainty, or disclaimers)
+- Responses must be definitive (no ambiguity, uncertainty, or disclaimers) and in the style of ${languageStyle};
- Provide final response only when 100% certain${allowReflect ? '\n- If doubts remain, use <action-reflect> instead' : ''}
+- Provide final response only when 100% certain;${allowReflect ? '\n- If doubts remain, use <action-reflect> instead' : ''}
 </action-answer>
 `);
  }
@@ -299,8 +300,9 @@ export async function getResponse(question: string,
  let totalStep = 0;
  let badAttempts = 0;
  let schema: ZodObject<any> = getSchema(true, true, true, true)
-  const gaps: string[] = [question.trim()];  // All questions to be answered including the orginal question
+  question = question.trim()
-  const allQuestions = [question.trim()];
+  const gaps: string[] = [question];  // All questions to be answered including the original question
  const allQuestions = [question];
  const allKeywords = [];
  const allKnowledge: KnowledgeItem[] = [];  // knowledge items are intermediate questions that have already been answered
  // iterate over historyMessages
@@ -329,7 +331,7 @@ export async function getResponse(question: string,
  const allURLs: Record<string, string> = {};
  const visitedURLs: string[] = [];
-  const evaluationMetrics: Record<string, any[]> = {};
+  const evaluationMetrics: Record<string, EvaluationCriteria> = {};
  while (context.tokenTracker.getTotalUsage().totalTokens < tokenBudget && badAttempts <= maxBadAttempts) {
    // add 1s delay to avoid rate limiting
    await sleep(STEP_SLEEP);
@@ -339,7 +341,7 @@ export async function getResponse(question: string,
    console.log(`Step ${totalStep} / Budget used ${budgetPercentage}%`);
    console.log('Gaps:', gaps);
    allowReflect = allowReflect && (gaps.length <= 1);
-    const currentQuestion = gaps.length > 0 ? gaps.shift()! : question.trim();
+    const currentQuestion = gaps.length > 0 ? gaps.shift()! : question
    if (!evaluationMetrics[currentQuestion]) {
      evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context.tokenTracker)
    }
@@ -361,9 +363,11 @@ export async function getResponse(question: string,
      badContext,
      allKnowledge,
      allURLs,
-      false
+      false,
      evaluationMetrics[currentQuestion].languageStyle
    );
-    schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch)
+    schema = getSchema(allowReflect, allowRead, allowAnswer, allowSearch,
      evaluationMetrics[currentQuestion].languageStyle)
    const generator = new ObjectGeneratorSafe(context.tokenTracker);
    const result = await generator.generateObject({
      model: 'agent',
@@ -401,7 +405,7 @@ export async function getResponse(question: string,
      const {response: evaluation} = await evaluateAnswer(currentQuestion, thisStep,
        evaluationMetrics[currentQuestion], context.tokenTracker);
-      if (currentQuestion.trim() === question.trim()) {
+      if (currentQuestion.trim() === question) {
        if (evaluation.pass) {
          diaryContext.push(`
 At step ${step}, you took **answer** action and finally found the answer to the original question:
@@ -458,7 +462,7 @@ ${evaluation.think}
              // reranker? maybe
              gaps.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
              allQuestions.push(...errorAnalysis.questionsToAnswer.slice(0, 2));
-              gaps.push(question.trim());  // always keep the original question in the gaps
+              gaps.push(question);  // always keep the original question in the gaps
            }
            badAttempts++;
@@ -505,7 +509,7 @@ You will now figure out the answers to these sub-questions and see if they can h
 `);
        gaps.push(...newGapQuestions.slice(0, 2));
        allQuestions.push(...newGapQuestions.slice(0, 2));
-        gaps.push(question.trim());  // always keep the original question in the gaps
+        gaps.push(question);  // always keep the original question in the gaps
      } else {
        diaryContext.push(`
 At step ${step}, you took **reflect** and think about the knowledge gaps. You tried to break down the question "${currentQuestion}" into gap-questions like this: ${oldQuestions.join(', ')} 
@@ -697,10 +701,12 @@ You decided to think out of the box or cut from a completely different angle.`);
      badContext,
      allKnowledge,
      allURLs,
-      true
+      true,
      evaluationMetrics[question]?.languageStyle || 'same language as the question'
    );
-    schema = getSchema(false, false, true, false);
+    schema = getSchema(false, false, true, false,
      evaluationMetrics[question]?.languageStyle || 'same language as the question');
    const generator = new ObjectGeneratorSafe(context.tokenTracker);
    const result = await generator.generateObject({
      model: 'agentBeastMode',
@@ -721,7 +727,15 @@ You decided to think out of the box or cut from a completely different angle.`);
 async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
  if ((process as any).asyncLocalContext?.available?.()) {
    const [context, keywords, questions, knowledge] = memory;
-    (process as any).asyncLocalContext.ctx.promptContext = { prompt, schema, context, keywords, questions, knowledge, step };
+    (process as any).asyncLocalContext.ctx.promptContext = {
      prompt,
      schema,
      context,
      keywords,
      questions,
      knowledge,
      step
    };
    return;
  }
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -1,12 +1,11 @@
 import {z} from 'zod';
 import {GenerateObjectResult} from 'ai';
 import {TokenTracker} from "../utils/token-tracker";
-import {AnswerAction, EvaluationResponse} from '../types';
+import {AnswerAction, EvaluationCriteria, EvaluationResponse, EvaluationType} from '../types';
 import {readUrl, removeAllLineBreaks} from "./read";
 import {ObjectGeneratorSafe} from "../utils/safe-generator";
 type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution';
 const baseSchema = {
  pass: z.boolean().describe('Whether the answer passes the evaluation criteria defined by the evaluator'),
@@ -301,7 +300,8 @@ Answer: ${JSON.stringify(answer)}`;
 // Zod object schema with four fields: two boolean flags (needsFreshness,
 // needsPlurality), a free-text `reasoning`, and `languageStyle`, a string
 // describing the language and overall vibe/mood of the question.
 // NOTE(review): presumably the output schema for the generator call inside
 // evaluateQuestion (which reads result.object.languageStyle) — confirm at the call site.
 const questionEvaluationSchema = z.object({
  needsFreshness: z.boolean().describe('Whether the question requires freshness check'),
  needsPlurality: z.boolean().describe('Whether the question requires plurality check'),
-  reasoning: z.string().describe('Explanation of why these checks are needed or not needed')
+  reasoning: z.string().describe('Explanation of why these checks are needed or not needed'),
  languageStyle: z.string().describe('The language being used and the overall vibe/mood of the question'),
 });
 function getQuestionEvaluationPrompt(question: string): string {
@@ -310,6 +310,7 @@ function getQuestionEvaluationPrompt(question: string): string {
 <evaluation_types>
 1. freshness - Checks if the question is time-sensitive or requires very recent information
 2. plurality - Checks if the question asks for multiple items or a specific count or enumeration
 3. language style - Identifies both the language used and the overall vibe of the question
 </evaluation_types>
 <rules>
@@ -326,42 +327,54 @@ If question is a simple greeting, chit-chat, or general knowledge, provide the a
   - Check for: numbers ("5 examples"), plural nouns, list requests
   - Look for: "all", "list", "enumerate", "examples", plural forms
   - Required when question implies completeness ("all the reasons", "every factor")
 3. Language Style Analysis:
  Combine both language and emotional vibe in a descriptive phrase, considering:
  - Language: The primary language or mix of languages used
  - Emotional tone: panic, excitement, frustration, curiosity, etc.
  - Formality level: academic, casual, professional, etc.
  - Domain context: technical, academic, social, etc.
 </rules>
 <examples>
-Question: "Hello, how are you?"
+Question: "fam PLEASE help me calculate the eigenvalues of this 4x4 matrix ASAP!! [matrix details] got an exam tmrw 😭"
 Evaluation: {
-  "needsFreshness": false,
+    "needsFreshness": false,
-  "needsPlurality": false,
+    "needsPlurality": true,
-  "reasoning": "Simple greeting, no additional checks needed."
+    "reasoning": "Multiple eigenvalues needed but no time-sensitive information required",
    "languageStyle": "panicked student English with math jargon"
 }
-Question: "What is the current CEO of OpenAI?"
+Question: "Can someone explain how tf did Ferrari mess up their pit stop strategy AGAIN?! 🤦‍♂️ #MonacoGP"
 Evaluation: {
-  "needsFreshness": true,
+    "needsFreshness": true,
-  "needsPlurality": false,
+    "needsPlurality": true,
-  "reasoning": "Question asks about current leadership position which requires freshness check. No plurality check needed as it asks for a single position."
+    "reasoning": "Refers to recent race event and requires analysis of multiple strategic decisions",
    "languageStyle": "frustrated fan English with F1 terminology"
 }
-Question: "List all the AI companies in Berlin"
+Question: "肖老师您好，请您介绍一下最近量子计算领域的三个重大突破，特别是它们在密码学领域的应用价值吗？🤔"
 Evaluation: {
-  "needsFreshness": false,
+    "needsFreshness": true,
-  "needsPlurality": true,
+    "needsPlurality": true,
-  "reasoning": "Question asks for a comprehensive list ('all') which requires plurality check. No freshness check needed as it's not time-sensitive."
+    "reasoning": "Asks for recent breakthroughs (freshness) and specifically requests three examples (plurality)",
    "languageStyle": "formal technical Chinese with academic undertones"
 }
-Question: "What are the top 5 latest AI models released by OpenAI?"
+Question: "Bruder krass, kannst du mir erklären warum meine neural network training loss komplett durchdreht? Hab schon alles probiert 😤"
 Evaluation: {
-  "needsFreshness": true,
+    "needsFreshness": false,
-  "needsPlurality": true,
+    "needsPlurality": true,
-  "reasoning": "Question requires freshness check for 'latest' releases and plurality check for 'top 5' items."
+    "reasoning": "Requires comprehensive debugging analysis of multiple potential issues",
    "languageStyle": "frustrated German-English tech slang"
 }
-Question: "Who created Python?"
+Question: "Does anyone have insights into the sociopolitical implications of GPT-4's emergence in the Global South, particularly regarding indigenous knowledge systems and linguistic diversity? Looking for a nuanced analysis."
 Evaluation: {
-  "needsFreshness": false,
+    "needsFreshness": true,
-  "needsPlurality": false,
+    "needsPlurality": true,
-  "reasoning": "Simple factual question requiring only definitiveness check. No time sensitivity or multiple items needed."
+    "reasoning": "Requires analysis of current impacts (freshness) across multiple dimensions: sociopolitical, cultural, and linguistic (plurality)",
    "languageStyle": "formal academic English with sociological terminology"
 }
 </examples>
@@ -374,7 +387,7 @@ const TOOL_NAME = 'evaluator';
 export async function evaluateQuestion(
  question: string,
  tracker?: TokenTracker
-): Promise<EvaluationType[]> {
+): Promise<EvaluationCriteria> {
  try {
    const generator = new ObjectGeneratorSafe(tracker);
@@ -394,12 +407,12 @@ export async function evaluateQuestion(
    console.log('Question Metrics:', types);
    // Always evaluate definitive first, then freshness (if needed), then plurality (if needed)
-    return types;
+    return {types, languageStyle: result.object.languageStyle};
  } catch (error) {
    console.error('Error in question evaluation:', error);
    // Default to all evaluation types in case of error
-    return ['definitive', 'freshness', 'plurality'];
+    return {types: ['definitive', 'freshness', 'plurality'], languageStyle: 'plain English'};
  }
 }
@@ -430,17 +443,17 @@ async function performEvaluation<T>(
 export async function evaluateAnswer(
  question: string,
  action: AnswerAction,
-  evaluationOrder: EvaluationType[] = ['definitive', 'freshness', 'plurality'],
+  evaluationCri: EvaluationCriteria,
  tracker?: TokenTracker
 ): Promise<{ response: EvaluationResponse }> {
  let result;
  // Only add attribution if we have valid references
  // NOTE(review): the removed line reassigned the local `evaluationOrder` parameter,
  // which never affected the caller. The added line assigns to the `types` property
  // of the criteria object that was passed in. agent.ts stores that object per
  // question in `evaluationMetrics` and passes the same object on every answer
  // attempt, so each call that has references prepends one more 'attribution'
  // entry. Consider building a local array here instead of writing back.
  if (action.references && action.references.length > 0) {
-    evaluationOrder = ['attribution', ...evaluationOrder];
+    evaluationCri.types = ['attribution', ...evaluationCri.types];
  }
-  for (const evaluationType of evaluationOrder) {
+  for (const evaluationType of evaluationCri.types) {
    switch (evaluationType) {
      case 'attribution': {
        // Safely handle references and ensure we have content
--- a/src/types.ts
+++ b/src/types.ts
@@ -45,6 +45,11 @@ export type VisitAction = BaseAction & {
 export type StepAction = SearchAction | AnswerAction | ReflectAction | VisitAction;
 /** Names of the individual checks an answer can be evaluated against. */
 export type EvaluationType = 'definitive' | 'freshness' | 'plurality' | 'attribution';
 /**
  * Per-question evaluation settings, as returned by `evaluateQuestion` and
  * consumed by `evaluateAnswer`.
  */
 export type EvaluationCriteria = {
  // Checks to run; `evaluateAnswer` iterates over them in array order.
  types: EvaluationType[];
  // Free-form description of the question's language and overall vibe/mood;
  // used to steer the style of the generated answer.
  languageStyle: string;
 };
 // Following Vercel AI SDK's token counting interface
 export interface TokenUsage {