fix: eval

2025-12-26 06:28:56 +08:00 · 2025-03-11 17:56:39 +08:00 · 2025-03-11 17:56:39 +08:00 · c30043e119
commit c30043e119
parent df45670bc9
3 changed files with 76 additions and 29 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@ -427,13 +427,68 @@ export async function getResponse(question?: string,
        break
      }

+      if (thisStep.references.length > 0) {
+        const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
+        const uniqueNewURLs = [...new Set(urls)];
+        if (uniqueNewURLs.length > 0) {
+          context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueNewURLs.join(', ')});
+          const urlResults = await Promise.all(
+            uniqueNewURLs.map(async url => {
+              try {
+                const {response} = await readUrl(url, true, context.tokenTracker);
+                const {data} = response;
+                const guessedTime = await getLastModified(url);
+                console.log('Guessed time for', url, guessedTime)
+
+                // Bail out if no valid data: the throw is caught below, so this URL yields null
+                if (!data?.url || !data?.content) {
+                  throw new Error('No content found');
+                }
+
+                allKnowledge.push({
+                  question: `What do expert say about "${data.title}"?`,
+                  answer: removeAllLineBreaks(data.content),
+                  references: [data.url],
+                  type: 'url',
+                  updated: guessedTime
+                });
+
+                data.links?.forEach(link => {
+                  const r: SearchSnippet = {
+                    title: link[0],
+                    url: normalizeUrl(link[1]),
+                    description: link[0],
+                  }
+                  // in-page links have a lower initial weight compared to search links
+                  if (r.url && r.url.startsWith('http')) {
+                    addToAllURLs(r, allURLs, 0.1);
+                  }
+                })
+
+                return {url, result: response};
+              } catch (error) {
+                console.error('Error reading URL:', error);
+                return null;
+              } finally {
+                visitedURLs.push(url);
+              }
+            })
+          ).then(results => results.filter(Boolean));
+
+          const success = urlResults.length > 0;
+          if (success) {
+            // skip the rest, knowledge updated, answer again
+            continue
+          }
+        }
+      }
+
      updateContext({
        totalStep,
        question: currentQuestion,
        ...thisStep,
      });

-
      console.log(currentQuestion, evaluationMetrics[currentQuestion])
      let evaluation: EvaluationResponse = {pass: true, think: ''};
      if (evaluationMetrics[currentQuestion].length > 0) {
@ -441,7 +496,7 @@ export async function getResponse(question?: string,
        evaluation = await evaluateAnswer(currentQuestion, thisStep,
          evaluationMetrics[currentQuestion],
          context,
-          visitedURLs,
+          allKnowledge,
          SchemaGen
        ) || evaluation;
      }
@ -532,8 +587,7 @@ Although you solved a sub-question, you still need to find the answer to the ori
          updated: new Date().toISOString()
        });
      }
-    }
-    else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
+    } else if (thisStep.action === 'reflect' && thisStep.questionsToAnswer) {
      thisStep.questionsToAnswer = chooseK((await dedupQueries(thisStep.questionsToAnswer, allQuestions, context.tokenTracker)).unique_queries, MAX_REFLECT_PER_STEP);
      const newGapQuestions = thisStep.questionsToAnswer
      if (newGapQuestions.length > 0) {
@ -565,8 +619,7 @@ But then you realized you have asked them before. You decided to to think out of
        });
      }
      allowReflect = false;
-    }
-    else if (thisStep.action === 'search' && thisStep.searchRequests) {
+    } else if (thisStep.action === 'search' && thisStep.searchRequests) {
      // dedup search requests
      thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);

@ -676,8 +729,7 @@ You decided to think out of the box or cut from a completely different angle.
        });
      }
      allowSearch = false;
-    }
-    else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
+    } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
      // normalize URLs
      thisStep.URLTargets = thisStep.URLTargets
        .filter(url => url.startsWith('http'))
@ -766,8 +818,7 @@ You decided to think out of the box or cut from a completely different angle.`);
        });
      }
      allowRead = false;
-    }
-    else if (thisStep.action === 'coding' && thisStep.codingIssue) {
+    } else if (thisStep.action === 'coding' && thisStep.codingIssue) {
      const sandbox = new CodeSandbox({allContext, visitedURLs, allURLs, allKnowledge}, context, SchemaGen);
      try {
        const result = await sandbox.solve(thisStep.codingIssue);
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@ -1,5 +1,5 @@
 import {GenerateObjectResult} from 'ai';
-import {AnswerAction, EvaluationResponse, EvaluationType, PromptPair, TrackerContext} from '../types';
+import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
 import {readUrl} from "./read";
 import {ObjectGeneratorSafe} from "../utils/safe-generator";
 import {Schemas} from "../utils/schemas";
@ -25,13 +25,19 @@ answer: ${JSON.stringify(answer)}
  }
 }

-function getAttributionPrompt(question: string, answer: string, sourceContent: string): PromptPair {
+function getAttributionPrompt(question: string, answer: string, allKnowledge: KnowledgeItem[]): PromptPair {
  return {
    system: `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided context.`,
    user: `
-Context: ${sourceContent}    
-Question: ${question}
-Answer: ${answer}
+<context>
+${JSON.stringify(allKnowledge)}
+</context>    
+<question>
+${question}
+</question>
+<answer>
+${answer}
+</answer>

 Please look at my answer and think.
 `
@ -618,31 +624,21 @@ export async function evaluateAnswer(
  action: AnswerAction,
  evaluationTypes: EvaluationType[],
  trackers: TrackerContext,
-  visitedURLs: string[] = [],
+  allKnowledge: KnowledgeItem[],
  schemaGen: Schemas
 ): Promise<EvaluationResponse> {
  let result;

-  // Only add attribution if we have valid references
-  const urls = action.references?.filter(ref => ref.url.startsWith('http') && !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
-  const uniqueNewURLs = [...new Set(urls)];
-  if (uniqueNewURLs.length > 0) {
-    evaluationTypes = ['attribution', ...evaluationTypes];
-  }

  for (const evaluationType of evaluationTypes) {
    let prompt: { system: string; user: string } | undefined
    switch (evaluationType) {
      case 'attribution': {
        // Safely handle references and ensure we have content
-
-        const allKnowledge = await fetchSourceContent(uniqueNewURLs, trackers, schemaGen);
-        visitedURLs.push(...uniqueNewURLs);
-
-        if (allKnowledge.trim().length === 0) {
+        if (allKnowledge.length === 0) {
          return {
            pass: false,
-            think: `The answer does provide URL references ${JSON.stringify(uniqueNewURLs)}, but the content could not be fetched or is empty. Need to found some other references and URLs`,
+            think: `The knowledge is completely empty and the answer can not be derived from it. Need to found some other references and URLs`,
            type: 'attribution',
          };
        }
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@ -352,7 +352,7 @@ export async function getLastModified(url: string): Promise<string | undefined>
    const data = await response.json();

    // Return the bestGuess date if available
-    if (data.bestGuess) {
+    if (data.bestGuess && data.confidence >= 70) {
      return data.bestGuess;
    }