From c8fc259dff8f63a6b720d331b4698419737e4baf Mon Sep 17 00:00:00 2001
From: Han Xiao <han.xiao@jina.ai>
Date: Tue, 11 Mar 2025 21:30:59 +0800
Subject: [PATCH] refactor: pull url out

---
 src/agent.ts           | 117 ++++++-----------------------------------
 src/tools/evaluator.ts |  28 ----------
 src/utils/url-tools.ts |  73 ++++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 130 deletions(-)

diff --git a/src/agent.ts b/src/agent.ts
index bb54602..f6ee00f 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -1,7 +1,6 @@
 import {ZodObject} from 'zod';
 import {CoreMessage} from 'ai';
 import {SEARCH_PROVIDER, STEP_SLEEP} from "./config";
-import {readUrl} from "./tools/read";
 import fs from 'fs/promises';
 import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
 import {braveSearch} from "./tools/brave-search";
@@ -33,12 +32,11 @@ import {
   countUrlParts,
   removeBFromA,
   normalizeUrl, sampleMultinomial,
-  weightedURLToString, getLastModified, keepKPerHostname
+  weightedURLToString, getLastModified, keepKPerHostname, processURLs
 } from "./utils/url-tools";
 import {
   buildMdFromAnswer,
   chooseK,
-  removeAllLineBreaks,
   removeExtraLineBreaks,
   removeHTMLtags
 } from "./utils/text-tools";
@@ -430,56 +428,14 @@ export async function getResponse(question?: string,
       if (thisStep.references.length > 0) {
         const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
         const uniqueNewURLs = [...new Set(urls)];
-        if (uniqueNewURLs.length > 0) {
-          context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueNewURLs.join(', ')});
-          const urlResults = await Promise.all(
-            uniqueNewURLs.map(async url => {
-              try {
-                const {response} = await readUrl(url, true, context.tokenTracker);
-                const {data} = response;
-                const guessedTime = await getLastModified(url);
-                console.log('Guessed time for', url, guessedTime)
-
-                // Early return if no valid data
-                if (!data?.url || !data?.content) {
-                  throw new Error('No content found');
-                }
-
-                allKnowledge.push({
-                  question: `What do expert say about "${data.title}"?`,
-                  answer: removeAllLineBreaks(data.content),
-                  references: [data.url],
-                  type: 'url',
-                  updated: guessedTime
-                });
-
-                data.links?.forEach(link => {
-                  const r: SearchSnippet = {
-                    title: link[0],
-                    url: normalizeUrl(link[1]),
-                    description: link[0],
-                  }
-                  // in-page link has lower initial weight comparing to search links
-                  if (r.url && r.url.startsWith('http')) {
-                    addToAllURLs(r, allURLs, 0.1);
-                  }
-                })
-
-                return {url, result: response};
-              } catch (error) {
-                console.error('Error reading URL:', error);
-                return null;
-              } finally {
-                visitedURLs.push(url);
-              }
-            })
-          ).then(results => results.filter(Boolean));
-
-          const success = urlResults.length > 0;
-          if (success) {
-            // knowledge updated
-          }
-        }
+        await processURLs(
+          uniqueNewURLs,
+          context,
+          allKnowledge,
+          allURLs,
+          visitedURLs,
+          SchemaGen.languageCode
+        );
       }
 
       updateContext({
@@ -644,12 +600,10 @@ But then you realized you have asked them before. You decided to to think out of
             const topHosts = Object.entries(countUrlParts(
               Object.entries(allURLs).map(([, result]) => result)
             ).hostnameCount).sort((a, b) => b[1] - a[1]);
-            console.log(topHosts)
             if (topHosts.length > 0 && Math.random() < 0.2 && !query.q.includes('site:')) {
               // explore-exploit
               siteQuery = query.q + ' site:' + sampleMultinomial(topHosts);
               query.q = siteQuery;
-              console.log('Site query:', siteQuery)
             }
 
             console.log('Search query:', query);
@@ -741,52 +695,15 @@ You decided to think out of the box or cut from a completely different angle.
       console.log(uniqueURLs)
 
       if (uniqueURLs.length > 0) {
-        context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueURLs.join(', ')});
+        const {urlResults, success} = await processURLs(
+          uniqueURLs,
+          context,
+          allKnowledge,
+          allURLs,
+          visitedURLs,
+          SchemaGen.languageCode
+        );
 
-        const urlResults = await Promise.all(
-          uniqueURLs.map(async url => {
-            try {
-              const {response} = await readUrl(url, true, context.tokenTracker);
-              const {data} = response;
-              const guessedTime = await getLastModified(url);
-              console.log('Guessed time for', url, guessedTime)
-
-              // Early return if no valid data
-              if (!data?.url || !data?.content) {
-                throw new Error('No content found');
-              }
-
-              allKnowledge.push({
-                question: `What do expert say about "${data.title}"?`,
-                answer: removeAllLineBreaks(data.content),
-                references: [data.url],
-                type: 'url',
-                updated: guessedTime
-              });
-
-              data.links?.forEach(link => {
-                const r: SearchSnippet = {
-                  title: link[0],
-                  url: normalizeUrl(link[1]),
-                  description: link[0],
-                }
-                // in-page link has lower initial weight comparing to search links
-                if (r.url && r.url.startsWith('http')) {
-                  addToAllURLs(r, allURLs, 0.1);
-                }
-              })
-
-              return {url, result: response};
-            } catch (error) {
-              console.error('Error reading URL:', error);
-              return null;
-            } finally {
-              visitedURLs.push(url);
-            }
-          })
-        ).then(results => results.filter(Boolean));
-
-        const success = urlResults.length > 0;
         diaryContext.push(success
           ? `At step ${step}, you took the **visit** action and deep dive into the following URLs:
 ${urlResults.map(r => r?.url).join('\n')}
diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
index 035b473..7df7d3b 100644
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -684,31 +684,3 @@ export async function evaluateAnswer(
     return result?.object as EvaluationResponse;
 
 }
-
-// Helper function to fetch and combine source content
-async function fetchSourceContent(urls: string[], trackers: TrackerContext, schemaGen: Schemas): Promise<string> {
-  if (!urls.length) return '';
-  trackers.actionTracker.trackThink('read_for_verify', schemaGen.languageCode);
-  try {
-    const results = await Promise.all(
-      urls.map(async (url) => {
-        try {
-          const {response} = await readUrl(url, false, trackers.tokenTracker);
-          const content = response?.data?.content || '';
-          return removeAllLineBreaks(content);
-        } catch (error) {
-          console.error('Error reading URL:', error);
-          return '';
-        }
-      })
-    );
-
-    // Filter out empty results and join with proper separation
-    return results
-      .filter(content => content.trim())
-      .join('\n\n');
-  } catch (error) {
-    console.error('Error fetching source content:', error);
-    return '';
-  }
-}
\ No newline at end of file
diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts
index 90a8f1c..905eaf2 100644
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@@ -1,6 +1,7 @@
-import {BoostedSearchSnippet, SearchResult, SearchSnippet, TrackerContext} from "../types";
-import {smartMergeStrings} from "./text-tools";
+import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types";
+import {removeAllLineBreaks, smartMergeStrings} from "./text-tools";
 import {rerankDocuments} from "../tools/jina-rerank";
+import {readUrl} from "../tools/read";
 
 export function normalizeUrl(urlString: string, debug = false, options = {
   removeAnchors: true,
@@ -381,4 +382,72 @@ export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) =>
     });
 
     return filteredResults;
+}
+
+export async function processURLs(
+  urls: string[],
+  context: TrackerContext,
+  allKnowledge: KnowledgeItem[],
+  allURLs: Record<string, SearchSnippet>,
+  visitedURLs: string[],
+  languageCode: string
+): Promise<{urlResults: any[], success: boolean}> {
+  // Nothing to read: report failure without emitting a 'read_for' action
+  if (urls.length === 0) {
+    return { urlResults: [], success: false };
+  }
+
+  // Announce the comma-joined list of URLs about to be read via the action tracker
+  context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')});
+
+  // Read all URLs concurrently; a failed read yields null, which is filtered out after Promise.all
+  const urlResults = await Promise.all(
+    urls.map(async url => {
+      try {
+        const {response} = await readUrl(url, true, context.tokenTracker);
+        const {data} = response;
+        const guessedTime = await getLastModified(url);
+        console.log('Guessed time for', url, guessedTime);
+
+        // Missing url or content counts as a failure: throw so the catch below logs it and returns null
+        if (!data?.url || !data?.content) {
+          throw new Error('No content found');
+        }
+
+        // Record the page content (passed through removeAllLineBreaks) as a 'url' knowledge item
+        allKnowledge.push({
+          question: `What do expert say about "${data.title}"?`,
+          answer: removeAllLineBreaks(data.content),
+          references: [data.url],
+          type: 'url',
+          updated: guessedTime
+        });
+
+        // Turn each in-page link into a snippet: link[0] is used as title/description, link[1] as the URL
+        data.links?.forEach(link => {
+          const r: SearchSnippet = {
+            title: link[0],
+            url: normalizeUrl(link[1]),
+            description: link[0],
+          }
+          // in-page links get a lower initial weight (0.1) than search links; only http(s) URLs are kept
+          if (r.url && r.url.startsWith('http')) {
+            addToAllURLs(r, allURLs, 0.1);
+          }
+        });
+
+        return {url, result: response};
+      } catch (error) {
+        console.error('Error reading URL:', error);
+        return null;
+      } finally {
+        visitedURLs.push(url);
+      }
+    })
+  ).then(results => results.filter(Boolean));
+
+  return {
+    urlResults,
+    success: urlResults.length > 0
+  };
+}
\ No newline at end of file