From c8fc259dff8f63a6b720d331b4698419737e4baf Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Tue, 11 Mar 2025 21:30:59 +0800 Subject: [PATCH] refactor: pull url out --- src/agent.ts | 117 ++++++----------------------------------- src/tools/evaluator.ts | 28 ---------- src/utils/url-tools.ts | 73 ++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 130 deletions(-) diff --git a/src/agent.ts b/src/agent.ts index bb54602..f6ee00f 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -1,7 +1,6 @@ import {ZodObject} from 'zod'; import {CoreMessage} from 'ai'; import {SEARCH_PROVIDER, STEP_SLEEP} from "./config"; -import {readUrl} from "./tools/read"; import fs from 'fs/promises'; import {SafeSearchType, search as duckSearch} from "duck-duck-scrape"; import {braveSearch} from "./tools/brave-search"; @@ -33,12 +32,11 @@ import { countUrlParts, removeBFromA, normalizeUrl, sampleMultinomial, - weightedURLToString, getLastModified, keepKPerHostname + weightedURLToString, getLastModified, keepKPerHostname, processURLs } from "./utils/url-tools"; import { buildMdFromAnswer, chooseK, - removeAllLineBreaks, removeExtraLineBreaks, removeHTMLtags } from "./utils/text-tools"; @@ -430,56 +428,14 @@ export async function getResponse(question?: string, if (thisStep.references.length > 0) { const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || []; const uniqueNewURLs = [...new Set(urls)]; - if (uniqueNewURLs.length > 0) { - context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueNewURLs.join(', ')}); - const urlResults = await Promise.all( - uniqueNewURLs.map(async url => { - try { - const {response} = await readUrl(url, true, context.tokenTracker); - const {data} = response; - const guessedTime = await getLastModified(url); - console.log('Guessed time for', url, guessedTime) - - // Early return if no valid data - if (!data?.url || !data?.content) { - throw new Error('No content found'); - } - - allKnowledge.push({ - question: `What do expert say about "${data.title}"?`, - answer: removeAllLineBreaks(data.content), - references: [data.url], - type: 'url', - updated: guessedTime - }); - - data.links?.forEach(link => { - const r: SearchSnippet = { - title: link[0], - url: normalizeUrl(link[1]), - description: link[0], - } - // in-page link has lower initial weight comparing to search links - if (r.url && r.url.startsWith('http')) { - addToAllURLs(r, allURLs, 0.1); - } - }) - - return {url, result: response}; - } catch (error) { - console.error('Error reading URL:', error); - return null; - } finally { - visitedURLs.push(url); - } - }) - ).then(results => results.filter(Boolean)); - - const success = urlResults.length > 0; - if (success) { - // knowledge updated - } - } + await processURLs( + uniqueNewURLs, + context, + allKnowledge, + allURLs, + visitedURLs, + SchemaGen.languageCode + ); } updateContext({ @@ -644,12 +600,10 @@ But then you realized you have asked them before. You decided to to think out of const topHosts = Object.entries(countUrlParts( Object.entries(allURLs).map(([, result]) => result) ).hostnameCount).sort((a, b) => b[1] - a[1]); - console.log(topHosts) if (topHosts.length > 0 && Math.random() < 0.2 && !query.q.includes('site:')) { // explore-exploit siteQuery = query.q + ' site:' + sampleMultinomial(topHosts); query.q = siteQuery; - console.log('Site query:', siteQuery) } console.log('Search query:', query); @@ -741,52 +695,15 @@ You decided to think out of the box or cut from a completely different angle. console.log(uniqueURLs) if (uniqueURLs.length > 0) { - context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueURLs.join(', ')}); + const {urlResults, success} = await processURLs( + uniqueURLs, + context, + allKnowledge, + allURLs, + visitedURLs, + SchemaGen.languageCode + ); - const urlResults = await Promise.all( - uniqueURLs.map(async url => { - try { - const {response} = await readUrl(url, true, context.tokenTracker); - const {data} = response; - const guessedTime = await getLastModified(url); - console.log('Guessed time for', url, guessedTime) - - // Early return if no valid data - if (!data?.url || !data?.content) { - throw new Error('No content found'); - } - - allKnowledge.push({ - question: `What do expert say about "${data.title}"?`, - answer: removeAllLineBreaks(data.content), - references: [data.url], - type: 'url', - updated: guessedTime - }); - - data.links?.forEach(link => { - const r: SearchSnippet = { - title: link[0], - url: normalizeUrl(link[1]), - description: link[0], - } - // in-page link has lower initial weight comparing to search links - if (r.url && r.url.startsWith('http')) { - addToAllURLs(r, allURLs, 0.1); - } - }) - - return {url, result: response}; - } catch (error) { - console.error('Error reading URL:', error); - return null; - } finally { - visitedURLs.push(url); - } - }) - ).then(results => results.filter(Boolean)); - - const success = urlResults.length > 0; diaryContext.push(success ? `At step ${step}, you took the **visit** action and deep dive into the following URLs: ${urlResults.map(r => r?.url).join('\n')} diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index 035b473..7df7d3b 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -684,31 +684,3 @@ export async function evaluateAnswer( return result?.object as EvaluationResponse; } - -// Helper function to fetch and combine source content -async function fetchSourceContent(urls: string[], trackers: TrackerContext, schemaGen: Schemas): Promise { - if (!urls.length) return ''; - trackers.actionTracker.trackThink('read_for_verify', schemaGen.languageCode); - try { - const results = await Promise.all( - urls.map(async (url) => { - try { - const {response} = await readUrl(url, false, trackers.tokenTracker); - const content = response?.data?.content || ''; - return removeAllLineBreaks(content); - } catch (error) { - console.error('Error reading URL:', error); - return ''; - } - }) - ); - - // Filter out empty results and join with proper separation - return results - .filter(content => content.trim()) - .join('\n\n'); - } catch (error) { - console.error('Error fetching source content:', error); - return ''; - } -} \ No newline at end of file diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index 90a8f1c..905eaf2 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -1,6 +1,7 @@ -import {BoostedSearchSnippet, SearchResult, SearchSnippet, TrackerContext} from "../types"; -import {smartMergeStrings} from "./text-tools"; +import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types"; +import {removeAllLineBreaks, smartMergeStrings} from "./text-tools"; import {rerankDocuments} from "../tools/jina-rerank"; +import {readUrl} from "../tools/read"; export function normalizeUrl(urlString: string, debug = false, options = { removeAnchors: true, @@ -381,4 +382,72 @@ export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => }); return filteredResults; +} + +export async function processURLs( + urls: string[], + context: TrackerContext, + allKnowledge: KnowledgeItem[], + allURLs: Record, + visitedURLs: string[], + languageCode: string +): Promise<{urlResults: any[], success: boolean}> { + // Skip if no URLs to process + if (urls.length === 0) { + return { urlResults: [], success: false }; + } + + // Track the reading action + context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')}); + + // Process each URL in parallel + const urlResults = await Promise.all( + urls.map(async url => { + try { + const {response} = await readUrl(url, true, context.tokenTracker); + const {data} = response; + const guessedTime = await getLastModified(url); + console.log('Guessed time for', url, guessedTime); + + // Early return if no valid data + if (!data?.url || !data?.content) { + throw new Error('No content found'); + } + + // Add to knowledge base + allKnowledge.push({ + question: `What do expert say about "${data.title}"?`, + answer: removeAllLineBreaks(data.content), + references: [data.url], + type: 'url', + updated: guessedTime + }); + + // Process page links + data.links?.forEach(link => { + const r: SearchSnippet = { + title: link[0], + url: normalizeUrl(link[1]), + description: link[0], + } + // in-page link has lower initial weight comparing to search links + if (r.url && r.url.startsWith('http')) { + addToAllURLs(r, allURLs, 0.1); + } + }); + + return {url, result: response}; + } catch (error) { + console.error('Error reading URL:', error); + return null; + } finally { + visitedURLs.push(url); + } + }) + ).then(results => results.filter(Boolean)); + + return { + urlResults, + success: urlResults.length > 0 + }; } \ No newline at end of file