From 5c36410b54b1b5b21e150c415490ff78f592f79a Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 17 Mar 2025 14:23:02 +0800 Subject: [PATCH] fix: normalize url --- src/agent.ts | 11 ++++++----- src/utils/url-tools.ts | 42 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/agent.ts b/src/agent.ts index 4c96249..0c062ee 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -403,6 +403,7 @@ export async function getResponse(question?: string, const allURLs: Record = {}; const visitedURLs: string[] = []; + const badURLs: string[] = []; const evaluationMetrics: Record = {}; // reserve the 10% final budget for the beast mode const regularBudget = tokenBudget * 0.9; @@ -515,14 +516,13 @@ export async function getResponse(question?: string, allKnowledge, allURLs, visitedURLs, + badURLs, SchemaGen, currentQuestion ); - // is this really required??? - // if (!evaluationMetrics[currentQuestion].includes('attribution')) { - // evaluationMetrics[currentQuestion].push('attribution') - // } + // remove references whose urls are in badURLs + thisStep.references = thisStep.references.filter(ref => !badURLs.includes(ref.url)); } updateContext({ @@ -764,6 +764,7 @@ You decided to think out of the box or cut from a completely different angle. allKnowledge, allURLs, visitedURLs, + badURLs, SchemaGen, currentQuestion ); @@ -908,7 +909,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b result: thisStep, context, visitedURLs: returnedURLs, - readURLs: visitedURLs, + readURLs: visitedURLs.filter(url => !badURLs.includes(url)), allURLs: weightedURLs.map(r => r.url) }; } diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index 90ca47e..b6078db 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -390,14 +390,17 @@ export async function processURLs( allKnowledge: KnowledgeItem[], allURLs: Record, visitedURLs: string[], + badURLs: string[], schemaGen: Schemas, question: string -): Promise<{ urlResults: any[], success: boolean }> { +): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> { // Skip if no URLs to process if (urls.length === 0) { - return {urlResults: [], success: false}; + return {urlResults: [], success: false, badURLs: []}; } + const badHostnames: string[] = []; + // Track the reading action const thisStep: VisitAction = { action: 'visit', @@ -455,12 +458,29 @@ export async function processURLs( }); return {url, result: response}; - } catch (error) { + } catch (error: any) { console.error('Error reading URL:', url, error); + badURLs.push(url); + // Extract hostname from the URL + if ( + (error?.name === 'ParamValidationError' && error.message?.includes('Domain')) || + (error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) || + error?.message?.includes("Couldn't resolve host name") || + error?.message?.includes("could not be resolved") + ) { + let hostname = ''; + try { + hostname = extractUrlParts(url).hostname; + } catch (e) { + console.error('Error parsing URL for hostname:', url, e); + } + badHostnames.push(hostname); + console.log(`Added ${hostname} to bad hostnames list`); + } return null; } finally { // Only add valid URLs to visitedURLs list - if (url && typeof url === 'string') { + if (url) { visitedURLs.push(url); } } @@ -470,8 +490,20 @@ export async function processURLs( // Filter out null results without changing the original array const validResults = urlResults.filter(Boolean); + // remove any URL with bad hostnames from allURLs + if (badHostnames.length > 0) { + Object.keys(allURLs).forEach(url => { + if (badHostnames.includes(extractUrlParts(url).hostname)) { + delete allURLs[url]; + console.log(`Removed ${url} from allURLs`); + } + } + ) + } + return { urlResults: validResults, - success: validResults.length > 0 + success: validResults.length > 0, + badURLs }; } \ No newline at end of file