fix: normalize url

This commit is contained in:
Han Xiao 2025-03-17 14:23:02 +08:00
parent 1067a0c256
commit 5c36410b54
2 changed files with 43 additions and 10 deletions

View File

@@ -403,6 +403,7 @@ export async function getResponse(question?: string,
const allURLs: Record<string, SearchSnippet> = {}; const allURLs: Record<string, SearchSnippet> = {};
const visitedURLs: string[] = []; const visitedURLs: string[] = [];
const badURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationType[]> = {}; const evaluationMetrics: Record<string, EvaluationType[]> = {};
// reserve the 10% final budget for the beast mode // reserve the 10% final budget for the beast mode
const regularBudget = tokenBudget * 0.9; const regularBudget = tokenBudget * 0.9;
@@ -515,14 +516,13 @@ export async function getResponse(question?: string,
allKnowledge, allKnowledge,
allURLs, allURLs,
visitedURLs, visitedURLs,
badURLs,
SchemaGen, SchemaGen,
currentQuestion currentQuestion
); );
// is this really required??? // remove references whose urls are in badURLs
// if (!evaluationMetrics[currentQuestion].includes('attribution')) { thisStep.references = thisStep.references.filter(ref => !badURLs.includes(ref.url));
// evaluationMetrics[currentQuestion].push('attribution')
// }
} }
updateContext({ updateContext({
@@ -764,6 +764,7 @@ You decided to think out of the box or cut from a completely different angle.
allKnowledge, allKnowledge,
allURLs, allURLs,
visitedURLs, visitedURLs,
badURLs,
SchemaGen, SchemaGen,
currentQuestion currentQuestion
); );
@@ -908,7 +909,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
result: thisStep, result: thisStep,
context, context,
visitedURLs: returnedURLs, visitedURLs: returnedURLs,
readURLs: visitedURLs, readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
allURLs: weightedURLs.map(r => r.url) allURLs: weightedURLs.map(r => r.url)
}; };
} }

View File

@@ -390,14 +390,17 @@ export async function processURLs(
allKnowledge: KnowledgeItem[], allKnowledge: KnowledgeItem[],
allURLs: Record<string, SearchSnippet>, allURLs: Record<string, SearchSnippet>,
visitedURLs: string[], visitedURLs: string[],
badURLs: string[],
schemaGen: Schemas, schemaGen: Schemas,
question: string question: string
): Promise<{ urlResults: any[], success: boolean }> { ): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
// Skip if no URLs to process // Skip if no URLs to process
if (urls.length === 0) { if (urls.length === 0) {
return {urlResults: [], success: false}; return {urlResults: [], success: false, badURLs: []};
} }
const badHostnames: string[] = [];
// Track the reading action // Track the reading action
const thisStep: VisitAction = { const thisStep: VisitAction = {
action: 'visit', action: 'visit',
@@ -455,12 +458,29 @@ export async function processURLs(
}); });
return {url, result: response}; return {url, result: response};
} catch (error) { } catch (error: any) {
console.error('Error reading URL:', url, error); console.error('Error reading URL:', url, error);
badURLs.push(url);
// Extract hostname from the URL
if (
(error?.name === 'ParamValidationError' && error.message?.includes('Domain')) ||
(error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) ||
error?.message?.includes("Couldn't resolve host name") ||
error?.message?.includes("could not be resolved")
) {
let hostname = '';
try {
hostname = extractUrlParts(url).hostname;
} catch (e) {
console.error('Error parsing URL for hostname:', url, e);
}
badHostnames.push(hostname);
console.log(`Added ${hostname} to bad hostnames list`);
}
return null; return null;
} finally { } finally {
// Only add valid URLs to visitedURLs list // Only add valid URLs to visitedURLs list
if (url && typeof url === 'string') { if (url) {
visitedURLs.push(url); visitedURLs.push(url);
} }
} }
@@ -470,8 +490,20 @@ export async function processURLs(
// Filter out null results without changing the original array // Filter out null results without changing the original array
const validResults = urlResults.filter(Boolean); const validResults = urlResults.filter(Boolean);
// remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => {
if (badHostnames.includes(extractUrlParts(url).hostname)) {
delete allURLs[url];
console.log(`Removed ${url} from allURLs`);
}
}
)
}
return { return {
urlResults: validResults, urlResults: validResults,
success: validResults.length > 0 success: validResults.length > 0,
badURLs
}; };
} }