fix: normalize url

This commit is contained in:
Han Xiao 2025-03-17 14:23:02 +08:00
parent 1067a0c256
commit 5c36410b54
2 changed files with 43 additions and 10 deletions

View File

@ -403,6 +403,7 @@ export async function getResponse(question?: string,
const allURLs: Record<string, SearchSnippet> = {};
const visitedURLs: string[] = [];
const badURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationType[]> = {};
// reserve the final 10% of the token budget for beast mode
const regularBudget = tokenBudget * 0.9;
@ -515,14 +516,13 @@ export async function getResponse(question?: string,
allKnowledge,
allURLs,
visitedURLs,
badURLs,
SchemaGen,
currentQuestion
);
// is this really required???
// if (!evaluationMetrics[currentQuestion].includes('attribution')) {
// evaluationMetrics[currentQuestion].push('attribution')
// }
// remove references whose urls are in badURLs
thisStep.references = thisStep.references.filter(ref => !badURLs.includes(ref.url));
}
updateContext({
@ -764,6 +764,7 @@ You decided to think out of the box or cut from a completely different angle.
allKnowledge,
allURLs,
visitedURLs,
badURLs,
SchemaGen,
currentQuestion
);
@ -908,7 +909,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
result: thisStep,
context,
visitedURLs: returnedURLs,
readURLs: visitedURLs,
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
allURLs: weightedURLs.map(r => r.url)
};
}

View File

@ -390,14 +390,17 @@ export async function processURLs(
allKnowledge: KnowledgeItem[],
allURLs: Record<string, SearchSnippet>,
visitedURLs: string[],
badURLs: string[],
schemaGen: Schemas,
question: string
): Promise<{ urlResults: any[], success: boolean }> {
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
// Skip if no URLs to process
if (urls.length === 0) {
return {urlResults: [], success: false};
return {urlResults: [], success: false, badURLs: []};
}
const badHostnames: string[] = [];
// Track the reading action
const thisStep: VisitAction = {
action: 'visit',
@ -455,12 +458,29 @@ export async function processURLs(
});
return {url, result: response};
} catch (error) {
} catch (error: any) {
console.error('Error reading URL:', url, error);
badURLs.push(url);
// If the failure is a domain-validation or host-resolution error, record the URL's hostname as bad
if (
(error?.name === 'ParamValidationError' && error.message?.includes('Domain')) ||
(error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) ||
error?.message?.includes("Couldn't resolve host name") ||
error?.message?.includes("could not be resolved")
) {
let hostname = '';
try {
hostname = extractUrlParts(url).hostname;
} catch (e) {
console.error('Error parsing URL for hostname:', url, e);
}
badHostnames.push(hostname);
console.log(`Added ${hostname} to bad hostnames list`);
}
return null;
} finally {
// Only add valid URLs to visitedURLs list
if (url && typeof url === 'string') {
if (url) {
visitedURLs.push(url);
}
}
@ -470,8 +490,20 @@ export async function processURLs(
// Filter out null results without changing the original array
const validResults = urlResults.filter(Boolean);
// remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => {
if (badHostnames.includes(extractUrlParts(url).hostname)) {
delete allURLs[url];
console.log(`Removed ${url} from allURLs`);
}
}
)
}
return {
urlResults: validResults,
success: validResults.length > 0
success: validResults.length > 0,
badURLs
};
}