mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: normalize url
This commit is contained in:
parent
1067a0c256
commit
5c36410b54
11
src/agent.ts
11
src/agent.ts
@ -403,6 +403,7 @@ export async function getResponse(question?: string,
|
|||||||
|
|
||||||
const allURLs: Record<string, SearchSnippet> = {};
|
const allURLs: Record<string, SearchSnippet> = {};
|
||||||
const visitedURLs: string[] = [];
|
const visitedURLs: string[] = [];
|
||||||
|
const badURLs: string[] = [];
|
||||||
const evaluationMetrics: Record<string, EvaluationType[]> = {};
|
const evaluationMetrics: Record<string, EvaluationType[]> = {};
|
||||||
// reserve the 10% final budget for the beast mode
|
// reserve the 10% final budget for the beast mode
|
||||||
const regularBudget = tokenBudget * 0.9;
|
const regularBudget = tokenBudget * 0.9;
|
||||||
@ -515,14 +516,13 @@ export async function getResponse(question?: string,
|
|||||||
allKnowledge,
|
allKnowledge,
|
||||||
allURLs,
|
allURLs,
|
||||||
visitedURLs,
|
visitedURLs,
|
||||||
|
badURLs,
|
||||||
SchemaGen,
|
SchemaGen,
|
||||||
currentQuestion
|
currentQuestion
|
||||||
);
|
);
|
||||||
|
|
||||||
// is this really required???
|
// remove references whose urls are in badURLs
|
||||||
// if (!evaluationMetrics[currentQuestion].includes('attribution')) {
|
thisStep.references = thisStep.references.filter(ref => !badURLs.includes(ref.url));
|
||||||
// evaluationMetrics[currentQuestion].push('attribution')
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
updateContext({
|
updateContext({
|
||||||
@ -764,6 +764,7 @@ You decided to think out of the box or cut from a completely different angle.
|
|||||||
allKnowledge,
|
allKnowledge,
|
||||||
allURLs,
|
allURLs,
|
||||||
visitedURLs,
|
visitedURLs,
|
||||||
|
badURLs,
|
||||||
SchemaGen,
|
SchemaGen,
|
||||||
currentQuestion
|
currentQuestion
|
||||||
);
|
);
|
||||||
@ -908,7 +909,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
result: thisStep,
|
result: thisStep,
|
||||||
context,
|
context,
|
||||||
visitedURLs: returnedURLs,
|
visitedURLs: returnedURLs,
|
||||||
readURLs: visitedURLs,
|
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
|
||||||
allURLs: weightedURLs.map(r => r.url)
|
allURLs: weightedURLs.map(r => r.url)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@ -390,14 +390,17 @@ export async function processURLs(
|
|||||||
allKnowledge: KnowledgeItem[],
|
allKnowledge: KnowledgeItem[],
|
||||||
allURLs: Record<string, SearchSnippet>,
|
allURLs: Record<string, SearchSnippet>,
|
||||||
visitedURLs: string[],
|
visitedURLs: string[],
|
||||||
|
badURLs: string[],
|
||||||
schemaGen: Schemas,
|
schemaGen: Schemas,
|
||||||
question: string
|
question: string
|
||||||
): Promise<{ urlResults: any[], success: boolean }> {
|
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
|
||||||
// Skip if no URLs to process
|
// Skip if no URLs to process
|
||||||
if (urls.length === 0) {
|
if (urls.length === 0) {
|
||||||
return {urlResults: [], success: false};
|
return {urlResults: [], success: false, badURLs: []};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const badHostnames: string[] = [];
|
||||||
|
|
||||||
// Track the reading action
|
// Track the reading action
|
||||||
const thisStep: VisitAction = {
|
const thisStep: VisitAction = {
|
||||||
action: 'visit',
|
action: 'visit',
|
||||||
@ -455,12 +458,29 @@ export async function processURLs(
|
|||||||
});
|
});
|
||||||
|
|
||||||
return {url, result: response};
|
return {url, result: response};
|
||||||
} catch (error) {
|
} catch (error: any) {
|
||||||
console.error('Error reading URL:', url, error);
|
console.error('Error reading URL:', url, error);
|
||||||
|
badURLs.push(url);
|
||||||
|
// Extract hostname from the URL
|
||||||
|
if (
|
||||||
|
(error?.name === 'ParamValidationError' && error.message?.includes('Domain')) ||
|
||||||
|
(error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) ||
|
||||||
|
error?.message?.includes("Couldn't resolve host name") ||
|
||||||
|
error?.message?.includes("could not be resolved")
|
||||||
|
) {
|
||||||
|
let hostname = '';
|
||||||
|
try {
|
||||||
|
hostname = extractUrlParts(url).hostname;
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error parsing URL for hostname:', url, e);
|
||||||
|
}
|
||||||
|
badHostnames.push(hostname);
|
||||||
|
console.log(`Added ${hostname} to bad hostnames list`);
|
||||||
|
}
|
||||||
return null;
|
return null;
|
||||||
} finally {
|
} finally {
|
||||||
// Only add valid URLs to visitedURLs list
|
// Only add valid URLs to visitedURLs list
|
||||||
if (url && typeof url === 'string') {
|
if (url) {
|
||||||
visitedURLs.push(url);
|
visitedURLs.push(url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -470,8 +490,20 @@ export async function processURLs(
|
|||||||
// Filter out null results without changing the original array
|
// Filter out null results without changing the original array
|
||||||
const validResults = urlResults.filter(Boolean);
|
const validResults = urlResults.filter(Boolean);
|
||||||
|
|
||||||
|
// remove any URL with bad hostnames from allURLs
|
||||||
|
if (badHostnames.length > 0) {
|
||||||
|
Object.keys(allURLs).forEach(url => {
|
||||||
|
if (badHostnames.includes(extractUrlParts(url).hostname)) {
|
||||||
|
delete allURLs[url];
|
||||||
|
console.log(`Removed ${url} from allURLs`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
urlResults: validResults,
|
urlResults: validResults,
|
||||||
success: validResults.length > 0
|
success: validResults.length > 0,
|
||||||
|
badURLs
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user