mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: normalize url
This commit is contained in:
parent
92d3aaae9e
commit
f9cb542dd0
69
src/agent.ts
69
src/agent.ts
@ -17,7 +17,7 @@ import {
|
||||
SearchResult,
|
||||
EvaluationType,
|
||||
BoostedSearchSnippet,
|
||||
SearchSnippet, EvaluationResponse
|
||||
SearchSnippet, EvaluationResponse, Reference
|
||||
} from "./types";
|
||||
import {TrackerContext} from "./types";
|
||||
import {search} from "./tools/jina-search";
|
||||
@ -238,6 +238,31 @@ function updateContext(step: any) {
|
||||
allContext.push(step)
|
||||
}
|
||||
|
||||
/**
 * Normalizes the references attached to an answer step, in place.
 *
 * Every reference that carries a URL is passed through `normalizeUrl`;
 * references without a URL, or whose URL cannot be normalized, are dropped.
 * The title is looked up in `allURLs` under the normalized URL, and any
 * reference still lacking a `dateTime` gets one from `getLastModified`.
 *
 * `thisStep.references` is always an array afterwards (possibly empty), so
 * callers may read `.length` on it without a guard.
 *
 * @param thisStep Answer step whose `references` array is replaced.
 * @param allURLs  Known snippets keyed by URL, used for the title lookup.
 * @returns A promise that resolves once every date lookup has finished.
 */
async function updateReferences(thisStep: AnswerAction, allURLs: Record<string, SearchSnippet>) {
  const references: Reference[] = [];

  for (const ref of thisStep.references ?? []) {
    if (!ref?.url) continue;

    // normalizeUrl yields a falsy value for URLs it cannot parse; skip those.
    const normalizedUrl = normalizeUrl(ref.url);
    if (!normalizedUrl) continue;

    references.push({
      exactQuote: ref.exactQuote || '',
      title: allURLs[normalizedUrl]?.title || '',
      url: normalizedUrl,
      dateTime: ref.dateTime || ''
    });
  }

  thisStep.references = references;

  // Guess the missing dates for all references in parallel.
  await Promise.all(
    references
      .filter(ref => !ref.dateTime)
      .map(async ref => {
        ref.dateTime = await getLastModified(ref.url) || '';
      })
  );

  console.log('Updated references:', thisStep.references);
}
|
||||
|
||||
async function executeSearchQueries(
|
||||
keywordsQueries: any[],
|
||||
context: TrackerContext,
|
||||
@ -297,12 +322,19 @@ async function executeSearchQueries(
|
||||
await sleep(STEP_SLEEP);
|
||||
}
|
||||
|
||||
const minResults: SearchSnippet[] = (results).map(r => ({
|
||||
title: r.title,
|
||||
url: normalizeUrl('url' in r ? r.url : r.link),
|
||||
description: 'description' in r ? r.description : r.snippet,
|
||||
weight: 1
|
||||
}));
|
||||
const minResults: SearchSnippet[] = results
|
||||
.map(r => {
|
||||
const url = normalizeUrl('url' in r ? r.url : r.link);
|
||||
if (!url) return null; // Skip invalid URLs
|
||||
|
||||
return {
|
||||
title: r.title,
|
||||
url,
|
||||
description: 'description' in r ? r.description : r.snippet,
|
||||
weight: 1
|
||||
};
|
||||
})
|
||||
.filter(Boolean) as SearchSnippet[]; // Filter out null entries and assert type
|
||||
|
||||
minResults.forEach(r => {
|
||||
addToAllURLs(r, allURLs);
|
||||
@ -477,24 +509,7 @@ export async function getResponse(question?: string,
|
||||
// execute the step and action
|
||||
if (thisStep.action === 'answer' && thisStep.answer) {
|
||||
// normalize all references urls, add title to it
|
||||
thisStep.references = thisStep.references?.filter(ref => ref?.url && typeof ref.url === 'string' && ref.url.startsWith('http'))
|
||||
.map(ref => {
|
||||
const normalizedUrl = ref?.url ? normalizeUrl(ref.url) : '';
|
||||
return {
|
||||
exactQuote: ref?.exactQuote || '',
|
||||
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
|
||||
url: normalizedUrl,
|
||||
dateTime: ref?.dateTime || ''
|
||||
}
|
||||
});
|
||||
|
||||
// parallel process guess all url datetime
|
||||
await Promise.all(thisStep.references.filter(ref => !(ref?.dateTime))
|
||||
.map(async ref => {
|
||||
ref.dateTime = await getLastModified(ref.url) || ''
|
||||
}));
|
||||
|
||||
console.log('Updated references:', thisStep.references)
|
||||
await updateReferences(thisStep, allURLs)
|
||||
|
||||
if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) {
|
||||
// LLM is so confident and answer immediately, skip all evaluations
|
||||
@ -746,9 +761,8 @@ You decided to think out of the box or cut from a completely different angle.
|
||||
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
|
||||
// normalize URLs
|
||||
thisStep.URLTargets = thisStep.URLTargets
|
||||
.filter(url => url.startsWith('http'))
|
||||
.map(url => normalizeUrl(url))
|
||||
.filter(url => !visitedURLs.includes(url));
|
||||
.filter(url => url && !visitedURLs.includes(url)) as string[];
|
||||
|
||||
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
|
||||
|
||||
@ -883,6 +897,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
think: result.object.think,
|
||||
...result.object[result.object.action]
|
||||
} as AnswerAction;
|
||||
await updateReferences(thisStep, allURLs);
|
||||
(thisStep as AnswerAction).isFinal = true;
|
||||
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
||||
}
|
||||
|
||||
@ -17,7 +17,7 @@ export function buildMdFromAnswer(answer: AnswerAction) {
|
||||
|
||||
const citation = `[^${i + 1}]: ${cleanQuote}`;
|
||||
|
||||
if (!ref.url?.startsWith('http')) return citation;
|
||||
if (!ref.url) return citation;
|
||||
|
||||
const domainName = new URL(ref.url).hostname.replace('www.', '');
|
||||
return `${citation} [${domainName}](${ref.url})`;
|
||||
|
||||
@ -11,18 +11,19 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
removeSessionIDs: true,
|
||||
removeUTMParams: true,
|
||||
removeTrackingParams: true
|
||||
}): string {
|
||||
if (!urlString?.trim()) {
|
||||
throw new Error('Empty URL');
|
||||
}
|
||||
|
||||
urlString = urlString.trim();
|
||||
|
||||
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
|
||||
urlString = 'https://' + urlString;
|
||||
}
|
||||
|
||||
}) {
|
||||
try {
|
||||
urlString = urlString.replace(/\s+/g, '').trim();
|
||||
|
||||
if (!urlString?.trim()) {
|
||||
throw new Error('Empty URL');
|
||||
}
|
||||
|
||||
|
||||
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
|
||||
urlString = 'https://' + urlString;
|
||||
}
|
||||
|
||||
const url = new URL(urlString);
|
||||
|
||||
url.hostname = url.hostname.toLowerCase();
|
||||
@ -71,7 +72,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
|
||||
// Remove session IDs
|
||||
if (options.removeSessionIDs &&
|
||||
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
||||
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -82,7 +83,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
|
||||
// Remove common tracking parameters
|
||||
if (options.removeTrackingParams &&
|
||||
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
||||
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -132,13 +133,14 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
return normalizedUrl;
|
||||
} catch (error) {
|
||||
// Main URL parsing error - this one we should throw
|
||||
throw new Error(`Invalid URL "${urlString}": ${error}`);
|
||||
console.error(`Invalid URL "${urlString}": ${error}`);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the snippets from `allURLs` that have not been visited yet.
 *
 * A snippet is kept when its key in `allURLs` is not contained in
 * `visitedURLs`. The order of the result follows `Object.entries(allURLs)`.
 *
 * @param allURLs     Snippets keyed by URL.
 * @param visitedURLs URLs to exclude; compared against the keys of `allURLs`.
 * @returns The snippets whose key is absent from `visitedURLs`.
 */
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
  // Build the lookup once so each membership test is O(1) instead of a scan.
  const visited = new Set(visitedURLs);

  return Object.entries(allURLs)
    .filter(([url]) => !visited.has(url))
    .map(([, result]) => result);
}
|
||||
|
||||
@ -270,11 +272,12 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
|
||||
|
||||
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
|
||||
const nURL = normalizeUrl(r.url);
|
||||
if (!nURL) return;
|
||||
if (!allURLs[nURL]) {
|
||||
allURLs[nURL] = r;
|
||||
allURLs[nURL].weight = weightDelta;
|
||||
} else {
|
||||
(allURLs[nURL].weight as number)+= weightDelta;
|
||||
(allURLs[nURL].weight as number) += weightDelta;
|
||||
const curDesc = allURLs[nURL].description;
|
||||
allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
|
||||
}
|
||||
@ -337,8 +340,6 @@ export function sampleMultinomial<T>(items: [T, number][]): T | null {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Fetches the last modified date for a URL using the datetime detection API
|
||||
* @param url The URL to check for last modified date
|
||||
@ -370,22 +371,22 @@ export async function getLastModified(url: string): Promise<string | undefined>
|
||||
|
||||
|
||||
/**
 * Keeps at most `k` results per hostname, preserving input order.
 *
 * The hostname of each result is taken from `extractUrlParts(result.url)`.
 * A result is kept while fewer than `k` results with the same hostname have
 * been kept so far; later ones for that hostname are dropped.
 *
 * @param results Snippets to thin out; the array itself is not modified.
 * @param k       Maximum number of results to keep for any single hostname.
 * @returns A new array holding the retained results in their original order.
 */
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
  // A Map rather than a plain object: a hostname such as "constructor" would
  // otherwise hit an inherited Object.prototype member and never be counted.
  const hostnameCounts = new Map<string, number>();
  const filteredResults: BoostedSearchSnippet[] = [];

  for (const result of results) {
    const hostname = extractUrlParts(result.url).hostname;
    const keptSoFar = hostnameCounts.get(hostname) ?? 0;

    if (keptSoFar < k) {
      filteredResults.push(result);
      hostnameCounts.set(hostname, keptSoFar + 1);
    }
  }

  return filteredResults;
}
|
||||
|
||||
export async function processURLs(
|
||||
@ -396,10 +397,10 @@ export async function processURLs(
|
||||
visitedURLs: string[],
|
||||
schemaGen: Schemas,
|
||||
question: string
|
||||
): Promise<{urlResults: any[], success: boolean}> {
|
||||
): Promise<{ urlResults: any[], success: boolean }> {
|
||||
// Skip if no URLs to process
|
||||
if (urls.length === 0) {
|
||||
return { urlResults: [], success: false };
|
||||
return {urlResults: [], success: false};
|
||||
}
|
||||
|
||||
// Track the reading action
|
||||
@ -414,7 +415,14 @@ export async function processURLs(
|
||||
const urlResults = await Promise.all(
|
||||
urls.map(async url => {
|
||||
try {
|
||||
url = normalizeUrl(url);
|
||||
const normalizedUrl = normalizeUrl(url);
|
||||
if (!normalizedUrl) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Store normalized URL for consistent reference
|
||||
url = normalizedUrl;
|
||||
|
||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
||||
const {data} = response;
|
||||
const guessedTime = await getLastModified(url);
|
||||
@ -422,7 +430,6 @@ export async function processURLs(
|
||||
console.log('Guessed time for', url, guessedTime);
|
||||
}
|
||||
|
||||
|
||||
// Early return if no valid data
|
||||
if (!data?.url || !data?.content) {
|
||||
throw new Error('No content found');
|
||||
@ -434,18 +441,20 @@ export async function processURLs(
|
||||
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
|
||||
references: [data.url],
|
||||
type: 'url',
|
||||
updated: guessedTime? formatDateBasedOnType(new Date(guessedTime), 'full'): undefined
|
||||
updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
|
||||
});
|
||||
|
||||
// Process page links
|
||||
data.links?.forEach(link => {
|
||||
const nnUrl = normalizeUrl(link[1]);
|
||||
if (!nnUrl) return;
|
||||
const r: SearchSnippet = {
|
||||
title: link[0],
|
||||
url: normalizeUrl(link[1]),
|
||||
url: nnUrl,
|
||||
description: link[0],
|
||||
}
|
||||
// in-page link has lower initial weight comparing to search links
|
||||
if (r.url && r.url.startsWith('http')) {
|
||||
if (r.url) {
|
||||
addToAllURLs(r, allURLs, 0.1);
|
||||
}
|
||||
});
|
||||
@ -455,13 +464,19 @@ export async function processURLs(
|
||||
console.error('Error reading URL:', url, error);
|
||||
return null;
|
||||
} finally {
|
||||
visitedURLs.push(url);
|
||||
// Only add valid URLs to visitedURLs list
|
||||
if (url && typeof url === 'string') {
|
||||
visitedURLs.push(url);
|
||||
}
|
||||
}
|
||||
})
|
||||
).then(results => results.filter(Boolean));
|
||||
);
|
||||
|
||||
// Filter out null results without changing the original array
|
||||
const validResults = urlResults.filter(Boolean);
|
||||
|
||||
return {
|
||||
urlResults,
|
||||
success: urlResults.length > 0
|
||||
urlResults: validResults,
|
||||
success: validResults.length > 0
|
||||
};
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user