mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
fix: normalize url
This commit is contained in:
69
src/agent.ts
69
src/agent.ts
@@ -17,7 +17,7 @@ import {
|
|||||||
SearchResult,
|
SearchResult,
|
||||||
EvaluationType,
|
EvaluationType,
|
||||||
BoostedSearchSnippet,
|
BoostedSearchSnippet,
|
||||||
SearchSnippet, EvaluationResponse
|
SearchSnippet, EvaluationResponse, Reference
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {TrackerContext} from "./types";
|
import {TrackerContext} from "./types";
|
||||||
import {search} from "./tools/jina-search";
|
import {search} from "./tools/jina-search";
|
||||||
@@ -238,6 +238,31 @@ function updateContext(step: any) {
|
|||||||
allContext.push(step)
|
allContext.push(step)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function updateReferences(thisStep: AnswerAction, allURLs: Record<string, SearchSnippet>) {
|
||||||
|
thisStep.references = thisStep.references
|
||||||
|
?.filter(ref => ref?.url)
|
||||||
|
.map(ref => {
|
||||||
|
const normalizedUrl = normalizeUrl(ref.url);
|
||||||
|
if (!normalizedUrl) return null; // This causes the type error
|
||||||
|
|
||||||
|
return {
|
||||||
|
exactQuote: ref?.exactQuote || '',
|
||||||
|
title: allURLs[normalizedUrl]?.title || '',
|
||||||
|
url: normalizedUrl,
|
||||||
|
dateTime: ref?.dateTime || ''
|
||||||
|
};
|
||||||
|
})
|
||||||
|
.filter(Boolean) as Reference[]; // Add type assertion here
|
||||||
|
|
||||||
|
// parallel process guess all url datetime
|
||||||
|
await Promise.all((thisStep.references || []).filter(ref => !ref.dateTime)
|
||||||
|
.map(async ref => {
|
||||||
|
ref.dateTime = await getLastModified(ref.url) || '';
|
||||||
|
}));
|
||||||
|
|
||||||
|
console.log('Updated references:', thisStep.references);
|
||||||
|
}
|
||||||
|
|
||||||
async function executeSearchQueries(
|
async function executeSearchQueries(
|
||||||
keywordsQueries: any[],
|
keywordsQueries: any[],
|
||||||
context: TrackerContext,
|
context: TrackerContext,
|
||||||
@@ -297,12 +322,19 @@ async function executeSearchQueries(
|
|||||||
await sleep(STEP_SLEEP);
|
await sleep(STEP_SLEEP);
|
||||||
}
|
}
|
||||||
|
|
||||||
const minResults: SearchSnippet[] = (results).map(r => ({
|
const minResults: SearchSnippet[] = results
|
||||||
title: r.title,
|
.map(r => {
|
||||||
url: normalizeUrl('url' in r ? r.url : r.link),
|
const url = normalizeUrl('url' in r ? r.url : r.link);
|
||||||
description: 'description' in r ? r.description : r.snippet,
|
if (!url) return null; // Skip invalid URLs
|
||||||
weight: 1
|
|
||||||
}));
|
return {
|
||||||
|
title: r.title,
|
||||||
|
url,
|
||||||
|
description: 'description' in r ? r.description : r.snippet,
|
||||||
|
weight: 1
|
||||||
|
};
|
||||||
|
})
|
||||||
|
.filter(Boolean) as SearchSnippet[]; // Filter out null entries and assert type
|
||||||
|
|
||||||
minResults.forEach(r => {
|
minResults.forEach(r => {
|
||||||
addToAllURLs(r, allURLs);
|
addToAllURLs(r, allURLs);
|
||||||
@@ -477,24 +509,7 @@ export async function getResponse(question?: string,
|
|||||||
// execute the step and action
|
// execute the step and action
|
||||||
if (thisStep.action === 'answer' && thisStep.answer) {
|
if (thisStep.action === 'answer' && thisStep.answer) {
|
||||||
// normalize all references urls, add title to it
|
// normalize all references urls, add title to it
|
||||||
thisStep.references = thisStep.references?.filter(ref => ref?.url && typeof ref.url === 'string' && ref.url.startsWith('http'))
|
await updateReferences(thisStep, allURLs)
|
||||||
.map(ref => {
|
|
||||||
const normalizedUrl = ref?.url ? normalizeUrl(ref.url) : '';
|
|
||||||
return {
|
|
||||||
exactQuote: ref?.exactQuote || '',
|
|
||||||
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
|
|
||||||
url: normalizedUrl,
|
|
||||||
dateTime: ref?.dateTime || ''
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// parallel process guess all url datetime
|
|
||||||
await Promise.all(thisStep.references.filter(ref => !(ref?.dateTime))
|
|
||||||
.map(async ref => {
|
|
||||||
ref.dateTime = await getLastModified(ref.url) || ''
|
|
||||||
}));
|
|
||||||
|
|
||||||
console.log('Updated references:', thisStep.references)
|
|
||||||
|
|
||||||
if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) {
|
if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) {
|
||||||
// LLM is so confident and answer immediately, skip all evaluations
|
// LLM is so confident and answer immediately, skip all evaluations
|
||||||
@@ -746,9 +761,8 @@ You decided to think out of the box or cut from a completely different angle.
|
|||||||
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
|
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
|
||||||
// normalize URLs
|
// normalize URLs
|
||||||
thisStep.URLTargets = thisStep.URLTargets
|
thisStep.URLTargets = thisStep.URLTargets
|
||||||
.filter(url => url.startsWith('http'))
|
|
||||||
.map(url => normalizeUrl(url))
|
.map(url => normalizeUrl(url))
|
||||||
.filter(url => !visitedURLs.includes(url));
|
.filter(url => url && !visitedURLs.includes(url)) as string[];
|
||||||
|
|
||||||
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
|
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
|
||||||
|
|
||||||
@@ -883,6 +897,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
think: result.object.think,
|
think: result.object.think,
|
||||||
...result.object[result.object.action]
|
...result.object[result.object.action]
|
||||||
} as AnswerAction;
|
} as AnswerAction;
|
||||||
|
await updateReferences(thisStep, allURLs);
|
||||||
(thisStep as AnswerAction).isFinal = true;
|
(thisStep as AnswerAction).isFinal = true;
|
||||||
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ export function buildMdFromAnswer(answer: AnswerAction) {
|
|||||||
|
|
||||||
const citation = `[^${i + 1}]: ${cleanQuote}`;
|
const citation = `[^${i + 1}]: ${cleanQuote}`;
|
||||||
|
|
||||||
if (!ref.url?.startsWith('http')) return citation;
|
if (!ref.url) return citation;
|
||||||
|
|
||||||
const domainName = new URL(ref.url).hostname.replace('www.', '');
|
const domainName = new URL(ref.url).hostname.replace('www.', '');
|
||||||
return `${citation} [${domainName}](${ref.url})`;
|
return `${citation} [${domainName}](${ref.url})`;
|
||||||
|
|||||||
@@ -11,18 +11,19 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
|||||||
removeSessionIDs: true,
|
removeSessionIDs: true,
|
||||||
removeUTMParams: true,
|
removeUTMParams: true,
|
||||||
removeTrackingParams: true
|
removeTrackingParams: true
|
||||||
}): string {
|
}) {
|
||||||
if (!urlString?.trim()) {
|
|
||||||
throw new Error('Empty URL');
|
|
||||||
}
|
|
||||||
|
|
||||||
urlString = urlString.trim();
|
|
||||||
|
|
||||||
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
|
|
||||||
urlString = 'https://' + urlString;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
urlString = urlString.replace(/\s+/g, '').trim();
|
||||||
|
|
||||||
|
if (!urlString?.trim()) {
|
||||||
|
throw new Error('Empty URL');
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
|
||||||
|
urlString = 'https://' + urlString;
|
||||||
|
}
|
||||||
|
|
||||||
const url = new URL(urlString);
|
const url = new URL(urlString);
|
||||||
|
|
||||||
url.hostname = url.hostname.toLowerCase();
|
url.hostname = url.hostname.toLowerCase();
|
||||||
@@ -71,7 +72,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
|||||||
|
|
||||||
// Remove session IDs
|
// Remove session IDs
|
||||||
if (options.removeSessionIDs &&
|
if (options.removeSessionIDs &&
|
||||||
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -82,7 +83,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
|||||||
|
|
||||||
// Remove common tracking parameters
|
// Remove common tracking parameters
|
||||||
if (options.removeTrackingParams &&
|
if (options.removeTrackingParams &&
|
||||||
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,13 +133,14 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
|||||||
return normalizedUrl;
|
return normalizedUrl;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// Main URL parsing error - this one we should throw
|
// Main URL parsing error - this one we should throw
|
||||||
throw new Error(`Invalid URL "${urlString}": ${error}`);
|
console.error(`Invalid URL "${urlString}": ${error}`);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
|
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
|
||||||
return Object.entries(allURLs)
|
return Object.entries(allURLs)
|
||||||
.filter(([url, ]) => !visitedURLs.includes(url))
|
.filter(([url,]) => !visitedURLs.includes(url))
|
||||||
.map(([, result]) => result);
|
.map(([, result]) => result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -270,11 +272,12 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
|
|||||||
|
|
||||||
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
|
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
|
||||||
const nURL = normalizeUrl(r.url);
|
const nURL = normalizeUrl(r.url);
|
||||||
|
if (!nURL) return;
|
||||||
if (!allURLs[nURL]) {
|
if (!allURLs[nURL]) {
|
||||||
allURLs[nURL] = r;
|
allURLs[nURL] = r;
|
||||||
allURLs[nURL].weight = weightDelta;
|
allURLs[nURL].weight = weightDelta;
|
||||||
} else {
|
} else {
|
||||||
(allURLs[nURL].weight as number)+= weightDelta;
|
(allURLs[nURL].weight as number) += weightDelta;
|
||||||
const curDesc = allURLs[nURL].description;
|
const curDesc = allURLs[nURL].description;
|
||||||
allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
|
allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
|
||||||
}
|
}
|
||||||
@@ -337,8 +340,6 @@ export function sampleMultinomial<T>(items: [T, number][]): T | null {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches the last modified date for a URL using the datetime detection API
|
* Fetches the last modified date for a URL using the datetime detection API
|
||||||
* @param url The URL to check for last modified date
|
* @param url The URL to check for last modified date
|
||||||
@@ -370,22 +371,22 @@ export async function getLastModified(url: string): Promise<string | undefined>
|
|||||||
|
|
||||||
|
|
||||||
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
|
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
|
||||||
const hostnameMap: Record<string, number> = {};
|
const hostnameMap: Record<string, number> = {};
|
||||||
const filteredResults: BoostedSearchSnippet[] = [];
|
const filteredResults: BoostedSearchSnippet[] = [];
|
||||||
|
|
||||||
results.forEach((result) => {
|
results.forEach((result) => {
|
||||||
const hostname = extractUrlParts(result.url).hostname;
|
const hostname = extractUrlParts(result.url).hostname;
|
||||||
if (hostnameMap[hostname] === undefined) {
|
if (hostnameMap[hostname] === undefined) {
|
||||||
hostnameMap[hostname] = 0;
|
hostnameMap[hostname] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hostnameMap[hostname] < k) {
|
if (hostnameMap[hostname] < k) {
|
||||||
filteredResults.push(result);
|
filteredResults.push(result);
|
||||||
hostnameMap[hostname]++;
|
hostnameMap[hostname]++;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return filteredResults;
|
return filteredResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processURLs(
|
export async function processURLs(
|
||||||
@@ -396,10 +397,10 @@ export async function processURLs(
|
|||||||
visitedURLs: string[],
|
visitedURLs: string[],
|
||||||
schemaGen: Schemas,
|
schemaGen: Schemas,
|
||||||
question: string
|
question: string
|
||||||
): Promise<{urlResults: any[], success: boolean}> {
|
): Promise<{ urlResults: any[], success: boolean }> {
|
||||||
// Skip if no URLs to process
|
// Skip if no URLs to process
|
||||||
if (urls.length === 0) {
|
if (urls.length === 0) {
|
||||||
return { urlResults: [], success: false };
|
return {urlResults: [], success: false};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Track the reading action
|
// Track the reading action
|
||||||
@@ -414,7 +415,14 @@ export async function processURLs(
|
|||||||
const urlResults = await Promise.all(
|
const urlResults = await Promise.all(
|
||||||
urls.map(async url => {
|
urls.map(async url => {
|
||||||
try {
|
try {
|
||||||
url = normalizeUrl(url);
|
const normalizedUrl = normalizeUrl(url);
|
||||||
|
if (!normalizedUrl) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store normalized URL for consistent reference
|
||||||
|
url = normalizedUrl;
|
||||||
|
|
||||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
const {response} = await readUrl(url, true, context.tokenTracker);
|
||||||
const {data} = response;
|
const {data} = response;
|
||||||
const guessedTime = await getLastModified(url);
|
const guessedTime = await getLastModified(url);
|
||||||
@@ -422,7 +430,6 @@ export async function processURLs(
|
|||||||
console.log('Guessed time for', url, guessedTime);
|
console.log('Guessed time for', url, guessedTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Early return if no valid data
|
// Early return if no valid data
|
||||||
if (!data?.url || !data?.content) {
|
if (!data?.url || !data?.content) {
|
||||||
throw new Error('No content found');
|
throw new Error('No content found');
|
||||||
@@ -434,18 +441,20 @@ export async function processURLs(
|
|||||||
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
|
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
|
||||||
references: [data.url],
|
references: [data.url],
|
||||||
type: 'url',
|
type: 'url',
|
||||||
updated: guessedTime? formatDateBasedOnType(new Date(guessedTime), 'full'): undefined
|
updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
|
||||||
});
|
});
|
||||||
|
|
||||||
// Process page links
|
// Process page links
|
||||||
data.links?.forEach(link => {
|
data.links?.forEach(link => {
|
||||||
|
const nnUrl = normalizeUrl(link[1]);
|
||||||
|
if (!nnUrl) return;
|
||||||
const r: SearchSnippet = {
|
const r: SearchSnippet = {
|
||||||
title: link[0],
|
title: link[0],
|
||||||
url: normalizeUrl(link[1]),
|
url: nnUrl,
|
||||||
description: link[0],
|
description: link[0],
|
||||||
}
|
}
|
||||||
// in-page link has lower initial weight comparing to search links
|
// in-page link has lower initial weight comparing to search links
|
||||||
if (r.url && r.url.startsWith('http')) {
|
if (r.url) {
|
||||||
addToAllURLs(r, allURLs, 0.1);
|
addToAllURLs(r, allURLs, 0.1);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -455,13 +464,19 @@ export async function processURLs(
|
|||||||
console.error('Error reading URL:', url, error);
|
console.error('Error reading URL:', url, error);
|
||||||
return null;
|
return null;
|
||||||
} finally {
|
} finally {
|
||||||
visitedURLs.push(url);
|
// Only add valid URLs to visitedURLs list
|
||||||
|
if (url && typeof url === 'string') {
|
||||||
|
visitedURLs.push(url);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
).then(results => results.filter(Boolean));
|
);
|
||||||
|
|
||||||
|
// Filter out null results without changing the original array
|
||||||
|
const validResults = urlResults.filter(Boolean);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
urlResults,
|
urlResults: validResults,
|
||||||
success: urlResults.length > 0
|
success: validResults.length > 0
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user