mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-25 22:16:49 +08:00
521 lines
16 KiB
TypeScript
521 lines
16 KiB
TypeScript
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
|
|
import {getI18nText, smartMergeStrings} from "./text-tools";
|
|
import {rerankDocuments} from "../tools/jina-rerank";
|
|
import {readUrl} from "../tools/read";
|
|
import {Schemas} from "./schemas";
|
|
import {cherryPick} from "../tools/jina-latechunk";
|
|
import {formatDateBasedOnType} from "./date-tools";
|
|
|
|
export function normalizeUrl(urlString: string, debug = false, options = {
|
|
removeAnchors: true,
|
|
removeSessionIDs: true,
|
|
removeUTMParams: true,
|
|
removeTrackingParams: true
|
|
}) {
|
|
try {
|
|
urlString = urlString.replace(/\s+/g, '').trim();
|
|
|
|
if (!urlString?.trim()) {
|
|
throw new Error('Empty URL');
|
|
}
|
|
|
|
if (urlString.startsWith('https://google.com') || urlString.startsWith('https://www.google.com')) {
|
|
throw new Error('Google search link');
|
|
}
|
|
|
|
const url = new URL(urlString);
|
|
if (url.protocol !== 'http:' && url.protocol !== 'https:') {
|
|
throw new Error('Unsupported protocol');
|
|
}
|
|
|
|
url.hostname = url.hostname.toLowerCase();
|
|
if (url.hostname.startsWith('www.')) {
|
|
url.hostname = url.hostname.slice(4);
|
|
}
|
|
|
|
if ((url.protocol === 'http:' && url.port === '80') ||
|
|
(url.protocol === 'https:' && url.port === '443')) {
|
|
url.port = '';
|
|
}
|
|
|
|
// Path normalization with error tracking
|
|
url.pathname = url.pathname
|
|
.split('/')
|
|
.map(segment => {
|
|
try {
|
|
return decodeURIComponent(segment);
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
|
|
return segment;
|
|
}
|
|
})
|
|
.join('/')
|
|
.replace(/\/+/g, '/')
|
|
.replace(/\/+$/, '') || '/';
|
|
|
|
// Query parameter normalization with error details
|
|
const searchParams = new URLSearchParams(url.search);
|
|
const sortedParams = Array.from(searchParams.entries())
|
|
.map(([key, value]) => {
|
|
if (value === '') return [key, ''];
|
|
try {
|
|
const decodedValue = decodeURIComponent(value);
|
|
if (encodeURIComponent(decodedValue) === value) {
|
|
return [key, decodedValue];
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode query param ${key}=${value}`, e);
|
|
}
|
|
return [key, value];
|
|
})
|
|
// Filter out tracking, session and UTM parameters
|
|
.filter(([key]) => {
|
|
if (key === '') return false;
|
|
|
|
// Remove session IDs
|
|
if (options.removeSessionIDs &&
|
|
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
// Remove UTM parameters
|
|
if (options.removeUTMParams && /^utm_/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
// Remove common tracking parameters
|
|
if (options.removeTrackingParams &&
|
|
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
})
|
|
.sort(([keyA], [keyB]) => keyA.localeCompare(keyB));
|
|
|
|
url.search = new URLSearchParams(sortedParams).toString();
|
|
|
|
// Fragment (anchor) handling - remove completely if requested
|
|
if (options.removeAnchors) {
|
|
url.hash = '';
|
|
} else if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) {
|
|
url.hash = '';
|
|
} else if (url.hash) {
|
|
try {
|
|
const decodedHash = decodeURIComponent(url.hash.slice(1));
|
|
const encodedBack = encodeURIComponent(decodedHash);
|
|
// Only use decoded version if it's safe
|
|
if (encodedBack === url.hash.slice(1)) {
|
|
url.hash = '#' + decodedHash;
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
|
|
}
|
|
}
|
|
|
|
let normalizedUrl = url.toString();
|
|
|
|
// Remove trailing slash from paths that aren't just "/"
|
|
if (url.pathname.length > 1 && url.pathname.endsWith('/')) {
|
|
url.pathname = url.pathname.slice(0, -1);
|
|
}
|
|
|
|
// Final URL normalization with validation
|
|
try {
|
|
const decodedUrl = decodeURIComponent(normalizedUrl);
|
|
const encodedBack = encodeURIComponent(decodedUrl);
|
|
// Only use decoded version if it's safe
|
|
if (encodedBack === normalizedUrl) {
|
|
normalizedUrl = decodedUrl;
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error('Failed to decode final URL', e);
|
|
}
|
|
|
|
return normalizedUrl;
|
|
} catch (error) {
|
|
// Main URL parsing error - this one we should throw
|
|
console.error(`Invalid URL "${urlString}": ${error}`);
|
|
return;
|
|
}
|
|
}
|
|
|
|
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] {
|
|
return Object.entries(allURLs)
|
|
.filter(([url,]) => !visitedURLs.includes(url) && !badHostnames.includes(extractUrlParts(url).hostname))
|
|
.map(([, result]) => result);
|
|
}
|
|
|
|
|
|
// Function to extract hostname and path from a URL
|
|
const extractUrlParts = (urlStr: string) => {
|
|
try {
|
|
const url = new URL(urlStr);
|
|
return {
|
|
hostname: url.hostname,
|
|
path: url.pathname
|
|
};
|
|
} catch (e) {
|
|
console.error(`Error parsing URL: ${urlStr}`, e);
|
|
return {hostname: "", path: ""};
|
|
}
|
|
};
|
|
|
|
// Function to count occurrences of hostnames and paths
|
|
export const countUrlParts = (urlItems: SearchResult[]) => {
|
|
const hostnameCount: Record<string, number> = {};
|
|
const pathPrefixCount: Record<string, number> = {};
|
|
let totalUrls = 0;
|
|
|
|
urlItems.forEach(item => {
|
|
item = (item as { title: string; url: string; description: string; weight?: number })
|
|
if (!item || !item.url) return; // Skip invalid items
|
|
|
|
totalUrls++;
|
|
const {hostname, path} = extractUrlParts(item.url);
|
|
|
|
// Count hostnames
|
|
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
|
|
|
|
// Count path prefixes (segments)
|
|
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
|
pathSegments.forEach((segment, index) => {
|
|
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
|
pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
|
|
});
|
|
});
|
|
|
|
return {hostnameCount, pathPrefixCount, totalUrls};
|
|
};
|
|
|
|
// Calculate normalized frequency for boosting
|
|
const normalizeCount = (count: any, total: any) => {
|
|
return total > 0 ? count / total : 0;
|
|
};
|
|
|
|
// Calculate boosted weights
|
|
export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => {
|
|
// Default parameters for boosting - can be overridden
|
|
const {
|
|
freqFactor = 0.5, // How much to boost based on term frequency
|
|
hostnameBoostFactor = 0.5, // How much to boost based on hostname frequency
|
|
pathBoostFactor = 0.4, // How much to boost based on path frequency
|
|
decayFactor = 0.8, // Decay factor for longer paths (0-1)
|
|
jinaRerankFactor = 0.8, // How much to boost based on Jina reranking
|
|
minBoost = 0, // Minimum boost score
|
|
maxBoost = 5, // Maximum boost score cap
|
|
question = '', // Optional question for Jina reranking
|
|
boostHostnames = [], // Optional hostnames to boost
|
|
} = options;
|
|
|
|
// Count URL parts first
|
|
const counts = countUrlParts(urlItems);
|
|
const {hostnameCount, pathPrefixCount, totalUrls} = counts;
|
|
|
|
if (question.trim().length > 0) {
|
|
// get from jina rerank
|
|
rerankDocuments(question, urlItems.map(item => smartMergeStrings(item.title, item.description)), trackers.tokenTracker)
|
|
.then(({results}) => {
|
|
results.forEach(({index, relevance_score}) => {
|
|
(urlItems[index] as BoostedSearchSnippet).jinaRerankBoost = relevance_score * jinaRerankFactor;
|
|
});
|
|
})
|
|
}
|
|
|
|
|
|
return (urlItems as BoostedSearchSnippet[]).map(item => {
|
|
if (!item || !item.url) {
|
|
console.error('Skipping invalid item:', item);
|
|
return item; // Return unchanged
|
|
}
|
|
|
|
const {hostname, path} = extractUrlParts(item.url);
|
|
|
|
// Base weight from original
|
|
const freq = item.weight || 0; // Default to 1 if weight is missing
|
|
|
|
// Hostname boost (normalized by total URLs)
|
|
const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
|
|
const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1);
|
|
|
|
// Path boost (consider all path prefixes with decay for longer paths)
|
|
let pathBoost = 0;
|
|
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
|
pathSegments.forEach((segment, index) => {
|
|
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
|
const prefixCount = pathPrefixCount[prefix] || 0;
|
|
const prefixFreq = normalizeCount(prefixCount, totalUrls);
|
|
|
|
// Apply decay factor based on path depth
|
|
const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
|
|
pathBoost += decayedBoost;
|
|
});
|
|
|
|
const freqBoost = freq / totalUrls * freqFactor;
|
|
const jinaRerankBoost = item.jinaRerankBoost || 0;
|
|
// Calculate new weight with clamping
|
|
const finalScore = Math.min(
|
|
Math.max(
|
|
hostnameBoost
|
|
+ pathBoost
|
|
+ freqBoost
|
|
+ jinaRerankBoost, minBoost),
|
|
maxBoost);
|
|
|
|
return {
|
|
...item,
|
|
freqBoost,
|
|
hostnameBoost,
|
|
pathBoost,
|
|
jinaRerankBoost,
|
|
finalScore
|
|
} as BoostedSearchSnippet;
|
|
}).sort((a, b) => b.finalScore - a.finalScore);
|
|
};
|
|
|
|
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
|
|
const nURL = normalizeUrl(r.url);
|
|
if (!nURL) return 0;
|
|
if (!allURLs[nURL]) {
|
|
allURLs[nURL] = r;
|
|
allURLs[nURL].weight = weightDelta;
|
|
return 1;
|
|
} else {
|
|
(allURLs[nURL].weight as number) += weightDelta;
|
|
const curDesc = allURLs[nURL].description;
|
|
allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
|
|
if (!allURLs || allURLs.length === 0) return '';
|
|
|
|
return (allURLs)
|
|
.map(r => {
|
|
const merged = smartMergeStrings(r.title, r.description);
|
|
return {
|
|
url: r.url,
|
|
score: r.finalScore,
|
|
merged
|
|
};
|
|
})
|
|
.filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
|
|
.sort((a, b) => (b.score || 0) - (a.score || 0))
|
|
.slice(0, maxURLs)
|
|
.map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
|
|
.join('\n');
|
|
}
|
|
|
|
|
|
/**
|
|
* Draw a sample from a multinomial distribution
|
|
* @param items Array of [name, weight] tuples
|
|
* @returns A randomly selected item based on the weights, or null if array is empty
|
|
*/
|
|
export function sampleMultinomial<T>(items: [T, number][]): T | null {
|
|
// Handle empty array
|
|
if (!items || items.length === 0) {
|
|
return null;
|
|
}
|
|
|
|
// Calculate total weight
|
|
const totalWeight = items.reduce((sum, [, weight]) => sum + weight, 0);
|
|
|
|
// Handle case where all weights are 0
|
|
if (totalWeight === 0) {
|
|
return null;
|
|
}
|
|
|
|
// Generate a random number between 0 and total weight
|
|
const randValue = Math.random() * totalWeight;
|
|
|
|
// Find the item corresponding to the random value
|
|
let cumulativeWeight = 0;
|
|
|
|
for (const [item, weight] of items) {
|
|
cumulativeWeight += weight;
|
|
if (randValue <= cumulativeWeight) {
|
|
return item;
|
|
}
|
|
}
|
|
|
|
// Fallback (should rarely happen due to floating point precision)
|
|
return items[items.length - 1][0];
|
|
}
|
|
|
|
|
|
/**
|
|
* Fetches the last modified date for a URL using the datetime detection API
|
|
* @param url The URL to check for last modified date
|
|
* @returns Promise containing the last modified date or null if not found
|
|
*/
|
|
export async function getLastModified(url: string): Promise<string | undefined> {
|
|
try {
|
|
// Call the API with proper encoding
|
|
const apiUrl = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`;
|
|
const response = await fetch(apiUrl);
|
|
|
|
if (!response.ok) {
|
|
throw new Error(`API returned ${response.status}`);
|
|
}
|
|
|
|
const data = await response.json();
|
|
|
|
// Return the bestGuess date if available
|
|
if (data.bestGuess && data.confidence >= 70) {
|
|
return data.bestGuess;
|
|
}
|
|
|
|
return undefined;
|
|
} catch (error) {
|
|
console.error('Failed to fetch last modified date:', error);
|
|
return undefined;
|
|
}
|
|
}
|
|
|
|
|
|
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
|
|
const hostnameMap: Record<string, number> = {};
|
|
const filteredResults: BoostedSearchSnippet[] = [];
|
|
|
|
results.forEach((result) => {
|
|
const hostname = extractUrlParts(result.url).hostname;
|
|
if (hostnameMap[hostname] === undefined) {
|
|
hostnameMap[hostname] = 0;
|
|
}
|
|
|
|
if (hostnameMap[hostname] < k) {
|
|
filteredResults.push(result);
|
|
hostnameMap[hostname]++;
|
|
}
|
|
});
|
|
|
|
return filteredResults;
|
|
}
|
|
|
|
export async function processURLs(
|
|
urls: string[],
|
|
context: TrackerContext,
|
|
allKnowledge: KnowledgeItem[],
|
|
allURLs: Record<string, SearchSnippet>,
|
|
visitedURLs: string[],
|
|
badURLs: string[],
|
|
schemaGen: Schemas,
|
|
question: string
|
|
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
|
|
// Skip if no URLs to process
|
|
if (urls.length === 0) {
|
|
return {urlResults: [], success: false, badURLs: []};
|
|
}
|
|
|
|
const badHostnames: string[] = [];
|
|
|
|
// Track the reading action
|
|
const thisStep: VisitAction = {
|
|
action: 'visit',
|
|
think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
|
|
URLTargets: urls
|
|
}
|
|
context.actionTracker.trackAction({thisStep})
|
|
|
|
// Process each URL in parallel
|
|
const urlResults = await Promise.all(
|
|
urls.map(async url => {
|
|
try {
|
|
const normalizedUrl = normalizeUrl(url);
|
|
if (!normalizedUrl) {
|
|
return null;
|
|
}
|
|
|
|
// Store normalized URL for consistent reference
|
|
url = normalizedUrl;
|
|
|
|
const {response} = await readUrl(url, true, context.tokenTracker);
|
|
const {data} = response;
|
|
const guessedTime = await getLastModified(url);
|
|
if (guessedTime) {
|
|
console.log('Guessed time for', url, guessedTime);
|
|
}
|
|
|
|
// Early return if no valid data
|
|
if (!data?.url || !data?.content) {
|
|
throw new Error('No content found');
|
|
}
|
|
|
|
// Add to knowledge base
|
|
allKnowledge.push({
|
|
question: `What do expert say about "${question}"?`,
|
|
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
|
|
references: [data.url],
|
|
type: 'url',
|
|
updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
|
|
});
|
|
|
|
// Process page links
|
|
data.links?.forEach(link => {
|
|
const nnUrl = normalizeUrl(link[1]);
|
|
if (!nnUrl) return;
|
|
const r: SearchSnippet = {
|
|
title: link[0],
|
|
url: nnUrl,
|
|
description: link[0],
|
|
}
|
|
// in-page link has lower initial weight comparing to search links
|
|
if (r.url) {
|
|
addToAllURLs(r, allURLs, 0.1);
|
|
}
|
|
});
|
|
|
|
return {url, result: response};
|
|
} catch (error: any) {
|
|
console.error('Error reading URL:', url, error);
|
|
badURLs.push(url);
|
|
// Extract hostname from the URL
|
|
if (
|
|
(error?.name === 'ParamValidationError' && error.message?.includes('Domain')) ||
|
|
(error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) ||
|
|
error?.message?.includes("Couldn't resolve host name") ||
|
|
error?.message?.includes("could not be resolved") ||
|
|
error?.message?.includes("ERR_CERT_COMMON_NAME_INVALID") ||
|
|
error?.message?.includes("ERR_CONNECTION_REFUSED")
|
|
) {
|
|
let hostname = '';
|
|
try {
|
|
hostname = extractUrlParts(url).hostname;
|
|
} catch (e) {
|
|
console.error('Error parsing URL for hostname:', url, e);
|
|
}
|
|
badHostnames.push(hostname);
|
|
console.log(`Added ${hostname} to bad hostnames list`);
|
|
}
|
|
return null;
|
|
} finally {
|
|
// Only add valid URLs to visitedURLs list
|
|
if (url) {
|
|
visitedURLs.push(url);
|
|
}
|
|
}
|
|
})
|
|
);
|
|
|
|
// Filter out null results without changing the original array
|
|
const validResults = urlResults.filter(Boolean);
|
|
|
|
// remove any URL with bad hostnames from allURLs
|
|
if (badHostnames.length > 0) {
|
|
Object.keys(allURLs).forEach(url => {
|
|
if (badHostnames.includes(extractUrlParts(url).hostname)) {
|
|
delete allURLs[url];
|
|
console.log(`Removed ${url} from allURLs`);
|
|
}
|
|
}
|
|
)
|
|
}
|
|
|
|
return {
|
|
urlResults: validResults,
|
|
success: validResults.length > 0,
|
|
badURLs
|
|
};
|
|
} |