// node-DeepResearch/src/utils/url-tools.ts
// 2025-03-18 11:24:53 +08:00
//
// 521 lines
// 16 KiB
// TypeScript

import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
import {getI18nText, smartMergeStrings} from "./text-tools";
import {rerankDocuments} from "../tools/jina-rerank";
import {readUrl} from "../tools/read";
import {Schemas} from "./schemas";
import {cherryPick} from "../tools/jina-latechunk";
import {formatDateBasedOnType} from "./date-tools";
/**
 * Canonicalizes an http(s) URL so that equivalent links collapse to the same string.
 *
 * Steps, in order: strip all whitespace; reject empty input, Google links and
 * non-http(s) protocols; lowercase the hostname and drop a leading "www."; drop default
 * ports; percent-decode path segments, collapse repeated slashes and strip trailing
 * slashes; filter and sort query parameters; remove or tidy the fragment.
 *
 * @param urlString Raw URL to normalize.
 * @param debug When true, failures to decode an individual part are logged via console.error.
 * @param options Flags selecting which parts are stripped. A caller-supplied object replaces
 *   the defaults wholesale, so any flag it omits is treated as false.
 * @returns The normalized URL, or undefined when the input is rejected or cannot be parsed
 *   (the reason is logged via console.error).
 */
export function normalizeUrl(urlString: string, debug = false, options = {
  removeAnchors: true,
  removeSessionIDs: true,
  removeUTMParams: true,
  removeTrackingParams: true
}): string | undefined {
  try {
    // Removing every whitespace run also covers leading/trailing whitespace.
    urlString = urlString.replace(/\s+/g, '');
    if (!urlString) {
      throw new Error('Empty URL');
    }
    if (urlString.startsWith('https://google.com') || urlString.startsWith('https://www.google.com')) {
      throw new Error('Google search link');
    }

    const url = new URL(urlString);
    if (url.protocol !== 'http:' && url.protocol !== 'https:') {
      throw new Error('Unsupported protocol');
    }

    url.hostname = url.hostname.toLowerCase();
    if (url.hostname.startsWith('www.')) {
      url.hostname = url.hostname.slice(4);
    }

    if ((url.protocol === 'http:' && url.port === '80') ||
      (url.protocol === 'https:' && url.port === '443')) {
      url.port = '';
    }

    // Path: decode each segment, collapse "//" runs and strip trailing slashes
    // (a bare root stays "/"). No later trailing-slash pass is needed.
    // NOTE(review): decoding turns an encoded "%2F" into a real separator and "%25" into a
    // bare "%", so such paths do not round-trip; confirm this lossy behaviour is intended.
    url.pathname = url.pathname
      .split('/')
      .map(segment => {
        try {
          return decodeURIComponent(segment);
        } catch (e) {
          if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
          return segment;
        }
      })
      .join('/')
      .replace(/\/+/g, '/')
      .replace(/\/+$/, '') || '/';

    // Query: URLSearchParams already yields decoded keys/values and re-encodes them on
    // serialization, so values are passed through untouched. Decoding them a second time
    // would rewrite e.g. "%2525" to "%25" and change the meaning of the URL.
    const keptParams = Array.from(new URLSearchParams(url.search).entries())
      .filter(([key]) => {
        if (key === '') return false;
        // Remove session IDs
        if (options.removeSessionIDs &&
          /^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
          return false;
        }
        // Remove UTM parameters
        if (options.removeUTMParams && /^utm_/i.test(key)) {
          return false;
        }
        // Remove common tracking parameters
        if (options.removeTrackingParams &&
          /^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
          return false;
        }
        return true;
      })
      // Sorting by key makes parameter order irrelevant for comparison.
      .sort(([keyA], [keyB]) => keyA.localeCompare(keyB));
    url.search = new URLSearchParams(keptParams).toString();

    // Fragment: drop it when requested or when it carries no information.
    if (options.removeAnchors || !url.hash || url.hash === '#' || url.hash === '#top' || url.hash === '#/') {
      url.hash = '';
    } else {
      try {
        const rawFragment = url.hash.slice(1);
        const decodedFragment = decodeURIComponent(rawFragment);
        // Only use the decoded version if re-encoding reproduces the original exactly.
        if (encodeURIComponent(decodedFragment) === rawFragment) {
          url.hash = '#' + decodedFragment;
        }
      } catch (e) {
        if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
      }
    }

    return url.toString();
  } catch (error) {
    // Rejected or unparseable input: report it and signal failure with undefined.
    console.error(`Invalid URL "${urlString}": ${error}`);
    return undefined;
  }
}
/**
 * Selects the snippets whose URL has not been visited and whose hostname is not blocked.
 *
 * @param allURLs Map from URL to snippet; its keys are the URLs that get checked.
 * @param visitedURLs URLs to exclude (exact string match against the keys of allURLs).
 * @param badHostnames Hostnames to exclude.
 * @returns The remaining snippets, in the key order of allURLs.
 */
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] {
  // Build the lookup sets once so each candidate costs O(1) instead of two linear scans.
  const visited = new Set(visitedURLs);
  const blockedHosts = new Set(badHostnames);
  return Object.entries(allURLs)
    // The hostname is only extracted for URLs that were not already excluded as visited.
    .filter(([url]) => !visited.has(url) && !blockedHosts.has(extractUrlParts(url).hostname))
    .map(([, snippet]) => snippet);
}
// Splits a URL string into its hostname and pathname.
// Input that cannot be parsed is logged and yields empty strings for both parts.
const extractUrlParts = (urlStr: string) => {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch (e) {
    console.error(`Error parsing URL: ${urlStr}`, e);
    return {hostname: "", path: ""};
  }
  const {hostname, pathname} = parsed;
  return {hostname, path: pathname};
};
// Tallies how often each hostname and each cumulative path prefix ("/a", "/a/b", ...)
// occurs across the given items, together with the number of items that had a URL.
export const countUrlParts = (urlItems: SearchResult[]) => {
  const hostnameCount: Record<string, number> = {};
  const pathPrefixCount: Record<string, number> = {};
  let totalUrls = 0;

  for (const rawItem of urlItems) {
    const entry = rawItem as { title: string; url: string; description: string; weight?: number };
    // Missing entries and entries without a URL contribute nothing.
    if (!entry || !entry.url) continue;
    totalUrls += 1;

    const parts = extractUrlParts(entry.url);
    hostnameCount[parts.hostname] = (hostnameCount[parts.hostname] || 0) + 1;

    // Grow the prefix one non-empty segment at a time and count every intermediate prefix.
    let prefix = '';
    for (const segment of parts.path.split('/')) {
      if (segment.length === 0) continue;
      prefix += '/' + segment;
      pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
    }
  }

  return {hostnameCount, pathPrefixCount, totalUrls};
};
// Share of `count` within `total`, used as a frequency for boosting.
// Defined as 0 whenever the total is not positive, which avoids dividing by zero.
const normalizeCount = (count: any, total: any) => {
  if (total > 0) {
    return count / total;
  }
  return 0;
};
/**
 * Scores every URL item and returns the items sorted by descending score.
 *
 * The score is the sum of four boosts, clamped to [minBoost, maxBoost]:
 * - freqBoost: the item's own weight relative to the number of URLs;
 * - hostnameBoost: how common the item's hostname is (doubled for hostnames in boostHostnames);
 * - pathBoost: how common each of the item's path prefixes is, decayed by depth;
 * - jinaRerankBoost: whatever rerank boost is already stored on the item.
 *
 * When a question is given, a rerank request is started but NOT awaited: its scores are
 * written onto the input items once it resolves, so they are not part of the scores
 * returned by this call and only show up if the same items are ranked again later.
 *
 * @param urlItems Items to rank; they are mutated by the asynchronous rerank callback.
 * @param options Optional overrides for the factors listed in the destructuring below.
 * @param trackers Tracker context; its tokenTracker is passed to the rerank call.
 * @returns A new array of the items extended with the individual boosts and finalScore.
 *   Items without a URL are passed through unchanged and sort as score 0.
 */
export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => {
  // Default parameters for boosting - can be overridden
  const {
    freqFactor = 0.5, // How much to boost based on term frequency
    hostnameBoostFactor = 0.5, // How much to boost based on hostname frequency
    pathBoostFactor = 0.4, // How much to boost based on path frequency
    decayFactor = 0.8, // Decay factor for longer paths (0-1)
    jinaRerankFactor = 0.8, // How much to boost based on Jina reranking
    minBoost = 0, // Minimum boost score
    maxBoost = 5, // Maximum boost score cap
    question = '', // Optional question for Jina reranking
    boostHostnames = [], // Optional hostnames to boost
  } = options;

  // Count URL parts first
  const {hostnameCount, pathPrefixCount, totalUrls} = countUrlParts(urlItems);

  if (question.trim().length > 0) {
    // Fire-and-forget rerank; see the function doc for when its result becomes visible.
    rerankDocuments(question, urlItems.map(item => smartMergeStrings(item.title, item.description)), trackers.tokenTracker)
      .then(({results}) => {
        results.forEach(({index, relevance_score}) => {
          const target = urlItems[index] as BoostedSearchSnippet | undefined;
          // Ignore indices that do not point at an item instead of throwing.
          if (target) {
            target.jinaRerankBoost = relevance_score * jinaRerankFactor;
          }
        });
      })
      .catch((error: unknown) => {
        // Reranking is best-effort: a failure must not become an unhandled rejection.
        console.error('Jina rerank failed:', error);
      });
  }

  return (urlItems as BoostedSearchSnippet[]).map(item => {
    if (!item || !item.url) {
      console.error('Skipping invalid item:', item);
      return item; // Return unchanged
    }

    const {hostname, path} = extractUrlParts(item.url);

    // Base weight from the original item; a missing weight counts as 0.
    const freq = item.weight || 0;

    // Hostname boost (normalized by total URLs)
    const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
    const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1);

    // Path boost (consider all path prefixes with decay for longer paths)
    let pathBoost = 0;
    const pathSegments = path.split('/').filter(segment => segment.length > 0);
    pathSegments.forEach((segment, index) => {
      const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
      const prefixFreq = normalizeCount(pathPrefixCount[prefix] || 0, totalUrls);
      // Apply decay factor based on path depth
      pathBoost += prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
    });

    // totalUrls is at least 1 here because this item itself has a URL.
    const freqBoost = freq / totalUrls * freqFactor;
    const jinaRerankBoost = item.jinaRerankBoost || 0;

    // Calculate new weight with clamping
    const finalScore = Math.min(
      Math.max(hostnameBoost + pathBoost + freqBoost + jinaRerankBoost, minBoost),
      maxBoost);

    return {
      ...item,
      freqBoost,
      hostnameBoost,
      pathBoost,
      jinaRerankBoost,
      finalScore
    } as BoostedSearchSnippet;
    // Items passed through unchanged have no finalScore; treat them as 0 so the
    // comparator never returns NaN.
  }).sort((a, b) => ((b && b.finalScore) || 0) - ((a && a.finalScore) || 0));
};
/**
 * Registers a snippet in the URL map under its normalized URL.
 *
 * A URL seen for the first time stores the snippet itself (its weight is set to
 * weightDelta). A URL that is already present has weightDelta added to its weight and
 * the new description merged into the stored one.
 *
 * @param r Snippet to add; mutated when it becomes the stored entry.
 * @param allURLs Map from normalized URL to snippet; mutated in place.
 * @param weightDelta Weight to assign or add.
 * @returns 1 when a new entry was created, 0 when the URL could not be normalized or
 *   an existing entry was updated.
 */
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
  const nURL = normalizeUrl(r.url);
  if (!nURL) return 0;

  const existing = allURLs[nURL];
  if (!existing) {
    allURLs[nURL] = r;
    r.weight = weightDelta;
    return 1;
  }

  // An entry stored without a weight counts as 0 rather than turning the sum into NaN.
  existing.weight = (existing.weight ?? 0) + weightDelta;
  existing.description = smartMergeStrings(existing.description, r.description);
  return 0;
}
/**
 * Renders the highest-scoring URLs as a text list, one line per URL.
 *
 * Each line shows the score with two decimals, the URL, and the merged title and
 * description. Items whose merged text is empty are left out.
 *
 * @param allURLs Scored snippets; an empty or missing list yields an empty string.
 * @param maxURLs Maximum number of lines to emit.
 * @returns The lines joined with newlines.
 */
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
  if (!allURLs || allURLs.length === 0) return '';

  return allURLs
    .map(r => ({
      url: r.url,
      // A missing score counts as 0 for both sorting and printing, so formatting
      // cannot throw on an item that was never scored.
      score: r.finalScore || 0,
      merged: smartMergeStrings(r.title, r.description)
    }))
    .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
    .sort((a, b) => b.score - a.score)
    .slice(0, maxURLs)
    .map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
    .join('\n');
}
/**
 * Draws one item at random, with probability proportional to its weight.
 *
 * Only entries with a weight greater than zero take part in the draw, so an item with
 * a zero, negative or NaN weight is never returned.
 *
 * @param items Array of [item, weight] tuples
 * @returns The selected item, or null if the array is empty or no weight is positive
 */
export function sampleMultinomial<T>(items: [T, number][]): T | null {
  // Handle empty array
  if (!items || items.length === 0) {
    return null;
  }

  const candidates = items.filter(([, weight]) => weight > 0);
  const totalWeight = candidates.reduce((sum, [, weight]) => sum + weight, 0);

  // Nothing can be drawn when no entry carries a positive weight.
  if (!(totalWeight > 0)) {
    return null;
  }

  // Math.random() is in [0, 1), so the draw lies in [0, totalWeight).
  const randValue = Math.random() * totalWeight;

  let cumulativeWeight = 0;
  for (const [item, weight] of candidates) {
    cumulativeWeight += weight;
    // Strict comparison: each item owns the half-open interval ending at its cumulative weight.
    if (randValue < cumulativeWeight) {
      return item;
    }
  }

  // Only reachable through floating-point rounding of the running sum.
  return candidates[candidates.length - 1][0];
}
/**
 * Asks the datetime detection API when the page behind a URL was last modified.
 *
 * The API's best guess is only accepted when its reported confidence is at least 70.
 * Every failure (network error, non-OK status, unusable body) is logged and reported
 * as undefined rather than thrown.
 *
 * @param url The URL to look up
 * @returns The best-guess date string, or undefined when none is available
 */
export async function getLastModified(url: string): Promise<string | undefined> {
  try {
    const endpoint = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`;
    const res = await fetch(endpoint);
    if (!res.ok) {
      throw new Error(`API returned ${res.status}`);
    }

    const payload = await res.json();
    const isTrusted = payload.bestGuess && payload.confidence >= 70;
    return isTrusted ? payload.bestGuess : undefined;
  } catch (error) {
    console.error('Failed to fetch last modified date:', error);
    return undefined;
  }
}
/**
 * Keeps at most k results per hostname, preserving the input order.
 *
 * @param results Results to filter; the first k for each hostname are kept.
 * @param k Maximum number of results to keep per hostname.
 * @returns A new array with the retained results.
 */
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
  // A Map keeps hostnames such as "constructor" from colliding with Object.prototype
  // members, which a plain object used as a counter would suffer from.
  const keptPerHost = new Map<string, number>();
  const filteredResults: BoostedSearchSnippet[] = [];

  results.forEach((result) => {
    const hostname = extractUrlParts(result.url).hostname;
    const kept = keptPerHost.get(hostname) ?? 0;
    if (kept < k) {
      filteredResults.push(result);
      keptPerHost.set(hostname, kept + 1);
    }
  });

  return filteredResults;
}
/**
 * Reads a batch of URLs concurrently and folds what it learns into the shared state.
 *
 * For every URL that can be normalized and read, a knowledge item is appended to
 * allKnowledge and the page's links are added to allURLs with weight 0.1. URLs that
 * fail are appended to badURLs; when the error matches one of the host-level patterns
 * below, every allURLs entry on the same hostname is removed once all reads finish.
 * Each processed URL is appended to visitedURLs whether or not it succeeded.
 *
 * @param urls URLs to read.
 * @param context Tracker context; its actionTracker records the visit and its
 *   tokenTracker is passed to readUrl.
 * @param allKnowledge Knowledge base; mutated (items appended).
 * @param allURLs Known-URL map; mutated (links added, bad-host entries deleted).
 * @param visitedURLs Visited list; mutated (processed URLs appended).
 * @param badURLs Failed-URL list; mutated (failed URLs appended).
 * @param schemaGen Supplies the language code for the action text and is passed to cherryPick.
 * @param question Question used to build the knowledge item and passed to cherryPick.
 * @returns The non-null per-URL results, whether there was at least one, and the badURLs
 *   array that was passed in. For an empty urls list the result instead carries a fresh
 *   empty badURLs array and success false.
 */
export async function processURLs(
urls: string[],
context: TrackerContext,
allKnowledge: KnowledgeItem[],
allURLs: Record<string, SearchSnippet>,
visitedURLs: string[],
badURLs: string[],
schemaGen: Schemas,
question: string
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
// Skip if no URLs to process
if (urls.length === 0) {
return {urlResults: [], success: false, badURLs: []};
}
// Hostnames collected during this call from errors that match the host-level patterns below
const badHostnames: string[] = [];
// Track the reading action
const thisStep: VisitAction = {
action: 'visit',
think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
URLTargets: urls
}
context.actionTracker.trackAction({thisStep})
// Process each URL in parallel; each callback resolves to {url, result} or null
const urlResults = await Promise.all(
urls.map(async url => {
try {
const normalizedUrl = normalizeUrl(url);
if (!normalizedUrl) {
// Not counted as a bad URL, but the finally block still records the raw URL as visited
return null;
}
// Store normalized URL for consistent reference
url = normalizedUrl;
const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response;
// Looked up before the content check below, so it also runs for pages that turn out empty
const guessedTime = await getLastModified(url);
if (guessedTime) {
console.log('Guessed time for', url, guessedTime);
}
// A response without a URL or content is treated as a failure and handled by the catch below
if (!data?.url || !data?.content) {
throw new Error('No content found');
}
// Add to knowledge base
allKnowledge.push({
question: `What do expert say about "${question}"?`,
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
references: [data.url],
type: 'url',
updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
});
// Process page links; presumably each link is a [text, href] pair — verify against readUrl
data.links?.forEach(link => {
const nnUrl = normalizeUrl(link[1]);
if (!nnUrl) return;
const r: SearchSnippet = {
title: link[0],
url: nnUrl,
description: link[0],
}
// in-page link has lower initial weight comparing to search links
if (r.url) {
addToAllURLs(r, allURLs, 0.1);
}
});
return {url, result: response};
} catch (error: any) {
console.error('Error reading URL:', url, error);
badURLs.push(url);
// Errors matching these patterns additionally put the URL's hostname on the bad-hostname list
if (
(error?.name === 'ParamValidationError' && error.message?.includes('Domain')) ||
(error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) ||
error?.message?.includes("Couldn't resolve host name") ||
error?.message?.includes("could not be resolved") ||
error?.message?.includes("ERR_CERT_COMMON_NAME_INVALID") ||
error?.message?.includes("ERR_CONNECTION_REFUSED")
) {
let hostname = '';
try {
hostname = extractUrlParts(url).hostname;
} catch (e) {
console.error('Error parsing URL for hostname:', url, e);
}
// NOTE(review): hostname may still be '' here and is pushed regardless — confirm that is intended
badHostnames.push(hostname);
console.log(`Added ${hostname} to bad hostnames list`);
}
return null;
} finally {
// Runs for success and failure alike: any non-empty url is recorded as visited
// (the normalized form when normalization succeeded, otherwise the raw input)
if (url) {
visitedURLs.push(url);
}
}
})
);
// Filter out null results without changing the original array
const validResults = urlResults.filter(Boolean);
// remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => {
if (badHostnames.includes(extractUrlParts(url).hostname)) {
delete allURLs[url];
console.log(`Removed ${url} from allURLs`);
}
}
)
}
return {
urlResults: validResults,
success: validResults.length > 0,
badURLs
};
}