import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types"; import {getI18nText, smartMergeStrings} from "./text-tools"; import {rerankDocuments} from "../tools/jina-rerank"; import {readUrl} from "../tools/read"; import {Schemas} from "./schemas"; import {cherryPick} from "../tools/jina-latechunk"; import {formatDateBasedOnType} from "./date-tools"; export function normalizeUrl(urlString: string, debug = false, options = { removeAnchors: true, removeSessionIDs: true, removeUTMParams: true, removeTrackingParams: true }) { try { urlString = urlString.replace(/\s+/g, '').trim(); if (!urlString?.trim()) { throw new Error('Empty URL'); } if (urlString.startsWith('https://google.com/') || urlString.startsWith('https://www.google.com')) { throw new Error('Google search link'); } if (urlString.includes('example.com')) { throw new Error('Example URL'); } const url = new URL(urlString); if (url.protocol !== 'http:' && url.protocol !== 'https:') { throw new Error('Unsupported protocol'); } url.hostname = url.hostname.toLowerCase(); if (url.hostname.startsWith('www.')) { url.hostname = url.hostname.slice(4); } if ((url.protocol === 'http:' && url.port === '80') || (url.protocol === 'https:' && url.port === '443')) { url.port = ''; } // Path normalization with error tracking url.pathname = url.pathname .split('/') .map(segment => { try { return decodeURIComponent(segment); } catch (e) { if (debug) console.error(`Failed to decode path segment: ${segment}`, e); return segment; } }) .join('/') .replace(/\/+/g, '/') .replace(/\/+$/, '') || '/'; // Query parameter normalization with error details const searchParams = new URLSearchParams(url.search); const sortedParams = Array.from(searchParams.entries()) .map(([key, value]) => { if (value === '') return [key, '']; try { const decodedValue = decodeURIComponent(value); if (encodeURIComponent(decodedValue) === value) { return [key, decodedValue]; } } catch (e) { if (debug) console.error(`Failed to decode query param ${key}=${value}`, e); } return [key, value]; }) // Filter out tracking, session and UTM parameters .filter(([key]) => { if (key === '') return false; // Remove session IDs if (options.removeSessionIDs && /^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) { return false; } // Remove UTM parameters if (options.removeUTMParams && /^utm_/i.test(key)) { return false; } // Remove common tracking parameters if (options.removeTrackingParams && /^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) { return false; } return true; }) .sort(([keyA], [keyB]) => keyA.localeCompare(keyB)); url.search = new URLSearchParams(sortedParams).toString(); // Fragment (anchor) handling - remove completely if requested if (options.removeAnchors) { url.hash = ''; } else if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) { url.hash = ''; } else if (url.hash) { try { const decodedHash = decodeURIComponent(url.hash.slice(1)); const encodedBack = encodeURIComponent(decodedHash); // Only use decoded version if it's safe if (encodedBack === url.hash.slice(1)) { url.hash = '#' + decodedHash; } } catch (e) { if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e); } } let normalizedUrl = url.toString(); // Remove trailing slash from paths that aren't just "/" if (url.pathname.length > 1 && url.pathname.endsWith('/')) { url.pathname = url.pathname.slice(0, -1); } // Final URL normalization with validation try { const decodedUrl = decodeURIComponent(normalizedUrl); const encodedBack = encodeURIComponent(decodedUrl); // Only use decoded version if it's safe if (encodedBack === normalizedUrl) { normalizedUrl = decodedUrl; } } catch (e) { if (debug) console.error('Failed to decode final URL', e); } return normalizedUrl; } catch (error) { // Main URL parsing error - this one we should throw console.error(`Invalid URL "${urlString}": ${error}`); return; } } export function filterURLs(allURLs: Record, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] { return Object.entries(allURLs) .filter(([url,]) => !visitedURLs.includes(url) && !badHostnames.includes(extractUrlParts(url).hostname)) .map(([, result]) => result); } // Function to extract hostname and path from a URL const extractUrlParts = (urlStr: string) => { try { const url = new URL(urlStr); return { hostname: url.hostname, path: url.pathname }; } catch (e) { console.error(`Error parsing URL: ${urlStr}`, e); return {hostname: "", path: ""}; } }; // Function to count occurrences of hostnames and paths export const countUrlParts = (urlItems: SearchResult[]) => { const hostnameCount: Record = {}; const pathPrefixCount: Record = {}; let totalUrls = 0; urlItems.forEach(item => { item = (item as { title: string; url: string; description: string; weight?: number }) if (!item || !item.url) return; // Skip invalid items totalUrls++; const {hostname, path} = extractUrlParts(item.url); // Count hostnames hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1; // Count path prefixes (segments) const pathSegments = path.split('/').filter(segment => segment.length > 0); pathSegments.forEach((segment, index) => { const prefix = '/' + pathSegments.slice(0, index + 1).join('/'); pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1; }); }); return {hostnameCount, pathPrefixCount, totalUrls}; }; // Calculate normalized frequency for boosting const normalizeCount = (count: any, total: any) => { return total > 0 ? count / total : 0; }; // Calculate boosted weights export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => { // Default parameters for boosting - can be overridden const { freqFactor = 0.5, // How much to boost based on term frequency hostnameBoostFactor = 0.5, // How much to boost based on hostname frequency pathBoostFactor = 0.4, // How much to boost based on path frequency decayFactor = 0.8, // Decay factor for longer paths (0-1) jinaRerankFactor = 0.8, // How much to boost based on Jina reranking minBoost = 0, // Minimum boost score maxBoost = 5, // Maximum boost score cap question = '', // Optional question for Jina reranking boostHostnames = [], // Optional hostnames to boost } = options; // Count URL parts first const counts = countUrlParts(urlItems); const {hostnameCount, pathPrefixCount, totalUrls} = counts; if (question.trim().length > 0) { // Step 1: Create a record to track unique content with their original indices const uniqueContentMap: Record = {}; urlItems.forEach((item, originalIndex) => { const mergedContent = smartMergeStrings(item.title, item.description); if (!uniqueContentMap[mergedContent]) { uniqueContentMap[mergedContent] = [originalIndex]; } else { uniqueContentMap[mergedContent].push(originalIndex); } }); // Step 2: Rerank only the unique contents const uniqueContents = Object.keys(uniqueContentMap); const uniqueIndicesMap = Object.values(uniqueContentMap); console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`) rerankDocuments(question, uniqueContents, trackers.tokenTracker) .then(({results}) => { // Step 3: Map the scores back to all original items results.forEach(({index, relevance_score}) => { const originalIndices = uniqueIndicesMap[index]; const boost = relevance_score * jinaRerankFactor; // Apply the same boost to all items with identical content originalIndices.forEach((originalIndex: number) => { (urlItems[originalIndex] as BoostedSearchSnippet).jinaRerankBoost = boost; }); }); }); } return (urlItems as BoostedSearchSnippet[]).map(item => { if (!item || !item.url) { console.error('Skipping invalid item:', item); return item; // Return unchanged } const {hostname, path} = extractUrlParts(item.url); // Base weight from original const freq = item.weight || 0; // Default to 1 if weight is missing // Hostname boost (normalized by total URLs) const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls); const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1); // Path boost (consider all path prefixes with decay for longer paths) let pathBoost = 0; const pathSegments = path.split('/').filter(segment => segment.length > 0); pathSegments.forEach((segment, index) => { const prefix = '/' + pathSegments.slice(0, index + 1).join('/'); const prefixCount = pathPrefixCount[prefix] || 0; const prefixFreq = normalizeCount(prefixCount, totalUrls); // Apply decay factor based on path depth const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor; pathBoost += decayedBoost; }); const freqBoost = freq / totalUrls * freqFactor; const jinaRerankBoost = item.jinaRerankBoost || 0; // Calculate new weight with clamping const finalScore = Math.min( Math.max( hostnameBoost + pathBoost + freqBoost + jinaRerankBoost, minBoost), maxBoost); return { ...item, freqBoost, hostnameBoost, pathBoost, jinaRerankBoost, finalScore } as BoostedSearchSnippet; }).sort((a, b) => b.finalScore - a.finalScore); }; export const addToAllURLs = (r: SearchSnippet, allURLs: Record, weightDelta = 1) => { const nURL = normalizeUrl(r.url); if (!nURL) return 0; if (!allURLs[nURL]) { allURLs[nURL] = r; allURLs[nURL].weight = weightDelta; return 1; } else { (allURLs[nURL].weight as number) += weightDelta; const curDesc = allURLs[nURL].description; allURLs[nURL].description = smartMergeStrings(curDesc, r.description); return 0; } } export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => { if (!allURLs || allURLs.length === 0) return ''; return (allURLs) .map(r => { const merged = smartMergeStrings(r.title, r.description); return { url: r.url, score: r.finalScore, merged }; }) .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null) .sort((a, b) => (b.score || 0) - (a.score || 0)) .slice(0, maxURLs) .map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`) .join('\n'); } /** * Draw a sample from a multinomial distribution * @param items Array of [name, weight] tuples * @returns A randomly selected item based on the weights, or null if array is empty */ export function sampleMultinomial(items: [T, number][]): T | null { // Handle empty array if (!items || items.length === 0) { return null; } // Calculate total weight const totalWeight = items.reduce((sum, [, weight]) => sum + weight, 0); // Handle case where all weights are 0 if (totalWeight === 0) { return null; } // Generate a random number between 0 and total weight const randValue = Math.random() * totalWeight; // Find the item corresponding to the random value let cumulativeWeight = 0; for (const [item, weight] of items) { cumulativeWeight += weight; if (randValue <= cumulativeWeight) { return item; } } // Fallback (should rarely happen due to floating point precision) return items[items.length - 1][0]; } /** * Fetches the last modified date for a URL using the datetime detection API * @param url The URL to check for last modified date * @returns Promise containing the last modified date or null if not found */ export async function getLastModified(url: string): Promise { try { // Call the API with proper encoding const apiUrl = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`; const response = await fetch(apiUrl); if (!response.ok) { throw new Error(`API returned ${response.status}`); } const data = await response.json(); // Return the bestGuess date if available if (data.bestGuess && data.confidence >= 70) { return data.bestGuess; } return undefined; } catch (error) { console.error('Failed to fetch last modified date:', error); return undefined; } } export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => { const hostnameMap: Record = {}; const filteredResults: BoostedSearchSnippet[] = []; results.forEach((result) => { const hostname = extractUrlParts(result.url).hostname; if (hostnameMap[hostname] === undefined) { hostnameMap[hostname] = 0; } if (hostnameMap[hostname] < k) { filteredResults.push(result); hostnameMap[hostname]++; } }); return filteredResults; } export async function processURLs( urls: string[], context: TrackerContext, allKnowledge: KnowledgeItem[], allURLs: Record, visitedURLs: string[], badURLs: string[], schemaGen: Schemas, question: string ): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> { // Skip if no URLs to process if (urls.length === 0) { return {urlResults: [], success: false, badURLs: []}; } const badHostnames: string[] = []; // Track the reading action const thisStep: VisitAction = { action: 'visit', think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}), URLTargets: urls } context.actionTracker.trackAction({thisStep}) // Process each URL in parallel const urlResults = await Promise.all( urls.map(async url => { try { const normalizedUrl = normalizeUrl(url); if (!normalizedUrl) { return null; } // Store normalized URL for consistent reference url = normalizedUrl; const {response} = await readUrl(url, true, context.tokenTracker); const {data} = response; const guessedTime = await getLastModified(url); if (guessedTime) { console.log('Guessed time for', url, guessedTime); } // Early return if no valid data if (!data?.url || !data?.content) { throw new Error('No content found'); } // Add to knowledge base allKnowledge.push({ question: `What do expert say about "${question}"?`, answer: await cherryPick(question, data.content, {}, context, schemaGen, url), references: [data.url], type: 'url', updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined }); // Process page links data.links?.forEach(link => { const nnUrl = normalizeUrl(link[1]); if (!nnUrl) return; const r: SearchSnippet = { title: link[0], url: nnUrl, description: link[0], } // in-page link has lower initial weight comparing to search links if (r.url) { addToAllURLs(r, allURLs, 0.1); } }); return {url, result: response}; } catch (error: any) { console.error('Error reading URL:', url, error); badURLs.push(url); // Extract hostname from the URL if ( (error?.name === 'ParamValidationError' && error.message?.includes('Domain')) || (error?.name === 'AssertionFailureError' && error.message?.includes('resolve host name')) || error?.message?.includes("Couldn't resolve host name") || error?.message?.includes("could not be resolved") || error?.message?.includes("ERR_CERT_COMMON_NAME_INVALID") || error?.message?.includes("ERR_CONNECTION_REFUSED") ) { let hostname = ''; try { hostname = extractUrlParts(url).hostname; } catch (e) { console.error('Error parsing URL for hostname:', url, e); } badHostnames.push(hostname); console.log(`Added ${hostname} to bad hostnames list`); } return null; } finally { // Only add valid URLs to visitedURLs list if (url) { visitedURLs.push(url); } } }) ); // Filter out null results without changing the original array const validResults = urlResults.filter(Boolean); // remove any URL with bad hostnames from allURLs if (badHostnames.length > 0) { Object.keys(allURLs).forEach(url => { if (badHostnames.includes(extractUrlParts(url).hostname)) { delete allURLs[url]; console.log(`Removed ${url} from allURLs`); } } ) } return { urlResults: validResults, success: validResults.length > 0, badURLs }; }