mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
365 lines
12 KiB
TypeScript
365 lines
12 KiB
TypeScript
import {BoostedSearchSnippet, SearchResult, SearchSnippet, TrackerContext} from "../types";
|
|
import {smartMergeStrings} from "./text-tools";
|
|
import {rerankDocuments} from "../tools/jina-rerank";
|
|
|
|
export function normalizeUrl(urlString: string, debug = false, options = {
|
|
removeAnchors: true,
|
|
removeSessionIDs: true,
|
|
removeUTMParams: true,
|
|
removeTrackingParams: true
|
|
}): string {
|
|
if (!urlString?.trim()) {
|
|
throw new Error('Empty URL');
|
|
}
|
|
|
|
urlString = urlString.trim();
|
|
|
|
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
|
|
urlString = 'https://' + urlString;
|
|
}
|
|
|
|
try {
|
|
const url = new URL(urlString);
|
|
|
|
url.hostname = url.hostname.toLowerCase();
|
|
if (url.hostname.startsWith('www.')) {
|
|
url.hostname = url.hostname.slice(4);
|
|
}
|
|
|
|
if ((url.protocol === 'http:' && url.port === '80') ||
|
|
(url.protocol === 'https:' && url.port === '443')) {
|
|
url.port = '';
|
|
}
|
|
|
|
// Path normalization with error tracking
|
|
url.pathname = url.pathname
|
|
.split('/')
|
|
.map(segment => {
|
|
try {
|
|
return decodeURIComponent(segment);
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
|
|
return segment;
|
|
}
|
|
})
|
|
.join('/')
|
|
.replace(/\/+/g, '/')
|
|
.replace(/\/+$/, '') || '/';
|
|
|
|
// Query parameter normalization with error details
|
|
const searchParams = new URLSearchParams(url.search);
|
|
const sortedParams = Array.from(searchParams.entries())
|
|
.map(([key, value]) => {
|
|
if (value === '') return [key, ''];
|
|
try {
|
|
const decodedValue = decodeURIComponent(value);
|
|
if (encodeURIComponent(decodedValue) === value) {
|
|
return [key, decodedValue];
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode query param ${key}=${value}`, e);
|
|
}
|
|
return [key, value];
|
|
})
|
|
// Filter out tracking, session and UTM parameters
|
|
.filter(([key]) => {
|
|
if (key === '') return false;
|
|
|
|
// Remove session IDs
|
|
if (options.removeSessionIDs &&
|
|
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
// Remove UTM parameters
|
|
if (options.removeUTMParams && /^utm_/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
// Remove common tracking parameters
|
|
if (options.removeTrackingParams &&
|
|
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
})
|
|
.sort(([keyA], [keyB]) => keyA.localeCompare(keyB));
|
|
|
|
url.search = new URLSearchParams(sortedParams).toString();
|
|
|
|
// Fragment (anchor) handling - remove completely if requested
|
|
if (options.removeAnchors) {
|
|
url.hash = '';
|
|
} else if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) {
|
|
url.hash = '';
|
|
} else if (url.hash) {
|
|
try {
|
|
const decodedHash = decodeURIComponent(url.hash.slice(1));
|
|
const encodedBack = encodeURIComponent(decodedHash);
|
|
// Only use decoded version if it's safe
|
|
if (encodedBack === url.hash.slice(1)) {
|
|
url.hash = '#' + decodedHash;
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
|
|
}
|
|
}
|
|
|
|
let normalizedUrl = url.toString();
|
|
|
|
// Remove trailing slash from paths that aren't just "/"
|
|
if (url.pathname.length > 1 && url.pathname.endsWith('/')) {
|
|
url.pathname = url.pathname.slice(0, -1);
|
|
}
|
|
|
|
// Final URL normalization with validation
|
|
try {
|
|
const decodedUrl = decodeURIComponent(normalizedUrl);
|
|
const encodedBack = encodeURIComponent(decodedUrl);
|
|
// Only use decoded version if it's safe
|
|
if (encodedBack === normalizedUrl) {
|
|
normalizedUrl = decodedUrl;
|
|
}
|
|
} catch (e) {
|
|
if (debug) console.error('Failed to decode final URL', e);
|
|
}
|
|
|
|
return normalizedUrl;
|
|
} catch (error) {
|
|
// Main URL parsing error - this one we should throw
|
|
throw new Error(`Invalid URL "${urlString}": ${error}`);
|
|
}
|
|
}
|
|
|
|
export function getUnvisitedURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
|
|
return Object.entries(allURLs)
|
|
.filter(([url]) => !visitedURLs.includes(url))
|
|
.map(([, result]) => result);
|
|
}
|
|
|
|
|
|
// Function to extract hostname and path from a URL
|
|
const extractUrlParts = (urlStr: string) => {
|
|
try {
|
|
const url = new URL(urlStr);
|
|
return {
|
|
hostname: url.hostname,
|
|
path: url.pathname
|
|
};
|
|
} catch (e) {
|
|
console.error(`Error parsing URL: ${urlStr}`, e);
|
|
return {hostname: "", path: ""};
|
|
}
|
|
};
|
|
|
|
// Function to count occurrences of hostnames and paths
|
|
export const countUrlParts = (urlItems: SearchResult[]) => {
|
|
const hostnameCount: Record<string, number> = {};
|
|
const pathPrefixCount: Record<string, number> = {};
|
|
let totalUrls = 0;
|
|
|
|
urlItems.forEach(item => {
|
|
item = (item as { title: string; url: string; description: string; weight?: number })
|
|
if (!item || !item.url) return; // Skip invalid items
|
|
|
|
totalUrls++;
|
|
const {hostname, path} = extractUrlParts(item.url);
|
|
|
|
// Count hostnames
|
|
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
|
|
|
|
// Count path prefixes (segments)
|
|
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
|
pathSegments.forEach((segment, index) => {
|
|
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
|
pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
|
|
});
|
|
});
|
|
|
|
return {hostnameCount, pathPrefixCount, totalUrls};
|
|
};
|
|
|
|
// Calculate normalized frequency for boosting
|
|
const normalizeCount = (count: any, total: any) => {
|
|
return total > 0 ? count / total : 0;
|
|
};
|
|
|
|
// Calculate boosted weights
|
|
export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => {
|
|
// Default parameters for boosting - can be overridden
|
|
const {
|
|
freqFactor = 0.5, // How much to boost based on term frequency
|
|
hostnameBoostFactor = 0.5, // How much to boost based on hostname frequency
|
|
pathBoostFactor = 0.4, // How much to boost based on path frequency
|
|
decayFactor = 0.8, // Decay factor for longer paths (0-1)
|
|
jinaRerankFactor = 0.8, // How much to boost based on Jina reranking
|
|
minBoost = 0, // Minimum boost score
|
|
maxBoost = 5, // Maximum boost score cap
|
|
question = '', // Optional question for Jina reranking
|
|
} = options;
|
|
|
|
// Count URL parts first
|
|
const counts = countUrlParts(urlItems);
|
|
const {hostnameCount, pathPrefixCount, totalUrls} = counts;
|
|
|
|
if (question.trim().length > 0) {
|
|
// get from jina rerank
|
|
rerankDocuments(question, urlItems.map(item => smartMergeStrings(item.title, item.description)), trackers.tokenTracker)
|
|
.then(({results}) => {
|
|
results.forEach(({index, relevance_score}) => {
|
|
(urlItems[index] as BoostedSearchSnippet).jinaRerankBoost = relevance_score * jinaRerankFactor;
|
|
});
|
|
})
|
|
}
|
|
|
|
|
|
return (urlItems as BoostedSearchSnippet[]).map(item => {
|
|
if (!item || !item.url) {
|
|
console.error('Skipping invalid item:', item);
|
|
return item; // Return unchanged
|
|
}
|
|
|
|
const {hostname, path} = extractUrlParts(item.url);
|
|
|
|
// Base weight from original
|
|
const freq = item.weight || 0; // Default to 1 if weight is missing
|
|
|
|
// Hostname boost (normalized by total URLs)
|
|
const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
|
|
const hostnameBoost = hostnameFreq * hostnameBoostFactor;
|
|
|
|
// Path boost (consider all path prefixes with decay for longer paths)
|
|
let pathBoost = 0;
|
|
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
|
pathSegments.forEach((segment, index) => {
|
|
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
|
const prefixCount = pathPrefixCount[prefix] || 0;
|
|
const prefixFreq = normalizeCount(prefixCount, totalUrls);
|
|
|
|
// Apply decay factor based on path depth
|
|
const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
|
|
pathBoost += decayedBoost;
|
|
});
|
|
|
|
const freqBoost = freq / totalUrls * freqFactor;
|
|
const jinaRerankBoost = item.jinaRerankBoost || 0;
|
|
// Calculate new weight with clamping
|
|
const finalScore = Math.min(
|
|
Math.max(
|
|
hostnameBoost
|
|
+ pathBoost
|
|
+ freqBoost
|
|
+ jinaRerankBoost, minBoost),
|
|
maxBoost);
|
|
|
|
return {
|
|
...item,
|
|
freqBoost,
|
|
hostnameBoost,
|
|
pathBoost,
|
|
jinaRerankBoost,
|
|
finalScore
|
|
} as BoostedSearchSnippet;
|
|
}).sort((a, b) => b.finalScore - a.finalScore);
|
|
};
|
|
|
|
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
|
|
if (!allURLs[r.url]) {
|
|
allURLs[r.url] = r;
|
|
allURLs[r.url].weight = weightDelta;
|
|
} else {
|
|
(allURLs[r.url].weight as number)+= weightDelta;
|
|
const curDesc = allURLs[r.url].description;
|
|
allURLs[r.url].description = smartMergeStrings(curDesc, r.description);
|
|
}
|
|
}
|
|
|
|
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
|
|
if (!allURLs || allURLs.length === 0) return '';
|
|
|
|
return (allURLs)
|
|
.map(r => {
|
|
const merged = smartMergeStrings(r.title, r.description);
|
|
return {
|
|
url: r.url,
|
|
score: r.finalScore,
|
|
merged
|
|
};
|
|
})
|
|
.filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
|
|
.sort((a, b) => (b.score || 0) - (a.score || 0))
|
|
.slice(0, maxURLs)
|
|
.map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
|
|
.join('\n');
|
|
}
|
|
|
|
|
|
/**
|
|
* Draw a sample from a multinomial distribution
|
|
* @param items Array of [name, weight] tuples
|
|
* @returns A randomly selected item based on the weights, or null if array is empty
|
|
*/
|
|
export function sampleMultinomial<T>(items: [T, number][]): T | null {
|
|
// Handle empty array
|
|
if (!items || items.length === 0) {
|
|
return null;
|
|
}
|
|
|
|
// Calculate total weight
|
|
const totalWeight = items.reduce((sum, [, weight]) => sum + weight, 0);
|
|
|
|
// Handle case where all weights are 0
|
|
if (totalWeight === 0) {
|
|
return null;
|
|
}
|
|
|
|
// Generate a random number between 0 and total weight
|
|
const randValue = Math.random() * totalWeight;
|
|
|
|
// Find the item corresponding to the random value
|
|
let cumulativeWeight = 0;
|
|
|
|
for (const [item, weight] of items) {
|
|
cumulativeWeight += weight;
|
|
if (randValue <= cumulativeWeight) {
|
|
return item;
|
|
}
|
|
}
|
|
|
|
// Fallback (should rarely happen due to floating point precision)
|
|
return items[items.length - 1][0];
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
* Fetches the last modified date for a URL using the datetime detection API
|
|
* @param url The URL to check for last modified date
|
|
* @returns Promise containing the last modified date or null if not found
|
|
*/
|
|
export async function getLastModified(url: string): Promise<string | undefined> {
|
|
try {
|
|
// Call the API with proper encoding
|
|
const apiUrl = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`;
|
|
const response = await fetch(apiUrl);
|
|
|
|
if (!response.ok) {
|
|
throw new Error(`API returned ${response.status}`);
|
|
}
|
|
|
|
const data = await response.json();
|
|
|
|
// Return the bestGuess date if available
|
|
if (data.bestGuess) {
|
|
return data.bestGuess;
|
|
}
|
|
|
|
return undefined;
|
|
} catch (error) {
|
|
console.error('Failed to fetch last modified date:', error);
|
|
return undefined;
|
|
}
|
|
}
|