// node-DeepResearch/src/utils/url-tools.ts

import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
import {getI18nText, smartMergeStrings} from "./text-tools";
import {rerankDocuments} from "../tools/jina-rerank";
import {readUrl} from "../tools/read";
import {Schemas} from "./schemas";
import {cherryPick} from "../tools/jina-latechunk";
import {formatDateBasedOnType} from "./date-tools";

/**
 * Canonicalizes an http(s) URL so that equivalent spellings compare equal.
 *
 * Processing order: strip all whitespace; reject empty input, Google links and
 * non-http(s) protocols; lowercase the host and drop a leading "www."; drop
 * default ports; decode and collapse path segments; decode, filter and sort
 * query parameters; remove or normalize the fragment; trim a trailing slash.
 *
 * @param urlString Raw URL text.
 * @param debug When true, failures to decode individual components are logged.
 * @param options Toggles for removing anchors, session IDs, UTM parameters and
 *   common tracking parameters (all enabled by default).
 * @returns The normalized URL, or `undefined` (after logging the reason) when
 *   the input is rejected or cannot be parsed. This function does not throw.
 */
export function normalizeUrl(urlString: string, debug = false, options = {
  removeAnchors: true,
  removeSessionIDs: true,
  removeUTMParams: true,
  removeTrackingParams: true
}): string | undefined {
  try {
    urlString = urlString.replace(/\s+/g, '').trim();

    if (!urlString?.trim()) {
      throw new Error('Empty URL');
    }

    if (urlString.startsWith('https://google.com') || urlString.startsWith('https://www.google.com')) {
      throw new Error('Google search link');
    }

    const url = new URL(urlString);
    if (url.protocol !== 'http:' && url.protocol !== 'https:') {
      throw new Error('Unsupported protocol');
    }

    url.hostname = url.hostname.toLowerCase();
    if (url.hostname.startsWith('www.')) {
      url.hostname = url.hostname.slice(4);
    }

    if ((url.protocol === 'http:' && url.port === '80') ||
      (url.protocol === 'https:' && url.port === '443')) {
      url.port = '';
    }

    // Path normalization: decode each segment (keeping the raw segment when it
    // is not valid percent-encoding), collapse repeated slashes and strip
    // trailing ones. An empty result falls back to the root path.
    url.pathname = url.pathname
      .split('/')
      .map(segment => {
        try {
          return decodeURIComponent(segment);
        } catch (e) {
          if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
          return segment;
        }
      })
      .join('/')
      .replace(/\/+/g, '/')
      .replace(/\/+$/, '') || '/';

    // Query parameter normalization
    const searchParams = new URLSearchParams(url.search);
    const sortedParams = Array.from(searchParams.entries())
      .map(([key, value]) => {
        if (value === '') return [key, ''];
        try {
          const decodedValue = decodeURIComponent(value);
          // Only keep the decoded value when re-encoding reproduces the input exactly
          if (encodeURIComponent(decodedValue) === value) {
            return [key, decodedValue];
          }
        } catch (e) {
          if (debug) console.error(`Failed to decode query param ${key}=${value}`, e);
        }
        return [key, value];
      })
      // Filter out tracking, session and UTM parameters
      .filter(([key]) => {
        if (key === '') return false;

        // Remove session IDs
        if (options.removeSessionIDs &&
          /^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
          return false;
        }

        // Remove UTM parameters
        if (options.removeUTMParams && /^utm_/i.test(key)) {
          return false;
        }

        // Remove common tracking parameters
        if (options.removeTrackingParams &&
          /^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
          return false;
        }

        return true;
      })
      // Sorting by key makes the result independent of the original parameter order
      .sort(([keyA], [keyB]) => keyA.localeCompare(keyB));

    url.search = new URLSearchParams(sortedParams).toString();

    // Fragment (anchor) handling - remove completely if requested
    if (options.removeAnchors) {
      url.hash = '';
    } else if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) {
      url.hash = '';
    } else if (url.hash) {
      try {
        const decodedHash = decodeURIComponent(url.hash.slice(1));
        const encodedBack = encodeURIComponent(decodedHash);
        // Only use decoded version if it's safe
        if (encodedBack === url.hash.slice(1)) {
          url.hash = '#' + decodedHash;
        }
      } catch (e) {
        if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
      }
    }

    // Remove trailing slash from paths that aren't just "/".
    // This has to happen BEFORE serializing: assigning `pathname` above lets the
    // URL parser re-resolve the path (e.g. a decoded "%2E" dot segment), which
    // can reintroduce a trailing slash.
    if (url.pathname.length > 1 && url.pathname.endsWith('/')) {
      url.pathname = url.pathname.slice(0, -1);
    }

    let normalizedUrl = url.toString();

    // Final URL normalization with validation.
    // NOTE: encodeURIComponent escapes ':' and '/', which every serialized
    // http(s) URL contains, so this round-trip comparison does not succeed for
    // such URLs and the serialized form is what gets returned.
    try {
      const decodedUrl = decodeURIComponent(normalizedUrl);
      const encodedBack = encodeURIComponent(decodedUrl);
      // Only use decoded version if it's safe
      if (encodedBack === normalizedUrl) {
        normalizedUrl = decodedUrl;
      }
    } catch (e) {
      if (debug) console.error('Failed to decode final URL', e);
    }

    return normalizedUrl;
  } catch (error) {
    // Rejected or unparsable input is reported and signalled to the caller
    // with `undefined` instead of an exception.
    console.error(`Invalid URL "${urlString}": ${error}`);
    return;
  }
}

/**
 * Returns the snippets from `allURLs` that are still worth visiting.
 *
 * An entry is dropped when its key is listed in `visitedURLs` or when the
 * hostname parsed from its key is listed in `badHostnames`.
 *
 * @param allURLs Candidate snippets keyed by URL.
 * @param visitedURLs URLs to exclude (compared against the keys of `allURLs`).
 * @param badHostnames Hostnames to exclude.
 * @returns The remaining snippets, in the key order of `allURLs`.
 */
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] {
  // Build lookup sets once so each candidate costs O(1) instead of a linear scan of both lists.
  const visited = new Set(visitedURLs);
  const blockedHosts = new Set(badHostnames);

  return Object.entries(allURLs)
    // The visited check runs first so already-visited URLs are never re-parsed.
    .filter(([url]) => !visited.has(url) && !blockedHosts.has(extractUrlParts(url).hostname))
    .map(([, snippet]) => snippet);
}


// Splits a URL string into its hostname and pathname.
// Input that cannot be parsed is logged and yields empty strings for both parts.
const extractUrlParts = (urlStr: string) => {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch (e) {
    console.error(`Error parsing URL: ${urlStr}`, e);
    return {hostname: "", path: ""};
  }

  const {hostname, pathname} = parsed;
  return {hostname, path: pathname};
};

// Tallies how often each hostname and each cumulative path prefix
// ("/a", "/a/b", ...) occurs across the given items.
// Items that are falsy or have no `url` are ignored and not counted in `totalUrls`.
export const countUrlParts = (urlItems: SearchResult[]) => {
  const hostnameCount: Record<string, number> = {};
  const pathPrefixCount: Record<string, number> = {};
  let totalUrls = 0;

  for (const entry of urlItems) {
    const candidate = entry as { title: string; url: string; description: string; weight?: number };
    if (!candidate || !candidate.url) continue; // nothing to count for this entry

    totalUrls += 1;
    const {hostname, path} = extractUrlParts(candidate.url);

    hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;

    // Grow the prefix one non-empty segment at a time and count every intermediate prefix
    let prefix = '';
    for (const segment of path.split('/')) {
      if (segment.length === 0) continue;
      prefix += '/' + segment;
      pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
    }
  }

  return {hostnameCount, pathPrefixCount, totalUrls};
};

// Calculate normalized frequency for boosting: the share `count / total`,
// or 0 when `total` is not positive (avoids division by zero).
const normalizeCount = (count: number, total: number): number => {
  return total > 0 ? count / total : 0;
};

/**
 * Calculate boosted weights and return the items sorted by descending `finalScore`.
 *
 * Each valid item's score is the sum of a hostname-frequency boost, a
 * path-prefix-frequency boost (decayed by depth), a boost from the item's own
 * `weight`, and any `jinaRerankBoost` already stored on the item, clamped to
 * `[minBoost, maxBoost]`.
 *
 * When `options.question` is non-empty a rerank request is started but NOT
 * awaited (this function is synchronous). Its scores are written onto the
 * input items when the request resolves, so they do not affect the array
 * returned by this call; they are only picked up by a later call that receives
 * the same item objects.
 *
 * @param urlItems Snippets to rank. Items that are falsy or lack a `url` are
 *   passed through unchanged and treated as score 0 when sorting.
 * @param options Optional overrides for the boost factors, `question` and `boostHostnames`.
 * @param trackers Tracker context; its `tokenTracker` is handed to the reranker.
 * @returns A new array of boosted snippets, highest score first.
 */
export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => {
  // Default parameters for boosting - can be overridden
  const {
    freqFactor = 0.5,           // How much to boost based on term frequency
    hostnameBoostFactor = 0.5,  // How much to boost based on hostname frequency
    pathBoostFactor = 0.4,      // How much to boost based on path frequency
    decayFactor = 0.8,          // Decay factor for longer paths (0-1)
    jinaRerankFactor = 0.8,     // How much to boost based on Jina reranking
    minBoost = 0,               // Minimum boost score
    maxBoost = 5,               // Maximum boost score cap
    question = '',              // Optional question for Jina reranking
    boostHostnames = [],        // Optional hostnames to boost
  } = options;

  // Count URL parts first
  const {hostnameCount, pathPrefixCount, totalUrls} = countUrlParts(urlItems);

  if (question.trim().length > 0) {
    // Fire-and-forget rerank: results land on the input items asynchronously.
    rerankDocuments(question, urlItems.map(item => smartMergeStrings(item.title, item.description)), trackers.tokenTracker)
      .then(({results}) => {
        results.forEach(({index, relevance_score}) => {
          (urlItems[index] as BoostedSearchSnippet).jinaRerankBoost = relevance_score * jinaRerankFactor;
        });
      })
      // Without a handler a failed rerank would surface as an unhandled promise rejection.
      .catch((error: unknown) => {
        console.error('Jina rerank failed, continuing without rerank boost:', error);
      });
  }

  return (urlItems as BoostedSearchSnippet[]).map(item => {
    if (!item || !item.url) {
      console.error('Skipping invalid item:', item);
      return item; // Return unchanged
    }

    const {hostname, path} = extractUrlParts(item.url);

    // Base weight from original; a missing weight counts as 0
    const freq = item.weight || 0;

    // Hostname boost (normalized by total URLs), doubled for explicitly boosted hostnames
    const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
    const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1);

    // Path boost (consider all path prefixes with decay for longer paths)
    let pathBoost = 0;
    const pathSegments = path.split('/').filter(segment => segment.length > 0);
    pathSegments.forEach((segment, index) => {
      const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
      const prefixCount = pathPrefixCount[prefix] || 0;
      const prefixFreq = normalizeCount(prefixCount, totalUrls);

      // Apply decay factor based on path depth
      const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
      pathBoost += decayedBoost;
    });

    // totalUrls is at least 1 here because this item itself was counted
    const freqBoost = freq / totalUrls * freqFactor;
    const jinaRerankBoost = item.jinaRerankBoost || 0;
    // Calculate new weight with clamping
    const finalScore = Math.min(
      Math.max(
        hostnameBoost
        + pathBoost
        + freqBoost
        + jinaRerankBoost, minBoost),
      maxBoost);

    return {
      ...item,
      freqBoost,
      hostnameBoost,
      pathBoost,
      jinaRerankBoost,
      finalScore
    } as BoostedSearchSnippet;
    // Invalid items were returned unchanged and carry no finalScore (or are
    // null); treat them as 0 so the comparator stays well-defined.
  }).sort((a, b) => (b?.finalScore ?? 0) - (a?.finalScore ?? 0));
};

/**
 * Registers a snippet in `allURLs` under its normalized URL.
 *
 * A new URL stores `r` itself with `weight` set to `weightDelta`. A known URL
 * has its weight increased by `weightDelta` and its description merged with
 * the incoming one.
 *
 * @returns 1 when a new entry was created, 0 when an existing entry was
 *   updated or the URL could not be normalized.
 */
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
  const key = normalizeUrl(r.url);
  if (!key) return 0;

  const existing = allURLs[key];
  if (existing) {
    (existing.weight as number) += weightDelta;
    existing.description = smartMergeStrings(existing.description, r.description);
    return 0;
  }

  // First sighting: the caller's object becomes the stored entry
  allURLs[key] = r;
  r.weight = weightDelta;
  return 1;
}

/**
 * Renders the highest-scoring snippets as one line each:
 * `  + weight: <score> "<url>": "<merged title and description>"`.
 *
 * Snippets whose merged text is empty are skipped. The rest are sorted by
 * descending score and truncated to `maxURLs`.
 *
 * @param allURLs Boosted snippets to render.
 * @param maxURLs Maximum number of lines to emit.
 * @returns The lines joined with newlines, or '' when there is no input.
 */
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
  if (!allURLs || allURLs.length === 0) return '';

  return (allURLs)
    .map(r => {
      const merged = smartMergeStrings(r.title, r.description);
      return {
        url: r.url,
        score: r.finalScore,
        merged
      };
    })
    .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
    .sort((a, b) => (b.score || 0) - (a.score || 0))
    .slice(0, maxURLs)
    // A snippet may arrive without a finalScore; print it as 0.00 instead of
    // throwing on `undefined.toFixed`, matching the fallback used for sorting.
    .map(item => `  + weight: ${(item.score ?? 0).toFixed(2)} "${item.url}": "${item.merged}"`)
    .join('\n');
}


/**
 * Draw one item at random, with probability proportional to its weight.
 *
 * @param items Array of [item, weight] tuples
 * @returns A randomly selected item, or null if the array is empty or the
 *   weights do not sum to a finite positive number
 */
export function sampleMultinomial<T>(items: [T, number][]): T | null {
  // Handle empty array
  if (!items || items.length === 0) {
    return null;
  }

  // Calculate total weight
  const totalWeight = items.reduce((sum, [, weight]) => sum + weight, 0);

  // Nothing can be sampled when the total is zero, negative, NaN or infinite
  if (!Number.isFinite(totalWeight) || totalWeight <= 0) {
    return null;
  }

  // Math.random() is in [0, 1), so randValue is in [0, totalWeight)
  const randValue = Math.random() * totalWeight;

  // Find the item whose cumulative interval contains the random value
  let cumulativeWeight = 0;

  for (const [item, weight] of items) {
    cumulativeWeight += weight;
    // Strict comparison: with `<=` a zero-weight item at the front would be
    // selected whenever randValue is exactly 0.
    if (randValue < cumulativeWeight) {
      return item;
    }
  }

  // Fallback (should rarely happen due to floating point precision)
  return items[items.length - 1][0];
}


/**
 * Asks the datetime detection API when the given URL was last modified.
 *
 * @param url The page whose last-modified date is wanted
 * @returns The API's `bestGuess` when it is present and reported with a
 *   confidence of at least 70; otherwise `undefined`. Request or parsing
 *   failures are logged and also produce `undefined`.
 */
export async function getLastModified(url: string): Promise<string | undefined> {
  try {
    // The target URL travels as a query parameter, so it has to be encoded
    const endpoint = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`;
    const res = await fetch(endpoint);

    if (!res.ok) {
      throw new Error(`API returned ${res.status}`);
    }

    const payload = await res.json();

    // Low-confidence or missing guesses are treated as "unknown"
    const isTrustworthy = payload.bestGuess && payload.confidence >= 70;
    return isTrustworthy ? payload.bestGuess : undefined;
  } catch (error) {
    console.error('Failed to fetch last modified date:', error);
    return undefined;
  }
}


/**
 * Keeps at most `k` results per hostname, preserving the input order.
 *
 * @param results Snippets to thin out; earlier entries win over later ones.
 * @param k Maximum number of results to keep for any single hostname.
 * @returns A new array containing the retained results.
 */
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
  const keptPerHost: Record<string, number> = {};

  return results.filter(result => {
    const {hostname} = extractUrlParts(result.url);
    const alreadyKept = keptPerHost[hostname] === undefined ? 0 : keptPerHost[hostname];

    if (alreadyKept < k) {
      keptPerHost[hostname] = alreadyKept + 1;
      return true;
    }
    return false;
  });
}

// Error-message fragments that mark a failure of the host itself rather than of a single page.
const BAD_HOST_ERROR_MARKERS = [
  "Couldn't resolve host name",
  "could not be resolved",
  "ERR_CERT_COMMON_NAME_INVALID",
  "ERR_CONNECTION_REFUSED"
];

// Decides whether a read error should blacklist the whole hostname.
const isBadHostnameError = (error: unknown): boolean => {
  const {name, message} = (error ?? {}) as { name?: unknown; message?: unknown };
  const text = typeof message === 'string' ? message : '';

  return (name === 'ParamValidationError' && text.includes('Domain')) ||
    (name === 'AssertionFailureError' && text.includes('resolve host name')) ||
    BAD_HOST_ERROR_MARKERS.some(marker => text.includes(marker));
};

/**
 * Reads the given URLs in parallel and folds what was learned into the shared state.
 *
 * For every URL that normalizes and reads successfully, a knowledge item is
 * appended to `allKnowledge` and the page's links are added to `allURLs` with
 * a low initial weight. Every attempted URL is appended to `visitedURLs`.
 * URLs whose read fails are appended to `badURLs`; when the failure points at
 * the host itself, all entries of that hostname are removed from `allURLs`.
 *
 * @param urls URLs to visit.
 * @param context Tracker context (action and token tracking).
 * @param allKnowledge Knowledge base, appended to in place.
 * @param allURLs Known URLs keyed by normalized URL, mutated in place.
 * @param visitedURLs Visited list, appended to in place.
 * @param badURLs Failed-URL list, appended to in place.
 * @param schemaGen Schema helper; supplies the language code and is passed to the extractor.
 * @param question The question the pages are read for.
 * @returns The successful reads, whether there was at least one, and the
 *   same `badURLs` array that was passed in.
 */
export async function processURLs(
  urls: string[],
  context: TrackerContext,
  allKnowledge: KnowledgeItem[],
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
  badURLs: string[],
  schemaGen: Schemas,
  question: string
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
  // Skip if no URLs to process. Hand back the caller's badURLs (not a fresh
  // empty array) so both return paths report the same list.
  if (urls.length === 0) {
    return {urlResults: [], success: false, badURLs};
  }

  const badHostnames: string[] = [];

  // Track the reading action
  const thisStep: VisitAction = {
    action: 'visit',
    think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
    URLTargets: urls
  }
  context.actionTracker.trackAction({thisStep})

  // Process each URL in parallel
  const urlResults = await Promise.all(
    urls.map(async url => {
      try {
        const normalizedUrl = normalizeUrl(url);
        if (!normalizedUrl) {
          return null;
        }

        // Store normalized URL for consistent reference
        url = normalizedUrl;

        const {response} = await readUrl(url, true, context.tokenTracker);
        const {data} = response;
        const guessedTime = await getLastModified(url);
        if (guessedTime) {
          console.log('Guessed time for', url, guessedTime);
        }

        // Treat a response without url or content as a failed read
        if (!data?.url || !data?.content) {
          throw new Error('No content found');
        }

        // Add to knowledge base
        allKnowledge.push({
          question: `What do expert say about "${question}"?`,
          answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
          references: [data.url],
          type: 'url',
          updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
        });

        // Process page links
        data.links?.forEach(link => {
          const nnUrl = normalizeUrl(link[1]);
          if (!nnUrl) return;
          const r: SearchSnippet = {
            title: link[0],
            url: nnUrl,
            description: link[0],
          }
          // in-page link has lower initial weight comparing to search links
          if (r.url) {
            addToAllURLs(r, allURLs, 0.1);
          }
        });

        return {url, result: response};
      } catch (error: unknown) {
        console.error('Error reading URL:', url, error);
        badURLs.push(url);

        if (isBadHostnameError(error)) {
          // extractUrlParts never throws; it yields '' for unparsable input.
          // An empty hostname is skipped so it cannot match unrelated entries
          // during the cleanup below.
          const hostname = extractUrlParts(url).hostname;
          if (hostname) {
            badHostnames.push(hostname);
            console.log(`Added ${hostname} to bad hostnames list`);
          }
        }
        return null;
      } finally {
        // Record the attempt whether it succeeded or failed
        if (url) {
          visitedURLs.push(url);
        }
      }
    })
  );

  // Filter out null results without changing the original array
  const validResults = urlResults.filter(Boolean);

  // remove any URL with bad hostnames from allURLs
  if (badHostnames.length > 0) {
    const badHostSet = new Set(badHostnames);
    for (const url of Object.keys(allURLs)) {
      if (badHostSet.has(extractUrlParts(url).hostname)) {
        delete allURLs[url];
        console.log(`Removed ${url} from allURLs`);
      }
    }
  }

  return {
    urlResults: validResults,
    success: validResults.length > 0,
    badURLs
  };
}