fix: add spacing for consistency in agent and url-tools

Han Xiao 2025-06-09 15:48:41 -07:00
parent b797273626
commit 2affd41c79
2 changed files with 39 additions and 30 deletions


@@ -517,6 +517,7 @@ export async function getResponse(question?: string,
question: currentQuestion,
boostHostnames
}, context);
+ // improve diversity by keeping the top 2 URLs per hostname
weightedURLs = keepKPerHostname(weightedURLs, 2);
console.log('Weighted URLs:', weightedURLs.length);


@@ -1,12 +1,12 @@
- import {BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types";
- import {getI18nText, smartMergeStrings} from "./text-tools";
- import {rerankDocuments} from "../tools/jina-rerank";
- import {readUrl} from "../tools/read";
- import {Schemas} from "./schemas";
- import {cherryPick} from "../tools/jina-latechunk";
- import {formatDateBasedOnType} from "./date-tools";
- import {classifyText} from "../tools/jina-classify-spam";
- import {segmentText} from "../tools/segment";
+ import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
+ import { getI18nText, smartMergeStrings } from "./text-tools";
+ import { rerankDocuments } from "../tools/jina-rerank";
+ import { readUrl } from "../tools/read";
+ import { Schemas } from "./schemas";
+ import { cherryPick } from "../tools/jina-latechunk";
+ import { formatDateBasedOnType } from "./date-tools";
+ import { classifyText } from "../tools/jina-classify-spam";
+ import { segmentText } from "../tools/segment";
import axiosClient from "./axios-client";
export function normalizeUrl(urlString: string, debug = false, options = {
@@ -179,7 +179,7 @@ const extractUrlParts = (urlStr: string) => {
};
} catch (e) {
console.error(`Error parsing URL: ${urlStr}`, e);
- return {hostname: "", path: ""};
+ return { hostname: "", path: "" };
}
};
@@ -203,7 +203,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
if (!item || !item.url) return; // Skip invalid items
totalUrls++;
- const {hostname, path} = extractUrlParts(item.url);
+ const { hostname, path } = extractUrlParts(item.url);
// Count hostnames
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
@@ -216,7 +216,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
});
});
- return {hostnameCount, pathPrefixCount, totalUrls};
+ return { hostnameCount, pathPrefixCount, totalUrls };
};
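An aside on the shape countUrlParts returns: a hedged usage sketch with hypothetical input (the items array is invented for illustration; real SearchSnippet values carry more fields than url, and the exact path-prefix splitting depends on the helper, which this diff does not show):

// Hypothetical input for illustration only.
const items = [
  { url: 'https://example.com/docs/a' },
  { url: 'https://example.com/docs/b' },
  { url: 'https://other.org/post' },
] as unknown as SearchSnippet[];

const { hostnameCount, pathPrefixCount, totalUrls } = countUrlParts(items);
console.log(hostnameCount); // { 'example.com': 2, 'other.org': 1 }
console.log(totalUrls);     // 3
// pathPrefixCount depends on how the function splits path segments; not shown here.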
// Calculate normalized frequency for boosting
@@ -241,7 +241,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
// Count URL parts first
const counts = countUrlParts(urlItems);
- const {hostnameCount, pathPrefixCount, totalUrls} = counts;
+ const { hostnameCount, pathPrefixCount, totalUrls } = counts;
if (question.trim().length > 0) {
// Step 1: Create a record to track unique content with their original indices
@@ -262,9 +262,9 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
const uniqueIndicesMap = Object.values(uniqueContentMap);
console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
rerankDocuments(question, uniqueContents, trackers.tokenTracker)
- .then(({results}) => {
+ .then(({ results }) => {
// Step 3: Map the scores back to all original items
- results.forEach(({index, relevance_score}) => {
+ results.forEach(({ index, relevance_score }) => {
const originalIndices = uniqueIndicesMap[index];
const boost = relevance_score * jinaRerankFactor;
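This hunk sits inside a dedupe-then-rerank pattern: each unique snippet text is scored once by rerankDocuments, and the score is fanned back out to every original index via uniqueIndicesMap. A self-contained sketch of that pattern, with a stub scorer standing in for the Jina reranker:

// Score each unique text once, then map scores back to all duplicates.
async function rerankWithDedupe(
  texts: string[],
  score: (unique: string[]) => Promise<number[]> // stand-in for rerankDocuments
): Promise<number[]> {
  const indicesByText: Record<string, number[]> = {};
  texts.forEach((t, i) => (indicesByText[t] ??= []).push(i));

  const unique = Object.keys(indicesByText);
  const scores = await score(unique);

  const out = new Array<number>(texts.length).fill(0);
  unique.forEach((t, u) => indicesByText[t].forEach(i => { out[i] = scores[u]; }));
  return out;
}

This keeps the reranker call proportional to the number of distinct texts rather than the number of URLs, which is exactly what the `${urlItems.length}->${uniqueContents.length}` log above is reporting.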
@@ -283,7 +283,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
return item; // Return unchanged
}
- const {hostname, path} = extractUrlParts(item.url);
+ const { hostname, path } = extractUrlParts(item.url);
// Base weight from original
const freq = item.weight || 0; // Default to 0 if weight is missing
@@ -427,6 +427,14 @@ export async function getLastModified(url: string): Promise<string | undefined>
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
// First count unique hostnames
const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));
+ // If only one or zero unique hostnames, return original results
+ if (uniqueHostnames.size <= 1) {
+ return results;
+ }
const hostnameMap: Record<string, number> = {};
const filteredResults: BoostedSearchSnippet[] = [];
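The diff truncates keepKPerHostname after these declarations. A plausible completion, under the assumption that results arrive sorted best-first (so keeping the first k per hostname keeps the strongest ones); this is a sketch, not the committed code:

// Plausible continuation: admit at most k results per hostname,
// preserving the incoming (already ranked) order.
for (const result of results) {
  const { hostname } = extractUrlParts(result.url);
  hostnameMap[hostname] = (hostnameMap[hostname] || 0) + 1;
  if (hostnameMap[hostname] <= k) {
    filteredResults.push(result);
  }
}
return filteredResults;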
@@ -458,7 +466,7 @@ export async function processURLs(
): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process
if (urls.length === 0) {
- return {urlResults: [], success: false};
+ return { urlResults: [], success: false };
}
const badHostnames: string[] = [];
@@ -466,10 +474,10 @@ export async function processURLs(
// Track the reading action
const thisStep: VisitAction = {
action: 'visit',
- think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
+ think: getI18nText('read_for', schemaGen.languageCode, { urls: urls.join(', ') }),
URLTargets: urls
}
- context.actionTracker.trackAction({thisStep})
+ context.actionTracker.trackAction({ thisStep })
// Process each URL in parallel
const urlResults = await Promise.all(
@@ -483,8 +491,8 @@ export async function processURLs(
// Store normalized URL for consistent reference
url = normalizedUrl;
- const {response} = await readUrl(url, true, context.tokenTracker);
- const {data} = response;
+ const { response } = await readUrl(url, true, context.tokenTracker);
+ const { data } = response;
const guessedTime = await getLastModified(url);
if (guessedTime) {
console.log('Guessed time for', url, guessedTime);
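getLastModified's body never appears in this diff; only its signature shows in a hunk header above. One common way to implement such a guess is a HEAD request through the shared axiosClient, reading the standard Last-Modified header. Everything below is that assumption, not the repo's code (including the function name and the timeout value):

// Hedged sketch: guess a page's modification date from its HTTP headers.
// Assumes axiosClient behaves like a plain axios instance.
async function guessLastModified(url: string): Promise<string | undefined> {
  try {
    const res = await axiosClient.head(url, { timeout: 5000 });
    const lastModified = res.headers['last-modified'];
    return lastModified ? new Date(lastModified).toISOString() : undefined;
  } catch {
    return undefined; // unreachable or header-less hosts simply yield no guess
  }
}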
@@ -505,7 +513,7 @@ export async function processURLs(
}
// add to web contents
- const {chunks, chunk_positions } = await segmentText(data.content, context);
+ const { chunks, chunk_positions } = await segmentText(data.content, context);
// filter out the chunks that are too short, minChunkLength is 80
const minChunkLength = 80;
for (let i = 0; i < chunks.length; i++) {
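The loop body that applies minChunkLength is cut off here. A plausible shape for it, assuming chunk_positions is index-aligned with chunks (keptChunks and keptPositions are illustrative names, not the repo's):

// Plausible continuation: drop chunks under 80 characters, keeping
// their positions aligned with the surviving chunks.
const keptChunks: string[] = [];
const keptPositions: typeof chunk_positions = [];
for (let i = 0; i < chunks.length; i++) {
  if (chunks[i].length >= minChunkLength) {
    keptChunks.push(chunks[i]);
    keptPositions.push(chunk_positions[i]);
  }
}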
@@ -546,7 +554,7 @@ export async function processURLs(
}
});
- return {url, result: response};
+ return { url, result: response };
} catch (error: any) {
console.error('Error reading URL:', url, error);
badURLs.push(url);
@@ -593,11 +601,11 @@ export async function processURLs(
// remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => {
- if (badHostnames.includes(extractUrlParts(url).hostname)) {
- delete allURLs[url];
- console.log(`Removed ${url} from allURLs`);
- }
+ if (badHostnames.includes(extractUrlParts(url).hostname)) {
+ delete allURLs[url];
+ console.log(`Removed ${url} from allURLs`);
+ }
}
)
}
@@ -658,7 +666,7 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: number
const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
// Find all matches
- const matches: Array<{url: string, index: number, length: number}> = [];
+ const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null;
while ((match = urlPattern.exec(text)) !== null) {
@@ -697,14 +705,14 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: number
// Adjust boundaries to avoid overlapping with other URLs
if (i > 0) {
- const prevUrl = matches[i-1];
+ const prevUrl = matches[i - 1];
if (startPos < prevUrl.index + prevUrl.length) {
startPos = prevUrl.index + prevUrl.length;
}
}
if (i < matches.length - 1) {
- const nextUrl = matches[i+1];
+ const nextUrl = matches[i + 1];
if (endPos > nextUrl.index) {
endPos = nextUrl.index;
}
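For reference, the regex in this function stops a URL at whitespace, so neighbouring matches never share characters; the i - 1 / i + 1 checks above then clamp each URL's context window so descriptions do not overlap. A small standalone demonstration of the matching step (the pattern is copied verbatim from the hunk above; the sample text is hypothetical):

const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
const text = 'See https://example.com/a and https://example.org/b for details.';

const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null;
while ((match = urlPattern.exec(text)) !== null) {
  matches.push({ url: match[0], index: match.index, length: match[0].length });
}
// Each match's index/length bounds its context window; the clamping above
// keeps window i from crossing into match i - 1 or i + 1.
console.log(matches.map(m => m.url)); // ['https://example.com/a', 'https://example.org/b']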