From 51ad77d302d34170c8db8fffe3753f64048f5fb2 Mon Sep 17 00:00:00 2001
From: Han Xiao <han.xiao@jina.ai>
Date: Tue, 4 Mar 2025 16:29:22 +0800
Subject: [PATCH] feat: add url ranking

---
 src/agent.ts            |  73 +++++++++++++++++---------
 src/tools/jina-dedup.ts |   2 +-
 src/types.ts            |  16 +++++-
 src/utils/text-tools.ts |  38 +++++++++++++-
 src/utils/url-tools.ts  | 110 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 208 insertions(+), 31 deletions(-)
diff --git a/src/agent.ts b/src/agent.ts
index 665fac0..4851603 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
 import {analyzeSteps} from "./tools/error-analyzer";
 import {TokenTracker} from "./utils/token-tracker";
 import {ActionTracker} from "./utils/action-tracker";
-import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType} from "./types";
+import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType, BoostedSearchResult} from "./types";
 import {TrackerContext} from "./types";
 import {search} from "./tools/jina-search";
 // import {grounding} from "./tools/grounding";
@@ -19,8 +19,8 @@ import {zodToJsonSchema} from "zod-to-json-schema";
 import {ObjectGeneratorSafe} from "./utils/safe-generator";
 import {CodeSandbox} from "./tools/code-sandbox";
 import {serperSearch} from './tools/serper-search';
-import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
-import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
+import {calculateBoostedWeights, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
+import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools";
 import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
 
 async function sleep(ms: number) {
@@ -132,18 +132,22 @@ ${learnedStrategy}
   if (allowRead) {
     let urlList = '';
     if (allURLs && allURLs.length > 0) {
-      urlList = allURLs
+      const weightedURLs = calculateBoostedWeights(allURLs) as BoostedSearchResult[]
+
+      urlList = (weightedURLs)
         .filter(r => 'url' in r)
-        .map(r => `  + "${r.url}": "${r.title}"`)
+        .sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0))
+        .slice(0, 10)  // save context window and reduce noise, only keep top 10 urls
+        .map(r => `  + weight: ${r.boostedWeight.toFixed(3)} "${r.url}": "${r.title}"`)
         .join('\n');
     }
 
     actionSections.push(`
 <action-visit>
 - Access and read full content from URLs
-- Must check URLs mentioned in <question>
+- Must check URLs mentioned in <question> if any
 ${urlList ? `    
-- Review relevant URLs below for additional information
+- Choose and visit relevant URLs below for more knowledge. higher weight means more relevant and you should visit first:
 <url-list>
 ${urlList}
 </url-list>
@@ -302,7 +306,7 @@ export async function getResponse(question?: string,
       evaluationMetrics[currentQuestion] =
         await evaluateQuestion(currentQuestion, context, SchemaGen)
     }
-    if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step===1) {
+    if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step === 1) {
       // force strict eval for the original question, only once.
       evaluationMetrics[currentQuestion].push('strict')
     }
@@ -315,7 +319,7 @@ export async function getResponse(question?: string,
 
     // update all urls with buildURLMap
     // allowRead = allowRead && (Object.keys(allURLs).length > 0);
-    allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 50);  // disable search when too many urls already
+    allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 70);  // disable search when too many urls already
 
     // generate prompt for this step
     system = getPrompt(
@@ -568,10 +572,20 @@ But then you realized you have asked them before. You decided to to think out of
           const minResults = (results).map(r => ({
             title: r.title,
             url: normalizeUrl('url' in r ? r.url : r.link),
-            description: 'description' in r ? r.description : r.snippet
+            description: 'description' in r ? r.description : r.snippet,
           }));
 
-          minResults.forEach(r => allURLs[r.url] = r);
+          minResults.forEach(r => {
+            if (!allURLs[r.url]) {
+              allURLs[r.url] = r;
+              allURLs[r.url].weight = 1;
+            } else {
+              (allURLs[r.url].weight as number)++;
+              const curDesc = (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description;
+              (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description = smartMergeStrings(curDesc, r.description);
+            }
+
+          });
           allKeywords.push(query);
 
           allKnowledge.push({
@@ -722,11 +736,11 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
     }
 
 
-    await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
+    await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
     await sleep(STEP_SLEEP);
   }
 
-  await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
+  await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
   if (!(thisStep as AnswerAction).isFinal) {
     console.log('Enter Beast mode!!!')
     // any answer is better than no answer, humanity last resort
@@ -766,7 +780,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
   (thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction))
   console.log(thisStep)
 
-  await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
+  await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
   return {
     result: thisStep,
     context,
@@ -776,16 +790,25 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
 
 }
 
-async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
+async function storeContext(prompt: string, schema: any, memory: {
+                              allContext: StepAction[];
+                              allKeywords: string[];
+                              allQuestions: string[];
+                              allKnowledge: KnowledgeItem[];
+                              allURLs: Record<string, SearchResult>;
+                            }
+  , step: number) {
+
+  const {allContext, allKeywords, allQuestions, allKnowledge, allURLs} = memory;
   if ((process as any).asyncLocalContext?.available?.()) {
-    const [context, keywords, questions, knowledge] = memory;
+
     (process as any).asyncLocalContext.ctx.promptContext = {
       prompt,
       schema,
-      context,
-      keywords,
-      questions,
-      knowledge,
+      allContext,
+      allKeywords,
+      allQuestions,
+      allKnowledge,
       step
     };
     return;
@@ -799,11 +822,11 @@ ${prompt}
 JSONSchema:
 ${JSON.stringify(zodToJsonSchema(schema), null, 2)}
 `);
-    const [context, keywords, questions, knowledge] = memory;
-    await fs.writeFile('context.json', JSON.stringify(context, null, 2));
-    await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2));
-    await fs.writeFile('questions.json', JSON.stringify(questions, null, 2));
-    await fs.writeFile('knowledge.json', JSON.stringify(knowledge, null, 2));
+    await fs.writeFile('context.json', JSON.stringify(allContext, null, 2));
+    await fs.writeFile('queries.json', JSON.stringify(allKeywords, null, 2));
+    await fs.writeFile('questions.json', JSON.stringify(allQuestions, null, 2));
+    await fs.writeFile('knowledge.json', JSON.stringify(allKnowledge, null, 2));
+    await fs.writeFile('urls.json', JSON.stringify(calculateBoostedWeights(Object.entries(allURLs).map(([, result]) => result)), null, 2));
   } catch (error) {
     console.error('Context storage failed:', error);
   }
diff --git a/src/tools/jina-dedup.ts b/src/tools/jina-dedup.ts
index abcbdeb..f18768a 100644
--- a/src/tools/jina-dedup.ts
+++ b/src/tools/jina-dedup.ts
@@ -3,7 +3,7 @@ import {TokenTracker} from "../utils/token-tracker";
 import {JINA_API_KEY} from "../config";
 
 const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
-const SIMILARITY_THRESHOLD = 0.85; // Adjustable threshold for cosine similarity
+const SIMILARITY_THRESHOLD = 0.888; // Adjustable threshold for cosine similarity
 
 const JINA_API_CONFIG = {
   MODEL: 'jina-embeddings-v3',
diff --git a/src/types.ts b/src/types.ts
index f2a6d50..d8ad27a 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -172,9 +172,20 @@ export type ErrorAnalysisResponse = {
 };
 
 export type SearchResult =
-  | { title: string; url: string; description: string }
-  | { title: string; link: string; snippet: string };
+  | { title: string; url: string; description: string; weight?: number }
+  | { title: string; link: string; snippet: string; weight?: number };
 
+export type BoostedSearchResult = { // url-shaped search result plus the ranking scores attached by calculateBoostedWeights
+  title: string;
+  url: string;
+  description: string;
+  weight: number; // NOTE(review): appears to be how many search results returned this URL — confirm against the caller
+  originalWeight: number; // the item's weight before boosting (1 when weight was missing)
+  hostnameBoost: number; // share of URLs with the same hostname, times the hostname boost factor
+  pathBoost: number; // sum over the URL's path prefixes of their share of URLs, decayed by depth, times the path boost factor
+  boostScore: number; // hostnameBoost + pathBoost, clamped to the configured min/max boost
+  boostedWeight: number; // originalWeight + boostScore
+}
 
 // OpenAI API Types
 export interface Model {
@@ -190,6 +201,7 @@ export type ResponseFormat = {
   type: 'json_schema' | 'json_object';
   json_schema?: any;
 }
+
 export interface ChatCompletionRequest {
   model: string;
   messages: Array<CoreUserMessage | CoreAssistantMessage>;
diff --git a/src/utils/text-tools.ts b/src/utils/text-tools.ts
index f0e8a62..e7e5941 100644
--- a/src/utils/text-tools.ts
+++ b/src/utils/text-tools.ts
@@ -160,4 +160,40 @@ export function getI18nText(key: string, lang = 'en', params: Record<string, str
   }
 
   return text;
-}
\ No newline at end of file
+}
+
+/**
+ * Merges two strings while collapsing text they share.
+ *
+ * - If either string is empty, the other is returned.
+ * - If one string contains the other, the containing string is returned.
+ * - Otherwise the longest suffix of `str1` that equals a prefix of `str2`
+ *   is emitted once; with no such overlap the strings are concatenated.
+ *
+ * @returns the merged string
+ */
+export function smartMergeStrings(str1: string, str2: string): string {
+  // An empty operand contributes nothing, so the other one is the result.
+  if (!str1) return str2;
+  if (!str2) return str1;
+
+  // If one operand already contains the other, the container is the merge.
+  if (str1.includes(str2)) return str1;
+  if (str2.includes(str1)) return str2;
+
+  // Look for the longest suffix of str1 that is also a prefix of str2.
+  // Candidates are tried from the longest (the shorter operand's length)
+  // down to a single character; the first hit is therefore the longest.
+  let sharedLength = 0;
+  let candidate = Math.min(str1.length, str2.length);
+  while (candidate > 0 && sharedLength === 0) {
+    if (str1.endsWith(str2.slice(0, candidate))) {
+      sharedLength = candidate;
+    }
+    candidate -= 1;
+  }
+
+  // Emit the shared run once: str1 without its overlapping tail, then str2.
+  // With no overlap (sharedLength === 0) this is plain concatenation.
+  return str1.slice(0, str1.length - sharedLength) + str2;
+}
diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts
index a882fb9..39148e8 100644
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@@ -1,4 +1,4 @@
-import {SearchResult} from "../types";
+import {BoostedSearchResult, SearchResult} from "../types";
 
 export function normalizeUrl(urlString: string, debug = false): string {
     if (!urlString?.trim()) {
@@ -101,4 +101,110 @@ export function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedU
     return Object.entries(allURLs)
         .filter(([url]) => !visitedURLs.includes(url))
         .map(([, result]) => result);
-}
\ No newline at end of file
+}
+
+
+// Break a URL string into its hostname and pathname.
+// A string that `new URL` rejects is logged and yields empty parts instead of throwing.
+const extractUrlParts = (urlStr: string) => {
+  let parsed: URL;
+  try {
+    parsed = new URL(urlStr);
+  } catch (e) {
+    console.error(`Error parsing URL: ${urlStr}`, e);
+    return { hostname: "", path: "" };
+  }
+  const { hostname, pathname } = parsed;
+  return { hostname, path: pathname };
+};
+
+// Tally how many valid URLs share each hostname and each cumulative path prefix.
+// Items without a usable `url` are skipped and do not count toward totalUrls.
+const countUrlParts = (urlItems: SearchResult[]) => {
+  // Null-prototype maps: a URL-derived key such as "constructor" or "__proto__"
+  // must not resolve to a member inherited from Object.prototype.
+  const hostnameCount: Record<string, number> = Object.create(null);
+  const pathPrefixCount: Record<string, number> = Object.create(null);
+  let totalUrls = 0;
+
+  for (const item of urlItems) {
+    if (!item || !('url' in item) || !item.url) continue; // Skip invalid items
+
+    totalUrls++;
+    const { hostname, path } = extractUrlParts(item.url);
+    hostnameCount[hostname] = (hostnameCount[hostname] ?? 0) + 1;
+
+    // Grow the prefix one segment at a time: /a, /a/b, /a/b/c, ...
+    let prefix = '';
+    for (const segment of path.split('/').filter(part => part.length > 0)) {
+      prefix += '/' + segment;
+      pathPrefixCount[prefix] = (pathPrefixCount[prefix] ?? 0) + 1;
+    }
+  }
+
+  return { hostnameCount, pathPrefixCount, totalUrls };
+};
+
+// Fraction of `total` represented by `count`; yields 0 when `total` is not
+// positive, which avoids dividing by zero for an empty URL list.
+const normalizeCount = (count: number, total: number): number =>
+  total > 0 ? count / total : 0;
+
+// Rank URLs by boosting each item's weight with how common its hostname and path
+// prefixes are across urlItems; every returned item carries the boost fields.
+export const calculateBoostedWeights = (
+  urlItems: SearchResult[],
+  options: Partial<Record<'hostnameBoostFactor' | 'pathBoostFactor' | 'decayFactor' | 'minBoost' | 'maxBoost', number>> = {}
+): any[] => {
+  // Default parameters for boosting - can be overridden
+  const {
+    hostnameBoostFactor = 0.7,  // How much to boost based on hostname frequency
+    pathBoostFactor = 0.4,      // How much to boost based on path frequency
+    decayFactor = 0.8,          // Decay factor for longer paths (0-1)
+    minBoost = 0,               // Minimum boost score
+    maxBoost = 5                // Maximum boost score cap
+  } = options;
+
+  // Frequencies are relative to the whole list, so count everything up front.
+  const { hostnameCount, pathPrefixCount, totalUrls } = countUrlParts(urlItems);
+
+  return urlItems.map(item => {
+    // Base weight from original; a missing (or zero) weight counts as 1.
+    const originalWeight = item?.weight || 1.0;
+    if (!item || !('url' in item) || !item.url) {
+      // Nothing to score. Still attach neutral boost fields so callers that
+      // sort or format by boostedWeight never see it undefined.
+      console.error('Skipping invalid item:', item);
+      return { ...item, originalWeight, hostnameBoost: 0, pathBoost: 0, boostScore: 0, boostedWeight: originalWeight };
+    }
+
+    const { hostname, path } = extractUrlParts(item.url);
+
+    // Hostname boost (normalized by total URLs)
+    const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
+    const hostnameBoost = hostnameFreq * hostnameBoostFactor;
+
+    // Path boost: every cumulative prefix (/a, /a/b, ...) contributes its
+    // frequency scaled by decayFactor^depth (default 0.8: deeper counts less).
+    let pathBoost = 0;
+    let prefix = '';
+    path.split('/').filter(segment => segment.length > 0).forEach((segment, depth) => {
+      prefix += '/' + segment;
+      const prefixFreq = normalizeCount(pathPrefixCount[prefix] || 0, totalUrls);
+      pathBoost += prefixFreq * Math.pow(decayFactor, depth) * pathBoostFactor;
+    });
+
+    // Clamp the combined boost to [minBoost, maxBoost] and add it on top.
+    const boostScore = Math.min(Math.max(hostnameBoost + pathBoost, minBoost), maxBoost);
+    const boostedWeight = originalWeight + boostScore;
+
+    return {
+      ...item,
+      originalWeight,
+      hostnameBoost,
+      pathBoost,
+      boostScore,
+      boostedWeight
+    } as BoostedSearchResult;
+  });
+};
\ No newline at end of file