fix: emit url idx in visit action

2025-12-26 06:28:56 +08:00 · 2025-03-27 15:29:51 +08:00 · 2025-03-27 15:29:51 +08:00 · 7bd4f51f42
commit 7bd4f51f42
parent 98d83e84bb
4 changed files with 128 additions and 41 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@ -30,7 +30,7 @@ import {
  rankURLs,
  filterURLs,
  normalizeUrl,
-  weightedURLToString, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks
+  sortSelectURLs, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks, extractUrlsWithDescription
 } from "./utils/url-tools";
 import {
  buildMdFromAnswer,
@ -111,7 +111,7 @@ function getPrompt(
  knowledge?: KnowledgeItem[],
  allURLs?: BoostedSearchSnippet[],
  beastMode?: boolean,
-): string {
+): { system: string, urlList?: string[]} {
  const sections: string[] = [];
  const actionSections: string[] = [];

@ -136,19 +136,20 @@ ${context.join('\n')}

  // Build actions section

-  if (allowRead) {
-    const urlList = weightedURLToString(allURLs || [], 20);
+  const urlList = sortSelectURLs(allURLs || [], 20);
+  if (allowRead && urlList.length > 0) {
+    const urlListStr = urlList
+      .map((item, idx) => `  - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
+    .join('\n')

    actionSections.push(`
 <action-visit>
 - Crawl and read full content from URLs, you can get the fulltext, last updated datetime etc of any URL.  
- Must check URLs mentioned in <question> if any
-${urlList ? `    
+- Must check URLs mentioned in <question> if any    
 - Choose and visit relevant URLs below for more knowledge. higher weight suggests more relevant:
 <url-list>
-${urlList}
+${urlListStr}
 </url-list>
-`.trim() : ''}
 </action-visit>
 `);
  }
@ -228,7 +229,10 @@ ${actionSections.join('\n\n')}
  // Add footer
  sections.push(`Think step by step, choose the action, then respond by matching the schema of that action.`);

-  return removeExtraLineBreaks(sections.join('\n\n'));
+  return {
+    system: removeExtraLineBreaks(sections.join('\n\n')),
+    urlList: urlList.map(u => u.url)
+};
 }


@ -421,7 +425,6 @@ export async function getResponse(question?: string,
  let allowRead = true;
  let allowReflect = true;
  let allowCoding = false;
-  let system = '';
  let msgWithKnowledge: CoreMessage[] = [];
  let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};

@ -433,6 +436,23 @@ export async function getResponse(question?: string,
  const regularBudget = tokenBudget * 0.85;
  const finalAnswerPIP: string[] = [];
  let trivialQuestion = false;
+
+  // add all mentioned URLs in messages to allURLs
+  messages.forEach(m => {
+    let strMsg = '';
+    if (typeof m.content === 'string') {
+      strMsg =  m.content.trim();
+    } else if (typeof  m.content === 'object' && Array.isArray( m.content)) {
+      // concatenate the text of every sub content whose 'type' is 'text' (newline-joined) so it can be scanned for URLs
+      strMsg =  m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
+    }
+
+    extractUrlsWithDescription(strMsg).forEach(u => {
+      addToAllURLs(u, allURLs);
+    });
+  })
+
+
  while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
    // add 1s delay to avoid rate limiting
    step++;
@ -486,7 +506,7 @@ export async function getResponse(question?: string,
    allowSearch = allowSearch && (weightedURLs.length < 200);  // disable search when too many urls already

    // generate prompt for this step
-    system = getPrompt(
+    const { system, urlList} = getPrompt(
      diaryContext,
      allQuestions,
      allKeywords,
@ -792,10 +812,10 @@ You decided to think out of the box or cut from a completely different angle.
        });
      }
      allowSearch = false;
-    } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
+    } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length && urlList?.length) {
      // normalize URLs
-      thisStep.URLTargets = thisStep.URLTargets
-        .map(url => normalizeUrl(url))
+      thisStep.URLTargets = (thisStep.URLTargets as number[])
+        .map(idx => normalizeUrl(urlList[idx - 1]))
        .filter(url => url && !visitedURLs.includes(url)) as string[];

      thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url!)])].slice(0, MAX_URLS_PER_STEP);
@ -892,21 +912,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
    await sleep(STEP_SLEEP);
  }

-  await storeContext(system, schema, {
-    allContext,
-    allKeywords,
-    allQuestions,
-    allKnowledge,
-    weightedURLs,
-    msgWithKnowledge
-  }, totalStep);
-
  if (!(thisStep as AnswerAction).isFinal) {
    console.log('Enter Beast mode!!!')
    // any answer is better than no answer, humanity last resort
    step++;
    totalStep++;
-    system = getPrompt(
+    const { system } = getPrompt(
      diaryContext,
      allQuestions,
      allKeywords,
@ -963,15 +974,6 @@ But unfortunately, you failed to solve the issue. You need to think out of the b

  console.log(thisStep)

-  await storeContext(system, schema, {
-    allContext,
-    allKeywords,
-    allQuestions,
-    allKnowledge,
-    weightedURLs,
-    msgWithKnowledge
-  }, totalStep);
-
  // max return 300 urls
  const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
  return {
--- a/src/types.ts
+++ b/src/types.ts
@ -51,7 +51,7 @@ export type ReflectAction = BaseAction & {

 export type VisitAction = BaseAction & {
  action: "visit";
-  URLTargets: string[];
+  URLTargets: number[] | string[];
 };

 export type CodingAction = BaseAction & {
--- a/src/utils/schemas.ts
+++ b/src/utils/schemas.ts
@ -257,9 +257,9 @@ Ensure each reflection question:

    if (allowRead) {
      actionSchemas.visit = z.object({
-        URLTargets: z.array(z.string())
+        URLTargets: z.array(z.number())
          .max(MAX_URLS_PER_STEP)
-          .describe(`Required when action='visit'. Must be an array of URLs, choose up the most relevant ${MAX_URLS_PER_STEP} URLs to visit`)
+          .describe(`Required when action='visit'. Must be the index of the URL in from the original list of URLs. Maximum ${MAX_URLS_PER_STEP} URLs allowed.`)
      }).optional();
    }

--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@ -331,8 +331,8 @@ export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSni
  }
 }

-export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
-  if (!allURLs || allURLs.length === 0) return '';
+export const sortSelectURLs = (allURLs: BoostedSearchSnippet[], maxURLs = 70): any[] => {
+  if (!allURLs || allURLs.length === 0) return [];

  return (allURLs)
    .map(r => {
@ -345,9 +345,7 @@ export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 7
    })
    .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
    .sort((a, b) => (b.score || 0) - (a.score || 0))
-    .slice(0, maxURLs)
-    .map(item => `  + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
-    .join('\n');
+    .slice(0, maxURLs);
 }


@ -623,4 +621,91 @@ export function fixBadURLMdLinks(mdContent: string, allURLs: Record<string, Sear
      return match;
    }
  });
+}
+
+export function extractUrlsWithDescription(text: string, contextWindowSize: number = 50): SearchSnippet[] {
+  // Scans `text` for http:// or https:// URLs and returns one SearchSnippet per match, described by the text surrounding it.
+  // The pattern itself can still capture a trailing '.', ':', '?' or ')' (all are in its path character class); the loop below strips one such character.
+  const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
+
+  // Collect every match together with its start index and (possibly trimmed) length
+  const matches: Array<{url: string, index: number, length: number}> = [];
+  let match: RegExpExecArray | null;
+
+  while ((match = urlPattern.exec(text)) !== null) {
+    let url = match[0];
+    let length = url.length;
+
+    // Strip a single trailing punctuation character; at most one character is removed per URL
+    if (/[.,;:!?)]$/.test(url)) {
+      url = url.substring(0, url.length - 1);
+      length = url.length;
+      // Rewind lastIndex to the end of the trimmed URL so the next exec() resumes at the stripped character
+      urlPattern.lastIndex = match.index + length;
+    }
+
+    matches.push({
+      url,
+      index: match.index,
+      length
+    });
+  }
+
+  // If no URLs found, return empty array
+  if (matches.length === 0) {
+    return [];
+  }
+
+  // Build a description for each URL from the text around it
+  const results: SearchSnippet[] = [];
+
+  for (let i = 0; i < matches.length; i++) {
+    const { url, index, length } = matches[i];
+
+    // Context window: up to contextWindowSize characters on each side, clamped to the bounds of text
+    let startPos = Math.max(0, index - contextWindowSize);
+    let endPos = Math.min(text.length, index + length + contextWindowSize);
+
+    // Shrink the window so it never includes any part of the previous or next URL
+    if (i > 0) {
+      const prevUrl = matches[i-1];
+      if (startPos < prevUrl.index + prevUrl.length) {
+        startPos = prevUrl.index + prevUrl.length;
+      }
+    }
+
+    if (i < matches.length - 1) {
+      const nextUrl = matches[i+1];
+      if (endPos > nextUrl.index) {
+        endPos = nextUrl.index;
+      }
+    }
+
+    // Slice the text immediately before and immediately after the URL
+    const beforeText = text.substring(startPos, index);
+    const afterText = text.substring(index + length, endPos);
+
+    // Join both sides with ' ... '; fall back to whichever side is non-empty, or to a fixed placeholder
+    let description = '';
+    if (beforeText && afterText) {
+      description = `${beforeText.trim()} ... ${afterText.trim()}`;
+    } else if (beforeText) {
+      description = beforeText.trim();
+    } else if (afterText) {
+      description = afterText.trim();
+    } else {
+      description = 'No context available';
+    }
+
+    // Collapse every run of whitespace (including newlines) into a single space
+    description = description.replace(/\s+/g, ' ').trim();
+
+    results.push({
+      url,
+      description,
+      title: '' // Maintaining the title field as required by SearchSnippet interface
+    });
+  }
+
+  return results;
 }