fix: emit url idx in visit action

This commit is contained in:
Han Xiao 2025-03-27 15:29:51 +08:00
parent 98d83e84bb
commit 7bd4f51f42
4 changed files with 128 additions and 41 deletions

View File

@ -30,7 +30,7 @@ import {
rankURLs,
filterURLs,
normalizeUrl,
weightedURLToString, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks
sortSelectURLs, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks, extractUrlsWithDescription
} from "./utils/url-tools";
import {
buildMdFromAnswer,
@ -111,7 +111,7 @@ function getPrompt(
knowledge?: KnowledgeItem[],
allURLs?: BoostedSearchSnippet[],
beastMode?: boolean,
): string {
): { system: string, urlList?: string[]} {
const sections: string[] = [];
const actionSections: string[] = [];
@ -136,19 +136,20 @@ ${context.join('\n')}
// Build actions section
if (allowRead) {
const urlList = weightedURLToString(allURLs || [], 20);
const urlList = sortSelectURLs(allURLs || [], 20);
if (allowRead && urlList.length > 0) {
const urlListStr = urlList
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
.join('\n')
actionSections.push(`
<action-visit>
- Crawl and read full content from URLs, you can get the fulltext, last updated datetime etc of any URL.
- Must check URLs mentioned in <question> if any
${urlList ? `
- Must check URLs mentioned in <question> if any
- Choose and visit relevant URLs below for more knowledge. higher weight suggests more relevant:
<url-list>
${urlList}
${urlListStr}
</url-list>
`.trim() : ''}
</action-visit>
`);
}
@ -228,7 +229,10 @@ ${actionSections.join('\n\n')}
// Add footer
sections.push(`Think step by step, choose the action, then respond by matching the schema of that action.`);
return removeExtraLineBreaks(sections.join('\n\n'));
return {
system: removeExtraLineBreaks(sections.join('\n\n')),
urlList: urlList.map(u => u.url)
};
}
@ -421,7 +425,6 @@ export async function getResponse(question?: string,
let allowRead = true;
let allowReflect = true;
let allowCoding = false;
let system = '';
let msgWithKnowledge: CoreMessage[] = [];
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
@ -433,6 +436,23 @@ export async function getResponse(question?: string,
const regularBudget = tokenBudget * 0.85;
const finalAnswerPIP: string[] = [];
let trivialQuestion = false;
// add all mentioned URLs in messages to allURLs
messages.forEach(m => {
let strMsg = '';
if (typeof m.content === 'string') {
strMsg = m.content.trim();
} else if (typeof m.content === 'object' && Array.isArray( m.content)) {
// find the very last sub content whose 'type' is 'text' and use 'text' as the question
strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
}
extractUrlsWithDescription(strMsg).forEach(u => {
addToAllURLs(u, allURLs);
});
})
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
// add 1s delay to avoid rate limiting
step++;
@ -486,7 +506,7 @@ export async function getResponse(question?: string,
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
// generate prompt for this step
system = getPrompt(
const { system, urlList} = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@ -792,10 +812,10 @@ You decided to think out of the box or cut from a completely different angle.
});
}
allowSearch = false;
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length && urlList?.length) {
// normalize URLs
thisStep.URLTargets = thisStep.URLTargets
.map(url => normalizeUrl(url))
thisStep.URLTargets = (thisStep.URLTargets as number[])
.map(idx => normalizeUrl(urlList[idx - 1]))
.filter(url => url && !visitedURLs.includes(url)) as string[];
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url!)])].slice(0, MAX_URLS_PER_STEP);
@ -892,21 +912,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
await sleep(STEP_SLEEP);
}
await storeContext(system, schema, {
allContext,
allKeywords,
allQuestions,
allKnowledge,
weightedURLs,
msgWithKnowledge
}, totalStep);
if (!(thisStep as AnswerAction).isFinal) {
console.log('Enter Beast mode!!!')
// any answer is better than no answer, humanity last resort
step++;
totalStep++;
system = getPrompt(
const { system } = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@ -963,15 +974,6 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
console.log(thisStep)
await storeContext(system, schema, {
allContext,
allKeywords,
allQuestions,
allKnowledge,
weightedURLs,
msgWithKnowledge
}, totalStep);
// max return 300 urls
const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
return {

View File

@ -51,7 +51,7 @@ export type ReflectAction = BaseAction & {
export type VisitAction = BaseAction & {
action: "visit";
URLTargets: string[];
URLTargets: number[] | string[];
};
export type CodingAction = BaseAction & {

View File

@ -257,9 +257,9 @@ Ensure each reflection question:
if (allowRead) {
actionSchemas.visit = z.object({
URLTargets: z.array(z.string())
URLTargets: z.array(z.number())
.max(MAX_URLS_PER_STEP)
.describe(`Required when action='visit'. Must be an array of URLs, choose up the most relevant ${MAX_URLS_PER_STEP} URLs to visit`)
.describe(`Required when action='visit'. Must be the index of the URL in from the original list of URLs. Maximum ${MAX_URLS_PER_STEP} URLs allowed.`)
}).optional();
}

View File

@ -331,8 +331,8 @@ export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSni
}
}
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
if (!allURLs || allURLs.length === 0) return '';
export const sortSelectURLs = (allURLs: BoostedSearchSnippet[], maxURLs = 70): any[] => {
if (!allURLs || allURLs.length === 0) return [];
return (allURLs)
.map(r => {
@ -345,9 +345,7 @@ export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 7
})
.filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
.sort((a, b) => (b.score || 0) - (a.score || 0))
.slice(0, maxURLs)
.map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
.join('\n');
.slice(0, maxURLs);
}
@ -623,4 +621,91 @@ export function fixBadURLMdLinks(mdContent: string, allURLs: Record<string, Sear
return match;
}
});
}
/**
 * Finds every http(s) URL in `text` and pairs it with a short description
 * built from the text immediately surrounding it.
 *
 * Trailing sentence punctuation (`.`, `,`, `;`, `:`, `!`, `?`) is removed from
 * each match, repeatedly, so "…/path)." is fully cleaned. A trailing `)` is
 * removed only when it has no matching `(` inside the URL, so links such as
 * `https://en.wikipedia.org/wiki/Rust_(programming_language)` stay intact
 * while the `)` closing a markdown link or a parenthetical is dropped.
 *
 * @param text - Free-form text to scan.
 * @param contextWindowSize - Maximum number of characters taken on each side
 *   of a URL to build its description. Negative values are treated as 0. The
 *   window never extends into a neighbouring URL.
 * @returns One snippet per URL occurrence, in order of appearance (duplicates
 *   are not merged). `title` is always the empty string; `description` is
 *   `'No context available'` when there is no non-whitespace text around the URL.
 */
export function extractUrlsWithDescription(text: string, contextWindowSize: number = 50): SearchSnippet[] {
  // Matches URLs starting with http:// or https://; works inside multilingual text
  // because only ASCII URL characters are accepted.
  const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
  const trailingPunctuation = /[.,;:!?)]$/;
  const countChar = (s: string, ch: string): number => s.split(ch).length - 1;
  // A negative window would make substring() swap its arguments and read the wrong side.
  const windowSize = Math.max(0, contextWindowSize);

  // Pass 1: locate all URLs and clean them up.
  const matches: Array<{url: string, index: number, length: number}> = [];
  let match: RegExpExecArray | null;
  while ((match = urlPattern.exec(text)) !== null) {
    let url = match[0];

    while (url.length > 0 && trailingPunctuation.test(url)) {
      // Keep a closing parenthesis that is balanced by an opening one inside the URL.
      if (url.endsWith(')') && countChar(url, ')') <= countChar(url, '(')) {
        break;
      }
      url = url.slice(0, -1);
    }

    // Resume scanning right after the cleaned URL so the stripped characters
    // are treated as ordinary text (they become part of the context).
    urlPattern.lastIndex = match.index + url.length;

    matches.push({
      url,
      index: match.index,
      length: url.length
    });
  }

  // Pass 2: build a description for each URL from its surrounding text.
  const results: SearchSnippet[] = [];
  for (let i = 0; i < matches.length; i++) {
    const { url, index, length } = matches[i];

    let startPos = Math.max(0, index - windowSize);
    let endPos = Math.min(text.length, index + length + windowSize);

    // Clamp the window so it never overlaps the previous or next URL.
    if (i > 0) {
      const prevEnd = matches[i - 1].index + matches[i - 1].length;
      if (startPos < prevEnd) {
        startPos = prevEnd;
      }
    }
    if (i < matches.length - 1) {
      const nextStart = matches[i + 1].index;
      if (endPos > nextStart) {
        endPos = nextStart;
      }
    }

    // Normalise whitespace BEFORE deciding which parts exist, so whitespace-only
    // context falls through to the fallback instead of producing "..." or "".
    const before = text.substring(startPos, index).replace(/\s+/g, ' ').trim();
    const after = text.substring(index + length, endPos).replace(/\s+/g, ' ').trim();

    let description: string;
    if (before && after) {
      description = `${before} ... ${after}`;
    } else {
      description = before || after || 'No context available';
    }

    results.push({
      url,
      description,
      title: '' // Maintaining the title field as required by SearchSnippet interface
    });
  }

  return results;
}