diff --git a/src/agent.ts b/src/agent.ts index 3f95221..7f1cac3 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -30,7 +30,7 @@ import { rankURLs, filterURLs, normalizeUrl, - weightedURLToString, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks + sortSelectURLs, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks, extractUrlsWithDescription } from "./utils/url-tools"; import { buildMdFromAnswer, @@ -111,7 +111,7 @@ function getPrompt( knowledge?: KnowledgeItem[], allURLs?: BoostedSearchSnippet[], beastMode?: boolean, -): string { +): { system: string, urlList?: string[]} { const sections: string[] = []; const actionSections: string[] = []; @@ -136,19 +136,20 @@ ${context.join('\n')} // Build actions section - if (allowRead) { - const urlList = weightedURLToString(allURLs || [], 20); + const urlList = sortSelectURLs(allURLs || [], 20); + if (allowRead && urlList.length > 0) { + const urlListStr = urlList + .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`) + .join('\n') actionSections.push(` - Crawl and read full content from URLs, you can get the fulltext, last updated datetime etc of any URL. -- Must check URLs mentioned in if any -${urlList ? ` +- Must check URLs mentioned in if any - Choose and visit relevant URLs below for more knowledge. 
higher weight suggests more relevant: -${urlList} +${urlListStr} -`.trim() : ''} `); } @@ -228,7 +229,10 @@ ${actionSections.join('\n\n')} // Add footer sections.push(`Think step by step, choose the action, then respond by matching the schema of that action.`); - return removeExtraLineBreaks(sections.join('\n\n')); + return { + system: removeExtraLineBreaks(sections.join('\n\n')), + urlList: urlList.map(u => u.url) +}; } @@ -421,7 +425,6 @@ export async function getResponse(question?: string, let allowRead = true; let allowReflect = true; let allowCoding = false; - let system = ''; let msgWithKnowledge: CoreMessage[] = []; let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false}; @@ -433,6 +436,23 @@ export async function getResponse(question?: string, const regularBudget = tokenBudget * 0.85; const finalAnswerPIP: string[] = []; let trivialQuestion = false; + + // add all mentioned URLs in messages to allURLs + messages.forEach(m => { + let strMsg = ''; + if (typeof m.content === 'string') { + strMsg = m.content.trim(); + } else if (typeof m.content === 'object' && Array.isArray( m.content)) { + // find the very last sub content whose 'type' is 'text' and use 'text' as the question + strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim(); + } + + extractUrlsWithDescription(strMsg).forEach(u => { + addToAllURLs(u, allURLs); + }); + }) + + while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) { // add 1s delay to avoid rate limiting step++; @@ -486,7 +506,7 @@ export async function getResponse(question?: string, allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already // generate prompt for this step - system = getPrompt( + const { system, urlList} = getPrompt( diaryContext, allQuestions, allKeywords, @@ -792,10 +812,10 @@ You decided to think out of the box or cut from a completely different angle. 
}); } allowSearch = false; - } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) { + } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length && urlList?.length) { // normalize URLs - thisStep.URLTargets = thisStep.URLTargets - .map(url => normalizeUrl(url)) + thisStep.URLTargets = (thisStep.URLTargets as number[]) + .map(idx => normalizeUrl(urlList[idx - 1])) .filter(url => url && !visitedURLs.includes(url)) as string[]; thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url!)])].slice(0, MAX_URLS_PER_STEP); @@ -892,21 +912,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b await sleep(STEP_SLEEP); } - await storeContext(system, schema, { - allContext, - allKeywords, - allQuestions, - allKnowledge, - weightedURLs, - msgWithKnowledge - }, totalStep); - if (!(thisStep as AnswerAction).isFinal) { console.log('Enter Beast mode!!!') // any answer is better than no answer, humanity last resort step++; totalStep++; - system = getPrompt( + const { system } = getPrompt( diaryContext, allQuestions, allKeywords, @@ -963,15 +974,6 @@ But unfortunately, you failed to solve the issue. 
You need to think out of the b console.log(thisStep) - await storeContext(system, schema, { - allContext, - allKeywords, - allQuestions, - allKnowledge, - weightedURLs, - msgWithKnowledge - }, totalStep); - // max return 300 urls const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url); return { diff --git a/src/types.ts b/src/types.ts index 6f958a3..afd52e8 100644 --- a/src/types.ts +++ b/src/types.ts @@ -51,7 +51,7 @@ export type ReflectAction = BaseAction & { export type VisitAction = BaseAction & { action: "visit"; - URLTargets: string[]; + URLTargets: number[] | string[]; }; export type CodingAction = BaseAction & { diff --git a/src/utils/schemas.ts b/src/utils/schemas.ts index a83f827..6fcb0e1 100644 --- a/src/utils/schemas.ts +++ b/src/utils/schemas.ts @@ -257,9 +257,9 @@ Ensure each reflection question: if (allowRead) { actionSchemas.visit = z.object({ - URLTargets: z.array(z.string()) + URLTargets: z.array(z.number()) .max(MAX_URLS_PER_STEP) - .describe(`Required when action='visit'. Must be an array of URLs, choose up the most relevant ${MAX_URLS_PER_STEP} URLs to visit`) + .describe(`Required when action='visit'. Must be the index of the URL from the original list of URLs. 
Maximum ${MAX_URLS_PER_STEP} URLs allowed.`) }).optional(); } diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index fccafba..8a836ec 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -331,8 +331,8 @@ export const addToAllURLs = (r: SearchSnippet, allURLs: Record { - if (!allURLs || allURLs.length === 0) return ''; +export const sortSelectURLs = (allURLs: BoostedSearchSnippet[], maxURLs = 70): any[] => { + if (!allURLs || allURLs.length === 0) return []; return (allURLs) .map(r => { @@ -345,9 +345,7 @@ export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 7 }) .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null) .sort((a, b) => (b.score || 0) - (a.score || 0)) - .slice(0, maxURLs) - .map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`) - .join('\n'); + .slice(0, maxURLs); } @@ -623,4 +621,91 @@ export function fixBadURLMdLinks(mdContent: string, allURLs: Record = []; + let match: RegExpExecArray | null; + + while ((match = urlPattern.exec(text)) !== null) { + let url = match[0]; + let length = url.length; + + // Clean trailing punctuation (period, comma, etc.) 
+ if (/[.,;:!?)]$/.test(url)) { + url = url.substring(0, url.length - 1); + length = url.length; + // Adjust lastIndex to avoid infinite loop with zero-width matches + urlPattern.lastIndex = match.index + length; + } + + matches.push({ + url, + index: match.index, + length + }); + } + + // If no URLs found, return empty array + if (matches.length === 0) { + return []; + } + + // Extract context for each URL + const results: SearchSnippet[] = []; + + for (let i = 0; i < matches.length; i++) { + const { url, index, length } = matches[i]; + + // Calculate boundaries for context + let startPos = Math.max(0, index - contextWindowSize); + let endPos = Math.min(text.length, index + length + contextWindowSize); + + // Adjust boundaries to avoid overlapping with other URLs + if (i > 0) { + const prevUrl = matches[i-1]; + if (startPos < prevUrl.index + prevUrl.length) { + startPos = prevUrl.index + prevUrl.length; + } + } + + if (i < matches.length - 1) { + const nextUrl = matches[i+1]; + if (endPos > nextUrl.index) { + endPos = nextUrl.index; + } + } + + // Extract context + const beforeText = text.substring(startPos, index); + const afterText = text.substring(index + length, endPos); + + // Combine into description + let description = ''; + if (beforeText && afterText) { + description = `${beforeText.trim()} ... ${afterText.trim()}`; + } else if (beforeText) { + description = beforeText.trim(); + } else if (afterText) { + description = afterText.trim(); + } else { + description = 'No context available'; + } + + // Clean up description + description = description.replace(/\s+/g, ' ').trim(); + + results.push({ + url, + description, + title: '' // Maintaining the title field as required by SearchSnippet interface + }); + } + + return results; } \ No newline at end of file