diff --git a/src/agent.ts b/src/agent.ts
index 3f95221..7f1cac3 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -30,7 +30,7 @@ import {
rankURLs,
filterURLs,
normalizeUrl,
- weightedURLToString, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks
+ sortSelectURLs, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks, extractUrlsWithDescription
} from "./utils/url-tools";
import {
buildMdFromAnswer,
@@ -111,7 +111,7 @@ function getPrompt(
knowledge?: KnowledgeItem[],
allURLs?: BoostedSearchSnippet[],
beastMode?: boolean,
-): string {
+): { system: string, urlList?: string[]} {
const sections: string[] = [];
const actionSections: string[] = [];
@@ -136,19 +136,20 @@ ${context.join('\n')}
// Build actions section
- if (allowRead) {
- const urlList = weightedURLToString(allURLs || [], 20);
+ const urlList = sortSelectURLs(allURLs || [], 20);
+ if (allowRead && urlList.length > 0) {
+ const urlListStr = urlList
+ .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
+ .join('\n')
actionSections.push(`
- Crawl and read full content from URLs, you can get the fulltext, last updated datetime etc of any URL.
-- Must check URLs mentioned in if any
-${urlList ? `
+- Must check URLs mentioned in if any
- Choose and visit relevant URLs below for more knowledge. higher weight suggests more relevant:
-${urlList}
+${urlListStr}
-`.trim() : ''}
`);
}
@@ -228,7 +229,10 @@ ${actionSections.join('\n\n')}
// Add footer
sections.push(`Think step by step, choose the action, then respond by matching the schema of that action.`);
- return removeExtraLineBreaks(sections.join('\n\n'));
+ return {
+ system: removeExtraLineBreaks(sections.join('\n\n')),
+ urlList: urlList.map(u => u.url)
+};
}
@@ -421,7 +425,6 @@ export async function getResponse(question?: string,
let allowRead = true;
let allowReflect = true;
let allowCoding = false;
- let system = '';
let msgWithKnowledge: CoreMessage[] = [];
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
@@ -433,6 +436,23 @@ export async function getResponse(question?: string,
const regularBudget = tokenBudget * 0.85;
const finalAnswerPIP: string[] = [];
let trivialQuestion = false;
+
+ // add all mentioned URLs in messages to allURLs
+ messages.forEach(m => {
+ let strMsg = '';
+ if (typeof m.content === 'string') {
+ strMsg = m.content.trim();
+ } else if (typeof m.content === 'object' && Array.isArray( m.content)) {
+ // collect every sub content whose 'type' is 'text' and join their 'text' values with newlines
+ strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
+ }
+
+ extractUrlsWithDescription(strMsg).forEach(u => {
+ addToAllURLs(u, allURLs);
+ });
+ })
+
+
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
// add 1s delay to avoid rate limiting
step++;
@@ -486,7 +506,7 @@ export async function getResponse(question?: string,
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
// generate prompt for this step
- system = getPrompt(
+ const { system, urlList} = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@@ -792,10 +812,10 @@ You decided to think out of the box or cut from a completely different angle.
});
}
allowSearch = false;
- } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
+ } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length && urlList?.length) {
// normalize URLs
- thisStep.URLTargets = thisStep.URLTargets
- .map(url => normalizeUrl(url))
+ thisStep.URLTargets = (thisStep.URLTargets as number[])
+ .map(idx => normalizeUrl(urlList[idx - 1]))
.filter(url => url && !visitedURLs.includes(url)) as string[];
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url!)])].slice(0, MAX_URLS_PER_STEP);
@@ -892,21 +912,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
await sleep(STEP_SLEEP);
}
- await storeContext(system, schema, {
- allContext,
- allKeywords,
- allQuestions,
- allKnowledge,
- weightedURLs,
- msgWithKnowledge
- }, totalStep);
-
if (!(thisStep as AnswerAction).isFinal) {
console.log('Enter Beast mode!!!')
// any answer is better than no answer, humanity last resort
step++;
totalStep++;
- system = getPrompt(
+ const { system } = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@@ -963,15 +974,6 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
console.log(thisStep)
- await storeContext(system, schema, {
- allContext,
- allKeywords,
- allQuestions,
- allKnowledge,
- weightedURLs,
- msgWithKnowledge
- }, totalStep);
-
// max return 300 urls
const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
return {
diff --git a/src/types.ts b/src/types.ts
index 6f958a3..afd52e8 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -51,7 +51,7 @@ export type ReflectAction = BaseAction & {
export type VisitAction = BaseAction & {
action: "visit";
- URLTargets: string[];
+ URLTargets: number[] | string[];
};
export type CodingAction = BaseAction & {
diff --git a/src/utils/schemas.ts b/src/utils/schemas.ts
index a83f827..6fcb0e1 100644
--- a/src/utils/schemas.ts
+++ b/src/utils/schemas.ts
@@ -257,9 +257,9 @@ Ensure each reflection question:
if (allowRead) {
actionSchemas.visit = z.object({
- URLTargets: z.array(z.string())
+ URLTargets: z.array(z.number())
.max(MAX_URLS_PER_STEP)
- .describe(`Required when action='visit'. Must be an array of URLs, choose up the most relevant ${MAX_URLS_PER_STEP} URLs to visit`)
+ .describe(`Required when action='visit'. Must be the index of the URL in from the original list of URLs. Maximum ${MAX_URLS_PER_STEP} URLs allowed.`)
}).optional();
}
diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts
index fccafba..8a836ec 100644
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@@ -331,8 +331,8 @@ export const addToAllURLs = (r: SearchSnippet, allURLs: Record {
- if (!allURLs || allURLs.length === 0) return '';
+export const sortSelectURLs = (allURLs: BoostedSearchSnippet[], maxURLs = 70): any[] => {
+ if (!allURLs || allURLs.length === 0) return [];
return (allURLs)
.map(r => {
@@ -345,9 +345,7 @@ export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 7
})
.filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
.sort((a, b) => (b.score || 0) - (a.score || 0))
- .slice(0, maxURLs)
- .map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
- .join('\n');
+ .slice(0, maxURLs);
}
@@ -623,4 +621,91 @@ export function fixBadURLMdLinks(mdContent: string, allURLs: Record = [];
+ let match: RegExpExecArray | null;
+
+ while ((match = urlPattern.exec(text)) !== null) {
+ let url = match[0];
+ let length = url.length;
+
+ // Clean trailing punctuation (period, comma, etc.)
+ if (/[.,;:!?)]$/.test(url)) {
+ url = url.substring(0, url.length - 1);
+ length = url.length;
+ // Set lastIndex to the end of the trimmed URL so the next exec() resumes at the stripped punctuation character
+ urlPattern.lastIndex = match.index + length;
+ }
+
+ matches.push({
+ url,
+ index: match.index,
+ length
+ });
+ }
+
+ // If no URLs found, return empty array
+ if (matches.length === 0) {
+ return [];
+ }
+
+ // Extract context for each URL
+ const results: SearchSnippet[] = [];
+
+ for (let i = 0; i < matches.length; i++) {
+ const { url, index, length } = matches[i];
+
+ // Calculate boundaries for context
+ let startPos = Math.max(0, index - contextWindowSize);
+ let endPos = Math.min(text.length, index + length + contextWindowSize);
+
+ // Adjust boundaries to avoid overlapping with other URLs
+ if (i > 0) {
+ const prevUrl = matches[i-1];
+ if (startPos < prevUrl.index + prevUrl.length) {
+ startPos = prevUrl.index + prevUrl.length;
+ }
+ }
+
+ if (i < matches.length - 1) {
+ const nextUrl = matches[i+1];
+ if (endPos > nextUrl.index) {
+ endPos = nextUrl.index;
+ }
+ }
+
+ // Extract context
+ const beforeText = text.substring(startPos, index);
+ const afterText = text.substring(index + length, endPos);
+
+ // Combine into description
+ let description = '';
+ if (beforeText && afterText) {
+ description = `${beforeText.trim()} ... ${afterText.trim()}`;
+ } else if (beforeText) {
+ description = beforeText.trim();
+ } else if (afterText) {
+ description = afterText.trim();
+ } else {
+ description = 'No context available';
+ }
+
+ // Clean up description
+ description = description.replace(/\s+/g, ' ').trim();
+
+ results.push({
+ url,
+ description,
+ title: '' // Maintaining the title field as required by SearchSnippet interface
+ });
+ }
+
+ return results;
}
\ No newline at end of file