mirror of https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00

fix: emit url idx in visit action

This commit is contained in:
parent 98d83e84bb
commit 7bd4f51f42

src/agent.ts (68 lines changed)
@@ -30,7 +30,7 @@ import {
   rankURLs,
   filterURLs,
   normalizeUrl,
-  weightedURLToString, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks
+  sortSelectURLs, getLastModified, keepKPerHostname, processURLs, fixBadURLMdLinks, extractUrlsWithDescription
 } from "./utils/url-tools";
 import {
   buildMdFromAnswer,
@@ -111,7 +111,7 @@ function getPrompt(
   knowledge?: KnowledgeItem[],
   allURLs?: BoostedSearchSnippet[],
   beastMode?: boolean,
-): string {
+): { system: string, urlList?: string[]} {
   const sections: string[] = [];
   const actionSections: string[] = [];
 
@@ -136,19 +136,20 @@ ${context.join('\n')}
 
   // Build actions section
 
-  if (allowRead) {
-    const urlList = weightedURLToString(allURLs || [], 20);
+  const urlList = sortSelectURLs(allURLs || [], 20);
+  if (allowRead && urlList.length > 0) {
+    const urlListStr = urlList
+      .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
+      .join('\n')
 
     actionSections.push(`
 <action-visit>
 - Crawl and read full content from URLs, you can get the fulltext, last updated datetime etc of any URL.
+- Must check URLs mentioned in <question> if any
 ${urlList ? `
-- Must check URLs mentioned in <question> if any
 - Choose and visit relevant URLs below for more knowledge. higher weight suggests more relevant:
 <url-list>
-${urlList}
+${urlListStr}
 </url-list>
 `.trim() : ''}
 </action-visit>
 `);
   }
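A side note on the new prompt rendering: a minimal sketch (not part of the commit; the URLs, snippets and scores are made up) of the `<url-list>` text that the .map() above produces. This numbering is what gives each URL the idx the model is later asked to emit.

// Hypothetical items in the shape getPrompt consumes (url, merged snippet, score).
const urlList = [
  { url: 'https://example.com/a', merged: 'snippet about A', score: 0.92 },
  { url: 'https://example.com/b', merged: 'snippet about B', score: 0.41 },
];

const urlListStr = urlList
  .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
  .join('\n');

console.log(urlListStr);
//  - [idx=1] [weight=0.92] "https://example.com/a": "snippet about A"
//  - [idx=2] [weight=0.41] "https://example.com/b": "snippet about B"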
@@ -228,7 +229,10 @@ ${actionSections.join('\n\n')}
   // Add footer
   sections.push(`Think step by step, choose the action, then respond by matching the schema of that action.`);
 
-  return removeExtraLineBreaks(sections.join('\n\n'));
+  return {
+    system: removeExtraLineBreaks(sections.join('\n\n')),
+    urlList: urlList.map(u => u.url)
+  };
 }
 
 
@@ -421,7 +425,6 @@ export async function getResponse(question?: string,
   let allowRead = true;
   let allowReflect = true;
   let allowCoding = false;
-  let system = '';
   let msgWithKnowledge: CoreMessage[] = [];
   let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
 
@@ -433,6 +436,23 @@ export async function getResponse(question?: string,
   const regularBudget = tokenBudget * 0.85;
   const finalAnswerPIP: string[] = [];
   let trivialQuestion = false;
+
+  // add all mentioned URLs in messages to allURLs
+  messages.forEach(m => {
+    let strMsg = '';
+    if (typeof m.content === 'string') {
+      strMsg = m.content.trim();
+    } else if (typeof m.content === 'object' && Array.isArray( m.content)) {
+      // find the very last sub content whose 'type' is 'text' and use 'text' as the question
+      strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
+    }
+
+    extractUrlsWithDescription(strMsg).forEach(u => {
+      addToAllURLs(u, allURLs);
+    });
+  })
+
 
   while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget) {
     // add 1s delay to avoid rate limiting
     step++;
@@ -486,7 +506,7 @@ export async function getResponse(question?: string,
     allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
 
     // generate prompt for this step
-    system = getPrompt(
+    const { system, urlList} = getPrompt(
       diaryContext,
       allQuestions,
       allKeywords,
@@ -792,10 +812,10 @@ You decided to think out of the box or cut from a completely different angle.
         });
       }
       allowSearch = false;
-    } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
+    } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length && urlList?.length) {
       // normalize URLs
-      thisStep.URLTargets = thisStep.URLTargets
-        .map(url => normalizeUrl(url))
+      thisStep.URLTargets = (thisStep.URLTargets as number[])
+        .map(idx => normalizeUrl(urlList[idx - 1]))
         .filter(url => url && !visitedURLs.includes(url)) as string[];
 
       thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url!)])].slice(0, MAX_URLS_PER_STEP);
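A rough sketch of the index resolution above (not from the repo: normalizeUrl is a trivial stand-in and the inputs are invented). The model now answers with 1-based indices into the url-list it was shown; out-of-range indices resolve to an empty string and already-visited URLs drop out in the filter.

// Stand-in for the repo's normalizeUrl; the real one does far more than trim.
const normalizeUrl = (url?: string) => url?.trim() ?? '';

const urlList = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c'];
const visitedURLs = ['https://example.com/b'];

// The visit action now carries 1-based indices; 7 is out of range on purpose.
const URLTargets: number[] = [2, 3, 7];

const resolved = URLTargets
  .map(idx => normalizeUrl(urlList[idx - 1]))
  .filter(url => url && !visitedURLs.includes(url));

console.log(resolved); // ['https://example.com/c']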
@@ -892,21 +912,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
     await sleep(STEP_SLEEP);
   }
 
-  await storeContext(system, schema, {
-    allContext,
-    allKeywords,
-    allQuestions,
-    allKnowledge,
-    weightedURLs,
-    msgWithKnowledge
-  }, totalStep);
-
   if (!(thisStep as AnswerAction).isFinal) {
     console.log('Enter Beast mode!!!')
     // any answer is better than no answer, humanity last resort
     step++;
     totalStep++;
-    system = getPrompt(
+    const { system } = getPrompt(
       diaryContext,
       allQuestions,
       allKeywords,
@@ -963,15 +974,6 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
 
   console.log(thisStep)
 
-  await storeContext(system, schema, {
-    allContext,
-    allKeywords,
-    allQuestions,
-    allKnowledge,
-    weightedURLs,
-    msgWithKnowledge
-  }, totalStep);
-
   // max return 300 urls
   const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
   return {
@@ -51,7 +51,7 @@ export type ReflectAction = BaseAction & {
 
 export type VisitAction = BaseAction & {
   action: "visit";
-  URLTargets: string[];
+  URLTargets: number[] | string[];
 };
 
 export type CodingAction = BaseAction & {
@@ -257,9 +257,9 @@ Ensure each reflection question:
 
   if (allowRead) {
     actionSchemas.visit = z.object({
-      URLTargets: z.array(z.string())
+      URLTargets: z.array(z.number())
         .max(MAX_URLS_PER_STEP)
-        .describe(`Required when action='visit'. Must be an array of URLs, choose up the most relevant ${MAX_URLS_PER_STEP} URLs to visit`)
+        .describe(`Required when action='visit'. Must be the index of the URL in from the original list of URLs. Maximum ${MAX_URLS_PER_STEP} URLs allowed.`)
     }).optional();
   }
 
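For illustration, a self-contained sketch of the schema change (assumes zod is installed; MAX_URLS_PER_STEP is a placeholder value here, not the repo's constant): the visit action now validates an array of numbers, so a payload of URL strings no longer parses.

import { z } from 'zod';

const MAX_URLS_PER_STEP = 5; // placeholder for illustration only

const visitSchema = z.object({
  URLTargets: z.array(z.number())
    .max(MAX_URLS_PER_STEP)
    .describe('Indices into the url-list shown in the prompt'),
}).optional();

console.log(visitSchema.safeParse({ URLTargets: [1, 4] }).success);                  // true
console.log(visitSchema.safeParse({ URLTargets: ['https://example.com'] }).success); // false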
@@ -331,8 +331,8 @@ export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSni
   }
 }
 
-export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
-  if (!allURLs || allURLs.length === 0) return '';
+export const sortSelectURLs = (allURLs: BoostedSearchSnippet[], maxURLs = 70): any[] => {
+  if (!allURLs || allURLs.length === 0) return [];
 
   return (allURLs)
     .map(r => {
@@ -345,9 +345,7 @@ export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70
     })
     .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
     .sort((a, b) => (b.score || 0) - (a.score || 0))
-    .slice(0, maxURLs)
-    .map(item => ` + weight: ${item.score.toFixed(2)} "${item.url}": "${item.merged}"`)
-    .join('\n');
+    .slice(0, maxURLs);
 }
 
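A simplified stand-in (not the repo's implementation, which also merges per-URL fields) for the contract sortSelectURLs now fulfils: it returns the sorted, capped items themselves rather than a pre-rendered string, and that ordering is what both the prompt's idx numbering and the visit action's emitted indices refer to.

type ScoredSnippet = { url: string; merged: string; score: number };

// Hypothetical simplification: keep items with a snippet, sort by score, cap the list.
const sortSelectURLs = (allURLs: ScoredSnippet[], maxURLs = 70): ScoredSnippet[] => {
  if (!allURLs || allURLs.length === 0) return [];
  return allURLs
    .filter(item => item.merged)
    .sort((a, b) => (b.score || 0) - (a.score || 0))
    .slice(0, maxURLs);
};

const picked = sortSelectURLs([
  { url: 'https://example.com/low', merged: 'low', score: 0.1 },
  { url: 'https://example.com/high', merged: 'high', score: 0.9 },
], 1);

console.log(picked.map(p => p.url)); // ['https://example.com/high']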
@@ -623,4 +621,91 @@ export function fixBadURLMdLinks(mdContent: string, allURLs: Record<string, Sear
       return match;
     }
   });
 }
+
+export function extractUrlsWithDescription(text: string, contextWindowSize: number = 50): SearchSnippet[] {
+  // Using a more precise regex for URL detection that works with multilingual text
+  // This matches URLs starting with http:// or https:// but avoids capturing trailing punctuation
+  const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
+
+  // Find all matches
+  const matches: Array<{url: string, index: number, length: number}> = [];
+  let match: RegExpExecArray | null;
+
+  while ((match = urlPattern.exec(text)) !== null) {
+    let url = match[0];
+    let length = url.length;
+
+    // Clean trailing punctuation (period, comma, etc.)
+    if (/[.,;:!?)]$/.test(url)) {
+      url = url.substring(0, url.length - 1);
+      length = url.length;
+      // Adjust lastIndex to avoid infinite loop with zero-width matches
+      urlPattern.lastIndex = match.index + length;
+    }
+
+    matches.push({
+      url,
+      index: match.index,
+      length
+    });
+  }
+
+  // If no URLs found, return empty array
+  if (matches.length === 0) {
+    return [];
+  }
+
+  // Extract context for each URL
+  const results: SearchSnippet[] = [];
+
+  for (let i = 0; i < matches.length; i++) {
+    const { url, index, length } = matches[i];
+
+    // Calculate boundaries for context
+    let startPos = Math.max(0, index - contextWindowSize);
+    let endPos = Math.min(text.length, index + length + contextWindowSize);
+
+    // Adjust boundaries to avoid overlapping with other URLs
+    if (i > 0) {
+      const prevUrl = matches[i-1];
+      if (startPos < prevUrl.index + prevUrl.length) {
+        startPos = prevUrl.index + prevUrl.length;
+      }
+    }
+
+    if (i < matches.length - 1) {
+      const nextUrl = matches[i+1];
+      if (endPos > nextUrl.index) {
+        endPos = nextUrl.index;
+      }
+    }
+
+    // Extract context
+    const beforeText = text.substring(startPos, index);
+    const afterText = text.substring(index + length, endPos);
+
+    // Combine into description
+    let description = '';
+    if (beforeText && afterText) {
+      description = `${beforeText.trim()} ... ${afterText.trim()}`;
+    } else if (beforeText) {
+      description = beforeText.trim();
+    } else if (afterText) {
+      description = afterText.trim();
+    } else {
+      description = 'No context available';
+    }
+
+    // Clean up description
+    description = description.replace(/\s+/g, ' ').trim();
+
+    results.push({
+      url,
+      description,
+      title: '' // Maintaining the title field as required by SearchSnippet interface
+    });
+  }
+
+  return results;
+}
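A quick usage sketch for the new helper (made-up input; assumes the extractUrlsWithDescription defined above is in scope): each detected URL comes back as a SearchSnippet whose description is the prose surrounding the link, trimmed to the context window.

const snippets = extractUrlsWithDescription(
  'Background reading: https://example.com/guide, then continue with the setup.',
  30
);

console.log(snippets);
// [ {
//   url: 'https://example.com/guide',
//   description: 'Background reading: ... , then continue with the setup',
//   title: ''
// } ]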