mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
refactor: pull url out
This commit is contained in:
117
src/agent.ts
117
src/agent.ts
@@ -1,7 +1,6 @@
|
|||||||
import {ZodObject} from 'zod';
|
import {ZodObject} from 'zod';
|
||||||
import {CoreMessage} from 'ai';
|
import {CoreMessage} from 'ai';
|
||||||
import {SEARCH_PROVIDER, STEP_SLEEP} from "./config";
|
import {SEARCH_PROVIDER, STEP_SLEEP} from "./config";
|
||||||
import {readUrl} from "./tools/read";
|
|
||||||
import fs from 'fs/promises';
|
import fs from 'fs/promises';
|
||||||
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
|
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
|
||||||
import {braveSearch} from "./tools/brave-search";
|
import {braveSearch} from "./tools/brave-search";
|
||||||
@@ -33,12 +32,11 @@ import {
|
|||||||
countUrlParts,
|
countUrlParts,
|
||||||
removeBFromA,
|
removeBFromA,
|
||||||
normalizeUrl, sampleMultinomial,
|
normalizeUrl, sampleMultinomial,
|
||||||
weightedURLToString, getLastModified, keepKPerHostname
|
weightedURLToString, getLastModified, keepKPerHostname, processURLs
|
||||||
} from "./utils/url-tools";
|
} from "./utils/url-tools";
|
||||||
import {
|
import {
|
||||||
buildMdFromAnswer,
|
buildMdFromAnswer,
|
||||||
chooseK,
|
chooseK,
|
||||||
removeAllLineBreaks,
|
|
||||||
removeExtraLineBreaks,
|
removeExtraLineBreaks,
|
||||||
removeHTMLtags
|
removeHTMLtags
|
||||||
} from "./utils/text-tools";
|
} from "./utils/text-tools";
|
||||||
@@ -430,56 +428,14 @@ export async function getResponse(question?: string,
|
|||||||
if (thisStep.references.length > 0) {
|
if (thisStep.references.length > 0) {
|
||||||
const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
|
const urls = thisStep.references?.filter(ref => !visitedURLs.includes(ref.url)).map(ref => ref.url) || [];
|
||||||
const uniqueNewURLs = [...new Set(urls)];
|
const uniqueNewURLs = [...new Set(urls)];
|
||||||
if (uniqueNewURLs.length > 0) {
|
await processURLs(
|
||||||
context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueNewURLs.join(', ')});
|
uniqueNewURLs,
|
||||||
const urlResults = await Promise.all(
|
context,
|
||||||
uniqueNewURLs.map(async url => {
|
allKnowledge,
|
||||||
try {
|
allURLs,
|
||||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
visitedURLs,
|
||||||
const {data} = response;
|
SchemaGen.languageCode
|
||||||
const guessedTime = await getLastModified(url);
|
);
|
||||||
console.log('Guessed time for', url, guessedTime)
|
|
||||||
|
|
||||||
// Early return if no valid data
|
|
||||||
if (!data?.url || !data?.content) {
|
|
||||||
throw new Error('No content found');
|
|
||||||
}
|
|
||||||
|
|
||||||
allKnowledge.push({
|
|
||||||
question: `What do expert say about "${data.title}"?`,
|
|
||||||
answer: removeAllLineBreaks(data.content),
|
|
||||||
references: [data.url],
|
|
||||||
type: 'url',
|
|
||||||
updated: guessedTime
|
|
||||||
});
|
|
||||||
|
|
||||||
data.links?.forEach(link => {
|
|
||||||
const r: SearchSnippet = {
|
|
||||||
title: link[0],
|
|
||||||
url: normalizeUrl(link[1]),
|
|
||||||
description: link[0],
|
|
||||||
}
|
|
||||||
// in-page link has lower initial weight comparing to search links
|
|
||||||
if (r.url && r.url.startsWith('http')) {
|
|
||||||
addToAllURLs(r, allURLs, 0.1);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
return {url, result: response};
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error reading URL:', error);
|
|
||||||
return null;
|
|
||||||
} finally {
|
|
||||||
visitedURLs.push(url);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
).then(results => results.filter(Boolean));
|
|
||||||
|
|
||||||
const success = urlResults.length > 0;
|
|
||||||
if (success) {
|
|
||||||
// knowledge updated
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
updateContext({
|
updateContext({
|
||||||
@@ -644,12 +600,10 @@ But then you realized you have asked them before. You decided to to think out of
|
|||||||
const topHosts = Object.entries(countUrlParts(
|
const topHosts = Object.entries(countUrlParts(
|
||||||
Object.entries(allURLs).map(([, result]) => result)
|
Object.entries(allURLs).map(([, result]) => result)
|
||||||
).hostnameCount).sort((a, b) => b[1] - a[1]);
|
).hostnameCount).sort((a, b) => b[1] - a[1]);
|
||||||
console.log(topHosts)
|
|
||||||
if (topHosts.length > 0 && Math.random() < 0.2 && !query.q.includes('site:')) {
|
if (topHosts.length > 0 && Math.random() < 0.2 && !query.q.includes('site:')) {
|
||||||
// explore-exploit
|
// explore-exploit
|
||||||
siteQuery = query.q + ' site:' + sampleMultinomial(topHosts);
|
siteQuery = query.q + ' site:' + sampleMultinomial(topHosts);
|
||||||
query.q = siteQuery;
|
query.q = siteQuery;
|
||||||
console.log('Site query:', siteQuery)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('Search query:', query);
|
console.log('Search query:', query);
|
||||||
@@ -741,52 +695,15 @@ You decided to think out of the box or cut from a completely different angle.
|
|||||||
console.log(uniqueURLs)
|
console.log(uniqueURLs)
|
||||||
|
|
||||||
if (uniqueURLs.length > 0) {
|
if (uniqueURLs.length > 0) {
|
||||||
context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueURLs.join(', ')});
|
const {urlResults, success} = await processURLs(
|
||||||
|
uniqueURLs,
|
||||||
|
context,
|
||||||
|
allKnowledge,
|
||||||
|
allURLs,
|
||||||
|
visitedURLs,
|
||||||
|
SchemaGen.languageCode
|
||||||
|
);
|
||||||
|
|
||||||
const urlResults = await Promise.all(
|
|
||||||
uniqueURLs.map(async url => {
|
|
||||||
try {
|
|
||||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
|
||||||
const {data} = response;
|
|
||||||
const guessedTime = await getLastModified(url);
|
|
||||||
console.log('Guessed time for', url, guessedTime)
|
|
||||||
|
|
||||||
// Early return if no valid data
|
|
||||||
if (!data?.url || !data?.content) {
|
|
||||||
throw new Error('No content found');
|
|
||||||
}
|
|
||||||
|
|
||||||
allKnowledge.push({
|
|
||||||
question: `What do expert say about "${data.title}"?`,
|
|
||||||
answer: removeAllLineBreaks(data.content),
|
|
||||||
references: [data.url],
|
|
||||||
type: 'url',
|
|
||||||
updated: guessedTime
|
|
||||||
});
|
|
||||||
|
|
||||||
data.links?.forEach(link => {
|
|
||||||
const r: SearchSnippet = {
|
|
||||||
title: link[0],
|
|
||||||
url: normalizeUrl(link[1]),
|
|
||||||
description: link[0],
|
|
||||||
}
|
|
||||||
// in-page link has lower initial weight comparing to search links
|
|
||||||
if (r.url && r.url.startsWith('http')) {
|
|
||||||
addToAllURLs(r, allURLs, 0.1);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
return {url, result: response};
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error reading URL:', error);
|
|
||||||
return null;
|
|
||||||
} finally {
|
|
||||||
visitedURLs.push(url);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
).then(results => results.filter(Boolean));
|
|
||||||
|
|
||||||
const success = urlResults.length > 0;
|
|
||||||
diaryContext.push(success
|
diaryContext.push(success
|
||||||
? `At step ${step}, you took the **visit** action and deep dive into the following URLs:
|
? `At step ${step}, you took the **visit** action and deep dive into the following URLs:
|
||||||
${urlResults.map(r => r?.url).join('\n')}
|
${urlResults.map(r => r?.url).join('\n')}
|
||||||
|
|||||||
@@ -684,31 +684,3 @@ export async function evaluateAnswer(
|
|||||||
return result?.object as EvaluationResponse;
|
return result?.object as EvaluationResponse;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to fetch and combine source content
|
|
||||||
async function fetchSourceContent(urls: string[], trackers: TrackerContext, schemaGen: Schemas): Promise<string> {
|
|
||||||
if (!urls.length) return '';
|
|
||||||
trackers.actionTracker.trackThink('read_for_verify', schemaGen.languageCode);
|
|
||||||
try {
|
|
||||||
const results = await Promise.all(
|
|
||||||
urls.map(async (url) => {
|
|
||||||
try {
|
|
||||||
const {response} = await readUrl(url, false, trackers.tokenTracker);
|
|
||||||
const content = response?.data?.content || '';
|
|
||||||
return removeAllLineBreaks(content);
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error reading URL:', error);
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
})
|
|
||||||
);
|
|
||||||
|
|
||||||
// Filter out empty results and join with proper separation
|
|
||||||
return results
|
|
||||||
.filter(content => content.trim())
|
|
||||||
.join('\n\n');
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error fetching source content:', error);
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import {BoostedSearchSnippet, SearchResult, SearchSnippet, TrackerContext} from "../types";
|
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types";
|
||||||
import {smartMergeStrings} from "./text-tools";
|
import {removeAllLineBreaks, smartMergeStrings} from "./text-tools";
|
||||||
import {rerankDocuments} from "../tools/jina-rerank";
|
import {rerankDocuments} from "../tools/jina-rerank";
|
||||||
|
import {readUrl} from "../tools/read";
|
||||||
|
|
||||||
export function normalizeUrl(urlString: string, debug = false, options = {
|
export function normalizeUrl(urlString: string, debug = false, options = {
|
||||||
removeAnchors: true,
|
removeAnchors: true,
|
||||||
@@ -381,4 +382,72 @@ export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) =>
|
|||||||
});
|
});
|
||||||
|
|
||||||
return filteredResults;
|
return filteredResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function processURLs(
|
||||||
|
urls: string[],
|
||||||
|
context: TrackerContext,
|
||||||
|
allKnowledge: KnowledgeItem[],
|
||||||
|
allURLs: Record<string, SearchSnippet>,
|
||||||
|
visitedURLs: string[],
|
||||||
|
languageCode: string
|
||||||
|
): Promise<{urlResults: any[], success: boolean}> {
|
||||||
|
// Skip if no URLs to process
|
||||||
|
if (urls.length === 0) {
|
||||||
|
return { urlResults: [], success: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track the reading action
|
||||||
|
context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')});
|
||||||
|
|
||||||
|
// Process each URL in parallel
|
||||||
|
const urlResults = await Promise.all(
|
||||||
|
urls.map(async url => {
|
||||||
|
try {
|
||||||
|
const {response} = await readUrl(url, true, context.tokenTracker);
|
||||||
|
const {data} = response;
|
||||||
|
const guessedTime = await getLastModified(url);
|
||||||
|
console.log('Guessed time for', url, guessedTime);
|
||||||
|
|
||||||
|
// Early return if no valid data
|
||||||
|
if (!data?.url || !data?.content) {
|
||||||
|
throw new Error('No content found');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add to knowledge base
|
||||||
|
allKnowledge.push({
|
||||||
|
question: `What do expert say about "${data.title}"?`,
|
||||||
|
answer: removeAllLineBreaks(data.content),
|
||||||
|
references: [data.url],
|
||||||
|
type: 'url',
|
||||||
|
updated: guessedTime
|
||||||
|
});
|
||||||
|
|
||||||
|
// Process page links
|
||||||
|
data.links?.forEach(link => {
|
||||||
|
const r: SearchSnippet = {
|
||||||
|
title: link[0],
|
||||||
|
url: normalizeUrl(link[1]),
|
||||||
|
description: link[0],
|
||||||
|
}
|
||||||
|
// in-page link has lower initial weight comparing to search links
|
||||||
|
if (r.url && r.url.startsWith('http')) {
|
||||||
|
addToAllURLs(r, allURLs, 0.1);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return {url, result: response};
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error reading URL:', error);
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
visitedURLs.push(url);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
).then(results => results.filter(Boolean));
|
||||||
|
|
||||||
|
return {
|
||||||
|
urlResults,
|
||||||
|
success: urlResults.length > 0
|
||||||
|
};
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user