mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
feat: add url ranking
This commit is contained in:
73
src/agent.ts
73
src/agent.ts
@@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
|
||||
import {analyzeSteps} from "./tools/error-analyzer";
|
||||
import {TokenTracker} from "./utils/token-tracker";
|
||||
import {ActionTracker} from "./utils/action-tracker";
|
||||
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType} from "./types";
|
||||
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType, BoostedSearchResult} from "./types";
|
||||
import {TrackerContext} from "./types";
|
||||
import {search} from "./tools/jina-search";
|
||||
// import {grounding} from "./tools/grounding";
|
||||
@@ -19,8 +19,8 @@ import {zodToJsonSchema} from "zod-to-json-schema";
|
||||
import {ObjectGeneratorSafe} from "./utils/safe-generator";
|
||||
import {CodeSandbox} from "./tools/code-sandbox";
|
||||
import {serperSearch} from './tools/serper-search';
|
||||
import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
|
||||
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
|
||||
import {calculateBoostedWeights, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
|
||||
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools";
|
||||
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
|
||||
|
||||
async function sleep(ms: number) {
|
||||
@@ -132,18 +132,22 @@ ${learnedStrategy}
|
||||
if (allowRead) {
|
||||
let urlList = '';
|
||||
if (allURLs && allURLs.length > 0) {
|
||||
urlList = allURLs
|
||||
const weightedURLs = calculateBoostedWeights(allURLs) as BoostedSearchResult[]
|
||||
|
||||
urlList = (weightedURLs)
|
||||
.filter(r => 'url' in r)
|
||||
.map(r => ` + "${r.url}": "${r.title}"`)
|
||||
.sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0))
|
||||
.slice(0, 10) // save context window and reduce noise, only keep top 10 urls
|
||||
.map(r => ` + weight: ${r.boostedWeight.toFixed(3)} "${r.url}": "${r.title}"`)
|
||||
.join('\n');
|
||||
}
|
||||
|
||||
actionSections.push(`
|
||||
<action-visit>
|
||||
- Access and read full content from URLs
|
||||
- Must check URLs mentioned in <question>
|
||||
- Must check URLs mentioned in <question> if any
|
||||
${urlList ? `
|
||||
- Review relevant URLs below for additional information
|
||||
- Choose and visit relevant URLs below for more knowledge. higher weight means more relevant and you should visit first:
|
||||
<url-list>
|
||||
${urlList}
|
||||
</url-list>
|
||||
@@ -302,7 +306,7 @@ export async function getResponse(question?: string,
|
||||
evaluationMetrics[currentQuestion] =
|
||||
await evaluateQuestion(currentQuestion, context, SchemaGen)
|
||||
}
|
||||
if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step===1) {
|
||||
if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step === 1) {
|
||||
// force strict eval for the original question, only once.
|
||||
evaluationMetrics[currentQuestion].push('strict')
|
||||
}
|
||||
@@ -315,7 +319,7 @@ export async function getResponse(question?: string,
|
||||
|
||||
// update all urls with buildURLMap
|
||||
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
|
||||
allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 50); // disable search when too many urls already
|
||||
allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 70); // disable search when too many urls already
|
||||
|
||||
// generate prompt for this step
|
||||
system = getPrompt(
|
||||
@@ -568,10 +572,20 @@ But then you realized you have asked them before. You decided to think out of
|
||||
const minResults = (results).map(r => ({
|
||||
title: r.title,
|
||||
url: normalizeUrl('url' in r ? r.url : r.link),
|
||||
description: 'description' in r ? r.description : r.snippet
|
||||
description: 'description' in r ? r.description : r.snippet,
|
||||
}));
|
||||
|
||||
minResults.forEach(r => allURLs[r.url] = r);
|
||||
minResults.forEach(r => {
|
||||
if (!allURLs[r.url]) {
|
||||
allURLs[r.url] = r;
|
||||
allURLs[r.url].weight = 1;
|
||||
} else {
|
||||
(allURLs[r.url].weight as number)++;
|
||||
const curDesc = (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description;
|
||||
(allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description = smartMergeStrings(curDesc, r.description);
|
||||
}
|
||||
|
||||
});
|
||||
allKeywords.push(query);
|
||||
|
||||
allKnowledge.push({
|
||||
@@ -722,11 +736,11 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
}
|
||||
|
||||
|
||||
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
|
||||
await sleep(STEP_SLEEP);
|
||||
}
|
||||
|
||||
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
|
||||
if (!(thisStep as AnswerAction).isFinal) {
|
||||
console.log('Enter Beast mode!!!')
|
||||
// any answer is better than no answer, humanity last resort
|
||||
@@ -766,7 +780,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
(thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction))
|
||||
console.log(thisStep)
|
||||
|
||||
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
|
||||
return {
|
||||
result: thisStep,
|
||||
context,
|
||||
@@ -776,16 +790,25 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
|
||||
}
|
||||
|
||||
async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
|
||||
async function storeContext(prompt: string, schema: any, memory: {
|
||||
allContext: StepAction[];
|
||||
allKeywords: string[];
|
||||
allQuestions: string[];
|
||||
allKnowledge: KnowledgeItem[];
|
||||
allURLs: Record<string, SearchResult>;
|
||||
}
|
||||
, step: number) {
|
||||
|
||||
const {allContext, allKeywords, allQuestions, allKnowledge, allURLs} = memory;
|
||||
if ((process as any).asyncLocalContext?.available?.()) {
|
||||
const [context, keywords, questions, knowledge] = memory;
|
||||
|
||||
(process as any).asyncLocalContext.ctx.promptContext = {
|
||||
prompt,
|
||||
schema,
|
||||
context,
|
||||
keywords,
|
||||
questions,
|
||||
knowledge,
|
||||
allContext,
|
||||
allKeywords,
|
||||
allQuestions,
|
||||
allKnowledge,
|
||||
step
|
||||
};
|
||||
return;
|
||||
@@ -799,11 +822,11 @@ ${prompt}
|
||||
JSONSchema:
|
||||
${JSON.stringify(zodToJsonSchema(schema), null, 2)}
|
||||
`);
|
||||
const [context, keywords, questions, knowledge] = memory;
|
||||
await fs.writeFile('context.json', JSON.stringify(context, null, 2));
|
||||
await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2));
|
||||
await fs.writeFile('questions.json', JSON.stringify(questions, null, 2));
|
||||
await fs.writeFile('knowledge.json', JSON.stringify(knowledge, null, 2));
|
||||
await fs.writeFile('context.json', JSON.stringify(allContext, null, 2));
|
||||
await fs.writeFile('queries.json', JSON.stringify(allKeywords, null, 2));
|
||||
await fs.writeFile('questions.json', JSON.stringify(allQuestions, null, 2));
|
||||
await fs.writeFile('knowledge.json', JSON.stringify(allKnowledge, null, 2));
|
||||
await fs.writeFile('urls.json', JSON.stringify(calculateBoostedWeights(Object.entries(allURLs).map(([, result]) => result)), null, 2));
|
||||
} catch (error) {
|
||||
console.error('Context storage failed:', error);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ import {TokenTracker} from "../utils/token-tracker";
|
||||
import {JINA_API_KEY} from "../config";
|
||||
|
||||
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
|
||||
const SIMILARITY_THRESHOLD = 0.85; // Adjustable threshold for cosine similarity
|
||||
const SIMILARITY_THRESHOLD = 0.888; // Adjustable threshold for cosine similarity
|
||||
|
||||
const JINA_API_CONFIG = {
|
||||
MODEL: 'jina-embeddings-v3',
|
||||
|
||||
16
src/types.ts
16
src/types.ts
@@ -172,9 +172,20 @@ export type ErrorAnalysisResponse = {
|
||||
};
|
||||
|
||||
export type SearchResult =
|
||||
| { title: string; url: string; description: string }
|
||||
| { title: string; link: string; snippet: string };
|
||||
| { title: string; url: string; description: string; weight?: number }
|
||||
| { title: string; link: string; snippet: string; weight?: number };
|
||||
|
||||
// Search result annotated by calculateBoostedWeights (url-tools) with the
// frequency-based ranking signals used to sort and render the <url-list>
// shown to the agent.
export type BoostedSearchResult = {
|
||||
title: string;
|
||||
url: string;
|
||||
description: string;
|
||||
weight: number; // base weight: set to 1 on first sighting, incremented per duplicate search hit
|
||||
originalWeight: number; // weight as read by calculateBoostedWeights (defaults to 1 when missing)
|
||||
hostnameBoost: number; // normalized hostname frequency * hostnameBoostFactor
|
||||
pathBoost: number; // depth-decayed path-prefix frequency * pathBoostFactor
|
||||
boostScore: number; // hostnameBoost + pathBoost, clamped to [minBoost, maxBoost]
|
||||
boostedWeight: number; // originalWeight + boostScore; higher sorts first
|
||||
}
|
||||
|
||||
// OpenAI API Types
|
||||
export interface Model {
|
||||
@@ -190,6 +201,7 @@ export type ResponseFormat = {
|
||||
type: 'json_schema' | 'json_object';
|
||||
json_schema?: any;
|
||||
}
|
||||
|
||||
export interface ChatCompletionRequest {
|
||||
model: string;
|
||||
messages: Array<CoreUserMessage | CoreAssistantMessage>;
|
||||
|
||||
@@ -160,4 +160,40 @@ export function getI18nText(key: string, lang = 'en', params: Record<string, str
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
||||
export function smartMergeStrings(str1: string, str2: string): string {
|
||||
// If either string is empty, return the other
|
||||
if (!str1) return str2;
|
||||
if (!str2) return str1;
|
||||
|
||||
// Check if one string is entirely contained within the other
|
||||
if (str1.includes(str2)) return str1;
|
||||
if (str2.includes(str1)) return str2;
|
||||
|
||||
// Find the maximum possible overlap length
|
||||
const maxOverlap = Math.min(str1.length, str2.length);
|
||||
let bestOverlapLength = 0;
|
||||
|
||||
// Check for overlaps starting from the largest possible
|
||||
for (let overlapLength = maxOverlap; overlapLength > 0; overlapLength--) {
|
||||
// Get the end of first string with the current overlap length
|
||||
const endOfStr1 = str1.slice(str1.length - overlapLength);
|
||||
// Get the beginning of second string with the current overlap length
|
||||
const startOfStr2 = str2.slice(0, overlapLength);
|
||||
|
||||
// If they match, we've found our overlap
|
||||
if (endOfStr1 === startOfStr2) {
|
||||
bestOverlapLength = overlapLength;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Merge the strings using the best overlap
|
||||
if (bestOverlapLength > 0) {
|
||||
return str1.slice(0, str1.length - bestOverlapLength) + str2;
|
||||
} else {
|
||||
// No overlap found, concatenate normally
|
||||
return str1 + str2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import {SearchResult} from "../types";
|
||||
import {BoostedSearchResult, SearchResult} from "../types";
|
||||
|
||||
export function normalizeUrl(urlString: string, debug = false): string {
|
||||
if (!urlString?.trim()) {
|
||||
@@ -101,4 +101,110 @@ export function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedU
|
||||
return Object.entries(allURLs)
|
||||
.filter(([url]) => !visitedURLs.includes(url))
|
||||
.map(([, result]) => result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Function to extract hostname and path from a URL
|
||||
const extractUrlParts = (urlStr: string) => {
|
||||
try {
|
||||
const url = new URL(urlStr);
|
||||
return {
|
||||
hostname: url.hostname,
|
||||
path: url.pathname
|
||||
};
|
||||
} catch (e) {
|
||||
console.error(`Error parsing URL: ${urlStr}`, e);
|
||||
return { hostname: "", path: "" };
|
||||
}
|
||||
};
|
||||
|
||||
// Function to count occurrences of hostnames and paths
|
||||
const countUrlParts = (urlItems: SearchResult[]) => {
|
||||
const hostnameCount: Record<string, number> = {};
|
||||
const pathPrefixCount: Record<string, number> = {};
|
||||
let totalUrls = 0;
|
||||
|
||||
urlItems.forEach(item => {
|
||||
item = (item as { title: string; url: string; description: string; weight?: number })
|
||||
if (!item || !item.url) return; // Skip invalid items
|
||||
|
||||
totalUrls++;
|
||||
const { hostname, path } = extractUrlParts(item.url);
|
||||
|
||||
// Count hostnames
|
||||
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
|
||||
|
||||
// Count path prefixes (segments)
|
||||
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
||||
pathSegments.forEach((segment, index) => {
|
||||
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
||||
pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
|
||||
});
|
||||
});
|
||||
|
||||
return { hostnameCount, pathPrefixCount, totalUrls };
|
||||
};
|
||||
|
||||
// Calculate normalized frequency for boosting
|
||||
const normalizeCount = (count: any, total: any) => {
|
||||
return total > 0 ? count / total : 0;
|
||||
};
|
||||
|
||||
// Calculate boosted weights
|
||||
export const calculateBoostedWeights = (urlItems: SearchResult[], options: any = {}): any[] => {
|
||||
// Default parameters for boosting - can be overridden
|
||||
const {
|
||||
hostnameBoostFactor = 0.7, // How much to boost based on hostname frequency
|
||||
pathBoostFactor = 0.4, // How much to boost based on path frequency
|
||||
decayFactor = 0.8, // Decay factor for longer paths (0-1)
|
||||
minBoost = 0, // Minimum boost score
|
||||
maxBoost = 5 // Maximum boost score cap
|
||||
} = options;
|
||||
|
||||
// Count URL parts first
|
||||
const counts = countUrlParts(urlItems);
|
||||
const { hostnameCount, pathPrefixCount, totalUrls } = counts;
|
||||
|
||||
return urlItems.map(item => {
|
||||
item = (item as BoostedSearchResult)
|
||||
if (!item || !item.url) {
|
||||
console.error('Skipping invalid item:', item);
|
||||
return item; // Return unchanged
|
||||
}
|
||||
|
||||
const { hostname, path } = extractUrlParts(item.url);
|
||||
|
||||
// Base weight from original
|
||||
const originalWeight = item.weight || 1.0; // Default to 1 if weight is missing
|
||||
|
||||
// Hostname boost (normalized by total URLs)
|
||||
const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
|
||||
const hostnameBoost = hostnameFreq * hostnameBoostFactor;
|
||||
|
||||
// Path boost (consider all path prefixes with decay for longer paths)
|
||||
let pathBoost = 0;
|
||||
const pathSegments = path.split('/').filter(segment => segment.length > 0);
|
||||
pathSegments.forEach((segment, index) => {
|
||||
const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
|
||||
const prefixCount = pathPrefixCount[prefix] || 0;
|
||||
const prefixFreq = normalizeCount(prefixCount, totalUrls);
|
||||
|
||||
// Apply decay factor based on path depth
|
||||
const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
|
||||
pathBoost += decayedBoost;
|
||||
});
|
||||
|
||||
// Calculate new weight with clamping
|
||||
const boostScore = Math.min(Math.max(hostnameBoost + pathBoost, minBoost), maxBoost);
|
||||
const boostedWeight = originalWeight + boostScore;
|
||||
|
||||
return {
|
||||
...item,
|
||||
originalWeight,
|
||||
hostnameBoost,
|
||||
pathBoost,
|
||||
boostScore,
|
||||
boostedWeight
|
||||
} as BoostedSearchResult;
|
||||
});
|
||||
};
|
||||
Reference in New Issue
Block a user