From 51ad77d302d34170c8db8fffe3753f64048f5fb2 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Tue, 4 Mar 2025 16:29:22 +0800 Subject: [PATCH] feat: add url ranking --- src/agent.ts | 73 +++++++++++++++++--------- src/tools/jina-dedup.ts | 2 +- src/types.ts | 16 +++++- src/utils/text-tools.ts | 38 +++++++++++++- src/utils/url-tools.ts | 110 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 208 insertions(+), 31 deletions(-) diff --git a/src/agent.ts b/src/agent.ts index 665fac0..4851603 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator"; import {analyzeSteps} from "./tools/error-analyzer"; import {TokenTracker} from "./utils/token-tracker"; import {ActionTracker} from "./utils/action-tracker"; -import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType} from "./types"; +import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType, BoostedSearchResult} from "./types"; import {TrackerContext} from "./types"; import {search} from "./tools/jina-search"; // import {grounding} from "./tools/grounding"; @@ -19,8 +19,8 @@ import {zodToJsonSchema} from "zod-to-json-schema"; import {ObjectGeneratorSafe} from "./utils/safe-generator"; import {CodeSandbox} from "./tools/code-sandbox"; import {serperSearch} from './tools/serper-search'; -import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools"; -import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools"; +import {calculateBoostedWeights, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools"; +import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools"; import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas"; async function sleep(ms: number) { @@ -132,18 +132,22 @@ ${learnedStrategy} if (allowRead) { let urlList = ''; if (allURLs && allURLs.length > 0) { - urlList = allURLs + const weightedURLs = calculateBoostedWeights(allURLs) as BoostedSearchResult[] + + urlList = (weightedURLs) .filter(r => 'url' in r) - .map(r => ` + "${r.url}": "${r.title}"`) + .sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0)) + .slice(0, 10) // save context window and reduce noise, only keep top 10 urls + .map(r => ` + weight: ${r.boostedWeight.toFixed(3)} "${r.url}": "${r.title}"`) .join('\n'); } actionSections.push(` - Access and read full content from URLs -- Must check URLs mentioned in +- Must check URLs mentioned in if any ${urlList ? ` -- Review relevant URLs below for additional information +- Choose and visit relevant URLs below for more knowledge. higher weight means more relevant and you should visit first: ${urlList} @@ -302,7 +306,7 @@ export async function getResponse(question?: string, evaluationMetrics[currentQuestion] = await evaluateQuestion(currentQuestion, context, SchemaGen) } - if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step===1) { + if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step === 1) { // force strict eval for the original question, only once. evaluationMetrics[currentQuestion].push('strict') } @@ -315,7 +319,7 @@ export async function getResponse(question?: string, // update all urls with buildURLMap // allowRead = allowRead && (Object.keys(allURLs).length > 0); - allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 50); // disable search when too many urls already + allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 70); // disable search when too many urls already // generate prompt for this step system = getPrompt( @@ -568,10 +572,20 @@ But then you realized you have asked them before. You decided to to think out of const minResults = (results).map(r => ({ title: r.title, url: normalizeUrl('url' in r ? r.url : r.link), - description: 'description' in r ? r.description : r.snippet + description: 'description' in r ? r.description : r.snippet, })); - minResults.forEach(r => allURLs[r.url] = r); + minResults.forEach(r => { + if (!allURLs[r.url]) { + allURLs[r.url] = r; + allURLs[r.url].weight = 1; + } else { + (allURLs[r.url].weight as number)++; + const curDesc = (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description; + (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description = smartMergeStrings(curDesc, r.description); + } + + }); allKeywords.push(query); allKnowledge.push({ @@ -722,11 +736,11 @@ But unfortunately, you failed to solve the issue. You need to think out of the b } - await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep); await sleep(STEP_SLEEP); } - await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep); if (!(thisStep as AnswerAction).isFinal) { console.log('Enter Beast mode!!!') // any answer is better than no answer, humanity last resort @@ -766,7 +780,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b (thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction)) console.log(thisStep) - await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep); + await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep); return { result: thisStep, context, @@ -776,16 +790,25 @@ But unfortunately, you failed to solve the issue. You need to think out of the b } -async function storeContext(prompt: string, schema: any, memory: any[][], step: number) { +async function storeContext(prompt: string, schema: any, memory: { + allContext: StepAction[]; + allKeywords: string[]; + allQuestions: string[]; + allKnowledge: KnowledgeItem[]; + allURLs: Record; + } + , step: number) { + + const {allContext, allKeywords, allQuestions, allKnowledge, allURLs} = memory; if ((process as any).asyncLocalContext?.available?.()) { - const [context, keywords, questions, knowledge] = memory; + (process as any).asyncLocalContext.ctx.promptContext = { prompt, schema, - context, - keywords, - questions, - knowledge, + allContext, + allKeywords, + allQuestions, + allKnowledge, step }; return; @@ -799,11 +822,11 @@ ${prompt} JSONSchema: ${JSON.stringify(zodToJsonSchema(schema), null, 2)} `); - const [context, keywords, questions, knowledge] = memory; - await fs.writeFile('context.json', JSON.stringify(context, null, 2)); - await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2)); - await fs.writeFile('questions.json', JSON.stringify(questions, null, 2)); - await fs.writeFile('knowledge.json', JSON.stringify(knowledge, null, 2)); + await fs.writeFile('context.json', JSON.stringify(allContext, null, 2)); + await fs.writeFile('queries.json', JSON.stringify(allKeywords, null, 2)); + await fs.writeFile('questions.json', JSON.stringify(allQuestions, null, 2)); + await fs.writeFile('knowledge.json', JSON.stringify(allKnowledge, null, 2)); + await fs.writeFile('urls.json', JSON.stringify(calculateBoostedWeights(Object.entries(allURLs).map(([, result]) => result)), null, 2)); } catch (error) { console.error('Context storage failed:', error); } diff --git a/src/tools/jina-dedup.ts b/src/tools/jina-dedup.ts index abcbdeb..f18768a 100644 --- a/src/tools/jina-dedup.ts +++ b/src/tools/jina-dedup.ts @@ -3,7 +3,7 @@ import {TokenTracker} from "../utils/token-tracker"; import {JINA_API_KEY} from "../config"; const JINA_API_URL = 'https://api.jina.ai/v1/embeddings'; -const SIMILARITY_THRESHOLD = 0.85; // Adjustable threshold for cosine similarity +const SIMILARITY_THRESHOLD = 0.888; // Adjustable threshold for cosine similarity const JINA_API_CONFIG = { MODEL: 'jina-embeddings-v3', diff --git a/src/types.ts b/src/types.ts index f2a6d50..d8ad27a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -172,9 +172,20 @@ export type ErrorAnalysisResponse = { }; export type SearchResult = - | { title: string; url: string; description: string } - | { title: string; link: string; snippet: string }; + | { title: string; url: string; description: string; weight?: number } + | { title: string; link: string; snippet: string; weight?: number }; +export type BoostedSearchResult = { + title: string; + url: string; + description: string; + weight: number; + originalWeight: number; + hostnameBoost: number; + pathBoost: number; + boostScore: number; + boostedWeight: number; +} // OpenAI API Types export interface Model { @@ -190,6 +201,7 @@ export type ResponseFormat = { type: 'json_schema' | 'json_object'; json_schema?: any; } + export interface ChatCompletionRequest { model: string; messages: Array; diff --git a/src/utils/text-tools.ts b/src/utils/text-tools.ts index f0e8a62..e7e5941 100644 --- a/src/utils/text-tools.ts +++ b/src/utils/text-tools.ts @@ -160,4 +160,40 @@ export function getI18nText(key: string, lang = 'en', params: Record 0; overlapLength--) { + // Get the end of first string with the current overlap length + const endOfStr1 = str1.slice(str1.length - overlapLength); + // Get the beginning of second string with the current overlap length + const startOfStr2 = str2.slice(0, overlapLength); + + // If they match, we've found our overlap + if (endOfStr1 === startOfStr2) { + bestOverlapLength = overlapLength; + break; + } + } + + // Merge the strings using the best overlap + if (bestOverlapLength > 0) { + return str1.slice(0, str1.length - bestOverlapLength) + str2; + } else { + // No overlap found, concatenate normally + return str1 + str2; + } +} diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index a882fb9..39148e8 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -1,4 +1,4 @@ -import {SearchResult} from "../types"; +import {BoostedSearchResult, SearchResult} from "../types"; export function normalizeUrl(urlString: string, debug = false): string { if (!urlString?.trim()) { @@ -101,4 +101,110 @@ export function getUnvisitedURLs(allURLs: Record, visitedU return Object.entries(allURLs) .filter(([url]) => !visitedURLs.includes(url)) .map(([, result]) => result); -} \ No newline at end of file +} + + +// Function to extract hostname and path from a URL +const extractUrlParts = (urlStr: string) => { + try { + const url = new URL(urlStr); + return { + hostname: url.hostname, + path: url.pathname + }; + } catch (e) { + console.error(`Error parsing URL: ${urlStr}`, e); + return { hostname: "", path: "" }; + } +}; + +// Function to count occurrences of hostnames and paths +const countUrlParts = (urlItems: SearchResult[]) => { + const hostnameCount: Record = {}; + const pathPrefixCount: Record = {}; + let totalUrls = 0; + + urlItems.forEach(item => { + item = (item as { title: string; url: string; description: string; weight?: number }) + if (!item || !item.url) return; // Skip invalid items + + totalUrls++; + const { hostname, path } = extractUrlParts(item.url); + + // Count hostnames + hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1; + + // Count path prefixes (segments) + const pathSegments = path.split('/').filter(segment => segment.length > 0); + pathSegments.forEach((segment, index) => { + const prefix = '/' + pathSegments.slice(0, index + 1).join('/'); + pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1; + }); + }); + + return { hostnameCount, pathPrefixCount, totalUrls }; +}; + +// Calculate normalized frequency for boosting +const normalizeCount = (count: any, total: any) => { + return total > 0 ? count / total : 0; +}; + +// Calculate boosted weights +export const calculateBoostedWeights = (urlItems: SearchResult[], options: any = {}): any[] => { + // Default parameters for boosting - can be overridden + const { + hostnameBoostFactor = 0.7, // How much to boost based on hostname frequency + pathBoostFactor = 0.4, // How much to boost based on path frequency + decayFactor = 0.8, // Decay factor for longer paths (0-1) + minBoost = 0, // Minimum boost score + maxBoost = 5 // Maximum boost score cap + } = options; + + // Count URL parts first + const counts = countUrlParts(urlItems); + const { hostnameCount, pathPrefixCount, totalUrls } = counts; + + return urlItems.map(item => { + item = (item as BoostedSearchResult) + if (!item || !item.url) { + console.error('Skipping invalid item:', item); + return item; // Return unchanged + } + + const { hostname, path } = extractUrlParts(item.url); + + // Base weight from original + const originalWeight = item.weight || 1.0; // Default to 1 if weight is missing + + // Hostname boost (normalized by total URLs) + const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls); + const hostnameBoost = hostnameFreq * hostnameBoostFactor; + + // Path boost (consider all path prefixes with decay for longer paths) + let pathBoost = 0; + const pathSegments = path.split('/').filter(segment => segment.length > 0); + pathSegments.forEach((segment, index) => { + const prefix = '/' + pathSegments.slice(0, index + 1).join('/'); + const prefixCount = pathPrefixCount[prefix] || 0; + const prefixFreq = normalizeCount(prefixCount, totalUrls); + + // Apply decay factor based on path depth + const decayedBoost = prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor; + pathBoost += decayedBoost; + }); + + // Calculate new weight with clamping + const boostScore = Math.min(Math.max(hostnameBoost + pathBoost, minBoost), maxBoost); + const boostedWeight = originalWeight + boostScore; + + return { + ...item, + originalWeight, + hostnameBoost, + pathBoost, + boostScore, + boostedWeight + } as BoostedSearchResult; + }); +}; \ No newline at end of file