From 4ca7804e58404e2acd1c862506bfd93c86542f44 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Tue, 18 Mar 2025 10:43:46 +0800 Subject: [PATCH] feat: add hostnames bw filter --- src/agent.ts | 7 +++++-- src/app.ts | 11 ++++++++++- src/types.ts | 3 +++ src/utils/url-tools.ts | 7 ++++--- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/agent.ts b/src/agent.ts index 2ae6cc9..9bd9d29 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -354,6 +354,8 @@ export async function getResponse(question?: string, messages?: Array, numReturnedURLs: number = 100, noDirectAnswer: boolean = false, + boostHostnames: string[] = [], + badHostnames: string[] = [], ): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> { let step = 0; @@ -446,9 +448,10 @@ export async function getResponse(question?: string, if (allURLs && Object.keys(allURLs).length > 0) { // rerank urls weightedURLs = rankURLs( - filterURLs(allURLs, visitedURLs), + filterURLs(allURLs, visitedURLs, badHostnames), { - question: currentQuestion + question: currentQuestion, + boostHostnames }, context); // improve diversity by keep top 2 urls of each hostname weightedURLs = keepKPerHostname(weightedURLs, 2); diff --git a/src/app.ts b/src/app.ts index c50cc7c..f6983f0 100644 --- a/src/app.ts +++ b/src/app.ts @@ -548,7 +548,16 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => { visitedURLs, readURLs, allURLs - } = await getResponse(undefined, tokenBudget, maxBadAttempts, context, body.messages, body.max_returned_urls, body.no_direct_answer) + } = await getResponse(undefined, + tokenBudget, + maxBadAttempts, + context, + body.messages, + body.max_returned_urls, + body.no_direct_answer, + body.boostHostnames, + body.badHostnames, + ) let finalAnswer = (finalStep as AnswerAction).mdAnswer; const annotations = (finalStep as AnswerAction).references?.map(ref => ({ diff --git a/src/types.ts b/src/types.ts index cfc33b6..97279d5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -217,6 +217,9 @@ export interface ChatCompletionRequest { response_format?: ResponseFormat; no_direct_answer?: boolean; max_returned_urls?: number; + + boostHostnames?: string[]; + badHostnames?: string[]; } export interface URLAnnotation { diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index f166884..8c8a5d9 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -140,9 +140,9 @@ export function normalizeUrl(urlString: string, debug = false, options = { } } -export function filterURLs(allURLs: Record, visitedURLs: string[]): SearchSnippet[] { +export function filterURLs(allURLs: Record, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] { return Object.entries(allURLs) - .filter(([url,]) => !visitedURLs.includes(url)) + .filter(([url,]) => !visitedURLs.includes(url) && !badHostnames.includes(extractUrlParts(url).hostname)) .map(([, result]) => result); } @@ -205,6 +205,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: minBoost = 0, // Minimum boost score maxBoost = 5, // Maximum boost score cap question = '', // Optional question for Jina reranking + boostHostnames = [], // Optional hostnames to boost } = options; // Count URL parts first @@ -235,7 +236,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: // Hostname boost (normalized by total URLs) const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls); - const hostnameBoost = hostnameFreq * hostnameBoostFactor; + const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1); // Path boost (consider all path prefixes with decay for longer paths) let pathBoost = 0;