feat: add hostnames bw filter

This commit is contained in:
Han Xiao 2025-03-18 10:43:46 +08:00
parent b27eced6f8
commit 4ca7804e58
4 changed files with 22 additions and 6 deletions

View File

@ -354,6 +354,8 @@ export async function getResponse(question?: string,
messages?: Array<CoreMessage>,
numReturnedURLs: number = 100,
noDirectAnswer: boolean = false,
boostHostnames: string[] = [],
badHostnames: string[] = [],
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
let step = 0;
@ -446,9 +448,10 @@ export async function getResponse(question?: string,
if (allURLs && Object.keys(allURLs).length > 0) {
// rerank urls
weightedURLs = rankURLs(
filterURLs(allURLs, visitedURLs),
filterURLs(allURLs, visitedURLs, badHostnames),
{
question: currentQuestion
question: currentQuestion,
boostHostnames
}, context);
// improve diversity by keep top 2 urls of each hostname
weightedURLs = keepKPerHostname(weightedURLs, 2);

View File

@ -548,7 +548,16 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
visitedURLs,
readURLs,
allURLs
} = await getResponse(undefined, tokenBudget, maxBadAttempts, context, body.messages, body.max_returned_urls, body.no_direct_answer)
} = await getResponse(undefined,
tokenBudget,
maxBadAttempts,
context,
body.messages,
body.max_returned_urls,
body.no_direct_answer,
body.boostHostnames,
body.badHostnames,
)
let finalAnswer = (finalStep as AnswerAction).mdAnswer;
const annotations = (finalStep as AnswerAction).references?.map(ref => ({

View File

@ -217,6 +217,9 @@ export interface ChatCompletionRequest {
response_format?: ResponseFormat;
no_direct_answer?: boolean;
max_returned_urls?: number;
boostHostnames?: string[];
badHostnames?: string[];
}
export interface URLAnnotation {

View File

@ -140,9 +140,9 @@ export function normalizeUrl(urlString: string, debug = false, options = {
}
}
/**
 * Select the search snippets that are still worth considering.
 *
 * An entry of `allURLs` is dropped when its key is listed in `visitedURLs`,
 * or when the hostname reported by `extractUrlParts` for that key is listed
 * in `badHostnames`.
 *
 * @param allURLs - Snippets keyed by URL.
 * @param visitedURLs - URL keys to exclude.
 * @param badHostnames - Hostnames to exclude. Defaults to an empty list so
 *   callers that pass only two arguments keep working.
 * @returns The remaining snippets, in the order produced by `Object.entries`.
 */
export function filterURLs(
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
  badHostnames: string[] = [],
): SearchSnippet[] {
  // Sets turn each membership check into a constant-time lookup instead of
  // rescanning the arrays for every candidate URL.
  const visited = new Set(visitedURLs);
  const blocked = new Set(badHostnames);

  return Object.entries(allURLs)
    .filter(([url]) => {
      if (visited.has(url)) return false;
      // Only extract the hostname when there is a block list to compare against.
      return blocked.size === 0 || !blocked.has(extractUrlParts(url).hostname);
    })
    .map(([, snippet]) => snippet);
}
@ -205,6 +205,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
minBoost = 0, // Minimum boost score
maxBoost = 5, // Maximum boost score cap
question = '', // Optional question for Jina reranking
boostHostnames = [], // Optional hostnames to boost
} = options;
// Count URL parts first
@ -235,7 +236,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
// Hostname boost (normalized by total URLs)
const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
const hostnameBoost = hostnameFreq * hostnameBoostFactor;
const hostnameBoost = hostnameFreq * hostnameBoostFactor * (boostHostnames.includes(hostname) ? 2 : 1);
// Path boost (consider all path prefixes with decay for longer paths)
let pathBoost = 0;