feat: add url ranking

This commit is contained in:
Han Xiao
2025-03-04 16:29:22 +08:00
parent 79542148d8
commit 51ad77d302
5 changed files with 208 additions and 31 deletions

View File

@@ -11,7 +11,7 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType} from "./types";
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType, BoostedSearchResult} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
// import {grounding} from "./tools/grounding";
@@ -19,8 +19,8 @@ import {zodToJsonSchema} from "zod-to-json-schema";
import {ObjectGeneratorSafe} from "./utils/safe-generator";
import {CodeSandbox} from "./tools/code-sandbox";
import {serperSearch} from './tools/serper-search';
import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
import {calculateBoostedWeights, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
async function sleep(ms: number) {
@@ -132,18 +132,22 @@ ${learnedStrategy}
if (allowRead) {
let urlList = '';
if (allURLs && allURLs.length > 0) {
urlList = allURLs
const weightedURLs = calculateBoostedWeights(allURLs) as BoostedSearchResult[]
urlList = (weightedURLs)
.filter(r => 'url' in r)
.map(r => ` + "${r.url}": "${r.title}"`)
.sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0))
.slice(0, 10) // save context window and reduce noise, only keep top 10 urls
.map(r => ` + weight: ${r.boostedWeight.toFixed(3)} "${r.url}": "${r.title}"`)
.join('\n');
}
actionSections.push(`
<action-visit>
- Access and read full content from URLs
- Must check URLs mentioned in <question>
- Must check URLs mentioned in <question> if any
${urlList ? `
- Review relevant URLs below for additional information
- Choose and visit relevant URLs below for more knowledge. higher weight means more relevant and you should visit first:
<url-list>
${urlList}
</url-list>
@@ -302,7 +306,7 @@ export async function getResponse(question?: string,
evaluationMetrics[currentQuestion] =
await evaluateQuestion(currentQuestion, context, SchemaGen)
}
if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step===1) {
if (currentQuestion.trim() === question && !evaluationMetrics[currentQuestion].includes('strict') && step === 1) {
// force strict eval for the original question, only once.
evaluationMetrics[currentQuestion].push('strict')
}
@@ -315,7 +319,7 @@ export async function getResponse(question?: string,
// update all urls with buildURLMap
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 50); // disable search when too many urls already
allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 70); // disable search when too many urls already
// generate prompt for this step
system = getPrompt(
@@ -568,10 +572,20 @@ But then you realized you have asked them before. You decided to to think out of
const minResults = (results).map(r => ({
title: r.title,
url: normalizeUrl('url' in r ? r.url : r.link),
description: 'description' in r ? r.description : r.snippet
description: 'description' in r ? r.description : r.snippet,
}));
minResults.forEach(r => allURLs[r.url] = r);
minResults.forEach(r => {
if (!allURLs[r.url]) {
allURLs[r.url] = r;
allURLs[r.url].weight = 1;
} else {
(allURLs[r.url].weight as number)++;
const curDesc = (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description;
(allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description = smartMergeStrings(curDesc, r.description);
}
});
allKeywords.push(query);
allKnowledge.push({
@@ -722,11 +736,11 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
}
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
await sleep(STEP_SLEEP);
}
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
if (!(thisStep as AnswerAction).isFinal) {
console.log('Enter Beast mode!!!')
// any answer is better than no answer, humanity last resort
@@ -766,7 +780,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
(thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction))
console.log(thisStep)
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
return {
result: thisStep,
context,
@@ -776,16 +790,25 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
}
async function storeContext(prompt: string, schema: any, memory: any[][], step: number) {
async function storeContext(prompt: string, schema: any, memory: {
allContext: StepAction[];
allKeywords: string[];
allQuestions: string[];
allKnowledge: KnowledgeItem[];
allURLs: Record<string, SearchResult>;
}
, step: number) {
const {allContext, allKeywords, allQuestions, allKnowledge, allURLs} = memory;
if ((process as any).asyncLocalContext?.available?.()) {
const [context, keywords, questions, knowledge] = memory;
(process as any).asyncLocalContext.ctx.promptContext = {
prompt,
schema,
context,
keywords,
questions,
knowledge,
allContext,
allKeywords,
allQuestions,
allKnowledge,
step
};
return;
@@ -799,11 +822,11 @@ ${prompt}
JSONSchema:
${JSON.stringify(zodToJsonSchema(schema), null, 2)}
`);
const [context, keywords, questions, knowledge] = memory;
await fs.writeFile('context.json', JSON.stringify(context, null, 2));
await fs.writeFile('queries.json', JSON.stringify(keywords, null, 2));
await fs.writeFile('questions.json', JSON.stringify(questions, null, 2));
await fs.writeFile('knowledge.json', JSON.stringify(knowledge, null, 2));
await fs.writeFile('context.json', JSON.stringify(allContext, null, 2));
await fs.writeFile('queries.json', JSON.stringify(allKeywords, null, 2));
await fs.writeFile('questions.json', JSON.stringify(allQuestions, null, 2));
await fs.writeFile('knowledge.json', JSON.stringify(allKnowledge, null, 2));
await fs.writeFile('urls.json', JSON.stringify(calculateBoostedWeights(Object.entries(allURLs).map(([, result]) => result)), null, 2));
} catch (error) {
console.error('Context storage failed:', error);
}

View File

@@ -3,7 +3,7 @@ import {TokenTracker} from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
const SIMILARITY_THRESHOLD = 0.85; // Adjustable threshold for cosine similarity
const SIMILARITY_THRESHOLD = 0.888; // Adjustable threshold for cosine similarity
const JINA_API_CONFIG = {
MODEL: 'jina-embeddings-v3',

View File

@@ -172,9 +172,20 @@ export type ErrorAnalysisResponse = {
};
export type SearchResult =
| { title: string; url: string; description: string }
| { title: string; link: string; snippet: string };
| { title: string; url: string; description: string; weight?: number }
| { title: string; link: string; snippet: string; weight?: number };
// A SearchResult (url-shaped variant) augmented with URL-frequency ranking
// scores. Produced by calculateBoostedWeights in utils/url-tools.
export type BoostedSearchResult = {
  title: string;
  url: string;
  description: string;
  weight: number;          // base weight; incremented each time the same URL reappears in search results
  originalWeight: number;  // the weight before boosting (defaults to 1 when missing)
  hostnameBoost: number;   // boost contribution from hostname frequency across the result set
  pathBoost: number;       // boost contribution from shared path prefixes (depth-decayed)
  boostScore: number;      // hostnameBoost + pathBoost, clamped to [minBoost, maxBoost]
  boostedWeight: number;   // originalWeight + boostScore; used to rank URLs (higher = visit first)
}
// OpenAI API Types
export interface Model {
@@ -190,6 +201,7 @@ export type ResponseFormat = {
type: 'json_schema' | 'json_object';
json_schema?: any;
}
export interface ChatCompletionRequest {
model: string;
messages: Array<CoreUserMessage | CoreAssistantMessage>;

View File

@@ -160,4 +160,40 @@ export function getI18nText(key: string, lang = 'en', params: Record<string, str
}
return text;
}
}
/**
 * Merge two strings, collapsing any overlap between the end of the first
 * and the start of the second (e.g. "hello wor" + "world!" -> "hello world!").
 * If one string fully contains the other, the containing string is returned.
 *
 * @param str1 - first string (its suffix may overlap str2's prefix)
 * @param str2 - second string
 * @returns the merged string with the longest suffix/prefix overlap removed
 */
export function smartMergeStrings(str1: string, str2: string): string {
  // An empty input contributes nothing — return the other string.
  if (!str1) return str2;
  if (!str2) return str1;

  // Full containment: the containing string already carries all content.
  if (str1.includes(str2)) return str1;
  if (str2.includes(str1)) return str2;

  // Scan from the largest feasible overlap down to 1, looking for the
  // longest suffix of str1 that equals a prefix of str2.
  const limit = Math.min(str1.length, str2.length);
  let overlap = 0;
  for (let k = limit; k > 0; k--) {
    if (str1.endsWith(str2.slice(0, k))) {
      overlap = k;
      break;
    }
  }

  // Drop the duplicated prefix from str2; plain concatenation when no overlap.
  return overlap > 0 ? str1 + str2.slice(overlap) : str1 + str2;
}

View File

@@ -1,4 +1,4 @@
import {SearchResult} from "../types";
import {BoostedSearchResult, SearchResult} from "../types";
export function normalizeUrl(urlString: string, debug = false): string {
if (!urlString?.trim()) {
@@ -101,4 +101,110 @@ export function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedU
return Object.entries(allURLs)
.filter(([url]) => !visitedURLs.includes(url))
.map(([, result]) => result);
}
}
// Split a URL string into hostname and path. Malformed URLs are logged and
// mapped to empty parts so callers can treat them as no-ops instead of throwing.
const extractUrlParts = (urlStr: string) => {
  try {
    const url = new URL(urlStr);
    return {
      hostname: url.hostname,
      path: url.pathname
    };
  } catch (e) {
    console.error(`Error parsing URL: ${urlStr}`, e);
    return {hostname: "", path: ""};
  }
};

// Tally how often each hostname and each cumulative path prefix appears
// across the given results. Items without a truthy `url` field (including
// the `link`-shaped SearchResult variant) are skipped, matching the
// behavior callers rely on — upstream code normalizes results to `url`.
const countUrlParts = (urlItems: SearchResult[]) => {
  const hostnameCount: Record<string, number> = {};
  const pathPrefixCount: Record<string, number> = {};
  let totalUrls = 0;

  urlItems.forEach(item => {
    // Narrow the union properly instead of casting: only url-shaped,
    // non-empty entries participate in counting.
    if (!item || !('url' in item) || !item.url) return;
    totalUrls++;
    const {hostname, path} = extractUrlParts(item.url);

    hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;

    // Count every cumulative prefix: /a/b/c contributes /a, /a/b, /a/b/c.
    const pathSegments = path.split('/').filter(segment => segment.length > 0);
    pathSegments.forEach((_, index) => {
      const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
      pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
    });
  });

  return {hostnameCount, pathPrefixCount, totalUrls};
};

// Normalized frequency in [0, 1]; 0 when there is nothing to normalize by.
const normalizeCount = (count: number, total: number): number => {
  return total > 0 ? count / total : 0;
};
/**
 * Rank search results by boosting URLs whose hostname and path prefixes
 * occur frequently in the result set: many hits on the same host/section
 * suggest that section is relevant, so its URLs should be visited first.
 *
 * Items without a parsable `url` pass through unchanged (and are logged),
 * preserving array length and order for callers.
 *
 * @param urlItems - search results to score
 * @param options - optional tuning knobs (see defaults below)
 * @returns the input items, url-shaped ones augmented with boost fields
 */
export const calculateBoostedWeights = (
  urlItems: SearchResult[],
  options: {
    hostnameBoostFactor?: number;
    pathBoostFactor?: number;
    decayFactor?: number;
    minBoost?: number;
    maxBoost?: number;
  } = {}
): (SearchResult | BoostedSearchResult)[] => {
  const {
    hostnameBoostFactor = 0.7, // weight of hostname frequency in the boost
    pathBoostFactor = 0.4,     // weight of path-prefix frequency in the boost
    decayFactor = 0.8,         // geometric decay per extra path-segment depth
    minBoost = 0,              // lower clamp for the combined boost score
    maxBoost = 5               // upper clamp for the combined boost score
  } = options;

  // Tally hostname / path-prefix frequencies across the whole set first.
  const {hostnameCount, pathPrefixCount, totalUrls} = countUrlParts(urlItems);

  return urlItems.map(item => {
    // Narrow the union instead of casting; skip entries we cannot score.
    if (!item || !('url' in item) || !item.url) {
      console.error('Skipping invalid item:', item);
      return item; // returned unchanged, as before
    }

    const {hostname, path} = extractUrlParts(item.url);

    // Base weight from the original result; missing weight counts as 1.
    const originalWeight = item.weight || 1.0;

    // Hostname boost: share of results living on the same host.
    const hostnameBoost =
      normalizeCount(hostnameCount[hostname] || 0, totalUrls) * hostnameBoostFactor;

    // Path boost: every shared prefix contributes, deeper segments decayed.
    let pathBoost = 0;
    const pathSegments = path.split('/').filter(segment => segment.length > 0);
    pathSegments.forEach((_, index) => {
      const prefix = '/' + pathSegments.slice(0, index + 1).join('/');
      const prefixFreq = normalizeCount(pathPrefixCount[prefix] || 0, totalUrls);
      pathBoost += prefixFreq * Math.pow(decayFactor, index) * pathBoostFactor;
    });

    // Clamp the combined boost, then add it on top of the original weight.
    const boostScore = Math.min(Math.max(hostnameBoost + pathBoost, minBoost), maxBoost);
    const boostedWeight = originalWeight + boostScore;

    return {
      ...item,
      originalWeight,
      hostnameBoost,
      pathBoost,
      boostScore,
      boostedWeight
    } as BoostedSearchResult;
  });
};