feat: improve url ranking, fix eval bugs

This commit is contained in:
Han Xiao 2025-03-06 14:17:56 +08:00
parent d7f2ea69a5
commit d9bfc2fd1f
8 changed files with 386 additions and 263 deletions

View File

@ -11,7 +11,15 @@ import {evaluateAnswer, evaluateQuestion} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
import {StepAction, AnswerAction, KnowledgeItem, SearchResult, EvaluationType, BoostedSearchResult} from "./types";
import {
StepAction,
AnswerAction,
KnowledgeItem,
SearchResult,
EvaluationType,
BoostedSearchSnippet,
SearchSnippet, EvaluationResponse
} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
// import {grounding} from "./tools/grounding";
@ -19,8 +27,15 @@ import {zodToJsonSchema} from "zod-to-json-schema";
import {ObjectGeneratorSafe} from "./utils/safe-generator";
import {CodeSandbox} from "./tools/code-sandbox";
import {serperSearch} from './tools/serper-search';
import {calculateBoostedWeights, countUrlParts, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools";
import {
addToAllURLs,
rankURLs,
countUrlParts,
getUnvisitedURLs,
normalizeUrl, sampleMultinomial,
weightedURLToString
} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
async function sleep(ms: number) {
@ -41,7 +56,7 @@ function getPrompt(
allowCoding: boolean = true,
badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[],
knowledge?: KnowledgeItem[],
allURLs?: BoostedSearchResult[],
allURLs?: BoostedSearchSnippet[],
beastMode?: boolean,
): string {
const sections: string[] = [];
@ -130,14 +145,7 @@ ${learnedStrategy}
// Build actions section
if (allowRead) {
let urlList = '';
if (allURLs && allURLs.length > 0) {
urlList = (allURLs)
.filter(r => 'url' in r)
.sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0))
.map(r => ` + weight: ${r.boostedWeight.toFixed(2)} "${r.url}": "${r.title}"`)
.join('\n');
}
const urlList = weightedURLToString(allURLs || [], 20);
actionSections.push(`
<action-visit>
@ -276,7 +284,7 @@ export async function getResponse(question?: string,
const badContext = [];
let diaryContext = [];
let weightedURLs: BoostedSearchResult[] = [];
let weightedURLs: BoostedSearchSnippet[] = [];
let allowAnswer = true;
let allowSearch = true;
let allowRead = true;
@ -285,7 +293,7 @@ export async function getResponse(question?: string,
let system = '';
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
const allURLs: Record<string, SearchResult> = {};
const allURLs: Record<string, SearchSnippet> = {};
const visitedURLs: string[] = [];
const evaluationMetrics: Record<string, EvaluationType[]> = {};
// reserve the 10% final budget for the beast mode
@ -319,10 +327,12 @@ export async function getResponse(question?: string,
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
if (allURLs && Object.keys(allURLs).length > 0) {
// rerank urls
weightedURLs = calculateBoostedWeights(getUnvisitedURLs(allURLs, visitedURLs));
weightedURLs = rankURLs(getUnvisitedURLs(allURLs, visitedURLs), {
question: currentQuestion
}, context);
}
allowSearch = allowSearch && (weightedURLs.length < 70); // disable search when too many urls already
// allowSearch = allowSearch && (weightedURLs.length < 70); // disable search when too many urls already
// generate prompt for this step
system = getPrompt(
@ -367,8 +377,20 @@ export async function getResponse(question?: string,
// execute the step and action
if (thisStep.action === 'answer') {
if (step === 1) {
// normalize all references urls, add title to it
thisStep.references = thisStep.references?.filter(ref => ref?.url && typeof ref.url === 'string' && ref.url.startsWith('http'))
.map(ref => {
const normalizedUrl = ref?.url ? normalizeUrl(ref.url) : '';
return {
exactQuote: ref?.exactQuote || '',
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
url: normalizedUrl
}
});
if (step === 1 && thisStep.references.length === 0) {
// LLM is so confident and answer immediately, skip all evaluations
// however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"
thisStep.isFinal = true;
break
}
@ -379,23 +401,18 @@ export async function getResponse(question?: string,
...thisStep,
});
// normalize all references urls, add title to it
thisStep.references = thisStep.references?.filter(ref => ref.url.startsWith('http')).map(ref => {
return {
exactQuote: ref.exactQuote,
title: allURLs[ref.url]?.title,
url: ref.url ? normalizeUrl(ref.url) : ''
}
});
context.actionTracker.trackThink('eval_first', SchemaGen.languageCode)
console.log(currentQuestion, evaluationMetrics[currentQuestion])
const evaluation = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion],
context,
visitedURLs,
SchemaGen
);
let evaluation: EvaluationResponse = {pass: true, think: ''};
if (evaluationMetrics[currentQuestion].length > 0) {
context.actionTracker.trackThink('eval_first', SchemaGen.languageCode)
evaluation = await evaluateAnswer(currentQuestion, thisStep,
evaluationMetrics[currentQuestion],
context,
visitedURLs,
SchemaGen
) || evaluation;
}
if (currentQuestion.trim() === question) {
if (evaluation.pass) {
@ -522,9 +539,6 @@ But then you realized you have asked them before. You decided to think out of
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
// rewrite queries
console.log(countUrlParts(weightedURLs).hostnameCount)
const topHosts = Object.entries(countUrlParts(weightedURLs).hostnameCount).sort((a, b) => b[1] - a[1]).map(([host]) => host).slice(0, 2);
console.log(topHosts)
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context, SchemaGen);
// avoid existing searched queries
keywordsQueries = chooseK((await dedupQueries(keywordsQueries, allKeywords, context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
@ -540,8 +554,19 @@ But then you realized you have asked them before. You decided to think out of
try {
let siteQuery = query
if (topHosts.length > 0) {
siteQuery = query + ' site:' + chooseK(topHosts, 1)[0];
if (allURLs && Object.keys(allURLs).length > 0) {
// rerank urls
weightedURLs = rankURLs(getUnvisitedURLs(allURLs, visitedURLs), {
question: currentQuestion
}, context);
}
const topHosts = Object.entries(countUrlParts(weightedURLs).hostnameCount).sort((a, b) => b[1] - a[1]);
console.log(topHosts)
if (topHosts.length > 0 && Math.random() < 0.6 && !query.includes('site:')) {
// explore-exploit
siteQuery = query + ' site:' + sampleMultinomial(topHosts);
console.log('Site query:', siteQuery)
}
switch (SEARCH_PROVIDER) {
case 'jina':
@ -569,22 +594,15 @@ But then you realized you have asked them before. You decided to think out of
await sleep(STEP_SLEEP)
}
const minResults = (results).map(r => ({
const minResults: SearchSnippet[] = (results).map(r => ({
title: r.title,
url: normalizeUrl('url' in r ? r.url : r.link),
description: 'description' in r ? r.description : r.snippet,
weight: 1
}));
minResults.forEach(r => {
if (!allURLs[r.url]) {
allURLs[r.url] = r;
allURLs[r.url].weight = 1;
} else {
(allURLs[r.url].weight as number)++;
const curDesc = (allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description;
(allURLs[r.url] as { title: string; url: string; description: string; weight?: number }).description = smartMergeStrings(curDesc, r.description);
}
addToAllURLs(r, allURLs);
});
allKeywords.push(query);
@ -632,6 +650,7 @@ You decided to think out of the box or cut from a completely different angle.
.filter(url => url.startsWith('http'))
.map(url => normalizeUrl(url))
.filter(url => !visitedURLs.includes(url));
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
const uniqueURLs = thisStep.URLTargets;
@ -659,6 +678,15 @@ You decided to think out of the box or cut from a completely different angle.
updated: new Date().toISOString()
});
data.links?.forEach(link => {
const r: SearchSnippet = {
title: link[0],
url: normalizeUrl(link[1]),
description: link[0]
}
addToAllURLs(r, allURLs);
})
return {url, result: response};
} catch (error) {
console.error('Error reading URL:', error);
@ -740,11 +768,11 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
}
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, weightedURLs}, totalStep);
await sleep(STEP_SLEEP);
}
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, weightedURLs}, totalStep);
if (!(thisStep as AnswerAction).isFinal) {
console.log('Enter Beast mode!!!')
// any answer is better than no answer, humanity last resort
@ -784,7 +812,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
(thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction))
console.log(thisStep)
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, allURLs}, totalStep);
await storeContext(system, schema, {allContext, allKeywords, allQuestions, allKnowledge, weightedURLs}, totalStep);
return {
result: thisStep,
context,
@ -799,11 +827,11 @@ async function storeContext(prompt: string, schema: any, memory: {
allKeywords: string[];
allQuestions: string[];
allKnowledge: KnowledgeItem[];
allURLs: Record<string, SearchResult>;
weightedURLs: BoostedSearchSnippet[];
}
, step: number) {
const {allContext, allKeywords, allQuestions, allKnowledge, allURLs} = memory;
const {allContext, allKeywords, allQuestions, allKnowledge, weightedURLs} = memory;
if ((process as any).asyncLocalContext?.available?.()) {
(process as any).asyncLocalContext.ctx.promptContext = {
@ -830,7 +858,7 @@ ${JSON.stringify(zodToJsonSchema(schema), null, 2)}
await fs.writeFile('queries.json', JSON.stringify(allKeywords, null, 2));
await fs.writeFile('questions.json', JSON.stringify(allQuestions, null, 2));
await fs.writeFile('knowledge.json', JSON.stringify(allKnowledge, null, 2));
await fs.writeFile('urls.json', JSON.stringify(calculateBoostedWeights(Object.entries(allURLs).map(([, result]) => result)), null, 2));
await fs.writeFile('urls.json', JSON.stringify(weightedURLs, null, 2));
} catch (error) {
console.error('Context storage failed:', error);
}

View File

@ -26,85 +26,14 @@ answer: ${JSON.stringify(answer)}
function getAttributionPrompt(question: string, answer: string, sourceContent: string): PromptPair {
return {
system: `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided sources.
<rules>
1. Source Verification:
- Check if answer claims are supported by the provided source content
- Verify that quotes are accurate and in proper context
- Ensure numerical data and statistics match the source
- Flag any claims that go beyond what the sources support
2. Attribution Analysis:
- Check if answer properly references its sources
- Verify that important claims have clear source attribution
- Ensure quotes are properly marked and cited
- Check for any unsupported generalizations
3. Accuracy Requirements:
- Direct quotes must match source exactly
- Paraphrasing must maintain original meaning
- Statistics and numbers must be precise
- Context must be preserved
</rules>
<examples>
Question: "What are Jina AI's main products?"
Answer: "According to Jina AI's website, their main products are DocArray and Jina Framework."
Source Content: "Jina AI's flagship products include DocArray, Jina Framework, and JCloud, offering a complete ecosystem for neural search applications."
Evaluation: {
"think": "The answer omits JCloud which is mentioned as a main product in the source. The information provided is incomplete and potentially misleading as it fails to mention a significant product from the company's ecosystem.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": false,
"quotes_accurate": false
}
"pass": false,
}
Question: "When was Python first released?"
Answer: "Python was first released in 1991 by Guido van Rossum."
Source Content: "Python was first released in 1991 by Guido van Rossum while working at CWI."
Evaluation: {
"think": "The answer accurately reflects the core information from the source about Python's release date and creator, though it omits the additional context about CWI which isn't essential to the question.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
Question: "长城是什么时候建造的?"
Answer: "长城始建于公元前7世纪但现存的大部分长城是明朝时期修建的。"
Source Content: "中国长城始建于公元前7世纪的春秋战国时期历经多个朝代修建和扩展但现存的大部分长城是明朝1368-1644年时期修建的。"
Evaluation: {
"think": "这个回答准确地反映了原文中关于长城建造时间的核心信息包括最初的建造时期和现存长城的主要来源。虽然省略了具体的年份范围1368-1644年但这对回答问题的核心内容不是必要的。",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
Question: "Wann wurde die Berliner Mauer gebaut?"
Answer: "Die Berliner Mauer wurde am 13. August 1961 errichtet."
Source Content: "Die Berliner Mauer wurde am 13. August 1961 von der DDR-Regierung errichtet und fiel am 9. November 1989."
Evaluation: {
"think": "Die Antwort gibt das korrekte Datum des Mauerbaus wieder, wie in der Quelle angegeben. Der zusätzliche Kontext über den Fall der Mauer wurde weggelassen, da er für die spezifische Frage nach dem Bauzeitpunkt nicht wesentlich ist.",
"attribution_analysis": {
"sources_provided": true,
"sources_verified": true,
"quotes_accurate": true
}
"pass": true,
}
</examples>`,
system: `You are an evaluator that verifies if answer content is properly attributed to and supported by the provided context.`,
user: `
Context: ${sourceContent}
Question: ${question}
Answer: ${answer}
Source Content: ${sourceContent}`
Let me think
`
}
}
@ -739,13 +668,14 @@ export async function evaluateAnswer(
);
// fail one, return immediately
if (!(result?.object as EvaluationResponse).pass) {
return (result.object as EvaluationResponse);
if (!(result?.object as EvaluationResponse)?.pass) {
return result?.object as EvaluationResponse;
}
}
}
return (result!.object as EvaluationResponse);
return result?.object as EvaluationResponse;
}
// Helper function to fetch and combine source content

View File

@ -167,8 +167,8 @@ export async function dedupQueries(
// Track token usage from the API
(tracker || new TokenTracker()).trackUsage('dedup', {
promptTokens: tokens,
completionTokens: 0,
promptTokens: 0,
completionTokens: tokens,
totalTokens: tokens
});
console.log('Dedup:', uniqueQueries);

84
src/tools/jina-rerank.ts Normal file
View File

@ -0,0 +1,84 @@
import axios from 'axios';
import {TokenTracker} from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
const JINA_API_URL = 'https://api.jina.ai/v1/rerank';
// Types for Jina Rerank API
// Request payload sent to the Jina rerank endpoint.
interface JinaRerankRequest {
model: string; // reranker model identifier, e.g. 'jina-reranker-v2-base-multilingual'
query: string; // the query that documents are ranked against
top_n: number; // how many top-scoring documents the API should return
documents: string[]; // candidate documents to be reranked
}
// Response payload returned by the Jina rerank endpoint.
interface JinaRerankResponse {
model: string;
results: Array<{
index: number; // index of the document in the original request array
document: {
text: string;
};
relevance_score: number; // higher score means more relevant to the query
}>;
usage: {
total_tokens: number; // tokens consumed by this rerank call
};
}
/**
 * Reranks a list of documents based on relevance to a query.
 *
 * Best-effort: any failure (missing API key, network/API error) is logged
 * and an empty result list is returned instead of throwing, so callers can
 * treat reranking as an optional boost rather than a hard dependency.
 *
 * @param query The query to rank documents against
 * @param documents Array of documents to be ranked
 * @param tracker Optional token tracker for usage monitoring
 * @returns Array of reranked documents with their scores; empty on error or empty input
 */
export async function rerankDocuments(
  query: string,
  documents: string[],
  tracker?: TokenTracker
): Promise<{ results: Array<{ index: number, relevance_score: number, document: { text: string } }> }> {
  // Nothing to rank; skip the API round-trip entirely.
  if (!documents || documents.length === 0) {
    return {results: []};
  }
  try {
    if (!JINA_API_KEY) {
      throw new Error('JINA_API_KEY is not set');
    }

    const request: JinaRerankRequest = {
      model: 'jina-reranker-v2-base-multilingual',
      query,
      top_n: documents.length, // keep every document; callers apply their own cutoff
      documents
    };

    const response = await axios.post<JinaRerankResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );

    // Track token usage as reported by the API.
    (tracker || new TokenTracker()).trackUsage('rerank', {
      promptTokens: response.data.usage.total_tokens,
      completionTokens: 0,
      totalTokens: response.data.usage.total_tokens
    });

    console.log('Rerank results:', response.data.results);

    return {
      results: response.data.results
    };
  } catch (error) {
    console.error('Error in reranking documents:', error);
    // Return empty results if there is an error
    return {
      results: []
    };
  }
}

View File

@ -23,7 +23,7 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
'Content-Type': 'application/json',
'Content-Length': data.length,
'X-Retain-Images': 'none',
'X-Engine': 'direct'
'X-With-Links-Summary': 'all'
}
};

View File

@ -128,6 +128,7 @@ export interface ReadResponse {
url: string;
content: string;
usage: { tokens: number; };
links: Array<[string, string]>; // [anchor, url]
};
name?: string;
message?: string;
@ -147,11 +148,7 @@ export type EvaluationResponse = {
minimum_count_required: number;
actual_count_provided: number;
};
attribution_analysis?: {
sources_provided: boolean,
sources_verified: boolean,
quotes_accurate: boolean,
};
exactQuote? : string;
completeness_analysis?: {
aspects_expected: string,
aspects_provided: string,
@ -171,20 +168,19 @@ export type ErrorAnalysisResponse = {
questionsToAnswer: string[];
};
export type SearchResult =
| { title: string; url: string; description: string; weight?: number }
| SearchSnippet
| { title: string; link: string; snippet: string; weight?: number };
export type BoostedSearchResult = {
title: string;
url: string;
description: string;
weight: number;
originalWeight: number;
export type SearchSnippet = { title: string; url: string; description: string; weight?: number }
export type BoostedSearchSnippet = SearchSnippet & {
freqBoost: number;
hostnameBoost: number;
pathBoost: number;
boostScore: number;
boostedWeight: number;
jinaRerankBoost: number;
finalScore: number;
}
// OpenAI API Types

View File

@ -168,11 +168,7 @@ export class Schemas {
return z.object({
type: z.literal('attribution'),
...baseSchemaBefore,
attribution_analysis: z.object({
sources_provided: z.boolean().describe('Whether the answer provides source references'),
sources_verified: z.boolean().describe('Whether the provided sources contain the claimed information'),
quotes_accurate: z.boolean().describe('Whether the quotes accurately represent the source content')
}),
exactQuote: z.string().describe('Exact relevant quote and evidence from the source that strongly support the answer and justify this question-answer pair').max(200).optional(),
...baseSchemaAfter
});
case "completeness":
@ -223,7 +219,7 @@ export class Schemas {
references: z.array(
z.object({
exactQuote: z.string().describe("Exact relevant quote from the document, must be a soundbite, short and to the point, no fluff").max(30),
url: z.string().describe("source URL; must be directly from the context").max(100),
url: z.string().describe("source URL; must be copy directly from existing knowledge real URLs, avoid example.com or any placeholder fake URLs").max(100),
dateTime: z.string().describe("Apply this evidence hierarchy to determine the source timestamp: (1) Explicit dates in metadata/content, (2) Internal time references, (3) Contextual clues, (4) Version history if available. Format as YYYY-MM-DD when possible; otherwise provide narrowest defensible range with confidence level (High/Medium/Low).").max(16),
}).required()
).describe("Required when action='answer'. Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document"),

View File

@ -1,106 +1,108 @@
import {BoostedSearchResult, SearchResult} from "../types";
import {BoostedSearchSnippet, SearchResult, SearchSnippet, TrackerContext} from "../types";
import {smartMergeStrings} from "./text-tools";
import {rerankDocuments} from "../tools/jina-rerank";
export function normalizeUrl(urlString: string, debug = false): string {
if (!urlString?.trim()) {
throw new Error('Empty URL');
if (!urlString?.trim()) {
throw new Error('Empty URL');
}
urlString = urlString.trim();
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
}
try {
const url = new URL(urlString);
url.hostname = url.hostname.toLowerCase();
if (url.hostname.startsWith('www.')) {
url.hostname = url.hostname.slice(4);
}
urlString = urlString.trim();
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
if ((url.protocol === 'http:' && url.port === '80') ||
(url.protocol === 'https:' && url.port === '443')) {
url.port = '';
}
try {
const url = new URL(urlString);
url.hostname = url.hostname.toLowerCase();
if (url.hostname.startsWith('www.')) {
url.hostname = url.hostname.slice(4);
}
if ((url.protocol === 'http:' && url.port === '80') ||
(url.protocol === 'https:' && url.port === '443')) {
url.port = '';
}
// Path normalization with error tracking
url.pathname = url.pathname
.split('/')
.map(segment => {
try {
return decodeURIComponent(segment);
} catch (e) {
if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
return segment;
}
})
.join('/')
.replace(/\/+/g, '/')
.replace(/\/+$/, '') || '/';
// Query parameter normalization with error details
const searchParams = new URLSearchParams(url.search);
const sortedParams = Array.from(searchParams.entries())
.map(([key, value]) => {
if (value === '') return [key, ''];
try {
const decodedValue = decodeURIComponent(value);
if (encodeURIComponent(decodedValue) === value) {
return [key, decodedValue];
}
} catch (e) {
if (debug) console.error(`Failed to decode query param ${key}=${value}`, e);
}
return [key, value];
})
.sort(([keyA], [keyB]) => keyA.localeCompare(keyB))
.filter(([key]) => key !== '');
url.search = new URLSearchParams(sortedParams).toString();
// Fragment handling with validation
if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) {
url.hash = '';
} else if (url.hash) {
try {
const decodedHash = decodeURIComponent(url.hash.slice(1));
const encodedBack = encodeURIComponent(decodedHash);
// Only use decoded version if it's safe
if (encodedBack === url.hash.slice(1)) {
url.hash = '#' + decodedHash;
}
} catch (e) {
if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
}
}
let normalizedUrl = url.toString();
// Final URL normalization with validation
// Path normalization with error tracking
url.pathname = url.pathname
.split('/')
.map(segment => {
try {
const decodedUrl = decodeURIComponent(normalizedUrl);
const encodedBack = encodeURIComponent(decodedUrl);
// Only use decoded version if it's safe
if (encodedBack === normalizedUrl) {
normalizedUrl = decodedUrl;
}
return decodeURIComponent(segment);
} catch (e) {
if (debug) console.error('Failed to decode final URL', e);
if (debug) console.error(`Failed to decode path segment: ${segment}`, e);
return segment;
}
})
.join('/')
.replace(/\/+/g, '/')
.replace(/\/+$/, '') || '/';
return normalizedUrl;
} catch (error) {
// Main URL parsing error - this one we should throw
throw new Error(`Invalid URL "${urlString}": ${error}`);
// Query parameter normalization with error details
const searchParams = new URLSearchParams(url.search);
const sortedParams = Array.from(searchParams.entries())
.map(([key, value]) => {
if (value === '') return [key, ''];
try {
const decodedValue = decodeURIComponent(value);
if (encodeURIComponent(decodedValue) === value) {
return [key, decodedValue];
}
} catch (e) {
if (debug) console.error(`Failed to decode query param ${key}=${value}`, e);
}
return [key, value];
})
.sort(([keyA], [keyB]) => keyA.localeCompare(keyB))
.filter(([key]) => key !== '');
url.search = new URLSearchParams(sortedParams).toString();
// Fragment handling with validation
if (url.hash === '#' || url.hash === '#top' || url.hash === '#/' || !url.hash) {
url.hash = '';
} else if (url.hash) {
try {
const decodedHash = decodeURIComponent(url.hash.slice(1));
const encodedBack = encodeURIComponent(decodedHash);
// Only use decoded version if it's safe
if (encodedBack === url.hash.slice(1)) {
url.hash = '#' + decodedHash;
}
} catch (e) {
if (debug) console.error(`Failed to decode fragment: ${url.hash}`, e);
}
}
let normalizedUrl = url.toString();
// Final URL normalization with validation
try {
const decodedUrl = decodeURIComponent(normalizedUrl);
const encodedBack = encodeURIComponent(decodedUrl);
// Only use decoded version if it's safe
if (encodedBack === normalizedUrl) {
normalizedUrl = decodedUrl;
}
} catch (e) {
if (debug) console.error('Failed to decode final URL', e);
}
return normalizedUrl;
} catch (error) {
// Main URL parsing error - this one we should throw
throw new Error(`Invalid URL "${urlString}": ${error}`);
}
}
export function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedURLs: string[]): SearchResult[] {
return Object.entries(allURLs)
.filter(([url]) => !visitedURLs.includes(url))
.map(([, result]) => result);
// Collect the snippets whose URL has not been visited yet.
export function getUnvisitedURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
  const visited = new Set(visitedURLs);
  const unvisited: SearchSnippet[] = [];
  for (const [url, snippet] of Object.entries(allURLs)) {
    if (!visited.has(url)) {
      unvisited.push(snippet);
    }
  }
  return unvisited;
}
@ -114,7 +116,7 @@ const extractUrlParts = (urlStr: string) => {
};
} catch (e) {
console.error(`Error parsing URL: ${urlStr}`, e);
return { hostname: "", path: "" };
return {hostname: "", path: ""};
}
};
@ -129,7 +131,7 @@ export const countUrlParts = (urlItems: SearchResult[]) => {
if (!item || !item.url) return; // Skip invalid items
totalUrls++;
const { hostname, path } = extractUrlParts(item.url);
const {hostname, path} = extractUrlParts(item.url);
// Count hostnames
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
@ -142,7 +144,7 @@ export const countUrlParts = (urlItems: SearchResult[]) => {
});
});
return { hostnameCount, pathPrefixCount, totalUrls };
return {hostnameCount, pathPrefixCount, totalUrls};
};
// Calculate normalized frequency for boosting
@ -151,31 +153,44 @@ const normalizeCount = (count: any, total: any) => {
};
// Calculate boosted weights
export const calculateBoostedWeights = (urlItems: SearchResult[], options: any = {}): any[] => {
export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers: TrackerContext): any[] => {
// Default parameters for boosting - can be overridden
const {
hostnameBoostFactor = 0.7, // How much to boost based on hostname frequency
freqFactor = 0.5, // How much to boost based on term frequency
hostnameBoostFactor = 0.5, // How much to boost based on hostname frequency
pathBoostFactor = 0.4, // How much to boost based on path frequency
decayFactor = 0.8, // Decay factor for longer paths (0-1)
jinaRerankFactor = 0.8, // How much to boost based on Jina reranking
minBoost = 0, // Minimum boost score
maxBoost = 5 // Maximum boost score cap
maxBoost = 5, // Maximum boost score cap
question = '', // Optional question for Jina reranking
} = options;
// Count URL parts first
const counts = countUrlParts(urlItems);
const { hostnameCount, pathPrefixCount, totalUrls } = counts;
const {hostnameCount, pathPrefixCount, totalUrls} = counts;
return urlItems.map(item => {
item = (item as BoostedSearchResult)
if (question.trim().length > 0) {
// get from jina rerank
rerankDocuments(question, urlItems.map(item => smartMergeStrings(item.title, item.description)), trackers.tokenTracker)
.then(({results}) => {
results.forEach(({index, relevance_score}) => {
(urlItems[index] as BoostedSearchSnippet).jinaRerankBoost = relevance_score * jinaRerankFactor;
});
})
}
return (urlItems as BoostedSearchSnippet[]).map(item => {
if (!item || !item.url) {
console.error('Skipping invalid item:', item);
return item; // Return unchanged
}
const { hostname, path } = extractUrlParts(item.url);
const {hostname, path} = extractUrlParts(item.url);
// Base weight from original
const originalWeight = item.weight || 1.0; // Default to 1 if weight is missing
const freq = item.weight || 1.0; // Default to 1 if weight is missing
// Hostname boost (normalized by total URLs)
const hostnameFreq = normalizeCount(hostnameCount[hostname] || 0, totalUrls);
@ -194,17 +209,91 @@ export const calculateBoostedWeights = (urlItems: SearchResult[], options: any =
pathBoost += decayedBoost;
});
const freqBoost = freq / totalUrls * freqFactor;
const jinaRerankBoost = item.jinaRerankBoost || 0;
// Calculate new weight with clamping
const boostScore = Math.min(Math.max(hostnameBoost + pathBoost, minBoost), maxBoost);
const boostedWeight = originalWeight + boostScore;
const finalScore = Math.min(
Math.max(
hostnameBoost
+ pathBoost
+ freqBoost
+ jinaRerankBoost, minBoost),
maxBoost);
return {
...item,
originalWeight,
freqBoost,
hostnameBoost,
pathBoost,
boostScore,
boostedWeight
} as BoostedSearchResult;
});
};
jinaRerankBoost,
finalScore
} as BoostedSearchSnippet;
}).sort((a, b) => b.finalScore - a.finalScore);
};
// Register a search snippet in the URL map: a first sighting stores the
// snippet with weight 1; repeat sightings bump the weight and merge the
// new description into the stored one.
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>) => {
  const existing = allURLs[r.url];
  if (existing) {
    (existing.weight as number)++;
    existing.description = smartMergeStrings(existing.description, r.description);
  } else {
    allURLs[r.url] = r;
    r.weight = 1;
  }
}
/**
 * Render the top-ranked URLs as an LLM-friendly weighted bullet list.
 *
 * Each line shows the final boosted score plus the merged title/description.
 * Entries with nothing to show (empty merged text) are dropped, the rest are
 * sorted by score descending and truncated to maxURLs.
 *
 * @param allURLs ranked search snippets
 * @param maxURLs maximum number of lines to emit
 * @returns newline-joined list, or '' when there is nothing to show
 */
export const weightedURLToString = (allURLs: BoostedSearchSnippet[], maxURLs = 70) => {
  if (!allURLs || allURLs.length === 0) return '';

  return (allURLs)
    .map(r => {
      const merged = smartMergeStrings(r.title, r.description);
      return {
        url: r.url,
        score: r.finalScore,
        merged
      };
    })
    .filter(item => item.merged !== '' && item.merged !== undefined && item.merged !== null)
    // items that skipped ranking may lack finalScore; treat missing scores as 0
    .sort((a, b) => (b.score || 0) - (a.score || 0))
    .slice(0, maxURLs)
    .map(item => ` + weight: ${(item.score || 0).toFixed(2)} "${item.url}": "${item.merged}"`)
    .join('\n');
}
/**
 * Draw one sample from a multinomial (weighted categorical) distribution.
 * @param items Array of [name, weight] tuples
 * @returns A randomly selected item based on the weights, or null if array is empty
 */
export function sampleMultinomial<T>(items: [T, number][]): T | null {
  if (!items || items.length === 0) {
    return null;
  }

  let totalWeight = 0;
  for (const [, weight] of items) {
    totalWeight += weight;
  }

  // Degenerate distribution: nothing can be drawn.
  if (totalWeight === 0) {
    return null;
  }

  // Walk the cumulative weights until we pass the random threshold.
  const threshold = Math.random() * totalWeight;
  let cumulative = 0;
  for (const [item, weight] of items) {
    cumulative += weight;
    if (threshold <= cumulative) {
      return item;
    }
  }

  // Fallback for floating-point precision edge cases: return the last candidate.
  return items[items.length - 1][0];
}