fix: normalize url

This commit is contained in:
Han Xiao 2025-03-15 13:55:59 +08:00
parent 92d3aaae9e
commit f9cb542dd0
3 changed files with 100 additions and 70 deletions

View File

@ -17,7 +17,7 @@ import {
SearchResult,
EvaluationType,
BoostedSearchSnippet,
SearchSnippet, EvaluationResponse
SearchSnippet, EvaluationResponse, Reference
} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
@ -238,6 +238,31 @@ function updateContext(step: any) {
allContext.push(step)
}
/**
 * Normalizes the references attached to an answer step, in place.
 *
 * Every reference that carries a URL is passed through `normalizeUrl`; references
 * whose URL cannot be normalized are dropped. Each surviving reference gets the
 * title stored in `allURLs` under its normalized URL (or '' when there is none),
 * and any reference still lacking a `dateTime` has it filled from `getLastModified`.
 *
 * After this call `thisStep.references` is always an array (possibly empty), even
 * when the step arrived without any references, so callers may read `.length`
 * without a guard.
 *
 * @param thisStep The answer action whose `references` are rewritten.
 * @param allURLs Known snippets, looked up by normalized URL to recover titles.
 */
async function updateReferences(thisStep: AnswerAction, allURLs: Record<string, SearchSnippet>) {
  thisStep.references = (thisStep.references ?? [])
    .filter(ref => ref?.url)
    .flatMap(ref => {
      const normalizedUrl = normalizeUrl(ref.url);
      // normalizeUrl yields a falsy value for URLs it cannot parse; drop those references.
      if (!normalizedUrl) return [];
      return [{
        exactQuote: ref?.exactQuote || '',
        title: allURLs[normalizedUrl]?.title || '',
        url: normalizedUrl,
        dateTime: ref?.dateTime || ''
      }];
    });

  // Look up the missing dates concurrently; each lookup writes back onto its own reference.
  await Promise.all(thisStep.references
    .filter(ref => !ref.dateTime)
    .map(async ref => {
      ref.dateTime = await getLastModified(ref.url) || '';
    }));

  console.log('Updated references:', thisStep.references);
}
async function executeSearchQueries(
keywordsQueries: any[],
context: TrackerContext,
@ -297,12 +322,19 @@ async function executeSearchQueries(
await sleep(STEP_SLEEP);
}
const minResults: SearchSnippet[] = (results).map(r => ({
title: r.title,
url: normalizeUrl('url' in r ? r.url : r.link),
description: 'description' in r ? r.description : r.snippet,
weight: 1
}));
const minResults: SearchSnippet[] = results
.map(r => {
const url = normalizeUrl('url' in r ? r.url : r.link);
if (!url) return null; // Skip invalid URLs
return {
title: r.title,
url,
description: 'description' in r ? r.description : r.snippet,
weight: 1
};
})
.filter(Boolean) as SearchSnippet[]; // Filter out null entries and assert type
minResults.forEach(r => {
addToAllURLs(r, allURLs);
@ -477,24 +509,7 @@ export async function getResponse(question?: string,
// execute the step and action
if (thisStep.action === 'answer' && thisStep.answer) {
// normalize all references urls, add title to it
thisStep.references = thisStep.references?.filter(ref => ref?.url && typeof ref.url === 'string' && ref.url.startsWith('http'))
.map(ref => {
const normalizedUrl = ref?.url ? normalizeUrl(ref.url) : '';
return {
exactQuote: ref?.exactQuote || '',
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
url: normalizedUrl,
dateTime: ref?.dateTime || ''
}
});
// parallel process guess all url datetime
await Promise.all(thisStep.references.filter(ref => !(ref?.dateTime))
.map(async ref => {
ref.dateTime = await getLastModified(ref.url) || ''
}));
console.log('Updated references:', thisStep.references)
await updateReferences(thisStep, allURLs)
if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) {
// LLM is so confident and answer immediately, skip all evaluations
@ -746,9 +761,8 @@ You decided to think out of the box or cut from a completely different angle.
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
// normalize URLs
thisStep.URLTargets = thisStep.URLTargets
.filter(url => url.startsWith('http'))
.map(url => normalizeUrl(url))
.filter(url => !visitedURLs.includes(url));
.filter(url => url && !visitedURLs.includes(url)) as string[];
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
@ -883,6 +897,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
think: result.object.think,
...result.object[result.object.action]
} as AnswerAction;
await updateReferences(thisStep, allURLs);
(thisStep as AnswerAction).isFinal = true;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
}

View File

@ -17,7 +17,7 @@ export function buildMdFromAnswer(answer: AnswerAction) {
const citation = `[^${i + 1}]: ${cleanQuote}`;
if (!ref.url?.startsWith('http')) return citation;
if (!ref.url) return citation;
const domainName = new URL(ref.url).hostname.replace('www.', '');
return `${citation} [${domainName}](${ref.url})`;

View File

@ -11,18 +11,19 @@ export function normalizeUrl(urlString: string, debug = false, options = {
removeSessionIDs: true,
removeUTMParams: true,
removeTrackingParams: true
}): string {
if (!urlString?.trim()) {
throw new Error('Empty URL');
}
urlString = urlString.trim();
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
}
}) {
try {
urlString = urlString.replace(/\s+/g, '').trim();
if (!urlString?.trim()) {
throw new Error('Empty URL');
}
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
}
const url = new URL(urlString);
url.hostname = url.hostname.toLowerCase();
@ -71,7 +72,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
// Remove session IDs
if (options.removeSessionIDs &&
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
return false;
}
@ -82,7 +83,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
// Remove common tracking parameters
if (options.removeTrackingParams &&
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
return false;
}
@ -132,13 +133,14 @@ export function normalizeUrl(urlString: string, debug = false, options = {
return normalizedUrl;
} catch (error) {
// Main URL parsing error - this one we should throw
throw new Error(`Invalid URL "${urlString}": ${error}`);
console.error(`Invalid URL "${urlString}": ${error}`);
return;
}
}
/**
 * Returns the snippets from `allURLs` whose key has not been visited yet.
 *
 * @param allURLs Snippets keyed by URL.
 * @param visitedURLs URLs to exclude; compared against the keys of `allURLs` by exact string match.
 * @returns The remaining snippets, in the iteration order of `Object.entries(allURLs)`.
 */
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
  // A Set makes each membership check constant-time instead of rescanning the visited list.
  const visited = new Set(visitedURLs);
  return Object.entries(allURLs)
    .filter(([url]) => !visited.has(url))
    .map(([, result]) => result);
}
@ -270,11 +272,12 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
const nURL = normalizeUrl(r.url);
if (!nURL) return;
if (!allURLs[nURL]) {
allURLs[nURL] = r;
allURLs[nURL].weight = weightDelta;
} else {
(allURLs[nURL].weight as number)+= weightDelta;
(allURLs[nURL].weight as number) += weightDelta;
const curDesc = allURLs[nURL].description;
allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
}
@ -337,8 +340,6 @@ export function sampleMultinomial<T>(items: [T, number][]): T | null {
}
/**
* Fetches the last modified date for a URL using the datetime detection API
* @param url The URL to check for last modified date
@ -370,22 +371,22 @@ export async function getLastModified(url: string): Promise<string | undefined>
/**
 * Limits how many results are kept for any single hostname.
 *
 * Results are scanned in order; a result is kept while fewer than `k` results with
 * the same hostname (as reported by `extractUrlParts`) have been kept so far.
 *
 * @param results Candidate snippets, in the order they should be preferred.
 * @param k Maximum number of results to keep per hostname.
 * @returns A new array with at most `k` results per hostname, in the original order.
 */
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number): BoostedSearchSnippet[] => {
  // Number of results already kept for each hostname.
  const keptPerHostname = new Map<string, number>();

  return results.filter((result) => {
    const hostname = extractUrlParts(result.url).hostname;
    const kept = keptPerHostname.get(hostname) ?? 0;
    if (kept < k) {
      keptPerHostname.set(hostname, kept + 1);
      return true;
    }
    return false;
  });
};
export async function processURLs(
@ -396,10 +397,10 @@ export async function processURLs(
visitedURLs: string[],
schemaGen: Schemas,
question: string
): Promise<{urlResults: any[], success: boolean}> {
): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process
if (urls.length === 0) {
return { urlResults: [], success: false };
return {urlResults: [], success: false};
}
// Track the reading action
@ -414,7 +415,14 @@ export async function processURLs(
const urlResults = await Promise.all(
urls.map(async url => {
try {
url = normalizeUrl(url);
const normalizedUrl = normalizeUrl(url);
if (!normalizedUrl) {
return null;
}
// Store normalized URL for consistent reference
url = normalizedUrl;
const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response;
const guessedTime = await getLastModified(url);
@ -422,7 +430,6 @@ export async function processURLs(
console.log('Guessed time for', url, guessedTime);
}
// Early return if no valid data
if (!data?.url || !data?.content) {
throw new Error('No content found');
@ -434,18 +441,20 @@ export async function processURLs(
answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
references: [data.url],
type: 'url',
updated: guessedTime? formatDateBasedOnType(new Date(guessedTime), 'full'): undefined
updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
});
// Process page links
data.links?.forEach(link => {
const nnUrl = normalizeUrl(link[1]);
if (!nnUrl) return;
const r: SearchSnippet = {
title: link[0],
url: normalizeUrl(link[1]),
url: nnUrl,
description: link[0],
}
// in-page link has lower initial weight comparing to search links
if (r.url && r.url.startsWith('http')) {
if (r.url) {
addToAllURLs(r, allURLs, 0.1);
}
});
@ -455,13 +464,19 @@ export async function processURLs(
console.error('Error reading URL:', url, error);
return null;
} finally {
visitedURLs.push(url);
// Only add valid URLs to visitedURLs list
if (url && typeof url === 'string') {
visitedURLs.push(url);
}
}
})
).then(results => results.filter(Boolean));
);
// Filter out null results without changing the original array
const validResults = urlResults.filter(Boolean);
return {
urlResults,
success: urlResults.length > 0
urlResults: validResults,
success: validResults.length > 0
};
}