fix: normalize url

This commit is contained in:
Han Xiao
2025-03-15 13:55:59 +08:00
parent 92d3aaae9e
commit f9cb542dd0
3 changed files with 100 additions and 70 deletions

View File

@@ -17,7 +17,7 @@ import {
SearchResult, SearchResult,
EvaluationType, EvaluationType,
BoostedSearchSnippet, BoostedSearchSnippet,
SearchSnippet, EvaluationResponse SearchSnippet, EvaluationResponse, Reference
} from "./types"; } from "./types";
import {TrackerContext} from "./types"; import {TrackerContext} from "./types";
import {search} from "./tools/jina-search"; import {search} from "./tools/jina-search";
@@ -238,6 +238,31 @@ function updateContext(step: any) {
allContext.push(step) allContext.push(step)
} }
/**
 * Normalizes the references attached to an answer step in place.
 *
 * For each reference with a URL: normalizes the URL, drops it when
 * normalization fails, fills in the page title from `allURLs` when known,
 * and then guesses a last-modified datetime (in parallel) for references
 * that do not already carry one.
 *
 * @param thisStep - answer action whose `references` array is rewritten in place
 * @param allURLs  - map of normalized URL -> search snippet, used to look up titles
 */
async function updateReferences(thisStep: AnswerAction, allURLs: Record<string, SearchSnippet>) {
  // Always leave `references` as an array: later code reads
  // `thisStep.references.length`, so `undefined` would crash it.
  thisStep.references = (thisStep.references ?? [])
    .filter(ref => ref?.url)
    .map((ref): Reference | null => {
      const normalizedUrl = normalizeUrl(ref.url);
      // normalizeUrl returns undefined for unparseable URLs; drop those refs
      if (!normalizedUrl) return null;
      return {
        exactQuote: ref?.exactQuote || '',
        title: allURLs[normalizedUrl]?.title || '',
        url: normalizedUrl,
        dateTime: ref?.dateTime || ''
      };
    })
    // Type predicate instead of `.filter(Boolean) as Reference[]` — the
    // compiler verifies the narrowing rather than trusting an assertion.
    .filter((ref): ref is Reference => ref !== null);

  // Guess missing datetimes for all references in parallel.
  await Promise.all(thisStep.references
    .filter(ref => !ref.dateTime)
    .map(async ref => {
      ref.dateTime = await getLastModified(ref.url) || '';
    }));

  console.log('Updated references:', thisStep.references);
}
async function executeSearchQueries( async function executeSearchQueries(
keywordsQueries: any[], keywordsQueries: any[],
context: TrackerContext, context: TrackerContext,
@@ -297,12 +322,19 @@ async function executeSearchQueries(
await sleep(STEP_SLEEP); await sleep(STEP_SLEEP);
} }
const minResults: SearchSnippet[] = (results).map(r => ({ const minResults: SearchSnippet[] = results
title: r.title, .map(r => {
url: normalizeUrl('url' in r ? r.url : r.link), const url = normalizeUrl('url' in r ? r.url : r.link);
description: 'description' in r ? r.description : r.snippet, if (!url) return null; // Skip invalid URLs
weight: 1
})); return {
title: r.title,
url,
description: 'description' in r ? r.description : r.snippet,
weight: 1
};
})
.filter(Boolean) as SearchSnippet[]; // Filter out null entries and assert type
minResults.forEach(r => { minResults.forEach(r => {
addToAllURLs(r, allURLs); addToAllURLs(r, allURLs);
@@ -477,24 +509,7 @@ export async function getResponse(question?: string,
// execute the step and action // execute the step and action
if (thisStep.action === 'answer' && thisStep.answer) { if (thisStep.action === 'answer' && thisStep.answer) {
// normalize all references urls, add title to it // normalize all references urls, add title to it
thisStep.references = thisStep.references?.filter(ref => ref?.url && typeof ref.url === 'string' && ref.url.startsWith('http')) await updateReferences(thisStep, allURLs)
.map(ref => {
const normalizedUrl = ref?.url ? normalizeUrl(ref.url) : '';
return {
exactQuote: ref?.exactQuote || '',
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
url: normalizedUrl,
dateTime: ref?.dateTime || ''
}
});
// parallel process guess all url datetime
await Promise.all(thisStep.references.filter(ref => !(ref?.dateTime))
.map(async ref => {
ref.dateTime = await getLastModified(ref.url) || ''
}));
console.log('Updated references:', thisStep.references)
if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) { if (totalStep === 1 && thisStep.references.length === 0 && !noDirectAnswer) {
// LLM is so confident and answer immediately, skip all evaluations // LLM is so confident and answer immediately, skip all evaluations
@@ -746,9 +761,8 @@ You decided to think out of the box or cut from a completely different angle.
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) { } else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
// normalize URLs // normalize URLs
thisStep.URLTargets = thisStep.URLTargets thisStep.URLTargets = thisStep.URLTargets
.filter(url => url.startsWith('http'))
.map(url => normalizeUrl(url)) .map(url => normalizeUrl(url))
.filter(url => !visitedURLs.includes(url)); .filter(url => url && !visitedURLs.includes(url)) as string[];
thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP); thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
@@ -883,6 +897,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
think: result.object.think, think: result.object.think,
...result.object[result.object.action] ...result.object[result.object.action]
} as AnswerAction; } as AnswerAction;
await updateReferences(thisStep, allURLs);
(thisStep as AnswerAction).isFinal = true; (thisStep as AnswerAction).isFinal = true;
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts}); context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
} }

View File

@@ -17,7 +17,7 @@ export function buildMdFromAnswer(answer: AnswerAction) {
const citation = `[^${i + 1}]: ${cleanQuote}`; const citation = `[^${i + 1}]: ${cleanQuote}`;
if (!ref.url?.startsWith('http')) return citation; if (!ref.url) return citation;
const domainName = new URL(ref.url).hostname.replace('www.', ''); const domainName = new URL(ref.url).hostname.replace('www.', '');
return `${citation} [${domainName}](${ref.url})`; return `${citation} [${domainName}](${ref.url})`;

View File

@@ -11,18 +11,19 @@ export function normalizeUrl(urlString: string, debug = false, options = {
removeSessionIDs: true, removeSessionIDs: true,
removeUTMParams: true, removeUTMParams: true,
removeTrackingParams: true removeTrackingParams: true
}): string { }) {
if (!urlString?.trim()) {
throw new Error('Empty URL');
}
urlString = urlString.trim();
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
}
try { try {
urlString = urlString.replace(/\s+/g, '').trim();
if (!urlString?.trim()) {
throw new Error('Empty URL');
}
if (!/^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(urlString)) {
urlString = 'https://' + urlString;
}
const url = new URL(urlString); const url = new URL(urlString);
url.hostname = url.hostname.toLowerCase(); url.hostname = url.hostname.toLowerCase();
@@ -71,7 +72,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
// Remove session IDs // Remove session IDs
if (options.removeSessionIDs && if (options.removeSessionIDs &&
/^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) { /^(s|session|sid|sessionid|phpsessid|jsessionid|aspsessionid|asp\.net_sessionid)$/i.test(key)) {
return false; return false;
} }
@@ -82,7 +83,7 @@ export function normalizeUrl(urlString: string, debug = false, options = {
// Remove common tracking parameters // Remove common tracking parameters
if (options.removeTrackingParams && if (options.removeTrackingParams &&
/^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) { /^(ref|referrer|fbclid|gclid|cid|mcid|source|medium|campaign|term|content|sc_rid|mc_[a-z]+)$/i.test(key)) {
return false; return false;
} }
@@ -132,13 +133,14 @@ export function normalizeUrl(urlString: string, debug = false, options = {
return normalizedUrl; return normalizedUrl;
} catch (error) { } catch (error) {
// Main URL parsing error - this one we should throw // Main URL parsing error - this one we should throw
throw new Error(`Invalid URL "${urlString}": ${error}`); console.error(`Invalid URL "${urlString}": ${error}`);
return;
} }
} }
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] { export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[]): SearchSnippet[] {
return Object.entries(allURLs) return Object.entries(allURLs)
.filter(([url, ]) => !visitedURLs.includes(url)) .filter(([url,]) => !visitedURLs.includes(url))
.map(([, result]) => result); .map(([, result]) => result);
} }
@@ -270,11 +272,12 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => { export const addToAllURLs = (r: SearchSnippet, allURLs: Record<string, SearchSnippet>, weightDelta = 1) => {
const nURL = normalizeUrl(r.url); const nURL = normalizeUrl(r.url);
if (!nURL) return;
if (!allURLs[nURL]) { if (!allURLs[nURL]) {
allURLs[nURL] = r; allURLs[nURL] = r;
allURLs[nURL].weight = weightDelta; allURLs[nURL].weight = weightDelta;
} else { } else {
(allURLs[nURL].weight as number)+= weightDelta; (allURLs[nURL].weight as number) += weightDelta;
const curDesc = allURLs[nURL].description; const curDesc = allURLs[nURL].description;
allURLs[nURL].description = smartMergeStrings(curDesc, r.description); allURLs[nURL].description = smartMergeStrings(curDesc, r.description);
} }
@@ -337,8 +340,6 @@ export function sampleMultinomial<T>(items: [T, number][]): T | null {
} }
/** /**
* Fetches the last modified date for a URL using the datetime detection API * Fetches the last modified date for a URL using the datetime detection API
* @param url The URL to check for last modified date * @param url The URL to check for last modified date
@@ -370,22 +371,22 @@ export async function getLastModified(url: string): Promise<string | undefined>
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => { export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
const hostnameMap: Record<string, number> = {}; const hostnameMap: Record<string, number> = {};
const filteredResults: BoostedSearchSnippet[] = []; const filteredResults: BoostedSearchSnippet[] = [];
results.forEach((result) => { results.forEach((result) => {
const hostname = extractUrlParts(result.url).hostname; const hostname = extractUrlParts(result.url).hostname;
if (hostnameMap[hostname] === undefined) { if (hostnameMap[hostname] === undefined) {
hostnameMap[hostname] = 0; hostnameMap[hostname] = 0;
} }
if (hostnameMap[hostname] < k) { if (hostnameMap[hostname] < k) {
filteredResults.push(result); filteredResults.push(result);
hostnameMap[hostname]++; hostnameMap[hostname]++;
} }
}); });
return filteredResults; return filteredResults;
} }
export async function processURLs( export async function processURLs(
@@ -396,10 +397,10 @@ export async function processURLs(
visitedURLs: string[], visitedURLs: string[],
schemaGen: Schemas, schemaGen: Schemas,
question: string question: string
): Promise<{urlResults: any[], success: boolean}> { ): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process // Skip if no URLs to process
if (urls.length === 0) { if (urls.length === 0) {
return { urlResults: [], success: false }; return {urlResults: [], success: false};
} }
// Track the reading action // Track the reading action
@@ -414,7 +415,14 @@ export async function processURLs(
const urlResults = await Promise.all( const urlResults = await Promise.all(
urls.map(async url => { urls.map(async url => {
try { try {
url = normalizeUrl(url); const normalizedUrl = normalizeUrl(url);
if (!normalizedUrl) {
return null;
}
// Store normalized URL for consistent reference
url = normalizedUrl;
const {response} = await readUrl(url, true, context.tokenTracker); const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response; const {data} = response;
const guessedTime = await getLastModified(url); const guessedTime = await getLastModified(url);
@@ -422,7 +430,6 @@ export async function processURLs(
console.log('Guessed time for', url, guessedTime); console.log('Guessed time for', url, guessedTime);
} }
// Early return if no valid data // Early return if no valid data
if (!data?.url || !data?.content) { if (!data?.url || !data?.content) {
throw new Error('No content found'); throw new Error('No content found');
@@ -434,18 +441,20 @@ export async function processURLs(
answer: await cherryPick(question, data.content, {}, context, schemaGen, url), answer: await cherryPick(question, data.content, {}, context, schemaGen, url),
references: [data.url], references: [data.url],
type: 'url', type: 'url',
updated: guessedTime? formatDateBasedOnType(new Date(guessedTime), 'full'): undefined updated: guessedTime ? formatDateBasedOnType(new Date(guessedTime), 'full') : undefined
}); });
// Process page links // Process page links
data.links?.forEach(link => { data.links?.forEach(link => {
const nnUrl = normalizeUrl(link[1]);
if (!nnUrl) return;
const r: SearchSnippet = { const r: SearchSnippet = {
title: link[0], title: link[0],
url: normalizeUrl(link[1]), url: nnUrl,
description: link[0], description: link[0],
} }
// in-page link has lower initial weight comparing to search links // in-page link has lower initial weight comparing to search links
if (r.url && r.url.startsWith('http')) { if (r.url) {
addToAllURLs(r, allURLs, 0.1); addToAllURLs(r, allURLs, 0.1);
} }
}); });
@@ -455,13 +464,19 @@ export async function processURLs(
console.error('Error reading URL:', url, error); console.error('Error reading URL:', url, error);
return null; return null;
} finally { } finally {
visitedURLs.push(url); // Only add valid URLs to visitedURLs list
if (url && typeof url === 'string') {
visitedURLs.push(url);
}
} }
}) })
).then(results => results.filter(Boolean)); );
// Filter out null results without changing the original array
const validResults = urlResults.filter(Boolean);
return { return {
urlResults, urlResults: validResults,
success: urlResults.length > 0 success: validResults.length > 0
}; };
} }