fix: add spacing for consistency in agent and url-tools

Han Xiao 2025-06-09 15:48:41 -07:00
parent b797273626
commit 2affd41c79
2 changed files with 39 additions and 30 deletions


@@ -517,6 +517,7 @@ export async function getResponse(question?: string,
question: currentQuestion,
boostHostnames
}, context);
+ // improve diversity by keeping the top 2 URLs per hostname
weightedURLs = keepKPerHostname(weightedURLs, 2);
console.log('Weighted URLs:', weightedURLs.length);


@@ -1,12 +1,12 @@
- import {BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types";
- import {getI18nText, smartMergeStrings} from "./text-tools";
- import {rerankDocuments} from "../tools/jina-rerank";
- import {readUrl} from "../tools/read";
- import {Schemas} from "./schemas";
- import {cherryPick} from "../tools/jina-latechunk";
- import {formatDateBasedOnType} from "./date-tools";
- import {classifyText} from "../tools/jina-classify-spam";
- import {segmentText} from "../tools/segment";
+ import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
+ import { getI18nText, smartMergeStrings } from "./text-tools";
+ import { rerankDocuments } from "../tools/jina-rerank";
+ import { readUrl } from "../tools/read";
+ import { Schemas } from "./schemas";
+ import { cherryPick } from "../tools/jina-latechunk";
+ import { formatDateBasedOnType } from "./date-tools";
+ import { classifyText } from "../tools/jina-classify-spam";
+ import { segmentText } from "../tools/segment";
import axiosClient from "./axios-client";
export function normalizeUrl(urlString: string, debug = false, options = {
@@ -179,7 +179,7 @@ const extractUrlParts = (urlStr: string) => {
};
} catch (e) {
console.error(`Error parsing URL: ${urlStr}`, e);
- return {hostname: "", path: ""};
+ return { hostname: "", path: "" };
}
};
@@ -203,7 +203,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
if (!item || !item.url) return; // Skip invalid items
totalUrls++;
- const {hostname, path} = extractUrlParts(item.url);
+ const { hostname, path } = extractUrlParts(item.url);
// Count hostnames
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
@@ -216,7 +216,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
});
});
- return {hostnameCount, pathPrefixCount, totalUrls};
+ return { hostnameCount, pathPrefixCount, totalUrls };
};
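An aside on the shape countUrlParts returns: a hedged usage sketch with hypothetical input (the items array is invented for illustration; real SearchSnippet values carry more fields than url, and the exact path-prefix splitting depends on the helper, which this diff does not show):

// Hypothetical input for illustration only.
const items = [
  { url: 'https://example.com/docs/a' },
  { url: 'https://example.com/docs/b' },
  { url: 'https://other.org/post' },
] as unknown as SearchSnippet[];

const { hostnameCount, pathPrefixCount, totalUrls } = countUrlParts(items);
console.log(hostnameCount); // { 'example.com': 2, 'other.org': 1 }
console.log(totalUrls);     // 3
// pathPrefixCount depends on how the function splits path segments; not shown here.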
// Calculate normalized frequency for boosting
@@ -241,7 +241,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
// Count URL parts first
const counts = countUrlParts(urlItems);
- const {hostnameCount, pathPrefixCount, totalUrls} = counts;
+ const { hostnameCount, pathPrefixCount, totalUrls } = counts;
if (question.trim().length > 0) {
// Step 1: Create a record to track unique content with their original indices
@@ -262,9 +262,9 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
const uniqueIndicesMap = Object.values(uniqueContentMap);
console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
rerankDocuments(question, uniqueContents, trackers.tokenTracker)
- .then(({results}) => {
+ .then(({ results }) => {
// Step 3: Map the scores back to all original items
- results.forEach(({index, relevance_score}) => {
+ results.forEach(({ index, relevance_score }) => {
const originalIndices = uniqueIndicesMap[index];
const boost = relevance_score * jinaRerankFactor;
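This hunk sits inside a dedupe-then-rerank pattern: each unique snippet text is scored once by rerankDocuments, and the score is fanned back out to every original index via uniqueIndicesMap. A self-contained sketch of that pattern, with a stub scorer standing in for the Jina reranker:

// Score each unique text once, then map scores back to all duplicates.
async function rerankWithDedupe(
  texts: string[],
  score: (unique: string[]) => Promise<number[]> // stand-in for rerankDocuments
): Promise<number[]> {
  const indicesByText: Record<string, number[]> = {};
  texts.forEach((t, i) => (indicesByText[t] ??= []).push(i));

  const unique = Object.keys(indicesByText);
  const scores = await score(unique);

  const out = new Array<number>(texts.length).fill(0);
  unique.forEach((t, u) => indicesByText[t].forEach(i => { out[i] = scores[u]; }));
  return out;
}

This keeps the reranker call proportional to the number of distinct texts rather than the number of URLs, which is exactly what the `${urlItems.length}->${uniqueContents.length}` log above is reporting.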
@@ -283,7 +283,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
return item; // Return unchanged
}
- const {hostname, path} = extractUrlParts(item.url);
+ const { hostname, path } = extractUrlParts(item.url);
// Base weight from original
const freq = item.weight || 0; // Default to 0 if weight is missing
@@ -427,6 +427,14 @@ export async function getLastModified(url: string): Promise<string | undefined>
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
// First count unique hostnames
const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));
+ // If only one or zero unique hostnames, return original results
+ if (uniqueHostnames.size <= 1) {
+ return results;
+ }
const hostnameMap: Record<string, number> = {};
const filteredResults: BoostedSearchSnippet[] = [];
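The diff truncates keepKPerHostname after these declarations. A plausible completion, under the assumption that results arrive sorted best-first (so keeping the first k per hostname keeps the strongest ones); this is a sketch, not the committed code:

// Plausible continuation: admit at most k results per hostname,
// preserving the incoming (already ranked) order.
for (const result of results) {
  const { hostname } = extractUrlParts(result.url);
  hostnameMap[hostname] = (hostnameMap[hostname] || 0) + 1;
  if (hostnameMap[hostname] <= k) {
    filteredResults.push(result);
  }
}
return filteredResults;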
@@ -458,7 +466,7 @@ export async function processURLs(
): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process
if (urls.length === 0) {
- return {urlResults: [], success: false};
+ return { urlResults: [], success: false };
}
const badHostnames: string[] = [];
@@ -466,10 +474,10 @@ export async function processURLs(
// Track the reading action
const thisStep: VisitAction = {
action: 'visit',
- think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
+ think: getI18nText('read_for', schemaGen.languageCode, { urls: urls.join(', ') }),
URLTargets: urls
}
- context.actionTracker.trackAction({thisStep})
+ context.actionTracker.trackAction({ thisStep })
// Process each URL in parallel
const urlResults = await Promise.all(
@@ -483,8 +491,8 @@ export async function processURLs(
// Store normalized URL for consistent reference
url = normalizedUrl;
- const {response} = await readUrl(url, true, context.tokenTracker);
- const {data} = response;
+ const { response } = await readUrl(url, true, context.tokenTracker);
+ const { data } = response;
const guessedTime = await getLastModified(url);
if (guessedTime) {
console.log('Guessed time for', url, guessedTime);
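getLastModified's body never appears in this diff; only its signature shows in a hunk header above. One common way to implement such a guess is a HEAD request through the shared axiosClient, reading the standard Last-Modified header. Everything below is that assumption, not the repo's code (including the function name and the timeout value):

// Hedged sketch: guess a page's modification date from its HTTP headers.
// Assumes axiosClient behaves like a plain axios instance.
async function guessLastModified(url: string): Promise<string | undefined> {
  try {
    const res = await axiosClient.head(url, { timeout: 5000 });
    const lastModified = res.headers['last-modified'];
    return lastModified ? new Date(lastModified).toISOString() : undefined;
  } catch {
    return undefined; // unreachable or header-less hosts simply yield no guess
  }
}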
@@ -505,7 +513,7 @@ export async function processURLs(
}
// add to web contents
- const {chunks, chunk_positions } = await segmentText(data.content, context);
+ const { chunks, chunk_positions } = await segmentText(data.content, context);
// filter out the chunks that are too short, minChunkLength is 80
const minChunkLength = 80;
for (let i = 0; i < chunks.length; i++) {
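The loop body that applies minChunkLength is cut off here. A plausible shape for it, assuming chunk_positions is index-aligned with chunks (keptChunks and keptPositions are illustrative names, not the repo's):

// Plausible continuation: drop chunks under 80 characters, keeping
// their positions aligned with the surviving chunks.
const keptChunks: string[] = [];
const keptPositions: typeof chunk_positions = [];
for (let i = 0; i < chunks.length; i++) {
  if (chunks[i].length >= minChunkLength) {
    keptChunks.push(chunks[i]);
    keptPositions.push(chunk_positions[i]);
  }
}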
@@ -546,7 +554,7 @@ export async function processURLs(
}
});
- return {url, result: response};
+ return { url, result: response };
} catch (error: any) {
console.error('Error reading URL:', url, error);
badURLs.push(url);
@@ -593,11 +601,11 @@ export async function processURLs(
// remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => {
- if (badHostnames.includes(extractUrlParts(url).hostname)) {
- delete allURLs[url];
- console.log(`Removed ${url} from allURLs`);
- }
+ if (badHostnames.includes(extractUrlParts(url).hostname)) {
+ delete allURLs[url];
+ console.log(`Removed ${url} from allURLs`);
+ }
}
)
}
@@ -658,7 +666,7 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: number
const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
// Find all matches
- const matches: Array<{url: string, index: number, length: number}> = [];
+ const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null;
while ((match = urlPattern.exec(text)) !== null) {
@@ -697,14 +705,14 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: number
// Adjust boundaries to avoid overlapping with other URLs
if (i > 0) {
- const prevUrl = matches[i-1];
+ const prevUrl = matches[i - 1];
if (startPos < prevUrl.index + prevUrl.length) {
startPos = prevUrl.index + prevUrl.length;
}
}
if (i < matches.length - 1) {
- const nextUrl = matches[i+1];
+ const nextUrl = matches[i + 1];
if (endPos > nextUrl.index) {
endPos = nextUrl.index;
}
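For reference, the regex in this function stops a URL at whitespace, so neighbouring matches never share characters; the i - 1 / i + 1 checks above then clamp each URL's context window so descriptions do not overlap. A small standalone demonstration of the matching step (the pattern is copied verbatim from the hunk above; the sample text is hypothetical):

const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
const text = 'See https://example.com/a and https://example.org/b for details.';

const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null;
while ((match = urlPattern.exec(text)) !== null) {
  matches.push({ url: match[0], index: match.index, length: match[0].length });
}
// Each match's index/length bounds its context window; the clamping above
// keeps window i from crossing into match i - 1 or i + 1.
console.log(matches.map(m => m.url)); // ['https://example.com/a', 'https://example.org/b']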