fix: add spacing for consistency in agent and url-tools

This commit is contained in:
Han Xiao
2025-06-09 15:48:41 -07:00
parent b797273626
commit 2affd41c79
2 changed files with 39 additions and 30 deletions

View File

@@ -517,6 +517,7 @@ export async function getResponse(question?: string,
question: currentQuestion, question: currentQuestion,
boostHostnames boostHostnames
}, context); }, context);
// improve diversity by keep top 2 urls of each hostname // improve diversity by keep top 2 urls of each hostname
weightedURLs = keepKPerHostname(weightedURLs, 2); weightedURLs = keepKPerHostname(weightedURLs, 2);
console.log('Weighted URLs:', weightedURLs.length); console.log('Weighted URLs:', weightedURLs.length);

View File

@@ -1,12 +1,12 @@
import {BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types"; import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
import {getI18nText, smartMergeStrings} from "./text-tools"; import { getI18nText, smartMergeStrings } from "./text-tools";
import {rerankDocuments} from "../tools/jina-rerank"; import { rerankDocuments } from "../tools/jina-rerank";
import {readUrl} from "../tools/read"; import { readUrl } from "../tools/read";
import {Schemas} from "./schemas"; import { Schemas } from "./schemas";
import {cherryPick} from "../tools/jina-latechunk"; import { cherryPick } from "../tools/jina-latechunk";
import {formatDateBasedOnType} from "./date-tools"; import { formatDateBasedOnType } from "./date-tools";
import {classifyText} from "../tools/jina-classify-spam"; import { classifyText } from "../tools/jina-classify-spam";
import {segmentText} from "../tools/segment"; import { segmentText } from "../tools/segment";
import axiosClient from "./axios-client"; import axiosClient from "./axios-client";
export function normalizeUrl(urlString: string, debug = false, options = { export function normalizeUrl(urlString: string, debug = false, options = {
@@ -179,7 +179,7 @@ const extractUrlParts = (urlStr: string) => {
}; };
} catch (e) { } catch (e) {
console.error(`Error parsing URL: ${urlStr}`, e); console.error(`Error parsing URL: ${urlStr}`, e);
return {hostname: "", path: ""}; return { hostname: "", path: "" };
} }
}; };
@@ -203,7 +203,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
if (!item || !item.url) return; // Skip invalid items if (!item || !item.url) return; // Skip invalid items
totalUrls++; totalUrls++;
const {hostname, path} = extractUrlParts(item.url); const { hostname, path } = extractUrlParts(item.url);
// Count hostnames // Count hostnames
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1; hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
@@ -216,7 +216,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
}); });
}); });
return {hostnameCount, pathPrefixCount, totalUrls}; return { hostnameCount, pathPrefixCount, totalUrls };
}; };
// Calculate normalized frequency for boosting // Calculate normalized frequency for boosting
@@ -241,7 +241,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
// Count URL parts first // Count URL parts first
const counts = countUrlParts(urlItems); const counts = countUrlParts(urlItems);
const {hostnameCount, pathPrefixCount, totalUrls} = counts; const { hostnameCount, pathPrefixCount, totalUrls } = counts;
if (question.trim().length > 0) { if (question.trim().length > 0) {
// Step 1: Create a record to track unique content with their original indices // Step 1: Create a record to track unique content with their original indices
@@ -262,9 +262,9 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
const uniqueIndicesMap = Object.values(uniqueContentMap); const uniqueIndicesMap = Object.values(uniqueContentMap);
console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`) console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
rerankDocuments(question, uniqueContents, trackers.tokenTracker) rerankDocuments(question, uniqueContents, trackers.tokenTracker)
.then(({results}) => { .then(({ results }) => {
// Step 3: Map the scores back to all original items // Step 3: Map the scores back to all original items
results.forEach(({index, relevance_score}) => { results.forEach(({ index, relevance_score }) => {
const originalIndices = uniqueIndicesMap[index]; const originalIndices = uniqueIndicesMap[index];
const boost = relevance_score * jinaRerankFactor; const boost = relevance_score * jinaRerankFactor;
@@ -283,7 +283,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
return item; // Return unchanged return item; // Return unchanged
} }
const {hostname, path} = extractUrlParts(item.url); const { hostname, path } = extractUrlParts(item.url);
// Base weight from original // Base weight from original
const freq = item.weight || 0; // Default to 1 if weight is missing const freq = item.weight || 0; // Default to 1 if weight is missing
@@ -427,6 +427,14 @@ export async function getLastModified(url: string): Promise<string | undefined>
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => { export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
// First count unique hostnames
const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));
// If only one or zero unique hostnames, return original results
if (uniqueHostnames.size <= 1) {
return results;
}
const hostnameMap: Record<string, number> = {}; const hostnameMap: Record<string, number> = {};
const filteredResults: BoostedSearchSnippet[] = []; const filteredResults: BoostedSearchSnippet[] = [];
@@ -458,7 +466,7 @@ export async function processURLs(
): Promise<{ urlResults: any[], success: boolean }> { ): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process // Skip if no URLs to process
if (urls.length === 0) { if (urls.length === 0) {
return {urlResults: [], success: false}; return { urlResults: [], success: false };
} }
const badHostnames: string[] = []; const badHostnames: string[] = [];
@@ -466,10 +474,10 @@ export async function processURLs(
// Track the reading action // Track the reading action
const thisStep: VisitAction = { const thisStep: VisitAction = {
action: 'visit', action: 'visit',
think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}), think: getI18nText('read_for', schemaGen.languageCode, { urls: urls.join(', ') }),
URLTargets: urls URLTargets: urls
} }
context.actionTracker.trackAction({thisStep}) context.actionTracker.trackAction({ thisStep })
// Process each URL in parallel // Process each URL in parallel
const urlResults = await Promise.all( const urlResults = await Promise.all(
@@ -483,8 +491,8 @@ export async function processURLs(
// Store normalized URL for consistent reference // Store normalized URL for consistent reference
url = normalizedUrl; url = normalizedUrl;
const {response} = await readUrl(url, true, context.tokenTracker); const { response } = await readUrl(url, true, context.tokenTracker);
const {data} = response; const { data } = response;
const guessedTime = await getLastModified(url); const guessedTime = await getLastModified(url);
if (guessedTime) { if (guessedTime) {
console.log('Guessed time for', url, guessedTime); console.log('Guessed time for', url, guessedTime);
@@ -505,7 +513,7 @@ export async function processURLs(
} }
// add to web contents // add to web contents
const {chunks, chunk_positions } = await segmentText(data.content, context); const { chunks, chunk_positions } = await segmentText(data.content, context);
// filter out the chunks that are too short, minChunkLength is 80 // filter out the chunks that are too short, minChunkLength is 80
const minChunkLength = 80; const minChunkLength = 80;
for (let i = 0; i < chunks.length; i++) { for (let i = 0; i < chunks.length; i++) {
@@ -546,7 +554,7 @@ export async function processURLs(
} }
}); });
return {url, result: response}; return { url, result: response };
} catch (error: any) { } catch (error: any) {
console.error('Error reading URL:', url, error); console.error('Error reading URL:', url, error);
badURLs.push(url); badURLs.push(url);
@@ -593,11 +601,11 @@ export async function processURLs(
// remove any URL with bad hostnames from allURLs // remove any URL with bad hostnames from allURLs
if (badHostnames.length > 0) { if (badHostnames.length > 0) {
Object.keys(allURLs).forEach(url => { Object.keys(allURLs).forEach(url => {
if (badHostnames.includes(extractUrlParts(url).hostname)) { if (badHostnames.includes(extractUrlParts(url).hostname)) {
delete allURLs[url]; delete allURLs[url];
console.log(`Removed ${url} from allURLs`); console.log(`Removed ${url} from allURLs`);
}
} }
}
) )
} }
@@ -658,7 +666,7 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb
const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g; const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
// Find all matches // Find all matches
const matches: Array<{url: string, index: number, length: number}> = []; const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null; let match: RegExpExecArray | null;
while ((match = urlPattern.exec(text)) !== null) { while ((match = urlPattern.exec(text)) !== null) {
@@ -697,14 +705,14 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb
// Adjust boundaries to avoid overlapping with other URLs // Adjust boundaries to avoid overlapping with other URLs
if (i > 0) { if (i > 0) {
const prevUrl = matches[i-1]; const prevUrl = matches[i - 1];
if (startPos < prevUrl.index + prevUrl.length) { if (startPos < prevUrl.index + prevUrl.length) {
startPos = prevUrl.index + prevUrl.length; startPos = prevUrl.index + prevUrl.length;
} }
} }
if (i < matches.length - 1) { if (i < matches.length - 1) {
const nextUrl = matches[i+1]; const nextUrl = matches[i + 1];
if (endPos > nextUrl.index) { if (endPos > nextUrl.index) {
endPos = nextUrl.index; endPos = nextUrl.index;
} }