mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: add spacing for consistency in agent and url-tools

parent b797273626
commit 2affd41c79
agent.ts

@@ -517,6 +517,7 @@ export async function getResponse(question?: string,
       question: currentQuestion,
       boostHostnames
     }, context);
+
     // improve diversity by keep top 2 urls of each hostname
     weightedURLs = keepKPerHostname(weightedURLs, 2);
     console.log('Weighted URLs:', weightedURLs.length);
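For context on the diversity cap above: with k = 2, any hostname's third and later entries are dropped from the ranked list. A small illustration with hypothetical data (real entries are BoostedSearchSnippets carrying scores and snippets; the import path is assumed):

import { keepKPerHostname } from "./utils/url-tools"; // path assumed

// Hypothetical ranked list; only URLs shown.
const ranked = [
  { url: "https://a.com/1" },
  { url: "https://a.com/2" },
  { url: "https://a.com/3" }, // third hit on a.com: dropped when k = 2
  { url: "https://b.com/1" },
] as any[];

console.log(keepKPerHostname(ranked, 2).map(r => r.url));
// expected: [ 'https://a.com/1', 'https://a.com/2', 'https://b.com/1' ]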
url-tools.ts

@@ -1,12 +1,12 @@
-import {BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types";
-import {getI18nText, smartMergeStrings} from "./text-tools";
-import {rerankDocuments} from "../tools/jina-rerank";
-import {readUrl} from "../tools/read";
-import {Schemas} from "./schemas";
-import {cherryPick} from "../tools/jina-latechunk";
-import {formatDateBasedOnType} from "./date-tools";
-import {classifyText} from "../tools/jina-classify-spam";
-import {segmentText} from "../tools/segment";
+import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
+import { getI18nText, smartMergeStrings } from "./text-tools";
+import { rerankDocuments } from "../tools/jina-rerank";
+import { readUrl } from "../tools/read";
+import { Schemas } from "./schemas";
+import { cherryPick } from "../tools/jina-latechunk";
+import { formatDateBasedOnType } from "./date-tools";
+import { classifyText } from "../tools/jina-classify-spam";
+import { segmentText } from "../tools/segment";
 import axiosClient from "./axios-client";

 export function normalizeUrl(urlString: string, debug = false, options = {
@@ -179,7 +179,7 @@ const extractUrlParts = (urlStr: string) => {
     };
   } catch (e) {
     console.error(`Error parsing URL: ${urlStr}`, e);
-    return {hostname: "", path: ""};
+    return { hostname: "", path: "" };
   }
 };

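The catch branch above defines the failure contract: unparseable input logs an error and yields empty parts. A minimal self-contained sketch of that contract, assuming extractUrlParts wraps the standard URL constructor (the real implementation may normalize further):

const extractUrlParts = (urlStr: string) => {
  try {
    const url = new URL(urlStr);
    return { hostname: url.hostname, path: url.pathname };
  } catch (e) {
    console.error(`Error parsing URL: ${urlStr}`, e);
    return { hostname: "", path: "" }; // empty parts signal "unparseable"
  }
};

extractUrlParts("https://example.com/docs/a"); // { hostname: 'example.com', path: '/docs/a' }
extractUrlParts("not a url");                  // { hostname: '', path: '' }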
@@ -203,7 +203,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
     if (!item || !item.url) return; // Skip invalid items

     totalUrls++;
-    const {hostname, path} = extractUrlParts(item.url);
+    const { hostname, path } = extractUrlParts(item.url);

     // Count hostnames
     hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
@@ -216,7 +216,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
     });
   });

-  return {hostnameCount, pathPrefixCount, totalUrls};
+  return { hostnameCount, pathPrefixCount, totalUrls };
 };

 // Calculate normalized frequency for boosting
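The return shape above pairs hostname counts with path-prefix counts. A sketch of how those tallies plausibly accumulate, reusing the extractUrlParts sketch from the previous hunk and assuming a "path prefix" means each leading path segment (that semantic is an assumption, not in the diff):

const countUrlParts = (urlItems: { url: string }[]) => {
  const hostnameCount: Record<string, number> = {};
  const pathPrefixCount: Record<string, number> = {};
  let totalUrls = 0;

  urlItems.forEach(item => {
    if (!item || !item.url) return; // Skip invalid items

    totalUrls++;
    const { hostname, path } = extractUrlParts(item.url);

    // Count hostnames
    hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;

    // Count each leading prefix: /docs/api/x -> /docs, /docs/api, /docs/api/x (assumed)
    const segments = path.split("/").filter(Boolean);
    segments.forEach((_, i) => {
      const prefix = "/" + segments.slice(0, i + 1).join("/");
      pathPrefixCount[prefix] = (pathPrefixCount[prefix] || 0) + 1;
    });
  });

  return { hostnameCount, pathPrefixCount, totalUrls };
};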
@@ -241,7 +241,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:

   // Count URL parts first
   const counts = countUrlParts(urlItems);
-  const {hostnameCount, pathPrefixCount, totalUrls} = counts;
+  const { hostnameCount, pathPrefixCount, totalUrls } = counts;

   if (question.trim().length > 0) {
     // Step 1: Create a record to track unique content with their original indices
@@ -262,9 +262,9 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
     const uniqueIndicesMap = Object.values(uniqueContentMap);
     console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
     rerankDocuments(question, uniqueContents, trackers.tokenTracker)
-      .then(({results}) => {
+      .then(({ results }) => {
         // Step 3: Map the scores back to all original items
-        results.forEach(({index, relevance_score}) => {
+        results.forEach(({ index, relevance_score }) => {
           const originalIndices = uniqueIndicesMap[index];
           const boost = relevance_score * jinaRerankFactor;

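This hunk sits in the middle of a dedupe-then-rerank flow: identical snippets are reranked once, and the score is scattered back to every duplicate. A compact sketch of the pattern with the reranker stubbed out (the real call is rerankDocuments against Jina's rerank API; the stub and types here are hypothetical):

type Scored = { text: string; boost?: number };

async function boostByRerank(items: Scored[], _question: string) {
  // Step 1: map each unique content to all original indices that carry it
  const uniqueContentMap: Record<string, number[]> = {};
  items.forEach((item, i) => {
    (uniqueContentMap[item.text] ||= []).push(i);
  });
  const uniqueContents = Object.keys(uniqueContentMap);
  const uniqueIndicesMap = Object.values(uniqueContentMap);

  // Step 2: rerank only the unique contents (stubbed; real code calls rerankDocuments)
  const results = uniqueContents.map((_, index) => ({ index, relevance_score: Math.random() }));

  // Step 3: map the scores back to all original items, duplicates included
  results.forEach(({ index, relevance_score }) => {
    uniqueIndicesMap[index].forEach(orig => {
      items[orig].boost = relevance_score;
    });
  });
  return items;
}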
@@ -283,7 +283,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
       return item; // Return unchanged
     }

-    const {hostname, path} = extractUrlParts(item.url);
+    const { hostname, path } = extractUrlParts(item.url);

     // Base weight from original
     const freq = item.weight || 0; // Default to 1 if weight is missing
@@ -427,6 +427,14 @@ export async function getLastModified(url: string): Promise<string | undefined>


 export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
+  // First count unique hostnames
+  const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));
+
+  // If only one or zero unique hostnames, return original results
+  if (uniqueHostnames.size <= 1) {
+    return results;
+  }
+
   const hostnameMap: Record<string, number> = {};
   const filteredResults: BoostedSearchSnippet[] = [];

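The hunk cuts off right after the two declarations, so for readability here is a hedged completion of keepKPerHostname: the early return is verbatim from the diff, while the loop that fills hostnameMap and filteredResults is assumed (a single pass over the ranked list, keeping at most k hits per hostname). BoostedSearchSnippet and extractUrlParts come from this same module:

export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
  // First count unique hostnames
  const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));

  // If only one or zero unique hostnames, return original results
  if (uniqueHostnames.size <= 1) {
    return results;
  }

  const hostnameMap: Record<string, number> = {};
  const filteredResults: BoostedSearchSnippet[] = [];

  // Assumed: walk the already-ranked list, admitting at most k results per hostname
  for (const result of results) {
    const { hostname } = extractUrlParts(result.url);
    hostnameMap[hostname] = (hostnameMap[hostname] || 0) + 1;
    if (hostnameMap[hostname] <= k) {
      filteredResults.push(result);
    }
  }

  return filteredResults;
};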
@@ -458,7 +466,7 @@ export async function processURLs(
 ): Promise<{ urlResults: any[], success: boolean }> {
   // Skip if no URLs to process
   if (urls.length === 0) {
-    return {urlResults: [], success: false};
+    return { urlResults: [], success: false };
   }

   const badHostnames: string[] = [];
@@ -466,10 +474,10 @@ export async function processURLs(
   // Track the reading action
   const thisStep: VisitAction = {
     action: 'visit',
-    think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
+    think: getI18nText('read_for', schemaGen.languageCode, { urls: urls.join(', ') }),
     URLTargets: urls
   }
-  context.actionTracker.trackAction({thisStep})
+  context.actionTracker.trackAction({ thisStep })

   // Process each URL in parallel
   const urlResults = await Promise.all(
@@ -483,8 +491,8 @@ export async function processURLs(
         // Store normalized URL for consistent reference
         url = normalizedUrl;

-        const {response} = await readUrl(url, true, context.tokenTracker);
-        const {data} = response;
+        const { response } = await readUrl(url, true, context.tokenTracker);
+        const { data } = response;
         const guessedTime = await getLastModified(url);
         if (guessedTime) {
           console.log('Guessed time for', url, guessedTime);
@@ -505,7 +513,7 @@ export async function processURLs(
         }

         // add to web contents
-        const {chunks, chunk_positions } = await segmentText(data.content, context);
+        const { chunks, chunk_positions } = await segmentText(data.content, context);
         // filter out the chunks that are too short, minChunkLength is 80
         const minChunkLength = 80;
         for (let i = 0; i < chunks.length; i++) {
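Right after segmentation, the code drops chunks under 80 characters. A sketch of that filter, assuming segmentText returns parallel arrays of chunk strings and their positions (the tuple type for positions is an assumption):

const minChunkLength = 80;

function filterShortChunks(chunks: string[], chunk_positions: [number, number][]) {
  const keptChunks: string[] = [];
  const keptPositions: [number, number][] = [];

  for (let i = 0; i < chunks.length; i++) {
    // filter out the chunks that are too short, minChunkLength is 80
    if (chunks[i].length >= minChunkLength) {
      keptChunks.push(chunks[i]);
      keptPositions.push(chunk_positions[i]);
    }
  }
  return { chunks: keptChunks, chunk_positions: keptPositions };
}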
@@ -546,7 +554,7 @@ export async function processURLs(
         }
       });

-      return {url, result: response};
+      return { url, result: response };
     } catch (error: any) {
       console.error('Error reading URL:', url, error);
       badURLs.push(url);
@@ -593,11 +601,11 @@ export async function processURLs(
   // remove any URL with bad hostnames from allURLs
   if (badHostnames.length > 0) {
     Object.keys(allURLs).forEach(url => {
-      if (badHostnames.includes(extractUrlParts(url).hostname)) {
-        delete allURLs[url];
-        console.log(`Removed ${url} from allURLs`);
-      }
+        if (badHostnames.includes(extractUrlParts(url).hostname)) {
+          delete allURLs[url];
+          console.log(`Removed ${url} from allURLs`);
+        }
       }
     )
   }

@@ -658,7 +666,7 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb
   const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;

   // Find all matches
-  const matches: Array<{url: string, index: number, length: number}> = [];
+  const matches: Array<{ url: string, index: number, length: number }> = [];
   let match: RegExpExecArray | null;

   while ((match = urlPattern.exec(text)) !== null) {
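The matches array above is filled by driving the global regex with exec(), which exposes each match's offset. A runnable excerpt (the sample text is hypothetical):

const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;

const matches: Array<{ url: string, index: number, length: number }> = [];
let match: RegExpExecArray | null;

const text = "See https://example.com/a and https://example.org/b";
while ((match = urlPattern.exec(text)) !== null) {
  // match.index is the offset of the match within text
  matches.push({ url: match[0], index: match.index, length: match[0].length });
}
// matches[0] -> { url: 'https://example.com/a', index: 4, length: 21 }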
@@ -697,14 +705,14 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb

     // Adjust boundaries to avoid overlapping with other URLs
     if (i > 0) {
-      const prevUrl = matches[i-1];
+      const prevUrl = matches[i - 1];
       if (startPos < prevUrl.index + prevUrl.length) {
         startPos = prevUrl.index + prevUrl.length;
       }
     }

     if (i < matches.length - 1) {
-      const nextUrl = matches[i+1];
+      const nextUrl = matches[i + 1];
       if (endPos > nextUrl.index) {
         endPos = nextUrl.index;
       }
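These boundary checks keep each URL's surrounding description window from swallowing a neighboring URL. Wrapped up as a standalone helper (the Math.max/Math.min initialization of the window is assumed; only the two clamps are verbatim from the hunk):

function descriptionWindow(
  matches: Array<{ url: string, index: number, length: number }>,
  i: number,
  contextWindowSize: number,
  textLength: number,
) {
  // Assumed initialization: a symmetric window around the match
  let startPos = Math.max(0, matches[i].index - contextWindowSize);
  let endPos = Math.min(textLength, matches[i].index + matches[i].length + contextWindowSize);

  // Adjust boundaries to avoid overlapping with other URLs
  if (i > 0) {
    const prevUrl = matches[i - 1];
    if (startPos < prevUrl.index + prevUrl.length) {
      startPos = prevUrl.index + prevUrl.length; // start after the previous URL ends
    }
  }
  if (i < matches.length - 1) {
    const nextUrl = matches[i + 1];
    if (endPos > nextUrl.index) {
      endPos = nextUrl.index; // stop before the next URL begins
    }
  }
  return { startPos, endPos };
}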