mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
fix: add spacing for consistency in agent and url-tools
This commit is contained in:
@@ -517,6 +517,7 @@ export async function getResponse(question?: string,
|
|||||||
question: currentQuestion,
|
question: currentQuestion,
|
||||||
boostHostnames
|
boostHostnames
|
||||||
}, context);
|
}, context);
|
||||||
|
|
||||||
// improve diversity by keeping the top 2 URLs of each hostname
|
// improve diversity by keeping the top 2 URLs of each hostname
|
||||||
weightedURLs = keepKPerHostname(weightedURLs, 2);
|
weightedURLs = keepKPerHostname(weightedURLs, 2);
|
||||||
console.log('Weighted URLs:', weightedURLs.length);
|
console.log('Weighted URLs:', weightedURLs.length);
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
import {BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types";
|
import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
|
||||||
import {getI18nText, smartMergeStrings} from "./text-tools";
|
import { getI18nText, smartMergeStrings } from "./text-tools";
|
||||||
import {rerankDocuments} from "../tools/jina-rerank";
|
import { rerankDocuments } from "../tools/jina-rerank";
|
||||||
import {readUrl} from "../tools/read";
|
import { readUrl } from "../tools/read";
|
||||||
import {Schemas} from "./schemas";
|
import { Schemas } from "./schemas";
|
||||||
import {cherryPick} from "../tools/jina-latechunk";
|
import { cherryPick } from "../tools/jina-latechunk";
|
||||||
import {formatDateBasedOnType} from "./date-tools";
|
import { formatDateBasedOnType } from "./date-tools";
|
||||||
import {classifyText} from "../tools/jina-classify-spam";
|
import { classifyText } from "../tools/jina-classify-spam";
|
||||||
import {segmentText} from "../tools/segment";
|
import { segmentText } from "../tools/segment";
|
||||||
import axiosClient from "./axios-client";
|
import axiosClient from "./axios-client";
|
||||||
|
|
||||||
export function normalizeUrl(urlString: string, debug = false, options = {
|
export function normalizeUrl(urlString: string, debug = false, options = {
|
||||||
@@ -179,7 +179,7 @@ const extractUrlParts = (urlStr: string) => {
|
|||||||
};
|
};
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error(`Error parsing URL: ${urlStr}`, e);
|
console.error(`Error parsing URL: ${urlStr}`, e);
|
||||||
return {hostname: "", path: ""};
|
return { hostname: "", path: "" };
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -203,7 +203,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
|
|||||||
if (!item || !item.url) return; // Skip invalid items
|
if (!item || !item.url) return; // Skip invalid items
|
||||||
|
|
||||||
totalUrls++;
|
totalUrls++;
|
||||||
const {hostname, path} = extractUrlParts(item.url);
|
const { hostname, path } = extractUrlParts(item.url);
|
||||||
|
|
||||||
// Count hostnames
|
// Count hostnames
|
||||||
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
|
hostnameCount[hostname] = (hostnameCount[hostname] || 0) + 1;
|
||||||
@@ -216,7 +216,7 @@ export const countUrlParts = (urlItems: SearchSnippet[]) => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
return {hostnameCount, pathPrefixCount, totalUrls};
|
return { hostnameCount, pathPrefixCount, totalUrls };
|
||||||
};
|
};
|
||||||
|
|
||||||
// Calculate normalized frequency for boosting
|
// Calculate normalized frequency for boosting
|
||||||
@@ -241,7 +241,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
|
|||||||
|
|
||||||
// Count URL parts first
|
// Count URL parts first
|
||||||
const counts = countUrlParts(urlItems);
|
const counts = countUrlParts(urlItems);
|
||||||
const {hostnameCount, pathPrefixCount, totalUrls} = counts;
|
const { hostnameCount, pathPrefixCount, totalUrls } = counts;
|
||||||
|
|
||||||
if (question.trim().length > 0) {
|
if (question.trim().length > 0) {
|
||||||
// Step 1: Create a record to track unique content with their original indices
|
// Step 1: Create a record to track unique content with their original indices
|
||||||
@@ -262,9 +262,9 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
|
|||||||
const uniqueIndicesMap = Object.values(uniqueContentMap);
|
const uniqueIndicesMap = Object.values(uniqueContentMap);
|
||||||
console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
|
console.log(`rerank URLs: ${urlItems.length}->${uniqueContents.length}`)
|
||||||
rerankDocuments(question, uniqueContents, trackers.tokenTracker)
|
rerankDocuments(question, uniqueContents, trackers.tokenTracker)
|
||||||
.then(({results}) => {
|
.then(({ results }) => {
|
||||||
// Step 3: Map the scores back to all original items
|
// Step 3: Map the scores back to all original items
|
||||||
results.forEach(({index, relevance_score}) => {
|
results.forEach(({ index, relevance_score }) => {
|
||||||
const originalIndices = uniqueIndicesMap[index];
|
const originalIndices = uniqueIndicesMap[index];
|
||||||
const boost = relevance_score * jinaRerankFactor;
|
const boost = relevance_score * jinaRerankFactor;
|
||||||
|
|
||||||
@@ -283,7 +283,7 @@ export const rankURLs = (urlItems: SearchSnippet[], options: any = {}, trackers:
|
|||||||
return item; // Return unchanged
|
return item; // Return unchanged
|
||||||
}
|
}
|
||||||
|
|
||||||
const {hostname, path} = extractUrlParts(item.url);
|
const { hostname, path } = extractUrlParts(item.url);
|
||||||
|
|
||||||
// Base weight from original
|
// Base weight from original
|
||||||
const freq = item.weight || 0; // Default to 0 if weight is missing
|
const freq = item.weight || 0; // Default to 0 if weight is missing
|
||||||
@@ -427,6 +427,14 @@ export async function getLastModified(url: string): Promise<string | undefined>
|
|||||||
|
|
||||||
|
|
||||||
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
|
export const keepKPerHostname = (results: BoostedSearchSnippet[], k: number) => {
|
||||||
|
// First count unique hostnames
|
||||||
|
const uniqueHostnames = new Set(results.map(result => extractUrlParts(result.url).hostname));
|
||||||
|
|
||||||
|
// If only one or zero unique hostnames, return original results
|
||||||
|
if (uniqueHostnames.size <= 1) {
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
const hostnameMap: Record<string, number> = {};
|
const hostnameMap: Record<string, number> = {};
|
||||||
const filteredResults: BoostedSearchSnippet[] = [];
|
const filteredResults: BoostedSearchSnippet[] = [];
|
||||||
|
|
||||||
@@ -458,7 +466,7 @@ export async function processURLs(
|
|||||||
): Promise<{ urlResults: any[], success: boolean }> {
|
): Promise<{ urlResults: any[], success: boolean }> {
|
||||||
// Skip if no URLs to process
|
// Skip if no URLs to process
|
||||||
if (urls.length === 0) {
|
if (urls.length === 0) {
|
||||||
return {urlResults: [], success: false};
|
return { urlResults: [], success: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
const badHostnames: string[] = [];
|
const badHostnames: string[] = [];
|
||||||
@@ -466,10 +474,10 @@ export async function processURLs(
|
|||||||
// Track the reading action
|
// Track the reading action
|
||||||
const thisStep: VisitAction = {
|
const thisStep: VisitAction = {
|
||||||
action: 'visit',
|
action: 'visit',
|
||||||
think: getI18nText('read_for', schemaGen.languageCode, {urls: urls.join(', ')}),
|
think: getI18nText('read_for', schemaGen.languageCode, { urls: urls.join(', ') }),
|
||||||
URLTargets: urls
|
URLTargets: urls
|
||||||
}
|
}
|
||||||
context.actionTracker.trackAction({thisStep})
|
context.actionTracker.trackAction({ thisStep })
|
||||||
|
|
||||||
// Process each URL in parallel
|
// Process each URL in parallel
|
||||||
const urlResults = await Promise.all(
|
const urlResults = await Promise.all(
|
||||||
@@ -483,8 +491,8 @@ export async function processURLs(
|
|||||||
// Store normalized URL for consistent reference
|
// Store normalized URL for consistent reference
|
||||||
url = normalizedUrl;
|
url = normalizedUrl;
|
||||||
|
|
||||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
const { response } = await readUrl(url, true, context.tokenTracker);
|
||||||
const {data} = response;
|
const { data } = response;
|
||||||
const guessedTime = await getLastModified(url);
|
const guessedTime = await getLastModified(url);
|
||||||
if (guessedTime) {
|
if (guessedTime) {
|
||||||
console.log('Guessed time for', url, guessedTime);
|
console.log('Guessed time for', url, guessedTime);
|
||||||
@@ -505,7 +513,7 @@ export async function processURLs(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// add to web contents
|
// add to web contents
|
||||||
const {chunks, chunk_positions } = await segmentText(data.content, context);
|
const { chunks, chunk_positions } = await segmentText(data.content, context);
|
||||||
// filter out the chunks that are too short, minChunkLength is 80
|
// filter out the chunks that are too short, minChunkLength is 80
|
||||||
const minChunkLength = 80;
|
const minChunkLength = 80;
|
||||||
for (let i = 0; i < chunks.length; i++) {
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
@@ -546,7 +554,7 @@ export async function processURLs(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return {url, result: response};
|
return { url, result: response };
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
console.error('Error reading URL:', url, error);
|
console.error('Error reading URL:', url, error);
|
||||||
badURLs.push(url);
|
badURLs.push(url);
|
||||||
@@ -593,11 +601,11 @@ export async function processURLs(
|
|||||||
// remove any URL with bad hostnames from allURLs
|
// remove any URL with bad hostnames from allURLs
|
||||||
if (badHostnames.length > 0) {
|
if (badHostnames.length > 0) {
|
||||||
Object.keys(allURLs).forEach(url => {
|
Object.keys(allURLs).forEach(url => {
|
||||||
if (badHostnames.includes(extractUrlParts(url).hostname)) {
|
if (badHostnames.includes(extractUrlParts(url).hostname)) {
|
||||||
delete allURLs[url];
|
delete allURLs[url];
|
||||||
console.log(`Removed ${url} from allURLs`);
|
console.log(`Removed ${url} from allURLs`);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -658,7 +666,7 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb
|
|||||||
const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
|
const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&//=]*)/g;
|
||||||
|
|
||||||
// Find all matches
|
// Find all matches
|
||||||
const matches: Array<{url: string, index: number, length: number}> = [];
|
const matches: Array<{ url: string, index: number, length: number }> = [];
|
||||||
let match: RegExpExecArray | null;
|
let match: RegExpExecArray | null;
|
||||||
|
|
||||||
while ((match = urlPattern.exec(text)) !== null) {
|
while ((match = urlPattern.exec(text)) !== null) {
|
||||||
@@ -697,14 +705,14 @@ export function extractUrlsWithDescription(text: string, contextWindowSize: numb
|
|||||||
|
|
||||||
// Adjust boundaries to avoid overlapping with other URLs
|
// Adjust boundaries to avoid overlapping with other URLs
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
const prevUrl = matches[i-1];
|
const prevUrl = matches[i - 1];
|
||||||
if (startPos < prevUrl.index + prevUrl.length) {
|
if (startPos < prevUrl.index + prevUrl.length) {
|
||||||
startPos = prevUrl.index + prevUrl.length;
|
startPos = prevUrl.index + prevUrl.length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < matches.length - 1) {
|
if (i < matches.length - 1) {
|
||||||
const nextUrl = matches[i+1];
|
const nextUrl = matches[i + 1];
|
||||||
if (endPos > nextUrl.index) {
|
if (endPos > nextUrl.index) {
|
||||||
endPos = nextUrl.index;
|
endPos = nextUrl.index;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user