mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
perf: async parallel
This commit is contained in:
@@ -3,7 +3,7 @@ import {Reference, TrackerContext, WebContent} from "../types";
|
|||||||
import {rerankDocuments} from "./jina-rerank";
|
import {rerankDocuments} from "./jina-rerank";
|
||||||
import {Schemas} from "../utils/schemas";
|
import {Schemas} from "../utils/schemas";
|
||||||
|
|
||||||
// New function to calculate Jaccard similarity as fallback
|
// Jaccard similarity function for fallback
|
||||||
function calculateJaccardSimilarity(text1: string, text2: string): number {
|
function calculateJaccardSimilarity(text1: string, text2: string): number {
|
||||||
// Convert texts to lowercase and tokenize by splitting on non-alphanumeric characters
|
// Convert texts to lowercase and tokenize by splitting on non-alphanumeric characters
|
||||||
const tokens1 = new Set(text1.toLowerCase().split(/\W+/).filter(t => t.length > 0));
|
const tokens1 = new Set(text1.toLowerCase().split(/\W+/).filter(t => t.length > 0));
|
||||||
@@ -19,7 +19,7 @@ function calculateJaccardSimilarity(text1: string, text2: string): number {
|
|||||||
return union.size === 0 ? 0 : intersection.size / union.size;
|
return union.size === 0 ? 0 : intersection.size / union.size;
|
||||||
}
|
}
|
||||||
|
|
||||||
// New function to perform fallback similarity ranking
|
// Fallback similarity ranking
|
||||||
async function fallbackRerankWithJaccard(query: string, documents: string[]): Promise<{ results: { index: number, relevance_score: number }[] }> {
|
async function fallbackRerankWithJaccard(query: string, documents: string[]): Promise<{ results: { index: number, relevance_score: number }[] }> {
|
||||||
const results = documents.map((doc, index) => {
|
const results = documents.map((doc, index) => {
|
||||||
const score = calculateJaccardSimilarity(query, doc);
|
const score = calculateJaccardSimilarity(query, doc);
|
||||||
@@ -89,7 +89,7 @@ export async function buildReferences(
|
|||||||
|
|
||||||
validAnswerChunks.push(i);
|
validAnswerChunks.push(i);
|
||||||
|
|
||||||
// Create a reranking task (handling batch size constraint later)
|
// Create a reranking task
|
||||||
rerankTasks.push({
|
rerankTasks.push({
|
||||||
index: i,
|
index: i,
|
||||||
chunk: answerChunk,
|
chunk: answerChunk,
|
||||||
@@ -97,42 +97,17 @@ export async function buildReferences(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fixed batch size of 512 as suggested
|
// Process all reranking tasks in parallel using the updated rerankDocuments function
|
||||||
const BATCH_SIZE = 512;
|
const processTask = async (task: any) => {
|
||||||
|
|
||||||
// Process all reranking tasks in parallel with fixed batch size
|
|
||||||
const processTaskWithBatches = async (task: any) => {
|
|
||||||
try {
|
try {
|
||||||
// Create batches of web content chunks
|
// Use rerankDocuments directly - it now handles batching internally
|
||||||
const batches = [];
|
const result = await rerankDocuments(task.chunk, allWebContentChunks, context.tokenTracker);
|
||||||
for (let i = 0; i < allWebContentChunks.length; i += BATCH_SIZE) {
|
|
||||||
batches.push(allWebContentChunks.slice(i, i + BATCH_SIZE));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process all batches in parallel
|
|
||||||
const batchPromises = batches.map(async (batch, batchIndex) => {
|
|
||||||
const batchOffset = batchIndex * BATCH_SIZE;
|
|
||||||
const result = await rerankDocuments(task.chunk, batch, context.tokenTracker);
|
|
||||||
|
|
||||||
// Adjust indices to account for batching
|
|
||||||
return result.results.map(item => ({
|
|
||||||
index: item.index + batchOffset,
|
|
||||||
relevance_score: item.relevance_score
|
|
||||||
}));
|
|
||||||
});
|
|
||||||
|
|
||||||
// Wait for all batch processing to complete
|
|
||||||
const batchResults = await Promise.all(batchPromises);
|
|
||||||
|
|
||||||
// Combine and sort all results
|
|
||||||
const combinedResults = batchResults.flat();
|
|
||||||
combinedResults.sort((a, b) => b.relevance_score - a.relevance_score);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
answerChunkIndex: task.index,
|
answerChunkIndex: task.index,
|
||||||
answerChunk: task.chunk,
|
answerChunk: task.chunk,
|
||||||
answerChunkPosition: task.position,
|
answerChunkPosition: task.position,
|
||||||
results: combinedResults
|
results: result.results
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Reranking failed, falling back to Jaccard similarity', error);
|
console.error('Reranking failed, falling back to Jaccard similarity', error);
|
||||||
@@ -148,7 +123,7 @@ export async function buildReferences(
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Process all tasks in parallel
|
// Process all tasks in parallel
|
||||||
const taskResults = await Promise.all(rerankTasks.map(processTaskWithBatches));
|
const taskResults = await Promise.all(rerankTasks.map(processTask));
|
||||||
|
|
||||||
// Collect and flatten all matches
|
// Collect and flatten all matches
|
||||||
const allMatches = [];
|
const allMatches = [];
|
||||||
|
|||||||
@@ -51,12 +51,8 @@ export async function cherryPick(question: string, longContext: string, options:
|
|||||||
|
|
||||||
console.log(`Total length ${longContext.length} split ${chunks.length} chunks into ${chunkBatches.length} batches of ~${chunksPerBatch} chunks each`);
|
console.log(`Total length ${longContext.length} split ${chunks.length} chunks into ${chunkBatches.length} batches of ~${chunksPerBatch} chunks each`);
|
||||||
|
|
||||||
// Process each batch and collect the embeddings
|
// Process all batches in parallel
|
||||||
const allChunkEmbeddings: number[][] = [];
|
const batchPromises = chunkBatches.map(async (batch, batchIndex) => {
|
||||||
let totalTokensUsed = 0;
|
|
||||||
|
|
||||||
for (let batchIndex = 0; batchIndex < chunkBatches.length; batchIndex++) {
|
|
||||||
const batch = chunkBatches[batchIndex];
|
|
||||||
console.log(`Processing batch ${batchIndex + 1}/${chunkBatches.length} with ${batch.length} chunks`);
|
console.log(`Processing batch ${batchIndex + 1}/${chunkBatches.length} with ${batch.length} chunks`);
|
||||||
|
|
||||||
// Get embeddings for the current batch
|
// Get embeddings for the current batch
|
||||||
@@ -90,12 +86,25 @@ export async function cherryPick(question: string, longContext: string, options:
|
|||||||
|
|
||||||
// Extract embeddings from this batch
|
// Extract embeddings from this batch
|
||||||
const batchEmbeddings = batchEmbeddingResponse.data.data.map((item: any) => item.embedding);
|
const batchEmbeddings = batchEmbeddingResponse.data.data.map((item: any) => item.embedding);
|
||||||
allChunkEmbeddings.push(...batchEmbeddings);
|
|
||||||
|
|
||||||
// Track token usage
|
// Return both embeddings and token usage
|
||||||
const batchTokens = batchEmbeddingResponse.data.usage?.total_tokens || 0;
|
return {
|
||||||
totalTokensUsed += batchTokens;
|
embeddings: batchEmbeddings,
|
||||||
}
|
tokens: batchEmbeddingResponse.data.usage?.total_tokens || 0
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Wait for all batch processing to complete
|
||||||
|
const batchResults = await Promise.all(batchPromises);
|
||||||
|
|
||||||
|
// Collect all embeddings and total token usage
|
||||||
|
const allChunkEmbeddings: number[][] = [];
|
||||||
|
let totalTokensUsed = 0;
|
||||||
|
|
||||||
|
batchResults.forEach(result => {
|
||||||
|
allChunkEmbeddings.push(...result.embeddings);
|
||||||
|
totalTokensUsed += result.tokens;
|
||||||
|
});
|
||||||
|
|
||||||
// Get embedding for the question
|
// Get embedding for the question
|
||||||
const questionEmbeddingResponse = await axios.post(
|
const questionEmbeddingResponse = await axios.post(
|
||||||
|
|||||||
@@ -26,57 +26,74 @@ interface JinaRerankResponse {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Reranks a list of documents based on relevance to a query
|
|
||||||
* @param query The query to rank documents against
|
|
||||||
* @param documents Array of documents to be ranked
|
|
||||||
* @param topN Number of top results to return
|
|
||||||
* @param tracker Optional token tracker for usage monitoring
|
|
||||||
* @returns Array of reranked documents with their scores
|
|
||||||
*/
|
|
||||||
export async function rerankDocuments(
|
export async function rerankDocuments(
|
||||||
query: string,
|
query: string,
|
||||||
documents: string[],
|
documents: string[],
|
||||||
tracker?: TokenTracker
|
tracker?: TokenTracker,
|
||||||
): Promise<{ results: Array<{index: number, relevance_score: number, document: {text: string}}> }> {
|
batchSize = 2000
|
||||||
|
): Promise<{ results: Array<{ index: number, relevance_score: number, document: { text: string } }> }> {
|
||||||
try {
|
try {
|
||||||
if (!JINA_API_KEY) {
|
if (!JINA_API_KEY) {
|
||||||
throw new Error('JINA_API_KEY is not set');
|
throw new Error('JINA_API_KEY is not set');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (documents.length > 2000) {
|
// No need to slice - we'll process all documents in batches
|
||||||
console.error(`Reranking ${documents.length} documents, which exceeds the recommended limit of 2000. This may lead to performance issues.`);
|
const batches: string[][] = [];
|
||||||
documents = documents.slice(0, 2000);
|
for (let i = 0; i < documents.length; i += batchSize) {
|
||||||
|
batches.push(documents.slice(i, i + batchSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
const request: JinaRerankRequest = {
|
console.log(`Processing ${documents.length} documents in ${batches.length} batches of up to ${batchSize} each`);
|
||||||
model: 'jina-reranker-v2-base-multilingual',
|
|
||||||
query,
|
|
||||||
top_n: documents.length,
|
|
||||||
documents
|
|
||||||
};
|
|
||||||
|
|
||||||
const response = await axios.post<JinaRerankResponse>(
|
// Process all batches in parallel
|
||||||
JINA_API_URL,
|
const batchResults = await Promise.all(
|
||||||
request,
|
batches.map(async (batchDocuments, batchIndex) => {
|
||||||
{
|
const startIdx = batchIndex * batchSize;
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/json',
|
const request: JinaRerankRequest = {
|
||||||
'Authorization': `Bearer ${JINA_API_KEY}`
|
model: 'jina-reranker-v2-base-multilingual',
|
||||||
}
|
query,
|
||||||
}
|
top_n: batchDocuments.length,
|
||||||
|
documents: batchDocuments
|
||||||
|
};
|
||||||
|
|
||||||
|
const response = await axios.post<JinaRerankResponse>(
|
||||||
|
JINA_API_URL,
|
||||||
|
request,
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': `Bearer ${JINA_API_KEY}`
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Track token usage from this batch
|
||||||
|
(tracker || new TokenTracker()).trackUsage('rerank', {
|
||||||
|
promptTokens: response.data.usage.total_tokens,
|
||||||
|
completionTokens: 0,
|
||||||
|
totalTokens: response.data.usage.total_tokens
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add the original document index to each result
|
||||||
|
return response.data.results.map(result => ({
|
||||||
|
...result,
|
||||||
|
originalIndex: startIdx + result.index // Map back to the original index
|
||||||
|
}));
|
||||||
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
// Track token usage from the API
|
// Flatten and sort all results by relevance score
|
||||||
(tracker || new TokenTracker()).trackUsage('rerank', {
|
const allResults = batchResults.flat().sort((a, b) => b.relevance_score - a.relevance_score);
|
||||||
promptTokens: response.data.usage.total_tokens,
|
|
||||||
completionTokens: 0,
|
|
||||||
totalTokens: response.data.usage.total_tokens
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
// Keep the original document indices in the results
|
||||||
results: response.data.results
|
const finalResults = allResults.map(result => ({
|
||||||
};
|
index: result.originalIndex, // Original document index
|
||||||
|
relevance_score: result.relevance_score,
|
||||||
|
document: result.document
|
||||||
|
}));
|
||||||
|
|
||||||
|
return {results: finalResults};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error in reranking documents:', error);
|
console.error('Error in reranking documents:', error);
|
||||||
|
|
||||||
|
|||||||
@@ -3,22 +3,6 @@ import {TokenTracker} from "../utils/token-tracker";
|
|||||||
import {JINA_API_KEY} from "../config";
|
import {JINA_API_KEY} from "../config";
|
||||||
import {TrackerContext} from "../types";
|
import {TrackerContext} from "../types";
|
||||||
|
|
||||||
/**
|
|
||||||
* Segments text into chunks, handling text of arbitrary length by batching
|
|
||||||
* @param content Text to segment
|
|
||||||
* @param tracker Context for tracking token usage
|
|
||||||
* @param maxChunkLength Maximum length of each chunk (passed to Jina API)
|
|
||||||
* @param returnChunks Whether to return chunks in the API response
|
|
||||||
* @returns Object containing chunks and their positions
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* Segments text into chunks, handling text of arbitrary length by batching
|
|
||||||
* @param content Text to segment
|
|
||||||
* @param tracker Context for tracking token usage
|
|
||||||
* @param maxChunkLength Maximum length of each chunk (passed to Jina API)
|
|
||||||
* @param returnChunks Whether to return chunks in the API response
|
|
||||||
* @returns Object containing chunks and chunk_positions matching Jina API format
|
|
||||||
*/
|
|
||||||
export async function segmentText(
|
export async function segmentText(
|
||||||
content: string,
|
content: string,
|
||||||
tracker: TrackerContext,
|
tracker: TrackerContext,
|
||||||
@@ -38,19 +22,20 @@ export async function segmentText(
|
|||||||
// Maximum size to send in a single API request (slightly under 64K to be safe)
|
// Maximum size to send in a single API request (slightly under 64K to be safe)
|
||||||
const MAX_BATCH_SIZE = 60000;
|
const MAX_BATCH_SIZE = 60000;
|
||||||
|
|
||||||
// Final results
|
|
||||||
const allChunks = [];
|
|
||||||
const allChunkPositions = [];
|
|
||||||
let totalTokens = 0;
|
|
||||||
|
|
||||||
// Split content into batches
|
// Split content into batches
|
||||||
const batches = splitTextIntoBatches(content, MAX_BATCH_SIZE);
|
const batches = splitTextIntoBatches(content, MAX_BATCH_SIZE);
|
||||||
console.log(`Split content into ${batches.length} batches`);
|
console.log(`Split content into ${batches.length} batches`);
|
||||||
|
|
||||||
// Process each batch sequentially
|
// Calculate offsets for each batch upfront
|
||||||
|
const batchOffsets: number[] = [];
|
||||||
let currentOffset = 0;
|
let currentOffset = 0;
|
||||||
for (let i = 0; i < batches.length; i++) {
|
for (const batch of batches) {
|
||||||
const batch = batches[i];
|
batchOffsets.push(currentOffset);
|
||||||
|
currentOffset += batch.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process all batches in parallel
|
||||||
|
const batchPromises = batches.map(async (batch, i) => {
|
||||||
console.log(`Processing batch ${i + 1}/${batches.length} (size: ${batch.length})`);
|
console.log(`Processing batch ${i + 1}/${batches.length} (size: ${batch.length})`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -81,33 +66,43 @@ export async function segmentText(
|
|||||||
tokenizer: data.tokenizer
|
tokenizer: data.tokenizer
|
||||||
});
|
});
|
||||||
|
|
||||||
// Add chunks from this batch to the results
|
// Get the batch offset
|
||||||
if (data.chunks && returnChunks) {
|
const offset = batchOffsets[i];
|
||||||
allChunks.push(...data.chunks);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Adjust chunk positions to account for the offset of this batch
|
// Adjust chunk positions to account for the offset of this batch
|
||||||
if (data.chunk_positions) {
|
const adjustedPositions = data.chunk_positions
|
||||||
const adjustedPositions = data.chunk_positions.map((position: [number, number]) => {
|
? data.chunk_positions.map((position: [number, number]) => {
|
||||||
// The API returns chunk_positions as arrays of [start, end]
|
return [
|
||||||
return [
|
position[0] + offset,
|
||||||
position[0] + currentOffset,
|
position[1] + offset
|
||||||
position[1] + currentOffset
|
] as [number, number];
|
||||||
] as [number, number];
|
})
|
||||||
});
|
: [];
|
||||||
allChunkPositions.push(...adjustedPositions);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track token usage
|
|
||||||
const batchTokens = data.usage?.tokens || 0;
|
|
||||||
totalTokens += batchTokens;
|
|
||||||
|
|
||||||
// Update the current offset for the next batch
|
|
||||||
currentOffset += batch.length;
|
|
||||||
|
|
||||||
|
return {
|
||||||
|
chunks: data.chunks || [],
|
||||||
|
positions: adjustedPositions,
|
||||||
|
tokens: data.usage?.tokens || 0
|
||||||
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
handleSegmentationError(error);
|
handleSegmentationError(error);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Wait for all batches to complete
|
||||||
|
const batchResults = await Promise.all(batchPromises);
|
||||||
|
|
||||||
|
// Aggregate results
|
||||||
|
const allChunks = [];
|
||||||
|
const allChunkPositions = [];
|
||||||
|
let totalTokens = 0;
|
||||||
|
|
||||||
|
for (const result of batchResults) {
|
||||||
|
if (returnChunks) {
|
||||||
|
allChunks.push(...result.chunks);
|
||||||
|
}
|
||||||
|
allChunkPositions.push(...result.positions);
|
||||||
|
totalTokens += result.tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Track total token usage for all batches
|
// Track total token usage for all batches
|
||||||
|
|||||||
@@ -721,7 +721,8 @@ export function repairMarkdownFinal(markdown: string): string {
|
|||||||
|
|
||||||
// remove any '<27>'
|
// remove any '<27>'
|
||||||
repairedMarkdown = repairedMarkdown.replace(/<2F>/g, '');
|
repairedMarkdown = repairedMarkdown.replace(/<2F>/g, '');
|
||||||
|
// remove any <center> tags
|
||||||
|
repairedMarkdown = repairedMarkdown.replace(/<\/?center>/g, '');
|
||||||
|
|
||||||
// Step 1: Handle <hr> and <br> tags outside tables
|
// Step 1: Handle <hr> and <br> tags outside tables
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user