Mirror of https://github.com/jina-ai/node-DeepResearch.git (synced 2025-12-26 06:28:56 +08:00)
refactor: v2
parent c7b42fb150
commit 08bf9e22fa
@@ -1,75 +1,208 @@
 import axios from 'axios';
-import { TokenTracker } from "../utils/token-tracker";
-import { JINA_API_KEY } from "../config";
+import {TokenTracker} from "../utils/token-tracker";
+import {JINA_API_KEY} from "../config";
+import {TrackerContext} from "../types";
 
 /**
  * Segments text into chunks, handling text of arbitrary length by batching
  * @param content Text to segment
  * @param tracker Context for tracking token usage
  * @param maxChunkLength Maximum length of each chunk (passed to Jina API)
  * @param returnChunks Whether to return chunks in the API response
- * @returns Object containing chunks and their positions
+ * @returns Object containing chunks and chunk_positions matching Jina API format
  */
 export async function segmentText(
   content: string,
   tracker: TrackerContext,
   maxChunkLength = 1000,
   returnChunks = true,
-) {
+): Promise<{
+  chunks: string[];
+  chunk_positions: [number, number][];
+}> {
   if (!content.trim()) {
     throw new Error('Content cannot be empty');
   }
 
-  try {
-    const { data } = await axios.post(
-      'https://api.jina.ai/v1/segment',
-      {
-        content,
-        return_chunks: returnChunks,
-        max_chunk_length: maxChunkLength
-      },
-      {
-        headers: {
-          'Content-Type': 'application/json',
-          'Authorization': `Bearer ${JINA_API_KEY}`,
-        },
-        timeout: 10000,
-        responseType: 'json'
-      }
-    );
-
-    if (!data) {
-      throw new Error('Invalid response data');
-    }
-
-    console.log('Segment:', {
-      numChunks: data.num_chunks,
-      numTokens: data.num_tokens,
-      tokenizer: data.tokenizer
-    });
-
-    const tokens = data.usage?.tokens || 0;
-    const tokenTracker = tracker?.tokenTracker || new TokenTracker();
-    tokenTracker.trackUsage('segment', {
-      totalTokens: tokens,
-      promptTokens: content.length,
-      completionTokens: tokens
-    });
-
-    // Return only chunks and chunk_positions
-    return {
-      chunks: data.chunks,
-      chunk_positions: data.chunk_positions
-    };
-  } catch (error) {
-    if (axios.isAxiosError(error)) {
-      if (error.response) {
-        const status = error.response.status;
-        const errorData = error.response.data;
-
-        if (status === 402) {
-          throw new Error(errorData?.readableMessage || 'Insufficient balance');
-        }
-        throw new Error(errorData?.readableMessage || `HTTP Error ${status}`);
-      } else if (error.request) {
-        throw new Error('No response received from server');
-      } else {
-        throw new Error(`Request failed: ${error.message}`);
-      }
-    }
-    throw error;
-  }
-}
+  // Initialize token tracker
+  const tokenTracker = tracker?.tokenTracker || new TokenTracker();
+
+  // Maximum size to send in a single API request (slightly under 64K to be safe)
+  const MAX_BATCH_SIZE = 60000;
+
+  // Final results
+  const allChunks = [];
+  const allChunkPositions = [];
+  let totalTokens = 0;
+
+  // Split content into batches
+  const batches = splitTextIntoBatches(content, MAX_BATCH_SIZE);
+  console.log(`Split content into ${batches.length} batches`);
+
+  // Process each batch sequentially
+  let currentOffset = 0;
+  for (let i = 0; i < batches.length; i++) {
+    const batch = batches[i];
+    console.log(`Processing batch ${i + 1}/${batches.length} (size: ${batch.length})`);
+
+    try {
+      const {data} = await axios.post(
+        'https://api.jina.ai/v1/segment',
+        {
+          content: batch,
+          return_chunks: returnChunks,
+          max_chunk_length: maxChunkLength
+        },
+        {
+          headers: {
+            'Content-Type': 'application/json',
+            'Authorization': `Bearer ${JINA_API_KEY}`,
+          },
+          timeout: 10000,
+          responseType: 'json'
+        }
+      );
+
+      if (!data) {
+        throw new Error('Invalid response data');
+      }
+
+      console.log(`Batch ${i + 1} result:`, {
+        numChunks: data.num_chunks,
+        numTokens: data.num_tokens,
+        tokenizer: data.tokenizer
+      });
+
+      // Add chunks from this batch to the results
+      if (data.chunks && returnChunks) {
+        allChunks.push(...data.chunks);
+      }
+
+      // Adjust chunk positions to account for the offset of this batch
+      if (data.chunk_positions) {
+        const adjustedPositions = data.chunk_positions.map((position: [number, number]) => {
+          // The API returns chunk_positions as arrays of [start, end]
+          return [
+            position[0] + currentOffset,
+            position[1] + currentOffset
+          ] as [number, number];
+        });
+        allChunkPositions.push(...adjustedPositions);
+      }
+
+      // Track token usage
+      const batchTokens = data.usage?.tokens || 0;
+      totalTokens += batchTokens;
+
+      // Update the current offset for the next batch
+      currentOffset += batch.length;
+
+    } catch (error) {
+      handleSegmentationError(error);
+    }
+  }
+
+  // Track total token usage for all batches
+  tokenTracker.trackUsage('segment', {
+    totalTokens: totalTokens,
+    promptTokens: content.length,
+    completionTokens: totalTokens
+  });
+
+  return {
+    chunks: allChunks,
+    chunk_positions: allChunkPositions
+  };
+}
+
+/**
+ * Splits text into batches that fit within the specified size limit
+ * Tries to split at paragraph boundaries when possible
+ */
+function splitTextIntoBatches(text: string, maxBatchSize: number): string[] {
+  const batches = [];
+  let currentIndex = 0;
+
+  while (currentIndex < text.length) {
+    if (currentIndex + maxBatchSize >= text.length) {
+      // If the remaining text fits in one batch, add it and we're done
+      batches.push(text.slice(currentIndex));
+      break;
+    }
+
+    // Find a good split point - preferably at a paragraph break
+    // Look for the last paragraph break within the max batch size
+    let endIndex = currentIndex + maxBatchSize;
+
+    // Try to find paragraph breaks (double newline)
+    const paragraphBreakIndex = text.lastIndexOf('\n\n', endIndex);
+    if (paragraphBreakIndex > currentIndex && paragraphBreakIndex <= endIndex - 10) {
+      // Found a paragraph break that's at least 10 chars before the max size
+      // This avoids tiny splits at the end of a batch
+      endIndex = paragraphBreakIndex + 2; // Include the double newline
+    } else {
+      // If no paragraph break, try a single newline
+      const newlineIndex = text.lastIndexOf('\n', endIndex);
+      if (newlineIndex > currentIndex && newlineIndex <= endIndex - 5) {
+        endIndex = newlineIndex + 1; // Include the newline
+      } else {
+        // If no newline, try a sentence break
+        const sentenceBreakIndex = findLastSentenceBreak(text, currentIndex, endIndex);
+        if (sentenceBreakIndex > currentIndex) {
+          endIndex = sentenceBreakIndex;
+        }
+        // If no sentence break found, we'll just use the max batch size
+      }
+    }
+
+    batches.push(text.slice(currentIndex, endIndex));
+    currentIndex = endIndex;
+  }
+
+  return batches;
+}
+
+/**
+ * Finds the last sentence break (period, question mark, or exclamation point followed by space)
+ * within the given range
+ */
+function findLastSentenceBreak(text: string, startIndex: number, endIndex: number): number {
+  // Look for ". ", "? ", or "! " patterns
+  for (let i = endIndex; i > startIndex; i--) {
+    if ((text[i - 2] === '.' || text[i - 2] === '?' || text[i - 2] === '!') &&
+      text[i - 1] === ' ') {
+      return i;
+    }
+  }
+  return -1; // No sentence break found
+}
+
+/**
+ * Handles errors from the segmentation API
+ */
+function handleSegmentationError(error: any): never {
+  if (axios.isAxiosError(error)) {
+    if (error.response) {
+      const status = error.response.status;
+      const errorData = error.response.data;
+
+      if (status === 402) {
+        throw new Error(errorData?.readableMessage || 'Insufficient balance');
+      }
+      throw new Error(errorData?.readableMessage || `HTTP Error ${status}`);
+    } else if (error.request) {
+      throw new Error('No response received from server');
+    } else {
+      throw new Error(`Request failed: ${error.message}`);
+    }
+  }
+  throw error;
+}
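The core bookkeeping this refactor adds is the currentOffset adjustment: chunk positions returned by the Segmenter API are local to each batch, so each batch's positions are shifted by the combined length of all previously processed batches before being collected. A minimal sketch of that arithmetic on mock data (illustrative only, no API call; the names are made up for the example):

// Mimics the per-batch offset adjustment that segmentText performs above.
const mockBatches = ['First paragraph.\n\n', 'Second paragraph.'];
const globalPositions: [number, number][] = [];
let currentOffset = 0;

for (const batch of mockBatches) {
  // Pretend the segmenter returned one chunk spanning the whole batch (local coordinates).
  const localPositions: [number, number][] = [[0, batch.length]];
  globalPositions.push(
    ...localPositions.map(([start, end]) =>
      [start + currentOffset, end + currentOffset] as [number, number])
  );
  currentOffset += batch.length;
}

console.log(globalPositions); // [[0, 18], [18, 35]] -- offsets in the concatenated text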
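For reference, a minimal usage sketch of the refactored segmentText. This is not part of the commit: it assumes TrackerContext exposes a tokenTracker field (as the tracker?.tokenTracker access above implies) and that the function is exported from a local "./segment" module, which is a hypothetical path.

import {TokenTracker} from "../utils/token-tracker";
import {TrackerContext} from "../types";
import {segmentText} from "./segment"; // hypothetical module path

async function segmentReport(longText: string): Promise<void> {
  // Assumed shape: TrackerContext carries a TokenTracker instance.
  const context = {tokenTracker: new TokenTracker()} as TrackerContext;

  // Texts longer than the ~60K-character batch limit are split and re-assembled transparently.
  const {chunks, chunk_positions} = await segmentText(longText, context, 1000, true);

  // chunk_positions[i] holds the [start, end] offsets of chunks[i] in the original text.
  console.log(`Received ${chunks.length} chunks`);
  console.log(chunks[0], chunk_positions[0]);
}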