refactor: replace segmentText with chunkText in multiple files

Han Xiao 2025-06-10 19:09:28 -07:00
parent 4810393923
commit a59a524d1c
5 changed files with 64 additions and 193 deletions

View File

@@ -1,4 +1,4 @@
-import { segmentText } from './segment';
+import { chunkText } from '../tools/segment';
 import { ImageObject, ImageReference, Reference, TrackerContext, WebContent } from "../types";
 import { Schemas } from "../utils/schemas";
 import { cosineSimilarity, jaccardRank } from "./cosine";
@@ -22,7 +22,7 @@ export async function buildReferences(
   // Step 1: Chunk the answer
   logDebug(`[buildReferences] Step 1: Chunking answer text`);
-  const { chunks: answerChunks, chunk_positions: answerChunkPositions } = await segmentText(answer, context);
+  const { chunks: answerChunks, chunk_positions: answerChunkPositions } = chunkText(answer);
   logDebug(`[buildReferences] Answer segmented into ${answerChunks.length} chunks`);

   // Step 2: Prepare all web content chunks, filtering out those below minimum length
@@ -385,7 +385,7 @@ export async function buildImageReferences(
   // Step 1: Chunk the answer
   logDebug(`[buildImageReferences] Step 1: Chunking answer text`);
-  const { chunks: answerChunks, chunk_positions: answerChunkPositions } = await segmentText(answer, context);
+  const { chunks: answerChunks, chunk_positions: answerChunkPositions } = chunkText(answer);
   logDebug(`[buildImageReferences] Answer segmented into ${answerChunks.length} chunks`);

   // Step 2: Prepare image content
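
The call-site change is mechanical but worth spelling out: `segmentText` was async and took a `TrackerContext` for token accounting, while `chunkText` is a pure local function, so both the `await` and the `context` argument disappear. A minimal sketch of the new call shape under the default options (the `splitAnswer` wrapper is hypothetical, not part of the commit):

```typescript
import { chunkText } from '../tools/segment';

// Hypothetical helper showing the new synchronous call shape.
function splitAnswer(answer: string): { chunks: string[]; chunk_positions: [number, number][] } {
  // No await, no TrackerContext: chunking is now plain string processing.
  // Defaults: split on newlines, drop chunks shorter than 80 characters.
  return chunkText(answer);
}
```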

View File

@@ -2,7 +2,7 @@ import { TrackerContext } from "../types";
 import { Schemas } from "../utils/schemas";
 import { cosineSimilarity } from "./cosine";
 import { getEmbeddings } from "./embeddings";
-import { logInfo, logError, logDebug, logWarning } from '../logging';
+import { logError, logDebug } from '../logging';

 // Refactored cherryPick function
 export async function cherryPick(question: string, longContext: string, options: any = {}, trackers: TrackerContext, schemaGen: Schemas, url: string) {

View File

@@ -2,7 +2,7 @@ import { TokenTracker } from "../utils/token-tracker";
 import { ReadResponse } from '../types';
 import { JINA_API_KEY } from "../config";
 import axiosClient from "../utils/axios-client";
-import { logInfo, logError, logDebug, logWarning } from '../logging';
+import { logDebug } from '../logging';

 export async function readUrl(
   url: string,
@@ -63,7 +63,6 @@ export async function readUrl(
     return { response: data };
   } catch (error: any) {
-    logError(`Error reading URL: ${error.message}`);
     throw error;
   }
 }

View File

@@ -1,183 +1,65 @@
-import { TokenTracker } from "../utils/token-tracker";
-import { JINA_API_KEY } from "../config";
-import { TrackerContext } from "../types";
-import axiosClient from "../utils/axios-client";
-import { logInfo, logError, logDebug, logWarning } from '../logging';
-
-export async function segmentText(
-  content: string,
-  tracker: TrackerContext,
-  maxChunkLength = 500,
-  returnChunks = true,
-): Promise<{
-  chunks: string[];
-  chunk_positions: [number, number][];
-}> {
-  if (!content.trim()) {
-    throw new Error('Content cannot be empty');
-  }
-
-  // Initialize token tracker
-  const tokenTracker = tracker?.tokenTracker || new TokenTracker();
-
-  // Maximum size to send in a single API request (slightly under 64K to be safe)
-  const MAX_BATCH_SIZE = 60000;
-
-  // Split content into batches
-  const batches = splitTextIntoBatches(content, MAX_BATCH_SIZE);
-  logDebug(`Segmenting ${batches.length} batches`);
-
-  // Calculate offsets for each batch upfront
-  const batchOffsets: number[] = [];
-  let currentOffset = 0;
-  for (const batch of batches) {
-    batchOffsets.push(currentOffset);
-    currentOffset += batch.length;
-  }
-
-  // Process all batches in parallel
-  const batchPromises = batches.map(async (batch, i) => {
-    logDebug(`[Segment] Processing batch ${i + 1}/${batches.length} (size: ${batch.length})`);
-    try {
-      const { data } = await axiosClient.post(
-        'https://api.jina.ai/v1/segment',
-        {
-          content: batch,
-          return_chunks: returnChunks,
-          max_chunk_length: maxChunkLength
-        },
-        {
-          headers: {
-            'Content-Type': 'application/json',
-            'Authorization': `Bearer ${JINA_API_KEY}`,
-          },
-          timeout: 10000,
-          responseType: 'json'
-        }
-      );
-
-      if (!data) {
-        throw new Error('Invalid response data');
-      }
-
-      logDebug(`Batch ${i + 1} result:`, {
-        numChunks: data.num_chunks,
-        numTokens: data.num_tokens,
-        tokenizer: data.tokenizer
-      });
-
-      // Get the batch offset
-      const offset = batchOffsets[i];
-
-      // Adjust chunk positions to account for the offset of this batch
-      const adjustedPositions = data.chunk_positions
-        ? data.chunk_positions.map((position: [number, number]) => {
-          return [
-            position[0] + offset,
-            position[1] + offset
-          ] as [number, number];
-        })
-        : [];
-
-      return {
-        chunks: data.chunks || [],
-        positions: adjustedPositions,
-        tokens: data.usage?.tokens || 0
-      };
-    } catch (error: any) {
-      logError(`Error processing batch ${i + 1}: ${error.message}`);
-      throw error;
-    }
-  });
-
-  // Wait for all batches to complete
-  const batchResults = await Promise.all(batchPromises);
-
-  // Aggregate results
-  const allChunks = [];
-  const allChunkPositions = [];
-  let totalTokens = 0;
-  for (const result of batchResults) {
-    if (returnChunks) {
-      allChunks.push(...result.chunks);
-    }
-    allChunkPositions.push(...result.positions);
-    totalTokens += result.tokens;
-  }
-
-  // Track total token usage for all batches
-  tokenTracker.trackUsage('segment', {
-    totalTokens: totalTokens,
-    promptTokens: content.length,
-    completionTokens: totalTokens
-  });
-
-  return {
-    chunks: allChunks,
-    chunk_positions: allChunkPositions
-  };
-}
-
-/**
- * Splits text into batches that fit within the specified size limit
- * Tries to split at paragraph boundaries when possible
- */
-function splitTextIntoBatches(text: string, maxBatchSize: number): string[] {
-  const batches = [];
-  let currentIndex = 0;
-
-  while (currentIndex < text.length) {
-    if (currentIndex + maxBatchSize >= text.length) {
-      // If the remaining text fits in one batch, add it and we're done
-      batches.push(text.slice(currentIndex));
-      break;
-    }
-
-    // Find a good split point - preferably at a paragraph break
-    // Look for the last paragraph break within the max batch size
-    let endIndex = currentIndex + maxBatchSize;
-
-    // Try to find paragraph breaks (double newline)
-    const paragraphBreakIndex = text.lastIndexOf('\n\n', endIndex);
-    if (paragraphBreakIndex > currentIndex && paragraphBreakIndex <= endIndex - 10) {
-      // Found a paragraph break that's at least 10 chars before the max size
-      // This avoids tiny splits at the end of a batch
-      endIndex = paragraphBreakIndex + 2; // Include the double newline
-    } else {
-      // If no paragraph break, try a single newline
-      const newlineIndex = text.lastIndexOf('\n', endIndex);
-      if (newlineIndex > currentIndex && newlineIndex <= endIndex - 5) {
-        endIndex = newlineIndex + 1; // Include the newline
-      } else {
-        // If no newline, try a sentence break
-        const sentenceBreakIndex = findLastSentenceBreak(text, currentIndex, endIndex);
-        if (sentenceBreakIndex > currentIndex) {
-          endIndex = sentenceBreakIndex;
-        }
-        // If no sentence break found, we'll just use the max batch size
-      }
-    }
-
-    batches.push(text.slice(currentIndex, endIndex));
-    currentIndex = endIndex;
-  }
-
-  return batches;
-}
-
-/**
- * Finds the last sentence break (period, question mark, or exclamation point followed by space)
- * within the given range
- */
-function findLastSentenceBreak(text: string, startIndex: number, endIndex: number): number {
-  // Look for ". ", "? ", or "! " patterns
-  for (let i = endIndex; i > startIndex; i--) {
-    if ((text[i - 2] === '.' || text[i - 2] === '?' || text[i - 2] === '!') &&
-      text[i - 1] === ' ') {
-      return i;
-    }
-  }
-  return -1; // No sentence break found
-}
+interface ChunkOptions {
+  type?: 'newline' | 'punctuation' | 'characters' | 'regex';
+  value?: string | number;
+  minChunkLength?: number;
+}
+
+export function chunkText(text: string, options: ChunkOptions = {}): {
+  chunks: string[];
+  chunk_positions: [number, number][];
+} {
+  let chunks: string[] = [];
+  const minChunkLength = options.minChunkLength || 80;
+  const type = options.type || 'newline';
+
+  switch (type) {
+    case 'newline':
+      chunks = text.split('\n').filter(chunk => chunk.trim().length > 0);
+      break;
+    case 'punctuation':
+      // Split by common Chinese and English punctuation while preserving them
+      chunks = text.split(/(?<=[.!?。!?])/).filter(chunk => chunk.trim().length > 0);
+      break;
+    case 'characters':
+      const chunkSize = Number(options.value) || 1000;
+      for (let i = 0; i < text.length; i += chunkSize) {
+        chunks.push(text.slice(i, i + chunkSize));
+      }
+      break;
+    case 'regex':
+      if (!options.value || typeof options.value !== 'string') {
+        throw new Error('Regex pattern is required for regex chunking');
+      }
+      chunks = text.split(new RegExp(options.value)).filter(chunk => chunk.trim().length > 0);
+      break;
+    default:
+      throw new Error('Invalid chunking type');
+  }
+
+  // Filter out chunks that are too short
+  const filteredChunks: string[] = [];
+  const filteredPositions: [number, number][] = [];
+  let currentPos = 0;
+
+  for (const chunk of chunks) {
+    const startPos = text.indexOf(chunk, currentPos);
+    if (startPos === -1) continue; // Skip if chunk not found
+    const endPos = startPos + chunk.length;
+
+    // Only include chunks that meet the minimum length requirement
+    if (chunk.length >= minChunkLength) {
+      filteredChunks.push(chunk);
+      filteredPositions.push([startPos, endPos]);
+    }
+
+    currentPos = endPos;
+  }
+
+  return {
+    chunks: filteredChunks,
+    chunk_positions: filteredPositions
+  };
+}
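
For reference, a usage sketch of the new function (sample text and option values are illustrative, not from the commit). `type` selects the splitting strategy, `value` parameterizes the `characters` and `regex` modes, and `minChunkLength` (default 80) discards short fragments; `chunk_positions` records each surviving chunk's `[start, end)` offsets in the input:

```typescript
import { chunkText } from '../tools/segment';

const text = 'First paragraph of a crawled page.\nShort line.\nAnother long paragraph with enough characters to survive the default eighty-character minimum filter.';

// Default: newline splitting with the 80-character minimum.
const byLine = chunkText(text);

// Sentence-style splitting on English and Chinese punctuation.
const bySentence = chunkText(text, { type: 'punctuation', minChunkLength: 20 });

// Fixed 1000-character windows.
const byWindow = chunkText(text, { type: 'characters', value: 1000 });

// Custom splitter from any regex source string, e.g. blank-line paragraphs.
const byParagraph = chunkText(text, { type: 'regex', value: '\\n{2,}' });
```

The trade-off is explicit in the diff: no network round-trip to `https://api.jina.ai/v1/segment` and no token usage to track, at the cost of simpler, heuristic chunk boundaries.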

View File

@@ -7,7 +7,7 @@ import { cherryPick } from "../tools/jina-latechunk";
 import { formatDateBasedOnType } from "./date-tools";
 import { classifyText } from "../tools/jina-classify-spam";
 import { processImage } from "./image-tools";
-import { segmentText } from "../tools/segment";
+import { chunkText } from "../tools/segment";
 import axiosClient from "./axios-client";
 import { logError, logDebug, logWarning } from '../logging';
@@ -528,22 +528,12 @@ export async function processURLs(
       }

       // add to web contents
-      const { chunks, chunk_positions } = await segmentText(data.content, context);
-
-      // filter out the chunks that are too short, minChunkLength is 80
-      const minChunkLength = 80;
-      for (let i = 0; i < chunks.length; i++) {
-        if (chunks[i].length < minChunkLength) {
-          chunks.splice(i, 1);
-          chunk_positions.splice(i, 1);
-          i--;
-        }
-      }
+      const { chunks, chunk_positions } = chunkText(data.content);
       webContents[data.url] = {
         // full: data.content,
         chunks,
         chunk_positions,
         title: data.title
       }
     };

     // Add to knowledge base
     allKnowledge.push({
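
The ten-line cleanup in `processURLs` works because the minimum-length filter moved inside `chunkText`: its default `minChunkLength` of 80 matches the constant the old splice loop used, and filtering while positions are computed keeps `chunks[i]` and `chunk_positions[i]` aligned without the mutate-while-iterating `i--` dance. A small sketch of the equivalence (the sample content is made up):

```typescript
import { chunkText } from '../tools/segment';

// Stand-in for data.content: one boilerplate line, one substantive paragraph.
const content = 'Home | About | Contact\n' +
  'A substantive paragraph that easily clears the eighty-character minimum chunk length applied by the default filter.';

const { chunks, chunk_positions } = chunkText(content);
// chunks contains only the long paragraph; the nav line fell below 80 chars.
// chunk_positions[0] still points at that paragraph's offsets in `content`.
```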