From c2dbfc65a3479b92484c81b1554c881c89f4863e Mon Sep 17 00:00:00 2001 From: Sha Zhou Date: Fri, 22 Aug 2025 16:04:54 +0800 Subject: [PATCH] fix late chunk error --- src/tools/embeddings.ts | 2 +- src/tools/jina-latechunk.ts | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/tools/embeddings.ts b/src/tools/embeddings.ts index 5be37cc..7d80da6 100644 --- a/src/tools/embeddings.ts +++ b/src/tools/embeddings.ts @@ -89,7 +89,7 @@ async function getBatchEmbeddingsWithRetry( const key = Object.keys(item)[0]; return key === 'text' ? { text: trimSymbols(item[key]) } : item; } - }).filter(item => typeof item !== 'string' || item.trim()); // Copy the original texts + }); // Copy the original texts let indexMap = new Map(); // Map to keep track of original indices // Initialize indexMap with original indices diff --git a/src/tools/jina-latechunk.ts b/src/tools/jina-latechunk.ts index 8d02829..d4cb13a 100644 --- a/src/tools/jina-latechunk.ts +++ b/src/tools/jina-latechunk.ts @@ -1,7 +1,7 @@ import { TrackerContext } from "../types"; import { Schemas } from "../utils/schemas"; import { cosineSimilarity } from "./cosine"; -import { getEmbeddings } from "./embeddings"; +import { getEmbeddings, trimSymbols } from "./embeddings"; import { logError, logDebug } from '../logging'; // Refactored cherryPick function @@ -21,7 +21,12 @@ export async function cherryPick(question: string, longContext: string, options: // Split the longContext into chunks of chunkSize const chunks: string[] = []; for (let i = 0; i < longContext.length; i += chunkSize) { - chunks.push(longContext.substring(i, Math.min(i + chunkSize, longContext.length))); + const str = longContext.substring(i, Math.min(i + chunkSize, longContext.length)); + const trimmedStr = trimSymbols(str); + if (trimmedStr.trim().length === 0) { + continue; // Skip empty chunks + } + chunks.push(str); } logDebug(`late chunking enabled! num chunks: ${chunks.length}`);