From 582292ac4b807e31b152105bf3d6ba3b92833076 Mon Sep 17 00:00:00 2001
From: Sha Zhou
Date: Thu, 21 Aug 2025 15:52:39 +0800
Subject: [PATCH] fix unicode issue for embeddings

---
 src/tools/embeddings.ts | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/tools/embeddings.ts b/src/tools/embeddings.ts
index 2d84a7a..256e7bd 100644
--- a/src/tools/embeddings.ts
+++ b/src/tools/embeddings.ts
@@ -84,10 +84,10 @@ async function getBatchEmbeddingsWithRetry(
   let retryCount = 0;
   let textsToProcess = [...batchTexts].map(item => {
     if (typeof item === 'string') {
-      return trimLeadingSymbols(item);
+      return trimSymbols(item);
     } else {
       const key = Object.keys(item)[0];
-      return key === 'text' ? { text: trimLeadingSymbols(item[key]) } : item;
+      return key === 'text' ? { text: trimSymbols(item[key]) } : item;
     }
   }); // Copy the original texts
   let indexMap = new Map(); // Map to keep track of original indices
@@ -251,7 +251,16 @@ function truncateInputString(input: string | Record): string {
   }
 }
 
-function trimLeadingSymbols(str: string): string {
-  const regex = /^[\p{S}\p{P}\p{Z}\p{C}\p{Emoji}]+/u;
-  return str.replace(regex, '');
+/**
+ * Replaces every run of symbol, punctuation, separator, "other"
+ * (control/format/unassigned) and pictographic-emoji characters with one space.
+ *
+ * @param str - Text to clean.
+ * @returns `str` with each matched run collapsed to a single space; not trimmed.
+ */
+function trimSymbols(str: string): string {
+  // \p{Extended_Pictographic} rather than \p{Emoji}: the Emoji property also
+  // covers the ASCII digits 0-9, so a global match would blank out every number.
+  const regex = /[\p{S}\p{P}\p{Z}\p{C}\p{Extended_Pictographic}]+/gu;
+  return str.replace(regex, ' ');
 }
\ No newline at end of file