From 46ee5955d3b885193581735cf4f9defa361f308a Mon Sep 17 00:00:00 2001
From: Sha Zhou
Date: Wed, 13 Aug 2025 16:01:54 +0800
Subject: [PATCH] fix emoji trimming in embeddings processing

---
Review note: the line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy. Verify the context lines against the target
file before applying.

 src/tools/embeddings.ts | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/tools/embeddings.ts b/src/tools/embeddings.ts
index 350ca5d..8ac8c6c 100644
--- a/src/tools/embeddings.ts
+++ b/src/tools/embeddings.ts
@@ -82,7 +82,14 @@ async function getBatchEmbeddingsWithRetry(
   const batchEmbeddings: number[][] = [];
   let batchTokens = 0;
   let retryCount = 0;
-  let textsToProcess = [...batchTexts]; // Copy the original texts
+  let textsToProcess = [...batchTexts].map(item => {
+    if (typeof item === 'string') {
+      return trimLeadingSymbols(item);
+    } else {
+      const key = Object.keys(item)[0];
+      return key === 'text' ? { text: trimLeadingSymbols(item[key]) } : item;
+    }
+  }); // Copy the original texts
   let indexMap = new Map(); // Map to keep track of original indices
 
   // Initialize indexMap with original indices
@@ -241,4 +248,20 @@ function truncateInputString(input: string | Record): string {
   } else {
     return Object.values(input)[0].slice(0, 50);
   }
+}
+
+/**
+ * Removes the leading run of emoji, pictographic symbols, variation selectors,
+ * zero-width joiners and whitespace from `str`.
+ *
+ * Only the start of the string is trimmed; symbols that appear after the first
+ * ordinary character are left in place.
+ *
+ * @param str - Text to clean.
+ * @returns `str` without its leading symbol/whitespace run.
+ */
+function trimLeadingSymbols(str: string): string {
+  // U+1F000-U+1FAFF also covers Symbols and Pictographs Extended-A, and U+200D
+  // keeps ZWJ sequences from being cut in half and leaving a joiner at the front.
+  return str.replace(/^(?:[\u{1F000}-\u{1FAFF}]|[\u{2600}-\u{27BF}]|[\u{FE00}-\u{FE0F}]|[\u{DFE5}]|\u{200D}|\s)+/u, '');
 }
\ No newline at end of file