fix encoding error for embeddings

This commit is contained in:
Sha Zhou 2025-08-22 15:37:52 +08:00
parent a4cb9139ed
commit 65636fbcdf

View File

@@ -89,7 +89,7 @@ async function getBatchEmbeddingsWithRetry(
const key = Object.keys(item)[0]; const key = Object.keys(item)[0];
return key === 'text' ? { text: trimSymbols(item[key]) } : item; return key === 'text' ? { text: trimSymbols(item[key]) } : item;
} }
}); // Copy the original texts }).filter(item => typeof item !== 'string' || item.trim()); // Copy the original texts
let indexMap = new Map<number, number>(); // Map to keep track of original indices let indexMap = new Map<number, number>(); // Map to keep track of original indices
// Initialize indexMap with original indices // Initialize indexMap with original indices
@@ -251,7 +251,7 @@ function truncateInputString(input: string | Record<string, string>): string {
} }
} }
/**
 * Collapses every run of symbol, punctuation, separator, control/format and
 * pictographic (emoji) characters in `str` into a single space.
 *
 * Letters and digits are left untouched. The result is not trimmed: a leading
 * or trailing run of matched characters becomes a leading or trailing space.
 *
 * @param str - The text to clean.
 * @returns `str` with each run of matched characters replaced by one space.
 */
export function trimSymbols(str: string): string {
  // `\p{Extended_Pictographic}` is used rather than `\p{Emoji}`: the Unicode
  // `Emoji` property also contains the ASCII digits 0-9 (they are keycap
  // bases), so `\p{Emoji}` would silently strip every number from the text.
  const regex = /[\p{S}\p{P}\p{Z}\p{C}\p{Extended_Pictographic}]+/gu;
  return str.replace(regex, ' ');
}