mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: broken cn chars
This commit is contained in:
parent
23e917b0fa
commit
c262f1af93
@ -1,29 +1,18 @@
|
||||
import { generateText } from "ai";
|
||||
import { getModel } from "../config";
|
||||
import {TrackerContext} from "../types";
|
||||
import {detectBrokenUnicodeViaFileIO} from "../utils/text-tools";
|
||||
|
||||
/**
 * Detects broken Unicode in a string by round-tripping it through UTF-8 in memory.
 *
 * The string is encoded with `TextEncoder`, decoded again with a non-fatal
 * `TextDecoder`, and the decoded text is searched for the Unicode replacement
 * character (U+FFFD).
 *
 * @param str - The text to check.
 * @returns An object whose `broken` flag is true when the round-tripped text
 *   contains U+FFFD, and whose `decoded` field is the round-tripped text.
 */
function detectBrokenUnicodeInMemory(str: string) {
  // Written as an escape so the check keeps working even if this source file
  // is itself passed through a lossy re-encoding.
  const replacementChar = '\uFFFD';

  // Use the browser or Node.js TextEncoder/TextDecoder APIs
  const encoder = new TextEncoder(); // Encodes to UTF-8
  const decoder = new TextDecoder('utf-8', {fatal: false}); // Non-fatal: invalid sequences become U+FFFD instead of throwing

  // Round-trip the string through UTF-8 encoding
  const encoded = encoder.encode(str);
  const decoded = decoder.decode(encoded);

  // Now check for the replacement character
  return {broken: decoded.includes(replacementChar), decoded};
}
|
||||
|
||||
/**
|
||||
* Repairs markdown content with <EFBFBD> characters by using Gemini to guess the missing text
|
||||
*/
|
||||
export async function repairUnknownChars(mdContent: string, trackers?: TrackerContext): Promise<string> {
|
||||
const { broken, decoded } = detectBrokenUnicodeInMemory(mdContent);
|
||||
if (!broken) return mdContent;
|
||||
const { broken, readStr } = await detectBrokenUnicodeViaFileIO(mdContent);
|
||||
if (!broken) return readStr;
|
||||
console.log("Detected broken unicode in output, attempting to repair...");
|
||||
|
||||
let repairedContent = decoded;
|
||||
let repairedContent = readStr;
|
||||
let remainingUnknowns = true;
|
||||
let iterations = 0;
|
||||
|
||||
@ -89,7 +78,7 @@ So what was the original text between these two contexts?`,
|
||||
// Validate the replacement
|
||||
if (
|
||||
replacement === "UNKNOWN" ||
|
||||
detectBrokenUnicodeInMemory(replacement).broken ||
|
||||
(await detectBrokenUnicodeViaFileIO(replacement)).broken ||
|
||||
replacement.length > unknownCount * 4
|
||||
) {
|
||||
console.log(`Skipping invalid replacement ${replacement} at position ${position}`);
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import {AnswerAction, KnowledgeItem, Reference} from "../types";
|
||||
import i18nJSON from './i18n.json';
|
||||
import {JSDOM} from 'jsdom';
|
||||
import fs from "fs/promises";
|
||||
|
||||
|
||||
export function buildMdFromAnswer(answer: AnswerAction) {
|
||||
@ -804,3 +805,21 @@ export function repairMarkdownFinal(markdown: string): string {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Detects broken Unicode in a string by round-tripping it through a UTF-8 file.
 *
 * The string is written to a uniquely named temporary file in the current
 * working directory, read back as UTF-8, and the text that was read is
 * searched for the Unicode replacement character (U+FFFD). The temporary file
 * is removed even when writing or reading fails.
 *
 * @param str - The text to check.
 * @returns An object whose `broken` flag is true when the text read back
 *   contains U+FFFD, and whose `readStr` field is the text read back.
 * @throws Propagates any file-system error raised while writing, reading or
 *   removing the temporary file.
 */
export async function detectBrokenUnicodeViaFileIO(str: string) {
  // Written as an escape so the check keeps working even if this source file
  // is itself passed through a lossy re-encoding.
  const replacementChar = '\uFFFD';

  // Create a unique filename using timestamp and random string
  const timestamp = Date.now();
  const randomStr = Math.random().toString(36).substring(2, 10);
  const tempFilePath = `./temp_unicode_check_${timestamp}_${randomStr}.txt`;

  try {
    // Write the string to a file (forcing encoding/decoding)
    await fs.writeFile(tempFilePath, str, 'utf8');

    // Read it back
    const readStr = await fs.readFile(tempFilePath, 'utf8');

    // Now check for the visible replacement character
    return {broken: readStr.includes(replacementChar), readStr};
  } finally {
    // Clean up on every path; `force` keeps a missing file (e.g. the write
    // itself failed) from masking the original error.
    await fs.rm(tempFilePath, {force: true});
  }
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user