fix: broken cn chars

This commit is contained in:
Han Xiao 2025-04-03 11:07:39 +08:00
parent 23e917b0fa
commit c262f1af93
2 changed files with 24 additions and 16 deletions

View File

@ -1,29 +1,18 @@
import { generateText } from "ai";
import { getModel } from "../config";
import {TrackerContext} from "../types";
import {detectBrokenUnicodeViaFileIO} from "../utils/text-tools";
/**
 * Checks a string for broken Unicode by round-tripping it through UTF-8 in memory.
 *
 * The string is encoded with `TextEncoder` and decoded again with a non-fatal
 * `TextDecoder`; the decoded text is then searched for U+FFFD, the Unicode
 * replacement character.
 *
 * @param str - Text to inspect.
 * @returns `broken` is true when the round-tripped text contains U+FFFD;
 *          `decoded` is the round-tripped text itself.
 */
function detectBrokenUnicodeInMemory(str: string): { broken: boolean; decoded: string } {
  // Use the browser or Node.js TextEncoder/TextDecoder APIs
  const encoder = new TextEncoder(); // Encodes to UTF-8
  // Non-fatal mode: invalid sequences are replaced with U+FFFD instead of throwing
  const decoder = new TextDecoder('utf-8', {fatal: false});

  // Round-trip the string through UTF-8 encoding
  const encoded = encoder.encode(str);
  const decoded = decoder.decode(encoded);

  // Check for the replacement character. It is written as an escape sequence so
  // the comparison cannot be corrupted if this source file is ever re-encoded.
  return {broken: decoded.includes('\uFFFD'), decoded};
}
/**
* Repairs markdown content containing U+FFFD replacement characters by using Gemini to guess the missing text
*/
export async function repairUnknownChars(mdContent: string, trackers?: TrackerContext): Promise<string> {
const { broken, decoded } = detectBrokenUnicodeInMemory(mdContent);
if (!broken) return mdContent;
const { broken, readStr } = await detectBrokenUnicodeViaFileIO(mdContent);
if (!broken) return readStr;
console.log("Detected broken unicode in output, attempting to repair...");
let repairedContent = decoded;
let repairedContent = readStr;
let remainingUnknowns = true;
let iterations = 0;
@ -89,7 +78,7 @@ So what was the original text between these two contexts?`,
// Validate the replacement
if (
replacement === "UNKNOWN" ||
detectBrokenUnicodeInMemory(replacement).broken ||
(await detectBrokenUnicodeViaFileIO(replacement)).broken ||
replacement.length > unknownCount * 4
) {
console.log(`Skipping invalid replacement ${replacement} at position ${position}`);

View File

@ -1,6 +1,7 @@
import {AnswerAction, KnowledgeItem, Reference} from "../types";
import i18nJSON from './i18n.json';
import {JSDOM} from 'jsdom';
import fs from "fs/promises";
export function buildMdFromAnswer(answer: AnswerAction) {
@ -804,3 +805,21 @@ export function repairMarkdownFinal(markdown: string): string {
}
}
/**
 * Checks a string for broken Unicode by writing it to a temporary UTF-8 file
 * and reading it back.
 *
 * The temporary file is created in the current working directory under a name
 * built from the current timestamp and a random suffix, and is removed again
 * before the function settles, whether the write/read succeeded or not.
 *
 * @param str - Text to inspect.
 * @returns `broken` is true when the text read back contains U+FFFD (the
 *          Unicode replacement character); `readStr` is the text read back.
 * @throws Propagates any file-system error raised while writing, reading or
 *         removing the temporary file.
 */
export async function detectBrokenUnicodeViaFileIO(str: string): Promise<{ broken: boolean; readStr: string }> {
  // Create a unique filename using timestamp and random string
  const timestamp = Date.now();
  const randomStr = Math.random().toString(36).substring(2, 10);
  const tempFilePath = `./temp_unicode_check_${timestamp}_${randomStr}.txt`;

  try {
    // Write the string to a file (forcing encoding/decoding)
    await fs.writeFile(tempFilePath, str, 'utf8');

    // Read it back
    const readStr = await fs.readFile(tempFilePath, 'utf8');

    // Check for the replacement character. It is written as an escape sequence
    // so the comparison cannot be corrupted if this source file is re-encoded.
    return {broken: readStr.includes('\uFFFD'), readStr};
  } finally {
    // Clean up even when the write or read failed. `force: true` ignores a
    // missing file, so a failed write is not masked by an ENOENT from cleanup.
    await fs.rm(tempFilePath, {force: true});
  }
}