fix: broken cn chars

This commit is contained in:
Han Xiao
2025-03-27 18:36:16 +08:00
parent 2bee1b6dda
commit 320f66697b
3 changed files with 108 additions and 8 deletions

View File

@@ -41,6 +41,7 @@ import {
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas"; import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools"; import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
import {fixMarkdown} from "./tools/md-fixer"; import {fixMarkdown} from "./tools/md-fixer";
import {repairUnknownChars} from "./tools/broken-ch-fixer";
async function sleep(ms: number) { async function sleep(ms: number) {
const seconds = Math.ceil(ms / 1000); const seconds = Math.ceil(ms / 1000);
@@ -139,7 +140,7 @@ ${context.join('\n')}
const urlList = sortSelectURLs(allURLs || [], 20); const urlList = sortSelectURLs(allURLs || [], 20);
if (allowRead && urlList.length > 0) { if (allowRead && urlList.length > 0) {
const urlListStr = urlList const urlListStr = urlList
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`) .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged.slice(0, 50)}"`)
.join('\n') .join('\n')
actionSections.push(` actionSections.push(`
@@ -957,12 +958,13 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
fixBadURLMdLinks( fixBadURLMdLinks(
fixCodeBlockIndentation( fixCodeBlockIndentation(
repairMarkdownFootnotesOuter( repairMarkdownFootnotesOuter(
await repairUnknownChars(
await fixMarkdown( await fixMarkdown(
buildMdFromAnswer((thisStep as AnswerAction)), buildMdFromAnswer((thisStep as AnswerAction)),
allKnowledge, allKnowledge,
context, context,
SchemaGen SchemaGen
)) ), context))
), ),
allURLs))); allURLs)));
} else { } else {

View File

@@ -0,0 +1,98 @@
import { generateText } from "ai";
import { getModel } from "../config";
import {TrackerContext} from "../types";
/** The Unicode replacement character (U+FFFD), written as an escape so the file's own encoding cannot corrupt it. */
const UNKNOWN_CHAR = '\uFFFD';

/** Upper bound on loop passes; every model call and every single-character removal consumes one pass. */
const MAX_REPAIR_ITERATIONS = 20;

/** Number of characters of surrounding text sent to the model on each side of a broken run. */
const REPAIR_CONTEXT_SIZE = 50;

/** A model answer longer than (run length * this factor) is rejected as implausible. */
const MAX_REPLACEMENT_RATIO = 4;

/** System prompt is identical for every run, so it is built once instead of once per loop pass. */
const REPAIR_SYSTEM_PROMPT = `You're helping fix a corrupted scanned markdown document that has stains (represented by ${UNKNOWN_CHAR}).
Looking at the surrounding context, determine the original text should be in place of the ${UNKNOWN_CHAR} symbols.
Rules:
1. ONLY output the exact replacement text - no explanations, quotes, or additional text
2. Keep your response appropriate to the length of the unknown sequence
3. Consider the document appears to be in Chinese if that's what the context suggests`;

/**
 * Repairs markdown content that contains U+FFFD replacement characters by asking
 * the 'fallback' model to guess the text each broken run originally held.
 *
 * Runs are handled left to right, one per loop pass:
 * - an accepted answer replaces the whole run of consecutive U+FFFD characters;
 * - a rejected answer or a failed model call leaves the text untouched, and the
 *   following passes then delete that run one character at a time (no further
 *   model call is made for it).
 *
 * The loop is capped at MAX_REPAIR_ITERATIONS passes, so the returned string may
 * still contain U+FFFD characters when the input has many broken runs.
 *
 * @param mdContent - Markdown text to repair; returned unchanged when it has no U+FFFD.
 * @param trackers - Tracker context; its tokenTracker records the usage of every model call.
 * @returns The markdown with broken runs replaced or removed as far as the pass budget allows.
 */
export async function repairUnknownChars(mdContent: string, trackers: TrackerContext): Promise<string> {
  if (!mdContent.includes(UNKNOWN_CHAR)) return mdContent;

  let repairedContent = mdContent;
  // Position handled by the most recent model attempt; seeing it again means that attempt changed nothing.
  let lastPosition = -1;

  for (let iteration = 1; iteration <= MAX_REPAIR_ITERATIONS; iteration++) {
    const position = repairedContent.indexOf(UNKNOWN_CHAR);
    if (position === -1) break;

    if (position === lastPosition) {
      // The previous attempt at this spot was rejected or threw: drop one placeholder
      // so the loop is guaranteed to make progress. lastPosition is intentionally kept,
      // so the rest of the same run is removed on the following passes.
      repairedContent = repairedContent.substring(0, position) +
        repairedContent.substring(position + 1);
      continue;
    }
    lastPosition = position;

    // Measure the run of consecutive placeholders starting at `position`.
    let unknownCount = 0;
    while (repairedContent[position + unknownCount] === UNKNOWN_CHAR) {
      unknownCount++;
    }

    // Text immediately before and after the run, given to the model as context.
    const runEnd = position + unknownCount;
    const leftContext = repairedContent.substring(Math.max(0, position - REPAIR_CONTEXT_SIZE), position);
    const rightContext = repairedContent.substring(
      runEnd,
      Math.min(repairedContent.length, runEnd + REPAIR_CONTEXT_SIZE)
    );

    try {
      const result = await generateText({
        model: getModel('fallback'),
        system: REPAIR_SYSTEM_PROMPT,
        prompt: `
The corrupted text has ${unknownCount} ${UNKNOWN_CHAR} mush in a row.
On the left of the stains: "${leftContext}"
On the right of the stains: "${rightContext}"
So what was the original text between these two contexts?`,
      });
      trackers.tokenTracker.trackUsage('md-fixer', result.usage)
      const replacement = result.text.trim();

      const isInvalid =
        replacement === "UNKNOWN" ||
        replacement.includes(UNKNOWN_CHAR) ||
        replacement.length > unknownCount * MAX_REPLACEMENT_RATIO;

      if (isInvalid) {
        // Content stays as is; the stuck-position branch above removes the run on later passes.
        console.log(`Skipping invalid replacement ${replacement} at position ${position}`);
      } else {
        repairedContent = repairedContent.substring(0, position) +
          replacement +
          repairedContent.substring(runEnd);
        // Logged only on success so the message never claims a replacement that was rejected.
        console.log(`Repair iteration ${iteration}: replaced ${unknownCount} ${UNKNOWN_CHAR} chars with "${replacement}"`);
      }
    } catch (error) {
      // Best effort: a failed model call must not abort the whole repair.
      // The stuck-position branch above removes this run on later passes.
      console.error("Error repairing unknown characters:", error);
    }
  }

  return repairedContent;
}

View File

@@ -49,7 +49,7 @@ export async function fixMarkdown(
trackers?.actionTracker.trackThink('final_answer', schema.languageCode) trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
const result = await generateText({ const result = await generateText({
model: getModel('evaluator'), model: getModel('fallback'),
system: prompt.system, system: prompt.system,
prompt: prompt.user, prompt: prompt.user,
}); });