fix: broken cn chars

2026-03-22 07:29:35 +08:00 · 2025-03-27 18:36:16 +08:00
parent 2bee1b6dda
commit 320f66697b
3 changed files with 108 additions and 8 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -41,6 +41,7 @@ import {
 import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
 import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
 import {fixMarkdown} from "./tools/md-fixer";
+import {repairUnknownChars} from "./tools/broken-ch-fixer";

 async function sleep(ms: number) {
  const seconds = Math.ceil(ms / 1000);
@@ -139,7 +140,7 @@ ${context.join('\n')}
  const urlList = sortSelectURLs(allURLs || [], 20);
  if (allowRead && urlList.length > 0) {
    const urlListStr = urlList
-      .map((item, idx) => `  - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
+      .map((item, idx) => `  - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged.slice(0, 50)}"`)
      .join('\n')

    actionSections.push(`
@@ -957,12 +958,13 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
          fixBadURLMdLinks(
            fixCodeBlockIndentation(
              repairMarkdownFootnotesOuter(
-                await fixMarkdown(
-                  buildMdFromAnswer((thisStep as AnswerAction)),
-                  allKnowledge,
-                  context,
-                  SchemaGen
-                ))
+                await repairUnknownChars(
+                  await fixMarkdown(
+                    buildMdFromAnswer((thisStep as AnswerAction)),
+                    allKnowledge,
+                    context,
+                    SchemaGen
+                  ), context))
            ),
            allURLs)));
  } else {
--- a/src/tools/broken-ch-fixer.ts
+++ b/src/tools/broken-ch-fixer.ts
@@ -0,0 +1,98 @@
+import { generateText } from "ai";
+import { getModel } from "../config";
+import {TrackerContext} from "../types";
+
+/**
+ * Repairs markdown containing U+FFFD replacement characters by asking the
+ * 'fallback' model to guess the missing text from the surrounding context.
+ *
+ * Each pass handles the first remaining run of markers and at most 20 passes
+ * are made, so heavily damaged content may come back only partially repaired.
+ * A run whose repair was rejected or threw is deleted one character per pass.
+ *
+ * @param mdContent - Markdown text that may contain U+FFFD characters.
+ * @param trackers - Tracker context; each model call's token usage is recorded on it.
+ * @returns The repaired markdown, or mdContent unchanged when it has no markers.
+ */
+export async function repairUnknownChars(mdContent: string, trackers: TrackerContext): Promise<string> {
+  // Written as an escape so the marker survives re-encoding of this source file.
+  const unknownChar = '\uFFFD';
+  if (!mdContent.includes(unknownChar)) return mdContent;
+
+  let repairedContent = mdContent;
+  let lastPosition = -1;
+
+  for (let iterations = 1; iterations <= 20; iterations++) {
+    // Find the first remaining marker; stop once none are left.
+    const position = repairedContent.indexOf(unknownChar);
+    if (position === -1) break;
+
+    // The previous pass left this marker untouched (rejected or failed repair):
+    // delete one character so the loop always makes progress.
+    if (position === lastPosition) {
+      repairedContent = repairedContent.substring(0, position) +
+                         repairedContent.substring(position + 1);
+      continue;
+    }
+    lastPosition = position;
+
+    // Length of the run of consecutive markers starting at position.
+    let unknownCount = 0;
+    while (repairedContent[position + unknownCount] === unknownChar) {
+      unknownCount++;
+    }
+
+    // Up to contextSize characters on each side of the run.
+    const contextSize = 50;
+    const runEnd = position + unknownCount;
+    const leftContext = repairedContent.substring(Math.max(0, position - contextSize), position);
+    const rightContext = repairedContent.substring(runEnd, runEnd + contextSize);
+
+    // Ask the model to guess the missing characters
+    try {
+      const result = await generateText({
+        model: getModel('fallback'),
+        system: `You're helping fix a corrupted scanned markdown document that has stains (represented by ${unknownChar}). 
+Looking at the surrounding context, determine the original text should be in place of the ${unknownChar} symbols.
+
+Rules:
+1. ONLY output the exact replacement text - no explanations, quotes, or additional text
+2. Keep your response appropriate to the length of the unknown sequence
+3. Consider the document appears to be in Chinese if that's what the context suggests`,
+        prompt: `
+The corrupted text has ${unknownCount} ${unknownChar} mush in a row.
+
+On the left of the stains: "${leftContext}"
+On the right of the stains: "${rightContext}"
+
+So what was the original text between these two contexts?`,
+      });
+
+      trackers.tokenTracker.trackUsage('md-fixer', result.usage);
+      const replacement = result.text.trim();
+
+      // Reject a literal "UNKNOWN", answers still containing the marker, and overlong answers.
+      if (
+        replacement === "UNKNOWN" ||
+        replacement.includes(unknownChar) ||
+        replacement.length > unknownCount * 4
+      ) {
+        // Content is left as-is; the next pass sees the same position and
+        // deletes the marker instead of asking again.
+        console.log(`Skipping invalid replacement ${replacement} at position ${position}`);
+        continue;
+      }
+
+      // Splice the generated text over the run of markers.
+      repairedContent = repairedContent.substring(0, position) +
+                         replacement +
+                         repairedContent.substring(runEnd);
+      console.log(`Repair iteration ${iterations}: replaced ${unknownCount} ${unknownChar} chars with "${replacement}"`);
+    } catch (error) {
+      // Same fallback as a rejected replacement: the next pass deletes the marker.
+      console.error("Error repairing unknown characters:", error);
+    }
+  }
+
+  return repairedContent;
+}
--- a/src/tools/md-fixer.ts
+++ b/src/tools/md-fixer.ts
@@ -49,7 +49,7 @@ export async function fixMarkdown(
    trackers?.actionTracker.trackThink('final_answer', schema.languageCode)

    const result = await generateText({
-      model: getModel('evaluator'),
+      model: getModel('fallback'),
      system: prompt.system,
      prompt: prompt.user,
    });