mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
102 lines
3.7 KiB
TypeScript
102 lines
3.7 KiB
TypeScript
import { generateText } from "ai";
|
||
import { getModel } from "../config";
|
||
import {TrackerContext} from "../types";
|
||
import {detectBrokenUnicodeViaFileIO} from "../utils/text-tools";
|
||
|
||
|
||
/**
|
||
* Repairs markdown content with <20> characters by using Gemini to guess the missing text
|
||
*/
|
||
export async function repairUnknownChars(mdContent: string, trackers?: TrackerContext): Promise<string> {
|
||
const { broken, readStr } = await detectBrokenUnicodeViaFileIO(mdContent);
|
||
if (!broken) return readStr;
|
||
console.log("Detected broken unicode in output, attempting to repair...");
|
||
|
||
let repairedContent = readStr;
|
||
let remainingUnknowns = true;
|
||
let iterations = 0;
|
||
|
||
let lastPosition = -1;
|
||
|
||
while (remainingUnknowns && iterations < 20) {
|
||
iterations++;
|
||
|
||
// Find the position of the first <20> character
|
||
const position = repairedContent.indexOf('<27>');
|
||
if (position === -1) {
|
||
remainingUnknowns = false;
|
||
continue;
|
||
}
|
||
|
||
// Check if we're stuck at the same position
|
||
if (position === lastPosition) {
|
||
// Move past this character by removing it
|
||
repairedContent = repairedContent.substring(0, position) +
|
||
repairedContent.substring(position + 1);
|
||
continue;
|
||
}
|
||
|
||
// Update last position to detect loops
|
||
lastPosition = position;
|
||
|
||
// Count consecutive <20> characters
|
||
let unknownCount = 0;
|
||
for (let i = position; i < repairedContent.length && repairedContent[i] === '<27>'; i++) {
|
||
unknownCount++;
|
||
}
|
||
|
||
// Extract context around the unknown characters
|
||
const contextSize = 50;
|
||
const start = Math.max(0, position - contextSize);
|
||
const end = Math.min(repairedContent.length, position + unknownCount + contextSize);
|
||
const leftContext = repairedContent.substring(start, position);
|
||
const rightContext = repairedContent.substring(position + unknownCount, end);
|
||
|
||
// Ask Gemini to guess the missing characters
|
||
try {
|
||
const result = await generateText({
|
||
model: getModel('fallback'),
|
||
system: `You're helping fix a corrupted scanned markdown document that has stains (represented by <20>).
|
||
Looking at the surrounding context, determine the original text should be in place of the <20> symbols.
|
||
|
||
Rules:
|
||
1. ONLY output the exact replacement text - no explanations, quotes, or additional text
|
||
2. Keep your response appropriate to the length of the unknown sequence
|
||
3. Consider the document appears to be in Chinese if that's what the context suggests`,
|
||
prompt: `
|
||
The corrupted text has ${unknownCount} <20> mush in a row.
|
||
|
||
On the left of the stains: "${leftContext}"
|
||
On the right of the stains: "${rightContext}"
|
||
|
||
So what was the original text between these two contexts?`,
|
||
});
|
||
|
||
trackers?.tokenTracker.trackUsage('md-fixer', result.usage)
|
||
const replacement = result.text.trim();
|
||
|
||
// Validate the replacement
|
||
if (
|
||
replacement === "UNKNOWN" ||
|
||
(await detectBrokenUnicodeViaFileIO(replacement)).broken ||
|
||
replacement.length > unknownCount * 4
|
||
) {
|
||
console.log(`Skipping invalid replacement ${replacement} at position ${position}`);
|
||
// Skip to the next <20> character without modifying content
|
||
} else {
|
||
// Replace the unknown sequence with the generated text
|
||
repairedContent = repairedContent.substring(0, position) +
|
||
replacement +
|
||
repairedContent.substring(position + unknownCount);
|
||
}
|
||
|
||
console.log(`Repair iteration ${iterations}: replaced ${unknownCount} <20> chars with "${replacement}"`);
|
||
|
||
} catch (error) {
|
||
console.error("Error repairing unknown characters:", error);
|
||
// Skip to the next <20> character without modifying this one
|
||
}
|
||
}
|
||
|
||
return repairedContent;
|
||
} |