mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
fix: broken cn chars
This commit is contained in:
@@ -41,6 +41,7 @@ import {
|
|||||||
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
|
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
|
||||||
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
|
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
|
||||||
import {fixMarkdown} from "./tools/md-fixer";
|
import {fixMarkdown} from "./tools/md-fixer";
|
||||||
|
import {repairUnknownChars} from "./tools/broken-ch-fixer";
|
||||||
|
|
||||||
async function sleep(ms: number) {
|
async function sleep(ms: number) {
|
||||||
const seconds = Math.ceil(ms / 1000);
|
const seconds = Math.ceil(ms / 1000);
|
||||||
@@ -139,7 +140,7 @@ ${context.join('\n')}
|
|||||||
const urlList = sortSelectURLs(allURLs || [], 20);
|
const urlList = sortSelectURLs(allURLs || [], 20);
|
||||||
if (allowRead && urlList.length > 0) {
|
if (allowRead && urlList.length > 0) {
|
||||||
const urlListStr = urlList
|
const urlListStr = urlList
|
||||||
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
|
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged.slice(0, 50)}"`)
|
||||||
.join('\n')
|
.join('\n')
|
||||||
|
|
||||||
actionSections.push(`
|
actionSections.push(`
|
||||||
@@ -957,12 +958,13 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
fixBadURLMdLinks(
|
fixBadURLMdLinks(
|
||||||
fixCodeBlockIndentation(
|
fixCodeBlockIndentation(
|
||||||
repairMarkdownFootnotesOuter(
|
repairMarkdownFootnotesOuter(
|
||||||
|
await repairUnknownChars(
|
||||||
await fixMarkdown(
|
await fixMarkdown(
|
||||||
buildMdFromAnswer((thisStep as AnswerAction)),
|
buildMdFromAnswer((thisStep as AnswerAction)),
|
||||||
allKnowledge,
|
allKnowledge,
|
||||||
context,
|
context,
|
||||||
SchemaGen
|
SchemaGen
|
||||||
))
|
), context))
|
||||||
),
|
),
|
||||||
allURLs)));
|
allURLs)));
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
98
src/tools/broken-ch-fixer.ts
Normal file
98
src/tools/broken-ch-fixer.ts
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
import { generateText } from "ai";
|
||||||
|
import { getModel } from "../config";
|
||||||
|
import {TrackerContext} from "../types";
|
||||||
|
|
||||||
|
/**
 * Repairs markdown that contains U+FFFD replacement characters by asking the
 * 'fallback' model to guess the text each corrupted run originally held.
 *
 * The content is processed one run of consecutive U+FFFD characters at a time,
 * always starting from the first run still present. For each run the model is
 * given up to 50 characters of context on either side. An accepted guess
 * replaces the run; a rejected guess or a failed model call removes the run,
 * so every iteration makes progress and the same run is never retried.
 *
 * @param mdContent - Markdown text that may contain U+FFFD characters.
 * @param trackers - Tracker context; the token usage of every model call is
 *   reported to `trackers.tokenTracker` under the 'md-fixer' label.
 * @returns The repaired markdown. The input is returned unchanged when it
 *   contains no U+FFFD. At most 20 runs are processed per call, so content
 *   with more runs than that may still contain U+FFFD characters.
 */
export async function repairUnknownChars(mdContent: string, trackers: TrackerContext): Promise<string> {
  // Written as an escape so the literal survives any re-encoding of this file.
  const UNKNOWN_CHAR = '\uFFFD';
  // Upper bound on processed runs (and therefore on model calls) per document.
  const MAX_ITERATIONS = 20;
  // Number of characters of surrounding text shown to the model on each side.
  const CONTEXT_SIZE = 50;

  if (!mdContent.includes(UNKNOWN_CHAR)) return mdContent;

  let repairedContent = mdContent;

  for (let iteration = 1; iteration <= MAX_ITERATIONS; iteration++) {
    // Always work on the first remaining run; earlier runs are already gone.
    const position = repairedContent.indexOf(UNKNOWN_CHAR);
    if (position === -1) break;

    // Measure the run of consecutive unknown characters starting at `position`.
    let unknownCount = 0;
    while (repairedContent[position + unknownCount] === UNKNOWN_CHAR) {
      unknownCount++;
    }
    const runEnd = position + unknownCount;

    // Extract context around the unknown characters.
    const leftContext = repairedContent.substring(Math.max(0, position - CONTEXT_SIZE), position);
    const rightContext = repairedContent.substring(
      runEnd,
      Math.min(repairedContent.length, runEnd + CONTEXT_SIZE)
    );

    // Stays empty when the guess is rejected or the call fails, which makes
    // the splice below delete the run instead of leaving it for a retry.
    let replacement = '';

    try {
      const result = await generateText({
        model: getModel('fallback'),
        system: `You're helping fix a corrupted scanned markdown document that has stains (represented by ${UNKNOWN_CHAR}).
Looking at the surrounding context, determine the original text should be in place of the ${UNKNOWN_CHAR} symbols.

Rules:
1. ONLY output the exact replacement text - no explanations, quotes, or additional text
2. Keep your response appropriate to the length of the unknown sequence
3. Consider the document appears to be in Chinese if that's what the context suggests`,
        prompt: `
The corrupted text has ${unknownCount} ${UNKNOWN_CHAR} mush in a row.

On the left of the stains: "${leftContext}"
On the right of the stains: "${rightContext}"

So what was the original text between these two contexts?`,
      });

      trackers.tokenTracker.trackUsage('md-fixer', result.usage);
      const candidate = result.text.trim();

      // Reject guesses that admit defeat, still contain the unknown marker,
      // or are implausibly long for the size of the corrupted run.
      const isInvalid =
        candidate === "UNKNOWN" ||
        candidate.includes(UNKNOWN_CHAR) ||
        candidate.length > unknownCount * 4;

      if (isInvalid) {
        console.log(`Skipping invalid replacement ${candidate} at position ${position}`);
      } else {
        replacement = candidate;
        // Logged only on success so the message never misreports a rejection.
        console.log(`Repair iteration ${iteration}: replaced ${unknownCount} ${UNKNOWN_CHAR} chars with "${replacement}"`);
      }
    } catch (error) {
      // Best effort: a failed model call must not abort the whole repair.
      console.error("Error repairing unknown characters:", error);
    }

    // Splice the guess (or nothing) in place of the whole run in one step.
    repairedContent =
      repairedContent.substring(0, position) +
      replacement +
      repairedContent.substring(runEnd);
  }

  return repairedContent;
}
|
||||||
@@ -49,7 +49,7 @@ export async function fixMarkdown(
|
|||||||
trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
|
trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
|
||||||
|
|
||||||
const result = await generateText({
|
const result = await generateText({
|
||||||
model: getModel('evaluator'),
|
model: getModel('fallback'),
|
||||||
system: prompt.system,
|
system: prompt.system,
|
||||||
prompt: prompt.user,
|
prompt: prompt.user,
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user