mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: broken cn chars
This commit is contained in:
parent
9a11d7e686
commit
23e917b0fa
@ -2,13 +2,28 @@ import { generateText } from "ai";
|
||||
import { getModel } from "../config";
|
||||
import {TrackerContext} from "../types";
|
||||
|
||||
/**
 * Detects broken Unicode in an in-memory string by round-tripping it through UTF-8.
 *
 * `TextEncoder` emits the bytes of U+FFFD (the replacement character) for any lone
 * surrogate in the input, and the non-fatal `TextDecoder` substitutes U+FFFD for any
 * invalid byte sequence, so after the round trip every unrepresentable code unit
 * shows up as U+FFFD. A U+FFFD that was already present in `str` survives the round
 * trip unchanged and is therefore reported as broken too.
 *
 * @param str - The string to inspect.
 * @returns `broken` is true when the round-tripped text contains U+FFFD;
 *          `decoded` is the round-tripped text itself.
 */
function detectBrokenUnicodeInMemory(str: string): { broken: boolean; decoded: string } {
  // Written as an escape sequence so the check cannot be corrupted if this
  // source file itself is ever saved or transferred with the wrong encoding.
  const replacementChar = '\uFFFD';

  // Use the browser or Node.js TextEncoder/TextDecoder APIs
  const encoder = new TextEncoder(); // Encodes to UTF-8
  const decoder = new TextDecoder('utf-8', { fatal: false }); // Replaces invalid sequences with U+FFFD

  // Round-trip the string through UTF-8 encoding
  const encoded = encoder.encode(str);
  const decoded = decoder.decode(encoded);

  // Now check for the replacement character
  return { broken: decoded.includes(replacementChar), decoded };
}
|
||||
|
||||
/**
|
||||
* Repairs markdown content with <EFBFBD> characters by using Gemini to guess the missing text
|
||||
*/
|
||||
export async function repairUnknownChars(mdContent: string, trackers?: TrackerContext): Promise<string> {
|
||||
if (!mdContent.includes('<27>')) return mdContent;
|
||||
const { broken, decoded } = detectBrokenUnicodeInMemory(mdContent);
|
||||
if (!broken) return mdContent;
|
||||
console.log("Detected broken unicode in output, attempting to repair...");
|
||||
|
||||
let repairedContent = mdContent;
|
||||
let repairedContent = decoded;
|
||||
let remainingUnknowns = true;
|
||||
let iterations = 0;
|
||||
|
||||
@ -74,7 +89,7 @@ So what was the original text between these two contexts?`,
|
||||
// Validate the replacement
|
||||
if (
|
||||
replacement === "UNKNOWN" ||
|
||||
replacement.includes('<27>') ||
|
||||
detectBrokenUnicodeInMemory(replacement).broken ||
|
||||
replacement.length > unknownCount * 4
|
||||
) {
|
||||
console.log(`Skipping invalid replacement ${replacement} at position ${position}`);
|
||||
|
||||
@ -15,11 +15,11 @@ function getPrompt(mdContent: string, allKnowledge: KnowledgeItem[]): PromptPair
|
||||
Your task is to repair the provided markdown content while preserving its original content.
|
||||
|
||||
<rules>
|
||||
1. Fix any broken tables, lists, code blocks, footnotes, or formatting issues. T
|
||||
1. Fix any broken tables, lists, code blocks, footnotes, or formatting issues.
|
||||
2. Make sure nested lists are correctly indented, especially code blocks within the nested structure.
|
||||
3. Tables must always in basic HTML table syntax with proper <table> <thead> <tr> <th> <td> without any CSS styling. STRICTLY AVOID any markdown table syntax. HTML Table should NEVER BE fenced with (\`\`\`html) triple backticks.
|
||||
3. Tables are good! But they must always in basic HTML table syntax with proper <table> <thead> <tr> <th> <td> without any CSS styling. STRICTLY AVOID any markdown table syntax. HTML Table should NEVER BE fenced with (\`\`\`html) triple backticks.
|
||||
4. Use available knowledge to restore incomplete content.
|
||||
5. Flatten deeply nested structure into natural language sections/paragraphs to make the content more readable.
|
||||
5. Avoid over-using bullet points by elaborate deeply nested structure into natural language sections/paragraphs to make the content more readable.
|
||||
6. In the footnote section, keep each footnote items format and repair misaligned and duplicated footnotes. Each footnote item must contain a URL at the end.
|
||||
7. In the actual content, to cite multiple footnotes in a row use [^1][^2][^3], never [^1,2,3] or [^1-3].
|
||||
8. Pay attention to the original content's ending (before the footnotes section). If you find a very obvious incomplete/broken/interrupted ending, continue the content with a proper ending.
|
||||
@ -30,7 +30,7 @@ Your task is to repair the provided markdown content while preserving its origin
|
||||
The following knowledge items are provided for your reference. Note that some of them may not be directly related to the content user provided, but may give some subtle hints and insights:
|
||||
${KnowledgeStr.join('\n\n')}
|
||||
|
||||
Directly output the repaired markdown content, preserving HTML tables if exist, never use tripple backticks html to wrap html table. No explain, no summary, no analysis. Just the repaired content.
|
||||
Directly output the repaired markdown content, preserving HTML tables if exist, never use tripple backticks html to wrap html table. No explain, no summary, no analysis. Just output the repaired content.
|
||||
`,
|
||||
user: mdContent
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user