mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
fix: md repair
This commit is contained in:
@@ -16,15 +16,15 @@ Your task is to repair the provided markdown content while preserving its origin
|
|||||||
|
|
||||||
<rules>
|
<rules>
|
||||||
1. Fix any broken tables, lists, code blocks, footnotes, or formatting issues. T
|
1. Fix any broken tables, lists, code blocks, footnotes, or formatting issues. T
|
||||||
2. Tables must always be in basic HTML table syntax with proper <table> <thead> <tr> <th> <td> without any CSS styling. STRICTLY AVOID any markdown table syntax. Table should NEVER BE fenced with (\`\`\`html) triple backticks.
|
|
||||||
2. Make sure nested lists are correctly indented, especially code blocks within the nested structure.
|
2. Make sure nested lists are correctly indented, especially code blocks within the nested structure.
|
||||||
3. Use available knowledge to restore incomplete content.
|
3. Tables must always be in basic HTML table syntax with proper <table> <thead> <tr> <th> <td> without any CSS styling. STRICTLY AVOID any markdown table syntax. HTML Table should NEVER BE fenced with (\`\`\`html) triple backticks.
|
||||||
4. Flatten deeply nested structure into natural language sections/paragraphs to make the content more readable.
|
4. Use available knowledge to restore incomplete content.
|
||||||
5. In the footnote section, keep each footnote item's format and repair misaligned and duplicated footnotes. Each footnote item must contain a URL at the end.
|
5. Flatten deeply nested structure into natural language sections/paragraphs to make the content more readable.
|
||||||
6. In the actual content, to cite multiple footnotes in a row use [^1][^2][^3], never [^1,2,3] or [^1-3].
|
6. In the footnote section, keep each footnote item's format and repair misaligned and duplicated footnotes. Each footnote item must contain a URL at the end.
|
||||||
7. Pay attention to the original content's ending (before the footnotes section). If you find a very obvious incomplete/broken/interrupted ending, continue the content with a proper ending.
|
7. In the actual content, to cite multiple footnotes in a row use [^1][^2][^3], never [^1,2,3] or [^1-3].
|
||||||
8. Repair any <20><> symbols or other broken characters in the original content by decoding them to the correct content.
|
8. Pay attention to the original content's ending (before the footnotes section). If you find a very obvious incomplete/broken/interrupted ending, continue the content with a proper ending.
|
||||||
9. Replace any obvious placeholders or Lorem Ipsum values such as "example.com" with the actual content derived from the knowledge.
|
9. Repair any <EFBFBD><EFBFBD> symbols or other broken characters in the original content by decoding them to the correct content.
|
||||||
|
10. Replace any obvious placeholders or Lorem Ipsum values such as "example.com" with the actual content derived from the knowledge.
|
||||||
</rules>
|
</rules>
|
||||||
|
|
||||||
The following knowledge items are provided for your reference. Note that some of them may not be directly related to the content user provided, but may give some subtle hints and insights:
|
The following knowledge items are provided for your reference. Note that some of them may not be directly related to the content user provided, but may give some subtle hints and insights:
|
||||||
|
|||||||
@@ -162,17 +162,23 @@ ${formatReferences(references)}
|
|||||||
* It extracts existing footnote definitions and uses them as references
|
* It extracts existing footnote definitions and uses them as references
|
||||||
*/
|
*/
|
||||||
export function repairMarkdownFootnotesOuter(markdownString: string): string {
|
export function repairMarkdownFootnotesOuter(markdownString: string): string {
|
||||||
// Remove outer code fence if it exists
|
|
||||||
// First trim the string to handle any extra whitespace
|
// First trim the string to handle any extra whitespace
|
||||||
markdownString = markdownString.trim();
|
markdownString = markdownString.trim();
|
||||||
|
|
||||||
// Check if the string starts with ```markdown or ```html and ends with ```
|
// Unwrap ALL code fences throughout the document
|
||||||
const codeBlockRegex = /^```(markdown|html)\n([\s\S]*)\n```$/;
|
// This matches any content between ```markdown or ```html and closing ```
|
||||||
const codeBlockMatch = markdownString.match(codeBlockRegex);
|
const codeBlockRegex = /```(markdown|html)\n([\s\S]*?)\n```/g;
|
||||||
if (codeBlockMatch) {
|
let match;
|
||||||
markdownString = codeBlockMatch[2];
|
let processedString = markdownString;
|
||||||
|
|
||||||
|
while ((match = codeBlockRegex.exec(markdownString)) !== null) {
|
||||||
|
const entireMatch = match[0];
|
||||||
|
const codeContent = match[2];
|
||||||
|
processedString = processedString.replace(entireMatch, codeContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
markdownString = processedString;
|
||||||
|
|
||||||
// Extract existing footnote definitions
|
// Extract existing footnote definitions
|
||||||
const footnoteDefRegex = /\[\^(\d+)]:\s*(.*?)(?=\n\[\^|$)/gs;
|
const footnoteDefRegex = /\[\^(\d+)]:\s*(.*?)(?=\n\[\^|$)/gs;
|
||||||
const references: Array<Reference> = [];
|
const references: Array<Reference> = [];
|
||||||
@@ -192,10 +198,10 @@ export function repairMarkdownFootnotesOuter(markdownString: string): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract all footnote definitions
|
// Extract all footnote definitions
|
||||||
let match;
|
let footnoteMatch;
|
||||||
while ((match = footnoteDefRegex.exec(footnotesPart)) !== null) {
|
while ((footnoteMatch = footnoteDefRegex.exec(footnotesPart)) !== null) {
|
||||||
// The footnote content
|
// The footnote content
|
||||||
let content = match[2].trim();
|
let content = footnoteMatch[2].trim();
|
||||||
|
|
||||||
// Extract URL and title if present
|
// Extract URL and title if present
|
||||||
// Looking for [domain.com](url) pattern at the end of the content
|
// Looking for [domain.com](url) pattern at the end of the content
|
||||||
@@ -474,21 +480,6 @@ export function convertHtmlTablesToMd(mdString: string): string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Then check for markdown tables inside code fences (including with the "html" language specifier)
|
|
||||||
const codeFenceRegex = /```(?:html)?\s*\n(\s*\|\s*[^|]+\s*\|[\s\S]*?)\n\s*```/g;
|
|
||||||
let codeFenceMatch;
|
|
||||||
|
|
||||||
while ((codeFenceMatch = codeFenceRegex.exec(mdString)) !== null) {
|
|
||||||
const entireMatch = codeFenceMatch[0];
|
|
||||||
const tableContent = codeFenceMatch[1];
|
|
||||||
|
|
||||||
// Check if this is actually a markdown table by looking for the separator row
|
|
||||||
if (tableContent.includes('\n| ---') || tableContent.includes('\n|---')) {
|
|
||||||
// It's already a markdown table, so just remove the code fence
|
|
||||||
result = result.replace(entireMatch, tableContent);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error converting HTML tables to Markdown:', error);
|
console.error('Error converting HTML tables to Markdown:', error);
|
||||||
|
|||||||
Reference in New Issue
Block a user