mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: markdown repair
This commit is contained in:
parent
7bd4f51f42
commit
2bee1b6dda
44
src/agent.ts
44
src/agent.ts
@ -36,7 +36,7 @@ import {
|
||||
buildMdFromAnswer,
|
||||
chooseK, convertHtmlTablesToMd, fixCodeBlockIndentation,
|
||||
removeExtraLineBreaks,
|
||||
removeHTMLtags, repairMarkdownFootnotesOuter
|
||||
removeHTMLtags, repairMarkdownFinal, repairMarkdownFootnotesOuter
|
||||
} from "./utils/text-tools";
|
||||
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
|
||||
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
|
||||
@ -111,7 +111,7 @@ function getPrompt(
|
||||
knowledge?: KnowledgeItem[],
|
||||
allURLs?: BoostedSearchSnippet[],
|
||||
beastMode?: boolean,
|
||||
): { system: string, urlList?: string[]} {
|
||||
): { system: string, urlList?: string[] } {
|
||||
const sections: string[] = [];
|
||||
const actionSections: string[] = [];
|
||||
|
||||
@ -140,7 +140,7 @@ ${context.join('\n')}
|
||||
if (allowRead && urlList.length > 0) {
|
||||
const urlListStr = urlList
|
||||
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
|
||||
.join('\n')
|
||||
.join('\n')
|
||||
|
||||
actionSections.push(`
|
||||
<action-visit>
|
||||
@ -232,7 +232,7 @@ ${actionSections.join('\n\n')}
|
||||
return {
|
||||
system: removeExtraLineBreaks(sections.join('\n\n')),
|
||||
urlList: urlList.map(u => u.url)
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -441,10 +441,10 @@ export async function getResponse(question?: string,
|
||||
messages.forEach(m => {
|
||||
let strMsg = '';
|
||||
if (typeof m.content === 'string') {
|
||||
strMsg = m.content.trim();
|
||||
} else if (typeof m.content === 'object' && Array.isArray( m.content)) {
|
||||
strMsg = m.content.trim();
|
||||
} else if (typeof m.content === 'object' && Array.isArray(m.content)) {
|
||||
// find the very last sub content whose 'type' is 'text' and use 'text' as the question
|
||||
strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
|
||||
strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
|
||||
}
|
||||
|
||||
extractUrlsWithDescription(strMsg).forEach(u => {
|
||||
@ -506,7 +506,7 @@ export async function getResponse(question?: string,
|
||||
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
|
||||
|
||||
// generate prompt for this step
|
||||
const { system, urlList} = getPrompt(
|
||||
const {system, urlList} = getPrompt(
|
||||
diaryContext,
|
||||
allQuestions,
|
||||
allKeywords,
|
||||
@ -917,7 +917,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
// any answer is better than no answer, humanity last resort
|
||||
step++;
|
||||
totalStep++;
|
||||
const { system } = getPrompt(
|
||||
const {system} = getPrompt(
|
||||
diaryContext,
|
||||
allQuestions,
|
||||
allKeywords,
|
||||
@ -952,18 +952,19 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
|
||||
if (!trivialQuestion) {
|
||||
(thisStep as AnswerAction).mdAnswer =
|
||||
convertHtmlTablesToMd(
|
||||
fixBadURLMdLinks(
|
||||
fixCodeBlockIndentation(
|
||||
repairMarkdownFootnotesOuter(
|
||||
await fixMarkdown(
|
||||
buildMdFromAnswer((thisStep as AnswerAction)),
|
||||
allKnowledge,
|
||||
context,
|
||||
SchemaGen
|
||||
))
|
||||
),
|
||||
allURLs));
|
||||
repairMarkdownFinal(
|
||||
convertHtmlTablesToMd(
|
||||
fixBadURLMdLinks(
|
||||
fixCodeBlockIndentation(
|
||||
repairMarkdownFootnotesOuter(
|
||||
await fixMarkdown(
|
||||
buildMdFromAnswer((thisStep as AnswerAction)),
|
||||
allKnowledge,
|
||||
context,
|
||||
SchemaGen
|
||||
))
|
||||
),
|
||||
allURLs)));
|
||||
} else {
|
||||
(thisStep as AnswerAction).mdAnswer =
|
||||
convertHtmlTablesToMd(
|
||||
@ -1029,7 +1030,6 @@ ${JSON.stringify(zodToJsonSchema(schema), null, 2)}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export async function main() {
|
||||
const question = process.argv[2] || "";
|
||||
const {
|
||||
|
||||
@ -663,4 +663,140 @@ if (typeof window === 'undefined') {
|
||||
return dom.window.document;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Backslash-escapes every regular-expression metacharacter in `string`, so
 * the result can be embedded in a `RegExp` source and match the input
 * literally.
 *
 * @param string - Text to be matched literally.
 * @returns The input with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(string: string): string {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return string.replace(metaCharacters, (ch) => '\\' + ch);
}
|
||||
|
||||
/**
 * Counts how many times `char` occurs in `text`.
 *
 * `char` is escaped and compiled into a global regular expression, so it is
 * always matched literally; matches are found left to right without overlap.
 *
 * @param text - The string to search.
 * @param char - The literal text to look for.
 * @returns The number of matches, or 0 when there are none.
 */
function countChar(text: string, char: string): number {
  const literalMatcher = new RegExp(escapeRegExp(char), 'g');
  const hits = text.match(literalMatcher);
  return hits === null ? 0 : hits.length;
}
|
||||
|
||||
/**
 * Moves a trailing run of colons out of emphasis spans, e.g. `**Title:**`
 * becomes `**Title**:`.
 *
 * Each non-overlapping `openMarker ... closeMarker` span that sits on a single
 * line is inspected. When the span's content ends (ignoring trailing
 * whitespace) with one or more ASCII colons (U+003A) or full-width colons
 * (U+FF1A), that run is removed from the content and re-emitted right after
 * the closing marker: ASCII colons first, then full-width ones. The remaining
 * content is trimmed.
 *
 * Colons in the interior of the span (times such as `10:30`, URLs, `key: value`
 * pairs) are left untouched, and a span that consists of nothing but colons is
 * returned unchanged so no empty emphasis pair is produced.
 *
 * @param text - The markdown text to process.
 * @param openMarker - Literal opening marker, e.g. `**`.
 * @param closeMarker - Literal closing marker, e.g. `**`.
 * @returns The text with trailing colons moved outside every matching span.
 */
function processFormattedText(text: string, openMarker: string, closeMarker: string): string {
  const pattern = new RegExp(`${escapeRegExp(openMarker)}(.*?)${escapeRegExp(closeMarker)}`, 'g');
  // Run of ASCII and/or full-width colons at the very end of the content.
  // The full-width colon is written as an escape so it cannot be confused
  // with (or silently normalised to) the ASCII one.
  const trailingColonRun = /[:\uFF1A]+$/;

  return text.replace(pattern, (match: string, content: string) => {
    const withoutTrailingSpace = content.trimEnd();
    const colonRun = trailingColonRun.exec(withoutTrailingSpace);
    if (colonRun === null) {
      return match;
    }

    const inner = withoutTrailingSpace.slice(0, colonRun.index).trim();
    if (inner === '') {
      // Nothing but colons inside the markers: leave the span as it is.
      return match;
    }

    // Count each colon kind separately so both survive the move.
    const standardColons = ':'.repeat(countChar(colonRun[0], ':'));
    const wideColons = '\uFF1A'.repeat(countChar(colonRun[0], '\uFF1A'));

    return `${openMarker}${inner}${closeMarker}${standardColons}${wideColons}`;
  });
}
|
||||
|
||||
/**
 * Final clean-up pass over generated markdown:
 *
 * 1. Removes literal `<hr>` and `<br>` tags that are not inside a table.
 *    Both HTML tables (`<table ... </table>`) and markdown pipe tables are
 *    protected. A markdown table starts at a line that, once trimmed, begins
 *    with `|` and contains a second `|`; it ends at the line before the next
 *    blank line, or at the end of the text.
 * 2. Applies `processFormattedText` for each asterisk emphasis marker pair,
 *    longest markers first, to move colons outside bold/italic formatting.
 *
 * @param markdown - The markdown string to repair
 * @returns The repaired markdown, or the original if an error occurs
 */
export function repairMarkdownFinal(markdown: string): string {
  try {
    // Step 1: Handle <hr> and <br> tags outside tables

    // Half-open [start, end) character ranges that must be left untouched.
    const tableRegions: Array<[number, number]> = [];

    // Find HTML tables
    const htmlTableRegex = /<table[\s\S]*?<\/table>/g;
    let htmlTableMatch: RegExpExecArray | null;
    while ((htmlTableMatch = htmlTableRegex.exec(markdown)) !== null) {
      tableRegions.push([htmlTableMatch.index, htmlTableMatch.index + htmlTableMatch[0].length]);
    }

    // Find markdown tables. Offsets are tracked while walking the lines
    // instead of being looked up with indexOf(), which would return the first
    // occurrence of a line's text and misplace the region whenever the same
    // text appears earlier in the document (e.g. repeated table rows).
    let lineStart = 0;
    let previousLineEnd = 0;
    let markdownTableStart = -1; // -1 while not inside a markdown table

    for (const rawLine of markdown.split('\n')) {
      const line = rawLine.trim();

      if (line.startsWith('|') && line.includes('|', 1)) {
        if (markdownTableStart === -1) {
          markdownTableStart = lineStart;
        }
      } else if (markdownTableStart !== -1 && line === '') {
        // A blank line closes the table at the end of the previous line.
        tableRegions.push([markdownTableStart, previousLineEnd]);
        markdownTableStart = -1;
      }

      previousLineEnd = lineStart + rawLine.length;
      lineStart = previousLineEnd + 1; // step over the '\n' separator
    }

    if (markdownTableStart !== -1) {
      // Table runs to the end of the document.
      tableRegions.push([markdownTableStart, markdown.length]);
    }

    // Check if an index is inside any table region
    const isInTable = (index: number): boolean =>
      tableRegions.some(([start, end]) => index >= start && index < end);

    // Remove <hr> and <br> tags outside tables; tags inside a table are kept.
    let repairedMarkdown = markdown.replace(
      /<(?:hr|br)>/g,
      (tag: string, position: number) => (isInTable(position) ? tag : '')
    );

    // Step 2: Fix formatting with colons
    // Process from most specific (longest) patterns to most general
    const formattingPatterns: ReadonlyArray<readonly [string, string]> = [
      ['****', '****'], // Four asterisks
      ['****', '***'],  // Four opening, three closing
      ['***', '****'],  // Three opening, four closing
      ['***', '***'],   // Three asterisks
      ['**', '**'],     // Two asterisks (bold)
      ['*', '*']        // One asterisk (italic)
    ];

    for (const [open, close] of formattingPatterns) {
      repairedMarkdown = processFormattedText(repairedMarkdown, open, close);
    }

    return repairedMarkdown;
  } catch (error) {
    // Best-effort repair: return the original markdown if any error occurs
    return markdown;
  }
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user