fix: markdown repair

This commit is contained in:
Han Xiao 2025-03-27 16:49:29 +08:00
parent 7bd4f51f42
commit 2bee1b6dda
2 changed files with 159 additions and 23 deletions

View File

@ -36,7 +36,7 @@ import {
buildMdFromAnswer,
chooseK, convertHtmlTablesToMd, fixCodeBlockIndentation,
removeExtraLineBreaks,
removeHTMLtags, repairMarkdownFootnotesOuter
removeHTMLtags, repairMarkdownFinal, repairMarkdownFootnotesOuter
} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
@ -111,7 +111,7 @@ function getPrompt(
knowledge?: KnowledgeItem[],
allURLs?: BoostedSearchSnippet[],
beastMode?: boolean,
): { system: string, urlList?: string[]} {
): { system: string, urlList?: string[] } {
const sections: string[] = [];
const actionSections: string[] = [];
@ -140,7 +140,7 @@ ${context.join('\n')}
if (allowRead && urlList.length > 0) {
const urlListStr = urlList
.map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
.join('\n')
.join('\n')
actionSections.push(`
<action-visit>
@ -232,7 +232,7 @@ ${actionSections.join('\n\n')}
return {
system: removeExtraLineBreaks(sections.join('\n\n')),
urlList: urlList.map(u => u.url)
};
};
}
@ -441,10 +441,10 @@ export async function getResponse(question?: string,
messages.forEach(m => {
let strMsg = '';
if (typeof m.content === 'string') {
strMsg = m.content.trim();
} else if (typeof m.content === 'object' && Array.isArray( m.content)) {
strMsg = m.content.trim();
} else if (typeof m.content === 'object' && Array.isArray(m.content)) {
// find the very last sub content whose 'type' is 'text' and use 'text' as the question
strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
}
extractUrlsWithDescription(strMsg).forEach(u => {
@ -506,7 +506,7 @@ export async function getResponse(question?: string,
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
// generate prompt for this step
const { system, urlList} = getPrompt(
const {system, urlList} = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@ -917,7 +917,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
// any answer is better than no answer, humanity last resort
step++;
totalStep++;
const { system } = getPrompt(
const {system} = getPrompt(
diaryContext,
allQuestions,
allKeywords,
@ -952,18 +952,19 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
if (!trivialQuestion) {
(thisStep as AnswerAction).mdAnswer =
convertHtmlTablesToMd(
fixBadURLMdLinks(
fixCodeBlockIndentation(
repairMarkdownFootnotesOuter(
await fixMarkdown(
buildMdFromAnswer((thisStep as AnswerAction)),
allKnowledge,
context,
SchemaGen
))
),
allURLs));
repairMarkdownFinal(
convertHtmlTablesToMd(
fixBadURLMdLinks(
fixCodeBlockIndentation(
repairMarkdownFootnotesOuter(
await fixMarkdown(
buildMdFromAnswer((thisStep as AnswerAction)),
allKnowledge,
context,
SchemaGen
))
),
allURLs)));
} else {
(thisStep as AnswerAction).mdAnswer =
convertHtmlTablesToMd(
@ -1029,7 +1030,6 @@ ${JSON.stringify(zodToJsonSchema(schema), null, 2)}
}
}
export async function main() {
const question = process.argv[2] || "";
const {

View File

@ -663,4 +663,140 @@ if (typeof window === 'undefined') {
return dom.window.document;
}
};
}
}
/**
 * Backslash-escapes every regular-expression metacharacter in the input so the
 * result can be embedded in a `RegExp` source and match the input literally.
 *
 * @param string - Raw text to neutralise.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(string: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  // Prefix each metacharacter with a single backslash.
  return string.replace(metaChars, (hit) => `\\${hit}`);
}
/**
 * Reports how many times `char` occurs in `text`.
 *
 * The needle is regex-escaped with `escapeRegExp` and matched globally, so
 * occurrences are counted left to right without overlapping.
 *
 * @param text - Text to scan.
 * @param char - Literal character (or substring) to look for.
 * @returns The number of matches, or 0 when there are none.
 */
function countChar(text: string, char: string): number {
  const needle = new RegExp(escapeRegExp(char), 'g');
  const hits = text.match(needle);
  // A global match yields null, not an empty array, when nothing is found.
  if (hits === null) {
    return 0;
  }
  return hits.length;
}
/**
 * Moves colons that sit inside a formatting span to just after its closing
 * marker, e.g. `**Label:**` becomes `**Label**:`.
 *
 * Both the ASCII colon (`:`) and the full-width colon (U+FF1A) are handled.
 * All colons are removed from the span content, the content is trimmed, and
 * the colons are re-emitted after the closing marker: ASCII colons first, then
 * full-width ones. Spans without any colon are returned untouched.
 *
 * @param text - Markdown text to process.
 * @param openMarker - Literal opening marker (e.g. `**`); regex-escaped internally.
 * @param closeMarker - Literal closing marker (e.g. `**`); regex-escaped internally.
 * @returns The text with colons hoisted out of every matching span.
 */
function processFormattedText(text: string, openMarker: string, closeMarker: string): string {
  // Written as an escape so the character survives editors and pipelines that
  // drop non-ASCII text; an empty string here would make every check below
  // trivially true.
  const fullWidthColon = '\uFF1A';

  // Lazy `.*?` keeps each span as short as possible; without the `s` flag the
  // dot does not match line breaks, so a span never crosses lines.
  const pattern = new RegExp(`${escapeRegExp(openMarker)}(.*?)${escapeRegExp(closeMarker)}`, 'g');

  return text.replace(pattern, (match: string, content: string): string => {
    // Count colons before removing them
    const standardColonCount = countChar(content, ':');
    const wideColonCount = countChar(content, fullWidthColon);

    // Nothing to move: leave the span exactly as written (no trimming).
    if (standardColonCount === 0 && wideColonCount === 0) {
      return match;
    }

    // Remove both colon variants and trim what is left of the content
    const trimmedContent = content.replace(/[:\uFF1A]/g, '').trim();

    // Add colons back outside the formatting
    const standardColons = ':'.repeat(standardColonCount);
    const wideColons = fullWidthColon.repeat(wideColonCount);
    return `${openMarker}${trimmedContent}${closeMarker}${standardColons}${wideColons}`;
  });
}
/**
 * Repairs markdown by:
 * 1. Removing `<hr>` and `<br>` tags that are not inside tables
 * 2. Moving colons outside of bold and italic formatting
 *
 * Only the exact lowercase forms `<hr>` and `<br>` are removed. A tag counts as
 * "inside a table" when its start offset falls within an HTML
 * `<table>...</table>` block or within a markdown table. A markdown table
 * starts at a line whose trimmed text begins with `|` and contains a second
 * `|`, and runs until the line before the next blank line (or the end of input).
 *
 * This is a best-effort pass: if anything throws, the input is returned as-is.
 *
 * @param markdown - The markdown string to repair
 * @returns The repaired markdown, or the original if an error occurs
 */
export function repairMarkdownFinal(markdown: string): string {
  try {
    // Step 1: Handle <hr> and <br> tags outside tables
    // First, identify table regions ([start, end) offsets) to exclude them
    const tableRegions: Array<[number, number]> = [];

    // Find HTML tables
    const htmlTableRegex = /<table[\s\S]*?<\/table>/g;
    let htmlTableMatch: RegExpExecArray | null;
    while ((htmlTableMatch = htmlTableRegex.exec(markdown)) !== null) {
      tableRegions.push([htmlTableMatch.index, htmlTableMatch.index + htmlTableMatch[0].length]);
    }

    // Find markdown tables. Offsets are tracked while walking the lines rather
    // than looked up with indexOf(), which returns the first occurrence of a
    // line's text and therefore mislocates repeated rows (e.g. a second table
    // with the same header or separator row).
    let inMarkdownTable = false;
    let markdownTableStart = 0;
    let lineStart = 0;        // offset of the current line's first character
    let previousLineEnd = 0;  // offset just past the previous line's last character
    for (const rawLine of markdown.split('\n')) {
      const line = rawLine.trim();
      if (line.startsWith('|') && line.includes('|', 1)) {
        if (!inMarkdownTable) {
          inMarkdownTable = true;
          markdownTableStart = lineStart;
        }
      } else if (inMarkdownTable && line === '') {
        // A blank line closes the table; the region ends with the previous line.
        inMarkdownTable = false;
        tableRegions.push([markdownTableStart, previousLineEnd]);
      }
      previousLineEnd = lineStart + rawLine.length;
      lineStart = previousLineEnd + 1; // skip the '\n' consumed by split()
    }
    if (inMarkdownTable) {
      // Table runs to the end of the document.
      tableRegions.push([markdownTableStart, markdown.length]);
    }

    // Check if an index is inside any table region
    const isInTable = (index: number): boolean =>
      tableRegions.some(([start, end]) => index >= start && index < end);

    // Remove <hr> and <br> tags outside tables; the replacer receives the
    // offset of each match in the original string.
    let repairedMarkdown = markdown.replace(
      /<(?:hr|br)>/g,
      (tag: string, offset: number): string => (isInTable(offset) ? tag : '')
    );

    // Step 2: Fix formatting with colons
    // Process from most specific (longest) patterns to most general
    // NOTE(review): the single-asterisk pattern can also pair a list-bullet '*'
    // with a later '*' on the same line — confirm this is acceptable for callers.
    const formattingPatterns: ReadonlyArray<readonly [string, string]> = [
      ['****', '****'], // Four asterisks
      ['****', '***'],  // Four opening, three closing
      ['***', '****'],  // Three opening, four closing
      ['***', '***'],   // Three asterisks
      ['**', '**'],     // Two asterisks (bold)
      ['*', '*'],       // One asterisk (italic)
    ];
    for (const [open, close] of formattingPatterns) {
      repairedMarkdown = processFormattedText(repairedMarkdown, open, close);
    }

    return repairedMarkdown;
  } catch {
    // Deliberate best-effort: return the original markdown if any error occurs
    return markdown;
  }
}