fix: markdown repair

2025-12-26 06:28:56 +08:00 · 2025-03-27 16:49:29 +08:00 · 2025-03-27 16:49:29 +08:00 · 2bee1b6dda
commit 2bee1b6dda
parent 7bd4f51f42
2 changed files with 159 additions and 23 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@ -36,7 +36,7 @@ import {
  buildMdFromAnswer,
  chooseK, convertHtmlTablesToMd, fixCodeBlockIndentation,
  removeExtraLineBreaks,
-  removeHTMLtags, repairMarkdownFootnotesOuter
+  removeHTMLtags, repairMarkdownFinal, repairMarkdownFootnotesOuter
 } from "./utils/text-tools";
 import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
 import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
@ -111,7 +111,7 @@ function getPrompt(
  knowledge?: KnowledgeItem[],
  allURLs?: BoostedSearchSnippet[],
  beastMode?: boolean,
-): { system: string, urlList?: string[]} {
+): { system: string, urlList?: string[] } {
  const sections: string[] = [];
  const actionSections: string[] = [];

@ -140,7 +140,7 @@ ${context.join('\n')}
  if (allowRead && urlList.length > 0) {
    const urlListStr = urlList
      .map((item, idx) => `  - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`)
-    .join('\n')
+      .join('\n')

    actionSections.push(`
 <action-visit>
@ -232,7 +232,7 @@ ${actionSections.join('\n\n')}
  return {
    system: removeExtraLineBreaks(sections.join('\n\n')),
    urlList: urlList.map(u => u.url)
-};
+  };
 }


@ -441,10 +441,10 @@ export async function getResponse(question?: string,
  messages.forEach(m => {
    let strMsg = '';
    if (typeof m.content === 'string') {
-      strMsg =  m.content.trim();
-    } else if (typeof  m.content === 'object' && Array.isArray( m.content)) {
+      strMsg = m.content.trim();
+    } else if (typeof m.content === 'object' && Array.isArray(m.content)) {
      // find the very last sub content whose 'type' is 'text'  and use 'text' as the question
-      strMsg =  m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
+      strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim();
    }

    extractUrlsWithDescription(strMsg).forEach(u => {
@ -506,7 +506,7 @@ export async function getResponse(question?: string,
    allowSearch = allowSearch && (weightedURLs.length < 200);  // disable search when too many urls already

    // generate prompt for this step
-    const { system, urlList} = getPrompt(
+    const {system, urlList} = getPrompt(
      diaryContext,
      allQuestions,
      allKeywords,
@ -917,7 +917,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
    // any answer is better than no answer, humanity last resort
    step++;
    totalStep++;
-    const { system } = getPrompt(
+    const {system} = getPrompt(
      diaryContext,
      allQuestions,
      allKeywords,
@ -952,18 +952,19 @@ But unfortunately, you failed to solve the issue. You need to think out of the b

  if (!trivialQuestion) {
    (thisStep as AnswerAction).mdAnswer =
-      convertHtmlTablesToMd(
-        fixBadURLMdLinks(
-          fixCodeBlockIndentation(
-            repairMarkdownFootnotesOuter(
-              await fixMarkdown(
-                buildMdFromAnswer((thisStep as AnswerAction)),
-                allKnowledge,
-                context,
-                SchemaGen
-              ))
-          ),
-          allURLs));
+      repairMarkdownFinal(
+        convertHtmlTablesToMd(
+          fixBadURLMdLinks(
+            fixCodeBlockIndentation(
+              repairMarkdownFootnotesOuter(
+                await fixMarkdown(
+                  buildMdFromAnswer((thisStep as AnswerAction)),
+                  allKnowledge,
+                  context,
+                  SchemaGen
+                ))
+            ),
+            allURLs)));
  } else {
    (thisStep as AnswerAction).mdAnswer =
      convertHtmlTablesToMd(
@ -1029,7 +1030,6 @@ ${JSON.stringify(zodToJsonSchema(schema), null, 2)}
  }
 }

-
 export async function main() {
  const question = process.argv[2] || "";
  const {
--- a/src/utils/text-tools.ts
+++ b/src/utils/text-tools.ts
@ -663,4 +663,140 @@ if (typeof window === 'undefined') {
      return dom.window.document;
    }
  };
-}
+}
+
+/**
+ * Backslash-escapes every regular-expression metacharacter in the input so
+ * the result can be embedded in a `RegExp` source and match the text literally.
+ *
+ * @param string - Text to escape
+ * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash
+ */
+function escapeRegExp(string: string): string {
+  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
+  // '$&' re-inserts the matched metacharacter right after the added backslash
+  const escaped = string.replace(metaCharacters, '\\$&');
+  return escaped;
+}
+
+/**
+ * Counts how many times `char` occurs in `text`.
+ *
+ * `char` is escaped before being compiled into a global regular expression,
+ * so it is matched as literal text and occurrences are counted without overlap.
+ *
+ * @param text - String to search
+ * @param char - Literal text to count
+ * @returns Number of matches, or 0 when there are none
+ */
+function countChar(text: string, char: string): number {
+  const finder = new RegExp(escapeRegExp(char), 'g');
+  const hits = text.match(finder);
+  // String.prototype.match yields null (not an empty array) when nothing matches
+  return hits === null ? 0 : hits.length;
+}
+
+/**
+ * Moves colons that sit at the END of an emphasised span to just after the
+ * closing marker, e.g. `**Title:**` or `**Title: **` becomes `**Title**:`.
+ *
+ * Only the trailing run of colons (ASCII `:` and full-width `：`, optionally
+ * separated or followed by whitespace) is moved. Colons in the middle of the
+ * span are part of the content (`**Time: 10:30**`, `**https://example.com**`)
+ * and are left untouched; spans without a trailing colon are returned as-is.
+ * Moved colons are emitted as all ASCII colons first, then all full-width ones.
+ *
+ * A span is whatever lies between `openMarker` and the nearest following
+ * `closeMarker` on the same line (lazy `.*?`, which does not cross `\n`).
+ * NOTE(review): the markers are matched purely textually, so an unrelated
+ * pair of `*` on one line (e.g. a `* ` list bullet and a later `*`) is also
+ * treated as a span - confirm whether callers need a stricter match.
+ *
+ * @param text - Markdown text to process
+ * @param openMarker - Literal opening marker, e.g. `**`
+ * @param closeMarker - Literal closing marker, e.g. `**`
+ * @returns The text with trailing colons moved outside every matched span
+ */
+function processFormattedText(text: string, openMarker: string, closeMarker: string): string {
+  const pattern = new RegExp(`${escapeRegExp(openMarker)}(.*?)${escapeRegExp(closeMarker)}`, 'g');
+  // One or more colons at the very end of the span, tolerating whitespace
+  // before, between and after them
+  const trailingColonRun = /(?:\s*[:：])+\s*$/;
+
+  return text.replace(pattern, (match: string, content: string) => {
+    const tail = trailingColonRun.exec(content);
+    if (tail === null) {
+      // Nothing to move: keep the span byte-for-byte
+      return match;
+    }
+
+    // Everything before the trailing colon run stays inside the markers
+    const keptContent = content.slice(0, tail.index).trim();
+
+    // Re-emit the moved colons outside the formatting
+    const standardColons = ':'.repeat(countChar(tail[0], ':'));
+    const wideColons = '：'.repeat(countChar(tail[0], '：'));
+
+    return `${openMarker}${keptContent}${closeMarker}${standardColons}${wideColons}`;
+  });
+}
+
+/**
+ * Repairs markdown by:
+ * 1. Removing literal `<hr>` and `<br>` tags that are not inside tables
+ * 2. Moving colons outside of bold and italic formatting
+ *
+ * Table regions are HTML `<table>...</table>` spans plus markdown tables. A
+ * markdown table opens on a line that (after trimming) starts with `|` and
+ * contains a second `|`; it closes at the next blank line, or at the end of
+ * the input when no blank line follows.
+ *
+ * @param markdown - The markdown string to repair
+ * @returns The repaired markdown, or the original if an error occurs
+ */
+export function repairMarkdownFinal(markdown: string): string {
+  try {
+    // Step 1: Handle <hr> and <br> tags outside tables
+
+    // Half-open [start, end) character ranges occupied by tables
+    const tableRegions: Array<[number, number]> = [];
+
+    // Find HTML tables
+    const htmlTableRegex = /<table[\s\S]*?<\/table>/g;
+    let htmlTableMatch: RegExpExecArray | null;
+    while ((htmlTableMatch = htmlTableRegex.exec(markdown)) !== null) {
+      tableRegions.push([htmlTableMatch.index, htmlTableMatch.index + htmlTableMatch[0].length]);
+    }
+
+    // Find markdown tables. Line offsets are accumulated while walking the
+    // lines instead of being looked up with indexOf: indexOf returns the FIRST
+    // place a line's text occurs anywhere in the document, which is the wrong
+    // offset whenever the same text appears earlier (repeated rows, or a short
+    // row that is a substring of a previous line).
+    let lineStart = 0;
+    let previousLineEnd = 0;
+    let markdownTableStart = -1; // -1 while not inside a markdown table
+
+    for (const rawLine of markdown.split('\n')) {
+      const line = rawLine.trim();
+      const lineEnd = lineStart + rawLine.length;
+
+      if (line.startsWith('|') && line.includes('|', 1)) {
+        if (markdownTableStart === -1) {
+          markdownTableStart = lineStart;
+        }
+      } else if (markdownTableStart !== -1 && line === '') {
+        // A blank line closes the table; it ends where the previous line ended
+        tableRegions.push([markdownTableStart, previousLineEnd]);
+        markdownTableStart = -1;
+      }
+
+      previousLineEnd = lineEnd;
+      lineStart = lineEnd + 1; // step over the '\n' consumed by split
+    }
+
+    // A table still open at the end of the input runs to the end of the text
+    if (markdownTableStart !== -1) {
+      tableRegions.push([markdownTableStart, markdown.length]);
+    }
+
+    // Check if an index is inside any table region
+    const isInTable = (index: number): boolean =>
+      tableRegions.some(([start, end]) => index >= start && index < end);
+
+    // Remove <hr> and <br> tags outside tables; tags inside a table are kept.
+    // With no capture groups, the replacer's second argument is the match offset.
+    let repairedMarkdown = markdown.replace(
+      /<hr>|<br>/g,
+      (tag: string, offset: number) => (isInTable(offset) ? tag : '')
+    );
+
+    // Step 2: Fix formatting with colons
+    // Process from most specific (longest) patterns to most general
+    const formattingPatterns: ReadonlyArray<readonly [string, string]> = [
+      ['****', '****'], // Four asterisks
+      ['****', '***'],  // Four opening, three closing
+      ['***', '****'],  // Three opening, four closing
+      ['***', '***'],   // Three asterisks
+      ['**', '**'],     // Two asterisks (bold)
+      ['*', '*']        // One asterisk (italic)
+    ];
+
+    for (const [open, close] of formattingPatterns) {
+      repairedMarkdown = processFormattedText(repairedMarkdown, open, close);
+    }
+
+    return repairedMarkdown;
+  } catch (error) {
+    // Best-effort repair: return the original markdown if any error occurs
+    return markdown;
+  }
+}
+