From 473a3d30600aa22a175cc741b2c57a7d876a8a95 Mon Sep 17 00:00:00 2001
From: Han Xiao <han.xiao@jina.ai>
Date: Tue, 18 Mar 2025 16:06:40 +0800
Subject: [PATCH] fix: broken md

---
 src/agent.ts            |  9 +++++-
 src/tools/evaluator.ts  | 27 ++---------------
 src/tools/md-fixer.ts   | 67 +++++++++++++++++++++++++++++++++++++++++
 src/utils/i18n.json     | 39 ++++++++++++++++--------
 src/utils/text-tools.ts | 35 ++++++++++++++++++---
 5 files changed, 134 insertions(+), 43 deletions(-)
 create mode 100644 src/tools/md-fixer.ts
diff --git a/src/agent.ts b/src/agent.ts
index ecf3c46..533ee45 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -41,6 +41,7 @@ import {
 } from "./utils/text-tools";
 import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
 import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
+import {fixMarkdown} from "./tools/md-fixer";
 
 async function sleep(ms: number) {
   const seconds = Math.ceil(ms / 1000);
@@ -906,7 +907,13 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
     context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
   }
 
-  (thisStep as AnswerAction).mdAnswer = fixCodeBlockIndentation(buildMdFromAnswer((thisStep as AnswerAction)));
+  (thisStep as AnswerAction).mdAnswer = fixCodeBlockIndentation(await fixMarkdown(
+      buildMdFromAnswer((thisStep as AnswerAction)),
+      allKnowledge,
+      context,
+      SchemaGen
+    )
+  );
   console.log(thisStep)
 
   await storeContext(system, schema, {
diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
index 0222b44..0fbcd2d 100644
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -2,36 +2,13 @@ import {GenerateObjectResult} from 'ai';
 import {AnswerAction, EvaluationResponse, EvaluationType, KnowledgeItem, PromptPair, TrackerContext} from '../types';
 import {ObjectGeneratorSafe} from "../utils/safe-generator";
 import {Schemas} from "../utils/schemas";
-import {removeExtraLineBreaks} from "../utils/text-tools";
+import {getKnowledgeStr} from "../utils/text-tools";
 
 const TOOL_NAME = 'evaluator';
 
 
 function getRejectAllAnswersPrompt(question: string, answer: AnswerAction, allKnowledge: KnowledgeItem[]): PromptPair {
-  const KnowledgeStr = allKnowledge.map((k, idx) => {
-    const aMsg = `
-<knowledge-${idx+1}>
-${k.question}
-
-${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
-<knowledge-datetime>
-${k.updated}
-</knowledge-datetime>
-` : ''}
-
-${k.references && k.type === 'url' ? `
-<knowledge-url>
-${k.references[0]}
-</knowledge-url>
-` : ''}
-
-
-${k.answer}
-</knowledge-${idx+1}>
-      `.trim();
-
-    return removeExtraLineBreaks(aMsg);
-  })
+  const KnowledgeStr = getKnowledgeStr(allKnowledge);
 
   return {
     system: `
diff --git a/src/tools/md-fixer.ts b/src/tools/md-fixer.ts
new file mode 100644
index 0000000..6556a69
--- /dev/null
+++ b/src/tools/md-fixer.ts
@@ -0,0 +1,67 @@
+import {KnowledgeItem, PromptPair, TrackerContext} from '../types';
+import {getKnowledgeStr} from "../utils/text-tools";
+import {getModel} from "../config";
+import {generateText} from "ai";
+import {Schemas} from "../utils/schemas";
+
+
+/** Builds the prompt pair: repair rules plus the rendered knowledge items as `system`, the raw markdown as `user`. */
+function getPrompt(mdContent: string, allKnowledge: KnowledgeItem[]): PromptPair {
+  const knowledgeStr = getKnowledgeStr(allKnowledge);
+
+  return {
+    system: `You are an expert Markdown Restoration Specialist.
+
+Your task is to repair the provided markdown content while preserving its original content.
+
+<rules>
+1. Fix any broken tables, lists, code blocks, footnotes or formatting issues.
+2. Make sure code blocks are properly closed and languages are correctly specified.
+3. Make sure nested lists are correctly indented, especially those code blocks in the nested structure.
+4. Leverage existing knowledge to fix the incomplete content.
+5. Leverage existing knowledge to add missing references, citations.
+6. Reduce the level of nested structure to make the content more readable.
+7. Pay attention to the original content's ending; if you find a very obvious incomplete/broken/interrupted ending, continue the content with a proper ending.
+8. Repair any �� or other broken characters in the content.
+</rules>
+
+The following knowledge items are provided for your reference. Note that some of them may not be directly related to the content user provided, but may give some subtle hints and insights:
+${knowledgeStr.join('\n\n')}
+
+Directly output the repaired markdown content. No explain, no summary, no analysis. Just the repaired content.
+`,
+    user: mdContent
+  }
+}
+
+const TOOL_NAME = 'md-fixer';
+
+/**
+ * Repairs broken markdown through an LLM call, passing `knowledgeItems` as reference hints.
+ * Returns the original `mdContent` when the call throws or the model returns only whitespace.
+ */
+export async function fixMarkdown(
+  mdContent: string,
+  knowledgeItems: KnowledgeItem[],
+  trackers: TrackerContext,
+  schema: Schemas
+): Promise<string> {
+  try {
+    const prompt = getPrompt(mdContent, knowledgeItems);
+    trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
+    const result = await generateText({
+      model: getModel('evaluator'),
+      system: prompt.system,
+      prompt: prompt.user,
+    });
+    // Same guard as above: a missing tracker must not throw here and discard the repaired text.
+    trackers?.tokenTracker.trackUsage(TOOL_NAME, result.usage)
+    console.log(TOOL_NAME, result.text);
+    console.log('repaired before/after', mdContent.length, result.text.length);
+    // An empty completion would wipe out the answer, so keep the original in that case.
+    return result.text.trim() ? result.text : mdContent;
+  } catch (error) {
+    console.error(`Error in ${TOOL_NAME}`, error);
+    return mdContent;
+  }
+}
\ No newline at end of file
diff --git a/src/utils/i18n.json b/src/utils/i18n.json
index af95a62..b307fed 100644
--- a/src/utils/i18n.json
+++ b/src/utils/i18n.json
@@ -4,77 +4,88 @@
     "search_for": "Let me search for ${keywords} to gather more information.",
     "read_for": "Let me read ${urls} to gather more information.",
     "read_for_verify": "Let me fetch the source content to verify the answer.",
-    "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts."
+    "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
+    "final_answer": "Let me finalize the answer."
   },
   "zh-CN": {
     "eval_first": "等等，让我先自己评估一下答案。",
     "search_for": "让我搜索${keywords}来获取更多信息。",
     "read_for": "让我读取网页 ${urls} 来获取更多信息。",
     "read_for_verify": "让我读取源网页内容来验证答案。",
-    "late_chunk": "网页 ${url} 内容太长，我正在筛选精华部分。"
+    "late_chunk": "网页 ${url} 内容太长，我正在筛选精华部分。",
+    "final_answer": "我来整理一下答案。"
   },
   "zh-TW": {
     "eval_first": "等等，讓我先評估一下答案。",
     "search_for": "讓我搜索${keywords}來獲取更多信息。",
     "read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
     "read_for_verify": "讓我獲取源內容來驗證答案。",
-    "late_chunk": "網頁 ${url} 內容太長，我正在挑選相關部分。"
+    "late_chunk": "網頁 ${url} 內容太長，我正在挑選相關部分。",
+    "final_answer": "我來整理一下答案。"
   },
   "ja": {
     "eval_first": "ちょっと待って、まず答えを評価します。",
     "search_for": "キーワード${keywords}で検索して、情報を集めます。",
     "read_for": "${urls} を読んで、情報を集めます。",
     "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
-    "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。"
+    "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
+    "final_answer": "答えをまとめます。"
   },
   "ko": {
     "eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
     "search_for": "키워드 ${keywords}로 검색하여 더 많은 정보를 수집하겠습니다.",
     "read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
     "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
-    "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다."
+    "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
+    "final_answer": "답변을 마무리하겠습니다."
   },
   "fr": {
     "eval_first": "Un instant, je vais d'abord évaluer la réponse.",
     "search_for": "Je vais rechercher ${keywords} pour obtenir plus d'informations.",
     "read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
     "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
-    "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes."
+    "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
+    "final_answer": "Je vais finaliser la réponse."
   },
   "de": {
     "eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
     "search_for": "Ich werde nach ${keywords} suchen, um weitere Informationen zu sammeln.",
     "read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
     "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
-    "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen."
+    "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
+    "final_answer": "Ich werde die Antwort abschließen."
   },
   "es": {
     "eval_first": "Un momento, voy a evaluar la respuesta primero.",
     "search_for": "Voy a buscar ${keywords} para recopilar más información.",
     "read_for": "Voy a leer ${urls} para recopilar más información.",
     "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
-    "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes."
+    "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
+    "final_answer": "Voy a finalizar la respuesta."
   },
   "it": {
     "eval_first": "Un attimo, valuterò prima la risposta.",
     "search_for": "Cercherò ${keywords} per raccogliere ulteriori informazioni.",
     "read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
     "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
-    "late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti."
+    "late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
+    "final_answer": "Finalizzerò la risposta."
   },
   "pt": {
     "eval_first": "Um momento, vou avaliar a resposta primeiro.",
     "search_for": "Vou pesquisar ${keywords} para reunir mais informações.",
     "read_for": "Vou ler ${urls} para reunir mais informações.",
     "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
-    "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes."
+    "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
+    "final_answer": "Vou finalizar a resposta."
   },
   "ru": {
     "eval_first": "Подождите, я сначала оценю ответ.",
     "search_for": "Дайте мне поискать ${keywords} для сбора дополнительной информации.",
     "read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
     "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
-    "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части."
+    "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
+    "final_answer": "Дайте мне завершить ответ."
   },
   "ar": {
     "eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
@@ -88,13 +99,15 @@
     "search_for": "Ik zal zoeken naar ${keywords} om meer informatie te verzamelen.",
     "read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
     "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
-    "late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren."
+    "late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
+    "final_answer": "Ik zal het antwoord afronden."
   },
   "zh": {
     "eval_first": "等等，让我先评估一下答案。",
     "search_for": "让我搜索${keywords}来获取更多信息。",
     "read_for": "让我阅读 ${urls} 来获取更多信息。",
     "read_for_verify": "让我获取源内容来验证答案。",
-    "late_chunk": "网页 ${url} 内容太长，我正在筛选精华部分。"
+    "late_chunk": "网页 ${url} 内容太长，我正在筛选精华部分。",
+    "final_answer": "我来整理一下答案。"
   }
 }
\ No newline at end of file
diff --git a/src/utils/text-tools.ts b/src/utils/text-tools.ts
index 6c75d7d..60250af 100644
--- a/src/utils/text-tools.ts
+++ b/src/utils/text-tools.ts
@@ -1,4 +1,4 @@
-import {AnswerAction} from "../types";
+import {AnswerAction, KnowledgeItem} from "../types";
 import i18nJSON from './i18n.json';
 
 export function buildMdFromAnswer(answer: AnswerAction) {
@@ -261,7 +261,7 @@ export function fixCodeBlockIndentation(markdownText: string): string {
           }
         }
 
-        codeBlockStack.push({ indent, language: restOfLine, listIndent });
+        codeBlockStack.push({indent, language: restOfLine, listIndent});
         result.push(line);
       } else {
         // This is a closing code fence
@@ -288,8 +288,8 @@ export function fixCodeBlockIndentation(markdownText: string): string {
           // For code blocks in lists, we need to preserve the list indentation plus the code fence indentation
           // The total indentation should be at least listIndent + some standard indentation (usually 4 spaces)
           const codeIndent = openingBlock.indent.length > openingBlock.listIndent.length ?
-                             openingBlock.indent :
-                             openingBlock.listIndent + "    ";
+            openingBlock.indent :
+            openingBlock.listIndent + "    ";
 
           result.push(`${codeIndent}${trimmedLine}`);
         } else {
@@ -309,3 +309,30 @@ export function fixCodeBlockIndentation(markdownText: string): string {
   return result.join('\n');
 }
 
+/** Renders every knowledge item as a numbered <knowledge-N> block: question, optional datetime and first reference URL, then the answer. */
+export function getKnowledgeStr(allKnowledge: KnowledgeItem[]) {
+  return allKnowledge.map((item, position) => {
+    const rendered = `
+<knowledge-${position + 1}>
+${item.question}
+
+${item.updated && (item.type === 'url' || item.type === 'side-info') ? `
+<knowledge-datetime>
+${item.updated}
+</knowledge-datetime>
+` : ''}
+
+${item.references && item.type === 'url' ? `
+<knowledge-url>
+${item.references[0]}
+</knowledge-url>
+` : ''}
+
+
+${item.answer}
+</knowledge-${position + 1}>
+      `.trim();
+    return removeExtraLineBreaks(rendered);
+  })
+}
+