node-DeepResearch/src/utils/text-tools.ts

import {AnswerAction, KnowledgeItem, Reference} from "../types";
import i18nJSON from './i18n.json';
import {JSDOM} from 'jsdom';


export function buildMdFromAnswer(answer: AnswerAction) {
  return repairMarkdownFootnotes(answer.answer, answer.references);
}

export function repairMarkdownFootnotes(
  markdownString: string,
  references?: Array<Reference>
): string {
  // Standard footnote regex - handles [^1], [1^], and [1] formats
  const footnoteRegex = /\[(\^(\d+)|(\d+)\^|(\d+))]/g;

  // Regex to catch grouped footnotes like [^1, ^2, ^3] or [^1,^2,^3]
  const groupedFootnoteRegex = /\[\^(\d+)(?:,\s*\^(\d+))+]/g;

  // New regex to catch partially marked footnotes like [^10, 11]
  const partialGroupedFootnoteRegex = /\[\^(\d+)(?:,\s*(\d+))+]/g;

  // Helper function to format references
  const formatReferences = (refs: Array<Reference>) => {
    return refs.map((ref, i) => {
      const cleanQuote = ref.exactQuote
        .replace(/[^\p{L}\p{N}\s]/gu, ' ')
        .replace(/\s+/g, ' ').trim();

      const citation = `[^${i + 1}]: ${cleanQuote}`;

      if (!ref.url) return citation;

      const domainName = new URL(ref.url).hostname.replace('www.', '');
      return `${citation} [${ref.title || domainName}](${ref.url})`;
    }).join('\n\n');
  };

  // First case: no references - remove any footnote citations
  if (!references?.length) {
    return markdownString
      .replace(partialGroupedFootnoteRegex, (match) => {
        // Extract all numbers from the partially marked grouped footnote
        const numbers = match.match(/\d+/g) || [];
        return numbers.map(num => `[^${num}]`).join(', ');
      })
      .replace(groupedFootnoteRegex, (match) => {
        // Extract all numbers from the grouped footnote
        const numbers = match.match(/\d+/g) || [];
        return numbers.map(num => `[^${num}]`).join(', ');
      })
      .replace(footnoteRegex, '');
  }

  // Normalize footnotes first (convert [1^] to [^1] format and [1] to [^1] format)
  let processedMarkdown = markdownString
    .replace(/\[(\d+)\^]/g, (_, num) => `[^${num}]`)
    .replace(/\[(\d+)]/g, (_, num) => `[^${num}]`);

  // Fix grouped footnotes - both fully marked and partially marked types
  processedMarkdown = processedMarkdown
    .replace(groupedFootnoteRegex, (match) => {
      const numbers = match.match(/\d+/g) || [];
      return numbers.map(num => `[^${num}]`).join(', ');
    })
    .replace(partialGroupedFootnoteRegex, (match) => {
      const numbers = match.match(/\d+/g) || [];
      return numbers.map(num => `[^${num}]`).join(', ');
    });

  // Now extract all footnotes from the processed answer
  const footnotes: string[] = [];
  let match;
  const standardFootnoteRegex = /\[\^(\d+)]/g; // Use standard format after normalization
  while ((match = standardFootnoteRegex.exec(processedMarkdown)) !== null) {
    footnotes.push(match[1]);
  }

  // Remove footnote markers that don't have corresponding references
  let cleanedMarkdown = processedMarkdown;
  footnotes.forEach(footnote => {
    const footnoteNumber = parseInt(footnote);
    if (footnoteNumber > references.length) {
      const footnoteRegexExact = new RegExp(`\\[\\^${footnoteNumber}\\]`, 'g');
      cleanedMarkdown = cleanedMarkdown.replace(footnoteRegexExact, '');
    }
  });

  // Get valid footnotes after cleaning
  const validFootnotes: string[] = [];
  while ((match = standardFootnoteRegex.exec(cleanedMarkdown)) !== null) {
    validFootnotes.push(match[1]);
  }

  // No footnotes in answer but we have references - append them at the end
  if (validFootnotes.length === 0) {
    const appendedCitations = Array.from(
      {length: references.length},
      (_, i) => `[^${i + 1}]`
    ).join('');

    const formattedReferences = formatReferences(references);

    return `
${cleanedMarkdown}

⁜${appendedCitations}

${formattedReferences}
`.trim();
  }

  // Check if correction is needed
  const needsCorrection =
    (validFootnotes.length === references.length && validFootnotes.every(n => n === validFootnotes[0])) ||
    (validFootnotes.every(n => n === validFootnotes[0]) && parseInt(validFootnotes[0]) > references.length) ||
    (validFootnotes.length > 0 && validFootnotes.every(n => parseInt(n) > references.length));

  // New case: we have more references than footnotes
  if (references.length > validFootnotes.length && !needsCorrection) {
    // Get the used indices
    const usedIndices = new Set(validFootnotes.map(n => parseInt(n)));

    // Create citations for unused references
    const unusedReferences = Array.from(
      {length: references.length},
      (_, i) => !usedIndices.has(i + 1) ? `[^${i + 1}]` : ''
    ).join('');

    return `
${cleanedMarkdown}

⁜${unusedReferences}

${formatReferences(references)}
`.trim();
  }

  if (!needsCorrection) {
    return `
${cleanedMarkdown}

${formatReferences(references)}
`.trim();
  }

  // Apply correction: sequentially number the footnotes
  let currentIndex = 0;
  const correctedMarkdown = cleanedMarkdown.replace(standardFootnoteRegex, () =>
    `[^${++currentIndex}]`
  );

  return `
${correctedMarkdown}

${formatReferences(references)}
`.trim();
}

/**
 * A variant of the function that only takes a markdown string
 * It extracts existing footnote definitions and uses them as references
 */
export function repairMarkdownFootnotesOuter(markdownString: string): string {
  // First trim the string to handle any extra whitespace
  markdownString = markdownString.trim();

  // Unwrap ALL code fences throughout the document
  // This matches any content between ```markdown or ```html and closing ```
  const codeBlockRegex = /```(markdown|html)\n([\s\S]*?)\n```/g;
  let match;
  let processedString = markdownString;

  while ((match = codeBlockRegex.exec(markdownString)) !== null) {
    const entireMatch = match[0];
    const codeContent = match[2];
    processedString = processedString.replace(entireMatch, codeContent);
  }

  markdownString = processedString;

  // Extract existing footnote definitions
  const footnoteDefRegex = /\[\^(\d+)]:\s*(.*?)(?=\n\[\^|$)/gs;
  const references: Array<Reference> = [];

  // Extract content part (without footnote definitions)
  let contentPart = markdownString;
  let footnotesPart = '';

  // Try to find where footnote definitions start
  const firstFootnoteMatch = markdownString.match(/\[\^(\d+)]:/);
  if (firstFootnoteMatch) {
    const footnoteStartIndex = firstFootnoteMatch.index;
    if (footnoteStartIndex !== undefined) {
      contentPart = markdownString.substring(0, footnoteStartIndex);
      footnotesPart = markdownString.substring(footnoteStartIndex);
    }
  }

  // Extract all footnote definitions
  let footnoteMatch;
  while ((footnoteMatch = footnoteDefRegex.exec(footnotesPart)) !== null) {
    // The footnote content
    let content = footnoteMatch[2].trim();

    // Extract URL and title if present
    // Looking for [domain.com](url) pattern at the end of the content
    const urlMatch = content.match(/\s*\[([^\]]+)]\(([^)]+)\)\s*$/);

    let url = '';
    let title = '';

    if (urlMatch) {
      // Extract the domain name as title
      title = urlMatch[1];
      // Extract the URL
      url = urlMatch[2];

      // Remove the URL part from the content to get clean exactQuote
      content = content.replace(urlMatch[0], '').trim();
    }

    // Add to references array
    references.push({
      exactQuote: content,
      url,
      title
    });
  }

  // Only process if we found valid references
  if (references.length > 0) {
    return repairMarkdownFootnotes(contentPart, references);
  }

  // Otherwise, return original markdown unchanged
  return markdownString;
}

export const removeExtraLineBreaks = (text: string) => {
  return text.replace(/\n{2,}/gm, '\n\n');
}

export function chooseK(a: string[], k: number) {
  // randomly sample k from `a` without repitition
  return a.sort(() => 0.5 - Math.random()).slice(0, k);
}

export function removeHTMLtags(text: string) {
  return text.replace(/<[^>]*>?/gm, '');
}

export function removeAllLineBreaks(text: string) {
  return text.replace(/(\r\n|\n|\r)/gm, " ");
}

export function getI18nText(key: string, lang = 'en', params: Record<string, string> = {}) {
  // 获取i18n数据
  const i18nData = i18nJSON as Record<string, any>;
  // 确保语言代码存在，如果不存在则使用英语作为后备
  if (!i18nData[lang]) {
    console.error(`Language '${lang}' not found, falling back to English.`);
    lang = 'en';
  }

  // 获取对应语言的文本
  let text = i18nData[lang][key];

  // 如果文本不存在，则使用英语作为后备
  if (!text) {
    console.error(`Key '${key}' not found for language '${lang}', falling back to English.`);
    text = i18nData['en'][key];

    // 如果英语版本也不存在，则返回键名
    if (!text) {
      console.error(`Key '${key}' not found for English either.`);
      return key;
    }
  }

  // 替换模板中的变量
  if (params) {
    Object.keys(params).forEach(paramKey => {
      text = text.replace(`\${${paramKey}}`, params[paramKey]);
    });
  }

  return text;
}

export function smartMergeStrings(str1: string, str2: string): string {
  // If either string is empty, return the other
  if (!str1) return str2;
  if (!str2) return str1;

  // Check if one string is entirely contained within the other
  if (str1.includes(str2)) return str1;
  if (str2.includes(str1)) return str2;

  // Find the maximum possible overlap length
  const maxOverlap = Math.min(str1.length, str2.length);
  let bestOverlapLength = 0;

  // Check for overlaps starting from the largest possible
  for (let overlapLength = maxOverlap; overlapLength > 0; overlapLength--) {
    // Get the end of first string with the current overlap length
    const endOfStr1 = str1.slice(str1.length - overlapLength);
    // Get the beginning of second string with the current overlap length
    const startOfStr2 = str2.slice(0, overlapLength);

    // If they match, we've found our overlap
    if (endOfStr1 === startOfStr2) {
      bestOverlapLength = overlapLength;
      break;
    }
  }

  // Merge the strings using the best overlap
  if (bestOverlapLength > 0) {
    return str1.slice(0, str1.length - bestOverlapLength) + str2;
  } else {
    // No overlap found, concatenate normally
    return str1 + str2;
  }
}


export function fixCodeBlockIndentation(markdownText: string): string {
  // Track the state of code blocks and their indentation
  const lines = markdownText.split('\n');
  const result: string[] = [];

  // Track open code blocks and their indentation
  const codeBlockStack: { indent: string; language: string; listIndent: string }[] = [];

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    // Check if the line potentially contains a code fence marker
    if (line.trimStart().startsWith('```')) {
      const indent = line.substring(0, line.indexOf('```'));
      const restOfLine = line.trimStart().substring(3).trim();

      if (codeBlockStack.length === 0) {
        // This is an opening code fence

        // Determine if we're in a list context by looking at previous lines
        let listIndent = "";
        if (i > 0) {
          // Look back up to 3 lines to find list markers
          for (let j = i - 1; j >= Math.max(0, i - 3); j--) {
            const prevLine = lines[j];
            // Check for list markers like *, -, 1., etc.
            if (/^\s*(?:[*\-+]|\d+\.)\s/.test(prevLine)) {
              // Extract the list's base indentation
              const match = prevLine.match(/^(\s*)/);
              if (match) {
                listIndent = match[1];
                break;
              }
            }
          }
        }

        codeBlockStack.push({indent, language: restOfLine, listIndent});
        result.push(line);
      } else {
        // This is a closing code fence
        const openingBlock = codeBlockStack.pop();

        if (openingBlock) {
          // Replace the indentation with the one from the opening fence
          result.push(`${openingBlock.indent}\`\`\``);
        } else {
          // Something went wrong, just keep the line as is
          result.push(line);
        }
      }
    } else if (codeBlockStack.length > 0) {
      // Inside a code block - handle indentation
      const openingBlock = codeBlockStack[codeBlockStack.length - 1];

      if (line.trim().length > 0) {
        // Calculate proper base indentation for the code block
        let baseIndent;
        if (openingBlock.listIndent) {
          // For code blocks in lists
          baseIndent = openingBlock.listIndent + "    ";
        } else {
          // Not in a list
          baseIndent = openingBlock.indent;
        }

        // Get the indentation of this specific line
        const lineIndentMatch = line.match(/^(\s*)/);
        const lineIndent = lineIndentMatch ? lineIndentMatch[0] : '';

        // Find the common prefix between the line's indent and the opening block's indent
        // This represents the part of the indentation that's due to the markdown structure
        let commonPrefix = '';
        const minLength = Math.min(lineIndent.length, openingBlock.indent.length);
        for (let i = 0; i < minLength; i++) {
          if (lineIndent[i] === openingBlock.indent[i]) {
            commonPrefix += lineIndent[i];
          } else {
            break;
          }
        }

        // Remove just the common prefix (markdown structure indentation)
        // and keep the rest (code's own indentation)
        const contentAfterCommonIndent = line.substring(commonPrefix.length);

        // Add the proper base indentation plus the preserved code indentation
        result.push(`${baseIndent}${contentAfterCommonIndent}`);
      } else {
        // For empty lines, just keep them as is
        result.push(line);
      }
    } else {
      // Not in a code block, just add it as is
      result.push(line);
    }
  }

  return result.join('\n');
}

export function getKnowledgeStr(allKnowledge: KnowledgeItem[]) {
  return allKnowledge.map((k, idx) => {
    const aMsg = `
<knowledge-${idx + 1}>
${k.question}

${k.updated && (k.type === 'url' || k.type === 'side-info') ? `
<knowledge-datetime>
${k.updated}
</knowledge-datetime>
` : ''}

${k.references && k.type === 'url' ? `
<knowledge-url>
${k.references[0]}
</knowledge-url>
` : ''}


${k.answer}
</knowledge-${idx + 1}>
      `.trim();

    return removeExtraLineBreaks(aMsg);
  })
}


/**
 * Converts HTML tables in a markdown string to markdown tables
 * @param mdString The markdown string containing potential HTML tables
 * @returns The markdown string with HTML tables converted to markdown tables, or the original string if no conversions were made
 */
export function convertHtmlTablesToMd(mdString: string): string {
  try {
    let result = mdString;

    // First check for HTML tables
    if (mdString.includes('<table>')) {
      // Regular expression to find HTML tables
      const tableRegex = /<table>([\s\S]*?)<\/table>/g;
      let match;

      // Process each table found
      while ((match = tableRegex.exec(mdString)) !== null) {
        const htmlTable = match[0];
        const convertedTable = convertSingleHtmlTableToMd(htmlTable);

        if (convertedTable) {
          result = result.replace(htmlTable, convertedTable);
        }
      }
    }

    return result;
  } catch (error) {
    console.error('Error converting HTML tables to Markdown:', error);
    return mdString; // Return original string if conversion fails
  }
}

/**
 * Converts a single HTML table to a markdown table
 * @param htmlTable The HTML table string
 * @returns The markdown table string or null if conversion fails
 */
function convertSingleHtmlTableToMd(htmlTable: string): string | null {
  try {
    // Create a DOM parser to parse the HTML
    const parser = new DOMParser();
    const doc = parser.parseFromString(htmlTable, 'text/html');
    const table = doc.querySelector('table');

    if (!table) {
      return null;
    }

    // Extract headers
    const headers = Array.from(table.querySelectorAll('thead th'))
      .map(th => sanitizeCell(th.textContent || ''));

    // Check if headers were found
    if (headers.length === 0) {
      // Try to find headers in the first row of tbody
      const firstRow = table.querySelector('tbody tr');
      if (firstRow) {
        headers.push(...Array.from(firstRow.querySelectorAll('td, th'))
          .map(cell => sanitizeCell(cell.textContent || '')));
      }
    }

    if (headers.length === 0) {
      return null; // No headers found, can't create a valid markdown table
    }

    // Start building the markdown table
    let mdTable = '';

    // Add the header row
    mdTable += '| ' + headers.join(' | ') + ' |\n';

    // Add the separator row
    mdTable += '| ' + headers.map(() => '---').join(' | ') + ' |\n';

    // Add the data rows
    const rows = Array.from(table.querySelectorAll('tbody tr'));

    for (const row of rows) {
      // Skip the first row if it was used for headers
      if (table.querySelector('thead') === null && row === rows[0]) {
        continue;
      }

      const cells = Array.from(row.querySelectorAll('td'))
        .map(td => {
          // Check for markdown content in the cell
          const cellContent = td.innerHTML;
          let processedContent = cellContent;

          // Detect if the cell contains markdown formatting
          const containsMarkdown =
            cellContent.includes('**') ||
            cellContent.includes('*   ') ||
            cellContent.includes('*  ') ||
            cellContent.includes('* ');

          if (containsMarkdown) {
            // Handle mixed HTML and Markdown

            // Handle lists inside cells (both ordered and unordered)
            if (cellContent.includes('* ') || cellContent.includes('*  ') || cellContent.includes('*   ')) {
              // Extract list items, handling both HTML list structures or markdown-style lists
              let listItems = [];

              if (td.querySelectorAll('li').length > 0) {
                // Handle HTML lists
                listItems = Array.from(td.querySelectorAll('li'))
                  .map(li => li.innerHTML.trim());
              } else {
                // Handle markdown-style lists with asterisks
                const lines = cellContent.split('\n');
                for (const line of lines) {
                  const trimmedLine = line.trim();
                  if (trimmedLine.match(/^\s*\*\s+/)) {
                    listItems.push(trimmedLine.replace(/^\s*\*\s+/, ''));
                  }
                }
              }

              // Format as bullet points with line breaks
              processedContent = listItems.map(item => `• ${item}`).join('<br>');

              // Preserve markdown formatting like bold and italic within list items
              processedContent = processedContent
                .replace(/\*\*(.*?)\*\*/g, '**$1**')  // Preserve bold
                .replace(/_(.*?)_/g, '_$1_');         // Preserve italic
            } else {
              // For cells without lists but with markdown, preserve the markdown formatting
              processedContent = cellContent
                .replace(/<\/?strong>/g, '**')  // Convert HTML bold to markdown
                .replace(/<\/?em>/g, '_')       // Convert HTML italic to markdown
                .replace(/<\/?b>/g, '**')       // Convert HTML bold to markdown
                .replace(/<\/?i>/g, '_')        // Convert HTML italic to markdown
                .replace(/<br\s*\/?>/g, '<br>') // Preserve line breaks as <br> tags
                .replace(/<p\s*\/?>/g, '')      // Remove opening paragraph tags
                .replace(/<\/p>/g, '<br>');     // Convert closing paragraph tags to line breaks
            }
          } else {
            // For regular HTML cells without markdown
            processedContent = processedContent
              .replace(/<\/?strong>/g, '**')  // Bold
              .replace(/<\/?em>/g, '_')       // Italic
              .replace(/<\/?b>/g, '**')       // Bold
              .replace(/<\/?i>/g, '_')        // Italic
              .replace(/<br\s*\/?>/g, '<br>') // Preserve line breaks as <br> tags
              .replace(/<p\s*\/?>/g, '')      // Opening paragraph tags
              .replace(/<\/p>/g, '<br>');     // Convert closing paragraph tags to line breaks
          }

          // Strip any remaining HTML tags, but preserve markdown syntax and <br> tags
          processedContent = processedContent
            .replace(/<(?!\/?br\b)[^>]*>/g, '') // Remove all HTML tags except <br>
            .trim();

          return sanitizeCell(processedContent);
        });

      // Ensure each row has the same number of cells as headers
      while (cells.length < headers.length) {
        cells.push('');
      }

      mdTable += '| ' + cells.join(' | ') + ' |\n';
    }

    return mdTable;
  } catch (error) {
    console.error('Error converting single HTML table:', error);
    return null;
  }
}

/**
 * Sanitizes a cell's content for use in a markdown table
 * @param content The cell content
 * @returns Sanitized content
 */
function sanitizeCell(content: string): string {
  // Trim whitespace
  let sanitized = content.trim();

  // Normalize pipe characters in content (escape them)
  sanitized = sanitized.replace(/\|/g, '\\|');

  // Preserve line breaks
  sanitized = sanitized.replace(/\n/g, '<br>');

  // Keep existing <br> tags intact (don't escape them)
  sanitized = sanitized.replace(/&lt;br&gt;/g, '<br>');

  // Preserve markdown formatting
  sanitized = sanitized
    .replace(/\\\*\\\*/g, '**')  // Fix escaped bold markers
    .replace(/\\\*/g, '*')       // Fix escaped list markers
    .replace(/\\_/g, '_');       // Fix escaped italic markers

  return sanitized;
}


if (typeof window === 'undefined') {
  global.DOMParser = class DOMParser {
    parseFromString(htmlString: string, mimeType: string) {
      const dom = new JSDOM(htmlString, {contentType: mimeType});
      return dom.window.document;
    }
  };
}