node-DeepResearch/src/utils/text-tools.ts
2025-03-21 15:00:32 +08:00

666 lines
22 KiB
TypeScript

import {AnswerAction, KnowledgeItem, Reference} from "../types";
import i18nJSON from './i18n.json';
import {JSDOM} from 'jsdom';
/**
 * Renders an answer action as markdown.
 *
 * Passes the action's answer text and its references to
 * `repairMarkdownFootnotes` and returns whatever that function produces.
 *
 * @param answer The answer action whose `answer` and `references` are used.
 * @returns The repaired markdown string.
 */
export function buildMdFromAnswer(answer: AnswerAction) {
  const {answer: markdownBody, references} = answer;
  return repairMarkdownFootnotes(markdownBody, references);
}
/**
 * Normalizes footnote markers in a markdown string and appends formatted
 * footnote definitions built from `references`.
 *
 * Behavior:
 * - Without references, every footnote marker (`[^1]`, `[1^]`, `[1]`, and
 *   grouped forms such as `[^1, ^2]` or `[^1, 2]`) is removed from the text.
 * - With references, markers are normalized to `[^n]`, grouped markers are
 *   split into individual ones, and markers numbered above
 *   `references.length` are dropped.
 * - If no marker survives, one marker per reference is appended.
 * - If there are exactly as many markers as references and all markers carry
 *   the same number, the markers are renumbered sequentially.
 * - Otherwise, markers for references that are never cited are appended.
 *
 * @param markdownString The markdown text to repair.
 * @param references Optional references; the i-th reference becomes `[^i+1]`.
 * @returns The repaired markdown followed by the footnote definitions, trimmed.
 */
export function repairMarkdownFootnotes(
  markdownString: string,
  references?: Array<Reference>
): string {
  // Single markers in any of the accepted spellings: [^1], [1^], [1].
  const footnoteRegex = /\[(\^(\d+)|(\d+)\^|(\d+))]/g;
  // Fully marked groups such as [^1, ^2, ^3].
  const groupedFootnoteRegex = /\[\^(\d+)(?:,\s*\^(\d+))+]/g;
  // Partially marked groups such as [^10, 11].
  const partialGroupedFootnoteRegex = /\[\^(\d+)(?:,\s*(\d+))+]/g;
  // The canonical form every marker has after normalization.
  const standardFootnoteRegex = /\[\^(\d+)]/g;

  // No references: strip every citation. Groups are removed as a whole so
  // that no ", " separators are left behind in the text.
  if (!references?.length) {
    return markdownString
      .replace(partialGroupedFootnoteRegex, '')
      .replace(groupedFootnoteRegex, '')
      .replace(footnoteRegex, '');
  }
  // Local binding so the non-empty narrowing also holds inside callbacks.
  const refs = references;

  // Label for the link: the title when present, else the host name without a
  // leading "www.". URLs that cannot be parsed (e.g. relative paths) fall
  // back to the raw URL instead of throwing.
  const linkLabel = (ref: Reference): string => {
    if (ref.title) return ref.title;
    try {
      return new URL(ref.url).hostname.replace(/^www\./, '');
    } catch {
      return ref.url;
    }
  };

  // Builds the "[^n]: quote [label](url)" definition list.
  const formatReferences = (items: Array<Reference>): string =>
    items.map((ref, i) => {
      const cleanQuote = (ref.exactQuote ?? '')
        .replace(/[^\p{L}\p{N}\s]/gu, ' ')
        .replace(/\s+/g, ' ')
        .trim();
      const citation = `[^${i + 1}]: ${cleanQuote}`;
      if (!ref.url) return citation;
      return `${citation} [${linkLabel(ref)}](${ref.url})`;
    }).join('\n\n');

  // Turns "[^1, ^2]" / "[^1, 2]" into "[^1], [^2]".
  const expandGroup = (group: string): string =>
    (group.match(/\d+/g) || []).map(num => `[^${num}]`).join(', ');

  const collectFootnoteNumbers = (text: string): string[] =>
    Array.from(text.matchAll(standardFootnoteRegex), found => found[1]);

  // Normalize [1^] and [1] to [^1], then split grouped markers.
  const normalizedMarkdown = markdownString
    .replace(/\[(\d+)\^]/g, '[^$1]')
    .replace(/\[(\d+)]/g, '[^$1]')
    .replace(groupedFootnoteRegex, expandGroup)
    .replace(partialGroupedFootnoteRegex, expandGroup);

  // Drop markers that point past the end of the reference list.
  const cleanedMarkdown = normalizedMarkdown.replace(
    standardFootnoteRegex,
    (marker: string, num: string) => (parseInt(num, 10) > refs.length ? '' : marker)
  );

  const validFootnotes = collectFootnoteNumbers(cleanedMarkdown);
  const formattedReferences = formatReferences(refs);

  // No markers left but we have references: cite all of them at the end.
  if (validFootnotes.length === 0) {
    const appendedCitations = refs.map((_, i) => `[^${i + 1}]`).join('');
    return [cleanedMarkdown, appendedCitations, formattedReferences].join('\n').trim();
  }

  // Correction is needed when every marker repeats the same number although
  // there is one marker per reference. (Markers numbered above
  // refs.length were already removed, so no other case can occur here.)
  const needsCorrection =
    validFootnotes.length === refs.length &&
    validFootnotes.every(n => n === validFootnotes[0]);

  if (needsCorrection) {
    let currentIndex = 0;
    const correctedMarkdown = cleanedMarkdown.replace(
      standardFootnoteRegex,
      () => `[^${++currentIndex}]`
    );
    return [correctedMarkdown, formattedReferences].join('\n').trim();
  }

  // Append a marker for every reference that is never cited. The check uses
  // the set of distinct cited numbers so repeated markers do not hide
  // uncited references.
  const usedIndices = new Set(validFootnotes.map(n => parseInt(n, 10)));
  const unusedReferences = refs
    .map((_, i) => (usedIndices.has(i + 1) ? '' : `[^${i + 1}]`))
    .join('');
  if (unusedReferences) {
    return [cleanedMarkdown, unusedReferences, formattedReferences].join('\n').trim();
  }

  return [cleanedMarkdown, formattedReferences].join('\n').trim();
}
/**
 * Variant of `repairMarkdownFootnotes` that takes only a markdown string.
 *
 * Steps:
 * 1. Trims the input and unwraps every ```markdown / ```html code fence,
 *    keeping the fenced content verbatim.
 * 2. Splits the text at the first footnote definition (`[^n]:`); everything
 *    before it is the content, everything from it on is parsed as
 *    definitions.
 * 3. Turns each definition into a reference (`exactQuote`, `url`, `title`),
 *    taking a trailing `[title](url)` link as the source when present.
 *    References are collected in the order the definitions appear.
 * 4. Delegates to `repairMarkdownFootnotes` when at least one reference was
 *    found; otherwise returns the unwrapped markdown unchanged.
 *
 * @param markdownString The markdown text, possibly containing footnote definitions.
 * @returns The repaired markdown string.
 */
export function repairMarkdownFootnotesOuter(markdownString: string): string {
  // A replacer function is used so that "$" sequences inside the fenced
  // content (e.g. "$$" or "$&") are copied literally instead of being
  // interpreted as replacement patterns.
  const unwrapped = markdownString
    .trim()
    .replace(
      /```(markdown|html)\n([\s\S]*?)\n```/g,
      (_fence: string, _lang: string, inner: string) => inner
    );

  // Locate where the footnote definitions start.
  const definitionsStart = unwrapped.search(/\[\^(\d+)]:/);
  if (definitionsStart === -1) {
    return unwrapped;
  }
  const contentPart = unwrapped.substring(0, definitionsStart);
  const footnotesPart = unwrapped.substring(definitionsStart);

  // Each definition runs until the next line starting with "[^" or the end.
  const footnoteDefRegex = /\[\^(\d+)]:\s*(.*?)(?=\n\[\^|$)/gs;
  const references: Array<Reference> = [];

  for (const definition of footnotesPart.matchAll(footnoteDefRegex)) {
    let content = definition[2].trim();
    let url = '';
    let title = '';

    // A trailing "[title](url)" link is the source of the quote.
    const urlMatch = content.match(/\s*\[([^\]]+)]\(([^)]+)\)\s*$/);
    if (urlMatch) {
      title = urlMatch[1];
      url = urlMatch[2];
      // Cut at the match position so only the trailing link is removed, even
      // if identical text occurs earlier in the quote.
      content = content.slice(0, urlMatch.index).trim();
    }

    references.push({
      exactQuote: content,
      url,
      title
    });
  }

  if (references.length > 0) {
    return repairMarkdownFootnotes(contentPart, references);
  }
  return unwrapped;
}
/**
 * Collapses every run of two or more consecutive newline characters into
 * exactly two, so at most one blank line separates any two lines.
 *
 * @param text The text to compact.
 * @returns The text with surplus line breaks removed.
 */
export const removeExtraLineBreaks = (text: string) => {
  const newlineRun = /\n{2,}/gm;
  const singleBlankLine = '\n\n';
  return text.replace(newlineRun, singleBlankLine);
}
/**
 * Randomly samples up to `k` elements from `a` without repetition.
 *
 * The input array is left untouched: a copy is shuffled with the
 * Fisher–Yates algorithm (each permutation equally likely, given a uniform
 * `Math.random`) and its first `k` elements are returned.
 *
 * @param a The candidate strings.
 * @param k How many elements to pick; passed to `slice(0, k)`, so a value
 *          larger than `a.length` yields all elements.
 * @returns A new array holding the sampled elements.
 */
export function chooseK(a: string[], k: number): string[] {
  const pool = [...a];
  for (let i = pool.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, k);
}
/**
 * Deletes everything that looks like an HTML tag from `text`.
 *
 * A match starts at `<` and runs up to and including the next `>`; when no
 * `>` follows, the match extends to the end of the string.
 *
 * @param text The text to strip.
 * @returns The text with all matches removed.
 */
export function removeHTMLtags(text: string) {
  const tagPattern = /<[^>]*>?/gm;
  const stripped = text.replace(tagPattern, '');
  return stripped;
}
/**
 * Replaces every line break (`\r\n`, `\n` or `\r`) with a single space.
 *
 * @param text The text to flatten.
 * @returns The text on one line.
 */
export function removeAllLineBreaks(text: string) {
  const anyLineBreak = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(anyLineBreak, " ");
  return flattened;
}
/**
 * Looks up a translated text by key and fills in `${name}` placeholders.
 *
 * Lookup order: the requested language, then English. When the language is
 * unknown or the key is missing, an error is logged and the next fallback is
 * tried; if English has no entry either, the key itself is returned.
 *
 * Every occurrence of each `${paramKey}` placeholder is replaced, and the
 * parameter value is inserted literally (a `$` in the value has no special
 * meaning).
 *
 * @param key The i18n key to look up.
 * @param lang The language code; defaults to `'en'`.
 * @param params Placeholder values keyed by placeholder name.
 * @returns The resolved text, or `key` when no text exists.
 */
export function getI18nText(key: string, lang = 'en', params: Record<string, string> = {}) {
  // Load the i18n data.
  const i18nData = i18nJSON as Record<string, any>;

  // Make sure the language exists; fall back to English otherwise.
  let resolvedLang = lang;
  if (!i18nData[resolvedLang]) {
    console.error(`Language '${lang}' not found, falling back to English.`);
    resolvedLang = 'en';
  }

  // Fetch the text for the resolved language. Optional chaining keeps the
  // lookup from throwing when the English table itself is missing.
  let text = i18nData[resolvedLang]?.[key];

  // If the text is missing, fall back to English.
  if (!text) {
    console.error(`Key '${key}' not found for language '${resolvedLang}', falling back to English.`);
    text = i18nData['en']?.[key];
    // If English has no entry either, return the key itself.
    if (!text) {
      console.error(`Key '${key}' not found for English either.`);
      return key;
    }
  }

  // Substitute template variables. split/join replaces all occurrences and
  // treats both the placeholder and the value as plain text.
  if (params && typeof text === 'string') {
    for (const [paramKey, value] of Object.entries(params)) {
      text = text.split(`\${${paramKey}}`).join(value);
    }
  }
  return text;
}
/**
 * Joins two strings, collapsing the longest overlap between the end of
 * `str1` and the start of `str2`.
 *
 * - If either string is empty, the other one is returned.
 * - If one string contains the other, the containing string is returned.
 * - Otherwise the longest suffix of `str1` that is also a prefix of `str2`
 *   is emitted only once; with no overlap the strings are concatenated.
 *
 * @param str1 The leading string.
 * @param str2 The trailing string.
 * @returns The merged string.
 */
export function smartMergeStrings(str1: string, str2: string): string {
  if (!str1) return str2;
  if (!str2) return str1;

  // Containment in either direction makes merging unnecessary.
  if (str1.includes(str2)) return str1;
  if (str2.includes(str1)) return str2;

  // Try candidate overlap sizes from the largest down; the first hit wins.
  for (let size = Math.min(str1.length, str2.length); size > 0; size--) {
    const head = str2.slice(0, size);
    if (str1.endsWith(head)) {
      return str1.slice(0, str1.length - size) + str2;
    }
  }

  // Nothing shared at the seam.
  return str1 + str2;
}
/**
 * Re-indents fenced code blocks so that their content and closing fence line
 * up with the opening fence (or with the enclosing list item).
 *
 * Rules, applied line by line:
 * - A line whose trimmed start is ``` opens a block when none is open and is
 *   kept as is. Up to three preceding lines are searched, nearest first, for
 *   a list item (`*`, `-`, `+` or `1.`); the first one found supplies the
 *   list indentation.
 * - While a block is open, the next fence line closes it and is rewritten as
 *   the opening fence's indentation followed by ```.
 * - Non-blank lines inside a block lose the whitespace prefix they share
 *   with the opening fence's indentation and gain the base indentation: the
 *   list indentation plus one space when a non-empty list indentation was
 *   found, otherwise the opening fence's indentation.
 * - Blank lines inside a block and all lines outside blocks are unchanged.
 *
 * @param markdownText The markdown to fix.
 * @returns The markdown with code block indentation normalized.
 */
export function fixCodeBlockIndentation(markdownText: string): string {
  const FENCE = '```';
  const LIST_ITEM = /^\s*(?:[*\-+]|\d+\.)\s/;

  const sourceLines = markdownText.split('\n');
  const output: string[] = [];

  // Only one block can be open at a time: any fence seen while a block is
  // open closes it.
  let openFence: { indent: string; listIndent: string } | null = null;

  const leadingWhitespace = (text: string): string => /^(\s*)/.exec(text)?.[0] ?? '';

  // Indentation of the closest list item within the three lines above the
  // fence, or '' when there is none.
  const nearestListIndent = (fenceIndex: number): string => {
    const lowest = Math.max(0, fenceIndex - 3);
    for (let j = fenceIndex - 1; j >= lowest; j--) {
      if (LIST_ITEM.test(sourceLines[j])) {
        return leadingWhitespace(sourceLines[j]);
      }
    }
    return '';
  };

  for (const [index, line] of sourceLines.entries()) {
    if (line.trimStart().startsWith(FENCE)) {
      if (openFence === null) {
        // Opening fence: remember its indentation and list context.
        openFence = {
          indent: line.substring(0, line.indexOf(FENCE)),
          listIndent: nearestListIndent(index),
        };
        output.push(line);
      } else {
        // Closing fence: align it with the opening one.
        output.push(openFence.indent + FENCE);
        openFence = null;
      }
      continue;
    }

    // Outside a block, or a blank line inside one: copy verbatim.
    if (openFence === null || line.trim().length === 0) {
      output.push(line);
      continue;
    }

    const baseIndent = openFence.listIndent
      ? openFence.listIndent + " "
      : openFence.indent;

    // Count how much of this line's indentation matches the opening fence's
    // indentation character by character; that part is markdown structure,
    // the remainder belongs to the code itself.
    const ownIndent = leadingWhitespace(line);
    let shared = 0;
    while (
      shared < ownIndent.length &&
      shared < openFence.indent.length &&
      ownIndent[shared] === openFence.indent[shared]
    ) {
      shared++;
    }

    output.push(baseIndent + line.substring(shared));
  }

  return output.join('\n');
}
/**
 * Renders each knowledge item as a `<knowledge-N>` block (N is the 1-based
 * position in `allKnowledge`).
 *
 * Each block contains, in order: the question, a `<knowledge-datetime>`
 * section (only when `updated` is set and the type is `'url'` or
 * `'side-info'`), a `<knowledge-url>` section with the first reference (only
 * when `references` is set and the type is `'url'`), and the answer. The
 * block is trimmed and passed through `removeExtraLineBreaks`.
 *
 * @param allKnowledge The knowledge items to render.
 * @returns One rendered string per knowledge item, in input order.
 */
export function getKnowledgeStr(allKnowledge: KnowledgeItem[]) {
  // Wraps a value in a tag, with the tag and the value on their own lines.
  const section = (tag: string, value: unknown): string =>
    `\n<${tag}>\n${value}\n</${tag}>\n`;

  return allKnowledge.map((k, idx) => {
    const blockTag = `knowledge-${idx + 1}`;

    const datetimeSection =
      k.updated && (k.type === 'url' || k.type === 'side-info')
        ? section('knowledge-datetime', k.updated)
        : '';

    const urlSection =
      k.references && k.type === 'url'
        ? section('knowledge-url', k.references[0])
        : '';

    const aMsg = [
      '',
      `<${blockTag}>`,
      `${k.question}`,
      datetimeSection,
      urlSection,
      `${k.answer}`,
      `</${blockTag}>`,
      '',
    ].join('\n').trim();

    return removeExtraLineBreaks(aMsg);
  })
}
/**
 * Converts HTML tables in a markdown string to markdown tables.
 *
 * Only tables opened with a bare `<table>` tag are considered. Each one is
 * handed to `convertSingleHtmlTableToMd`; when that returns a non-empty
 * string the table is replaced by it, otherwise the HTML is left in place.
 *
 * @param mdString The markdown string containing potential HTML tables
 * @returns The markdown string with HTML tables converted to markdown tables,
 *          or the original string if no conversions were made or an error
 *          occurred
 */
export function convertHtmlTablesToMd(mdString: string): string {
  try {
    // Cheap pre-check: nothing to do without an opening table tag.
    if (!mdString.includes('<table>')) {
      return mdString;
    }

    // A replacer function is used so the converted table is inserted
    // literally; with a replacement string, "$" sequences in cell content
    // (e.g. "$1" or "$$") would be interpreted as replacement patterns.
    return mdString.replace(
      /<table>([\s\S]*?)<\/table>/g,
      (htmlTable: string) => convertSingleHtmlTableToMd(htmlTable) || htmlTable
    );
  } catch (error) {
    console.error('Error converting HTML tables to Markdown:', error);
    return mdString; // Return original string if conversion fails
  }
}
/**
 * Converts a single HTML table to a markdown table.
 *
 * Headers come from `thead th`; when there are none, the cells of the first
 * `tbody tr` row are used as headers and that row is not repeated as data.
 * Every remaining body row becomes one markdown row; rows with fewer cells
 * than headers are padded with empty cells.
 *
 * @param htmlTable The HTML table string
 * @returns The markdown table string, or null if there is no table, no
 *          header could be determined, or conversion fails
 */
function convertSingleHtmlTableToMd(htmlTable: string): string | null {
  try {
    const doc = new DOMParser().parseFromString(htmlTable, 'text/html');
    const table = doc.querySelector('table');
    if (!table) {
      return null;
    }

    const bodyRows = Array.from(table.querySelectorAll('tbody tr'));

    // Preferred source of headers: <th> cells inside <thead>.
    let headers = Array.from(table.querySelectorAll('thead th'))
      .map(th => sanitizeCell(th.textContent || ''));

    // Fallback: promote the first body row to the header row. Track that
    // this happened so the row is skipped below regardless of whether a
    // (header-less) <thead> element exists.
    let headersFromFirstBodyRow = false;
    const firstRow = bodyRows[0];
    if (headers.length === 0 && firstRow) {
      headers = Array.from(firstRow.querySelectorAll('td, th'))
        .map(cell => sanitizeCell(cell.textContent || ''));
      headersFromFirstBodyRow = headers.length > 0;
    }

    if (headers.length === 0) {
      return null; // No headers found, can't create a valid markdown table
    }

    const mdLines: string[] = [
      '| ' + headers.join(' | ') + ' |',
      '| ' + headers.map(() => '---').join(' | ') + ' |',
    ];

    const dataRows = headersFromFirstBodyRow ? bodyRows.slice(1) : bodyRows;
    for (const row of dataRows) {
      // Include <th> cells too, so row-header cells keep their column.
      const cells = Array.from(row.querySelectorAll('td, th')).map(renderTableCellToMd);

      // Ensure each row has at least as many cells as there are headers.
      while (cells.length < headers.length) {
        cells.push('');
      }
      mdLines.push('| ' + cells.join(' | ') + ' |');
    }

    return mdLines.join('\n') + '\n';
  } catch (error) {
    console.error('Error converting single HTML table:', error);
    return null;
  }
}

/**
 * Renders one table cell as markdown-table-safe text: list items are joined
 * with `<br>`, inline HTML formatting becomes markdown, all remaining tags
 * except `<br>` are stripped, and the result is passed to `sanitizeCell`.
 */
function renderTableCellToMd(cell: Element): string {
  const cellHtml = cell.innerHTML;

  // "* " hints at a list, but it also occurs in plain text such as
  // "**bold** text". Only use the list form when items were actually found;
  // otherwise fall back to inline conversion so the content is not lost.
  const listItems = cellHtml.includes('* ') ? extractTableCellListItems(cell) : [];
  const converted = listItems.length > 0
    ? listItems.join('<br>')
    : convertInlineTableCellHtml(cellHtml);

  // Strip any remaining HTML tags, but keep <br>.
  const withoutTags = converted.replace(/<(?!\/?br\b)[^>]*>/g, '').trim();
  return sanitizeCell(withoutTags);
}

/**
 * Collects list items from a cell: the inner HTML of its `<li>` elements
 * when there are any, otherwise the text of lines that start with `* `.
 */
function extractTableCellListItems(cell: Element): string[] {
  const htmlItems = Array.from(cell.querySelectorAll('li'));
  if (htmlItems.length > 0) {
    return htmlItems.map(li => li.innerHTML.trim());
  }
  return cell.innerHTML
    .split('\n')
    .map(line => line.trim())
    .filter(line => /^\s*\*\s+/.test(line))
    .map(line => line.replace(/^\s*\*\s+/, ''));
}

/**
 * Converts inline HTML formatting to markdown: bold and italic tags become
 * `**` and `_`, `<br>` variants are normalized, opening `<p>` tags are
 * dropped and closing `</p>` tags become `<br>`.
 */
function convertInlineTableCellHtml(html: string): string {
  return html
    .replace(/<\/?strong>/g, '**')
    .replace(/<\/?em>/g, '_')
    .replace(/<\/?b>/g, '**')
    .replace(/<\/?i>/g, '_')
    .replace(/<br\s*\/?>/g, '<br>')
    .replace(/<p\s*\/?>/g, '')
    .replace(/<\/p>/g, '<br>');
}
/**
 * Prepares text for use inside a markdown table cell.
 *
 * After trimming, the following rewrites are applied in order: `|` is
 * escaped as `\|`, newlines become `<br>`, the entity form `&lt;br&gt;`
 * becomes `<br>`, and backslash-escaped markdown markers (`\*\*`, `\*`,
 * `\_`) are unescaped.
 *
 * @param content The cell content
 * @returns Sanitized content
 */
function sanitizeCell(content: string): string {
  // Order matters: the two-star rule must run before the single-star rule.
  const rewrites: Array<[RegExp, string]> = [
    [/\|/g, '\\|'],
    [/\n/g, '<br>'],
    [/&lt;br&gt;/g, '<br>'],
    [/\\\*\\\*/g, '**'],
    [/\\\*/g, '*'],
    [/\\_/g, '_'],
  ];

  return rewrites.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    content.trim()
  );
}
// When no `window` global exists, install a global `DOMParser` backed by
// jsdom so the table conversion code in this file can call
// `new DOMParser().parseFromString(...)`.
if (typeof window === 'undefined') {
  global.DOMParser = class DOMParser {
    /**
     * Parses `htmlString` with jsdom, using `mimeType` as the content type,
     * and returns the resulting document.
     */
    parseFromString(htmlString: string, mimeType: string) {
      return new JSDOM(htmlString, {contentType: mimeType}).window.document;
    }
  };
}