diff --git a/src/agent.ts b/src/agent.ts index 7f1cac3..259c6ca 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -36,7 +36,7 @@ import { buildMdFromAnswer, chooseK, convertHtmlTablesToMd, fixCodeBlockIndentation, removeExtraLineBreaks, - removeHTMLtags, repairMarkdownFootnotesOuter + removeHTMLtags, repairMarkdownFinal, repairMarkdownFootnotesOuter } from "./utils/text-tools"; import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas"; import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools"; @@ -111,7 +111,7 @@ function getPrompt( knowledge?: KnowledgeItem[], allURLs?: BoostedSearchSnippet[], beastMode?: boolean, -): { system: string, urlList?: string[]} { +): { system: string, urlList?: string[] } { const sections: string[] = []; const actionSections: string[] = []; @@ -140,7 +140,7 @@ ${context.join('\n')} if (allowRead && urlList.length > 0) { const urlListStr = urlList .map((item, idx) => ` - [idx=${idx + 1}] [weight=${item.score.toFixed(2)}] "${item.url}": "${item.merged}"`) - .join('\n') + .join('\n') actionSections.push(` @@ -232,7 +232,7 @@ ${actionSections.join('\n\n')} return { system: removeExtraLineBreaks(sections.join('\n\n')), urlList: urlList.map(u => u.url) -}; + }; } @@ -441,10 +441,10 @@ export async function getResponse(question?: string, messages.forEach(m => { let strMsg = ''; if (typeof m.content === 'string') { - strMsg = m.content.trim(); - } else if (typeof m.content === 'object' && Array.isArray( m.content)) { + strMsg = m.content.trim(); + } else if (typeof m.content === 'object' && Array.isArray(m.content)) { // find the very last sub content whose 'type' is 'text' and use 'text' as the question - strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim(); + strMsg = m.content.filter(c => c.type === 'text').map(c => c.text).join('\n').trim(); } extractUrlsWithDescription(strMsg).forEach(u => { @@ -506,7 +506,7 @@ export async function 
getResponse(question?: string, allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already // generate prompt for this step - const { system, urlList} = getPrompt( + const {system, urlList} = getPrompt( diaryContext, allQuestions, allKeywords, @@ -917,7 +917,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b // any answer is better than no answer, humanity last resort step++; totalStep++; - const { system } = getPrompt( + const {system} = getPrompt( diaryContext, allQuestions, allKeywords, @@ -952,18 +952,19 @@ But unfortunately, you failed to solve the issue. You need to think out of the b if (!trivialQuestion) { (thisStep as AnswerAction).mdAnswer = - convertHtmlTablesToMd( - fixBadURLMdLinks( - fixCodeBlockIndentation( - repairMarkdownFootnotesOuter( - await fixMarkdown( - buildMdFromAnswer((thisStep as AnswerAction)), - allKnowledge, - context, - SchemaGen - )) - ), - allURLs)); + repairMarkdownFinal( + convertHtmlTablesToMd( + fixBadURLMdLinks( + fixCodeBlockIndentation( + repairMarkdownFootnotesOuter( + await fixMarkdown( + buildMdFromAnswer((thisStep as AnswerAction)), + allKnowledge, + context, + SchemaGen + )) + ), + allURLs))); } else { (thisStep as AnswerAction).mdAnswer = convertHtmlTablesToMd( @@ -1029,7 +1030,6 @@ ${JSON.stringify(zodToJsonSchema(schema), null, 2)} } } - export async function main() { const question = process.argv[2] || ""; const { diff --git a/src/utils/text-tools.ts b/src/utils/text-tools.ts index 00f35ea..6ea1f24 100644 --- a/src/utils/text-tools.ts +++ b/src/utils/text-tools.ts @@ -663,4 +663,140 @@ if (typeof window === 'undefined') { return dom.window.document; } }; -} \ No newline at end of file +} + +/** + * Escapes special regex characters in a string + */ +function escapeRegExp(string: string): string { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Counts occurrences of a specific character in a string + */ +function 
/**
 * Counts non-overlapping occurrences of `char` in `text`.
 *
 * @param text - The string to search.
 * @param char - The literal substring to count; an empty string yields 0.
 * @returns The number of occurrences.
 */
function countChar(text: string, char: string): number {
  if (char.length === 0) {
    return 0;
  }
  // Splitting on a literal avoids building a regex (and escaping) per call.
  return text.split(char).length - 1;
}

/**
 * Moves a trailing run of colons out of every `openMarker…closeMarker` span.
 *
 * Example: `**Title:**` becomes `**Title**:`. Both the ASCII colon (`:`) and
 * the full-width colon (U+FF1A) are recognised, and the run is re-emitted
 * verbatim so order and count are preserved. Colons in the middle of a span
 * (`**At 10:30**`) are left alone, because hoisting them would corrupt the text.
 *
 * A span never crosses a line break; an opening marker with no closing marker
 * on the same line is skipped.
 *
 * @param text - The markdown to process.
 * @param openMarker - Literal opening delimiter, e.g. `**`.
 * @param closeMarker - Literal closing delimiter, e.g. `**`.
 * @returns The text with trailing colons moved behind the closing marker.
 */
function processFormattedText(text: string, openMarker: string, closeMarker: string): string {
  if (openMarker === '' || closeMarker === '') {
    return text;
  }

  const lineBreak = /[\n\r\u2028\u2029]/;
  const trailingColons = /[:\uFF1A]+$/;

  let result = '';
  let cursor = 0;

  while (cursor < text.length) {
    const openAt = text.indexOf(openMarker, cursor);
    if (openAt === -1) {
      break;
    }

    const contentStart = openAt + openMarker.length;
    const closeAt = text.indexOf(closeMarker, contentStart);
    if (closeAt === -1) {
      // No closing marker anywhere ahead, so no later opener can match either.
      break;
    }

    const content = text.slice(contentStart, closeAt);
    if (lineBreak.test(content)) {
      // The nearest closer is on another line: this opener cannot match,
      // so resume the scan one character further on.
      result += text.slice(cursor, openAt + 1);
      cursor = openAt + 1;
      continue;
    }

    const spanEnd = closeAt + closeMarker.length;
    const trimmed = content.trim();
    const colonRun = trailingColons.exec(trimmed);
    const label = colonRun === null ? '' : trimmed.slice(0, colonRun.index).trim();

    if (colonRun === null || label === '') {
      // Nothing to hoist (or the span is only colons): copy it through untouched.
      result += text.slice(cursor, spanEnd);
    } else {
      result += text.slice(cursor, openAt) + openMarker + label + closeMarker + colonRun[0];
    }
    cursor = spanEnd;
  }

  return result + text.slice(cursor);
}

/**
 * Locates the `[start, end)` character ranges occupied by tables.
 *
 * HTML tables are `<table …>…</table>` blocks. A markdown table starts at the
 * first line that begins with `|` and contains a second `|`, and runs until
 * the line before the next blank line (or the end of the input).
 */
function findTableRegions(markdown: string): Array<[number, number]> {
  const regions: Array<[number, number]> = [];

  const htmlTable = /<table[\s\S]*?<\/table>/gi;
  let htmlMatch: RegExpExecArray | null;
  while ((htmlMatch = htmlTable.exec(markdown)) !== null) {
    regions.push([htmlMatch.index, htmlMatch.index + htmlMatch[0].length]);
  }

  // Track absolute offsets while walking the lines; searching for the line
  // text instead would hit an earlier identical line (e.g. a repeated header).
  let lineStart = 0;
  let previousLineEnd = 0;
  let tableStart = -1;

  for (const rawLine of markdown.split('\n')) {
    const line = rawLine.trim();

    if (line.startsWith('|') && line.includes('|', 1)) {
      if (tableStart === -1) {
        tableStart = lineStart;
      }
    } else if (tableStart !== -1 && line === '') {
      regions.push([tableStart, previousLineEnd]);
      tableStart = -1;
    }

    previousLineEnd = lineStart + rawLine.length;
    lineStart = previousLineEnd + 1; // skip the '\n' consumed by split
  }

  if (tableStart !== -1) {
    regions.push([tableStart, markdown.length]);
  }

  return regions;
}

/**
 * Deletes `<br>` and `<hr>` tags (any letter case, optionally self-closing)
 * that do not start inside a table region; tags inside tables are kept.
 */
function stripBreakTagsOutsideTables(markdown: string): string {
  const tableRegions = findTableRegions(markdown);
  const isInTable = (index: number): boolean =>
    tableRegions.some(([start, end]) => index >= start && index < end);

  return markdown.replace(/<(?:br|hr)\s*\/?>/gi, (tag: string, offset: number) =>
    isInTable(offset) ? tag : ''
  );
}

/**
 * Repairs markdown by:
 * 1. Removing `<hr>` and `<br>` tags that are not inside HTML or markdown tables
 * 2. Moving trailing colons outside of bold and italic formatting
 *
 * This is a best-effort clean-up: if anything throws, the input is returned
 * unchanged rather than failing the caller.
 *
 * @param markdown - The markdown string to repair
 * @returns The repaired markdown, or the original if an error occurs
 */
export function repairMarkdownFinal(markdown: string): string {
  // Ordered from the longest delimiters to the shortest so that a `***` span
  // is handled before its `**` / `*` prefixes are considered.
  const formattingPatterns: ReadonlyArray<readonly [string, string]> = [
    ['****', '****'], // Four asterisks
    ['****', '***'],  // Four opening, three closing
    ['***', '****'],  // Three opening, four closing
    ['***', '***'],   // Three asterisks
    ['**', '**'],     // Two asterisks (bold)
    ['*', '*'],       // One asterisk (italic)
  ];

  try {
    let repairedMarkdown = stripBreakTagsOutsideTables(markdown);

    for (const [open, close] of formattingPatterns) {
      repairedMarkdown = processFormattedText(repairedMarkdown, open, close);
    }

    return repairedMarkdown;
  } catch {
    // Return the original markdown if any error occurs
    return markdown;
  }
}