diff --git a/config.json b/config.json index fcdad2c..bb229dd 100644 --- a/config.json +++ b/config.json @@ -51,9 +51,12 @@ "agentBeastMode": { "temperature": 0.7 }, - "mdFixer": { + "finalizer": { "model": "gemini-2.5-flash-preview-05-20" }, + "reducer": { + "maxTokens": 16000 + }, "fallback": { "maxTokens": 8000, "model": "gemini-2.0-flash-lite" @@ -85,7 +88,10 @@ "fallback": { "temperature": 0 }, - "mdFixer": {} + "finalizer": {}, + "reducer": { + "maxTokens": 16000 + } } } } diff --git a/jina-ai/config.json b/jina-ai/config.json index a5059f0..c8dd248 100644 --- a/jina-ai/config.json +++ b/jina-ai/config.json @@ -59,7 +59,8 @@ "maxTokens": 8000, "model": "gemini-2.0-flash-lite" }, - "mdFixer": {} + "finalizer": {}, + "reducer": {"maxTokens": 16000} } }, "openai": { @@ -87,7 +88,8 @@ "fallback": { "temperature": 0 }, - "mdFixer": {} + "finalizer": {}, + "reducer": {"maxTokens": 16000} } } } diff --git a/package.json b/package.json index 9359156..3651e87 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "dev": "npx ts-node src/agent.ts", "search": "npx ts-node src/test-duck.ts", "rewrite": "npx ts-node src/tools/query-rewriter.ts", + "ngram": "npx ts-node src/cli/ngram.ts", "lint": "eslint . --ext .ts", "lint:fix": "eslint . 
--ext .ts --fix", "serve": "ts-node src/server.ts", @@ -65,4 +66,4 @@ "optionalDependencies": { "@ai-sdk/google-vertex": "^2.1.12" } -} +} \ No newline at end of file diff --git a/src/agent.ts b/src/agent.ts index ea8180c..d2278af 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -41,10 +41,11 @@ import { } from "./utils/text-tools"; import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas"; import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools"; -import { reviseAnswer } from "./tools/md-fixer"; +import { finalizeAnswer } from "./tools/finalizer"; import { buildImageReferences, buildReferences } from "./tools/build-ref"; import { logInfo, logError, logDebug, logWarning } from './logging'; import { researchPlan } from './tools/research-planner'; +import { reduceAnswers } from './tools/reducer'; async function wait(seconds: number) { logDebug(`Waiting ${seconds}s...`); @@ -813,8 +814,13 @@ But then you realized you have asked them before. You decided to to think out of isAggregated: true } as AnswerAction; + // aggregate urls + visitedURLs.push(...subproblemResponses.map(r => r.readURLs).flat()); + weightedURLs = subproblemResponses.map(r => r.allURLs.map(url => ({ url, title: '' } as BoostedSearchSnippet))).flat(); - // break the loop, move to final boxing + // TODO aggregate images @shazhou2015 + + // break the loop, jump directly final boxing break; } @@ -1037,7 +1043,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b fixBadURLMdLinks( fixCodeBlockIndentation( repairMarkdownFootnotesOuter( - await reviseAnswer( + await finalizeAnswer( answerStep.answer, allKnowledge, context, @@ -1072,6 +1078,9 @@ But unfortunately, you failed to solve the issue. 
/**
 * CLI entry point for n-gram analysis of a text file.
 *
 * Usage: `npm run ngram -- <file> [n] [minFreq] [minPMI]`
 *
 * - `n`       maximum n-gram length (default 3)
 * - `minFreq` minimum number of occurrences to report (default 2)
 * - `minPMI`  minimum PMI score to report (default 1.0)
 *
 * Prints one line per n-gram returned by `extractNgrams`; exits with
 * status 1 when no file path is given or the file cannot be processed.
 */
async function main() {
  const args = process.argv.slice(2);
  if (args.length === 0) {
    console.error('Please provide a file path');
    process.exit(1);
  }

  const filePath = args[0];
  // Missing or unparsable values yield NaN (falsy) and fall back to the default.
  const n = parseInt(args[1] ?? '', 10) || 3; // Default to 3-grams
  const minFreq = parseInt(args[2] ?? '', 10) || 2; // Default minimum frequency of 2

  // A PMI threshold of 0 is meaningful ("keep anything not negatively
  // associated"), so only fall back to the default when parsing fails.
  const parsedMinPMI = parseFloat(args[3] ?? '');
  const minPMI = Number.isNaN(parsedMinPMI) ? 1.0 : parsedMinPMI; // Default minimum PMI of 1.0

  try {
    const text = await fs.promises.readFile(filePath, 'utf-8');
    const results = extractNgrams(text, n, minFreq, minPMI);

    console.log('\nN-gram Analysis Results:');
    console.log('------------------------');
    for (const { ngram, freq, pmi } of results) {
      if (pmi !== undefined) {
        console.log(`${ngram}: ${freq} (PMI: ${pmi.toFixed(2)})`);
      } else {
        // Entries without a PMI score are reported by frequency only.
        console.log(`${ngram}: ${freq}`);
      }
    }
  } catch (err: unknown) {
    // Narrow instead of asserting: anything can be thrown.
    const message = err instanceof Error ? err.message : String(err);
    console.error('Error:', message);
    process.exit(1);
  }
}

// All failures are handled inside main(); `void` marks the promise as intentionally unawaited.
void main();
src/tools/finalizer.ts index 50df7c3..2928fbc 100644 --- a/src/tools/md-fixer.ts +++ b/src/tools/finalizer.ts @@ -61,9 +61,9 @@ IMPORTANT: Do not begin your response with phrases like "Sure", "Here is", "Belo } } -const TOOL_NAME = 'mdFixer'; +const TOOL_NAME = 'finalizer'; -export async function reviseAnswer( +export async function finalizeAnswer( mdContent: string, knowledgeItems: KnowledgeItem[], trackers: TrackerContext, @@ -71,7 +71,7 @@ export async function reviseAnswer( ): Promise { try { const prompt = getPrompt(mdContent, knowledgeItems, schema); - trackers?.actionTracker.trackThink('final_answer', schema.languageCode) + trackers?.actionTracker.trackThink('finalize_answer', schema.languageCode) const result = await generateText({ model: getModel(TOOL_NAME), @@ -83,10 +83,10 @@ export async function reviseAnswer( logInfo(TOOL_NAME, { text: result.text }); - logDebug(`repaired before/after: ${mdContent.length} -> ${result.text.length}`); + logDebug(`finalized answer before/after: ${mdContent.length} -> ${result.text.length}`); if (result.text.length < mdContent.length * 0.85) { - logWarning(`repaired content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, { + logWarning(`finalized answer length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, { originalContent: mdContent, repairedContent: result.text }); @@ -96,7 +96,7 @@ export async function reviseAnswer( return result.text; } catch (error) { - logError(`Error in ${TOOL_NAME}`, { error }); + logError(TOOL_NAME, { error }); return mdContent; } } \ No newline at end of file diff --git a/src/tools/reducer.ts b/src/tools/reducer.ts new file mode 100644 index 0000000..913be96 --- /dev/null +++ b/src/tools/reducer.ts @@ -0,0 +1,93 @@ +import { PromptPair, TrackerContext } from '../types'; +import { getModel } from "../config"; +import { generateText } 
/**
 * Builds the system/user prompt pair for the reducer model.
 *
 * The system prompt instructs the model to merge several source articles
 * into one article while keeping original sentences verbatim; the user
 * prompt is the concatenated markdown to be merged.
 *
 * NOTE(review): the runs of blank lines inside the prompt may correspond to
 * section markup that is not visible here — confirm against the repository.
 *
 * @param mdContent Markdown text containing the answers to merge.
 * @returns The prompt pair passed to `generateText`.
 */
function getPrompt(mdContent: string): PromptPair {
  return {
    system: `
You are an article aggregator that creates a coherent, high-quality article by smartly merging multiple source articles. Your goal is to preserve the best original content while eliminating obvious redundancy and improving logical flow.


1. Content Preservation
ALWAYS preserve original sentences verbatim - do not paraphrase or rewrite
Select the highest quality version when multiple articles cover the same point
Maintain the original author's voice and technical accuracy
Keep direct quotes, statistics, and factual claims exactly as written
2. Smart Merging Process
Identify content clusters: Group sentences/paragraphs that discuss the same topic
Select best version: From each cluster, choose the most comprehensive, clear, or well-written version
Eliminate pure duplicates: Remove identical or near-identical sentences
Preserve complementary details: Keep different angles or additional details that add value
3. Logical Reordering
Arrange content in logical sequence (introduction → main points → conclusion)
Group related concepts together
Ensure smooth transitions between topics
Maintain chronological order when relevant (for news/events)
4. Quality Criteria for Selection
When choosing between similar content, prioritize:
Clarity: More understandable explanations
Completeness: More comprehensive coverage
Accuracy: Better sourced or more precise information
Relevance: More directly related to the main topic



Structure the final article with:
Clear section headings (when appropriate)
Logical paragraph breaks
Smooth flow between topics
No attribution to individual sources (present as unified piece)


Do not add your own commentary or analysis
Do not change technical terms, names, or specific details

Your final output should read as a cohesive, high-quality article that appears to be written by a single author, while actually being a careful curation of the best sentences from all input sources.
    `,
    user: mdContent
  };
}

const TOOL_NAME = 'reducer';

/**
 * Merges aggregated sub-answers into a single article using the reducer model.
 *
 * This is best-effort: if the model call fails or produces no text, the
 * original content is returned unchanged so the caller always has an answer.
 *
 * @param mdContent Markdown containing the answers to merge.
 * @param trackers  Action/token trackers; accessed defensively throughout.
 * @param schema    Supplies the language code for the progress message.
 * @returns The merged article, or `mdContent` when reduction fails.
 */
export async function reduceAnswers(
  mdContent: string,
  trackers: TrackerContext,
  schema: Schemas
): Promise<string> {
  try {
    const prompt = getPrompt(mdContent);
    trackers?.actionTracker.trackThink('reduce_answer', schema.languageCode);

    const result = await generateText({
      model: getModel(TOOL_NAME),
      system: prompt.system,
      prompt: prompt.user,
    });

    // Optional-chained like the trackThink call above: a missing tracker must
    // not throw here, otherwise the catch below would discard a successful result.
    trackers?.tokenTracker.trackUsage(TOOL_NAME, result.usage);

    logInfo(TOOL_NAME, { text: result.text });
    logDebug(`reduce before/after: ${mdContent.length} -> ${result.text.length}`);

    // An empty response would wipe out the answer entirely; keep the original.
    if (result.text.trim().length === 0) {
      logError(TOOL_NAME, { error: 'empty model output; returning original content' });
      return mdContent;
    }

    // The original kept a "result shorter than 85% of the input" guard here,
    // commented out; it stays disabled — presumably because merging is
    // expected to shorten the text.
    return result.text;
  } catch (error) {
    logError(TOOL_NAME, { error });
    return mdContent;
  }
}
b/src/utils/i18n.json @@ -5,7 +5,8 @@ "read_for": "Let me read ${urls} to gather more information.", "read_for_verify": "Let me fetch the source content to verify the answer.", "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.", - "final_answer": "Let me finalize the answer.", + "finalize_answer": "Let me finalize the answer.", + "reduce_answer": "Let me aggregate all research results.", "blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.", "hostnames_no_results": "Can't find any results from ${hostnames}.", "cross_reference": "Let me cross-reference the information from the web to verify the answer." @@ -16,7 +17,8 @@ "read_for": "让我读取网页 ${urls} 来获取更多信息。", "read_for_verify": "让我读取源网页内容来验证答案。", "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。", - "final_answer": "我来整理一下答案。", + "finalize_answer": "我来整理一下答案。", + "reduce_answer": "让我综合整理所有的调研结果。", "blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。", "hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。", "cross_reference": "让我交叉验证一下网页上的信息来验证答案。" @@ -27,7 +29,8 @@ "read_for": "讓我閱讀 ${urls} 來獲取更多信息。", "read_for_verify": "讓我獲取源內容來驗證答案。", "late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。", - "final_answer": "我來整理一下答案。", + "finalize_answer": "我來整理一下答案。", + "reduce_answer": "讓我整合所有調研結果。", "blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!", "hostnames_no_results": "咦... 
${hostnames} 找不到什么结果。", "cross_reference": "讓我交叉驗證一下網頁上的信息來驗證答案。" @@ -38,7 +41,8 @@ "read_for": "${urls} を読んで、情報を集めます。", "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。", "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。", - "final_answer": "答えをまとめます。", + "finalize_answer": "答えをまとめます。", + "reduce_answer": "答えをまとめます。", "blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。", "hostnames_no_results": "${hostnames} から結果が見つかりません。", "cross_reference": "ウェブ上の情報をクロスリファレンスして、答えを確認します。" @@ -49,7 +53,8 @@ "read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.", "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.", "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.", - "final_answer": "답변을 마무리하겠습니다.", + "finalize_answer": "답변을 마무리하겠습니다.", + "reduce_answer": "답변을 마무리하겠습니다.", "blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!", "hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다.", "cross_reference": "웹에서 정보를 교차 검증하여 답변을 확인하겠습니다." @@ -60,7 +65,8 @@ "read_for": "Je vais lire ${urls} pour obtenir plus d'informations.", "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.", "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.", - "final_answer": "Je vais finaliser la réponse.", + "finalize_answer": "Je vais finaliser la réponse.", + "reduce_answer": "Je vais finaliser la réponse.", "blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !", "hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}.", "cross_reference": "Je vais croiser les informations sur le web pour vérifier la réponse." 
@@ -71,7 +77,8 @@ "read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.", "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.", "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.", - "final_answer": "Ich werde die Antwort abschließen.", + "finalize_answer": "Ich werde die Antwort abschließen.", + "reduce_answer": "Ich werde die Antwort abschließen.", "blocked_content": "Mist! ${url} lässt mich nicht rein.", "hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden.", "cross_reference": "Ich werde die Informationen im Web abgleichen, um die Antwort zu überprüfen." @@ -82,7 +89,8 @@ "read_for": "Voy a leer ${urls} para recopilar más información.", "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.", "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.", - "final_answer": "Voy a finalizar la respuesta.", + "finalize_answer": "Voy a finalizar la respuesta.", + "reduce_answer": "Voy a finalizar la respuesta.", "blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!", "hostnames_no_results": "No se encontraron resultados de ${hostnames}." }, @@ -92,7 +100,8 @@ "read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.", "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.", "late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.", - "final_answer": "Finalizzerò la risposta.", + "finalize_answer": "Finalizzerò la risposta.", + "reduce_answer": "Finalizzerò la risposta.", "blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!", "hostnames_no_results": "Nessun risultato trovato da ${hostnames}.", "cross_reference": "Incrocerò le informazioni sul web per verificare la risposta." 
@@ -103,7 +112,8 @@ "read_for": "Vou ler ${urls} para reunir mais informações.", "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.", "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.", - "final_answer": "Vou finalizar a resposta.", + "finalize_answer": "Vou finalizar a resposta.", + "reduce_answer": "Vou finalizar a resposta.", "blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!", "hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}.", "cross_reference": "Vou cruzar as informações da web para verificar a resposta." @@ -114,7 +124,8 @@ "read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.", "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.", "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.", - "final_answer": "Дайте мне завершить ответ.", + "finalize_answer": "Дайте мне завершить ответ.", + "reduce_answer": "Дайте мне завершить ответ.", "blocked_content": "Ой! Меня заблокировал ${url}, не круто!", "hostnames_no_results": "Ничего не найдено на ${hostnames}.", "cross_reference": "Дайте мне сопоставить информацию из сети, чтобы проверить ответ." @@ -125,6 +136,8 @@ "read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.", "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.", "late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.", + "finalize_answer": "دعني أنهي الإجابة.", + "reduce_answer": "دعني أنهي الإجابة.", "blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!", "hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}.", "cross_reference": "دعني أقوم بمقارنة المعلومات من الويب للتحقق من الإجابة." 
/** A single extracted n-gram together with its statistics. */
interface NgramResult {
  /** Space-joined words, or a raw character slice for chunks containing CJK text. */
  ngram: string;
  /** Number of occurrences across all chunks. */
  freq: number;
  /** Pointwise mutual information; left undefined for n-grams containing CJK characters. */
  pmi?: number;
}

/**
 * Computes the pointwise mutual information of a space-separated n-gram:
 * log2( P(ngram) / Π P(word) ).
 *
 * Both the joint and the per-word probabilities are normalized by
 * `totalNgrams`, so scores are comparable only within one extraction run.
 *
 * @param ngram       N-gram whose words are separated by single spaces.
 * @param ngramFreq   Occurrence count of the n-gram.
 * @param wordFreqs   Occurrence count per word.
 * @param totalNgrams Total number of n-grams counted.
 * @returns The PMI score, or 0 when it cannot be computed (fewer than two
 *          words, nothing counted, or a word with no recorded frequency).
 */
function calculatePMI(
  ngram: string,
  ngramFreq: number,
  wordFreqs: Map<string, number>,
  totalNgrams: number
): number {
  const words = ngram.split(' ');
  if (words.length < 2 || totalNgrams <= 0) return 0;

  // Calculate joint probability
  const jointProb = ngramFreq / totalNgrams;

  // Probability of seeing the words together if they were independent
  const independentProb = words
    .map(word => (wordFreqs.get(word) ?? 0) / totalNgrams)
    .reduce((a, b) => a * b, 1);

  // A word that was never counted would make the ratio infinite; report
  // "no measurable association" instead of letting Infinity top the ranking.
  if (independentProb <= 0 || jointProb <= 0) return 0;

  return Math.log2(jointProb / independentProb);
}

/** Returns true when the first UTF-16 code unit of `char` lies in one of the CJK ranges below. */
function isCJK(char: string): boolean {
  const code = char.charCodeAt(0);
  return (
    (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
    (code >= 0x3040 && code <= 0x309F) || // Hiragana
    (code >= 0x30A0 && code <= 0x30FF) || // Katakana
    (code >= 0xAC00 && code <= 0xD7AF)    // Hangul
  );
}

/** Returns true when `text` contains at least one CJK character. */
function isCJKText(text: string): boolean {
  return Array.from(text).some(char => isCJK(char));
}

/**
 * Extracts frequent n-grams of length 2..n from `text`.
 *
 * The text is split into non-empty lines. Lines containing CJK characters are
 * sliced into character-level n-grams; all other lines are split on
 * whitespace into word-level n-grams. Word-level n-grams are scored with PMI
 * and must reach `minPMI`; n-grams containing CJK characters have no PMI and
 * are kept on frequency alone.
 *
 * @param text    Input text.
 * @param n       Maximum n-gram length; values below 2 yield no results.
 * @param minFreq Minimum occurrence count for an n-gram to be reported.
 * @param minPMI  Minimum PMI for scored n-grams.
 * @returns PMI-scored n-grams by descending PMI, followed by unscored
 *          n-grams by descending frequency.
 */
export function extractNgrams(
  text: string,
  n: number,
  minFreq: number = 2,
  minPMI: number = 1.0
): NgramResult[] {
  // Split text into chunks by newlines
  const chunks = text.split('\n').filter(chunk => chunk.trim().length > 0);

  // Maps to store frequencies
  const ngramFreq = new Map<string, number>();
  const wordFreq = new Map<string, number>();
  let totalNgrams = 0;

  const countNgram = (ngram: string): void => {
    ngramFreq.set(ngram, (ngramFreq.get(ngram) ?? 0) + 1);
    totalNgrams++;
  };

  // First pass: collect frequencies
  for (const chunk of chunks) {
    if (isCJKText(chunk)) {
      // For CJK text, use character-level ngrams
      for (let len = 2; len <= n; len++) {
        for (let i = 0; i <= chunk.length - len; i++) {
          countNgram(chunk.slice(i, i + len));
        }
      }
    } else {
      // For non-CJK text, use word-level ngrams
      const words = chunk.split(/\s+/).filter(word => word.length > 0);

      // Count individual word frequencies
      for (const word of words) {
        wordFreq.set(word, (wordFreq.get(word) ?? 0) + 1);
      }

      // Count ngram frequencies
      for (let len = 2; len <= n; len++) {
        for (let i = 0; i <= words.length - len; i++) {
          countNgram(words.slice(i, i + len).join(' '));
        }
      }
    }
  }

  // Second pass: calculate PMI and filter
  return Array.from(ngramFreq.entries())
    .filter(([, freq]) => freq >= minFreq)
    .map(([ngram, freq]): NgramResult => ({
      ngram,
      freq,
      // CJK n-grams have no word statistics. Leaving PMI undefined (rather
      // than 0) keeps them from being discarded by the minPMI filter below.
      pmi: isCJKText(ngram) ? undefined : calculatePMI(ngram, freq, wordFreq, totalNgrams),
    }))
    .filter(result => result.pmi === undefined || result.pmi >= minPMI)
    .sort((a, b) => {
      // If both have PMI scores, sort by PMI
      if (a.pmi !== undefined && b.pmi !== undefined) {
        return b.pmi - a.pmi;
      }
      // If only one has PMI, prioritize the one with PMI
      if (a.pmi !== undefined) return -1;
      if (b.pmi !== undefined) return 1;
      // If neither has PMI (CJK text), sort by frequency
      return b.freq - a.freq;
    });
}