refactor: replace mdFixer with finalizer and reducer, add ngram script

This commit is contained in:
Han Xiao
2025-06-11 17:02:33 -07:00
parent 7965ce1167
commit 1fef3c26d9
9 changed files with 298 additions and 28 deletions

View File

@@ -51,9 +51,12 @@
"agentBeastMode": {
"temperature": 0.7
},
"mdFixer": {
"finalizer": {
"model": "gemini-2.5-flash-preview-05-20"
},
"reducer": {
"maxTokens": 16000
},
"fallback": {
"maxTokens": 8000,
"model": "gemini-2.0-flash-lite"
@@ -85,7 +88,10 @@
"fallback": {
"temperature": 0
},
"mdFixer": {}
"finalizer": {},
"reducer": {
"maxTokens": 16000
}
}
}
}
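For context, these per-tool entries ("finalizer", "reducer", "fallback") are presumably what getModel (imported from the config module in the tool files below) resolves by tool name. A minimal sketch of that lookup, assuming a plain map keyed by tool name with fallback-filling of omitted fields (the helper names here are assumptions, not the repo's actual config code):

// Sketch only: assumed shape of per-tool config resolution.
interface ToolConfig {
  model?: string;
  temperature?: number;
  maxTokens?: number;
}

const toolConfigs: Record<string, ToolConfig> = {
  finalizer: { model: 'gemini-2.5-flash-preview-05-20' },
  reducer: { maxTokens: 16000 },
  fallback: { maxTokens: 8000, model: 'gemini-2.0-flash-lite' },
};

function resolveToolConfig(tool: string): ToolConfig {
  // any field a tool omits falls back to the shared "fallback" entry
  return { ...toolConfigs.fallback, ...toolConfigs[tool] };
}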

View File

@@ -59,7 +59,8 @@
"maxTokens": 8000,
"model": "gemini-2.0-flash-lite"
},
"mdFixer": {}
"finalizer": {},
"reducer": {"maxTokens": 16000}
}
},
"openai": {
@@ -87,7 +88,8 @@
"fallback": {
"temperature": 0
},
"mdFixer": {}
"finalizer": {},
"reducer": {"maxTokens": 16000}
}
}
}

View File

@@ -12,6 +12,7 @@
"dev": "npx ts-node src/agent.ts",
"search": "npx ts-node src/test-duck.ts",
"rewrite": "npx ts-node src/tools/query-rewriter.ts",
"ngram": "npx ts-node src/cli/ngram.ts",
"lint": "eslint . --ext .ts",
"lint:fix": "eslint . --ext .ts --fix",
"serve": "ts-node src/server.ts",
@@ -65,4 +66,4 @@
"optionalDependencies": {
"@ai-sdk/google-vertex": "^2.1.12"
}
}
}
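With the new script entry, the n-gram analyzer can be invoked through npm, forwarding the file path plus the optional n, minFreq, and minPMI arguments (the file name below is illustrative):

npm run ngram -- ./answer.md 3 2 1.0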

View File

@@ -41,10 +41,11 @@ import {
} from "./utils/text-tools";
import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas";
import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools";
import { reviseAnswer } from "./tools/md-fixer";
import { finalizeAnswer } from "./tools/finalizer";
import { buildImageReferences, buildReferences } from "./tools/build-ref";
import { logInfo, logError, logDebug, logWarning } from './logging';
import { researchPlan } from './tools/research-planner';
import { reduceAnswers } from './tools/reducer';
async function wait(seconds: number) {
logDebug(`Waiting ${seconds}s...`);
@@ -813,8 +814,13 @@ But then you realized you have asked them before. You decided to think out of
isAggregated: true
} as AnswerAction;
// aggregate urls
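// titles are unknown at this point, so each URL from the subproblem runs is wrapped as an empty-title snippet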
visitedURLs.push(...subproblemResponses.map(r => r.readURLs).flat());
weightedURLs = subproblemResponses.map(r => r.allURLs.map(url => ({ url, title: '' } as BoostedSearchSnippet))).flat();
// break the loop, move to final boxing
// TODO aggregate images @shazhou2015
// break the loop, jump directly to final boxing
break;
}
@@ -1037,7 +1043,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
fixBadURLMdLinks(
fixCodeBlockIndentation(
repairMarkdownFootnotesOuter(
await reviseAnswer(
await finalizeAnswer(
answerStep.answer,
allKnowledge,
context,
@@ -1072,6 +1078,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
imageReferences = [];
}
}
} else if (answerStep.isAggregated) {
answerStep.answer = await reduceAnswers(answerStep.answer, context, SchemaGen);
answerStep.mdAnswer = repairMarkdownFootnotesOuter(buildMdFromAnswer(answerStep));
}
// max return 300 urls
@@ -1079,7 +1088,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
return {
result: thisStep,
context,
visitedURLs: returnedURLs,
visitedURLs: returnedURLs, // deprecated
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
allURLs: weightedURLs.map(r => r.url),
allImages: withImages ? imageObjects.map(i => i.url) : undefined,

src/cli/ngram.ts Normal file (36 lines)
View File

@@ -0,0 +1,36 @@
import fs from 'fs';
import { extractNgrams } from '../utils/text-tools';
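// Usage: npx ts-node src/cli/ngram.ts <file> [n] [minFreq] [minPMI]
// (also exposed as `npm run ngram` via package.json)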
async function main() {
const args = process.argv.slice(2);
if (args.length === 0) {
console.error('Please provide a file path');
process.exit(1);
}
const filePath = args[0];
const n = parseInt(args[1], 10) || 3; // Default to 3-grams
const minFreq = parseInt(args[2], 10) || 2; // Default minimum frequency of 2
const minPMI = parseFloat(args[3]) || 1.0; // Default minimum PMI of 1.0
try {
const text = await fs.promises.readFile(filePath, 'utf-8');
const results = extractNgrams(text, n, minFreq, minPMI);
console.log('\nN-gram Analysis Results:');
console.log('------------------------');
results.forEach(({ ngram, freq, pmi }) => {
if (pmi !== undefined) {
console.log(`${ngram}: ${freq} (PMI: ${pmi.toFixed(2)})`);
} else {
console.log(`${ngram}: ${freq}`);
}
});
} catch (err) {
const error = err as Error;
console.error('Error:', error.message);
process.exit(1);
}
}
main();

View File

@@ -61,9 +61,9 @@ IMPORTANT: Do not begin your response with phrases like "Sure", "Here is", "Belo
}
}
const TOOL_NAME = 'mdFixer';
const TOOL_NAME = 'finalizer';
export async function reviseAnswer(
export async function finalizeAnswer(
mdContent: string,
knowledgeItems: KnowledgeItem[],
trackers: TrackerContext,
@@ -71,7 +71,7 @@ export async function reviseAnswer(
): Promise<string> {
try {
const prompt = getPrompt(mdContent, knowledgeItems, schema);
trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
trackers?.actionTracker.trackThink('finalize_answer', schema.languageCode)
const result = await generateText({
model: getModel(TOOL_NAME),
@@ -83,10 +83,10 @@ export async function reviseAnswer(
logInfo(TOOL_NAME, { text: result.text });
logDebug(`repaired before/after: ${mdContent.length} -> ${result.text.length}`);
logDebug(`finalized answer before/after: ${mdContent.length} -> ${result.text.length}`);
if (result.text.length < mdContent.length * 0.85) {
logWarning(`repaired content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
logWarning(`finalized answer length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
originalContent: mdContent,
repairedContent: result.text
});
@@ -96,7 +96,7 @@ export async function reviseAnswer(
return result.text;
} catch (error) {
logError(`Error in ${TOOL_NAME}`, { error });
logError(TOOL_NAME, { error });
return mdContent;
}
}

src/tools/reducer.ts Normal file (93 lines)
View File

@@ -0,0 +1,93 @@
import { PromptPair, TrackerContext } from '../types';
import { getModel } from "../config";
import { generateText } from "ai";
import { Schemas } from "../utils/schemas";
import { logInfo, logError, logDebug } from '../logging';
function getPrompt(mdContent: string): PromptPair {
return {
system: `
You are an article aggregator that creates a coherent, high-quality article by smartly merging multiple source articles. Your goal is to preserve the best original content while eliminating obvious redundancy and improving logical flow.
<core-instructions>
1. Content Preservation
ALWAYS preserve original sentences verbatim - do not paraphrase or rewrite
Select the highest quality version when multiple articles cover the same point
Maintain the original author's voice and technical accuracy
Keep direct quotes, statistics, and factual claims exactly as written
2. Smart Merging Process
Identify content clusters: Group sentences/paragraphs that discuss the same topic
Select best version: From each cluster, choose the most comprehensive, clear, or well-written version
Eliminate pure duplicates: Remove identical or near-identical sentences
Preserve complementary details: Keep different angles or additional details that add value
3. Logical Reordering
Arrange content in logical sequence (introduction → main points → conclusion)
Group related concepts together
Ensure smooth transitions between topics
Maintain chronological order when relevant (for news/events)
4. Quality Criteria for Selection
When choosing between similar content, prioritize:
Clarity: More understandable explanations
Completeness: More comprehensive coverage
Accuracy: Better sourced or more precise information
Relevance: More directly related to the main topic
</core-instructions>
<output-format>
Structure the final article with:
Clear section headings (when appropriate)
Logical paragraph breaks
Smooth flow between topics
No attribution to individual sources (present as a unified piece)
</output-format>
Do not add your own commentary or analysis
Do not change technical terms, names, or specific details
Your final output should read as a cohesive, high-quality article that appears to be written by a single author, while actually being a careful curation of the best sentences from all input sources.
`,
user: mdContent
}
}
const TOOL_NAME = 'reducer';
export async function reduceAnswers(
mdContent: string,
trackers: TrackerContext,
schema: Schemas
): Promise<string> {
try {
const prompt = getPrompt(mdContent);
trackers?.actionTracker.trackThink('reduce_answer', schema.languageCode)
const result = await generateText({
model: getModel(TOOL_NAME),
system: prompt.system,
prompt: prompt.user,
});
trackers.tokenTracker.trackUsage(TOOL_NAME, result.usage)
logInfo(TOOL_NAME, { text: result.text });
logDebug(`reduce before/after: ${mdContent.length} -> ${result.text.length}`);
// if (result.text.length < mdContent.length * 0.85) {
// logWarning(`reduce content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
// originalContent: mdContent,
// repairedContent: result.text
// });
// return mdContent;
// }
return result.text;
} catch (error) {
logError(TOOL_NAME, { error });
return mdContent;
}
}
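For orientation, a minimal usage sketch (assuming the caller has already concatenated the per-subproblem answers into one markdown string, as agent.ts does for aggregated answers; the variable names are illustrative):

const combined = subAnswers.join('\n\n');
const article = await reduceAnswers(combined, trackerContext, schemaGen);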

View File

@@ -5,7 +5,8 @@
"read_for": "Let me read ${urls} to gather more information.",
"read_for_verify": "Let me fetch the source content to verify the answer.",
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
"final_answer": "Let me finalize the answer.",
"finalize_answer": "Let me finalize the answer.",
"reduce_answer": "Let me aggregate all research results.",
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
"hostnames_no_results": "Can't find any results from ${hostnames}.",
"cross_reference": "Let me cross-reference the information from the web to verify the answer."
@@ -16,7 +17,8 @@
"read_for": "让我读取网页 ${urls} 来获取更多信息。",
"read_for_verify": "让我读取源网页内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。",
"finalize_answer": "我来整理一下答案。",
"reduce_answer": "让我综合整理所有的调研结果。",
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
@@ -27,7 +29,8 @@
"read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
"read_for_verify": "讓我獲取源內容來驗證答案。",
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
"final_answer": "我來整理一下答案。",
"finalize_answer": "我來整理一下答案。",
"reduce_answer": "讓我整合所有調研結果。",
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!",
"hostnames_no_results": "咦... ${hostnames} 找不到什么结果。",
"cross_reference": "讓我交叉驗證一下網頁上的信息來驗證答案。"
@@ -38,7 +41,8 @@
"read_for": "${urls} を読んで、情報を集めます。",
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
"final_answer": "答えをまとめます。",
"finalize_answer": "答えをまとめます。",
"reduce_answer": "答えをまとめます。",
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
"hostnames_no_results": "${hostnames} から結果が見つかりません。",
"cross_reference": "ウェブ上の情報をクロスリファレンスして、答えを確認します。"
@@ -49,7 +53,8 @@
"read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
"final_answer": "답변을 마무리하겠습니다.",
"finalize_answer": "답변을 마무리하겠습니다.",
"reduce_answer": "답변을 마무리하겠습니다.",
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다.",
"cross_reference": "웹에서 정보를 교차 검증하여 답변을 확인하겠습니다."
@@ -60,7 +65,8 @@
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
"final_answer": "Je vais finaliser la réponse.",
"finalize_answer": "Je vais finaliser la réponse.",
"reduce_answer": "Je vais finaliser la réponse.",
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}.",
"cross_reference": "Je vais croiser les informations sur le web pour vérifier la réponse."
@@ -71,7 +77,8 @@
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
"final_answer": "Ich werde die Antwort abschließen.",
"finalize_answer": "Ich werde die Antwort abschließen.",
"reduce_answer": "Ich werde die Antwort abschließen.",
"blocked_content": "Mist! ${url} lässt mich nicht rein.",
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden.",
"cross_reference": "Ich werde die Informationen im Web abgleichen, um die Antwort zu überprüfen."
@@ -82,7 +89,8 @@
"read_for": "Voy a leer ${urls} para recopilar más información.",
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
"final_answer": "Voy a finalizar la respuesta.",
"finalize_answer": "Voy a finalizar la respuesta.",
"reduce_answer": "Voy a finalizar la respuesta.",
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
},
@@ -92,7 +100,8 @@
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
"final_answer": "Finalizzerò la risposta.",
"finalize_answer": "Finalizzerò la risposta.",
"reduce_answer": "Finalizzerò la risposta.",
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}.",
"cross_reference": "Incrocerò le informazioni sul web per verificare la risposta."
@@ -103,7 +112,8 @@
"read_for": "Vou ler ${urls} para reunir mais informações.",
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
"final_answer": "Vou finalizar a resposta.",
"finalize_answer": "Vou finalizar a resposta.",
"reduce_answer": "Vou finalizar a resposta.",
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}.",
"cross_reference": "Vou cruzar as informações da web para verificar a resposta."
@@ -114,7 +124,8 @@
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
"final_answer": "Дайте мне завершить ответ.",
"finalize_answer": "Дайте мне завершить ответ.",
"reduce_answer": "Дайте мне завершить ответ.",
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
"hostnames_no_results": "Ничего не найдено на ${hostnames}.",
"cross_reference": "Дайте мне сопоставить информацию из сети, чтобы проверить ответ."
@@ -125,6 +136,8 @@
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
"finalize_answer": "دعني أنهي الإجابة.",
"reduce_answer": "دعني أنهي الإجابة.",
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}.",
"cross_reference": "دعني أقوم بمقارنة المعلومات من الويب للتحقق من الإجابة."
@@ -135,7 +148,8 @@
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
"final_answer": "Ik zal het antwoord afronden.",
"finalize_answer": "Ik zal het antwoord afronden.",
"reduce_answer": "Ik zal het antwoord afronden.",
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}.",
"cross_reference": "Ik zal de informatie op het web kruisverwijzen om het antwoord te verifiëren."
@@ -146,7 +160,8 @@
"read_for": "让我阅读 ${urls} 来获取更多信息。",
"read_for_verify": "让我获取源内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。",
"finalize_answer": "我来整理一下答案。",
"reduce_answer": "让我整合所有调研结果。",
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"

View File

@@ -824,3 +824,111 @@ export async function detectBrokenUnicodeViaFileIO(str: string) {
// Now check for the visible replacement character
return { broken: readStr.includes('<27>'), readStr };
}
interface NgramResult {
ngram: string;
freq: number;
pmi?: number; // Added PMI score
}
function calculatePMI(
ngram: string,
ngramFreq: number,
wordFreqs: Map<string, number>,
totalNgrams: number
): number {
const words = ngram.split(' ');
if (words.length < 2) return 0;
// Calculate joint probability
const jointProb = ngramFreq / totalNgrams;
// Calculate individual probabilities
const wordProbs = words.map(word => (wordFreqs.get(word) || 0) / totalNgrams);
// Calculate PMI
const pmi = Math.log2(jointProb / wordProbs.reduce((a, b) => a * b, 1));
return pmi;
}
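// In formula form: PMI(w1..wk) = log2( P(w1..wk) / (P(w1) * ... * P(wk)) ),
// with every probability estimated as count / totalNgrams. Hypothetical numbers:
// if "machine learning" occurs 4 times out of 1000 ngrams and each word occurs
// 10 times, PMI = log2(0.004 / (0.01 * 0.01)) = log2(40) ≈ 5.32.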
function isCJK(char: string): boolean {
const code = char.charCodeAt(0);
return (
(code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
(code >= 0x3040 && code <= 0x309F) || // Hiragana
(code >= 0x30A0 && code <= 0x30FF) || // Katakana
(code >= 0xAC00 && code <= 0xD7AF) // Hangul
);
}
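// Note: charCodeAt reads a single UTF-16 code unit, so ideographs outside the
// Basic Multilingual Plane (e.g. CJK Extension B) are not matched here.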
function isCJKText(text: string): boolean {
return Array.from(text).some(char => isCJK(char));
}
export function extractNgrams(
text: string,
n: number,
minFreq: number = 2,
minPMI: number = 1.0 // Added minimum PMI threshold
): NgramResult[] {
// Split text into chunks by newlines
const chunks = text.split('\n').filter(chunk => chunk.trim().length > 0);
// Maps to store frequencies
const ngramFreq: Map<string, number> = new Map();
const wordFreq: Map<string, number> = new Map();
let totalNgrams = 0;
// First pass: collect frequencies
for (const chunk of chunks) {
if (isCJKText(chunk)) {
// For CJK text, use character-level ngrams
for (let len = 2; len <= n; len++) {
for (let i = 0; i <= chunk.length - len; i++) {
const ngram = chunk.slice(i, i + len);
ngramFreq.set(ngram, (ngramFreq.get(ngram) || 0) + 1);
totalNgrams++;
}
}
} else {
// For non-CJK text, use word-level ngrams
const words = chunk.split(/\s+/).filter(word => word.length > 0);
// Count individual word frequencies
words.forEach(word => {
wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
});
// Count ngram frequencies
for (let len = 2; len <= n; len++) {
for (let i = 0; i <= words.length - len; i++) {
const ngram = words.slice(i, i + len).join(' ');
ngramFreq.set(ngram, (ngramFreq.get(ngram) || 0) + 1);
totalNgrams++;
}
}
}
}
// Second pass: calculate PMI and filter
const results: NgramResult[] = Array.from(ngramFreq.entries())
.filter(([ngram, freq]) => freq >= minFreq)
.map(([ngram, freq]) => {
const pmi = isCJKText(ngram) ? undefined : calculatePMI(ngram, freq, wordFreq, totalNgrams);
return { ngram, freq, pmi };
})
.filter(result => result.pmi === undefined || result.pmi >= minPMI)
.sort((a, b) => {
// If both have PMI scores, sort by PMI
if (a.pmi !== undefined && b.pmi !== undefined) {
return b.pmi - a.pmi;
}
// If only one has PMI, prioritize the one with PMI
if (a.pmi !== undefined) return -1;
if (b.pmi !== undefined) return 1;
// If neither has PMI (CJK text), sort by frequency
return b.freq - a.freq;
});
return results;
}
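A quick sanity check of the API (illustrative input; with these counts the PMI works out to exactly 1):

const hits = extractNgrams('deep research agent\ndeep research pipeline', 2, 2, 0);
// "deep research" appears twice across the two chunks, so it passes minFreq = 2;
// its PMI is log2((2/4) / ((2/4) * (2/4))) = 1, giving
// [{ ngram: 'deep research', freq: 2, pmi: 1 }]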