Mirror of https://github.com/jina-ai/node-DeepResearch.git (synced 2026-03-22 15:39:06 +08:00)

Commit: refactor: replace mdFixer with finalizer and reducer, add ngram script
config.json (10 changed lines)

@@ -51,9 +51,12 @@
   "agentBeastMode": {
     "temperature": 0.7
   },
-  "mdFixer": {
+  "finalizer": {
     "model": "gemini-2.5-flash-preview-05-20"
   },
+  "reducer": {
+    "maxTokens": 16000
+  },
   "fallback": {
     "maxTokens": 8000,
     "model": "gemini-2.0-flash-lite"
@@ -85,7 +88,10 @@
   "fallback": {
     "temperature": 0
   },
-  "mdFixer": {}
+  "finalizer": {},
+  "reducer": {
+    "maxTokens": 16000
+  }
   }
  }
 }
@@ -59,7 +59,8 @@
     "maxTokens": 8000,
     "model": "gemini-2.0-flash-lite"
   },
-  "mdFixer": {}
+  "finalizer": {},
+  "reducer": {"maxTokens": 16000}
   }
  },
  "openai": {
@@ -87,7 +88,8 @@
   "fallback": {
     "temperature": 0
   },
-  "mdFixer": {}
+  "finalizer": {},
+  "reducer": {"maxTokens": 16000}
   }
  }
 }
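For context, here is a minimal sketch of how a per-tool entry such as "finalizer" or "reducer" might be resolved at runtime. The real getModel() lives in src/config.ts and is not shown in this diff, so the shape below (global defaults merged with per-tool overrides, falling back to the "fallback" entry) is an assumption, not the actual implementation.

// Hypothetical sketch of per-tool config resolution; names and defaults
// are illustrative, only the config.json keys above are from the diff.
interface ToolConfig {
  model?: string;
  temperature?: number;
  maxTokens?: number;
}

const defaults: ToolConfig = {
  model: 'gemini-2.0-flash',  // assumed default model
  temperature: 0.7,
  maxTokens: 8000,
};

function resolveToolConfig(
  tools: Record<string, ToolConfig | undefined>,
  toolName: string
): ToolConfig {
  // Per-tool keys override defaults; unknown tools use the "fallback" entry.
  return { ...defaults, ...(tools[toolName] ?? tools['fallback'] ?? {}) };
}

// e.g. resolveToolConfig(tools, 'reducer') would yield maxTokens: 16000
// under the config shown above.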
package.json

@@ -12,6 +12,7 @@
     "dev": "npx ts-node src/agent.ts",
     "search": "npx ts-node src/test-duck.ts",
     "rewrite": "npx ts-node src/tools/query-rewriter.ts",
+    "ngram": "npx ts-node src/cli/ngram.ts",
     "lint": "eslint . --ext .ts",
     "lint:fix": "eslint . --ext .ts --fix",
     "serve": "ts-node src/server.ts",
@@ -65,4 +66,4 @@
   "optionalDependencies": {
     "@ai-sdk/google-vertex": "^2.1.12"
   }
 }
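With this script entry in place, the CLI added below can be invoked as, for example, npm run ngram -- ./report.md 3 2 1.0. The positional arguments are file path, n, minFreq, and minPMI, matching the argument order in src/cli/ngram.ts; the file name here is just an example.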
src/agent.ts (17 changed lines)

@@ -41,10 +41,11 @@ import {
 } from "./utils/text-tools";
 import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas";
 import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools";
-import { reviseAnswer } from "./tools/md-fixer";
+import { finalizeAnswer } from "./tools/finalizer";
 import { buildImageReferences, buildReferences } from "./tools/build-ref";
 import { logInfo, logError, logDebug, logWarning } from './logging';
 import { researchPlan } from './tools/research-planner';
+import { reduceAnswers } from './tools/reducer';
 
 async function wait(seconds: number) {
   logDebug(`Waiting ${seconds}s...`);
@@ -813,8 +814,13 @@ But then you realized you have asked them before. You decided to to think out of
         isAggregated: true
       } as AnswerAction;
 
+      // aggregate urls
+      visitedURLs.push(...subproblemResponses.map(r => r.readURLs).flat());
+      weightedURLs = subproblemResponses.map(r => r.allURLs.map(url => ({ url, title: '' } as BoostedSearchSnippet))).flat();
+
-      // break the loop, move to final boxing
+      // TODO aggregate images @shazhou2015
+
+      // break the loop, jump directly final boxing
       break;
     }
 
@@ -1037,7 +1043,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
       fixBadURLMdLinks(
         fixCodeBlockIndentation(
           repairMarkdownFootnotesOuter(
-            await reviseAnswer(
+            await finalizeAnswer(
               answerStep.answer,
               allKnowledge,
               context,
@@ -1072,6 +1078,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
         imageReferences = [];
       }
     }
+  } else if (answerStep.isAggregated) {
+    answerStep.answer = await reduceAnswers(answerStep.answer, context, SchemaGen);
+    answerStep.mdAnswer = repairMarkdownFootnotesOuter(buildMdFromAnswer(answerStep));
   }
 
   // max return 300 urls
@@ -1079,7 +1088,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
   return {
     result: thisStep,
     context,
-    visitedURLs: returnedURLs,
+    visitedURLs: returnedURLs, // deprecated
     readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
     allURLs: weightedURLs.map(r => r.url),
     allImages: withImages ? imageObjects.map(i => i.url) : undefined,
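As a minimal sketch of the new aggregation step above: the field names (readURLs, allURLs) come from the diff, but the types are simplified assumptions, not the real definitions in src/types.ts.

// Simplified sketch of how the aggregated branch merges subproblem results.
interface SubproblemResponse {
  readURLs: string[];  // URLs the subproblem actually read
  allURLs: string[];   // every URL the subproblem discovered
}

interface BoostedSearchSnippetLike {
  url: string;
  title: string;
}

function aggregateUrls(responses: SubproblemResponse[]) {
  // URLs read across all subproblems, concatenated in order
  const visitedURLs = responses.flatMap(r => r.readURLs);
  // Discovered URLs wrapped in the snippet shape the agent expects
  const weightedURLs: BoostedSearchSnippetLike[] =
    responses.flatMap(r => r.allURLs.map(url => ({ url, title: '' })));
  return { visitedURLs, weightedURLs };
}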
src/cli/ngram.ts (new file, 36 lines)

@@ -0,0 +1,36 @@
+import fs from 'fs';
+import { extractNgrams } from '../utils/text-tools';
+
+async function main() {
+  const args = process.argv.slice(2);
+  if (args.length === 0) {
+    console.error('Please provide a file path');
+    process.exit(1);
+  }
+
+  const filePath = args[0];
+  const n = parseInt(args[1]) || 3; // Default to 3-grams
+  const minFreq = parseInt(args[2]) || 2; // Default minimum frequency of 2
+  const minPMI = parseFloat(args[3]) || 1.0; // Default minimum PMI of 1.0
+
+  try {
+    const text = await fs.promises.readFile(filePath, 'utf-8');
+    const results = extractNgrams(text, n, minFreq, minPMI);
+
+    console.log('\nN-gram Analysis Results:');
+    console.log('------------------------');
+    results.forEach(({ ngram, freq, pmi }) => {
+      if (pmi !== undefined) {
+        console.log(`${ngram}: ${freq} (PMI: ${pmi.toFixed(2)})`);
+      } else {
+        console.log(`${ngram}: ${freq}`);
+      }
+    });
+  } catch (err) {
+    const error = err as Error;
+    console.error('Error:', error.message);
+    process.exit(1);
+  }
+}
+
+main();
src/tools/finalizer.ts (renamed from src/tools/md-fixer.ts)

@@ -61,9 +61,9 @@ IMPORTANT: Do not begin your response with phrases like "Sure", "Here is", "Belo
   }
 }
 
-const TOOL_NAME = 'mdFixer';
+const TOOL_NAME = 'finalizer';
 
-export async function reviseAnswer(
+export async function finalizeAnswer(
   mdContent: string,
   knowledgeItems: KnowledgeItem[],
   trackers: TrackerContext,
@@ -71,7 +71,7 @@ export async function reviseAnswer(
 ): Promise<string> {
   try {
     const prompt = getPrompt(mdContent, knowledgeItems, schema);
-    trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
+    trackers?.actionTracker.trackThink('finalize_answer', schema.languageCode)
 
     const result = await generateText({
       model: getModel(TOOL_NAME),
@@ -83,10 +83,10 @@ export async function reviseAnswer(
 
 
     logInfo(TOOL_NAME, { text: result.text });
-    logDebug(`repaired before/after: ${mdContent.length} -> ${result.text.length}`);
+    logDebug(`finalized answer before/after: ${mdContent.length} -> ${result.text.length}`);
 
     if (result.text.length < mdContent.length * 0.85) {
-      logWarning(`repaired content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
+      logWarning(`finalized answer length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
         originalContent: mdContent,
         repairedContent: result.text
       });
@@ -96,7 +96,7 @@ export async function reviseAnswer(
     return result.text;
 
   } catch (error) {
-    logError(`Error in ${TOOL_NAME}`, { error });
+    logError(TOOL_NAME, { error });
     return mdContent;
   }
 }
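The 0.85 check above guards against the model silently truncating content while finalizing. As a standalone sketch (the threshold comes from the diff; the helper itself is illustrative, not part of the codebase):

// Illustrative helper mirroring the guard in finalizeAnswer: if the model's
// rewrite drops below 85% of the original length, assume content was lost
// and keep the original markdown instead.
function keepLongerOf(original: string, revised: string, ratio = 0.85): string {
  return revised.length < original.length * ratio ? original : revised;
}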
src/tools/reducer.ts (new file, 93 lines)

@@ -0,0 +1,93 @@
+import { PromptPair, TrackerContext } from '../types';
+import { getModel } from "../config";
+import { generateText } from "ai";
+import { Schemas } from "../utils/schemas";
+import { logInfo, logError, logDebug } from '../logging';
+
+
+function getPrompt(mdContent: string): PromptPair {
+  return {
+    system: `
+You are an article aggregator that creates a coherent, high-quality article by smartly merging multiple source articles. Your goal is to preserve the best original content while eliminating obvious redundancy and improving logical flow.
+
+<core-instructions>
+1. Content Preservation
+ALWAYS preserve original sentences verbatim - do not paraphrase or rewrite
+Select the highest quality version when multiple articles cover the same point
+Maintain the original author's voice and technical accuracy
+Keep direct quotes, statistics, and factual claims exactly as written
+2. Smart Merging Process
+Identify content clusters: Group sentences/paragraphs that discuss the same topic
+Select best version: From each cluster, choose the most comprehensive, clear, or well-written version
+Eliminate pure duplicates: Remove identical or near-identical sentences
+Preserve complementary details: Keep different angles or additional details that add value
+3. Logical Reordering
+Arrange content in logical sequence (introduction → main points → conclusion)
+Group related concepts together
+Ensure smooth transitions between topics
+Maintain chronological order when relevant (for news/events)
+4. Quality Criteria for Selection
+When choosing between similar content, prioritize:
+Clarity: More understandable explanations
+Completeness: More comprehensive coverage
+Accuracy: Better sourced or more precise information
+Relevance: More directly related to the main topic
+</core-instructions>
+
+<output-format>
+Structure the final article with:
+Clear section headings (when appropriate)
+Logical paragraph breaks
+Smooth flow between topics
+No attribution to individual sources (present as unified piece)
+</output-format>
+
+Do not add your own commentary or analysis
+Do not change technical terms, names, or specific details
+
+Your final output should read as a cohesive, high-quality article that appears to be written by a single author, while actually being a careful curation of the best sentences from all input sources.
+`,
+    user: mdContent
+  }
+}
+
+const TOOL_NAME = 'reducer';
+
+export async function reduceAnswers(
+  mdContent: string,
+  trackers: TrackerContext,
+  schema: Schemas
+): Promise<string> {
+  try {
+    const prompt = getPrompt(mdContent);
+    trackers?.actionTracker.trackThink('reduce_answer', schema.languageCode)
+
+    const result = await generateText({
+      model: getModel(TOOL_NAME),
+      system: prompt.system,
+      prompt: prompt.user,
+    });
+
+    trackers.tokenTracker.trackUsage(TOOL_NAME, result.usage)
+
+    logInfo(TOOL_NAME, { text: result.text });
+    logDebug(`reduce before/after: ${mdContent.length} -> ${result.text.length}`);
+
+    // if (result.text.length < mdContent.length * 0.85) {
+    //   logWarning(`reduce content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
+    //     originalContent: mdContent,
+    //     repairedContent: result.text
+    //   });
+    //   return mdContent;
+    // }
+
+    return result.text;
+
+  } catch (error) {
+    logError(TOOL_NAME, { error });
+    return mdContent;
+  }
+}
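A hedged usage sketch: in src/agent.ts, the aggregated branch passes answerStep.answer (the combined subproblem answers) straight into reduceAnswers. How the sub-answers are concatenated upstream is not shown in this diff, so the join below is an assumption.

import { TrackerContext } from '../types';
import { Schemas } from '../utils/schemas';
import { reduceAnswers } from './reducer';

// Hypothetical call site: merge per-subproblem answers into one markdown
// string, then let the reducer curate it into a single coherent article.
async function mergeSubAnswers(
  subAnswers: string[],
  context: TrackerContext,
  schema: Schemas
): Promise<string> {
  const combined = subAnswers.join('\n\n'); // assumed separator
  return reduceAnswers(combined, context, schema);
}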
@@ -5,7 +5,8 @@
     "read_for": "Let me read ${urls} to gather more information.",
     "read_for_verify": "Let me fetch the source content to verify the answer.",
     "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
-    "final_answer": "Let me finalize the answer.",
+    "finalize_answer": "Let me finalize the answer.",
+    "reduce_answer": "Let me aggregate all research results.",
     "blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
     "hostnames_no_results": "Can't find any results from ${hostnames}.",
     "cross_reference": "Let me cross-reference the information from the web to verify the answer."
@@ -16,7 +17,8 @@
     "read_for": "让我读取网页 ${urls} 来获取更多信息。",
     "read_for_verify": "让我读取源网页内容来验证答案。",
     "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
-    "final_answer": "我来整理一下答案。",
+    "finalize_answer": "我来整理一下答案。",
+    "reduce_answer": "让我综合整理所有的调研结果。",
     "blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
     "hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
     "cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
@@ -27,7 +29,8 @@
     "read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
     "read_for_verify": "讓我獲取源內容來驗證答案。",
     "late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
-    "final_answer": "我來整理一下答案。",
+    "finalize_answer": "我來整理一下答案。",
+    "reduce_answer": "讓我整合所有調研結果。",
     "blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!",
     "hostnames_no_results": "咦... ${hostnames} 找不到什么结果。",
     "cross_reference": "讓我交叉驗證一下網頁上的信息來驗證答案。"
@@ -38,7 +41,8 @@
     "read_for": "${urls} を読んで、情報を集めます。",
     "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
     "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
-    "final_answer": "答えをまとめます。",
+    "finalize_answer": "答えをまとめます。",
+    "reduce_answer": "答えをまとめます。",
     "blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
     "hostnames_no_results": "${hostnames} から結果が見つかりません。",
     "cross_reference": "ウェブ上の情報をクロスリファレンスして、答えを確認します。"
@@ -49,7 +53,8 @@
     "read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
     "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
     "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
-    "final_answer": "답변을 마무리하겠습니다.",
+    "finalize_answer": "답변을 마무리하겠습니다.",
+    "reduce_answer": "답변을 마무리하겠습니다.",
     "blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
     "hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다.",
     "cross_reference": "웹에서 정보를 교차 검증하여 답변을 확인하겠습니다."
@@ -60,7 +65,8 @@
     "read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
     "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
     "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
-    "final_answer": "Je vais finaliser la réponse.",
+    "finalize_answer": "Je vais finaliser la réponse.",
+    "reduce_answer": "Je vais finaliser la réponse.",
     "blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
     "hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}.",
     "cross_reference": "Je vais croiser les informations sur le web pour vérifier la réponse."
@@ -71,7 +77,8 @@
     "read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
     "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
     "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
-    "final_answer": "Ich werde die Antwort abschließen.",
+    "finalize_answer": "Ich werde die Antwort abschließen.",
+    "reduce_answer": "Ich werde die Antwort abschließen.",
     "blocked_content": "Mist! ${url} lässt mich nicht rein.",
     "hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden.",
     "cross_reference": "Ich werde die Informationen im Web abgleichen, um die Antwort zu überprüfen."
@@ -82,7 +89,8 @@
     "read_for": "Voy a leer ${urls} para recopilar más información.",
     "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
     "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
-    "final_answer": "Voy a finalizar la respuesta.",
+    "finalize_answer": "Voy a finalizar la respuesta.",
+    "reduce_answer": "Voy a finalizar la respuesta.",
     "blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
     "hostnames_no_results": "No se encontraron resultados de ${hostnames}."
   },
@@ -92,7 +100,8 @@
     "read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
     "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
     "late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
-    "final_answer": "Finalizzerò la risposta.",
+    "finalize_answer": "Finalizzerò la risposta.",
+    "reduce_answer": "Finalizzerò la risposta.",
     "blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
     "hostnames_no_results": "Nessun risultato trovato da ${hostnames}.",
     "cross_reference": "Incrocerò le informazioni sul web per verificare la risposta."
@@ -103,7 +112,8 @@
     "read_for": "Vou ler ${urls} para reunir mais informações.",
     "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
     "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
-    "final_answer": "Vou finalizar a resposta.",
+    "finalize_answer": "Vou finalizar a resposta.",
+    "reduce_answer": "Vou finalizar a resposta.",
     "blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
     "hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}.",
     "cross_reference": "Vou cruzar as informações da web para verificar a resposta."
@@ -114,7 +124,8 @@
     "read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
     "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
     "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
-    "final_answer": "Дайте мне завершить ответ.",
+    "finalize_answer": "Дайте мне завершить ответ.",
+    "reduce_answer": "Дайте мне завершить ответ.",
     "blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
     "hostnames_no_results": "Ничего не найдено на ${hostnames}.",
     "cross_reference": "Дайте мне сопоставить информацию из сети, чтобы проверить ответ."
@@ -125,6 +136,8 @@
     "read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
     "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
     "late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
+    "finalize_answer": "دعني أنهي الإجابة.",
+    "reduce_answer": "دعني أنهي الإجابة.",
     "blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
     "hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}.",
     "cross_reference": "دعني أقوم بمقارنة المعلومات من الويب للتحقق من الإجابة."
@@ -135,7 +148,8 @@
     "read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
     "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
     "late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
-    "final_answer": "Ik zal het antwoord afronden.",
+    "finalize_answer": "Ik zal het antwoord afronden.",
+    "reduce_answer": "Ik zal het antwoord afronden.",
     "blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
     "hostnames_no_results": "Geen resultaten gevonden van ${hostnames}.",
     "cross_reference": "Ik zal de informatie op het web kruisverwijzen om het antwoord te verifiëren."
@@ -146,7 +160,8 @@
     "read_for": "让我阅读 ${urls} 来获取更多信息。",
     "read_for_verify": "让我获取源内容来验证答案。",
     "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
-    "final_answer": "我来整理一下答案。",
+    "finalize_answer": "我来整理一下答案。",
+    "reduce_answer": "让我整合所有调研结果。",
     "blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
     "hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
     "cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
src/utils/text-tools.ts

@@ -824,3 +824,111 @@ export async function detectBrokenUnicodeViaFileIO(str: string) {
   // Now check for the visible replacement character
   return { broken: readStr.includes('<27>'), readStr };
 }
+
+interface NgramResult {
+  ngram: string;
+  freq: number;
+  pmi?: number; // Added PMI score
+}
+
+function calculatePMI(
+  ngram: string,
+  ngramFreq: number,
+  wordFreqs: Map<string, number>,
+  totalNgrams: number
+): number {
+  const words = ngram.split(' ');
+  if (words.length < 2) return 0;
+
+  // Calculate joint probability
+  const jointProb = ngramFreq / totalNgrams;
+
+  // Calculate individual probabilities
+  const wordProbs = words.map(word => (wordFreqs.get(word) || 0) / totalNgrams);
+
+  // Calculate PMI
+  const pmi = Math.log2(jointProb / wordProbs.reduce((a, b) => a * b, 1));
+  return pmi;
+}
+
+function isCJK(char: string): boolean {
+  const code = char.charCodeAt(0);
+  return (
+    (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified Ideographs
+    (code >= 0x3040 && code <= 0x309F) || // Hiragana
+    (code >= 0x30A0 && code <= 0x30FF) || // Katakana
+    (code >= 0xAC00 && code <= 0xD7AF) // Hangul
+  );
+}
+
+function isCJKText(text: string): boolean {
+  return Array.from(text).some(char => isCJK(char));
+}
+
+export function extractNgrams(
+  text: string,
+  n: number,
+  minFreq: number = 2,
+  minPMI: number = 1.0 // Added minimum PMI threshold
+): NgramResult[] {
+  // Split text into chunks by newlines
+  const chunks = text.split('\n').filter(chunk => chunk.trim().length > 0);
+
+  // Maps to store frequencies
+  const ngramFreq: Map<string, number> = new Map();
+  const wordFreq: Map<string, number> = new Map();
+  let totalNgrams = 0;
+
+  // First pass: collect frequencies
+  for (const chunk of chunks) {
+    if (isCJKText(chunk)) {
+      // For CJK text, use character-level ngrams
+      for (let len = 2; len <= n; len++) {
+        for (let i = 0; i <= chunk.length - len; i++) {
+          const ngram = chunk.slice(i, i + len);
+          ngramFreq.set(ngram, (ngramFreq.get(ngram) || 0) + 1);
+          totalNgrams++;
+        }
+      }
+    } else {
+      // For non-CJK text, use word-level ngrams
+      const words = chunk.split(/\s+/).filter(word => word.length > 0);
+
+      // Count individual word frequencies
+      words.forEach(word => {
+        wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
+      });
+
+      // Count ngram frequencies
+      for (let len = 2; len <= n; len++) {
+        for (let i = 0; i <= words.length - len; i++) {
+          const ngram = words.slice(i, i + len).join(' ');
+          ngramFreq.set(ngram, (ngramFreq.get(ngram) || 0) + 1);
+          totalNgrams++;
+        }
+      }
+    }
+  }
+
+  // Second pass: calculate PMI and filter
+  const results: NgramResult[] = Array.from(ngramFreq.entries())
+    .filter(([ngram, freq]) => freq >= minFreq)
+    .map(([ngram, freq]) => {
+      const pmi = isCJKText(ngram) ? 0 : calculatePMI(ngram, freq, wordFreq, totalNgrams);
+      return { ngram, freq, pmi };
+    })
+    .filter(result => result.pmi === undefined || result.pmi >= minPMI)
+    .sort((a, b) => {
+      // If both have PMI scores, sort by PMI
+      if (a.pmi !== undefined && b.pmi !== undefined) {
+        return b.pmi - a.pmi;
+      }
+      // If only one has PMI, prioritize the one with PMI
+      if (a.pmi !== undefined) return -1;
+      if (b.pmi !== undefined) return 1;
+      // If neither has PMI (CJK text), sort by frequency
+      return b.freq - a.freq;
+    });
+
+  return results;
+}
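For reference, calculatePMI computes a generalized pointwise mutual information. Writing N for totalNgrams, freq(g) for the n-gram count, and freq(w_i) for the count of each of the k constituent words, the score is

\mathrm{PMI}(g) = \log_2 \frac{\mathrm{freq}(g)/N}{\prod_{i=1}^{k} \mathrm{freq}(w_i)/N}

Higher scores mean the words co-occur far more often than independent draws would predict, which is why the minPMI threshold (default 1.0 in src/cli/ngram.ts) filters out incidental word pairings. For k = 2 this is the textbook two-word PMI; for longer n-grams it extends the same ratio over all k word probabilities.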