mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
refactor: replace mdFixer with finalizer and reducer, add ngram script
This commit is contained in:
10
config.json
10
config.json
@@ -51,9 +51,12 @@
|
|||||||
"agentBeastMode": {
|
"agentBeastMode": {
|
||||||
"temperature": 0.7
|
"temperature": 0.7
|
||||||
},
|
},
|
||||||
"mdFixer": {
|
"finalizer": {
|
||||||
"model": "gemini-2.5-flash-preview-05-20"
|
"model": "gemini-2.5-flash-preview-05-20"
|
||||||
},
|
},
|
||||||
|
"reducer": {
|
||||||
|
"maxTokens": 16000
|
||||||
|
},
|
||||||
"fallback": {
|
"fallback": {
|
||||||
"maxTokens": 8000,
|
"maxTokens": 8000,
|
||||||
"model": "gemini-2.0-flash-lite"
|
"model": "gemini-2.0-flash-lite"
|
||||||
@@ -85,7 +88,10 @@
|
|||||||
"fallback": {
|
"fallback": {
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
},
|
},
|
||||||
"mdFixer": {}
|
"finalizer": {},
|
||||||
|
"reducer": {
|
||||||
|
"maxTokens": 16000
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,7 +59,8 @@
|
|||||||
"maxTokens": 8000,
|
"maxTokens": 8000,
|
||||||
"model": "gemini-2.0-flash-lite"
|
"model": "gemini-2.0-flash-lite"
|
||||||
},
|
},
|
||||||
"mdFixer": {}
|
"finalizer": {},
|
||||||
|
"reducer": {"maxTokens": 16000}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"openai": {
|
"openai": {
|
||||||
@@ -87,7 +88,8 @@
|
|||||||
"fallback": {
|
"fallback": {
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
},
|
},
|
||||||
"mdFixer": {}
|
"finalizer": {},
|
||||||
|
"reducer": {"maxTokens": 16000}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
"dev": "npx ts-node src/agent.ts",
|
"dev": "npx ts-node src/agent.ts",
|
||||||
"search": "npx ts-node src/test-duck.ts",
|
"search": "npx ts-node src/test-duck.ts",
|
||||||
"rewrite": "npx ts-node src/tools/query-rewriter.ts",
|
"rewrite": "npx ts-node src/tools/query-rewriter.ts",
|
||||||
|
"ngram": "npx ts-node src/cli/ngram.ts",
|
||||||
"lint": "eslint . --ext .ts",
|
"lint": "eslint . --ext .ts",
|
||||||
"lint:fix": "eslint . --ext .ts --fix",
|
"lint:fix": "eslint . --ext .ts --fix",
|
||||||
"serve": "ts-node src/server.ts",
|
"serve": "ts-node src/server.ts",
|
||||||
@@ -65,4 +66,4 @@
|
|||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@ai-sdk/google-vertex": "^2.1.12"
|
"@ai-sdk/google-vertex": "^2.1.12"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
17
src/agent.ts
17
src/agent.ts
@@ -41,10 +41,11 @@ import {
|
|||||||
} from "./utils/text-tools";
|
} from "./utils/text-tools";
|
||||||
import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas";
|
import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas";
|
||||||
import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools";
|
import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools";
|
||||||
import { reviseAnswer } from "./tools/md-fixer";
|
import { finalizeAnswer } from "./tools/finalizer";
|
||||||
import { buildImageReferences, buildReferences } from "./tools/build-ref";
|
import { buildImageReferences, buildReferences } from "./tools/build-ref";
|
||||||
import { logInfo, logError, logDebug, logWarning } from './logging';
|
import { logInfo, logError, logDebug, logWarning } from './logging';
|
||||||
import { researchPlan } from './tools/research-planner';
|
import { researchPlan } from './tools/research-planner';
|
||||||
|
import { reduceAnswers } from './tools/reducer';
|
||||||
|
|
||||||
async function wait(seconds: number) {
|
async function wait(seconds: number) {
|
||||||
logDebug(`Waiting ${seconds}s...`);
|
logDebug(`Waiting ${seconds}s...`);
|
||||||
@@ -813,8 +814,13 @@ But then you realized you have asked them before. You decided to to think out of
|
|||||||
isAggregated: true
|
isAggregated: true
|
||||||
} as AnswerAction;
|
} as AnswerAction;
|
||||||
|
|
||||||
|
// aggregate urls
|
||||||
|
visitedURLs.push(...subproblemResponses.map(r => r.readURLs).flat());
|
||||||
|
weightedURLs = subproblemResponses.map(r => r.allURLs.map(url => ({ url, title: '' } as BoostedSearchSnippet))).flat();
|
||||||
|
|
||||||
// break the loop, move to final boxing
|
// TODO aggregate images @shazhou2015
|
||||||
|
|
||||||
|
// break the loop, jump directly final boxing
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1037,7 +1043,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
fixBadURLMdLinks(
|
fixBadURLMdLinks(
|
||||||
fixCodeBlockIndentation(
|
fixCodeBlockIndentation(
|
||||||
repairMarkdownFootnotesOuter(
|
repairMarkdownFootnotesOuter(
|
||||||
await reviseAnswer(
|
await finalizeAnswer(
|
||||||
answerStep.answer,
|
answerStep.answer,
|
||||||
allKnowledge,
|
allKnowledge,
|
||||||
context,
|
context,
|
||||||
@@ -1072,6 +1078,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
imageReferences = [];
|
imageReferences = [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (answerStep.isAggregated) {
|
||||||
|
answerStep.answer = await reduceAnswers(answerStep.answer, context, SchemaGen);
|
||||||
|
answerStep.mdAnswer = repairMarkdownFootnotesOuter(buildMdFromAnswer(answerStep));
|
||||||
}
|
}
|
||||||
|
|
||||||
// max return 300 urls
|
// max return 300 urls
|
||||||
@@ -1079,7 +1088,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
|||||||
return {
|
return {
|
||||||
result: thisStep,
|
result: thisStep,
|
||||||
context,
|
context,
|
||||||
visitedURLs: returnedURLs,
|
visitedURLs: returnedURLs, // deprecated
|
||||||
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
|
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
|
||||||
allURLs: weightedURLs.map(r => r.url),
|
allURLs: weightedURLs.map(r => r.url),
|
||||||
allImages: withImages ? imageObjects.map(i => i.url) : undefined,
|
allImages: withImages ? imageObjects.map(i => i.url) : undefined,
|
||||||
|
|||||||
36
src/cli/ngram.ts
Normal file
36
src/cli/ngram.ts
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import fs from 'fs';
|
||||||
|
import { extractNgrams } from '../utils/text-tools';
|
||||||
|
|
||||||
|
/**
 * CLI entry point for n-gram analysis of a text file.
 *
 * Usage (script name from package.json): `npm run ngram -- <file> [n] [minFreq] [minPMI]`
 *
 * - `file`    path of the UTF-8 text file to analyse (required)
 * - `n`       passed to `extractNgrams` as its `n` argument; positive integer, default 3
 * - `minFreq` minimum occurrence count; positive integer, default 2
 * - `minPMI`  minimum PMI score; any finite number (including 0 and negatives), default 1.0
 *
 * Prints one line per n-gram and exits with status 1 when the file path is
 * missing or the file cannot be processed.
 */
async function main(): Promise<void> {
  const [filePath, nArg, minFreqArg, minPMIArg] = process.argv.slice(2);
  if (!filePath) {
    console.error('Please provide a file path');
    process.exit(1);
  }

  // Fall back to the default only when the argument is absent or unusable.
  // (`value || fallback` would also discard an explicit, valid 0.)
  const toPositiveInt = (raw: string | undefined, fallback: number): number => {
    const value = Number.parseInt(raw ?? '', 10);
    return Number.isInteger(value) && value > 0 ? value : fallback;
  };
  const toFiniteNumber = (raw: string | undefined, fallback: number): number => {
    const value = Number.parseFloat(raw ?? '');
    return Number.isFinite(value) ? value : fallback;
  };

  const n = toPositiveInt(nArg, 3); // Default to 3-grams
  const minFreq = toPositiveInt(minFreqArg, 2); // Default minimum frequency of 2
  const minPMI = toFiniteNumber(minPMIArg, 1.0); // Default minimum PMI of 1.0

  try {
    const text = await fs.promises.readFile(filePath, 'utf-8');
    const results = extractNgrams(text, n, minFreq, minPMI);

    console.log('\nN-gram Analysis Results:');
    console.log('------------------------');
    for (const { ngram, freq, pmi } of results) {
      // PMI is optional on a result; print it only when present.
      if (pmi !== undefined) {
        console.log(`${ngram}: ${freq} (PMI: ${pmi.toFixed(2)})`);
      } else {
        console.log(`${ngram}: ${freq}`);
      }
    }
  } catch (err: unknown) {
    // Anything can be thrown; only trust `.message` on real Error objects.
    const message = err instanceof Error ? err.message : String(err);
    console.error('Error:', message);
    process.exit(1);
  }
}
|
||||||
|
|
||||||
|
main();
|
||||||
@@ -61,9 +61,9 @@ IMPORTANT: Do not begin your response with phrases like "Sure", "Here is", "Belo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const TOOL_NAME = 'mdFixer';
|
const TOOL_NAME = 'finalizer';
|
||||||
|
|
||||||
export async function reviseAnswer(
|
export async function finalizeAnswer(
|
||||||
mdContent: string,
|
mdContent: string,
|
||||||
knowledgeItems: KnowledgeItem[],
|
knowledgeItems: KnowledgeItem[],
|
||||||
trackers: TrackerContext,
|
trackers: TrackerContext,
|
||||||
@@ -71,7 +71,7 @@ export async function reviseAnswer(
|
|||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const prompt = getPrompt(mdContent, knowledgeItems, schema);
|
const prompt = getPrompt(mdContent, knowledgeItems, schema);
|
||||||
trackers?.actionTracker.trackThink('final_answer', schema.languageCode)
|
trackers?.actionTracker.trackThink('finalize_answer', schema.languageCode)
|
||||||
|
|
||||||
const result = await generateText({
|
const result = await generateText({
|
||||||
model: getModel(TOOL_NAME),
|
model: getModel(TOOL_NAME),
|
||||||
@@ -83,10 +83,10 @@ export async function reviseAnswer(
|
|||||||
|
|
||||||
|
|
||||||
logInfo(TOOL_NAME, { text: result.text });
|
logInfo(TOOL_NAME, { text: result.text });
|
||||||
logDebug(`repaired before/after: ${mdContent.length} -> ${result.text.length}`);
|
logDebug(`finalized answer before/after: ${mdContent.length} -> ${result.text.length}`);
|
||||||
|
|
||||||
if (result.text.length < mdContent.length * 0.85) {
|
if (result.text.length < mdContent.length * 0.85) {
|
||||||
logWarning(`repaired content length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
|
logWarning(`finalized answer length ${result.text.length} is significantly shorter than original content ${mdContent.length}, return original content instead.`, {
|
||||||
originalContent: mdContent,
|
originalContent: mdContent,
|
||||||
repairedContent: result.text
|
repairedContent: result.text
|
||||||
});
|
});
|
||||||
@@ -96,7 +96,7 @@ export async function reviseAnswer(
|
|||||||
return result.text;
|
return result.text;
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logError(`Error in ${TOOL_NAME}`, { error });
|
logError(TOOL_NAME, { error });
|
||||||
return mdContent;
|
return mdContent;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
93
src/tools/reducer.ts
Normal file
93
src/tools/reducer.ts
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
import { PromptPair, TrackerContext } from '../types';
|
||||||
|
import { getModel } from "../config";
|
||||||
|
import { generateText } from "ai";
|
||||||
|
import { Schemas } from "../utils/schemas";
|
||||||
|
import { logInfo, logError, logDebug } from '../logging';
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Builds the prompt pair sent to the model by the reducer.
 *
 * The system prompt is a fixed instruction block telling the model to act as
 * an article aggregator: merge several source articles into one, keep original
 * sentences verbatim, drop duplicates, and reorder content logically.
 * The user message is `mdContent` passed through unchanged.
 *
 * @param mdContent the text to be merged; used verbatim as the user message
 * @returns the `{ system, user }` prompt pair
 */
function getPrompt(mdContent: string): PromptPair {
|
||||||
|
|
||||||
|
|
||||||
|
return {
|
||||||
|
system: `
|
||||||
|
You are an article aggregator that creates a coherent, high-quality article by smartly merging multiple source articles. Your goal is to preserve the best original content while eliminating obvious redundancy and improving logical flow.
|
||||||
|
|
||||||
|
<core-instructions>
|
||||||
|
1. Content Preservation
|
||||||
|
ALWAYS preserve original sentences verbatim - do not paraphrase or rewrite
|
||||||
|
Select the highest quality version when multiple articles cover the same point
|
||||||
|
Maintain the original author's voice and technical accuracy
|
||||||
|
Keep direct quotes, statistics, and factual claims exactly as written
|
||||||
|
2. Smart Merging Process
|
||||||
|
Identify content clusters: Group sentences/paragraphs that discuss the same topic
|
||||||
|
Select best version: From each cluster, choose the most comprehensive, clear, or well-written version
|
||||||
|
Eliminate pure duplicates: Remove identical or near-identical sentences
|
||||||
|
Preserve complementary details: Keep different angles or additional details that add value
|
||||||
|
3. Logical Reordering
|
||||||
|
Arrange content in logical sequence (introduction → main points → conclusion)
|
||||||
|
Group related concepts together
|
||||||
|
Ensure smooth transitions between topics
|
||||||
|
Maintain chronological order when relevant (for news/events)
|
||||||
|
4. Quality Criteria for Selection
|
||||||
|
When choosing between similar content, prioritize:
|
||||||
|
Clarity: More understandable explanations
|
||||||
|
Completeness: More comprehensive coverage
|
||||||
|
Accuracy: Better sourced or more precise information
|
||||||
|
Relevance: More directly related to the main topic
|
||||||
|
</core-instructions>
|
||||||
|
|
||||||
|
<output-format>
|
||||||
|
Structure the final article with:
|
||||||
|
Clear section headings (when appropriate)
|
||||||
|
Logical paragraph breaks
|
||||||
|
Smooth flow between topics
|
||||||
|
No attribution to individual sources (present as unified piece)
|
||||||
|
</output-format>
|
||||||
|
|
||||||
|
Do not add your own commentary or analysis
|
||||||
|
Do not change technical terms, names, or specific details
|
||||||
|
|
||||||
|
Your final output should read as a cohesive, high-quality article that appears to be written by a single author, while actually being a careful curation of the best sentences from all input sources.
|
||||||
|
`,
|
||||||
|
user: mdContent
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const TOOL_NAME = 'reducer';
|
||||||
|
|
||||||
|
/**
 * Asks the model selected by `getModel('reducer')` to merge `mdContent`
 * (several answers concatenated together) into a single article.
 *
 * This function never throws: if generation fails, or the model returns only
 * whitespace, the original `mdContent` is returned unchanged.
 *
 * @param mdContent the combined text to reduce
 * @param trackers  action/token trackers; a "reduce_answer" think step is
 *                  tracked before the call and token usage after it
 * @param schema    supplies `languageCode` for the tracked think message
 * @returns the reduced text, or `mdContent` as a fallback
 */
export async function reduceAnswers(
  mdContent: string,
  trackers: TrackerContext,
  schema: Schemas
): Promise<string> {
  try {
    const prompt = getPrompt(mdContent);
    trackers?.actionTracker.trackThink('reduce_answer', schema.languageCode);

    const result = await generateText({
      model: getModel(TOOL_NAME),
      system: prompt.system,
      prompt: prompt.user,
    });

    // Optional-chained like the trackThink call above: a missing tracker must
    // not throw here, otherwise the catch below would discard a good result.
    trackers?.tokenTracker.trackUsage(TOOL_NAME, result.usage);

    logInfo(TOOL_NAME, { text: result.text });
    logDebug(`reduce before/after: ${mdContent.length} -> ${result.text.length}`);

    // No "significantly shorter than the original" check is applied here:
    // removing duplicated content is the purpose of this step, so a shorter
    // result is expected. An empty result, however, is never a valid
    // reduction, so keep the input in that case.
    if (result.text.trim().length === 0) {
      logDebug(`${TOOL_NAME} returned empty text, return original content instead.`);
      return mdContent;
    }

    return result.text;
  } catch (error) {
    logError(TOOL_NAME, { error });
    return mdContent;
  }
}
|
||||||
@@ -5,7 +5,8 @@
|
|||||||
"read_for": "Let me read ${urls} to gather more information.",
|
"read_for": "Let me read ${urls} to gather more information.",
|
||||||
"read_for_verify": "Let me fetch the source content to verify the answer.",
|
"read_for_verify": "Let me fetch the source content to verify the answer.",
|
||||||
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
|
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
|
||||||
"final_answer": "Let me finalize the answer.",
|
"finalize_answer": "Let me finalize the answer.",
|
||||||
|
"reduce_answer": "Let me aggregate all research results.",
|
||||||
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
|
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
|
||||||
"hostnames_no_results": "Can't find any results from ${hostnames}.",
|
"hostnames_no_results": "Can't find any results from ${hostnames}.",
|
||||||
"cross_reference": "Let me cross-reference the information from the web to verify the answer."
|
"cross_reference": "Let me cross-reference the information from the web to verify the answer."
|
||||||
@@ -16,7 +17,8 @@
|
|||||||
"read_for": "让我读取网页 ${urls} 来获取更多信息。",
|
"read_for": "让我读取网页 ${urls} 来获取更多信息。",
|
||||||
"read_for_verify": "让我读取源网页内容来验证答案。",
|
"read_for_verify": "让我读取源网页内容来验证答案。",
|
||||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||||
"final_answer": "我来整理一下答案。",
|
"finalize_answer": "我来整理一下答案。",
|
||||||
|
"reduce_answer": "让我综合整理所有的调研结果。",
|
||||||
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
|
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
|
||||||
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
|
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
|
||||||
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
|
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
|
||||||
@@ -27,7 +29,8 @@
|
|||||||
"read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
|
"read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
|
||||||
"read_for_verify": "讓我獲取源內容來驗證答案。",
|
"read_for_verify": "讓我獲取源內容來驗證答案。",
|
||||||
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
|
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
|
||||||
"final_answer": "我來整理一下答案。",
|
"finalize_answer": "我來整理一下答案。",
|
||||||
|
"reduce_answer": "讓我整合所有調研結果。",
|
||||||
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!",
|
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!",
|
||||||
"hostnames_no_results": "咦... ${hostnames} 找不到什么结果。",
|
"hostnames_no_results": "咦... ${hostnames} 找不到什么结果。",
|
||||||
"cross_reference": "讓我交叉驗證一下網頁上的信息來驗證答案。"
|
"cross_reference": "讓我交叉驗證一下網頁上的信息來驗證答案。"
|
||||||
@@ -38,7 +41,8 @@
|
|||||||
"read_for": "${urls} を読んで、情報を集めます。",
|
"read_for": "${urls} を読んで、情報を集めます。",
|
||||||
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
|
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
|
||||||
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
|
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
|
||||||
"final_answer": "答えをまとめます。",
|
"finalize_answer": "答えをまとめます。",
|
||||||
|
"reduce_answer": "すべての調査結果を統合します。",
|
||||||
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
|
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
|
||||||
"hostnames_no_results": "${hostnames} から結果が見つかりません。",
|
"hostnames_no_results": "${hostnames} から結果が見つかりません。",
|
||||||
"cross_reference": "ウェブ上の情報をクロスリファレンスして、答えを確認します。"
|
"cross_reference": "ウェブ上の情報をクロスリファレンスして、答えを確認します。"
|
||||||
@@ -49,7 +53,8 @@
|
|||||||
"read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
|
"read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
|
||||||
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
|
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
|
||||||
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
|
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
|
||||||
"final_answer": "답변을 마무리하겠습니다.",
|
"finalize_answer": "답변을 마무리하겠습니다.",
|
||||||
|
"reduce_answer": "모든 조사 결과를 종합하겠습니다.",
|
||||||
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
|
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
|
||||||
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다.",
|
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다.",
|
||||||
"cross_reference": "웹에서 정보를 교차 검증하여 답변을 확인하겠습니다."
|
"cross_reference": "웹에서 정보를 교차 검증하여 답변을 확인하겠습니다."
|
||||||
@@ -60,7 +65,8 @@
|
|||||||
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
|
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
|
||||||
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
|
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
|
||||||
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
|
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
|
||||||
"final_answer": "Je vais finaliser la réponse.",
|
"finalize_answer": "Je vais finaliser la réponse.",
|
||||||
|
"reduce_answer": "Je vais agréger tous les résultats de recherche.",
|
||||||
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
|
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
|
||||||
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}.",
|
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}.",
|
||||||
"cross_reference": "Je vais croiser les informations sur le web pour vérifier la réponse."
|
"cross_reference": "Je vais croiser les informations sur le web pour vérifier la réponse."
|
||||||
@@ -71,7 +77,8 @@
|
|||||||
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
|
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
|
||||||
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
|
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
|
||||||
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
|
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
|
||||||
"final_answer": "Ich werde die Antwort abschließen.",
|
"finalize_answer": "Ich werde die Antwort abschließen.",
|
||||||
|
"reduce_answer": "Ich werde alle Rechercheergebnisse zusammenführen.",
|
||||||
"blocked_content": "Mist! ${url} lässt mich nicht rein.",
|
"blocked_content": "Mist! ${url} lässt mich nicht rein.",
|
||||||
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden.",
|
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden.",
|
||||||
"cross_reference": "Ich werde die Informationen im Web abgleichen, um die Antwort zu überprüfen."
|
"cross_reference": "Ich werde die Informationen im Web abgleichen, um die Antwort zu überprüfen."
|
||||||
@@ -82,7 +89,8 @@
|
|||||||
"read_for": "Voy a leer ${urls} para recopilar más información.",
|
"read_for": "Voy a leer ${urls} para recopilar más información.",
|
||||||
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
|
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
|
||||||
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
|
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
|
||||||
"final_answer": "Voy a finalizar la respuesta.",
|
"finalize_answer": "Voy a finalizar la respuesta.",
|
||||||
|
"reduce_answer": "Voy a agregar todos los resultados de la investigación.",
|
||||||
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
|
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
|
||||||
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
|
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
|
||||||
},
|
},
|
||||||
@@ -92,7 +100,8 @@
|
|||||||
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
|
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
|
||||||
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
|
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
|
||||||
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
|
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
|
||||||
"final_answer": "Finalizzerò la risposta.",
|
"finalize_answer": "Finalizzerò la risposta.",
|
||||||
|
"reduce_answer": "Aggregherò tutti i risultati della ricerca.",
|
||||||
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
|
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
|
||||||
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}.",
|
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}.",
|
||||||
"cross_reference": "Incrocerò le informazioni sul web per verificare la risposta."
|
"cross_reference": "Incrocerò le informazioni sul web per verificare la risposta."
|
||||||
@@ -103,7 +112,8 @@
|
|||||||
"read_for": "Vou ler ${urls} para reunir mais informações.",
|
"read_for": "Vou ler ${urls} para reunir mais informações.",
|
||||||
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
|
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
|
||||||
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
|
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
|
||||||
"final_answer": "Vou finalizar a resposta.",
|
"finalize_answer": "Vou finalizar a resposta.",
|
||||||
|
"reduce_answer": "Vou agregar todos os resultados da pesquisa.",
|
||||||
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
|
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
|
||||||
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}.",
|
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}.",
|
||||||
"cross_reference": "Vou cruzar as informações da web para verificar a resposta."
|
"cross_reference": "Vou cruzar as informações da web para verificar a resposta."
|
||||||
@@ -114,7 +124,8 @@
|
|||||||
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
|
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
|
||||||
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
|
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
|
||||||
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
|
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
|
||||||
"final_answer": "Дайте мне завершить ответ.",
|
"finalize_answer": "Дайте мне завершить ответ.",
|
||||||
|
"reduce_answer": "Дайте мне объединить все результаты исследования.",
|
||||||
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
|
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
|
||||||
"hostnames_no_results": "Ничего не найдено на ${hostnames}.",
|
"hostnames_no_results": "Ничего не найдено на ${hostnames}.",
|
||||||
"cross_reference": "Дайте мне сопоставить информацию из сети, чтобы проверить ответ."
|
"cross_reference": "Дайте мне сопоставить информацию из сети, чтобы проверить ответ."
|
||||||
@@ -125,6 +136,8 @@
|
|||||||
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
||||||
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
|
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
|
||||||
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
|
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
|
||||||
|
"finalize_answer": "دعني أنهي الإجابة.",
|
||||||
|
"reduce_answer": "دعني أجمع جميع نتائج البحث.",
|
||||||
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
|
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
|
||||||
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}.",
|
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}.",
|
||||||
"cross_reference": "دعني أقوم بمقارنة المعلومات من الويب للتحقق من الإجابة."
|
"cross_reference": "دعني أقوم بمقارنة المعلومات من الويب للتحقق من الإجابة."
|
||||||
@@ -135,7 +148,8 @@
|
|||||||
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
|
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
|
||||||
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
|
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
|
||||||
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
|
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
|
||||||
"final_answer": "Ik zal het antwoord afronden.",
|
"finalize_answer": "Ik zal het antwoord afronden.",
|
||||||
|
"reduce_answer": "Ik zal alle onderzoeksresultaten samenvoegen.",
|
||||||
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
|
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
|
||||||
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}.",
|
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}.",
|
||||||
"cross_reference": "Ik zal de informatie op het web kruisverwijzen om het antwoord te verifiëren."
|
"cross_reference": "Ik zal de informatie op het web kruisverwijzen om het antwoord te verifiëren."
|
||||||
@@ -146,7 +160,8 @@
|
|||||||
"read_for": "让我阅读 ${urls} 来获取更多信息。",
|
"read_for": "让我阅读 ${urls} 来获取更多信息。",
|
||||||
"read_for_verify": "让我获取源内容来验证答案。",
|
"read_for_verify": "让我获取源内容来验证答案。",
|
||||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||||
"final_answer": "我来整理一下答案。",
|
"finalize_answer": "我来整理一下答案。",
|
||||||
|
"reduce_answer": "让我整合所有调研结果。",
|
||||||
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
|
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
|
||||||
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
|
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。",
|
||||||
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
|
"cross_reference": "让我交叉验证一下网页上的信息来验证答案。"
|
||||||
|
|||||||
@@ -824,3 +824,111 @@ export async function detectBrokenUnicodeViaFileIO(str: string) {
|
|||||||
// Now check for the visible replacement character
|
// Now check for the visible replacement character
|
||||||
return { broken: readStr.includes('<27>'), readStr };
|
return { broken: readStr.includes('<27>'), readStr };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** One n-gram entry in the array returned by extractNgrams. */
interface NgramResult {
|
||||||
|
// The n-gram text: words joined by a single space for word-level n-grams,
// or a plain substring for character-level (CJK) n-grams.
ngram: string;
|
||||||
|
// Number of times this n-gram was counted across all non-empty lines of the input.
freq: number;
|
||||||
|
pmi?: number; // PMI score from calculatePMI; optional — consumers treat a missing value as "no PMI available"
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Pointwise mutual information of a space-separated n-gram:
 * `log2( P(ngram) / Π P(word) )`, with every probability estimated as
 * `count / totalNgrams`.
 *
 * Returns 0 (the neutral "no association" score) whenever the score cannot be
 * computed: the n-gram has fewer than two words, `totalNgrams` is not
 * positive, one of its words has no recorded frequency, or the result would
 * not be a finite number. Without these guards an unseen word produced a
 * division by zero and a PMI of `Infinity`.
 *
 * @param ngram       the n-gram, words separated by single spaces
 * @param ngramFreq   occurrence count of the n-gram
 * @param wordFreqs   occurrence count of each individual word
 * @param totalNgrams normalising constant used for all probabilities
 * @returns the PMI score, or 0 when it is undefined for the given input
 */
function calculatePMI(
  ngram: string,
  ngramFreq: number,
  wordFreqs: Map<string, number>,
  totalNgrams: number
): number {
  const words = ngram.split(' ');
  if (words.length < 2 || totalNgrams <= 0) return 0;

  // Joint probability of the whole n-gram
  const jointProb = ngramFreq / totalNgrams;

  // Probability of the words co-occurring if they were independent
  let independentProb = 1;
  for (const word of words) {
    const count = wordFreqs.get(word) ?? 0;
    if (count <= 0) return 0; // unseen word: PMI is undefined, not infinite
    independentProb *= count / totalNgrams;
  }

  const pmi = Math.log2(jointProb / independentProb);
  return Number.isFinite(pmi) ? pmi : 0;
}
|
||||||
|
|
||||||
|
/**
 * Tells whether the first UTF-16 code unit of `char` lies in one of the
 * CJK Unified Ideographs, Hiragana, Katakana or Hangul ranges checked below.
 * Only the first code unit is examined; an empty string yields false.
 */
function isCJK(char: string): boolean {
  const unit = char.charCodeAt(0);

  // Hiragana (0x3040-0x309F) and Katakana (0x30A0-0x30FF) are adjacent ranges
  if (unit >= 0x3040 && unit <= 0x30FF) return true;

  // CJK Unified Ideographs
  if (unit >= 0x4E00 && unit <= 0x9FFF) return true;

  // Hangul
  return unit >= 0xAC00 && unit <= 0xD7AF;
}
|
||||||
|
|
||||||
|
/**
 * Returns true as soon as any character of `text` is classified as CJK by
 * `isCJK`; returns false for text without such characters (including '').
 */
function isCJKText(text: string): boolean {
  // String iteration walks the text by code point, same as Array.from(text)
  for (const symbol of text) {
    if (isCJK(symbol)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Extracts frequent n-grams from `text`.
 *
 * The text is processed line by line (blank lines are skipped), so n-grams
 * never span a line break. For every line, all n-grams of length 2 up to `n`
 * are counted:
 * - lines containing at least one CJK character are split into characters;
 * - all other lines are split into whitespace-separated words.
 *
 * N-grams seen fewer than `minFreq` times are dropped. Word-level n-grams are
 * scored with `calculatePMI` and dropped when the score is below `minPMI` or
 * not finite. N-grams that contain CJK characters carry no PMI (`pmi` is left
 * undefined) and are therefore exempt from the `minPMI` filter — previously
 * they were given a PMI of 0, which the default threshold of 1.0 always
 * rejected, so CJK input produced no results at all.
 *
 * @param text    input text
 * @param n       maximum n-gram length (lengths 2..n are collected)
 * @param minFreq minimum occurrence count for an n-gram to be reported
 * @param minPMI  minimum PMI for a word-level n-gram to be reported
 * @returns n-grams with a PMI first (highest PMI first), followed by n-grams
 *          without a PMI (highest frequency first)
 */
export function extractNgrams(
  text: string,
  n: number,
  minFreq: number = 2,
  minPMI: number = 1.0
): NgramResult[] {
  const chunks = text.split('\n').filter(chunk => chunk.trim().length > 0);

  const ngramFreq = new Map<string, number>();
  const wordFreq = new Map<string, number>();
  let totalNgrams = 0;

  const increment = (table: Map<string, number>, key: string): void => {
    table.set(key, (table.get(key) ?? 0) + 1);
  };

  // First pass: collect frequencies
  for (const chunk of chunks) {
    const charLevel = isCJKText(chunk);

    // Array.from splits by code point, so characters outside the BMP are
    // kept whole instead of being cut in the middle of a surrogate pair.
    const tokens = charLevel
      ? Array.from(chunk)
      : chunk.split(/\s+/).filter(word => word.length > 0);
    const separator = charLevel ? '' : ' ';

    if (!charLevel) {
      // Individual word frequencies feed the PMI calculation
      for (const word of tokens) {
        increment(wordFreq, word);
      }
    }

    for (let len = 2; len <= n; len++) {
      for (let i = 0; i <= tokens.length - len; i++) {
        increment(ngramFreq, tokens.slice(i, i + len).join(separator));
        totalNgrams++;
      }
    }
  }

  // Second pass: calculate PMI and filter
  const results: NgramResult[] = [];
  for (const [ngram, freq] of ngramFreq) {
    if (freq < minFreq) continue;

    if (isCJKText(ngram)) {
      // No word-level PMI exists for character n-grams: report frequency only
      results.push({ ngram, freq });
      continue;
    }

    const pmi = calculatePMI(ngram, freq, wordFreq, totalNgrams);
    if (Number.isFinite(pmi) && pmi >= minPMI) {
      results.push({ ngram, freq, pmi });
    }
  }

  results.sort((a, b) => {
    // If both have PMI scores, sort by PMI
    if (a.pmi !== undefined && b.pmi !== undefined) {
      return b.pmi - a.pmi;
    }
    // If only one has PMI, prioritize the one with PMI
    if (a.pmi !== undefined) return -1;
    if (b.pmi !== undefined) return 1;
    // If neither has PMI (CJK text), sort by frequency
    return b.freq - a.freq;
  });

  return results;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user