mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2026-03-22 07:29:35 +08:00
feat: late chunking
This commit is contained in:
@@ -434,7 +434,8 @@ export async function getResponse(question?: string,
|
|||||||
allKnowledge,
|
allKnowledge,
|
||||||
allURLs,
|
allURLs,
|
||||||
visitedURLs,
|
visitedURLs,
|
||||||
SchemaGen.languageCode
|
SchemaGen,
|
||||||
|
currentQuestion
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -701,7 +702,8 @@ You decided to think out of the box or cut from a completely different angle.
|
|||||||
allKnowledge,
|
allKnowledge,
|
||||||
allURLs,
|
allURLs,
|
||||||
visitedURLs,
|
visitedURLs,
|
||||||
SchemaGen.languageCode
|
SchemaGen,
|
||||||
|
currentQuestion
|
||||||
);
|
);
|
||||||
|
|
||||||
diaryContext.push(success
|
diaryContext.push(success
|
||||||
|
|||||||
38
src/app.ts
38
src/app.ts
@@ -7,7 +7,7 @@ import {
|
|||||||
ChatCompletionResponse,
|
ChatCompletionResponse,
|
||||||
ChatCompletionChunk,
|
ChatCompletionChunk,
|
||||||
AnswerAction,
|
AnswerAction,
|
||||||
Model, StepAction
|
Model, StepAction, VisitAction
|
||||||
} from './types';
|
} from './types';
|
||||||
import {TokenTracker} from "./utils/token-tracker";
|
import {TokenTracker} from "./utils/token-tracker";
|
||||||
import {ActionTracker} from "./utils/action-tracker";
|
import {ActionTracker} from "./utils/action-tracker";
|
||||||
@@ -337,7 +337,7 @@ async function processQueue(streamingState: StreamingState, res: Response, reque
|
|||||||
system_fingerprint: 'fp_' + requestId,
|
system_fingerprint: 'fp_' + requestId,
|
||||||
choices: [{
|
choices: [{
|
||||||
index: 0,
|
index: 0,
|
||||||
delta: {content: word, type: "think"},
|
delta: {content: word, type: 'think'},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: null
|
finish_reason: null
|
||||||
}]
|
}]
|
||||||
@@ -475,7 +475,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
system_fingerprint: 'fp_' + requestId,
|
system_fingerprint: 'fp_' + requestId,
|
||||||
choices: [{
|
choices: [{
|
||||||
index: 0,
|
index: 0,
|
||||||
delta: {role: 'assistant', content: '<think>', type: "text"},
|
delta: {role: 'assistant', content: '<think>', type: 'think'},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: null
|
finish_reason: null
|
||||||
}]
|
}]
|
||||||
@@ -485,6 +485,24 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
// Set up progress listener with cleanup
|
// Set up progress listener with cleanup
|
||||||
const actionListener = async (step: StepAction) => {
|
const actionListener = async (step: StepAction) => {
|
||||||
// Add content to queue for both thinking steps and final answer
|
// Add content to queue for both thinking steps and final answer
|
||||||
|
if (step.action === 'visit') {
|
||||||
|
(step as VisitAction).URLTargets.forEach((url) => {
|
||||||
|
const chunk: ChatCompletionChunk = {
|
||||||
|
id: requestId,
|
||||||
|
object: 'chat.completion.chunk',
|
||||||
|
created,
|
||||||
|
model: body.model,
|
||||||
|
system_fingerprint: 'fp_' + requestId,
|
||||||
|
choices: [{
|
||||||
|
index: 0,
|
||||||
|
delta: {type: 'think', url},
|
||||||
|
logprobs: null,
|
||||||
|
finish_reason: null,
|
||||||
|
}]
|
||||||
|
};
|
||||||
|
res.write(`data: ${JSON.stringify(chunk)}\n\n`);
|
||||||
|
});
|
||||||
|
}
|
||||||
if (step.think) {
|
if (step.think) {
|
||||||
// if not ends with a space, add one
|
// if not ends with a space, add one
|
||||||
const content = step.think + ' ';
|
const content = step.think + ' ';
|
||||||
@@ -548,9 +566,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
system_fingerprint: 'fp_' + requestId,
|
system_fingerprint: 'fp_' + requestId,
|
||||||
choices: [{
|
choices: [{
|
||||||
index: 0,
|
index: 0,
|
||||||
delta: {content: `</think>\n\n`, type: "think"},
|
delta: {content: `</think>\n\n`, type: 'think'},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: null
|
finish_reason: 'thinking_end'
|
||||||
}]
|
}]
|
||||||
};
|
};
|
||||||
res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
|
res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
|
||||||
@@ -638,9 +656,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
system_fingerprint: 'fp_' + requestId,
|
system_fingerprint: 'fp_' + requestId,
|
||||||
choices: [{
|
choices: [{
|
||||||
index: 0,
|
index: 0,
|
||||||
delta: {content: '</think>', type: "think"},
|
delta: {content: '</think>', type: 'think'},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: null
|
finish_reason: 'error'
|
||||||
}],
|
}],
|
||||||
usage,
|
usage,
|
||||||
};
|
};
|
||||||
@@ -655,9 +673,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
system_fingerprint: 'fp_' + requestId,
|
system_fingerprint: 'fp_' + requestId,
|
||||||
choices: [{
|
choices: [{
|
||||||
index: 0,
|
index: 0,
|
||||||
delta: {content: errorMessage, type: "error"},
|
delta: {content: errorMessage, type: 'error'},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: 'stop'
|
finish_reason: 'error'
|
||||||
}],
|
}],
|
||||||
usage
|
usage
|
||||||
};
|
};
|
||||||
@@ -679,7 +697,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
|||||||
type: 'error'
|
type: 'error'
|
||||||
},
|
},
|
||||||
logprobs: null,
|
logprobs: null,
|
||||||
finish_reason: 'stop'
|
finish_reason: 'error'
|
||||||
}],
|
}],
|
||||||
usage,
|
usage,
|
||||||
};
|
};
|
||||||
|
|||||||
218
src/tools/jina-latechunk.ts
Normal file
218
src/tools/jina-latechunk.ts
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
import {TrackerContext} from "../types";
|
||||||
|
import axios from 'axios';
|
||||||
|
import {JINA_API_KEY} from "../config";
|
||||||
|
import {Schemas} from "../utils/schemas";
|
||||||
|
|
||||||
|
export async function cherryPick(question: string, longContext: string, options: any = {}, trackers: TrackerContext, schemaGen: Schemas) {
|
||||||
|
|
||||||
|
const {
|
||||||
|
snippetLength = 2000,
|
||||||
|
numSnippets = 2,
|
||||||
|
chunkSize = 200,
|
||||||
|
maxTokensPerRequest = 8192, // Maximum tokens per embedding request
|
||||||
|
// Rough estimate of tokens per character (can be adjusted based on your text)
|
||||||
|
tokensPerCharacter = 0.5
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
if (longContext.length < snippetLength * numSnippets) {
|
||||||
|
// If the context is shorter than the snippet length, return the whole context
|
||||||
|
return longContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split the longContext into chunks of chunkSize
|
||||||
|
const chunks: string[] = [];
|
||||||
|
for (let i = 0; i < longContext.length; i += chunkSize) {
|
||||||
|
chunks.push(longContext.substring(i, Math.min(i + chunkSize, longContext.length)));
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('late chunking enabled! num chunks:', chunks.length);
|
||||||
|
|
||||||
|
trackers.actionTracker.trackThink('late_chunk', schemaGen.languageCode);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Estimate the number of tokens per chunk
|
||||||
|
const estimatedTokensPerChunk = Math.ceil(chunkSize * tokensPerCharacter);
|
||||||
|
|
||||||
|
// Calculate chunks per batch to stay under token limit
|
||||||
|
const chunksPerBatch = Math.floor(maxTokensPerRequest / estimatedTokensPerChunk);
|
||||||
|
|
||||||
|
// Create batches of chunks
|
||||||
|
const chunkBatches = [];
|
||||||
|
for (let i = 0; i < chunks.length; i += chunksPerBatch) {
|
||||||
|
chunkBatches.push(chunks.slice(i, i + chunksPerBatch));
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Total length ${longContext.length} split ${chunks.length} chunks into ${chunkBatches.length} batches of ~${chunksPerBatch} chunks each`);
|
||||||
|
|
||||||
|
// Process each batch and collect the embeddings
|
||||||
|
const allChunkEmbeddings: number[][] = [];
|
||||||
|
let totalTokensUsed = 0;
|
||||||
|
|
||||||
|
for (let batchIndex = 0; batchIndex < chunkBatches.length; batchIndex++) {
|
||||||
|
const batch = chunkBatches[batchIndex];
|
||||||
|
console.log(`Processing batch ${batchIndex + 1}/${chunkBatches.length} with ${batch.length} chunks`);
|
||||||
|
|
||||||
|
// Get embeddings for the current batch
|
||||||
|
const batchEmbeddingResponse = await axios.post(
|
||||||
|
'https://api.jina.ai/v1/embeddings',
|
||||||
|
{
|
||||||
|
model: "jina-embeddings-v3",
|
||||||
|
task: "retrieval.passage",
|
||||||
|
late_chunking: true,
|
||||||
|
dimensions: 1024,
|
||||||
|
embedding_type: "float",
|
||||||
|
input: batch
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': `Bearer ${JINA_API_KEY}`
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (batchEmbeddingResponse.status !== 200) {
|
||||||
|
throw new Error(`Unexpected status code from API: ${batchEmbeddingResponse.status}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate response structure
|
||||||
|
if (!batchEmbeddingResponse.data?.data) {
|
||||||
|
throw new Error("Unexpected API response format");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract embeddings from this batch
|
||||||
|
const batchEmbeddings = batchEmbeddingResponse.data.data.map((item: any) => item.embedding);
|
||||||
|
allChunkEmbeddings.push(...batchEmbeddings);
|
||||||
|
|
||||||
|
// Track token usage
|
||||||
|
const batchTokens = batchEmbeddingResponse.data.usage?.total_tokens || 0;
|
||||||
|
totalTokensUsed += batchTokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get embedding for the question
|
||||||
|
const questionEmbeddingResponse = await axios.post(
|
||||||
|
'https://api.jina.ai/v1/embeddings',
|
||||||
|
{
|
||||||
|
model: "jina-embeddings-v3",
|
||||||
|
task: "retrieval.query",
|
||||||
|
dimensions: 1024,
|
||||||
|
embedding_type: "float",
|
||||||
|
input: [question]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': `Bearer ${JINA_API_KEY}`
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (questionEmbeddingResponse.status !== 200) {
|
||||||
|
throw new Error("Unexpected status code from API");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate question embedding response
|
||||||
|
if (!questionEmbeddingResponse.data?.data || !questionEmbeddingResponse.data.data[0]?.embedding) {
|
||||||
|
throw new Error("Question embedding not found in API response");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track token usage for question embedding
|
||||||
|
const questionTokens = questionEmbeddingResponse.data.usage?.total_tokens || 0;
|
||||||
|
totalTokensUsed += questionTokens;
|
||||||
|
|
||||||
|
// Track total token usage
|
||||||
|
trackers.tokenTracker.trackUsage('latechunk', {
|
||||||
|
promptTokens: totalTokensUsed,
|
||||||
|
completionTokens: 0,
|
||||||
|
totalTokens: totalTokensUsed
|
||||||
|
});
|
||||||
|
|
||||||
|
const questionEmbedding = questionEmbeddingResponse.data.data[0].embedding;
|
||||||
|
|
||||||
|
// Verify that we got embeddings for all chunks
|
||||||
|
if (allChunkEmbeddings.length !== chunks.length) {
|
||||||
|
console.error(`Got ${allChunkEmbeddings.length} embeddings for ${chunks.length} chunks`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate cosine similarity between the question and each chunk
|
||||||
|
const similarities = allChunkEmbeddings.map((chunkEmbed: number[]) => {
|
||||||
|
return cosineSimilarity(questionEmbedding, chunkEmbed);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Calculate the number of chunks needed for a single snippet
|
||||||
|
const chunksPerSnippet = Math.ceil(snippetLength / chunkSize);
|
||||||
|
|
||||||
|
// Find the top `numSnippets` snippets with highest average similarity
|
||||||
|
const snippets: string[] = [];
|
||||||
|
|
||||||
|
// Create a copy of similarities to avoid modifying the original
|
||||||
|
const similaritiesCopy = [...similarities];
|
||||||
|
|
||||||
|
for (let i = 0; i < numSnippets; i++) {
|
||||||
|
// Find the best starting position for the snippet
|
||||||
|
let bestStartIndex = 0;
|
||||||
|
let bestScore = -Infinity;
|
||||||
|
|
||||||
|
// Check each possible starting position for a snippet
|
||||||
|
for (let j = 0; j <= similarities.length - chunksPerSnippet; j++) {
|
||||||
|
// Calculate the average similarity for the current window
|
||||||
|
const windowScores = similaritiesCopy.slice(j, j + chunksPerSnippet);
|
||||||
|
const windowScore = windowScores.reduce((sum, score) => sum + score, 0) / windowScores.length;
|
||||||
|
|
||||||
|
if (windowScore > bestScore) {
|
||||||
|
bestScore = windowScore;
|
||||||
|
bestStartIndex = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the snippet text
|
||||||
|
const startIndex = bestStartIndex * chunkSize;
|
||||||
|
const endIndex = Math.min(startIndex + snippetLength, longContext.length);
|
||||||
|
snippets.push(longContext.substring(startIndex, endIndex));
|
||||||
|
|
||||||
|
// Mark the used chunks with a very low score to avoid reusing them
|
||||||
|
for (let k = bestStartIndex; k < bestStartIndex + chunksPerSnippet && k < similaritiesCopy.length; k++) {
|
||||||
|
similaritiesCopy[k] = -Infinity;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// wrap with <snippet-index> tag
|
||||||
|
return snippets.map((snippet, index) => `
|
||||||
|
<snippet-${index+1}>
|
||||||
|
|
||||||
|
${snippet}
|
||||||
|
|
||||||
|
</snippet-${index+1}>`.trim()).join("\n\n");
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error in late chunking:', error);
|
||||||
|
// Fallback: just return the beginning of the context up to the desired length
|
||||||
|
return longContext.substring(0, snippetLength * numSnippets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function to calculate cosine similarity between two vectors
|
||||||
|
function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
|
||||||
|
if (vectorA.length !== vectorB.length) {
|
||||||
|
throw new Error("Vectors must have the same length");
|
||||||
|
}
|
||||||
|
|
||||||
|
let dotProduct = 0;
|
||||||
|
let magnitudeA = 0;
|
||||||
|
let magnitudeB = 0;
|
||||||
|
|
||||||
|
for (let i = 0; i < vectorA.length; i++) {
|
||||||
|
dotProduct += vectorA[i] * vectorB[i];
|
||||||
|
magnitudeA += vectorA[i] * vectorA[i];
|
||||||
|
magnitudeB += vectorB[i] * vectorB[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
magnitudeA = Math.sqrt(magnitudeA);
|
||||||
|
magnitudeB = Math.sqrt(magnitudeB);
|
||||||
|
|
||||||
|
if (magnitudeA === 0 || magnitudeB === 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return dotProduct / (magnitudeA * magnitudeB);
|
||||||
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
import https from 'https';
|
import https from 'https';
|
||||||
import { TokenTracker } from "../utils/token-tracker";
|
import {TokenTracker} from "../utils/token-tracker";
|
||||||
import { ReadResponse } from '../types';
|
import {ReadResponse} from '../types';
|
||||||
import { JINA_API_KEY } from "../config";
|
import {JINA_API_KEY} from "../config";
|
||||||
|
|
||||||
export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
|
export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
@@ -10,13 +10,15 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const data = JSON.stringify({ url });
|
const data = JSON.stringify({url});
|
||||||
const headers: Record<string, any> = {
|
const headers: Record<string, any> = {
|
||||||
'Accept': 'application/json',
|
'Accept': 'application/json',
|
||||||
'Authorization': `Bearer ${JINA_API_KEY}`,
|
'Authorization': `Bearer ${JINA_API_KEY}`,
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'X-Retain-Images': 'none',
|
'X-Retain-Images': 'none',
|
||||||
};
|
'X-Md-Link-Style': 'discarded',
|
||||||
|
'X-Timeout': '20'
|
||||||
|
};
|
||||||
if (withAllLinks) {
|
if (withAllLinks) {
|
||||||
headers['X-With-Links-Summary'] = 'all'
|
headers['X-With-Links-Summary'] = 'all'
|
||||||
}
|
}
|
||||||
@@ -75,12 +77,12 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
|
|||||||
const tokens = response.data.usage?.tokens || 0;
|
const tokens = response.data.usage?.tokens || 0;
|
||||||
const tokenTracker = tracker || new TokenTracker();
|
const tokenTracker = tracker || new TokenTracker();
|
||||||
tokenTracker.trackUsage('read', {
|
tokenTracker.trackUsage('read', {
|
||||||
totalTokens: tokens,
|
totalTokens: tokens,
|
||||||
promptTokens: url.length,
|
promptTokens: url.length,
|
||||||
completionTokens: tokens
|
completionTokens: tokens
|
||||||
});
|
});
|
||||||
|
|
||||||
resolve({ response });
|
resolve({response});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -233,7 +233,7 @@ export interface ChatCompletionResponse {
|
|||||||
type: 'text' | 'think' | 'json' | 'error';
|
type: 'text' | 'think' | 'json' | 'error';
|
||||||
};
|
};
|
||||||
logprobs: null;
|
logprobs: null;
|
||||||
finish_reason: 'stop';
|
finish_reason: 'stop' | 'error';
|
||||||
}>;
|
}>;
|
||||||
usage: {
|
usage: {
|
||||||
prompt_tokens: number;
|
prompt_tokens: number;
|
||||||
@@ -256,9 +256,10 @@ export interface ChatCompletionChunk {
|
|||||||
role?: 'assistant';
|
role?: 'assistant';
|
||||||
content?: string;
|
content?: string;
|
||||||
type?: 'text' | 'think' | 'json' | 'error';
|
type?: 'text' | 'think' | 'json' | 'error';
|
||||||
|
url?: string;
|
||||||
};
|
};
|
||||||
logprobs: null;
|
logprobs: null;
|
||||||
finish_reason: null | 'stop';
|
finish_reason: null | 'stop' | 'thinking_end' | 'error';
|
||||||
}>;
|
}>;
|
||||||
usage?: any;
|
usage?: any;
|
||||||
visitedURLs?: string[];
|
visitedURLs?: string[];
|
||||||
|
|||||||
@@ -3,84 +3,98 @@
|
|||||||
"eval_first": "But wait, let me evaluate the answer first.",
|
"eval_first": "But wait, let me evaluate the answer first.",
|
||||||
"search_for": "Let me search for ${keywords} to gather more information.",
|
"search_for": "Let me search for ${keywords} to gather more information.",
|
||||||
"read_for": "Let me read ${urls} to gather more information.",
|
"read_for": "Let me read ${urls} to gather more information.",
|
||||||
"read_for_verify": "Let me fetch the source content to verify the answer."
|
"read_for_verify": "Let me fetch the source content to verify the answer.",
|
||||||
|
"late_chunk": "Source is too long, I'm cherry-picking the relevant parts."
|
||||||
},
|
},
|
||||||
"zh-CN": {
|
"zh-CN": {
|
||||||
"eval_first": "等等,让我先自己评估一下答案。",
|
"eval_first": "等等,让我先自己评估一下答案。",
|
||||||
"search_for": "让我搜索${keywords}来获取更多信息。",
|
"search_for": "让我搜索${keywords}来获取更多信息。",
|
||||||
"read_for": "让我读取网页${urls}来获取更多信息。",
|
"read_for": "让我读取网页${urls}来获取更多信息。",
|
||||||
"read_for_verify": "让我读取源网页内容来验证答案。"
|
"read_for_verify": "让我读取源网页内容来验证答案。",
|
||||||
|
"late_chunk": "源内容太长,我正在挑选相关部分。"
|
||||||
},
|
},
|
||||||
"zh-TW": {
|
"zh-TW": {
|
||||||
"eval_first": "等等,讓我先評估一下答案。",
|
"eval_first": "等等,讓我先評估一下答案。",
|
||||||
"search_for": "讓我搜索${keywords}來獲取更多信息。",
|
"search_for": "讓我搜索${keywords}來獲取更多信息。",
|
||||||
"read_for": "讓我閱讀${urls}來獲取更多信息。",
|
"read_for": "讓我閱讀${urls}來獲取更多信息。",
|
||||||
"read_for_verify": "讓我獲取源內容來驗證答案。"
|
"read_for_verify": "讓我獲取源內容來驗證答案。",
|
||||||
|
"late_chunk": "源內容太長,我正在挑選相關部分。"
|
||||||
},
|
},
|
||||||
"ja": {
|
"ja": {
|
||||||
"eval_first": "ちょっと待って、まず答えを評価します。",
|
"eval_first": "ちょっと待って、まず答えを評価します。",
|
||||||
"search_for": "キーワード${keywords}で検索して、情報を集めます。",
|
"search_for": "キーワード${keywords}で検索して、情報を集めます。",
|
||||||
"read_for": "URL${urls}を読んで、情報を集めます。",
|
"read_for": "URL${urls}を読んで、情報を集めます。",
|
||||||
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。"
|
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
|
||||||
|
"late_chunk": "ソースが長すぎるため、関連部分を抜粋しています。"
|
||||||
},
|
},
|
||||||
"ko": {
|
"ko": {
|
||||||
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
|
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
|
||||||
"search_for": "키워드 ${keywords}로 검색하여 더 많은 정보를 수집하겠습니다.",
|
"search_for": "키워드 ${keywords}로 검색하여 더 많은 정보를 수집하겠습니다.",
|
||||||
"read_for": "URL ${urls}을 읽어 더 많은 정보를 수집하겠습니다.",
|
"read_for": "URL ${urls}을 읽어 더 많은 정보를 수집하겠습니다.",
|
||||||
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다."
|
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
|
||||||
|
"late_chunk": "소스가 너무 길어서 관련 부분만 추출하고 있습니다."
|
||||||
},
|
},
|
||||||
"fr": {
|
"fr": {
|
||||||
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
|
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
|
||||||
"search_for": "Je vais rechercher ${keywords} pour obtenir plus d'informations.",
|
"search_for": "Je vais rechercher ${keywords} pour obtenir plus d'informations.",
|
||||||
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
|
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
|
||||||
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse."
|
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
|
||||||
|
"late_chunk": "La source est trop longue, je sélectionne les parties pertinentes."
|
||||||
},
|
},
|
||||||
"de": {
|
"de": {
|
||||||
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
|
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
|
||||||
"search_for": "Ich werde nach ${keywords} suchen, um weitere Informationen zu sammeln.",
|
"search_for": "Ich werde nach ${keywords} suchen, um weitere Informationen zu sammeln.",
|
||||||
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
|
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
|
||||||
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen."
|
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
|
||||||
|
"late_chunk": "Die Quelle ist zu lang, ich wähle die relevanten Teile aus."
|
||||||
},
|
},
|
||||||
"es": {
|
"es": {
|
||||||
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
|
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
|
||||||
"search_for": "Voy a buscar ${keywords} para recopilar más información.",
|
"search_for": "Voy a buscar ${keywords} para recopilar más información.",
|
||||||
"read_for": "Voy a leer ${urls} para recopilar más información.",
|
"read_for": "Voy a leer ${urls} para recopilar más información.",
|
||||||
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta."
|
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
|
||||||
|
"late_chunk": "La fuente es demasiado larga, estoy seleccionando las partes relevantes."
|
||||||
},
|
},
|
||||||
"it": {
|
"it": {
|
||||||
"eval_first": "Un attimo, valuterò prima la risposta.",
|
"eval_first": "Un attimo, valuterò prima la risposta.",
|
||||||
"search_for": "Cercherò ${keywords} per raccogliere ulteriori informazioni.",
|
"search_for": "Cercherò ${keywords} per raccogliere ulteriori informazioni.",
|
||||||
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
|
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
|
||||||
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta."
|
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
|
||||||
|
"late_chunk": "La fonte è troppo lunga, sto selezionando le parti rilevanti."
|
||||||
},
|
},
|
||||||
"pt": {
|
"pt": {
|
||||||
"eval_first": "Um momento, vou avaliar a resposta primeiro.",
|
"eval_first": "Um momento, vou avaliar a resposta primeiro.",
|
||||||
"search_for": "Vou pesquisar ${keywords} para reunir mais informações.",
|
"search_for": "Vou pesquisar ${keywords} para reunir mais informações.",
|
||||||
"read_for": "Vou ler ${urls} para reunir mais informações.",
|
"read_for": "Vou ler ${urls} para reunir mais informações.",
|
||||||
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta."
|
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
|
||||||
|
"late_chunk": "A fonte é muito longa, estou selecionando as partes relevantes."
|
||||||
},
|
},
|
||||||
"ru": {
|
"ru": {
|
||||||
"eval_first": "Подождите, я сначала оценю ответ.",
|
"eval_first": "Подождите, я сначала оценю ответ.",
|
||||||
"search_for": "Дайте мне поискать ${keywords} для сбора дополнительной информации.",
|
"search_for": "Дайте мне поискать ${keywords} для сбора дополнительной информации.",
|
||||||
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
|
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
|
||||||
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа."
|
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
|
||||||
|
"late_chunk": "Источник слишком длинный, я выбираю только значимые части."
|
||||||
},
|
},
|
||||||
"ar": {
|
"ar": {
|
||||||
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
|
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
|
||||||
"search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
|
"search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
|
||||||
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
||||||
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة."
|
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
|
||||||
|
"late_chunk": "المصدر طويل جدًا، أنا أختار الأجزاء ذات الصلة."
|
||||||
},
|
},
|
||||||
"nl": {
|
"nl": {
|
||||||
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
|
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
|
||||||
"search_for": "Ik zal zoeken naar ${keywords} om meer informatie te verzamelen.",
|
"search_for": "Ik zal zoeken naar ${keywords} om meer informatie te verzamelen.",
|
||||||
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
|
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
|
||||||
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren."
|
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
|
||||||
|
"late_chunk": "De bron is te lang, ik selecteer de relevante delen."
|
||||||
},
|
},
|
||||||
"zh": {
|
"zh": {
|
||||||
"eval_first": "等等,让我先评估一下答案。",
|
"eval_first": "等等,让我先评估一下答案。",
|
||||||
"search_for": "让我搜索${keywords}来获取更多信息。",
|
"search_for": "让我搜索${keywords}来获取更多信息。",
|
||||||
"read_for": "让我阅读${urls}来获取更多信息。",
|
"read_for": "让我阅读${urls}来获取更多信息。",
|
||||||
"read_for_verify": "让我获取源内容来验证答案。"
|
"read_for_verify": "让我获取源内容来验证答案。",
|
||||||
|
"late_chunk": "源内容太长,我正在挑选相关部分。"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types";
|
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
|
||||||
import {removeAllLineBreaks, smartMergeStrings} from "./text-tools";
|
import {smartMergeStrings} from "./text-tools";
|
||||||
import {rerankDocuments} from "../tools/jina-rerank";
|
import {rerankDocuments} from "../tools/jina-rerank";
|
||||||
import {readUrl} from "../tools/read";
|
import {readUrl} from "../tools/read";
|
||||||
|
import {Schemas} from "./schemas";
|
||||||
|
import {cherryPick} from "../tools/jina-latechunk";
|
||||||
|
|
||||||
export function normalizeUrl(urlString: string, debug = false, options = {
|
export function normalizeUrl(urlString: string, debug = false, options = {
|
||||||
removeAnchors: true,
|
removeAnchors: true,
|
||||||
@@ -390,7 +392,8 @@ export async function processURLs(
|
|||||||
allKnowledge: KnowledgeItem[],
|
allKnowledge: KnowledgeItem[],
|
||||||
allURLs: Record<string, SearchSnippet>,
|
allURLs: Record<string, SearchSnippet>,
|
||||||
visitedURLs: string[],
|
visitedURLs: string[],
|
||||||
languageCode: string
|
schemaGen: Schemas,
|
||||||
|
question: string
|
||||||
): Promise<{urlResults: any[], success: boolean}> {
|
): Promise<{urlResults: any[], success: boolean}> {
|
||||||
// Skip if no URLs to process
|
// Skip if no URLs to process
|
||||||
if (urls.length === 0) {
|
if (urls.length === 0) {
|
||||||
@@ -398,7 +401,7 @@ export async function processURLs(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Track the reading action
|
// Track the reading action
|
||||||
context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')});
|
context.actionTracker.trackThink('read_for', schemaGen.languageCode, {urls: urls.join(', ')});
|
||||||
|
|
||||||
// Process each URL in parallel
|
// Process each URL in parallel
|
||||||
const urlResults = await Promise.all(
|
const urlResults = await Promise.all(
|
||||||
@@ -407,7 +410,10 @@ export async function processURLs(
|
|||||||
const {response} = await readUrl(url, true, context.tokenTracker);
|
const {response} = await readUrl(url, true, context.tokenTracker);
|
||||||
const {data} = response;
|
const {data} = response;
|
||||||
const guessedTime = await getLastModified(url);
|
const guessedTime = await getLastModified(url);
|
||||||
console.log('Guessed time for', url, guessedTime);
|
if (guessedTime) {
|
||||||
|
console.log('Guessed time for', url, guessedTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Early return if no valid data
|
// Early return if no valid data
|
||||||
if (!data?.url || !data?.content) {
|
if (!data?.url || !data?.content) {
|
||||||
@@ -417,7 +423,7 @@ export async function processURLs(
|
|||||||
// Add to knowledge base
|
// Add to knowledge base
|
||||||
allKnowledge.push({
|
allKnowledge.push({
|
||||||
question: `What do expert say about "${data.title}"?`,
|
question: `What do expert say about "${data.title}"?`,
|
||||||
answer: removeAllLineBreaks(data.content),
|
answer: await cherryPick(question, data.content, {}, context, schemaGen),
|
||||||
references: [data.url],
|
references: [data.url],
|
||||||
type: 'url',
|
type: 'url',
|
||||||
updated: guessedTime
|
updated: guessedTime
|
||||||
|
|||||||
Reference in New Issue
Block a user