feat: late chunking

2026-03-22 07:29:35 +08:00 · 2025-03-12 14:07:11 +08:00
parent c8fc259dff
commit 013056f218
7 changed files with 308 additions and 47 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -434,7 +434,8 @@ export async function getResponse(question?: string,
          allKnowledge,
          allURLs,
          visitedURLs,
-          SchemaGen.languageCode
+          SchemaGen,
          currentQuestion
        );
      }
@@ -701,7 +702,8 @@ You decided to think out of the box or cut from a completely different angle.
          allKnowledge,
          allURLs,
          visitedURLs,
-          SchemaGen.languageCode
+          SchemaGen,
          currentQuestion
        );
        diaryContext.push(success
--- a/src/app.ts
+++ b/src/app.ts
@@ -7,7 +7,7 @@ import {
  ChatCompletionResponse,
  ChatCompletionChunk,
  AnswerAction,
-  Model, StepAction
+  Model, StepAction, VisitAction
 } from './types';
 import {TokenTracker} from "./utils/token-tracker";
 import {ActionTracker} from "./utils/action-tracker";
@@ -337,7 +337,7 @@ async function processQueue(streamingState: StreamingState, res: Response, reque
          system_fingerprint: 'fp_' + requestId,
          choices: [{
            index: 0,
-            delta: {content: word, type: "think"},
+            delta: {content: word, type: 'think'},
            logprobs: null,
            finish_reason: null
          }]
@@ -475,7 +475,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
      system_fingerprint: 'fp_' + requestId,
      choices: [{
        index: 0,
-        delta: {role: 'assistant', content: '<think>', type: "text"},
+        delta: {role: 'assistant', content: '<think>', type: 'think'},
        logprobs: null,
        finish_reason: null
      }]
@@ -485,6 +485,24 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
    // Set up progress listener with cleanup
    const actionListener = async (step: StepAction) => {
      // Add content to queue for both thinking steps and final answer
      if (step.action === 'visit') {
        (step as VisitAction).URLTargets.forEach((url) => {
          const chunk: ChatCompletionChunk = {
          id: requestId,
          object: 'chat.completion.chunk',
          created,
          model: body.model,
          system_fingerprint: 'fp_' + requestId,
          choices: [{
            index: 0,
            delta: {type: 'think', url},
            logprobs: null,
            finish_reason: null,
          }]
        };
        res.write(`data: ${JSON.stringify(chunk)}\n\n`);
        });
      }
      if (step.think) {
        // if not ends with a space, add one
        const content = step.think + ' ';
@@ -548,9 +566,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        system_fingerprint: 'fp_' + requestId,
        choices: [{
          index: 0,
-          delta: {content: `</think>\n\n`, type: "think"},
+          delta: {content: `</think>\n\n`, type: 'think'},
          logprobs: null,
-          finish_reason: null
+          finish_reason: 'thinking_end'
        }]
      };
      res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
@@ -638,9 +656,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        system_fingerprint: 'fp_' + requestId,
        choices: [{
          index: 0,
-          delta: {content: '</think>', type: "think"},
+          delta: {content: '</think>', type: 'think'},
          logprobs: null,
-          finish_reason: null
+          finish_reason: 'error'
        }],
        usage,
      };
@@ -655,9 +673,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        system_fingerprint: 'fp_' + requestId,
        choices: [{
          index: 0,
-          delta: {content: errorMessage, type: "error"},
+          delta: {content: errorMessage, type: 'error'},
          logprobs: null,
-          finish_reason: 'stop'
+          finish_reason: 'error'
        }],
        usage
      };
@@ -679,7 +697,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
            type: 'error'
          },
          logprobs: null,
-          finish_reason: 'stop'
+          finish_reason: 'error'
        }],
        usage,
      };
--- a/src/tools/jina-latechunk.ts
+++ b/src/tools/jina-latechunk.ts
@@ -0,0 +1,218 @@
 import {TrackerContext} from "../types";
 import axios from 'axios';
 import {JINA_API_KEY} from "../config";
 import {Schemas} from "../utils/schemas";
 export async function cherryPick(question: string, longContext: string, options: any = {}, trackers: TrackerContext, schemaGen: Schemas) {
  const {
    snippetLength = 2000,
    numSnippets = 2,
    chunkSize = 200,
    maxTokensPerRequest = 8192, // Maximum tokens per embedding request
    // Rough estimate of tokens per character (can be adjusted based on your text)
    tokensPerCharacter = 0.5
  } = options;
  if (longContext.length < snippetLength * numSnippets) {
    // If the context is shorter than the snippet length, return the whole context
    return longContext;
  }
  // Split the longContext into chunks of chunkSize
  const chunks: string[] = [];
  for (let i = 0; i < longContext.length; i += chunkSize) {
    chunks.push(longContext.substring(i, Math.min(i + chunkSize, longContext.length)));
  }
  console.log('late chunking enabled! num chunks:', chunks.length);
  trackers.actionTracker.trackThink('late_chunk', schemaGen.languageCode);
  try {
    // Estimate the number of tokens per chunk
    const estimatedTokensPerChunk = Math.ceil(chunkSize * tokensPerCharacter);
    // Calculate chunks per batch to stay under token limit
    const chunksPerBatch = Math.floor(maxTokensPerRequest / estimatedTokensPerChunk);
    // Create batches of chunks
    const chunkBatches = [];
    for (let i = 0; i < chunks.length; i += chunksPerBatch) {
      chunkBatches.push(chunks.slice(i, i + chunksPerBatch));
    }
    console.log(`Total length ${longContext.length} split ${chunks.length} chunks into ${chunkBatches.length} batches of ~${chunksPerBatch} chunks each`);
    // Process each batch and collect the embeddings
    const allChunkEmbeddings: number[][] = [];
    let totalTokensUsed = 0;
    for (let batchIndex = 0; batchIndex < chunkBatches.length; batchIndex++) {
      const batch = chunkBatches[batchIndex];
      console.log(`Processing batch ${batchIndex + 1}/${chunkBatches.length} with ${batch.length} chunks`);
      // Get embeddings for the current batch
      const batchEmbeddingResponse = await axios.post(
        'https://api.jina.ai/v1/embeddings',
        {
          model: "jina-embeddings-v3",
          task: "retrieval.passage",
          late_chunking: true,
          dimensions: 1024,
          embedding_type: "float",
          input: batch
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${JINA_API_KEY}`
          }
        }
      );
      if (batchEmbeddingResponse.status !== 200) {
        throw new Error(`Unexpected status code from API: ${batchEmbeddingResponse.status}`);
      }
      // Validate response structure
      if (!batchEmbeddingResponse.data?.data) {
        throw new Error("Unexpected API response format");
      }
      // Extract embeddings from this batch
      const batchEmbeddings = batchEmbeddingResponse.data.data.map((item: any) => item.embedding);
      allChunkEmbeddings.push(...batchEmbeddings);
      // Track token usage
      const batchTokens = batchEmbeddingResponse.data.usage?.total_tokens || 0;
      totalTokensUsed += batchTokens;
    }
    // Get embedding for the question
    const questionEmbeddingResponse = await axios.post(
      'https://api.jina.ai/v1/embeddings',
      {
        model: "jina-embeddings-v3",
        task: "retrieval.query",
        dimensions: 1024,
        embedding_type: "float",
        input: [question]
      },
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );
    if (questionEmbeddingResponse.status !== 200) {
      throw new Error("Unexpected status code from API");
    }
    // Validate question embedding response
    if (!questionEmbeddingResponse.data?.data || !questionEmbeddingResponse.data.data[0]?.embedding) {
      throw new Error("Question embedding not found in API response");
    }
    // Track token usage for question embedding
    const questionTokens = questionEmbeddingResponse.data.usage?.total_tokens || 0;
    totalTokensUsed += questionTokens;
    // Track total token usage
    trackers.tokenTracker.trackUsage('latechunk', {
      promptTokens: totalTokensUsed,
      completionTokens: 0,
      totalTokens: totalTokensUsed
    });
    const questionEmbedding = questionEmbeddingResponse.data.data[0].embedding;
    // Verify that we got embeddings for all chunks
    if (allChunkEmbeddings.length !== chunks.length) {
      console.error(`Got ${allChunkEmbeddings.length} embeddings for ${chunks.length} chunks`);
    }
    // Calculate cosine similarity between the question and each chunk
    const similarities = allChunkEmbeddings.map((chunkEmbed: number[]) => {
      return cosineSimilarity(questionEmbedding, chunkEmbed);
    });
    // Calculate the number of chunks needed for a single snippet
    const chunksPerSnippet = Math.ceil(snippetLength / chunkSize);
    // Find the top `numSnippets` snippets with highest average similarity
    const snippets: string[] = [];
    // Create a copy of similarities to avoid modifying the original
    const similaritiesCopy = [...similarities];
    for (let i = 0; i < numSnippets; i++) {
      // Find the best starting position for the snippet
      let bestStartIndex = 0;
      let bestScore = -Infinity;
      // Check each possible starting position for a snippet
      for (let j = 0; j <= similarities.length - chunksPerSnippet; j++) {
        // Calculate the average similarity for the current window
        const windowScores = similaritiesCopy.slice(j, j + chunksPerSnippet);
        const windowScore = windowScores.reduce((sum, score) => sum + score, 0) / windowScores.length;
        if (windowScore > bestScore) {
          bestScore = windowScore;
          bestStartIndex = j;
        }
      }
      // Extract the snippet text
      const startIndex = bestStartIndex * chunkSize;
      const endIndex = Math.min(startIndex + snippetLength, longContext.length);
      snippets.push(longContext.substring(startIndex, endIndex));
      // Mark the used chunks with a very low score to avoid reusing them
      for (let k = bestStartIndex; k < bestStartIndex + chunksPerSnippet && k < similaritiesCopy.length; k++) {
        similaritiesCopy[k] = -Infinity;
      }
    }
    // wrap with <snippet-index> tag
    return snippets.map((snippet, index) => `
 <snippet-${index+1}>
 ${snippet}
 </snippet-${index+1}>`.trim()).join("\n\n");
  } catch (error) {
    console.error('Error in late chunking:', error);
    // Fallback: just return the beginning of the context up to the desired length
    return longContext.substring(0, snippetLength * numSnippets);
  }
 }
 // Function to calculate cosine similarity between two vectors
 function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
  if (vectorA.length !== vectorB.length) {
    throw new Error("Vectors must have the same length");
  }
  let dotProduct = 0;
  let magnitudeA = 0;
  let magnitudeB = 0;
  for (let i = 0; i < vectorA.length; i++) {
    dotProduct += vectorA[i] * vectorB[i];
    magnitudeA += vectorA[i] * vectorA[i];
    magnitudeB += vectorB[i] * vectorB[i];
  }
  magnitudeA = Math.sqrt(magnitudeA);
  magnitudeB = Math.sqrt(magnitudeB);
  if (magnitudeA === 0 || magnitudeB === 0) {
    return 0;
  }
  return dotProduct / (magnitudeA * magnitudeB);
 }
--- a/src/tools/read.ts
+++ b/src/tools/read.ts
@@ -1,7 +1,7 @@
 import https from 'https';
-import { TokenTracker } from "../utils/token-tracker";
+import {TokenTracker} from "../utils/token-tracker";
-import { ReadResponse } from '../types';
+import {ReadResponse} from '../types';
-import { JINA_API_KEY } from "../config";
+import {JINA_API_KEY} from "../config";
 export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
  return new Promise((resolve, reject) => {
@@ -10,13 +10,15 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
      return;
    }
-    const data = JSON.stringify({ url });
+    const data = JSON.stringify({url});
    const headers: Record<string, any> = {
-        'Accept': 'application/json',
+      'Accept': 'application/json',
-        'Authorization': `Bearer ${JINA_API_KEY}`,
+      'Authorization': `Bearer ${JINA_API_KEY}`,
-        'Content-Type': 'application/json',
+      'Content-Type': 'application/json',
-        'X-Retain-Images': 'none',
+      'X-Retain-Images': 'none',
-      };
+      'X-Md-Link-Style': 'discarded',
      'X-Timeout': '20'
    };
    if (withAllLinks) {
      headers['X-With-Links-Summary'] = 'all'
    }
@@ -75,12 +77,12 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
        const tokens = response.data.usage?.tokens || 0;
        const tokenTracker = tracker || new TokenTracker();
        tokenTracker.trackUsage('read', {
-            totalTokens: tokens,
+          totalTokens: tokens,
-            promptTokens: url.length,
+          promptTokens: url.length,
-            completionTokens: tokens
+          completionTokens: tokens
        });
-        resolve({ response });
+        resolve({response});
      });
    });
--- a/src/types.ts
+++ b/src/types.ts
@@ -233,7 +233,7 @@ export interface ChatCompletionResponse {
      type: 'text' | 'think' | 'json' | 'error';
    };
    logprobs: null;
-    finish_reason: 'stop';
+    finish_reason: 'stop' | 'error';
  }>;
  usage: {
    prompt_tokens: number;
@@ -256,9 +256,10 @@ export interface ChatCompletionChunk {
      role?: 'assistant';
      content?: string;
      type?: 'text' | 'think' | 'json' | 'error';
      url?: string;
    };
    logprobs: null;
-    finish_reason: null | 'stop';
+    finish_reason: null | 'stop' | 'thinking_end' | 'error';
  }>;
  usage?: any;
  visitedURLs?: string[];
--- a/src/utils/i18n.json
+++ b/src/utils/i18n.json
@@ -3,84 +3,98 @@
    "eval_first": "But wait, let me evaluate the answer first.",
    "search_for": "Let me search for ${keywords} to gather more information.",
    "read_for": "Let me read ${urls} to gather more information.",
-    "read_for_verify": "Let me fetch the source content to verify the answer."
+    "read_for_verify": "Let me fetch the source content to verify the answer.",
    "late_chunk": "Source is too long, I'm cherry-picking the relevant parts."
  },
  "zh-CN": {
    "eval_first": "等等，让我先自己评估一下答案。",
    "search_for": "让我搜索${keywords}来获取更多信息。",
    "read_for": "让我读取网页${urls}来获取更多信息。",
-    "read_for_verify": "让我读取源网页内容来验证答案。"
+    "read_for_verify": "让我读取源网页内容来验证答案。",
    "late_chunk": "源内容太长，我正在挑选相关部分。"
  },
  "zh-TW": {
    "eval_first": "等等，讓我先評估一下答案。",
    "search_for": "讓我搜索${keywords}來獲取更多信息。",
    "read_for": "讓我閱讀${urls}來獲取更多信息。",
-    "read_for_verify": "讓我獲取源內容來驗證答案。"
+    "read_for_verify": "讓我獲取源內容來驗證答案。",
    "late_chunk": "源內容太長，我正在挑選相關部分。"
  },
  "ja": {
    "eval_first": "ちょっと待って、まず答えを評価します。",
    "search_for": "キーワード${keywords}で検索して、情報を集めます。",
    "read_for": "URL${urls}を読んで、情報を集めます。",
-    "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。"
+    "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
    "late_chunk": "ソースが長すぎるため、関連部分を抜粋しています。"
  },
  "ko": {
    "eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
    "search_for": "키워드 ${keywords}로 검색하여 더 많은 정보를 수집하겠습니다.",
    "read_for": "URL ${urls}을 읽어 더 많은 정보를 수집하겠습니다.",
-    "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다."
+    "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
    "late_chunk": "소스가 너무 길어서 관련 부분만 추출하고 있습니다."
  },
  "fr": {
    "eval_first": "Un instant, je vais d'abord évaluer la réponse.",
    "search_for": "Je vais rechercher ${keywords} pour obtenir plus d'informations.",
    "read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
-    "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse."
+    "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
    "late_chunk": "La source est trop longue, je sélectionne les parties pertinentes."
  },
  "de": {
    "eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
    "search_for": "Ich werde nach ${keywords} suchen, um weitere Informationen zu sammeln.",
    "read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
-    "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen."
+    "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
    "late_chunk": "Die Quelle ist zu lang, ich wähle die relevanten Teile aus."
  },
  "es": {
    "eval_first": "Un momento, voy a evaluar la respuesta primero.",
    "search_for": "Voy a buscar ${keywords} para recopilar más información.",
    "read_for": "Voy a leer ${urls} para recopilar más información.",
-    "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta."
+    "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
    "late_chunk": "La fuente es demasiado larga, estoy seleccionando las partes relevantes."
  },
  "it": {
    "eval_first": "Un attimo, valuterò prima la risposta.",
    "search_for": "Cercherò ${keywords} per raccogliere ulteriori informazioni.",
    "read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
-    "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta."
+    "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
    "late_chunk": "La fonte è troppo lunga, sto selezionando le parti rilevanti."
  },
  "pt": {
    "eval_first": "Um momento, vou avaliar a resposta primeiro.",
    "search_for": "Vou pesquisar ${keywords} para reunir mais informações.",
    "read_for": "Vou ler ${urls} para reunir mais informações.",
-    "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta."
+    "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
    "late_chunk": "A fonte é muito longa, estou selecionando as partes relevantes."
  },
  "ru": {
    "eval_first": "Подождите, я сначала оценю ответ.",
    "search_for": "Дайте мне поискать ${keywords} для сбора дополнительной информации.",
    "read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
-    "read_for_verify": "Дайте мне получить исходный контент для проверки ответа."
+    "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
    "late_chunk": "Источник слишком длинный, я выбираю только значимые части."
  },
  "ar": {
    "eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
    "search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
    "read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
-    "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة."
+    "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
    "late_chunk": "المصدر طويل جدًا، أنا أختار الأجزاء ذات الصلة."
  },
  "nl": {
    "eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
    "search_for": "Ik zal zoeken naar ${keywords} om meer informatie te verzamelen.",
    "read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
-    "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren."
+    "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
    "late_chunk": "De bron is te lang, ik selecteer de relevante delen."
  },
  "zh": {
    "eval_first": "等等，让我先评估一下答案。",
    "search_for": "让我搜索${keywords}来获取更多信息。",
    "read_for": "让我阅读${urls}来获取更多信息。",
-    "read_for_verify": "让我获取源内容来验证答案。"
+    "read_for_verify": "让我获取源内容来验证答案。",
    "late_chunk": "源内容太长，我正在挑选相关部分。"
  }
 }
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@@ -1,7 +1,9 @@
-import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types";
+import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
-import {removeAllLineBreaks, smartMergeStrings} from "./text-tools";
+import {smartMergeStrings} from "./text-tools";
 import {rerankDocuments} from "../tools/jina-rerank";
 import {readUrl} from "../tools/read";
 import {Schemas} from "./schemas";
 import {cherryPick} from "../tools/jina-latechunk";
 export function normalizeUrl(urlString: string, debug = false, options = {
  removeAnchors: true,
@@ -390,7 +392,8 @@ export async function processURLs(
  allKnowledge: KnowledgeItem[],
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
-  languageCode: string
+  schemaGen: Schemas,
  question: string
 ): Promise<{urlResults: any[], success: boolean}> {
  // Skip if no URLs to process
  if (urls.length === 0) {
@@ -398,7 +401,7 @@ export async function processURLs(
  }
  // Track the reading action
-  context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')});
+  context.actionTracker.trackThink('read_for', schemaGen.languageCode, {urls: urls.join(', ')});
  // Process each URL in parallel
  const urlResults = await Promise.all(
@@ -407,7 +410,10 @@ export async function processURLs(
        const {response} = await readUrl(url, true, context.tokenTracker);
        const {data} = response;
        const guessedTime = await getLastModified(url);
-        console.log('Guessed time for', url, guessedTime);
+        if (guessedTime) {
          console.log('Guessed time for', url, guessedTime);
        }
        // Early return if no valid data
        if (!data?.url || !data?.content) {
@@ -417,7 +423,7 @@ export async function processURLs(
        // Add to knowledge base
        allKnowledge.push({
          question: `What do expert say about "${data.title}"?`,
-          answer: removeAllLineBreaks(data.content),
+          answer: await cherryPick(question, data.content, {}, context, schemaGen),
          references: [data.url],
          type: 'url',
          updated: guessedTime