feat: late chunking

Han Xiao 2025-03-12 14:07:11 +08:00
parent c8fc259dff
commit 013056f218
7 changed files with 308 additions and 47 deletions

View File

@@ -434,7 +434,8 @@ export async function getResponse(question?: string,
allKnowledge,
allURLs,
visitedURLs,
SchemaGen.languageCode
SchemaGen,
currentQuestion
);
}
@@ -701,7 +702,8 @@ You decided to think out of the box or cut from a completely different angle.
allKnowledge,
allURLs,
visitedURLs,
SchemaGen.languageCode
SchemaGen,
currentQuestion
);
diaryContext.push(success

View File

@@ -7,7 +7,7 @@ import {
ChatCompletionResponse,
ChatCompletionChunk,
AnswerAction,
Model, StepAction
Model, StepAction, VisitAction
} from './types';
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
@@ -337,7 +337,7 @@ async function processQueue(streamingState: StreamingState, res: Response, reque
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {content: word, type: "think"},
delta: {content: word, type: 'think'},
logprobs: null,
finish_reason: null
}]
@@ -475,7 +475,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {role: 'assistant', content: '<think>', type: "text"},
delta: {role: 'assistant', content: '<think>', type: 'think'},
logprobs: null,
finish_reason: null
}]
@@ -485,6 +485,24 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
// Set up progress listener with cleanup
const actionListener = async (step: StepAction) => {
// Add content to queue for both thinking steps and final answer
if (step.action === 'visit') {
(step as VisitAction).URLTargets.forEach((url) => {
const chunk: ChatCompletionChunk = {
id: requestId,
object: 'chat.completion.chunk',
created,
model: body.model,
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {type: 'think', url},
logprobs: null,
finish_reason: null,
}]
};
res.write(`data: ${JSON.stringify(chunk)}\n\n`);
});
}
if (step.think) {
// make sure the thinking text ends with a space
const content = step.think + ' ';
@@ -548,9 +566,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {content: `</think>\n\n`, type: "think"},
delta: {content: `</think>\n\n`, type: 'think'},
logprobs: null,
finish_reason: null
finish_reason: 'thinking_end'
}]
};
res.write(`data: ${JSON.stringify(closeThinkChunk)}\n\n`);
@@ -638,9 +656,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {content: '</think>', type: "think"},
delta: {content: '</think>', type: 'think'},
logprobs: null,
finish_reason: null
finish_reason: 'error'
}],
usage,
};
@@ -655,9 +673,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
system_fingerprint: 'fp_' + requestId,
choices: [{
index: 0,
delta: {content: errorMessage, type: "error"},
delta: {content: errorMessage, type: 'error'},
logprobs: null,
finish_reason: 'stop'
finish_reason: 'error'
}],
usage
};
@@ -679,7 +697,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
type: 'error'
},
logprobs: null,
finish_reason: 'stop'
finish_reason: 'error'
}],
usage,
};
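For context, the streaming changes above alter what an SSE consumer sees: visit steps now arrive as `think` deltas carrying a `url`, and the closing and error chunks use the new `finish_reason` values. Below is a minimal client-side sketch of handling those frames; `DeltaChunk` and `handleSseData` are hypothetical names, and only the fields visible in this diff are taken from the source.

```typescript
// Minimal sketch of a client reading the new stream fields; shapes beyond this diff are assumed.
interface DeltaChunk {
  choices: Array<{
    delta: { content?: string; type?: 'text' | 'think' | 'json' | 'error'; url?: string };
    finish_reason: null | 'stop' | 'thinking_end' | 'error';
  }>;
}

function handleSseData(line: string): void {
  if (!line.startsWith('data: ')) return;                    // skip comments and heartbeats
  const chunk = JSON.parse(line.slice('data: '.length)) as DeltaChunk;
  for (const choice of chunk.choices) {
    if (choice.delta.url) {
      console.log('visiting:', choice.delta.url);            // per-URL 'think' delta added in this commit
    } else if (choice.delta.content) {
      process.stdout.write(choice.delta.content);
    }
    if (choice.finish_reason === 'thinking_end') {
      console.log('\n--- reasoning closed, answer follows ---');
    } else if (choice.finish_reason === 'error') {
      console.error('\nstream ended with an error');
    }
  }
}
```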

src/tools/jina-latechunk.ts (new file, 218 lines added)
View File

@@ -0,0 +1,218 @@
import {TrackerContext} from "../types";
import axios from 'axios';
import {JINA_API_KEY} from "../config";
import {Schemas} from "../utils/schemas";
export async function cherryPick(question: string, longContext: string, options: any = {}, trackers: TrackerContext, schemaGen: Schemas) {
const {
snippetLength = 2000,
numSnippets = 2,
chunkSize = 200,
maxTokensPerRequest = 8192, // Maximum tokens per embedding request
// Rough estimate of tokens per character (can be adjusted based on your text)
tokensPerCharacter = 0.5
} = options;
if (longContext.length < snippetLength * numSnippets) {
// If the context is shorter than the snippet length, return the whole context
return longContext;
}
// Split the longContext into chunks of chunkSize
const chunks: string[] = [];
for (let i = 0; i < longContext.length; i += chunkSize) {
chunks.push(longContext.substring(i, Math.min(i + chunkSize, longContext.length)));
}
console.log('late chunking enabled! num chunks:', chunks.length);
trackers.actionTracker.trackThink('late_chunk', schemaGen.languageCode);
try {
// Estimate the number of tokens per chunk
const estimatedTokensPerChunk = Math.ceil(chunkSize * tokensPerCharacter);
// Calculate chunks per batch to stay under token limit
const chunksPerBatch = Math.floor(maxTokensPerRequest / estimatedTokensPerChunk);
// Create batches of chunks
const chunkBatches = [];
for (let i = 0; i < chunks.length; i += chunksPerBatch) {
chunkBatches.push(chunks.slice(i, i + chunksPerBatch));
}
console.log(`Total length ${longContext.length} split ${chunks.length} chunks into ${chunkBatches.length} batches of ~${chunksPerBatch} chunks each`);
// Process each batch and collect the embeddings
const allChunkEmbeddings: number[][] = [];
let totalTokensUsed = 0;
for (let batchIndex = 0; batchIndex < chunkBatches.length; batchIndex++) {
const batch = chunkBatches[batchIndex];
console.log(`Processing batch ${batchIndex + 1}/${chunkBatches.length} with ${batch.length} chunks`);
// Get embeddings for the current batch
const batchEmbeddingResponse = await axios.post(
'https://api.jina.ai/v1/embeddings',
{
model: "jina-embeddings-v3",
task: "retrieval.passage",
late_chunking: true,
dimensions: 1024,
embedding_type: "float",
input: batch
},
{
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`
}
}
);
if (batchEmbeddingResponse.status !== 200) {
throw new Error(`Unexpected status code from API: ${batchEmbeddingResponse.status}`);
}
// Validate response structure
if (!batchEmbeddingResponse.data?.data) {
throw new Error("Unexpected API response format");
}
// Extract embeddings from this batch
const batchEmbeddings = batchEmbeddingResponse.data.data.map((item: any) => item.embedding);
allChunkEmbeddings.push(...batchEmbeddings);
// Track token usage
const batchTokens = batchEmbeddingResponse.data.usage?.total_tokens || 0;
totalTokensUsed += batchTokens;
}
// Get embedding for the question
const questionEmbeddingResponse = await axios.post(
'https://api.jina.ai/v1/embeddings',
{
model: "jina-embeddings-v3",
task: "retrieval.query",
dimensions: 1024,
embedding_type: "float",
input: [question]
},
{
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`
}
}
);
if (questionEmbeddingResponse.status !== 200) {
throw new Error("Unexpected status code from API");
}
// Validate question embedding response
if (!questionEmbeddingResponse.data?.data || !questionEmbeddingResponse.data.data[0]?.embedding) {
throw new Error("Question embedding not found in API response");
}
// Track token usage for question embedding
const questionTokens = questionEmbeddingResponse.data.usage?.total_tokens || 0;
totalTokensUsed += questionTokens;
// Track total token usage
trackers.tokenTracker.trackUsage('latechunk', {
promptTokens: totalTokensUsed,
completionTokens: 0,
totalTokens: totalTokensUsed
});
const questionEmbedding = questionEmbeddingResponse.data.data[0].embedding;
// Verify that we got embeddings for all chunks
if (allChunkEmbeddings.length !== chunks.length) {
console.error(`Got ${allChunkEmbeddings.length} embeddings for ${chunks.length} chunks`);
}
// Calculate cosine similarity between the question and each chunk
const similarities = allChunkEmbeddings.map((chunkEmbed: number[]) => {
return cosineSimilarity(questionEmbedding, chunkEmbed);
});
// Calculate the number of chunks needed for a single snippet
const chunksPerSnippet = Math.ceil(snippetLength / chunkSize);
// Find the top `numSnippets` snippets with highest average similarity
const snippets: string[] = [];
// Create a copy of similarities to avoid modifying the original
const similaritiesCopy = [...similarities];
for (let i = 0; i < numSnippets; i++) {
// Find the best starting position for the snippet
let bestStartIndex = 0;
let bestScore = -Infinity;
// Check each possible starting position for a snippet
for (let j = 0; j <= similarities.length - chunksPerSnippet; j++) {
// Calculate the average similarity for the current window
const windowScores = similaritiesCopy.slice(j, j + chunksPerSnippet);
const windowScore = windowScores.reduce((sum, score) => sum + score, 0) / windowScores.length;
if (windowScore > bestScore) {
bestScore = windowScore;
bestStartIndex = j;
}
}
// Extract the snippet text
const startIndex = bestStartIndex * chunkSize;
const endIndex = Math.min(startIndex + snippetLength, longContext.length);
snippets.push(longContext.substring(startIndex, endIndex));
// Mark the used chunks with a very low score to avoid reusing them
for (let k = bestStartIndex; k < bestStartIndex + chunksPerSnippet && k < similaritiesCopy.length; k++) {
similaritiesCopy[k] = -Infinity;
}
}
// wrap with <snippet-index> tag
return snippets.map((snippet, index) => `
<snippet-${index+1}>
${snippet}
</snippet-${index+1}>`.trim()).join("\n\n");
} catch (error) {
console.error('Error in late chunking:', error);
// Fallback: just return the beginning of the context up to the desired length
return longContext.substring(0, snippetLength * numSnippets);
}
}
// Function to calculate cosine similarity between two vectors
function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
if (vectorA.length !== vectorB.length) {
throw new Error("Vectors must have the same length");
}
let dotProduct = 0;
let magnitudeA = 0;
let magnitudeB = 0;
for (let i = 0; i < vectorA.length; i++) {
dotProduct += vectorA[i] * vectorB[i];
magnitudeA += vectorA[i] * vectorA[i];
magnitudeB += vectorB[i] * vectorB[i];
}
magnitudeA = Math.sqrt(magnitudeA);
magnitudeB = Math.sqrt(magnitudeB);
if (magnitudeA === 0 || magnitudeB === 0) {
return 0;
}
return dotProduct / (magnitudeA * magnitudeB);
}
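In short, cherryPick splits the long context into fixed-size chunks, embeds them in batches with `late_chunking: true` (so each chunk embedding is computed together with its neighbours in the same request), embeds the question as a `retrieval.query`, and then greedily picks the highest-scoring non-overlapping windows of `chunksPerSnippet` chunks. That selection step can be exercised on its own; the sketch below uses a hypothetical standalone helper, `pickTopWindows`, that mirrors the greedy masking logic above, assuming the similarity scores are already computed.

```typescript
// Hypothetical standalone helper mirroring cherryPick's greedy window selection.
function pickTopWindows(similarities: number[], chunksPerSnippet: number, numSnippets: number): number[] {
  const scores = [...similarities];              // copy so chosen chunks can be masked out
  const starts: number[] = [];
  for (let i = 0; i < numSnippets; i++) {
    let bestStart = 0;
    let bestScore = -Infinity;
    for (let j = 0; j <= scores.length - chunksPerSnippet; j++) {
      const window = scores.slice(j, j + chunksPerSnippet);
      const avg = window.reduce((sum, s) => sum + s, 0) / window.length;
      if (avg > bestScore) {
        bestScore = avg;
        bestStart = j;
      }
    }
    starts.push(bestStart);
    // mask the chosen window so later snippets cannot overlap it
    for (let k = bestStart; k < bestStart + chunksPerSnippet && k < scores.length; k++) {
      scores[k] = -Infinity;
    }
  }
  return starts;                                 // chunk indices; multiply by chunkSize for character offsets
}

// With chunkSize = 200 and snippetLength = 400, chunksPerSnippet is 2:
console.log(pickTopWindows([0.1, 0.8, 0.9, 0.2, 0.7, 0.75], 2, 2)); // [1, 4]
```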

View File

@@ -1,7 +1,7 @@
import https from 'https';
import { TokenTracker } from "../utils/token-tracker";
import { ReadResponse } from '../types';
import { JINA_API_KEY } from "../config";
import {TokenTracker} from "../utils/token-tracker";
import {ReadResponse} from '../types';
import {JINA_API_KEY} from "../config";
export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
return new Promise((resolve, reject) => {
@@ -10,13 +10,15 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
return;
}
const data = JSON.stringify({ url });
const data = JSON.stringify({url});
const headers: Record<string, any> = {
'Accept': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`,
'Content-Type': 'application/json',
'X-Retain-Images': 'none',
};
'Accept': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`,
'Content-Type': 'application/json',
'X-Retain-Images': 'none',
'X-Md-Link-Style': 'discarded',
'X-Timeout': '20'
};
if (withAllLinks) {
headers['X-With-Links-Summary'] = 'all'
}
@@ -75,12 +77,12 @@ export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTrac
const tokens = response.data.usage?.tokens || 0;
const tokenTracker = tracker || new TokenTracker();
tokenTracker.trackUsage('read', {
totalTokens: tokens,
promptTokens: url.length,
completionTokens: tokens
totalTokens: tokens,
promptTokens: url.length,
completionTokens: tokens
});
resolve({ response });
resolve({response});
});
});
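The header changes above add `X-Md-Link-Style: discarded` and a 20-second `X-Timeout` to every reader call. As a standalone illustration, roughly the same request could be issued with `fetch` as shown below; the `https://r.jina.ai/` endpoint and the exact header semantics are assumptions here, and `readUrlSketch` is a hypothetical helper, not the repo's `readUrl`.

```typescript
// Standalone sketch of the reader request; endpoint and header semantics are assumptions.
async function readUrlSketch(url: string, apiKey: string, withAllLinks = false): Promise<unknown> {
  const headers: Record<string, string> = {
    'Accept': 'application/json',
    'Authorization': `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
    'X-Retain-Images': 'none',       // keep images out of the returned content
    'X-Md-Link-Style': 'discarded',  // added in this commit: drop links from the returned markdown
    'X-Timeout': '20',               // added in this commit: give up on slow pages after ~20s
  };
  if (withAllLinks) headers['X-With-Links-Summary'] = 'all';

  const res = await fetch('https://r.jina.ai/', {  // assumed reader endpoint
    method: 'POST',
    headers,
    body: JSON.stringify({ url }),
  });
  if (!res.ok) throw new Error(`reader returned status ${res.status}`);
  return res.json();
}
```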

View File

@@ -233,7 +233,7 @@ export interface ChatCompletionResponse {
type: 'text' | 'think' | 'json' | 'error';
};
logprobs: null;
finish_reason: 'stop';
finish_reason: 'stop' | 'error';
}>;
usage: {
prompt_tokens: number;
@@ -256,9 +256,10 @@ export interface ChatCompletionChunk {
role?: 'assistant';
content?: string;
type?: 'text' | 'think' | 'json' | 'error';
url?: string;
};
logprobs: null;
finish_reason: null | 'stop';
finish_reason: null | 'stop' | 'thinking_end' | 'error';
}>;
usage?: any;
visitedURLs?: string[];

View File

@@ -3,84 +3,98 @@
"eval_first": "But wait, let me evaluate the answer first.",
"search_for": "Let me search for ${keywords} to gather more information.",
"read_for": "Let me read ${urls} to gather more information.",
"read_for_verify": "Let me fetch the source content to verify the answer."
"read_for_verify": "Let me fetch the source content to verify the answer.",
"late_chunk": "Source is too long, I'm cherry-picking the relevant parts."
},
"zh-CN": {
"eval_first": "等等,让我先自己评估一下答案。",
"search_for": "让我搜索${keywords}来获取更多信息。",
"read_for": "让我读取网页${urls}来获取更多信息。",
"read_for_verify": "让我读取源网页内容来验证答案。"
"read_for_verify": "让我读取源网页内容来验证答案。",
"late_chunk": "源内容太长,我正在挑选相关部分。"
},
"zh-TW": {
"eval_first": "等等,讓我先評估一下答案。",
"search_for": "讓我搜索${keywords}來獲取更多信息。",
"read_for": "讓我閱讀${urls}來獲取更多信息。",
"read_for_verify": "讓我獲取源內容來驗證答案。"
"read_for_verify": "讓我獲取源內容來驗證答案。",
"late_chunk": "源內容太長,我正在挑選相關部分。"
},
"ja": {
"eval_first": "ちょっと待って、まず答えを評価します。",
"search_for": "キーワード${keywords}で検索して、情報を集めます。",
"read_for": "URL${urls}を読んで、情報を集めます。",
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。"
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
"late_chunk": "ソースが長すぎるため、関連部分を抜粋しています。"
},
"ko": {
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
"search_for": "키워드 ${keywords}로 검색하여 더 많은 정보를 수집하겠습니다.",
"read_for": "URL ${urls}을 읽어 더 많은 정보를 수집하겠습니다.",
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다."
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
"late_chunk": "소스가 너무 길어서 관련 부분만 추출하고 있습니다."
},
"fr": {
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
"search_for": "Je vais rechercher ${keywords} pour obtenir plus d'informations.",
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse."
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
"late_chunk": "La source est trop longue, je sélectionne les parties pertinentes."
},
"de": {
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
"search_for": "Ich werde nach ${keywords} suchen, um weitere Informationen zu sammeln.",
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen."
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
"late_chunk": "Die Quelle ist zu lang, ich wähle die relevanten Teile aus."
},
"es": {
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
"search_for": "Voy a buscar ${keywords} para recopilar más información.",
"read_for": "Voy a leer ${urls} para recopilar más información.",
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta."
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
"late_chunk": "La fuente es demasiado larga, estoy seleccionando las partes relevantes."
},
"it": {
"eval_first": "Un attimo, valuterò prima la risposta.",
"search_for": "Cercherò ${keywords} per raccogliere ulteriori informazioni.",
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta."
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
"late_chunk": "La fonte è troppo lunga, sto selezionando le parti rilevanti."
},
"pt": {
"eval_first": "Um momento, vou avaliar a resposta primeiro.",
"search_for": "Vou pesquisar ${keywords} para reunir mais informações.",
"read_for": "Vou ler ${urls} para reunir mais informações.",
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta."
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
"late_chunk": "A fonte é muito longa, estou selecionando as partes relevantes."
},
"ru": {
"eval_first": "Подождите, я сначала оценю ответ.",
"search_for": "Дайте мне поискать ${keywords} для сбора дополнительной информации.",
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа."
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
"late_chunk": "Источник слишком длинный, я выбираю только значимые части."
},
"ar": {
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
"search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة."
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
"late_chunk": "المصدر طويل جدًا، أنا أختار الأجزاء ذات الصلة."
},
"nl": {
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
"search_for": "Ik zal zoeken naar ${keywords} om meer informatie te verzamelen.",
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren."
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
"late_chunk": "De bron is te lang, ik selecteer de relevante delen."
},
"zh": {
"eval_first": "等等,让我先评估一下答案。",
"search_for": "让我搜索${keywords}来获取更多信息。",
"read_for": "让我阅读${urls}来获取更多信息。",
"read_for_verify": "让我获取源内容来验证答案。"
"read_for_verify": "让我获取源内容来验证答案。",
"late_chunk": "源内容太长,我正在挑选相关部分。"
}
}

View File

@@ -1,7 +1,9 @@
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext} from "../types";
import {removeAllLineBreaks, smartMergeStrings} from "./text-tools";
import {BoostedSearchSnippet, KnowledgeItem, SearchResult, SearchSnippet, TrackerContext, VisitAction} from "../types";
import {smartMergeStrings} from "./text-tools";
import {rerankDocuments} from "../tools/jina-rerank";
import {readUrl} from "../tools/read";
import {Schemas} from "./schemas";
import {cherryPick} from "../tools/jina-latechunk";
export function normalizeUrl(urlString: string, debug = false, options = {
removeAnchors: true,
@@ -390,7 +392,8 @@ export async function processURLs(
allKnowledge: KnowledgeItem[],
allURLs: Record<string, SearchSnippet>,
visitedURLs: string[],
languageCode: string
schemaGen: Schemas,
question: string
): Promise<{urlResults: any[], success: boolean}> {
// Skip if no URLs to process
if (urls.length === 0) {
@@ -398,7 +401,7 @@
}
// Track the reading action
context.actionTracker.trackThink('read_for', languageCode, {urls: urls.join(', ')});
context.actionTracker.trackThink('read_for', schemaGen.languageCode, {urls: urls.join(', ')});
// Process each URL in parallel
const urlResults = await Promise.all(
@@ -407,7 +410,7 @@
const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response;
const guessedTime = await getLastModified(url);
console.log('Guessed time for', url, guessedTime);
if (guessedTime) {
console.log('Guessed time for', url, guessedTime);
}
// Early return if no valid data
if (!data?.url || !data?.content) {
@@ -417,7 +423,7 @@
// Add to knowledge base
allKnowledge.push({
question: `What do expert say about "${data.title}"?`,
answer: removeAllLineBreaks(data.content),
answer: await cherryPick(question, data.content, {}, context, schemaGen),
references: [data.url],
type: 'url',
updated: guessedTime