From 5f780435ac9a3d3c3a2ff7ed57122846c60924a8 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Wed, 12 Mar 2025 14:21:52 +0800 Subject: [PATCH] feat: late chunking --- src/tools/jina-latechunk.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/jina-latechunk.ts b/src/tools/jina-latechunk.ts index 42e7585..14554bb 100644 --- a/src/tools/jina-latechunk.ts +++ b/src/tools/jina-latechunk.ts @@ -6,15 +6,15 @@ import {Schemas} from "../utils/schemas"; export async function cherryPick(question: string, longContext: string, options: any = {}, trackers: TrackerContext, schemaGen: Schemas) { const { - snippetLength = 2000, - numSnippets = 2, + snippetLength = 3000, + numSnippets = Math.max(2, Math.min(5, Math.floor(longContext.length / snippetLength))), chunkSize = 200, maxTokensPerRequest = 8192, // Maximum tokens per embedding request // Rough estimate of tokens per character (can be adjusted based on your text) tokensPerCharacter = 0.5 } = options; - if (longContext.length < snippetLength * numSnippets) { + if (longContext.length < snippetLength * 2) { // If the context is shorter than the snippet length, return the whole context return longContext; }