From f9cbc4008c1e9c1386e45f893a07b5f95c3b5c19 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Fri, 7 Feb 2025 16:39:01 +0800 Subject: [PATCH] feat: improve dedup with jina embeddings --- README.md | 22 +++++++++++----------- src/tools/jina-dedup.ts | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5329c52..f08fbb5 100644 --- a/README.md +++ b/README.md @@ -238,14 +238,14 @@ Plain `gemini-2.0-flash` can be run by setting `tokenBudget` to zero, skipping t It should not be surprised that plain `gemini-2.0-flash` has a 0% pass rate, as I intentionally filtered out the questions that LLMs can answer. -| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch | -|--------|------------------|--------------------------------------| -| Pass Rate | 0% | 60% | -| Average Steps | 1 | 6 | -| Maximum Steps | 1 | 21 | -| Minimum Steps | 1 | 2 | -| Median Steps | 1 | 3 | -| Average Tokens | 428 | 67,650 | -| Median Tokens | 434 | 19,800 | -| Maximum Tokens | 463 | 374,903 | -| Minimum Tokens | 374 | 7,347 | \ No newline at end of file +| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch (#5e80ed4) | +|--------|------------------|-------------------------------------------------| +| Pass Rate | 0% | 60% | +| Average Steps | 1 | 5 | +| Maximum Steps | 1 | 13 | +| Minimum Steps | 1 | 2 | +| Median Steps | 1 | 3 | +| Average Tokens | 428 | 59,408 | +| Median Tokens | 434 | 16,001 | +| Maximum Tokens | 463 | 347,222 | +| Minimum Tokens | 374 | 5,594 | \ No newline at end of file diff --git a/src/tools/jina-dedup.ts b/src/tools/jina-dedup.ts index 5e0b23a..b0ed7c2 100644 --- a/src/tools/jina-dedup.ts +++ b/src/tools/jina-dedup.ts @@ -3,7 +3,7 @@ import { TokenTracker } from "../utils/token-tracker"; import {JINA_API_KEY} from "../config"; const JINA_API_URL = 'https://api.jina.ai/v1/embeddings'; -const SIMILARITY_THRESHOLD = 0.90; // Adjustable threshold for cosine similarity +const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity // Types for Jina API interface JinaEmbeddingRequest {