feat: improve dedup with jina embeddings

This commit is contained in:
Han Xiao
2025-02-07 16:39:01 +08:00
parent 5e80ed40ea
commit f9cbc4008c
2 changed files with 12 additions and 12 deletions

View File

@@ -238,14 +238,14 @@ Plain `gemini-2.0-flash` can be run by setting `tokenBudget` to zero, skipping t
It should not be surprising that plain `gemini-2.0-flash` has a 0% pass rate, as I intentionally filtered out the questions that LLMs can answer.
| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch |
|--------|------------------|--------------------------------------|
| Pass Rate | 0% | 60% |
| Average Steps | 1 | 6 |
| Maximum Steps | 1 | 21 |
| Minimum Steps | 1 | 2 |
| Median Steps | 1 | 3 |
| Average Tokens | 428 | 67,650 |
| Median Tokens | 434 | 19,800 |
| Maximum Tokens | 463 | 374,903 |
| Minimum Tokens | 374 | 7,347 |
| Metric | gemini-2.0-flash | gemini-2.0-flash + node-deepresearch #5e80ed4 |
|--------|------------------|-------------------------------------------------|
| Pass Rate | 0% | 60% |
| Average Steps | 1 | 5 |
| Maximum Steps | 1 | 13 |
| Minimum Steps | 1 | 2 |
| Median Steps | 1 | 3 |
| Average Tokens | 428 | 59,408 |
| Median Tokens | 434 | 16,001 |
| Maximum Tokens | 463 | 347,222 |
| Minimum Tokens | 374 | 5,594 |

View File

@@ -3,7 +3,7 @@ import { TokenTracker } from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
const SIMILARITY_THRESHOLD = 0.90; // Adjustable threshold for cosine similarity
const SIMILARITY_THRESHOLD = 0.93; // Adjustable threshold for cosine similarity
// Types for Jina API
interface JinaEmbeddingRequest {