feat: improve dedup with jina embeddings

This commit is contained in:
Han Xiao 2025-02-07 15:46:32 +08:00
parent 3e60f712d9
commit 814d539dac
3 changed files with 152 additions and 4 deletions

1
.gitignore vendored
View File

@ -5,6 +5,7 @@ knowledge.json
prompt-*.txt
queries.json
questions.json
eval-*.json
# Logs
logs

View File

@ -7,7 +7,7 @@ import fs from 'fs/promises';
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
import {braveSearch} from "./tools/brave-search";
import {rewriteQuery} from "./tools/query-rewriter";
import {dedupQueries} from "./tools/dedup";
import {dedupQueries} from "./tools/jina-dedup";
import {evaluateAnswer} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
@ -198,11 +198,11 @@ ${urlList}
- Focus on solving one specific aspect of the question
- Only use keywords in th search query, not full sentences
${allKeywords?.length ? `
- The following searched queries do not give useful information, you need to think out of the box or cut from a completely different angle:
- Avoid the following searched queries as they do not give any useful information, you need to think out of the box and propose queries from a completely different angle:
<bad-queries>
${allKeywords.join('\n')}
</bad-queries>
` : ''}
`.trim() : ''}
</action-search>
`);
}
@ -257,7 +257,11 @@ Critical Requirements:
- Exclude all non-JSON text, markdown, or explanations
- Maintain strict JSON syntax`);
return sections.join('\n\n');
return removeExtraLineBreaks(sections.join('\n\n'));
}
/**
 * Normalizes vertical whitespace in a prompt: every run of two or more
 * consecutive newline characters is replaced by exactly two newlines
 * (i.e. a single blank line). Single newlines are left untouched.
 */
const removeExtraLineBreaks = (text: string) =>
  text.replace(/\n{2,}/gm, '\n\n');
const allContext: StepAction[] = []; // all steps in the current session, including those that led to wrong results

143
src/tools/jina-dedup.ts Normal file
View File

@ -0,0 +1,143 @@
import axios from 'axios';
import { TokenTracker } from "../utils/token-tracker";
import {JINA_API_KEY} from "../config";
// Endpoint that getEmbeddings posts its embedding requests to.
const JINA_API_URL = 'https://api.jina.ai/v1/embeddings';
// Two queries whose embeddings have a cosine similarity greater than or equal
// to this value are treated as duplicates by dedupQueries.
const SIMILARITY_THRESHOLD = 0.8; // Adjustable threshold for cosine similarity
// Types for Jina API
/** JSON body sent to the Jina embeddings endpoint by getEmbeddings. */
interface JinaEmbeddingRequest {
/** Name of the embedding model to use (this file sends 'jina-embeddings-v3'). */
model: string;
/** Texts to embed; getEmbeddings sends all queries together in one request. */
input: string[];
}
/** Token accounting section of an embeddings response. */
interface JinaEmbeddingUsage {
  /** Value that getEmbeddings reports back to its caller as `tokens`. */
  total_tokens: number;
  prompt_tokens: number;
}

/** A single entry of the `data` array in an embeddings response. */
interface JinaEmbeddingItem {
  object: string;
  /** Sort key used by getEmbeddings to put results back in input order. */
  index: number;
  /** The embedding vector itself. */
  embedding: number[];
}

/** Shape of the JSON body that getEmbeddings expects back from the Jina embeddings endpoint. */
interface JinaEmbeddingResponse {
  model: string;
  object: string;
  usage: JinaEmbeddingUsage;
  data: JinaEmbeddingItem[];
}
/**
 * Computes the cosine similarity between two vectors of equal length.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector; must have the same number of components as `vecA`.
 * @returns The dot product divided by the product of the two magnitudes.
 *   Returns 0 when either vector has zero magnitude (including two empty
 *   vectors), because the similarity is undefined there and a NaN result
 *   would silently fail every `>=` threshold comparison.
 * @throws Error if the vectors have different lengths.
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  if (vecA.length !== vecB.length) {
    throw new Error(
      `Cannot compare vectors of different lengths: ${vecA.length} vs ${vecB.length}`
    );
  }

  // Accumulate the dot product and both squared norms in a single pass.
  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  for (let i = 0; i < vecA.length; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    sumSquaresA += a * a;
    sumSquaresB += b * b;
  }

  const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  // Guard against 0/0 => NaN for zero-magnitude vectors.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
/**
 * Fetches embeddings for all given queries from the Jina embeddings API in a
 * single batched request.
 *
 * @param queries - Texts to embed.
 * @returns `embeddings` in the same order as `queries`, plus the
 *   `total_tokens` value reported by the API (0 if the response carries no
 *   usage information). An empty `queries` array resolves to an empty result
 *   without making a request.
 * @throws Error if `JINA_API_KEY` is not set, or if the API does not return
 *   exactly one embedding per query. Request errors are logged and rethrown.
 */
async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
  if (!JINA_API_KEY) {
    throw new Error('JINA_API_KEY is not set');
  }

  // Nothing to embed: skip the network round trip entirely.
  if (queries.length === 0) {
    return { embeddings: [], tokens: 0 };
  }

  const request: JinaEmbeddingRequest = {
    model: 'jina-embeddings-v3',
    input: queries
  };

  try {
    const response = await axios.post<JinaEmbeddingResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        }
      }
    );

    const { data, usage } = response.data;

    // Callers slice the result by position, so a short or malformed response
    // must fail here rather than as an undefined vector further downstream.
    const received = Array.isArray(data) ? data.length : 0;
    if (received !== queries.length) {
      throw new Error(
        `Jina returned ${received} embeddings for ${queries.length} queries`
      );
    }

    // Sort a copy by index to maintain original order without mutating the
    // response payload.
    const embeddings = [...data]
      .sort((a, b) => a.index - b.index)
      .map(item => item.embedding);

    return {
      embeddings,
      tokens: usage?.total_tokens ?? 0
    };
  } catch (error) {
    console.error('Error getting embeddings from Jina:', error);
    throw error;
  }
}
/**
 * Filters `newQueries` down to those that are semantically distinct, using
 * cosine similarity between Jina embeddings.
 *
 * A new query is dropped when its similarity to any existing query, or to any
 * new query already accepted earlier in the list, is at least
 * `SIMILARITY_THRESHOLD`. Accepted queries keep their input order.
 *
 * @param newQueries - Candidate queries to filter.
 * @param existingQueries - Previously seen queries to compare against.
 * @param tracker - Optional token tracker; API token usage is recorded on it
 *   under the 'dedup' label. When omitted, usage is recorded on a throwaway
 *   `TokenTracker` instance.
 * @returns The surviving queries and the number of tokens the embeddings
 *   request consumed (0 when no request was needed).
 * @throws Rethrows any error raised while fetching or comparing embeddings,
 *   after logging it.
 */
export async function dedupQueries(
  newQueries: string[],
  existingQueries: string[],
  tracker?: TokenTracker
): Promise<{ unique_queries: string[], tokens: number }> {
  try {
    // Nothing to deduplicate: do not spend an API call embedding only the
    // existing queries (or an empty batch).
    if (newQueries.length === 0) {
      console.log('Dedup (quick return):', newQueries);
      return {
        unique_queries: [],
        tokens: 0 // No tokens used since we didn't call the API
      };
    }

    // Quick return for single new query with no existing queries
    if (newQueries.length === 1 && existingQueries.length === 0) {
      console.log('Dedup (quick return):', newQueries);
      return {
        unique_queries: newQueries,
        tokens: 0 // No tokens used since we didn't call the API
      };
    }

    // Get embeddings for all queries in one batch
    const allQueries = [...newQueries, ...existingQueries];
    const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries);

    // Split embeddings back into new and existing
    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
    const existingEmbeddings = allEmbeddings.slice(newQueries.length);

    const uniqueQueries: string[] = [];
    // Embeddings of the new queries accepted so far, so later candidates are
    // also compared against earlier ones from the same batch.
    const acceptedEmbeddings: number[][] = [];

    for (let i = 0; i < newQueries.length; i++) {
      const candidate = newEmbeddings[i];
      const isTooSimilar = (other: number[]) =>
        cosineSimilarity(candidate, other) >= SIMILARITY_THRESHOLD;

      // Check against existing queries first, then already accepted queries
      if (existingEmbeddings.some(isTooSimilar) || acceptedEmbeddings.some(isTooSimilar)) {
        continue;
      }

      uniqueQueries.push(newQueries[i]);
      acceptedEmbeddings.push(candidate);
    }

    // Track token usage from the API
    (tracker || new TokenTracker()).trackUsage('dedup', tokens);
    console.log('Dedup:', uniqueQueries);

    return {
      unique_queries: uniqueQueries,
      tokens
    };
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}