diff --git a/.gitignore b/.gitignore
index 598fa38..56ef11f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ knowledge.json
prompt-*.txt
queries.json
questions.json
+eval-*.json
# Logs
logs
diff --git a/src/agent.ts b/src/agent.ts
index 09e45b5..09d2e96 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -7,7 +7,7 @@ import fs from 'fs/promises';
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
import {braveSearch} from "./tools/brave-search";
import {rewriteQuery} from "./tools/query-rewriter";
-import {dedupQueries} from "./tools/dedup";
+import {dedupQueries} from "./tools/jina-dedup";
import {evaluateAnswer} from "./tools/evaluator";
import {analyzeSteps} from "./tools/error-analyzer";
import {TokenTracker} from "./utils/token-tracker";
@@ -198,11 +198,11 @@ ${urlList}
- Focus on solving one specific aspect of the question
- Only use keywords in th search query, not full sentences
${allKeywords?.length ? `
-- The following searched queries do not give useful information, you need to think out of the box or cut from a completely different angle:
+- Avoid the following searched queries as they do not give any useful information, you need to think out of the box and propose queries from a completely different angle:
${allKeywords.join('\n')}
-` : ''}
+`.trim() : ''}
`);
}
@@ -257,7 +257,11 @@ Critical Requirements:
- Exclude all non-JSON text, markdown, or explanations
- Maintain strict JSON syntax`);
- return sections.join('\n\n');
+ return removeExtraLineBreaks(sections.join('\n\n'));
+}
+
+const removeExtraLineBreaks = (text: string) => {
+  return text.split(/\n{2,}/).join('\n\n'); // squeeze every run of 2+ newlines down to exactly two
}
const allContext: StepAction[] = []; // all steps in the current session, including those leads to wrong results
diff --git a/src/tools/jina-dedup.ts b/src/tools/jina-dedup.ts
new file mode 100644
index 0000000..3f276a0
--- /dev/null
+++ b/src/tools/jina-dedup.ts
@@ -0,0 +1,143 @@
+import axios from 'axios';
+import { TokenTracker } from "../utils/token-tracker";
+import {JINA_API_KEY} from "../config";
+
+const JINA_API_URL = 'https://api.jina.ai/v1/embeddings'; // endpoint that getEmbeddings posts to
+const SIMILARITY_THRESHOLD = 0.8; // Adjustable threshold for cosine similarity; scores >= this value are treated as duplicates
+
+// Types for Jina API: request body posted to the embeddings endpoint
+interface JinaEmbeddingRequest {
+ model: string; // embedding model name
+ input: string[]; // texts to embed in one batch
+}
+
+interface JinaEmbeddingResponse { // response body read back from the embeddings endpoint
+ model: string;
+ object: string;
+ usage: {
+ total_tokens: number; // value getEmbeddings reports as the token cost of the call
+ prompt_tokens: number;
+ };
+ data: Array<{
+ object: string;
+ index: number; // getEmbeddings sorts by this field to restore the order of the inputs
+ embedding: number[]; // embedding vector for one input text
+ }>;
+}
+
+// Cosine similarity between two vectors (assumes both have the same length).
+function cosineSimilarity(vecA: number[], vecB: number[]): number {
+  const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
+  const normA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
+  const normB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
+  // A zero-magnitude vector would make the division 0/0 = NaN; report "no similarity" instead
+  return normA === 0 || normB === 0 ? 0 : dotProduct / (normA * normB);
+}
+
+// Get embeddings for all queries in one batch (a single POST to JINA_API_URL).
+// Resolves to the embedding vectors ordered by the `index` field of the response, plus the
+// `usage.total_tokens` value. Throws if JINA_API_KEY is unset; request errors are logged and rethrown.
+async function getEmbeddings(queries: string[]): Promise<{ embeddings: number[][], tokens: number }> {
+  if (!JINA_API_KEY) {
+    throw new Error('JINA_API_KEY is not set');
+  }
+
+  const request: JinaEmbeddingRequest = {
+    model: 'jina-embeddings-v3',
+    input: queries
+  };
+
+  try {
+    // Typing the response body keeps `response.data` (and the callbacks below) from being `any`
+    const response = await axios.post<JinaEmbeddingResponse>(
+      JINA_API_URL,
+      request,
+      {
+        headers: {
+          'Content-Type': 'application/json',
+          'Authorization': `Bearer ${JINA_API_KEY}`
+        }
+      }
+    );
+
+    // Sort a copy by index to maintain original order without mutating the response object
+    const embeddings = [...response.data.data]
+      .sort((a, b) => a.index - b.index)
+      .map(item => item.embedding);
+
+    return { embeddings, tokens: response.data.usage.total_tokens };
+  } catch (error) {
+    console.error('Error getting embeddings from Jina:', error);
+    throw error;
+  }
+}
+
+/**
+ * Filter out new queries that are near-duplicates of an existing query or of another new
+ * query accepted earlier in the list, judged by cosine similarity of their embeddings.
+ *
+ * @param newQueries - candidate queries, examined in order
+ * @param existingQueries - queries the candidates are compared against
+ * @param tracker - optional tracker; token usage is recorded on it under 'dedup' (a fresh one is used if omitted)
+ * @returns the accepted queries in their original order, plus the token count of the embeddings call
+ * @throws any error raised while embedding or comparing, after logging it
+ */
+export async function dedupQueries(
+  newQueries: string[],
+  existingQueries: string[],
+  tracker?: TokenTracker
+): Promise<{ unique_queries: string[], tokens: number }> {
+  try {
+    // Quick return when there is nothing to compare: no candidates, or one candidate and no history
+    if (newQueries.length === 0 || (newQueries.length === 1 && existingQueries.length === 0)) {
+      console.log('Dedup (quick return):', newQueries);
+      return { unique_queries: newQueries, tokens: 0 }; // no API call, so no tokens used
+    }
+
+    // Get embeddings for all queries in one batch
+    const allQueries = [...newQueries, ...existingQueries];
+    const { embeddings: allEmbeddings, tokens } = await getEmbeddings(allQueries);
+
+    // Split embeddings back into new and existing
+    const newEmbeddings = allEmbeddings.slice(0, newQueries.length);
+    const existingEmbeddings = allEmbeddings.slice(newQueries.length);
+
+    const uniqueQueries: string[] = [];
+    // Indices of accepted new queries; the element type must be number to index newEmbeddings
+    const usedIndices = new Set<number>();
+
+    // Compare each new query first against existing queries, then against already accepted ones
+    for (let i = 0; i < newQueries.length; i++) {
+      let isUnique = true;
+      for (let j = 0; j < existingQueries.length; j++) {
+        const similarity = cosineSimilarity(newEmbeddings[i], existingEmbeddings[j]);
+        if (similarity >= SIMILARITY_THRESHOLD) {
+          isUnique = false;
+          break;
+        }
+      }
+      // Check against already accepted queries
+      if (isUnique) {
+        for (const usedIndex of usedIndices) {
+          const similarity = cosineSimilarity(newEmbeddings[i], newEmbeddings[usedIndex]);
+          if (similarity >= SIMILARITY_THRESHOLD) {
+            isUnique = false;
+            break;
+          }
+        }
+      }
+      if (isUnique) {
+        uniqueQueries.push(newQueries[i]);
+        usedIndices.add(i);
+      }
+    }
+
+    // Track token usage from the API
+    (tracker ?? new TokenTracker()).trackUsage('dedup', tokens);
+    console.log('Dedup:', uniqueQueries);
+    return { unique_queries: uniqueQueries, tokens };
+  } catch (error) {
+    console.error('Error in deduplication analysis:', error);
+    throw error;
+  }
+}