chore: first commit

2025-12-26 06:28:56 +08:00 · 2025-01-27 14:26:07 +08:00 · 2025-01-27 14:26:07 +08:00 · 2415ec3ebd
commit 2415ec3ebd
parent cd35dc7966
2 changed files with 188 additions and 18 deletions
--- a/src/agent.ts
+++ b/src/agent.ts
@ -5,6 +5,7 @@ import {readUrl} from "./tools/read";
 import fs from 'fs/promises';
 import {SafeSearchType, search} from "duck-duck-scrape";
 import {rewriteQuery} from "./tools/query-rewriter";
+import {dedupQueries} from "./tools/dedup";

 // Proxy setup remains the same
 if (process.env.https_proxy) {
@ -18,6 +19,32 @@ if (process.env.https_proxy) {
 }
 dotenv.config();

+/**
+ * Wait for roughly `ms` milliseconds while drawing a spinner and a countdown
+ * on stdout.
+ *
+ * The cursor is hidden while waiting and shown again afterwards; the status
+ * line is redrawn about every 50 ms and erased once the wait is over.
+ *
+ * @param ms - How long to wait, in milliseconds. For zero or a negative value
+ *   no spinner frame is drawn; only the cursor/line reset sequences are written.
+ * @returns A promise that resolves once the wait has elapsed.
+ */
+async function sleep(ms: number) {
+  const frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
+  const endTime = Date.now() + ms;
+
+  // Hide the cursor so it does not sit on top of the spinner.
+  process.stdout.write('\x1B[?25l');
+
+  try {
+    while (Date.now() < endTime) {
+      // Clamp at zero: the clock may pass endTime between the loop test and here.
+      const remaining = Math.max(0, Math.ceil((endTime - Date.now()) / 1000));
+      const frameIndex = Math.floor(Date.now() / 100) % frames.length;
+
+      // Return to column 0 AND erase the line before drawing. The text gets
+      // shorter when the countdown loses a digit (e.g. "10s" -> "9s"); with a
+      // bare carriage return the tail of the previous frame stays on screen.
+      process.stdout.write(`\r\x1B[K${frames[frameIndex]} Cool down... ${remaining}s remaining`);
+
+      // Short tick that paces the animation; the loop condition does the real timing.
+      await new Promise(resolve => setTimeout(resolve, 50));
+    }
+  } finally {
+    // Erase the status line, show the cursor again and move to the next line.
+    // Done in `finally` so the cursor is restored even if the loop exits with an error.
+    process.stdout.write('\r\x1B[K\x1B[?25h\n');
+  }
+}
+
 type ResponseSchema = {
  type: SchemaType.OBJECT;
  properties: {
@ -207,10 +234,11 @@ async function getResponse(question: string) {
  let step = 0;
  let gaps: string[] = [question];  // All questions to be answered including the original question
  let allQuestions = [question];
+  let allKeywords = [];

  while (totalTokens < tokenBudget) {
    // add 1s delay to avoid rate limiting
-    await new Promise(resolve => setTimeout(resolve, 1000));
+    await sleep(1000);
    step++;
    console.log('===STEPS===', step)
    console.log('Gaps:', gaps)
@ -251,28 +279,39 @@ async function getResponse(question: string) {
    }

    if (action.action === 'reflect' && action.questionsToAnswer) {
-      gaps.push(...action.questionsToAnswer);
-      allQuestions.push(...action.questionsToAnswer);
+      let newGapQuestions = action.questionsToAnswer
+      if (allQuestions.length) {
+        newGapQuestions = await dedupQueries(newGapQuestions, allQuestions)
+      }
+      gaps.push(...newGapQuestions);
+      allQuestions.push(...newGapQuestions);
      gaps.push(question);  // always keep the original question in the gaps
    }

    // Rest of the action handling remains the same
    try {
      if (action.action === 'search' && action.searchQuery) {
-        const keywordsQueries = await rewriteQuery(action.searchQuery);
-        const searchResults = await Promise.all(
-          keywordsQueries.map(async (query) => {
-            const results = await search(query, {
-              safeSearch: SafeSearchType.STRICT
-            });
-            const minResults = results.results.map(r => ({
-              title: r.title,
-              url: r.url,
-              description: r.description,
-            }));
-            return {query, minResults};
-          })
-        );
+        // rewrite queries
+        let keywordsQueries = await rewriteQuery(action.searchQuery);
+        // skip queries that have already been searched
+        if (allKeywords.length) {
+          keywordsQueries = await dedupQueries(keywordsQueries, allKeywords)
+        }
+        const searchResults = [];
+        for (const query of keywordsQueries) {
+          const results = await search(query, {
+            safeSearch: SafeSearchType.STRICT
+          });
+          const minResults = results.results.map(r => ({
+            title: r.title,
+            url: r.url,
+            description: r.description,
+          }));
+          searchResults.push({query, minResults});
+          allKeywords.push(query);
+          await sleep(5000);
+        }
+
        context.push({
          step,
          question: currentQuestion,
@ -314,7 +353,7 @@ const jinaToken = process.env.JINA_API_KEY as string;
 if (!apiKey) throw new Error("GEMINI_API_KEY not found");
 if (!jinaToken) throw new Error("JINA_API_KEY not found");

-const modelName = 'gemini-2.0-flash-exp';
+const modelName = 'gemini-1.5-flash';
 const genAI = new GoogleGenerativeAI(apiKey);

 const question = process.argv[2] || "";
--- a/src/tools/dedup.ts
+++ b/src/tools/dedup.ts
@ -0,0 +1,131 @@
+import { GoogleGenerativeAI, SchemaType } from "@google/generative-ai";
+import dotenv from 'dotenv';
+import { ProxyAgent, setGlobalDispatcher } from "undici";
+
+// Load .env first so that values defined there (including https_proxy and
+// GEMINI_API_KEY) are already in process.env when the checks below read them.
+dotenv.config();
+
+// Proxy setup: when https_proxy is set, install an undici ProxyAgent as the
+// global dispatcher.
+if (process.env.https_proxy) {
+  try {
+    const proxyUrl = new URL(process.env.https_proxy).toString();
+    const dispatcher = new ProxyAgent({ uri: proxyUrl });
+    setGlobalDispatcher(dispatcher);
+  } catch (error) {
+    // A malformed proxy URL is reported but does not stop the module from loading.
+    console.error('Failed to set proxy:', error);
+  }
+}
+
+// Fail at module load time when the Gemini API key is missing.
+const apiKey = process.env.GEMINI_API_KEY;
+if (!apiKey) {
+  throw new Error("GEMINI_API_KEY not found in environment variables");
+}
+
+/** Shape of the JSON payload that dedupQueries parses from the model's reply. */
+type DedupResponse = {
+  thought: string;
+  unique_queries: string[];
+};
+
+// Schema passed to the model as `generationConfig.responseSchema` below; its
+// two required properties mirror the fields of DedupResponse.
+const responseSchema = {
+  type: SchemaType.OBJECT,
+  properties: {
+    thought: {
+      type: SchemaType.STRING,
+      description: "Strategic reasoning about the overall deduplication approach"
+    },
+    unique_queries: {
+      type: SchemaType.ARRAY,
+      items: {
+        type: SchemaType.STRING
+      },
+      description: "Array of semantically unique queries from set A"
+    }
+  },
+  required: ["thought", "unique_queries"]
+};
+
+// Model identifier used for every deduplication request made by this module.
+const modelName = 'gemini-1.5-flash';
+
+// Client and model handle, created once when this module is loaded.
+const genAI = new GoogleGenerativeAI(apiKey);
+const model = genAI.getGenerativeModel({
+  model: modelName,
+  generationConfig: {
+    // NOTE(review): low temperature, presumably to keep the output stable between runs; confirm.
+    temperature: 0.1,
+    // JSON output is requested because dedupQueries feeds the reply text to JSON.parse.
+    responseMimeType: "application/json",
+    responseSchema: responseSchema
+  }
+});
+
+/**
+ * Build the deduplication prompt sent to the model.
+ *
+ * The prompt holds the fixed rules and one worked example, followed by the two
+ * query lists serialised as JSON arrays.
+ *
+ * @param newQueries - Candidate queries, embedded in the prompt as "Set A".
+ * @param existingQueries - Queries already known, embedded as "Set B".
+ * @returns The complete prompt text.
+ */
+function getPrompt(newQueries: string[], existingQueries: string[]): string {
+  // Serialise both lists up front so the template below stays purely textual.
+  const setA = JSON.stringify(newQueries);
+  const setB = JSON.stringify(existingQueries);
+
+  const prompt = `You are an expert in semantic similarity analysis. Given a set of new queries (A) and existing queries (B), identify which queries from set A are semantically unique when compared BOTH to other queries within A AND to queries in set B.
+
+Core Rules:
+1. Consider semantic meaning and query intent, not just lexical similarity
+2. Account for different phrasings of the same information need
+3. A query is considered duplicate if its core information need is already covered by:
+   - ANY earlier query in set A (earlier = appears before in the array)
+   - OR any query in set B
+4. Be conservative - only mark as duplicate if very similar
+5. Different aspects or perspectives of the same topic are not duplicates
+6. Consider query specificity - a more specific query might not be a duplicate of a general one
+7. For duplicates within set A, always keep the FIRST occurrence and mark later ones as duplicates
+
+Examples:
+
+Set A: [
+  "how to install python on windows",
+  "what's the best pizza in brooklyn heights",
+  "windows python installation guide",
+  "recommend good pizza places brooklyn heights"
+]
+Set B: [
+  "macbook setup guide",
+  "restaurant recommendations manhattan"
+]
+Thought: Let's analyze set A both internally and against B:
+1. The first python installation query is unique
+2. The first pizza query is unique
+3. The second python query is a duplicate of the first
+4. The second pizza query is a duplicate of the earlier one
+Neither query in set B is similar enough to affect our decisions.
+Unique Queries: [
+  "how to install python on windows",
+  "what's the best pizza in brooklyn heights"
+]
+
+Now, analyze these sets:
+Set A: ${setA}
+Set B: ${setB}`;
+
+  return prompt;
+}
+
+/**
+ * Ask the model which of `newQueries` are semantically new, compared both with
+ * each other and with `existingQueries`.
+ *
+ * @param newQueries - Candidate queries ("Set A" in the prompt).
+ * @param existingQueries - Queries already known ("Set B" in the prompt).
+ * @returns The `unique_queries` array from the model's JSON reply, or an empty
+ *   array when `newQueries` is empty.
+ * @throws Rethrows any failure of the model call or of JSON parsing, and throws
+ *   an Error when the reply has no string array `unique_queries`. Every failure
+ *   is logged before it is rethrown.
+ */
+export async function dedupQueries(newQueries: string[], existingQueries: string[]): Promise<string[]> {
+  // Nothing to deduplicate: skip the model round-trip entirely.
+  if (newQueries.length === 0) {
+    return [];
+  }
+
+  try {
+    const prompt = getPrompt(newQueries, existingQueries);
+    const result = await model.generateContent(prompt);
+    const response = await result.response;
+    // The payload is produced by the model, so treat it as unknown until checked.
+    const json: unknown = JSON.parse(response.text());
+    console.log('Analysis:', json);
+
+    // Callers spread the result into arrays; reject anything that is not a
+    // string array here instead of letting them fail later with an unrelated error.
+    const uniqueQueries = (json as Partial<DedupResponse> | null)?.unique_queries;
+    if (!Array.isArray(uniqueQueries) || !uniqueQueries.every(q => typeof q === 'string')) {
+      throw new Error('Deduplication response is missing a string array "unique_queries"');
+    }
+    return uniqueQueries;
+  } catch (error) {
+    console.error('Error in deduplication analysis:', error);
+    throw error;
+  }
+}
+
+// Command-line entry point: the first argument is a JSON array of new queries
+// (Set A), the second a JSON array of existing queries (Set B). A missing
+// argument is treated as an empty list.
+async function main() {
+  const [, , rawNew, rawExisting] = process.argv;
+  // Parse one optional JSON argument, falling back to an empty list.
+  const parseArg = (raw: string | undefined) => (raw ? JSON.parse(raw) : []);
+
+  const newQueries = parseArg(rawNew);
+  const existingQueries = parseArg(rawExisting);
+
+  console.log('\nNew Queries (Set A):', newQueries);
+  console.log('Existing Queries (Set B):', existingQueries);
+
+  let uniqueQueries;
+  try {
+    uniqueQueries = await dedupQueries(newQueries, existingQueries);
+  } catch (error) {
+    // Report the failure and stop; there is nothing to print.
+    console.error('Failed to deduplicate queries:', error);
+    return;
+  }
+  console.log('Unique Queries:', uniqueQueries);
+}
+
+// Run main only when this file is executed directly, not when it is imported.
+if (require.main === module) {
+  main().catch(err => console.error(err));
+}