chore: first commit

2025-12-26 06:28:56 +08:00 · 2025-01-27 13:47:13 +08:00 · 2025-01-27 13:47:13 +08:00 · cd35dc7966
commit cd35dc7966
parent f038095d29
5 changed files with 251 additions and 11 deletions
--- a/package-lock.json
+++ b/package-lock.json
@ -11,6 +11,7 @@
      "dependencies": {
        "@google/generative-ai": "^0.21.0",
        "dotenv": "^16.4.7",
+        "duck-duck-scrape": "^2.2.7",
        "undici": "^7.3.0"
      },
      "devDependencies": {
@ -169,6 +170,47 @@
        "url": "https://dotenvx.com"
      }
    },
+    "node_modules/duck-duck-scrape": {
+      "version": "2.2.7",
+      "resolved": "https://registry.npmjs.org/duck-duck-scrape/-/duck-duck-scrape-2.2.7.tgz",
+      "integrity": "sha512-BEcglwnfx5puJl90KQfX+Q2q5vCguqyMpZcSRPBWk8OY55qWwV93+E+7DbIkrGDW4qkqPfUvtOUdi0lXz6lEMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "html-entities": "^2.3.3",
+        "needle": "^3.2.0"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/Snazzah"
+      }
+    },
+    "node_modules/html-entities": {
+      "version": "2.5.2",
+      "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz",
+      "integrity": "sha512-K//PSRMQk4FZ78Kyau+mZurHn3FH0Vwr+H36eE0rPbeYkRRi9YxceYPhuN60UwWorxyKHhqoAJl2OFKa4BVtaA==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/mdevils"
+        },
+        {
+          "type": "patreon",
+          "url": "https://patreon.com/mdevils"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/iconv-lite": {
+      "version": "0.6.3",
+      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
+      "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
+      "license": "MIT",
+      "dependencies": {
+        "safer-buffer": ">= 2.1.2 < 3.0.0"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
    "node_modules/make-error": {
      "version": "1.3.6",
      "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
@ -176,6 +218,34 @@
      "dev": true,
      "license": "ISC"
    },
+    "node_modules/needle": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/needle/-/needle-3.3.1.tgz",
+      "integrity": "sha512-6k0YULvhpw+RoLNiQCRKOl09Rv1dPLr8hHnVjHqdolKwDrdNyk+Hmrthi4lIGPPz3r39dLx0hsF5s40sZ3Us4Q==",
+      "license": "MIT",
+      "dependencies": {
+        "iconv-lite": "^0.6.3",
+        "sax": "^1.2.4"
+      },
+      "bin": {
+        "needle": "bin/needle"
+      },
+      "engines": {
+        "node": ">= 4.4.x"
+      }
+    },
+    "node_modules/safer-buffer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
+      "license": "MIT"
+    },
+    "node_modules/sax": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz",
+      "integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==",
+      "license": "ISC"
+    },
    "node_modules/ts-node": {
      "version": "10.9.2",
      "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz",
--- a/package.json
+++ b/package.json
@ -4,7 +4,9 @@
  "main": "index.js",
  "scripts": {
    "build": "tsc",
-    "dev": "npx ts-node src/agent.ts"
+    "dev": "npx ts-node src/agent.ts",
+    "search": "npx ts-node src/test-duck.ts",
+    "rewrite": "npx ts-node src/tools/query-rewriter.ts"
  },
  "keywords": [],
  "author": "",
@ -13,6 +15,7 @@
  "dependencies": {
    "@google/generative-ai": "^0.21.0",
    "dotenv": "^16.4.7",
+    "duck-duck-scrape": "^2.2.7",
    "undici": "^7.3.0"
  },
  "devDependencies": {
--- a/src/agent.ts
+++ b/src/agent.ts
@ -2,8 +2,9 @@ import {GoogleGenerativeAI, SchemaType} from "@google/generative-ai";
 import dotenv from 'dotenv';
 import {ProxyAgent, setGlobalDispatcher} from "undici";
 import {readUrl} from "./tools/read";
-import {search} from "./tools/search";
 import fs from 'fs/promises';
+import {SafeSearchType, search} from "duck-duck-scrape";
+import {rewriteQuery} from "./tools/query-rewriter";

 // Proxy setup remains the same
 if (process.env.https_proxy) {
@ -25,7 +26,7 @@ type ResponseSchema = {
      enum: string[];
      description: string;
    };
-    keywordsQuery: {
+    searchQuery: {
      type: SchemaType.STRING;
      description: string;
    };
@ -99,7 +100,7 @@ function getSchema(allowReflect: boolean): ResponseSchema {
        description: "Only required when choosing 'reflect' action, list of most important questions to answer to fill the knowledge gaps.",
        maxItems: 2
      } : undefined,
-      keywordsQuery: {
+      searchQuery: {
        type: SchemaType.STRING,
        description: "Only required when choosing 'search' action, must be a short, keyword-based query that BM25, tf-idf based search engines can understand.",
      },
@ -167,9 +168,8 @@ When uncertain or needing additional information, select one of these actions:
 - Only give keywords search query, not full sentences

 **readURL**:
- Access the full content behind specific URLs
- Requires existing URLs from previous actions
- Use when you think URL contains needed information
+- Access the full content behind specific URLs in the search result
+- Use when you think certain URLs may contain the information you need

 **answer**:
 - Provide final response only when 100% certain
@ -209,6 +209,8 @@ async function getResponse(question: string) {
  let allQuestions = [question];

  while (totalTokens < tokenBudget) {
+    // add 1s delay to avoid rate limiting
+    await new Promise(resolve => setTimeout(resolve, 1000));
    step++;
    console.log('===STEPS===', step)
    console.log('Gaps:', gaps)
@ -256,15 +258,27 @@ async function getResponse(question: string) {

    // Rest of the action handling remains the same
    try {
-      if (action.action === 'search' && action.keywordsQuery) {
-        const results = await search(action.keywordsQuery, jinaToken);
+      if (action.action === 'search' && action.searchQuery) {
+        const keywordsQueries = await rewriteQuery(action.searchQuery);
+        const searchResults = await Promise.all(
+          keywordsQueries.map(async (query) => {
+            const results = await search(query, {
+              safeSearch: SafeSearchType.STRICT
+            });
+            const minResults = results.results.map(r => ({
+              title: r.title,
+              url: r.url,
+              description: r.description,
+            }));
+            return {query, minResults};
+          })
+        );
        context.push({
          step,
          question: currentQuestion,
          ...action,
-          result: results.data
+          result: searchResults
        });
-        totalTokens += results.data.reduce((sum, r) => sum + r.usage.tokens, 0);
      } else if (action.action === 'readURL' && action.URLTargets?.length) {
        const urlResults = await Promise.all(
          action.URLTargets.map(async (url: string) => {
--- a/src/test-duck.ts
+++ b/src/test-duck.ts
@ -0,0 +1,16 @@
+import {search, SafeSearchType} from 'duck-duck-scrape';
+
+// Query comes from the first CLI argument; falls back to a sample query.
+const query = process.argv[2] || "jina ai";
+
+/**
+ * Smoke test for the `duck-duck-scrape` integration: runs one strict
+ * safe-search query and prints the raw result object.
+ *
+ * Failures are logged and recorded in `process.exitCode` so that
+ * `npm run search` reports a non-zero status instead of silently succeeding.
+ */
+async function runTest(): Promise<void> {
+  try {
+    const results = await search(query, {
+      safeSearch: SafeSearchType.STRICT
+    });
+    console.log('Search results:', results);
+  } catch (error) {
+    console.error('Test failed:', error);
+    process.exitCode = 1;
+  }
+}
+
+// runTest catches its own errors, so `void` only marks the call as fire-and-forget.
+void runTest();
+
--- a/src/tools/query-rewriter.ts
+++ b/src/tools/query-rewriter.ts
@ -0,0 +1,137 @@
+import {GoogleGenerativeAI, SchemaType} from "@google/generative-ai";
+import dotenv from 'dotenv';
+import {ProxyAgent, setGlobalDispatcher} from "undici";
+
+// Load .env first so that values defined there (including https_proxy and
+// GEMINI_API_KEY) are visible to the checks below. By default dotenv does not
+// override variables that are already set in the environment.
+dotenv.config();
+
+// Install a global undici ProxyAgent dispatcher when https_proxy is set.
+if (process.env.https_proxy) {
+  try {
+    const proxyUrl = new URL(process.env.https_proxy).toString();
+    const dispatcher = new ProxyAgent({uri: proxyUrl});
+    setGlobalDispatcher(dispatcher);
+  } catch (error) {
+    // An invalid proxy URL is reported but does not abort module loading.
+    console.error('Failed to set proxy:', error);
+  }
+}
+
+// Fail fast at import time: the Gemini client below is constructed with this key.
+const apiKey = process.env.GEMINI_API_KEY;
+if (!apiKey) {
+  throw new Error("GEMINI_API_KEY not found in environment variables");
+}
+
+/**
+ * Shape read from the model's JSON reply. Only `keywords` is modelled here;
+ * the response schema also requires a `thought` field, which the code never
+ * accesses by name.
+ */
+interface KeywordsResponse {
+  keywords: string[];
+}
+
+// Free-form reasoning string the model is asked to return alongside the keywords.
+const thoughtSchema = {
+  type: SchemaType.STRING,
+  description: "Strategic reasoning about query complexity and search approach"
+};
+
+// Array of one to three keyword strings.
+const keywordsSchema = {
+  type: SchemaType.ARRAY,
+  items: {
+    type: SchemaType.STRING,
+    description: "Space-separated keywords (2-4 words) optimized for search"
+  },
+  description: "Array of keyword combinations, each targeting a specific aspect",
+  minItems: 1,
+  maxItems: 3
+};
+
+/** Structured-output schema for the model: an object with required `thought` and `keywords`. */
+const responseSchema = {
+  type: SchemaType.OBJECT,
+  properties: {
+    thought: thoughtSchema,
+    keywords: keywordsSchema
+  },
+  required: ["thought", "keywords"]
+};
+
+// Model used for query rewriting.
+const modelName = 'gemini-1.5-flash';
+
+// Low temperature; the reply is requested as JSON matching responseSchema.
+const generationConfig = {
+  temperature: 0.1,
+  responseMimeType: "application/json",
+  responseSchema
+};
+
+const genAI = new GoogleGenerativeAI(apiKey);
+const model = genAI.getGenerativeModel({model: modelName, generationConfig});
+
+/**
+ * Builds the few-shot prompt sent to the model for query rewriting.
+ *
+ * The prompt states the rewriting rules, gives five worked examples, and ends
+ * with the caller's query, which is interpolated verbatim (no escaping).
+ *
+ * @param query - The natural-language query to rewrite.
+ * @returns The complete prompt text.
+ */
+function getPrompt(query: string): string {
+  return `You are an expert Information Retrieval Assistant. Transform user queries into precise keyword combinations, with strategic reasoning.
+
+Core Rules:
+1. Always return keywords in array format, even for single queries
+2. Keep keywords minimal: 2-4 words preferred
+3. Split only when necessary for distinctly different aspects, but a comparison query may need multiple searches for each aspect
+4. Remove fluff words (question words, modals, qualifiers)
+5. Preserve crucial qualifiers (brands, versions, dates)
+6. The generated query should not be easily "captured" by those malicious SEO articles
+
+Examples with Strategic Reasoning:
+
+Input Query: What's the best pizza place in Brooklyn Heights?
+Thought: This is a straightforward location-based query. Since it's just about finding pizza places in a specific neighborhood, a single focused search should suffice. No need to complicate it by splitting into multiple searches.
+Output Keywords: ["brooklyn heights pizza"]
+
+Input Query: Why does my MacBook M1 Pro battery drain so fast after the latest OS update?
+Thought: Hmm, this seems simple at first, but we need multiple angles to properly diagnose. First, we should look for M1 specific battery issues. Then check the OS update problems, as it might be a known issue. By combining results from both searches, we should get a comprehensive answer.
+Output Keywords: [
+  "macbook m1 battery drain",
+  "macos update battery issues"
+]
+
+Input Query: How does caffeine timing affect athletic performance and post-workout recovery for morning vs evening workouts?
+Thought: This is quite complex - it involves caffeine's effects in different contexts. We need to understand: 1) caffeine's impact on performance, 2) its role in recovery, and 3) timing considerations. All three aspects are crucial for a complete answer. By searching these separately, we can piece together a comprehensive understanding.
+Output Keywords: [
+  "caffeine athletic performance timing",
+  "caffeine post workout recovery",
+  "morning evening workout caffeine"
+]
+
+Input Query: Need help with my sourdough starter - it's not rising and smells like acetone
+Thought: Initially seems like it needs two searches - one for not rising, one for the smell. But wait - these symptoms are likely related and commonly occur together in sourdough troubleshooting. A single focused search should capture solutions for both issues.
+Output Keywords: ["sourdough starter troubleshooting"]
+
+Input Query: Looking for a Python machine learning framework that works well with Apple Silicon and can handle large language models
+Thought: This query looks straightforward but requires careful consideration. We need information about ML frameworks' compatibility with M1/M2 chips specifically, and then about their LLM capabilities. Two separate searches will give us more precise results than trying to find everything in one search.
+Output Keywords: [
+  "python ml framework apple silicon",
+  "python framework llm support"
+]
+
+Now, process this query:
+Input Query: ${query}`;
+}
+
+/**
+ * Rewrites a natural-language query into search-engine keyword queries by
+ * asking the model for a JSON reply and validating it.
+ *
+ * @param query - The natural-language query to rewrite.
+ * @returns De-duplicated, trimmed, non-empty keyword strings (at least one).
+ * @throws Error when the model call fails, the reply is not valid JSON, or the
+ *   reply contains no usable `keywords`. The error is logged before being rethrown.
+ */
+export async function rewriteQuery(query: string): Promise<string[]> {
+  try {
+    const prompt = getPrompt(query);
+    const result = await model.generateContent(prompt);
+    const response = await result.response;
+    // JSON.parse returns untyped data; keep it `unknown` until validated.
+    const parsed: unknown = JSON.parse(response.text());
+    console.log('Response:', parsed);
+
+    const rawKeywords: unknown = (parsed as {keywords?: unknown} | null)?.keywords;
+    if (!Array.isArray(rawKeywords)) {
+      throw new Error("Model response does not contain a 'keywords' array");
+    }
+
+    // Drop non-string and blank entries, then de-duplicate so callers do not
+    // issue the same search twice.
+    const keywords = [...new Set(
+      rawKeywords
+        .filter((item: unknown): item is string => typeof item === 'string')
+        .map((item) => item.trim())
+        .filter((item) => item.length > 0)
+    )];
+    if (keywords.length === 0) {
+      throw new Error("Model response contains no usable keywords");
+    }
+    return keywords;
+  } catch (error) {
+    console.error('Error in query rewriting:', error);
+    throw error;
+  }
+}
+
+/**
+ * CLI entry point: rewrites the query given as the first command-line
+ * argument and prints the resulting keywords.
+ *
+ * With no (or a blank) argument it prints a usage hint instead of sending an
+ * empty query to the model. Any failure sets a non-zero exit code.
+ */
+async function main(): Promise<void> {
+  const query = process.argv[2]?.trim() ?? "";
+  if (query === "") {
+    console.error('Usage: npm run rewrite -- "<query>"');
+    process.exitCode = 1;
+    return;
+  }
+
+  console.log('\nOriginal Query:', query);
+  try {
+    const keywords = await rewriteQuery(query);
+    console.log('Rewritten Keywords:', keywords);
+  } catch (error) {
+    console.error('Failed to rewrite query:', error);
+    process.exitCode = 1;
+  }
+}
+
+// Run only when this file is executed directly, not when it is imported by another module.
+if (require.main === module) {
+  main().catch(console.error);
+}