mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
chore: first commit
This commit is contained in:
parent
cd35dc7966
commit
2415ec3ebd
75
src/agent.ts
75
src/agent.ts
@ -5,6 +5,7 @@ import {readUrl} from "./tools/read";
|
||||
import fs from 'fs/promises';
|
||||
import {SafeSearchType, search} from "duck-duck-scrape";
|
||||
import {rewriteQuery} from "./tools/query-rewriter";
|
||||
import {dedupQueries} from "./tools/dedup";
|
||||
|
||||
// Proxy setup remains the same
|
||||
if (process.env.https_proxy) {
|
||||
@ -18,6 +19,32 @@ if (process.env.https_proxy) {
|
||||
}
|
||||
dotenv.config();
|
||||
|
||||
/**
 * Waits for roughly `ms` milliseconds while drawing a spinner and a countdown
 * on stdout.
 *
 * The terminal cursor is hidden for the duration of the wait and is always
 * restored afterwards, even if writing a frame throws.
 *
 * @param ms - How long to wait, in milliseconds. Values <= 0 skip the
 *   animation loop entirely.
 * @returns A promise that resolves once the wait is over.
 */
async function sleep(ms: number): Promise<void> {
  const frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
  const frameIntervalMs = 50;
  const endTime = Date.now() + ms;

  // Hide cursor while the spinner is animating
  process.stdout.write('\x1B[?25l');

  try {
    for (let now = Date.now(); now < endTime; now = Date.now()) {
      const remainingMs = endTime - now;
      const remaining = Math.ceil(remainingMs / 1000);
      const frameIndex = Math.floor(now / 100) % frames.length;

      // Return to column 0 and erase the line first, so a shorter text
      // (e.g. "9s" after "10s") does not leave stale characters behind.
      process.stdout.write(`\r\x1B[K${frames[frameIndex]} Cool down... ${remaining}s remaining`);

      // Never wait past the deadline: the last tick is shortened to fit.
      await new Promise<void>(resolve => setTimeout(resolve, Math.min(frameIntervalMs, remainingMs)));
    }
  } finally {
    // Clear line, show cursor and move to next line
    process.stdout.write('\r\x1B[K\x1B[?25h\n');
  }

  // Yield once to the event loop, as the original implementation did
  await new Promise<void>(resolve => setTimeout(resolve, 0));
}
|
||||
|
||||
type ResponseSchema = {
|
||||
type: SchemaType.OBJECT;
|
||||
properties: {
|
||||
@ -207,10 +234,11 @@ async function getResponse(question: string) {
|
||||
let step = 0;
|
||||
let gaps: string[] = [question]; // All questions to be answered including the original question
|
||||
let allQuestions = [question];
|
||||
let allKeywords = [];
|
||||
|
||||
while (totalTokens < tokenBudget) {
|
||||
// add 1s delay to avoid rate limiting
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
await sleep(1000);
|
||||
step++;
|
||||
console.log('===STEPS===', step)
|
||||
console.log('Gaps:', gaps)
|
||||
@ -251,28 +279,39 @@ async function getResponse(question: string) {
|
||||
}
|
||||
|
||||
if (action.action === 'reflect' && action.questionsToAnswer) {
|
||||
gaps.push(...action.questionsToAnswer);
|
||||
allQuestions.push(...action.questionsToAnswer);
|
||||
let newGapQuestions = action.questionsToAnswer
|
||||
if (allQuestions.length) {
|
||||
newGapQuestions = await dedupQueries(newGapQuestions, allQuestions)
|
||||
}
|
||||
gaps.push(...newGapQuestions);
|
||||
allQuestions.push(...newGapQuestions);
|
||||
gaps.push(question); // always keep the original question in the gaps
|
||||
}
|
||||
|
||||
// Rest of the action handling remains the same
|
||||
try {
|
||||
if (action.action === 'search' && action.searchQuery) {
|
||||
const keywordsQueries = await rewriteQuery(action.searchQuery);
|
||||
const searchResults = await Promise.all(
|
||||
keywordsQueries.map(async (query) => {
|
||||
const results = await search(query, {
|
||||
safeSearch: SafeSearchType.STRICT
|
||||
});
|
||||
const minResults = results.results.map(r => ({
|
||||
title: r.title,
|
||||
url: r.url,
|
||||
description: r.description,
|
||||
}));
|
||||
return {query, minResults};
|
||||
})
|
||||
);
|
||||
// rewrite queries
|
||||
let keywordsQueries = await rewriteQuery(action.searchQuery);
|
||||
// avoid existing searched queries
|
||||
if (allKeywords.length) {
|
||||
keywordsQueries = await dedupQueries(keywordsQueries, allKeywords)
|
||||
}
|
||||
const searchResults = [];
|
||||
for (const query of keywordsQueries) {
|
||||
const results = await search(query, {
|
||||
safeSearch: SafeSearchType.STRICT
|
||||
});
|
||||
const minResults = results.results.map(r => ({
|
||||
title: r.title,
|
||||
url: r.url,
|
||||
description: r.description,
|
||||
}));
|
||||
searchResults.push({query, minResults});
|
||||
allKeywords.push(query);
|
||||
await sleep(5000);
|
||||
}
|
||||
|
||||
context.push({
|
||||
step,
|
||||
question: currentQuestion,
|
||||
@ -314,7 +353,7 @@ const jinaToken = process.env.JINA_API_KEY as string;
|
||||
if (!apiKey) throw new Error("GEMINI_API_KEY not found");
|
||||
if (!jinaToken) throw new Error("JINA_API_KEY not found");
|
||||
|
||||
const modelName = 'gemini-2.0-flash-exp';
|
||||
const modelName = 'gemini-1.5-flash';
|
||||
const genAI = new GoogleGenerativeAI(apiKey);
|
||||
|
||||
const question = process.argv[2] || "";
|
||||
|
||||
131
src/tools/dedup.ts
Normal file
131
src/tools/dedup.ts
Normal file
@ -0,0 +1,131 @@
|
||||
import { GoogleGenerativeAI, SchemaType } from "@google/generative-ai";
|
||||
import dotenv from 'dotenv';
|
||||
import { ProxyAgent, setGlobalDispatcher } from "undici";
|
||||
|
||||
// Load .env first so that an `https_proxy` defined there is visible to the
// proxy setup below (previously the variable was read before .env was loaded).
dotenv.config();

// Proxy setup: route outgoing requests through the proxy named by the
// `https_proxy` environment variable, when one is set.
if (process.env.https_proxy) {
  try {
    // `new URL` throws on a malformed value; that is reported below instead
    // of aborting module initialisation.
    const proxyUrl = new URL(process.env.https_proxy).toString();
    const dispatcher = new ProxyAgent({ uri: proxyUrl });
    setGlobalDispatcher(dispatcher);
  } catch (error) {
    console.error('Failed to set proxy:', error);
  }
}
|
||||
|
||||
// Gemini credentials are mandatory: refuse to load the module without them.
const apiKey: string = process.env.GEMINI_API_KEY ?? '';
if (apiKey.length === 0) throw new Error("GEMINI_API_KEY not found in environment variables");
|
||||
|
||||
/** Shape of the JSON document the model is asked to return. */
interface DedupResponse {
  /** The model's reasoning about the deduplication. */
  thought: string;
  /** The queries the model reports as unique. */
  unique_queries: Array<string>;
}
|
||||
|
||||
// Schema for the `thought` field of the structured response.
const thoughtSchema = {
  type: SchemaType.STRING,
  description: "Strategic reasoning about the overall deduplication approach"
};

// Schema for the `unique_queries` field: an array of strings.
const uniqueQueriesSchema = {
  type: SchemaType.ARRAY,
  items: {
    type: SchemaType.STRING
  },
  description: "Array of semantically unique queries from set A"
};

// Structured-output schema passed to the model; both fields are required.
const responseSchema = {
  type: SchemaType.OBJECT,
  properties: {
    thought: thoughtSchema,
    unique_queries: uniqueQueriesSchema
  },
  required: ["thought", "unique_queries"]
};
|
||||
|
||||
// Model used for the deduplication analysis.
const modelName = 'gemini-1.5-flash';

// Low temperature plus a JSON mime type and schema, passed as the model's
// generation config.
const generationConfig = {
  temperature: 0.1,
  responseMimeType: "application/json",
  responseSchema
};

const genAI = new GoogleGenerativeAI(apiKey);
const model = genAI.getGenerativeModel({ model: modelName, generationConfig });
|
||||
|
||||
/**
 * Builds the prompt text for the deduplication request.
 *
 * The prompt states the duplicate-detection rules, shows one worked example,
 * and ends with the two query sets serialised via `JSON.stringify`.
 *
 * NOTE(review): the `|` / `||||` lines interleaved below look like residue
 * from a diff viewer rather than intended prompt text — confirm against the
 * real file before relying on the exact wording or whitespace.
 *
 * @param newQueries - Candidate queries, embedded in the prompt as "Set A".
 * @param existingQueries - Already known queries, embedded as "Set B".
 * @returns The complete prompt string.
 */
function getPrompt(newQueries: string[], existingQueries: string[]): string {
|
||||
return `You are an expert in semantic similarity analysis. Given a set of new queries (A) and existing queries (B), identify which queries from set A are semantically unique when compared BOTH to other queries within A AND to queries in set B.
|
||||
|
||||
Core Rules:
|
||||
1. Consider semantic meaning and query intent, not just lexical similarity
|
||||
2. Account for different phrasings of the same information need
|
||||
3. A query is considered duplicate if its core information need is already covered by:
|
||||
- ANY earlier query in set A (earlier = appears before in the array)
|
||||
- OR any query in set B
|
||||
4. Be conservative - only mark as duplicate if very similar
|
||||
5. Different aspects or perspectives of the same topic are not duplicates
|
||||
6. Consider query specificity - a more specific query might not be a duplicate of a general one
|
||||
7. For duplicates within set A, always keep the FIRST occurrence and mark later ones as duplicates
|
||||
|
||||
Examples:
|
||||
|
||||
Set A: [
|
||||
"how to install python on windows",
|
||||
"what's the best pizza in brooklyn heights",
|
||||
"windows python installation guide",
|
||||
"recommend good pizza places brooklyn heights"
|
||||
]
|
||||
Set B: [
|
||||
"macbook setup guide",
|
||||
"restaurant recommendations manhattan"
|
||||
]
|
||||
Thought: Let's analyze set A both internally and against B:
|
||||
1. The first python installation query is unique
|
||||
2. The first pizza query is unique
|
||||
3. The second python query is a duplicate of the first
|
||||
4. The second pizza query is a duplicate of the earlier one
|
||||
Neither query in set B is similar enough to affect our decisions.
|
||||
Unique Queries: [
|
||||
"how to install python on windows",
|
||||
"what's the best pizza in brooklyn heights"
|
||||
]
|
||||
|
||||
Now, analyze these sets:
|
||||
Set A: ${JSON.stringify(newQueries)}
|
||||
Set B: ${JSON.stringify(existingQueries)}`;
|
||||
}
|
||||
|
||||
/**
 * Checks that a parsed model reply carries a `unique_queries` array made up
 * exclusively of strings.
 */
function hasUniqueQueries(value: unknown): value is Pick<DedupResponse, 'unique_queries'> {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = (value as { unique_queries?: unknown }).unique_queries;
  return Array.isArray(candidate) && candidate.every(item => typeof item === 'string');
}

/**
 * Asks the model which of `newQueries` are semantically unique, both among
 * themselves and with respect to `existingQueries`.
 *
 * @param newQueries - Candidate queries (set A in the prompt).
 * @param existingQueries - Queries that are already known (set B in the prompt).
 * @returns The `unique_queries` array reported by the model. An empty
 *   `newQueries` yields `[]` without calling the model.
 * @throws Re-throws, after logging, any failure from the model call or from
 *   JSON parsing, and throws an `Error` when the reply lacks a
 *   `unique_queries` array of strings.
 */
export async function dedupQueries(newQueries: string[], existingQueries: string[]): Promise<string[]> {
  // Nothing to deduplicate: skip the model round-trip entirely.
  if (newQueries.length === 0) return [];

  try {
    const prompt = getPrompt(newQueries, existingQueries);
    const result = await model.generateContent(prompt);
    const response = await result.response;
    // The reply is external data: keep it `unknown` until its shape is checked.
    const json: unknown = JSON.parse(response.text());
    console.log('Analysis:', json);
    if (!hasUniqueQueries(json)) {
      // Callers spread the result into arrays, so a missing or malformed
      // field must fail here rather than surface later as `undefined`.
      throw new Error('Deduplication response is missing a "unique_queries" array of strings');
    }
    return json.unique_queries;
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Parses one command-line argument that should contain a JSON array of
 * strings.
 *
 * @param raw - The raw argument; missing or empty means "no queries".
 * @param label - Name used in error messages to identify the argument.
 * @returns The parsed list, or `[]` when the argument is absent.
 * @throws Error when the argument is not valid JSON or not an array of strings.
 */
function parseQueryListArg(raw: string | undefined, label: string): string[] {
  if (!raw) return [];

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`${label} must be a JSON array of strings: ${reason}`);
  }

  if (!Array.isArray(parsed) || !parsed.every((item): item is string => typeof item === 'string')) {
    throw new Error(`${label} must be a JSON array of strings`);
  }
  return parsed;
}

// Example usage
/**
 * Command-line driver: reads set A from `process.argv[2]` and set B from
 * `process.argv[3]` (both JSON arrays of strings, both optional), runs the
 * deduplication and prints the result.
 *
 * A failed deduplication is logged and marks the process as failed through
 * `process.exitCode`; malformed arguments reject the returned promise.
 */
async function main() {
  const newQueries = parseQueryListArg(process.argv[2], 'New queries (argv[2])');
  const existingQueries = parseQueryListArg(process.argv[3], 'Existing queries (argv[3])');

  console.log('\nNew Queries (Set A):', newQueries);
  console.log('Existing Queries (Set B):', existingQueries);

  try {
    const uniqueQueries = await dedupQueries(newQueries, existingQueries);
    console.log('Unique Queries:', uniqueQueries);
  } catch (error) {
    console.error('Failed to deduplicate queries:', error);
    // Let shells and scripts see that the run did not succeed.
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Run the example only when this file is the program entry point, not when
// it is loaded by another module.
const isDirectRun = require.main === module;
if (isDirectRun) {
  main().catch((err) => console.error(err));
}
|
||||
Loading…
x
Reference in New Issue
Block a user