chore: first commit

This commit is contained in:
Han Xiao 2025-01-27 14:26:07 +08:00
parent cd35dc7966
commit 2415ec3ebd
2 changed files with 188 additions and 18 deletions

View File

@ -5,6 +5,7 @@ import {readUrl} from "./tools/read";
import fs from 'fs/promises';
import {SafeSearchType, search} from "duck-duck-scrape";
import {rewriteQuery} from "./tools/query-rewriter";
import {dedupQueries} from "./tools/dedup";
// Proxy setup remains the same
if (process.env.https_proxy) {
@ -18,6 +19,32 @@ if (process.env.https_proxy) {
}
dotenv.config();
/**
 * Waits for roughly `ms` milliseconds while drawing a spinner and a
 * seconds-remaining countdown on stdout.
 *
 * When stdout is not a terminal (piped or redirected output) the animation is
 * skipped and the function only waits, so logs are not flooded with spinner
 * frames and ANSI escape sequences.
 *
 * @param ms - Delay in milliseconds. Values that are not greater than zero
 *             resolve without waiting.
 * @returns A promise that resolves once the delay has elapsed.
 */
async function sleep(ms: number): Promise<void> {
  const frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
  const tickMs = 50;
  const pause = (delay: number) => new Promise<void>(resolve => setTimeout(resolve, delay));

  // No terminal attached: wait silently instead of animating.
  if (!process.stdout.isTTY) {
    if (ms > 0) {
      await pause(ms);
    }
    return;
  }

  const endTime = Date.now() + ms;

  // Hide the cursor while the spinner is on screen.
  process.stdout.write('\x1B[?25l');
  try {
    let remainingMs = endTime - Date.now();
    while (remainingMs > 0) {
      const remaining = Math.ceil(remainingMs / 1000);
      const frameIndex = Math.floor(Date.now() / 100) % frames.length;

      // Return to column 0 and overwrite the previous frame.
      process.stdout.write(`\r${frames[frameIndex]} Cool down... ${remaining}s remaining`);

      // Clamp the last tick so the total wait does not run past the requested delay.
      await pause(Math.min(tickMs, remainingMs));
      remainingMs = endTime - Date.now();
    }
  } finally {
    // Clear the line, show the cursor again and move to the next line,
    // even if one of the writes above threw.
    process.stdout.write('\r\x1B[K\x1B[?25h\n');
  }
}
type ResponseSchema = {
type: SchemaType.OBJECT;
properties: {
@ -207,10 +234,11 @@ async function getResponse(question: string) {
let step = 0;
let gaps: string[] = [question]; // All questions to be answered including the orginal question
let allQuestions = [question];
let allKeywords = [];
while (totalTokens < tokenBudget) {
// add 1s delay to avoid rate limiting
await new Promise(resolve => setTimeout(resolve, 1000));
await sleep(1000);
step++;
console.log('===STEPS===', step)
console.log('Gaps:', gaps)
@ -251,28 +279,39 @@ async function getResponse(question: string) {
}
if (action.action === 'reflect' && action.questionsToAnswer) {
gaps.push(...action.questionsToAnswer);
allQuestions.push(...action.questionsToAnswer);
let newGapQuestions = action.questionsToAnswer
if (allQuestions.length) {
newGapQuestions = await dedupQueries(newGapQuestions, allQuestions)
}
gaps.push(...newGapQuestions);
allQuestions.push(...newGapQuestions);
gaps.push(question); // always keep the original question in the gaps
}
// Rest of the action handling remains the same
try {
if (action.action === 'search' && action.searchQuery) {
const keywordsQueries = await rewriteQuery(action.searchQuery);
const searchResults = await Promise.all(
keywordsQueries.map(async (query) => {
const results = await search(query, {
safeSearch: SafeSearchType.STRICT
});
const minResults = results.results.map(r => ({
title: r.title,
url: r.url,
description: r.description,
}));
return {query, minResults};
})
);
// rewrite queries
let keywordsQueries = await rewriteQuery(action.searchQuery);
// avoid existing searched queries
if (allKeywords.length) {
keywordsQueries = await dedupQueries(keywordsQueries, allKeywords)
}
const searchResults = [];
for (const query of keywordsQueries) {
const results = await search(query, {
safeSearch: SafeSearchType.STRICT
});
const minResults = results.results.map(r => ({
title: r.title,
url: r.url,
description: r.description,
}));
searchResults.push({query, minResults});
allKeywords.push(query);
await sleep(5000);
}
context.push({
step,
question: currentQuestion,
@ -314,7 +353,7 @@ const jinaToken = process.env.JINA_API_KEY as string;
if (!apiKey) throw new Error("GEMINI_API_KEY not found");
if (!jinaToken) throw new Error("JINA_API_KEY not found");
const modelName = 'gemini-2.0-flash-exp';
const modelName = 'gemini-1.5-flash';
const genAI = new GoogleGenerativeAI(apiKey);
const question = process.argv[2] || "";

131
src/tools/dedup.ts Normal file
View File

@ -0,0 +1,131 @@
import { GoogleGenerativeAI, SchemaType } from "@google/generative-ai";
import dotenv from 'dotenv';
import { ProxyAgent, setGlobalDispatcher } from "undici";
// Load .env before anything below reads process.env, so that values defined
// there (https_proxy as well as GEMINI_API_KEY) are visible to this module.
dotenv.config();

// Proxy setup: when https_proxy is set, install a ProxyAgent as undici's
// global dispatcher.
if (process.env.https_proxy) {
  try {
    // new URL() throws on a malformed value; that is reported below and the
    // module continues without a proxy.
    const proxyUrl = new URL(process.env.https_proxy).toString();
    const dispatcher = new ProxyAgent({ uri: proxyUrl });
    setGlobalDispatcher(dispatcher);
  } catch (error) {
    console.error('Failed to set proxy:', error);
  }
}

// Fail fast at import time when the Gemini API key is missing.
const apiKey = process.env.GEMINI_API_KEY;
if (!apiKey) {
  throw new Error("GEMINI_API_KEY not found in environment variables");
}
/** JSON payload the model is asked to return for a deduplication request. */
interface DedupResponse {
  /** Free-text reasoning produced by the model. */
  thought: string;
  /** Queries the model reports as unique. */
  unique_queries: Array<string>;
}
// Schema for the "thought" field: a single string.
const thoughtSchema = {
  type: SchemaType.STRING,
  description: "Strategic reasoning about the overall deduplication approach"
};

// Schema for the "unique_queries" field: an array of strings.
const uniqueQueriesSchema = {
  type: SchemaType.ARRAY,
  items: {
    type: SchemaType.STRING
  },
  description: "Array of semantically unique queries from set A"
};

// Structured-output schema handed to the model; both fields are required.
const responseSchema = {
  type: SchemaType.OBJECT,
  properties: {
    thought: thoughtSchema,
    unique_queries: uniqueQueriesSchema
  },
  required: ["thought", "unique_queries"]
};
// Name of the Gemini model used for deduplication requests.
const modelName = 'gemini-1.5-flash';

// SDK client created with the API key read from the environment.
const genAI = new GoogleGenerativeAI(apiKey);

// Generation settings: temperature 0.1 and JSON output described by responseSchema.
const generationConfig = {
  temperature: 0.1,
  responseMimeType: "application/json",
  responseSchema
};

// Model handle shared by every dedupQueries call in this module.
const model = genAI.getGenerativeModel({ model: modelName, generationConfig });
/**
 * Builds the deduplication prompt sent to the model.
 *
 * The prompt is made of a task statement, the core rules, one worked example,
 * and finally the two query sets serialized as JSON arrays.
 *
 * @param newQueries - Candidate queries (set A in the prompt).
 * @param existingQueries - Queries already known (set B in the prompt).
 * @returns The full prompt text, with sections separated by single newlines.
 */
function getPrompt(newQueries: string[], existingQueries: string[]): string {
  const setA = JSON.stringify(newQueries);
  const setB = JSON.stringify(existingQueries);

  const task = `You are an expert in semantic similarity analysis. Given a set of new queries (A) and existing queries (B), identify which queries from set A are semantically unique when compared BOTH to other queries within A AND to queries in set B.`;

  // The template literals below are kept flush-left on purpose: any
  // indentation inside them would become part of the prompt.
  const rules = `Core Rules:
1. Consider semantic meaning and query intent, not just lexical similarity
2. Account for different phrasings of the same information need
3. A query is considered duplicate if its core information need is already covered by:
- ANY earlier query in set A (earlier = appears before in the array)
- OR any query in set B
4. Be conservative - only mark as duplicate if very similar
5. Different aspects or perspectives of the same topic are not duplicates
6. Consider query specificity - a more specific query might not be a duplicate of a general one
7. For duplicates within set A, always keep the FIRST occurrence and mark later ones as duplicates`;

  const example = `Examples:
Set A: [
"how to install python on windows",
"what's the best pizza in brooklyn heights",
"windows python installation guide",
"recommend good pizza places brooklyn heights"
]
Set B: [
"macbook setup guide",
"restaurant recommendations manhattan"
]
Thought: Let's analyze set A both internally and against B:
1. The first python installation query is unique
2. The first pizza query is unique
3. The second python query is a duplicate of the first
4. The second pizza query is a duplicate of the earlier one
Neither query in set B is similar enough to affect our decisions.
Unique Queries: [
"how to install python on windows",
"what's the best pizza in brooklyn heights"
]`;

  const sections = [
    task,
    rules,
    example,
    'Now, analyze these sets:',
    `Set A: ${setA}`,
    `Set B: ${setB}`
  ];
  return sections.join('\n');
}
/**
 * Asks the model which of `newQueries` are semantically unique, compared both
 * with each other and with `existingQueries`.
 *
 * @param newQueries - Candidate queries (set A in the prompt).
 * @param existingQueries - Queries already known (set B in the prompt).
 * @returns The `unique_queries` array reported by the model, or `[]` without
 *          calling the model when `newQueries` is empty.
 * @throws Rethrows any failure of the model call or of JSON parsing, and
 *         throws an `Error` when the parsed response has no string array
 *         under `unique_queries`. The error is logged before it is rethrown.
 */
export async function dedupQueries(newQueries: string[], existingQueries: string[]): Promise<string[]> {
  // Nothing to deduplicate: skip the model round-trip entirely.
  if (newQueries.length === 0) {
    return [];
  }

  try {
    const prompt = getPrompt(newQueries, existingQueries);
    const result = await model.generateContent(prompt);
    const response = await result.response;

    // The response text is model output: parse it as unknown and check its
    // shape instead of trusting a type assertion.
    const json: unknown = JSON.parse(response.text());
    console.log('Analysis:', json);

    if (!hasUniqueQueries(json)) {
      throw new Error('Deduplication response does not contain a string array "unique_queries"');
    }
    return json.unique_queries;
  } catch (error) {
    console.error('Error in deduplication analysis:', error);
    throw error;
  }
}

/** Runtime check that a parsed response carries `unique_queries` as an array of strings. */
function hasUniqueQueries(value: unknown): value is { unique_queries: string[] } {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const candidate = (value as Record<string, unknown>).unique_queries;
  return Array.isArray(candidate) && candidate.every(item => typeof item === 'string');
}
/**
 * Example command-line usage.
 *
 * Reads two optional JSON arguments from the command line (new queries, then
 * existing queries), prints both, runs `dedupQueries` on them and prints the
 * result. A failure of `dedupQueries` is logged rather than rethrown.
 */
async function main() {
  // A missing or empty argument stands for an empty list.
  const parseArg = (raw: string | undefined) => (raw ? JSON.parse(raw) : []);

  const [, , rawNew, rawExisting] = process.argv;
  const newQueries = parseArg(rawNew);
  const existingQueries = parseArg(rawExisting);

  console.log('\nNew Queries (Set A):', newQueries);
  console.log('Existing Queries (Set B):', existingQueries);

  let uniqueQueries;
  try {
    uniqueQueries = await dedupQueries(newQueries, existingQueries);
  } catch (error) {
    console.error('Failed to deduplicate queries:', error);
    return;
  }
  console.log('Unique Queries:', uniqueQueries);
}
// Run the example only when this file is the program's entry point,
// not when it is imported by another module.
if (module === require.main) {
  main().catch((err) => console.error(err));
}