feat: add num urls

This commit is contained in:
Han Xiao 2025-03-14 15:18:50 +08:00
parent f742478d15
commit f5d6bf75f5
3 changed files with 153 additions and 115 deletions

View File

@ -74,18 +74,23 @@ ${k.answer}
return messages;
}
function composeMsgs(messages: CoreMessage[], knowledge: KnowledgeItem[], question: string, finalAnswerPIP?: string) {
function composeMsgs(messages: CoreMessage[], knowledge: KnowledgeItem[], question: string, finalAnswerPIP?: string[]) {
// knowledge always put to front, followed by real u-a interaction
const msgs = [...BuildMsgsFromKnowledge(knowledge), ...messages];
const userContent = `
${question}
${finalAnswerPIP ? `
${finalAnswerPIP?.length ? `
<answer-requirements>
- You provide deep, unexpected insights, identifying hidden patterns and connections, and creating "aha moments.".
- You break conventional thinking, establish unique cross-disciplinary connections, and bring new perspectives to the user.
${finalAnswerPIP}
- Follow reviewer's feedback and improve your answer quality.
${finalAnswerPIP.map((p, idx) => `
<reviewer-${idx + 1}>
${p}
</reviewer-${idx + 1}>
`).join('\n')}
</answer-requirements>` : ''}
`.trim();
@ -233,6 +238,91 @@ function updateContext(step: any) {
allContext.push(step)
}
/**
 * Runs a batch of search queries one after another against the configured
 * `SEARCH_PROVIDER` and turns the hits into knowledge items.
 *
 * Side effects visible to the caller:
 * - a single `search_for` think event listing every incoming `q` is tracked up front;
 * - `query.q` may be rewritten in place with a ` site:<host>` suffix (see below);
 * - every hit is registered in `allURLs` through `addToAllURLs`.
 *
 * @param keywordsQueries query objects; `q` is the query text, `tbs` (optional)
 *                        triggers the `updated` date range on the knowledge item
 * @param context         provides `actionTracker` and `tokenTracker`
 * @param allURLs         URL registry; read for host statistics and extended with new hits
 * @param SchemaGen       only `languageCode` is read here
 * @returns `newKnowledge` — one `side-info` item per query that produced results;
 *          `searchedQueries` — the (possibly site-restricted) text of those queries
 */
async function executeSearchQueries(
  keywordsQueries: any[],
  context: TrackerContext,
  allURLs: Record<string, SearchSnippet>,
  SchemaGen: any
): Promise<{
  newKnowledge: KnowledgeItem[],
  searchedQueries: string[]
}> {
  const keywordList = keywordsQueries.map(item => item.q).join(', ');
  const collectedKnowledge: KnowledgeItem[] = [];
  const executedQueries: string[] = [];

  context.actionTracker.trackThink('search_for', SchemaGen.languageCode, {keywords: keywordList});

  for (const query of keywordsQueries) {
    // Remember the text as requested; the knowledge item below quotes this, not the rewritten query.
    const originalText = query.q;
    let hits: SearchResult[] = [];
    let succeeded = false;

    try {
      let effectiveQuery = query.q;

      // Hosts already seen in allURLs, most frequent first.
      const hostRanking = Object.entries(
        countUrlParts(Object.values(allURLs)).hostnameCount
      ).sort((left, right) => right[1] - left[1]);

      // explore-exploit: with probability 0.2, pin the query to one sampled known host,
      // unless the query already carries its own site: operator.
      const pinToHost = hostRanking.length > 0 && Math.random() < 0.2 && !query.q.includes('site:');
      if (pinToHost) {
        effectiveQuery = query.q + ' site:' + sampleMultinomial(hostRanking);
        // Written back on purpose: the serper branch receives the whole query object.
        query.q = effectiveQuery;
      }

      console.log('Search query:', query);

      switch (SEARCH_PROVIDER) {
        case 'jina': {
          const jinaOutcome = await search(effectiveQuery, context.tokenTracker);
          hits = jinaOutcome.response?.data || [];
          break;
        }
        case 'duck': {
          const duckOutcome = await duckSearch(effectiveQuery, {safeSearch: SafeSearchType.STRICT});
          hits = duckOutcome.results;
          break;
        }
        case 'brave': {
          const braveOutcome = await braveSearch(effectiveQuery);
          hits = braveOutcome.response.web?.results || [];
          break;
        }
        case 'serper': {
          const serperOutcome = await serperSearch(query);
          hits = serperOutcome.response.organic || [];
          break;
        }
        default:
          hits = [];
      }

      // An empty result set is handled exactly like a provider failure.
      if (hits.length === 0) {
        throw new Error('No results found');
      }
      succeeded = true;
    } catch (error) {
      console.error(`${SEARCH_PROVIDER} search failed for query:`, query, error);
    } finally {
      // Pause after every attempt, whether it succeeded or not.
      await sleep(STEP_SLEEP);
    }

    if (!succeeded) {
      continue;
    }

    // Providers disagree on field names (url/link, description/snippet); normalise them.
    const snippets: SearchSnippet[] = hits.map(r => ({
      title: r.title,
      url: normalizeUrl('url' in r ? r.url : r.link),
      description: 'description' in r ? r.description : r.snippet,
      weight: 1
    }));

    for (const snippet of snippets) {
      addToAllURLs(snippet, allURLs);
    }

    executedQueries.push(query.q);
    collectedKnowledge.push({
      question: `What do Internet say about "${originalText}"?`,
      answer: removeHTMLtags(snippets.map(s => s.description).join('; ')),
      type: 'side-info',
      updated: query.tbs ? formatDateRange(query) : undefined
    });
  }

  return {
    newKnowledge: collectedKnowledge,
    searchedQueries: executedQueries
  };
}
export async function getResponse(question?: string,
tokenBudget: number = 1_000_000,
@ -275,7 +365,7 @@ export async function getResponse(question?: string,
let schema: ZodObject<any> = SchemaGen.getAgentSchema(true, true, true, true, true)
const gaps: string[] = [question]; // All questions to be answered including the orginal question
const allQuestions = [question];
const allKeywords = [];
const allKeywords: string[] = [];
const allKnowledge: KnowledgeItem[] = []; // knowledge are intermedidate questions that are answered
let diaryContext = [];
@ -286,6 +376,7 @@ export async function getResponse(question?: string,
let allowReflect = true;
let allowCoding = true;
let system = '';
let maxStrictEvals = 2;
let msgWithKnowledge: CoreMessage[] = [];
let thisStep: StepAction = {action: 'answer', answer: '', references: [], think: '', isFinal: false};
@ -294,7 +385,7 @@ export async function getResponse(question?: string,
const evaluationMetrics: Record<string, EvaluationType[]> = {};
// reserve the 10% final budget for the beast mode
const regularBudget = tokenBudget * 0.9;
let finalAnswerPIP: string = '';
const finalAnswerPIP: string[] = [];
while (context.tokenTracker.getTotalUsage().totalTokens < regularBudget && badAttempts <= maxBadAttempts) {
// add 1s delay to avoid rate limiting
step++;
@ -469,11 +560,14 @@ Your journey ends here. You have successfully answered the original question. Co
thisStep.isFinal = true;
break
} else {
if (evaluation.type === 'strict') {
finalAnswerPIP = evaluation.improvement_plan || '';
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
if (evaluation.type === 'strict' && evaluation.improvement_plan) {
finalAnswerPIP.push(evaluation.improvement_plan);
maxStrictEvals--;
if (maxStrictEvals <= 0) {
// remove 'strict' from the evaluation metrics
console.log('Remove `strict` from evaluation metrics')
evaluationMetrics[currentQuestion] = evaluationMetrics[currentQuestion].filter(e => e !== 'strict');
}
}
if (badAttempts >= maxBadAttempts) {
thisStep.isFinal = false;
@ -585,8 +679,21 @@ But then you realized you have asked them before. You decided to to think out of
// dedup search requests
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
// rewrite queries
let keywordsQueries = await rewriteQuery(thisStep, context, SchemaGen);
// do first search
const {searchedQueries, newKnowledge} = await executeSearchQueries(
thisStep.searchRequests.map(q => ({q})),
context,
allURLs,
SchemaGen
);
allKeywords.push(...searchedQueries);
allKnowledge.push(...newKnowledge);
const soundBites = newKnowledge.map(k => k.answer).join(' ');
// rewrite queries with initial soundbites
let keywordsQueries = await rewriteQuery(thisStep, soundBites, context, SchemaGen);
const qOnly = keywordsQueries.filter(q => q.q).map(q => q.q)
// avoid existing searched queries
const uniqQOnly = chooseK((await dedupQueries(qOnly, allKeywords, context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
@ -595,70 +702,16 @@ But then you realized you have asked them before. You decided to to think out of
let anyResult = false;
if (keywordsQueries.length > 0) {
context.actionTracker.trackThink('search_for', SchemaGen.languageCode, {keywords: uniqQOnly.join(', ')});
for (const query of keywordsQueries) {
const {searchedQueries, newKnowledge} =
await executeSearchQueries(
keywordsQueries,
context,
allURLs,
SchemaGen
);
let results: SearchResult[] = []
const oldQuery = query.q;
try {
let siteQuery = query.q;
const topHosts = Object.entries(countUrlParts(
Object.entries(allURLs).map(([, result]) => result)
).hostnameCount).sort((a, b) => b[1] - a[1]);
if (topHosts.length > 0 && Math.random() < 0.2 && !query.q.includes('site:')) {
// explore-exploit
siteQuery = query.q + ' site:' + sampleMultinomial(topHosts);
query.q = siteQuery;
}
console.log('Search query:', query);
switch (SEARCH_PROVIDER) {
case 'jina':
results = (await search(siteQuery, context.tokenTracker)).response?.data || [];
break;
case 'duck':
results = (await duckSearch(siteQuery, {safeSearch: SafeSearchType.STRICT})).results;
break;
case 'brave':
results = (await braveSearch(siteQuery)).response.web?.results || [];
break;
case 'serper':
results = (await serperSearch(query)).response.organic || [];
break;
default:
results = [];
}
if (results.length === 0) {
throw new Error('No results found');
}
} catch (error) {
console.error(`${SEARCH_PROVIDER} search failed for query:`, query, error);
continue
} finally {
await sleep(STEP_SLEEP)
}
const minResults: SearchSnippet[] = (results).map(r => ({
title: r.title,
url: normalizeUrl('url' in r ? r.url : r.link),
description: 'description' in r ? r.description : r.snippet,
weight: 1
}));
minResults.forEach(r => {
addToAllURLs(r, allURLs);
});
allKeywords.push(query.q);
allKnowledge.push({
question: `What do Internet say about "${oldQuery}"?`,
answer: removeHTMLtags(minResults.map(r => r.description).join('; ')),
type: 'side-info',
updated: query.tbs ? formatDateRange(query) : undefined
});
}
allKeywords.push(...searchedQueries);
allKnowledge.push(...newKnowledge);
diaryContext.push(`
At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}".

View File

@ -3,13 +3,15 @@ import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
function getPrompt(query: string, think: string): PromptPair {
function getPrompt(query: string, think: string, context: string): PromptPair {
const currentTime = new Date();
const currentYear = currentTime.getFullYear();
const currentMonth = currentTime.getMonth() + 1;
return {
system: `You are an expert search query generator with deep psychological understanding. You optimize user queries by extensively analyzing potential user intents and generating comprehensive search variations that follow the required schema format.
system: `
You are an expert search query expander with deep psychological understanding.
You optimize user queries by extensively analyzing potential user intents and generating comprehensive query variations.
The current time is ${currentTime.toISOString()}. Current year: ${currentYear}, current month: ${currentMonth}.
@ -31,23 +33,19 @@ Map each query through ALL these layers, especially focusing on uncovering Shado
Generate ONE optimized query from each of these cognitive perspectives:
1. Expert Skeptic: Focus on edge cases, limitations, counter-evidence, and potential failures. Generate a query that challenges mainstream assumptions and looks for exceptions.
2. Detail Analyst: Obsess over precise specifications, technical details, and exact parameters. Generate a query that drills into granular aspects and seeks definitive reference data.
3. Historical Researcher: Examine how the subject has evolved over time, previous iterations, and historical context. Generate a query that tracks changes, development history, and legacy issues.
4. Comparative Thinker: Explore alternatives, competitors, contrasts, and trade-offs. Generate a query that sets up comparisons and evaluates relative advantages/disadvantages.
5. Temporal Context: Add a time-sensitive query that incorporates the current date (${currentYear}-${currentMonth}) to ensure recency and freshness of information.
6. Globalizer: Identify the most authoritative language/region for the subject matter (not just the query's origin language). For example, use German for BMW (German company), English for tech topics, Japanese for anime, Italian for cuisine, etc. Generate a search in that language to access native expertise.
7. Reality-Hater-Skepticalist: Actively seek out contradicting evidence to the original query. Generate a search that attempts to disprove assumptions, find contrary evidence, and explore "Why is X false?" or "Evidence against X" perspectives.
Ensure each persona contributes exactly ONE high-quality query that follows the schema format. These 7 queries will be combined into a final array.
</cognitive-personas>
<rules>
Leverage the soundbites from the context user provides to generate queries that are contextually relevant.
1. Query content rules:
- Split queries for distinct aspects
- Add operators only when necessary
@ -77,15 +75,9 @@ Note: A query can't only have operators; and operators can't be at the start of
<example-1>
Input Query: 宝马二手车价格
<think>
...
怀
${currentYear}
怀
...西${currentYear}
</think>
queries: [
{
@ -119,15 +111,9 @@ queries: [
<example-2>
Input Query: sustainable regenerative agriculture soil health restoration techniques
<think>
Surface intent is to find techniques for restoring soil health through regenerative agriculture practices. Practical intent includes implementing these methods on a farm or garden to improve crop yields and sustainability. Emotional intent may involve anxiety about climate change and environmental degradation, along with hope for solutions. Social intent could include wanting to connect with the regenerative farming community or appear knowledgeable among environmentally-conscious peers. Identity intent relates to seeing oneself as an environmental steward or innovative farmer. Taboo intent might involve seeking ways to bypass regulations or avoid conventional farming practices without facing social judgment. Shadow intent could include displacement activityresearching rather than implementing changesor seeking validation for convictions about industrial farming's harmfulness.
Sustainable regenerative agriculture soil health restoration techniques... interesting search. They're probably looking to fix depleted soil on their farm or garden. Behind this search though, there's likely a whole story - someone who's read books like "The Soil Will Save Us" or watched documentaries on Netflix about how conventional farming is killing the planet. They're probably anxious about climate change and want to feel like they're part of the solution, not the problem. Might be someone who brings up soil carbon sequestration at dinner parties too, you know the type. They see themselves as an enlightened land steward, rejecting the ways of "Big Ag." Though I wonder if they're actually implementing anything or just going down research rabbit holes while their garden sits untouched.
Expert Skeptic: Examine the limitations, failures, and potential negative consequences of regenerative agriculture techniques.
Detail Analyst: Investigate specific soil biome metrics, carbon sequestration measurements, and implementation parameters for different techniques.
Historical Researcher: Explore traditional indigenous land management practices that preceded modern regenerative agriculture concepts.
Comparative Thinker: Compare effectiveness and ROI of different soil restoration approaches across various climate zones and soil types.
Temporal Context: Find the most recent ${currentYear} research trials and field studies on innovative soil restoration methods.
Globalizer: Look for techniques developed in regions with longstanding sustainable agriculture traditions like Austria's alpine farming or Australia's dryland farming innovations.
Reality-Hater-Skepticalist: Search for evidence that regenerative agriculture's benefits are overstated or cannot scale to commercial agriculture needs.
Let me think about this from different angles... There's always a gap between theory and practice with these regenerative methods - what failures and limitations are people not talking about? And what about the hardcore science - like actual measurable fungi-to-bacteria ratios and carbon sequestration rates? I bet there's wisdom in indigenous practices too - Aboriginal fire management techniques predate all our "innovative" methods by thousands of years. Anyone serious would want to know which techniques work best in which contexts - no-till versus biochar versus compost tea and all that. ${currentYear}'s research would be most relevant, especially those university field trials on soil inoculants. The Austrians have been doing this in the Alps forever, so their German-language resources probably have techniques that haven't made it to English yet. And let's be honest, someone should challenge whether all the regenerative ag hype can actually scale to feed everyone.
</think>
queries: [
{
@ -177,15 +163,9 @@ queries: [
<example-3>
Input Query: KIリテラシー向上させる方法
<think>
AIリテラシーを高める方法を求めているAIツールを効果的に活用し職場での生産性向上を図ることAI進化に取り残される不安があるAI知識豊富な人物として評価されたいAI基礎知識の欠如を隠している
AIリテラシー向上させる方法か...AIがどんどん話題になってきてAIの知識を増やしたいってことだけどAIツールをうまく使いこなして一目置かれたいんじゃないかなChatGPTでこんなことができるAIの知識がなくて
AI技術の限界と誇大宣伝を暴く視点で検索
AIリテラシーの具体的なスキル階層と学習方法を探求
AI技術の歴史的発展と過去のブームから学ぶ教訓を調査
AIリテラシーと他のデジタルスキルを比較分析
${currentYear}AI動向と必要スキルに焦点
AI研究の中心は英語圏のため
AIリテラシー向上が無意味である可能性を探る
...AIって実際どこまでできるんだろうAIリテラシーって言ってもAI革命AIリテラシーって何なのかもはっきりさせたいよね${currentYear}AIトレンドは特に変化が速そうだからAIリテラシーを身につける必要があるのか
</think>
queries: [
{
@ -227,21 +207,26 @@ queries: [
Each generated query must follow JSON schema format.
`,
user: `
${query}
My original search query is: "${query}"
<think>${think} Please add correct 'tbs' you think the query requires time-sensitive results.
My motivation is: ${think}
So I briefly googled "${query}" and found some soundbites about this topic, hope it gives you a rough idea about my context and topic:
<random-soundbites>
${context}
</random-soundbites>
Given those info, now please generate the best effective queries that follow JSON schema format; add correct 'tbs' you believe the query requires time-sensitive results.
`
};
}
const TOOL_NAME = 'queryRewriter';
export async function rewriteQuery(action: SearchAction, trackers: TrackerContext, schemaGen: Schemas): Promise<SERPQuery[] > {
export async function rewriteQuery(action: SearchAction, context: string, trackers: TrackerContext, schemaGen: Schemas): Promise<SERPQuery[] > {
try {
const generator = new ObjectGeneratorSafe(trackers.tokenTracker);
const allQueries = action.searchRequests.map(q => ({ q })) as SERPQuery[];
const queryPromises = action.searchRequests.map(async (req) => {
const prompt = getPrompt(req, action.think);
const prompt = getPrompt(req, action.think, context);
const result = await generator.generateObject({
model: TOOL_NAME,
schema: schemaGen.getQueryRewriterSchema(),
@ -253,7 +238,7 @@ export async function rewriteQuery(action: SearchAction, trackers: TrackerContex
});
const queryResults = await Promise.all(queryPromises);
queryResults.forEach(queries => allQueries.push(...queries));
const allQueries: SERPQuery[] = queryResults.flat();
console.log(TOOL_NAME, allQueries);
return allQueries;
} catch (error) {

View File

@ -452,7 +452,7 @@ export async function processURLs(
return {url, result: response};
} catch (error) {
console.error('Error reading URL:', error);
console.error('Error reading URL:', url, error);
return null;
} finally {
visitedURLs.push(url);