diff --git a/src/agent.ts b/src/agent.ts
index 3c843f1..23e56b6 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -19,7 +19,7 @@ import {zodToJsonSchema} from "zod-to-json-schema";
import {ObjectGeneratorSafe} from "./utils/safe-generator";
import {CodeSandbox} from "./tools/code-sandbox";
import {serperSearch} from './tools/serper-search';
-import {calculateBoostedWeights, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
+import {calculateBoostedWeights, countUrlParts, getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags, smartMergeStrings} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
@@ -41,7 +41,7 @@ function getPrompt(
allowCoding: boolean = true,
badContext?: { question: string, answer: string, evaluation: string, recap: string; blame: string; improvement: string; }[],
knowledge?: KnowledgeItem[],
- allURLs?: SearchResult[],
+ allURLs?: BoostedSearchResult[],
beastMode?: boolean,
): string {
const sections: string[] = [];
@@ -65,10 +65,10 @@ ${k.question}
${k.answer}
-${k.references ? `
-
-${JSON.stringify(k.references)}
-
+${k.references && k.type === 'url' ? `
+
+${k.references[0]}
+
` : ''}
`)
@@ -132,9 +132,7 @@ ${learnedStrategy}
if (allowRead) {
let urlList = '';
if (allURLs && allURLs.length > 0) {
- const weightedURLs = calculateBoostedWeights(allURLs) as BoostedSearchResult[]
-
- urlList = (weightedURLs)
+ urlList = (allURLs)
.filter(r => 'url' in r)
.sort((a, b) => (b.boostedWeight || 0) - (a.boostedWeight || 0))
.map(r => ` + weight: ${r.boostedWeight.toFixed(2)} "${r.url}": "${r.title}"`)
@@ -278,6 +276,7 @@ export async function getResponse(question?: string,
const badContext = [];
let diaryContext = [];
+ let weightedURLs: BoostedSearchResult[] = [];
let allowAnswer = true;
let allowSearch = true;
let allowRead = true;
@@ -318,7 +317,12 @@ export async function getResponse(question?: string,
// update all urls with buildURLMap
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
- allowSearch = allowSearch && (getUnvisitedURLs(allURLs, visitedURLs).length < 70); // disable search when too many urls already
+ if (allURLs && Object.keys(allURLs).length > 0) {
+ // compute boosted weights for the URLs not yet visited; the result is reused for the prompt and the visit step
+ weightedURLs = calculateBoostedWeights(getUnvisitedURLs(allURLs, visitedURLs));
+ }
+
+ allowSearch = allowSearch && (weightedURLs.length < 70); // disable search when too many urls already
// generate prompt for this step
system = getPrompt(
@@ -332,7 +336,7 @@ export async function getResponse(question?: string,
allowCoding,
badContext,
allKnowledge,
- getUnvisitedURLs(allURLs, visitedURLs),
+ weightedURLs,
false,
);
schema = SchemaGen.getAgentSchema(allowReflect, allowRead, allowAnswer, allowSearch, allowCoding, finalAnswerPIP)
@@ -436,14 +440,6 @@ ${evaluation.think}
// store the bad context and reset the diary context
const errorAnalysis = await analyzeSteps(diaryContext, context, SchemaGen);
- allKnowledge.push({
- question: currentQuestion,
- answer: thisStep.answer,
- references: thisStep.references,
- type: 'qa',
- updated: new Date().toISOString()
- });
-
badContext.push({
question: currentQuestion,
answer: thisStep.answer,
@@ -526,6 +522,9 @@ But then you realized you have asked them before. You decided to to think out of
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
// rewrite queries
+ console.log(countUrlParts(weightedURLs).hostnameCount)
+ const topHosts = Object.entries(countUrlParts(weightedURLs).hostnameCount).sort((a, b) => b[1] - a[1]).map(([host]) => host).slice(0, 2);
+ console.log(topHosts)
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context, SchemaGen);
// avoid exisitng searched queries
keywordsQueries = chooseK((await dedupQueries(keywordsQueries, allKeywords, context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
@@ -533,8 +532,6 @@ But then you realized you have asked them before. You decided to to think out of
let anyResult = false;
if (keywordsQueries.length > 0) {
-
-
context.actionTracker.trackThink('search_for', SchemaGen.languageCode, {keywords: keywordsQueries.join(', ')});
for (const query of keywordsQueries) {
console.log(`Search query: ${query}`);
@@ -542,18 +539,22 @@ But then you realized you have asked them before. You decided to to think out of
let results: SearchResult[] = []
try {
+ let siteQuery = query
+ if (topHosts.length > 0) {
+ siteQuery = query + ' site:' + chooseK(topHosts, 1)[0];
+ }
switch (SEARCH_PROVIDER) {
case 'jina':
- results = (await search(query, context.tokenTracker)).response?.data || [];
+ results = (await search(siteQuery, context.tokenTracker)).response?.data || [];
break;
case 'duck':
- results = (await duckSearch(query, {safeSearch: SafeSearchType.STRICT})).results;
+ results = (await duckSearch(siteQuery, {safeSearch: SafeSearchType.STRICT})).results;
break;
case 'brave':
- results = (await braveSearch(query)).response.web?.results || [];
+ results = (await braveSearch(siteQuery)).response.web?.results || [];
break;
case 'serper':
- results = (await serperSearch(query)).response.organic || [];
+ results = (await serperSearch(siteQuery)).response.organic || [];
break;
default:
results = [];
@@ -627,10 +628,14 @@ You decided to think out of the box or cut from a completely different angle.
}
} else if (thisStep.action === 'visit' && thisStep.URLTargets?.length) {
// normalize URLs
- thisStep.URLTargets = thisStep.URLTargets.map(url => normalizeUrl(url));
- thisStep.URLTargets = chooseK(thisStep.URLTargets.filter(url => !visitedURLs.includes(url)), MAX_URLS_PER_STEP)
+ thisStep.URLTargets = thisStep.URLTargets
+ .filter(url => url.startsWith('http'))
+ .map(url => normalizeUrl(url))
+ .filter(url => !visitedURLs.includes(url));
+ thisStep.URLTargets = [...new Set([...thisStep.URLTargets, ...weightedURLs.map(r => r.url)])].slice(0, MAX_URLS_PER_STEP);
const uniqueURLs = thisStep.URLTargets;
+ console.log(uniqueURLs)
if (uniqueURLs.length > 0) {
context.actionTracker.trackThink('read_for', SchemaGen.languageCode, {urls: uniqueURLs.join(', ')});
@@ -647,7 +652,7 @@ You decided to think out of the box or cut from a completely different angle.
}
allKnowledge.push({
- question: `What is in ${data.url}?`,
+ question: `What do expert say about "${data.title}"?`,
answer: removeAllLineBreaks(data.content),
references: [data.url],
type: 'url',
@@ -756,7 +761,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
false,
badContext,
allKnowledge,
- getUnvisitedURLs(allURLs, visitedURLs),
+ weightedURLs,
true,
);
diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts
index fd7de95..8586435 100644
--- a/src/tools/evaluator.ts
+++ b/src/tools/evaluator.ts
@@ -113,13 +113,22 @@ function getDefinitivePrompt(question: string, answer: string): PromptPair {
system: `You are an evaluator of answer definitiveness. Analyze if the given answer provides a definitive response or not.
-First, if the answer is not a direct response to the question, it must return false.
-Definitiveness is the king! The following types of responses are NOT definitive and must return false:
- 1. Expressions of uncertainty: "I don't know", "not sure", "might be", "probably"
+First, if the answer is not a direct response to the question, it must return false.
+
+Definitiveness means providing a clear, confident response. The following approaches are considered definitive:
+ 1. Direct, clear statements that address the question
+ 2. Comprehensive answers that cover multiple perspectives or both sides of an issue
+ 3. Answers that acknowledge complexity while still providing substantive information
+ 4. Balanced explanations that present pros and cons or different viewpoints
+
+The following types of responses are NOT definitive and must return false:
+ 1. Expressions of personal uncertainty: "I don't know", "not sure", "might be", "probably"
2. Lack of information statements: "doesn't exist", "lack of information", "could not find"
3. Inability statements: "I cannot provide", "I am unable to", "we cannot"
4. Negative statements that redirect: "However, you can...", "Instead, try..."
- 5. Non-answers that suggest alternatives
+ 5. Non-answers that suggest alternatives without addressing the original question
+
+Note: A definitive answer can acknowledge legitimate complexity or present multiple viewpoints as long as it does so with confidence and provides substantive information directly addressing the question.
@@ -178,6 +187,27 @@ Evaluation: {
"think": "The answer provides concrete mathematical approaches to proving P ≠ NP without uncertainty markers, presenting definitive methods that could be used."
"pass": true,
}
+
+Question: "Is universal healthcare a good policy?"
+Answer: "Universal healthcare has both advantages and disadvantages. Proponents argue it provides coverage for all citizens, reduces administrative costs, and leads to better public health outcomes. Critics contend it may increase wait times, raise taxes, and potentially reduce innovation in medical treatments. Most developed nations have implemented some form of universal healthcare with varying structures and degrees of coverage."
+Evaluation: {
+ "think": "The answer confidently presents both sides of the debate with specific points for each perspective. It provides substantive information directly addressing the question without expressions of personal uncertainty."
+ "pass": true,
+}
+
+Question: "Should companies use AI for hiring decisions?"
+Answer: "There are compelling arguments on both sides of this issue. Companies using AI in hiring can benefit from reduced bias in initial screening, faster processing of large applicant pools, and potentially better matches based on skills assessment. However, these systems can also perpetuate historical biases in training data, may miss nuanced human qualities, and raise privacy concerns. The effectiveness depends on careful implementation, human oversight, and regular auditing of these systems."
+Evaluation: {
+ "think": "The answer provides a balanced, detailed examination of both perspectives on AI in hiring. It acknowledges complexity while delivering substantive information with confidence."
+ "pass": true,
+}
+
+Question: "Is nuclear energy safe?"
+Answer: "I'm not an expert on energy policy, so I can't really say if nuclear energy is safe or not. There have been some accidents but also many successful plants."
+Evaluation: {
+ "think": "The answer contains explicit expressions of personal uncertainty ('I'm not an expert', 'I can't really say') and provides only vague information without substantive content."
+ "pass": false,
+}
`,
user: `
Question: ${question}
@@ -384,29 +414,38 @@ Answer: ${answer}`
function getQuestionEvaluationPrompt(question: string): PromptPair {
return {
- system: `You are an evaluator that determines if a question requires freshness, plurality, and/or completeness checks.
+ system: `You are an evaluator that determines if a question requires definitive, freshness, plurality, and/or completeness checks.
-1. freshness - Checks if the question is time-sensitive or requires very recent information
-2. plurality - Checks if the question asks for multiple items, examples, or a specific count or enumeration
-3. completeness - Checks if the question explicitly mentions multiple named elements that all need to be addressed
+definitive - Checks if the question requires a definitive answer or if uncertainty is acceptable (open-ended, speculative, discussion-based)
+freshness - Checks if the question is time-sensitive or requires very recent information
+plurality - Checks if the question asks for multiple items, examples, or a specific count or enumeration
+completeness - Checks if the question explicitly mentions multiple named elements that all need to be addressed
-1. Freshness Evaluation:
+1. Definitive Evaluation:
+ - Required for ALMOST ALL questions - assume by default that definitive evaluation is needed
+ - Not required ONLY for questions that are genuinely impossible to evaluate definitively
+ - Examples of impossible questions: paradoxes, questions beyond all possible knowledge
+ - Even subjective-seeming questions can be evaluated definitively based on evidence
+ - Future scenarios can be evaluated definitively based on current trends and information
+ - Look for cases where the question is inherently unanswerable by any possible means
+
+2. Freshness Evaluation:
- Required for questions about current state, recent events, or time-sensitive information
- Required for: prices, versions, leadership positions, status updates
- Look for terms: "current", "latest", "recent", "now", "today", "new"
- Consider company positions, product versions, market data time-sensitive
-2. Plurality Evaluation:
+3. Plurality Evaluation:
- ONLY apply when completeness check is NOT triggered
- Required when question asks for multiple examples, items, or specific counts
- Check for: numbers ("5 examples"), list requests ("list the ways"), enumeration requests
- Look for: "examples", "list", "enumerate", "ways to", "methods for", "several"
- Focus on requests for QUANTITY of items or examples
-3. Completeness Evaluation:
+4. Completeness Evaluation:
- Takes precedence over plurality check - if completeness applies, set plurality to false
- Required when question EXPLICITLY mentions multiple named elements that all need to be addressed
- This includes:
@@ -424,9 +463,10 @@ function getQuestionEvaluationPrompt(question: string): PromptPair {
谁发明了微积分?牛顿和莱布尼兹各自的贡献是什么?
-这是关于微积分历史的问题,不需要最新信息。问题特别提到了牛顿和莱布尼兹两个人,要求分析他们各自的贡献,所以我需要全面回答这两部分内容。完整性比较重要,而不是提供多个不同答案。
+这是关于微积分历史的问题,不涉及需要最新信息的内容。问题明确提到了牛顿和莱布尼兹两位数学家,要求分析他们各自的贡献,所以需要全面评估这两个特定的方面。这个问题涉及历史事实,有明确的学术研究可以参考,因此需要确定性评估。