feat: only hostnames

This commit is contained in:
Han Xiao 2025-03-24 10:41:35 +08:00
parent 2e8ee47536
commit 7d07078ec5
6 changed files with 67 additions and 36 deletions

View File

@ -272,7 +272,8 @@ async function executeSearchQueries(
keywordsQueries: any[],
context: TrackerContext,
allURLs: Record<string, SearchSnippet>,
SchemaGen: any
SchemaGen: Schemas,
onlyHostnames?: string[]
): Promise<{
newKnowledge: KnowledgeItem[],
searchedQueries: string[]
@ -285,6 +286,9 @@ async function executeSearchQueries(
for (const query of keywordsQueries) {
let results: SearchResult[] = [];
const oldQuery = query.q;
if (onlyHostnames && onlyHostnames.length > 0) {
query.q = `${query.q} site:${onlyHostnames.join(' OR site:')}`;
}
try {
console.log('Search query:', query);
@ -342,10 +346,16 @@ async function executeSearchQueries(
updated: query.tbs ? formatDateRange(query) : undefined
});
}
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
if (searchedQueries.length > MAX_QUERIES_PER_STEP) {
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`)
if (searchedQueries.length === 0) {
if (onlyHostnames && onlyHostnames.length > 0) {
console.log(`No results found for queries: ${uniqQOnly.join(', ')} on hostnames: ${onlyHostnames.join(', ')}`);
context.actionTracker.trackThink('hostnames_no_results', SchemaGen.languageCode, {hostnames: onlyHostnames.join(', ')});
}
} else {
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
if (searchedQueries.length > MAX_QUERIES_PER_STEP) {
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`)
}
}
return {
newKnowledge,
@ -366,6 +376,7 @@ export async function getResponse(question?: string,
noDirectAnswer: boolean = false,
boostHostnames: string[] = [],
badHostnames: string[] = [],
onlyHostnames: string[] = []
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
let step = 0;
@ -457,12 +468,11 @@ export async function getResponse(question?: string,
allowReflect = false;
}
// update all urls with buildURLMap
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
if (allURLs && Object.keys(allURLs).length > 0) {
// rerank urls
weightedURLs = rankURLs(
filterURLs(allURLs, visitedURLs, badHostnames),
filterURLs(allURLs, visitedURLs, badHostnames, onlyHostnames),
{
question: currentQuestion,
boostHostnames
@ -471,6 +481,7 @@ export async function getResponse(question?: string,
weightedURLs = keepKPerHostname(weightedURLs, 2);
console.log('Weighted URLs:', weightedURLs.length);
}
allowRead = allowRead && (weightedURLs.length > 0);
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
@ -743,25 +754,28 @@ But then you realized you have asked them before. You decided to to think out of
keywordsQueries,
context,
allURLs,
SchemaGen
SchemaGen,
onlyHostnames
);
allKeywords.push(...searchedQueries);
allKnowledge.push(...newKnowledge);
if (searchedQueries.length > 0) {
anyResult = true;
allKeywords.push(...searchedQueries);
allKnowledge.push(...newKnowledge);
diaryContext.push(`
diaryContext.push(`
At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}".
In particular, you tried to search for the following keywords: "${keywordsQueries.map(q => q.q).join(', ')}".
You found quite some information and add them to your URL list and **visit** them later when needed.
`);
updateContext({
totalStep,
question: currentQuestion,
...thisStep,
result: result
});
anyResult = true;
updateContext({
totalStep,
question: currentQuestion,
...thisStep,
result: result
});
}
}
if (!anyResult || !keywordsQueries?.length) {
diaryContext.push(`

View File

@ -557,6 +557,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
body.no_direct_answer,
body.boost_hostnames,
body.bad_hostnames,
body.only_hostnames,
)
let finalAnswer = (finalStep as AnswerAction).mdAnswer;

View File

@ -91,7 +91,7 @@ queries: [
"q": "二手宝马价格趋势"
},
{
"q": "二手宝马vs奔驰vs丰田 性价比"
"q": "二手宝马vs奔驰vs奥迪 性价比"
},
{
"tbs": "qdr:m",

View File

@ -224,6 +224,7 @@ export interface ChatCompletionRequest {
boost_hostnames?: string[];
bad_hostnames?: string[];
only_hostnames?: string[];
}
export interface URLAnnotation {

View File

@ -6,7 +6,8 @@
"read_for_verify": "Let me fetch the source content to verify the answer.",
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
"final_answer": "Let me finalize the answer.",
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked."
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
"hostnames_no_results": "Can't find any results from ${hostnames}."
},
"zh-CN": {
"eval_first": "等等,让我先自己评估一下答案。",
@ -15,7 +16,8 @@
"read_for_verify": "让我读取源网页内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。",
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。"
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
},
"zh-TW": {
"eval_first": "等等,讓我先評估一下答案。",
@ -24,7 +26,8 @@
"read_for_verify": "讓我獲取源內容來驗證答案。",
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
"final_answer": "我來整理一下答案。",
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!"
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有夠麻煩!",
"hostnames_no_results": "咦... ${hostnames} 找不到什麼結果。"
},
"ja": {
"eval_first": "ちょっと待って、まず答えを評価します。",
@ -33,7 +36,8 @@
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
"final_answer": "答えをまとめます。",
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。"
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
"hostnames_no_results": "${hostnames} から結果が見つかりません。"
},
"ko": {
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
@ -42,7 +46,8 @@
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
"final_answer": "답변을 마무리하겠습니다.",
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!"
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다."
},
"fr": {
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
@ -51,7 +56,8 @@
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
"final_answer": "Je vais finaliser la réponse.",
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !"
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}."
},
"de": {
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
@ -60,7 +66,8 @@
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
"final_answer": "Ich werde die Antwort abschließen.",
"blocked_content": "Mist! ${url} lässt mich nicht rein."
"blocked_content": "Mist! ${url} lässt mich nicht rein.",
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden."
},
"es": {
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
@ -69,7 +76,8 @@
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
"final_answer": "Voy a finalizar la respuesta.",
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!"
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
},
"it": {
"eval_first": "Un attimo, valuterò prima la risposta.",
@ -77,7 +85,9 @@
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
"final_answer": "Finalizzerò la risposta."
"final_answer": "Finalizzerò la risposta.",
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}."
},
"pt": {
"eval_first": "Um momento, vou avaliar a resposta primeiro.",
@ -86,7 +96,8 @@
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
"final_answer": "Vou finalizar a resposta.",
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!"
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}."
},
"ru": {
"eval_first": "Подождите, я сначала оценю ответ.",
@ -95,7 +106,8 @@
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
"final_answer": "Дайте мне завершить ответ.",
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!"
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
"hostnames_no_results": "Ничего не найдено на ${hostnames}."
},
"ar": {
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
@ -103,7 +115,8 @@
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!"
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}."
},
"nl": {
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
@ -112,7 +125,8 @@
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
"final_answer": "Ik zal het antwoord afronden.",
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}."
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}."
},
"zh": {
"eval_first": "等等,让我先评估一下答案。",
@ -121,6 +135,7 @@
"read_for_verify": "让我获取源内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。",
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。"
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
}
}

View File

@ -160,9 +160,9 @@ export function normalizeUrl(urlString: string, debug = false, options = {
}
}
/**
 * Selects the search snippets that are still eligible to be visited.
 *
 * A URL is kept when all of the following hold:
 *  - it is not listed in `visitedURLs`;
 *  - its hostname is not listed in `badHostnames`;
 *  - `onlyHostnames` is empty, or its hostname is listed in `onlyHostnames`.
 *
 * Hostnames are compared by exact equality against `extractUrlParts(url).hostname`.
 * NOTE(review): the search side restricts results with `site:` operators; whether
 * sub-domains returned by that operator survive this exact match depends on what
 * `extractUrlParts` returns for them — confirm.
 *
 * @param allURLs - Known snippets keyed by URL.
 * @param visitedURLs - URLs to exclude; compared against the keys of `allURLs`.
 * @param badHostnames - Hostnames whose URLs are always excluded.
 * @param onlyHostnames - Allow-list of hostnames. Defaults to an empty list, which
 *   disables the restriction so existing three-argument callers keep working.
 * @returns The snippets of the URLs that passed every check, in `Object.entries` order.
 */
export function filterURLs(
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
  badHostnames: string[],
  onlyHostnames: string[] = []
): SearchSnippet[] {
  const restrictToHostnames = onlyHostnames.length > 0;

  return Object.entries(allURLs)
    .filter(([url]) => {
      if (visitedURLs.includes(url)) {
        return false;
      }
      // Resolve the hostname once; it is needed for both the deny- and allow-list checks.
      const hostname = extractUrlParts(url).hostname;
      if (badHostnames.includes(hostname)) {
        return false;
      }
      return !restrictToHostnames || onlyHostnames.includes(hostname);
    })
    .map(([, result]) => result);
}