feat: add only_hostnames option to restrict search queries and URL filtering to the given hostnames

This commit is contained in:
Han Xiao 2025-03-24 10:41:35 +08:00
parent 2e8ee47536
commit 7d07078ec5
6 changed files with 67 additions and 36 deletions

View File

@ -272,7 +272,8 @@ async function executeSearchQueries(
keywordsQueries: any[], keywordsQueries: any[],
context: TrackerContext, context: TrackerContext,
allURLs: Record<string, SearchSnippet>, allURLs: Record<string, SearchSnippet>,
SchemaGen: any SchemaGen: Schemas,
onlyHostnames?: string[]
): Promise<{ ): Promise<{
newKnowledge: KnowledgeItem[], newKnowledge: KnowledgeItem[],
searchedQueries: string[] searchedQueries: string[]
@ -285,6 +286,9 @@ async function executeSearchQueries(
for (const query of keywordsQueries) { for (const query of keywordsQueries) {
let results: SearchResult[] = []; let results: SearchResult[] = [];
const oldQuery = query.q; const oldQuery = query.q;
if (onlyHostnames && onlyHostnames.length > 0) {
query.q = `${query.q} site:${onlyHostnames.join(' OR site:')}`;
}
try { try {
console.log('Search query:', query); console.log('Search query:', query);
@ -342,10 +346,16 @@ async function executeSearchQueries(
updated: query.tbs ? formatDateRange(query) : undefined updated: query.tbs ? formatDateRange(query) : undefined
}); });
} }
if (searchedQueries.length === 0) {
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`); if (onlyHostnames && onlyHostnames.length > 0) {
if (searchedQueries.length > MAX_QUERIES_PER_STEP) { console.log(`No results found for queries: ${uniqQOnly.join(', ')} on hostnames: ${onlyHostnames.join(', ')}`);
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`) context.actionTracker.trackThink('hostnames_no_results', SchemaGen.languageCode, {hostnames: onlyHostnames.join(', ')});
}
} else {
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
if (searchedQueries.length > MAX_QUERIES_PER_STEP) {
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`)
}
} }
return { return {
newKnowledge, newKnowledge,
@ -366,6 +376,7 @@ export async function getResponse(question?: string,
noDirectAnswer: boolean = false, noDirectAnswer: boolean = false,
boostHostnames: string[] = [], boostHostnames: string[] = [],
badHostnames: string[] = [], badHostnames: string[] = [],
onlyHostnames: string[] = []
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> { ): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
let step = 0; let step = 0;
@ -457,12 +468,11 @@ export async function getResponse(question?: string,
allowReflect = false; allowReflect = false;
} }
// update all urls with buildURLMap
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
if (allURLs && Object.keys(allURLs).length > 0) { if (allURLs && Object.keys(allURLs).length > 0) {
// rerank urls // rerank urls
weightedURLs = rankURLs( weightedURLs = rankURLs(
filterURLs(allURLs, visitedURLs, badHostnames), filterURLs(allURLs, visitedURLs, badHostnames, onlyHostnames),
{ {
question: currentQuestion, question: currentQuestion,
boostHostnames boostHostnames
@ -471,6 +481,7 @@ export async function getResponse(question?: string,
weightedURLs = keepKPerHostname(weightedURLs, 2); weightedURLs = keepKPerHostname(weightedURLs, 2);
console.log('Weighted URLs:', weightedURLs.length); console.log('Weighted URLs:', weightedURLs.length);
} }
allowRead = allowRead && (weightedURLs.length > 0);
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
@ -743,25 +754,28 @@ But then you realized you have asked them before. You decided to to think out of
keywordsQueries, keywordsQueries,
context, context,
allURLs, allURLs,
SchemaGen SchemaGen,
onlyHostnames
); );
allKeywords.push(...searchedQueries); if (searchedQueries.length > 0) {
allKnowledge.push(...newKnowledge); anyResult = true;
allKeywords.push(...searchedQueries);
allKnowledge.push(...newKnowledge);
diaryContext.push(` diaryContext.push(`
At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}". At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}".
In particular, you tried to search for the following keywords: "${keywordsQueries.map(q => q.q).join(', ')}". In particular, you tried to search for the following keywords: "${keywordsQueries.map(q => q.q).join(', ')}".
You found quite some information and add them to your URL list and **visit** them later when needed. You found quite some information and add them to your URL list and **visit** them later when needed.
`); `);
updateContext({ updateContext({
totalStep, totalStep,
question: currentQuestion, question: currentQuestion,
...thisStep, ...thisStep,
result: result result: result
}); });
anyResult = true; }
} }
if (!anyResult || !keywordsQueries?.length) { if (!anyResult || !keywordsQueries?.length) {
diaryContext.push(` diaryContext.push(`

View File

@ -557,6 +557,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
body.no_direct_answer, body.no_direct_answer,
body.boost_hostnames, body.boost_hostnames,
body.bad_hostnames, body.bad_hostnames,
body.only_hostnames,
) )
let finalAnswer = (finalStep as AnswerAction).mdAnswer; let finalAnswer = (finalStep as AnswerAction).mdAnswer;

View File

@ -91,7 +91,7 @@ queries: [
"q": "二手宝马价格趋势" "q": "二手宝马价格趋势"
}, },
{ {
"q": "二手宝马vs奔驰vs丰田 性价比" "q": "二手宝马vs奔驰vs奥迪 性价比"
}, },
{ {
"tbs": "qdr:m", "tbs": "qdr:m",

View File

@ -224,6 +224,7 @@ export interface ChatCompletionRequest {
boost_hostnames?: string[]; boost_hostnames?: string[];
bad_hostnames?: string[]; bad_hostnames?: string[];
only_hostnames?: string[];
} }
export interface URLAnnotation { export interface URLAnnotation {

View File

@ -6,7 +6,8 @@
"read_for_verify": "Let me fetch the source content to verify the answer.", "read_for_verify": "Let me fetch the source content to verify the answer.",
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.", "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
"final_answer": "Let me finalize the answer.", "final_answer": "Let me finalize the answer.",
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked." "blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
"hostnames_no_results": "Can't find any results from ${hostnames}."
}, },
"zh-CN": { "zh-CN": {
"eval_first": "等等,让我先自己评估一下答案。", "eval_first": "等等,让我先自己评估一下答案。",
@ -15,7 +16,8 @@
"read_for_verify": "让我读取源网页内容来验证答案。", "read_for_verify": "让我读取源网页内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。", "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。", "final_answer": "我来整理一下答案。",
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。" "blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
}, },
"zh-TW": { "zh-TW": {
"eval_first": "等等,讓我先評估一下答案。", "eval_first": "等等,讓我先評估一下答案。",
@ -24,7 +26,8 @@
"read_for_verify": "讓我獲取源內容來驗證答案。", "read_for_verify": "讓我獲取源內容來驗證答案。",
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。", "late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
"final_answer": "我來整理一下答案。", "final_answer": "我來整理一下答案。",
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!" "blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有夠麻煩!",
"hostnames_no_results": "咦... ${hostnames} 找不到什麼結果。"
}, },
"ja": { "ja": {
"eval_first": "ちょっと待って、まず答えを評価します。", "eval_first": "ちょっと待って、まず答えを評価します。",
@ -33,7 +36,8 @@
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。", "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。", "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
"final_answer": "答えをまとめます。", "final_answer": "答えをまとめます。",
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。" "blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
"hostnames_no_results": "${hostnames} から結果が見つかりません。"
}, },
"ko": { "ko": {
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.", "eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
@ -42,7 +46,8 @@
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.", "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.", "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
"final_answer": "답변을 마무리하겠습니다.", "final_answer": "답변을 마무리하겠습니다.",
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!" "blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다."
}, },
"fr": { "fr": {
"eval_first": "Un instant, je vais d'abord évaluer la réponse.", "eval_first": "Un instant, je vais d'abord évaluer la réponse.",
@ -51,7 +56,8 @@
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.", "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.", "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
"final_answer": "Je vais finaliser la réponse.", "final_answer": "Je vais finaliser la réponse.",
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !" "blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}."
}, },
"de": { "de": {
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.", "eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
@ -60,7 +66,8 @@
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.", "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.", "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
"final_answer": "Ich werde die Antwort abschließen.", "final_answer": "Ich werde die Antwort abschließen.",
"blocked_content": "Mist! ${url} lässt mich nicht rein." "blocked_content": "Mist! ${url} lässt mich nicht rein.",
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden."
}, },
"es": { "es": {
"eval_first": "Un momento, voy a evaluar la respuesta primero.", "eval_first": "Un momento, voy a evaluar la respuesta primero.",
@ -69,7 +76,8 @@
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.", "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.", "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
"final_answer": "Voy a finalizar la respuesta.", "final_answer": "Voy a finalizar la respuesta.",
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!" "blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
}, },
"it": { "it": {
"eval_first": "Un attimo, valuterò prima la risposta.", "eval_first": "Un attimo, valuterò prima la risposta.",
@ -77,7 +85,9 @@
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.", "read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.", "read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.", "late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
"final_answer": "Finalizzerò la risposta." "final_answer": "Finalizzerò la risposta.",
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}."
}, },
"pt": { "pt": {
"eval_first": "Um momento, vou avaliar a resposta primeiro.", "eval_first": "Um momento, vou avaliar a resposta primeiro.",
@ -86,7 +96,8 @@
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.", "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.", "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
"final_answer": "Vou finalizar a resposta.", "final_answer": "Vou finalizar a resposta.",
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!" "blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}."
}, },
"ru": { "ru": {
"eval_first": "Подождите, я сначала оценю ответ.", "eval_first": "Подождите, я сначала оценю ответ.",
@ -95,7 +106,8 @@
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.", "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.", "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
"final_answer": "Дайте мне завершить ответ.", "final_answer": "Дайте мне завершить ответ.",
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!" "blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
"hostnames_no_results": "Ничего не найдено на ${hostnames}."
}, },
"ar": { "ar": {
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.", "eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
@ -103,7 +115,8 @@
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.", "read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.", "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.", "late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!" "blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}."
}, },
"nl": { "nl": {
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.", "eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
@ -112,7 +125,8 @@
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.", "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.", "late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
"final_answer": "Ik zal het antwoord afronden.", "final_answer": "Ik zal het antwoord afronden.",
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}." "blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}."
}, },
"zh": { "zh": {
"eval_first": "等等,让我先评估一下答案。", "eval_first": "等等,让我先评估一下答案。",
@ -121,6 +135,7 @@
"read_for_verify": "让我获取源内容来验证答案。", "read_for_verify": "让我获取源内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。", "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。", "final_answer": "我来整理一下答案。",
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。" "blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
} }
} }

View File

@ -160,9 +160,9 @@ export function normalizeUrl(urlString: string, debug = false, options = {
} }
} }
export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[], badHostnames: string[]): SearchSnippet[] { export function filterURLs(allURLs: Record<string, SearchSnippet>, visitedURLs: string[], badHostnames: string[], onlyHostnames: string[]): SearchSnippet[] {
return Object.entries(allURLs) return Object.entries(allURLs)
.filter(([url,]) => !visitedURLs.includes(url) && !badHostnames.includes(extractUrlParts(url).hostname)) .filter(([url,]) => !visitedURLs.includes(url) && !badHostnames.includes(extractUrlParts(url).hostname) && (onlyHostnames.length === 0 || onlyHostnames.includes(extractUrlParts(url).hostname)))
.map(([, result]) => result); .map(([, result]) => result);
} }