mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
feat: only hostnames
This commit is contained in:
parent
2e8ee47536
commit
7d07078ec5
52
src/agent.ts
52
src/agent.ts
@ -272,7 +272,8 @@ async function executeSearchQueries(
|
||||
keywordsQueries: any[],
|
||||
context: TrackerContext,
|
||||
allURLs: Record<string, SearchSnippet>,
|
||||
SchemaGen: any
|
||||
SchemaGen: Schemas,
|
||||
onlyHostnames?: string[]
|
||||
): Promise<{
|
||||
newKnowledge: KnowledgeItem[],
|
||||
searchedQueries: string[]
|
||||
@ -285,6 +286,9 @@ async function executeSearchQueries(
|
||||
for (const query of keywordsQueries) {
|
||||
let results: SearchResult[] = [];
|
||||
const oldQuery = query.q;
|
||||
if (onlyHostnames && onlyHostnames.length > 0) {
|
||||
query.q = `${query.q} site:${onlyHostnames.join(' OR site:')}`;
|
||||
}
|
||||
|
||||
try {
|
||||
console.log('Search query:', query);
|
||||
@ -342,10 +346,16 @@ async function executeSearchQueries(
|
||||
updated: query.tbs ? formatDateRange(query) : undefined
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
|
||||
if (searchedQueries.length > MAX_QUERIES_PER_STEP) {
|
||||
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`)
|
||||
if (searchedQueries.length === 0) {
|
||||
if (onlyHostnames && onlyHostnames.length > 0) {
|
||||
console.log(`No results found for queries: ${uniqQOnly.join(', ')} on hostnames: ${onlyHostnames.join(', ')}`);
|
||||
context.actionTracker.trackThink('hostnames_no_results', SchemaGen.languageCode, {hostnames: onlyHostnames.join(', ')});
|
||||
}
|
||||
} else {
|
||||
console.log(`Utility/Queries: ${utilityScore}/${searchedQueries.length}`);
|
||||
if (searchedQueries.length > MAX_QUERIES_PER_STEP) {
|
||||
console.log(`So many queries??? ${searchedQueries.map(q => `"${q}"`).join(', ')}`)
|
||||
}
|
||||
}
|
||||
return {
|
||||
newKnowledge,
|
||||
@ -366,6 +376,7 @@ export async function getResponse(question?: string,
|
||||
noDirectAnswer: boolean = false,
|
||||
boostHostnames: string[] = [],
|
||||
badHostnames: string[] = [],
|
||||
onlyHostnames: string[] = []
|
||||
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
|
||||
|
||||
let step = 0;
|
||||
@ -457,12 +468,11 @@ export async function getResponse(question?: string,
|
||||
allowReflect = false;
|
||||
}
|
||||
|
||||
// update all urls with buildURLMap
|
||||
// allowRead = allowRead && (Object.keys(allURLs).length > 0);
|
||||
|
||||
if (allURLs && Object.keys(allURLs).length > 0) {
|
||||
// rerank urls
|
||||
weightedURLs = rankURLs(
|
||||
filterURLs(allURLs, visitedURLs, badHostnames),
|
||||
filterURLs(allURLs, visitedURLs, badHostnames, onlyHostnames),
|
||||
{
|
||||
question: currentQuestion,
|
||||
boostHostnames
|
||||
@ -471,6 +481,7 @@ export async function getResponse(question?: string,
|
||||
weightedURLs = keepKPerHostname(weightedURLs, 2);
|
||||
console.log('Weighted URLs:', weightedURLs.length);
|
||||
}
|
||||
allowRead = allowRead && (weightedURLs.length > 0);
|
||||
|
||||
allowSearch = allowSearch && (weightedURLs.length < 200); // disable search when too many urls already
|
||||
|
||||
@ -743,25 +754,28 @@ But then you realized you have asked them before. You decided to to think out of
|
||||
keywordsQueries,
|
||||
context,
|
||||
allURLs,
|
||||
SchemaGen
|
||||
SchemaGen,
|
||||
onlyHostnames
|
||||
);
|
||||
|
||||
allKeywords.push(...searchedQueries);
|
||||
allKnowledge.push(...newKnowledge);
|
||||
if (searchedQueries.length > 0) {
|
||||
anyResult = true;
|
||||
allKeywords.push(...searchedQueries);
|
||||
allKnowledge.push(...newKnowledge);
|
||||
|
||||
diaryContext.push(`
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}".
|
||||
In particular, you tried to search for the following keywords: "${keywordsQueries.map(q => q.q).join(', ')}".
|
||||
You found quite some information and add them to your URL list and **visit** them later when needed.
|
||||
`);
|
||||
|
||||
updateContext({
|
||||
totalStep,
|
||||
question: currentQuestion,
|
||||
...thisStep,
|
||||
result: result
|
||||
});
|
||||
anyResult = true;
|
||||
updateContext({
|
||||
totalStep,
|
||||
question: currentQuestion,
|
||||
...thisStep,
|
||||
result: result
|
||||
});
|
||||
}
|
||||
}
|
||||
if (!anyResult || !keywordsQueries?.length) {
|
||||
diaryContext.push(`
|
||||
|
||||
@ -557,6 +557,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
body.no_direct_answer,
|
||||
body.boost_hostnames,
|
||||
body.bad_hostnames,
|
||||
body.only_hostnames,
|
||||
)
|
||||
let finalAnswer = (finalStep as AnswerAction).mdAnswer;
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ queries: [
|
||||
"q": "二手宝马价格趋势"
|
||||
},
|
||||
{
|
||||
"q": "二手宝马vs奔驰vs丰田 性价比"
|
||||
"q": "二手宝马vs奔驰vs奥迪 性价比"
|
||||
},
|
||||
{
|
||||
"tbs": "qdr:m",
|
||||
|
||||
@ -224,6 +224,7 @@ export interface ChatCompletionRequest {
|
||||
|
||||
boost_hostnames?: string[];
|
||||
bad_hostnames?: string[];
|
||||
only_hostnames?: string[];
|
||||
}
|
||||
|
||||
export interface URLAnnotation {
|
||||
|
||||
@ -6,7 +6,8 @@
|
||||
"read_for_verify": "Let me fetch the source content to verify the answer.",
|
||||
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
|
||||
"final_answer": "Let me finalize the answer.",
|
||||
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked."
|
||||
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked.",
|
||||
"hostnames_no_results": "Can't find any results from ${hostnames}."
|
||||
},
|
||||
"zh-CN": {
|
||||
"eval_first": "等等,让我先自己评估一下答案。",
|
||||
@ -15,7 +16,8 @@
|
||||
"read_for_verify": "让我读取源网页内容来验证答案。",
|
||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||
"final_answer": "我来整理一下答案。",
|
||||
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。"
|
||||
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。",
|
||||
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
|
||||
},
|
||||
"zh-TW": {
|
||||
"eval_first": "等等,讓我先評估一下答案。",
|
||||
@ -24,7 +26,8 @@
|
||||
"read_for_verify": "讓我獲取源內容來驗證答案。",
|
||||
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
|
||||
"final_answer": "我來整理一下答案。",
|
||||
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!"
|
||||
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有夠麻煩!",
|
||||
"hostnames_no_results": "咦... ${hostnames} 找不到什麼結果。"
|
||||
},
|
||||
"ja": {
|
||||
"eval_first": "ちょっと待って、まず答えを評価します。",
|
||||
@ -33,7 +36,8 @@
|
||||
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
|
||||
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
|
||||
"final_answer": "答えをまとめます。",
|
||||
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。"
|
||||
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。",
|
||||
"hostnames_no_results": "${hostnames} から結果が見つかりません。"
|
||||
},
|
||||
"ko": {
|
||||
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
|
||||
@ -42,7 +46,8 @@
|
||||
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
|
||||
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
|
||||
"final_answer": "답변을 마무리하겠습니다.",
|
||||
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!"
|
||||
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!",
|
||||
"hostnames_no_results": "${hostnames} 에서 결과를 찾을 수 없습니다."
|
||||
},
|
||||
"fr": {
|
||||
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
|
||||
@ -51,7 +56,8 @@
|
||||
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
|
||||
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
|
||||
"final_answer": "Je vais finaliser la réponse.",
|
||||
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !"
|
||||
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !",
|
||||
"hostnames_no_results": "Aucun résultat trouvé sur ${hostnames}."
|
||||
},
|
||||
"de": {
|
||||
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
|
||||
@ -60,7 +66,8 @@
|
||||
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
|
||||
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
|
||||
"final_answer": "Ich werde die Antwort abschließen.",
|
||||
"blocked_content": "Mist! ${url} lässt mich nicht rein."
|
||||
"blocked_content": "Mist! ${url} lässt mich nicht rein.",
|
||||
"hostnames_no_results": "Keine Ergebnisse von ${hostnames} gefunden."
|
||||
},
|
||||
"es": {
|
||||
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
|
||||
@ -69,7 +76,8 @@
|
||||
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
|
||||
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
|
||||
"final_answer": "Voy a finalizar la respuesta.",
|
||||
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!"
|
||||
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!",
|
||||
"hostnames_no_results": "No se encontraron resultados de ${hostnames}."
|
||||
},
|
||||
"it": {
|
||||
"eval_first": "Un attimo, valuterò prima la risposta.",
|
||||
@ -77,7 +85,9 @@
|
||||
"read_for": "Leggerò ${urls} per raccogliere ulteriori informazioni.",
|
||||
"read_for_verify": "Recupererò il contenuto sorgente per verificare la risposta.",
|
||||
"late_chunk": "Il contenuto di ${url} è troppo lungo, selezionerò le parti rilevanti.",
|
||||
"final_answer": "Finalizzerò la risposta."
|
||||
"final_answer": "Finalizzerò la risposta.",
|
||||
"blocked_content": "Mannaggia! Sono bloccato da ${url}, non è bello!",
|
||||
"hostnames_no_results": "Nessun risultato trovato da ${hostnames}."
|
||||
},
|
||||
"pt": {
|
||||
"eval_first": "Um momento, vou avaliar a resposta primeiro.",
|
||||
@ -86,7 +96,8 @@
|
||||
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
|
||||
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
|
||||
"final_answer": "Vou finalizar a resposta.",
|
||||
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!"
|
||||
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!",
|
||||
"hostnames_no_results": "Nenhum resultado encontrado em ${hostnames}."
|
||||
},
|
||||
"ru": {
|
||||
"eval_first": "Подождите, я сначала оценю ответ.",
|
||||
@ -95,7 +106,8 @@
|
||||
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
|
||||
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
|
||||
"final_answer": "Дайте мне завершить ответ.",
|
||||
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!"
|
||||
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!",
|
||||
"hostnames_no_results": "Ничего не найдено на ${hostnames}."
|
||||
},
|
||||
"ar": {
|
||||
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
|
||||
@ -103,7 +115,8 @@
|
||||
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
||||
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
|
||||
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
|
||||
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!"
|
||||
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!",
|
||||
"hostnames_no_results": "لا يمكن العثور على أي نتائج من ${hostnames}."
|
||||
},
|
||||
"nl": {
|
||||
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
|
||||
@ -112,7 +125,8 @@
|
||||
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
|
||||
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
|
||||
"final_answer": "Ik zal het antwoord afronden.",
|
||||
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}."
|
||||
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}.",
|
||||
"hostnames_no_results": "Geen resultaten gevonden van ${hostnames}."
|
||||
},
|
||||
"zh": {
|
||||
"eval_first": "等等,让我先评估一下答案。",
|
||||
@ -121,6 +135,7 @@
|
||||
"read_for_verify": "让我获取源内容来验证答案。",
|
||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||
"final_answer": "我来整理一下答案。",
|
||||
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。"
|
||||
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。",
|
||||
"hostnames_no_results": "额… ${hostnames} 找不到什么结果啊。"
|
||||
}
|
||||
}
|
||||
@ -160,9 +160,9 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Selects the search snippets that are still eligible to be visited.
 *
 * A URL is kept only when all of the following hold:
 *  - it is not listed in `visitedURLs`;
 *  - its hostname (as reported by `extractUrlParts`) is not in `badHostnames`;
 *  - `onlyHostnames` is empty, or its hostname is listed in `onlyHostnames`.
 *
 * @param allURLs - Snippets keyed by their URL.
 * @param visitedURLs - URLs to exclude.
 * @param badHostnames - Hostnames to exclude.
 * @param onlyHostnames - Optional allow-list of hostnames. Defaults to an
 *   empty list, which disables the allow-list so that existing three-argument
 *   callers keep their previous behavior.
 * @returns The snippets that passed every check, in the entry order of `allURLs`.
 */
export function filterURLs(
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
  badHostnames: string[],
  onlyHostnames: string[] = []
): SearchSnippet[] {
  return Object.entries(allURLs)
    .filter(([url]) => {
      // Check the visited list first so already-visited URLs are never parsed.
      if (visitedURLs.includes(url)) {
        return false;
      }

      // Extract the hostname once; both the deny-list and allow-list need it.
      const hostname = extractUrlParts(url).hostname;
      if (badHostnames.includes(hostname)) {
        return false;
      }

      return onlyHostnames.length === 0 || onlyHostnames.includes(hostname);
    })
    .map(([, snippet]) => snippet);
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user