mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
feat: filter out blocked content
This commit is contained in:
parent
b53bfdbaa5
commit
96d856c848
89
src/tools/jina-classify-spam.ts
Normal file
89
src/tools/jina-classify-spam.ts
Normal file
@ -0,0 +1,89 @@
|
||||
import axios from 'axios';
|
||||
import { TokenTracker } from "../utils/token-tracker";
|
||||
import { JINA_API_KEY } from "../config";
|
||||
|
||||
const JINA_API_URL = 'https://api.jina.ai/v1/classify';
|
||||
|
||||
// Types for Jina Classification API
|
||||
/**
 * Body of a POST request to the Jina classification endpoint.
 */
interface JinaClassifyRequest {
  /** Identifier of the classifier that should process the inputs. */
  classifier_id: string;
  /** Texts to classify; one entry per text. */
  input: Array<string>;
}
|
||||
|
||||
/**
 * Shape of the JSON body returned by the Jina classification endpoint,
 * as consumed by this module.
 */
interface JinaClassifyResponse {
  /** Token accounting reported by the API for this request. */
  usage: { total_tokens: number };
  /** Classification results (presumably one per input text — confirm against the API docs). */
  data: {
    object: string;
    /** Presumably the position of the corresponding input text — TODO confirm. */
    index: number;
    /** Winning label, delivered as a string (e.g. "true" / "false"). */
    prediction: string;
    /** Presumably the score of `prediction` — TODO confirm. */
    score: number;
    /** Per-label scores. */
    predictions: { label: string; score: number }[];
  }[];
}
|
||||
|
||||
|
||||
/**
 * Classifies a single text with the Jina classification API and reports
 * whether the top prediction is the string "true".
 *
 * The HTTP call is bounded twice: by the axios `timeout` option and by a
 * manual timer raced against the request. The manual timer is always
 * cancelled once the race settles so it cannot linger after the call.
 *
 * This function never throws: a missing API key, a network/API failure, a
 * timeout, or an empty result set are all logged (where applicable) and
 * reported as `false`.
 * NOTE(review): a failure is therefore indistinguishable from a "false"
 * prediction — callers should confirm that `false` is a safe default for them.
 *
 * @param text - The text to classify.
 * @param classifierId - Classifier to use; defaults to the spam classifier ID.
 * @param timeoutMs - Upper bound for the request, in milliseconds.
 * @param tracker - Optional token tracker; a throwaway one is used when omitted.
 * @returns `true` only when the first result's `prediction` equals "true".
 */
export async function classifyText(
  text: string,
  classifierId: string = "4a27dea0-381e-407c-bc67-250de45763dd", // Default spam classifier ID
  timeoutMs: number = 5000, // Default timeout of 5 seconds
  tracker?: TokenTracker
): Promise<boolean> {
  // Handle of the manual timeout timer, kept so it can be cancelled in `finally`.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

  try {
    if (!JINA_API_KEY) {
      throw new Error('JINA_API_KEY is not set');
    }

    const request: JinaClassifyRequest = {
      classifier_id: classifierId,
      input: [text]
    };

    // Create a timeout promise that rejects once the deadline passes
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(
        () => reject(new Error(`Classification request timed out after ${timeoutMs}ms`)),
        timeoutMs
      );
    });

    // Make the API request with axios
    const apiRequestPromise = axios.post<JinaClassifyResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        },
        timeout: timeoutMs // Also set axios timeout
      }
    );

    // Race the API request against the timeout; since the timeout branch is
    // Promise<never>, the result keeps the typed axios response.
    const response = await Promise.race([apiRequestPromise, timeoutPromise]);

    // Track token usage from the API
    (tracker || new TokenTracker()).trackUsage('classify', {
      promptTokens: response.data.usage.total_tokens,
      completionTokens: 0,
      totalTokens: response.data.usage.total_tokens
    });

    // Extract the prediction field and convert string "true"/"false" to boolean
    const results = response.data.data;
    if (results && results.length > 0) {
      return results[0].prediction === "true";
    }

    return false; // Default to false if no prediction is available
  } catch (error) {
    if (error instanceof Error && error.message.includes('timed out')) {
      console.error('Classification request timed out:', error.message);
    } else {
      console.error('Error in classifying text:', error);
    }
    return false; // Default to false in case of error or timeout
  } finally {
    // Cancel the manual timer so it does not stay pending (and keep the event
    // loop alive) after the request has already settled.
    if (timeoutHandle !== undefined) {
      clearTimeout(timeoutHandle);
    }
  }
}
|
||||
@ -5,7 +5,8 @@
|
||||
"read_for": "Let me read ${urls} to gather more information.",
|
||||
"read_for_verify": "Let me fetch the source content to verify the answer.",
|
||||
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
|
||||
"final_answer": "Let me finalize the answer."
|
||||
"final_answer": "Let me finalize the answer.",
|
||||
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked."
|
||||
},
|
||||
"zh-CN": {
|
||||
"eval_first": "等等,让我先自己评估一下答案。",
|
||||
@ -13,7 +14,8 @@
|
||||
"read_for": "让我读取网页 ${urls} 来获取更多信息。",
|
||||
"read_for_verify": "让我读取源网页内容来验证答案。",
|
||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||
"final_answer": "我来整理一下答案。"
|
||||
"final_answer": "我来整理一下答案。",
|
||||
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。"
|
||||
},
|
||||
"zh-TW": {
|
||||
"eval_first": "等等,讓我先評估一下答案。",
|
||||
@ -21,7 +23,8 @@
|
||||
"read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
|
||||
"read_for_verify": "讓我獲取源內容來驗證答案。",
|
||||
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
|
||||
"final_answer": "我來整理一下答案。"
|
||||
"final_answer": "我來整理一下答案。",
|
||||
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有夠麻煩!"
|
||||
},
|
||||
"ja": {
|
||||
"eval_first": "ちょっと待って、まず答えを評価します。",
|
||||
@ -29,7 +32,8 @@
|
||||
"read_for": "${urls} を読んで、情報を集めます。",
|
||||
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
|
||||
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
|
||||
"final_answer": "答えをまとめます。"
|
||||
"final_answer": "答えをまとめます。",
|
||||
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。"
|
||||
},
|
||||
"ko": {
|
||||
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
|
||||
@ -37,7 +41,8 @@
|
||||
"read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
|
||||
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
|
||||
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
|
||||
"final_answer": "답변을 마무리하겠습니다."
|
||||
"final_answer": "답변을 마무리하겠습니다.",
|
||||
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!"
|
||||
},
|
||||
"fr": {
|
||||
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
|
||||
@ -45,7 +50,8 @@
|
||||
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
|
||||
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
|
||||
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
|
||||
"final_answer": "Je vais finaliser la réponse."
|
||||
"final_answer": "Je vais finaliser la réponse.",
|
||||
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !"
|
||||
},
|
||||
"de": {
|
||||
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
|
||||
@ -53,7 +59,8 @@
|
||||
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
|
||||
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
|
||||
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
|
||||
"final_answer": "Ich werde die Antwort abschließen."
|
||||
"final_answer": "Ich werde die Antwort abschließen.",
|
||||
"blocked_content": "Mist! ${url} lässt mich nicht rein."
|
||||
},
|
||||
"es": {
|
||||
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
|
||||
@ -61,7 +68,8 @@
|
||||
"read_for": "Voy a leer ${urls} para recopilar más información.",
|
||||
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
|
||||
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
|
||||
"final_answer": "Voy a finalizar la respuesta."
|
||||
"final_answer": "Voy a finalizar la respuesta.",
|
||||
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!"
|
||||
},
|
||||
"it": {
|
||||
"eval_first": "Un attimo, valuterò prima la risposta.",
|
||||
@ -77,7 +85,8 @@
|
||||
"read_for": "Vou ler ${urls} para reunir mais informações.",
|
||||
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
|
||||
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
|
||||
"final_answer": "Vou finalizar a resposta."
|
||||
"final_answer": "Vou finalizar a resposta.",
|
||||
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!"
|
||||
},
|
||||
"ru": {
|
||||
"eval_first": "Подождите, я сначала оценю ответ.",
|
||||
@ -85,14 +94,16 @@
|
||||
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
|
||||
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
|
||||
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
|
||||
"final_answer": "Дайте мне завершить ответ."
|
||||
"final_answer": "Дайте мне завершить ответ.",
|
||||
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!"
|
||||
},
|
||||
"ar": {
|
||||
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
|
||||
"search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
|
||||
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
|
||||
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
|
||||
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة."
|
||||
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
|
||||
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!"
|
||||
},
|
||||
"nl": {
|
||||
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
|
||||
@ -100,7 +111,8 @@
|
||||
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
|
||||
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
|
||||
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
|
||||
"final_answer": "Ik zal het antwoord afronden."
|
||||
"final_answer": "Ik zal het antwoord afronden.",
|
||||
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}."
|
||||
},
|
||||
"zh": {
|
||||
"eval_first": "等等,让我先评估一下答案。",
|
||||
@ -108,6 +120,7 @@
|
||||
"read_for": "让我阅读 ${urls} 来获取更多信息。",
|
||||
"read_for_verify": "让我获取源内容来验证答案。",
|
||||
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
|
||||
"final_answer": "我来整理一下答案。"
|
||||
"final_answer": "我来整理一下答案。",
|
||||
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。"
|
||||
}
|
||||
}
|
||||
@ -5,6 +5,7 @@ import {readUrl} from "../tools/read";
|
||||
import {Schemas} from "./schemas";
|
||||
import {cherryPick} from "../tools/jina-latechunk";
|
||||
import {formatDateBasedOnType} from "./date-tools";
|
||||
import {classifyText} from "../tools/jina-classify-spam";
|
||||
|
||||
export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
removeAnchors: true,
|
||||
@ -430,10 +431,10 @@ export async function processURLs(
|
||||
badURLs: string[],
|
||||
schemaGen: Schemas,
|
||||
question: string
|
||||
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
|
||||
): Promise<{ urlResults: any[], success: boolean }> {
|
||||
// Skip if no URLs to process
|
||||
if (urls.length === 0) {
|
||||
return {urlResults: [], success: false, badURLs: []};
|
||||
return {urlResults: [], success: false};
|
||||
}
|
||||
|
||||
const badHostnames: string[] = [];
|
||||
@ -470,6 +471,16 @@ export async function processURLs(
|
||||
throw new Error('No content found');
|
||||
}
|
||||
|
||||
// check if content is likely a blocked msg from paywall, bot detection, etc.
|
||||
// only classify content of at most spamDetectLength (1000) chars, as most blocking messages are short
|
||||
const spamDetectLength = 1000;
|
||||
const isGoodContent = data.content.length > spamDetectLength || await classifyText(data.content);
|
||||
if (!isGoodContent) {
|
||||
console.error(`Blocked content ${data.content.length}:`, url, data.content.slice(0, spamDetectLength));
|
||||
context.actionTracker.trackThink('blocked_content', schemaGen.languageCode, {url});
|
||||
throw new Error(`Blocked content ${url}`);
|
||||
}
|
||||
|
||||
// Add to knowledge base
|
||||
allKnowledge.push({
|
||||
question: `What do expert say about "${question}"?`,
|
||||
@ -543,7 +554,6 @@ export async function processURLs(
|
||||
return {
|
||||
urlResults: validResults,
|
||||
success: validResults.length > 0,
|
||||
badURLs
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user