diff --git a/src/tools/jina-classify-spam.ts b/src/tools/jina-classify-spam.ts
new file mode 100644
index 0000000..7fe9170
--- /dev/null
+++ b/src/tools/jina-classify-spam.ts
@@ -0,0 +1,113 @@
+import axios from 'axios';
+import { TokenTracker } from "../utils/token-tracker";
+import { JINA_API_KEY } from "../config";
+
+const JINA_API_URL = 'https://api.jina.ai/v1/classify';
+
+// Types for Jina Classification API
+interface JinaClassifyRequest {
+  classifier_id: string;
+  input: string[];
+}
+
+interface JinaClassifyResponse {
+  usage: {
+    total_tokens: number;
+  };
+  data: Array<{
+    object: string;
+    index: number;
+    prediction: string;
+    score: number;
+    predictions: Array<{
+      label: string;
+      score: number;
+    }>;
+  }>;
+}
+
+/**
+ * Classifies `text` with a Jina classifier and reports whether the first
+ * prediction returned by the API is the string "true".
+ *
+ * Never rejects: a missing API key, a request failure, a timeout, or an empty
+ * prediction list are all logged and reported as `false`.
+ *
+ * @param text - The text to classify; sent as the single element of `input`.
+ * @param classifierId - Classifier to use (defaults to the spam classifier ID).
+ * @param timeoutMs - Deadline for the request in milliseconds.
+ * @param tracker - Receives the token usage reported by the API; a new
+ *   TokenTracker is created when omitted.
+ * @returns `true` only when the first prediction is "true", otherwise `false`.
+ */
+export async function classifyText(
+  text: string,
+  classifierId: string = "4a27dea0-381e-407c-bc67-250de45763dd", // Default spam classifier ID
+  timeoutMs: number = 5000, // Default timeout of 5 seconds
+  tracker?: TokenTracker
+): Promise<boolean> {
+  // Handle of the deadline timer, kept so it can be cancelled once the race settles.
+  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
+
+  try {
+    if (!JINA_API_KEY) {
+      throw new Error('JINA_API_KEY is not set');
+    }
+
+    const request: JinaClassifyRequest = {
+      classifier_id: classifierId,
+      input: [text]
+    };
+
+    // Create a timeout promise
+    const timeoutPromise = new Promise<never>((_, reject) => {
+      timeoutHandle = setTimeout(
+        () => reject(new Error(`Classification request timed out after ${timeoutMs}ms`)),
+        timeoutMs
+      );
+    });
+
+    // Make the API request with axios
+    const apiRequestPromise = axios.post<JinaClassifyResponse>(
+      JINA_API_URL,
+      request,
+      {
+        headers: {
+          'Content-Type': 'application/json',
+          'Authorization': `Bearer ${JINA_API_KEY}`
+        },
+        timeout: timeoutMs // Also set axios timeout
+      }
+    );
+
+    // Race the API request against the timeout
+    const response = await Promise.race([apiRequestPromise, timeoutPromise]);
+
+    // Track token usage from the API
+    (tracker || new TokenTracker()).trackUsage('classify', {
+      promptTokens: response.data.usage.total_tokens,
+      completionTokens: 0,
+      totalTokens: response.data.usage.total_tokens
+    });
+
+    // Extract the prediction field and convert to boolean
+    if (response.data.data && response.data.data.length > 0) {
+      // Convert string "true"/"false" to actual boolean
+      return response.data.data[0].prediction === "true";
+    }
+
+    return false; // Default to false if no prediction is available
+  } catch (error) {
+    if (error instanceof Error && error.message.includes('timed out')) {
+      console.error('Classification request timed out:', error.message);
+    } else {
+      console.error('Error in classifying text:', error);
+    }
+    return false; // Default to false in case of error or timeout
+  } finally {
+    // Cancel the deadline timer so it does not linger after the call has settled.
+    if (timeoutHandle !== undefined) {
+      clearTimeout(timeoutHandle);
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/utils/i18n.json b/src/utils/i18n.json
index b307fed..def2e21 100644
--- a/src/utils/i18n.json
+++ b/src/utils/i18n.json
@@ -5,7 +5,8 @@
     "read_for": "Let me read ${urls} to gather more information.",
     "read_for_verify": "Let me fetch the source content to verify the answer.",
     "late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
-    "final_answer": "Let me finalize the answer."
+    "final_answer": "Let me finalize the answer.",
+    "blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked."
   },
   "zh-CN": {
     "eval_first": "等等,让我先自己评估一下答案。",
@@ -13,7 +14,8 @@
     "read_for": "让我读取网页 ${urls} 来获取更多信息。",
     "read_for_verify": "让我读取源网页内容来验证答案。",
     "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
-    "final_answer": "我来整理一下答案。"
+    "final_answer": "我来整理一下答案。",
+    "blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。"
   },
   "zh-TW": {
     "eval_first": "等等,讓我先評估一下答案。",
@@ -21,7 +23,8 @@
     "read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
     "read_for_verify": "讓我獲取源內容來驗證答案。",
     "late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
-    "final_answer": "我來整理一下答案。"
+    "final_answer": "我來整理一下答案。",
+    "blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有够麻烦!"
}, "ja": { "eval_first": "ちょっと待って、まず答えを評価します。", @@ -29,7 +32,8 @@ "read_for": "${urls} を読んで、情報を集めます。", "read_for_verify": "答えを確認するために、ソースコンテンツを取得します。", "late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。", - "final_answer": "答えをまとめます。" + "final_answer": "答えをまとめます。", + "blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。" }, "ko": { "eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.", @@ -37,7 +41,8 @@ "read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.", "read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.", "late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.", - "final_answer": "답변을 마무리하겠습니다." + "final_answer": "답변을 마무리하겠습니다.", + "blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!" }, "fr": { "eval_first": "Un instant, je vais d'abord évaluer la réponse.", @@ -45,7 +50,8 @@ "read_for": "Je vais lire ${urls} pour obtenir plus d'informations.", "read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.", "late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.", - "final_answer": "Je vais finaliser la réponse." + "final_answer": "Je vais finaliser la réponse.", + "blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !" }, "de": { "eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.", @@ -53,7 +59,8 @@ "read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.", "read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.", "late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.", - "final_answer": "Ich werde die Antwort abschließen." + "final_answer": "Ich werde die Antwort abschließen.", + "blocked_content": "Mist! ${url} lässt mich nicht rein." 
}, "es": { "eval_first": "Un momento, voy a evaluar la respuesta primero.", @@ -61,7 +68,8 @@ "read_for": "Voy a leer ${urls} para recopilar más información.", "read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.", "late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.", - "final_answer": "Voy a finalizar la respuesta." + "final_answer": "Voy a finalizar la respuesta.", + "blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!" }, "it": { "eval_first": "Un attimo, valuterò prima la risposta.", @@ -77,7 +85,8 @@ "read_for": "Vou ler ${urls} para reunir mais informações.", "read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.", "late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.", - "final_answer": "Vou finalizar a resposta." + "final_answer": "Vou finalizar a resposta.", + "blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!" }, "ru": { "eval_first": "Подождите, я сначала оценю ответ.", @@ -85,14 +94,16 @@ "read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.", "read_for_verify": "Дайте мне получить исходный контент для проверки ответа.", "late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.", - "final_answer": "Дайте мне завершить ответ." + "final_answer": "Дайте мне завершить ответ.", + "blocked_content": "Ой! Меня заблокировал ${url}, не круто!" }, "ar": { "eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.", "search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.", "read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.", "read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.", - "late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة." + "late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.", + "blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!" 
}, "nl": { "eval_first": "Een moment, ik zal het antwoord eerst evalueren.", @@ -100,7 +111,8 @@ "read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.", "read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.", "late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.", - "final_answer": "Ik zal het antwoord afronden." + "final_answer": "Ik zal het antwoord afronden.", + "blocked_content": "Verdorie! Ik word geblokkeerd door ${url}." }, "zh": { "eval_first": "等等,让我先评估一下答案。", @@ -108,6 +120,7 @@ "read_for": "让我阅读 ${urls} 来获取更多信息。", "read_for_verify": "让我获取源内容来验证答案。", "late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。", - "final_answer": "我来整理一下答案。" + "final_answer": "我来整理一下答案。", + "blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。" } } \ No newline at end of file diff --git a/src/utils/url-tools.ts b/src/utils/url-tools.ts index 08558cc..e7fa87d 100644 --- a/src/utils/url-tools.ts +++ b/src/utils/url-tools.ts @@ -5,6 +5,7 @@ import {readUrl} from "../tools/read"; import {Schemas} from "./schemas"; import {cherryPick} from "../tools/jina-latechunk"; import {formatDateBasedOnType} from "./date-tools"; +import {classifyText} from "../tools/jina-classify-spam"; export function normalizeUrl(urlString: string, debug = false, options = { removeAnchors: true, @@ -430,10 +431,10 @@ export async function processURLs( badURLs: string[], schemaGen: Schemas, question: string -): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> { +): Promise<{ urlResults: any[], success: boolean }> { // Skip if no URLs to process if (urls.length === 0) { - return {urlResults: [], success: false, badURLs: []}; + return {urlResults: [], success: false}; } const badHostnames: string[] = []; @@ -470,6 +471,16 @@ export async function processURLs( throw new Error('No content found'); } + // check if content is likely a blocked msg from paywall, bot detection, etc. 
+        // only classify content of at most spamDetectLength chars, as most blocking msg is short
+        const spamDetectLength = 1000;
+        const isGoodContent = data.content.length > spamDetectLength || !(await classifyText(data.content)); // classifyText is true when the spam classifier predicts "true", and false on any error/timeout
+        if (!isGoodContent) {
+          console.error(`Blocked content ${data.content.length}:`, url, data.content.slice(0, spamDetectLength));
+          context.actionTracker.trackThink('blocked_content', schemaGen.languageCode, {url});
+          throw new Error(`Blocked content ${url}`);
+        }
+
         // Add to knowledge base
         allKnowledge.push({
           question: `What do expert say about "${question}"?`,
@@ -543,7 +554,6 @@ export async function processURLs(
 
   return {
     urlResults: validResults,
     success: validResults.length > 0,
-    badURLs
   };
 }