feat: filter out blocked content

This commit is contained in:
Han Xiao 2025-03-19 14:53:20 +08:00
parent b53bfdbaa5
commit 96d856c848
3 changed files with 128 additions and 16 deletions

View File

@ -0,0 +1,89 @@
import axios from 'axios';
import { TokenTracker } from "../utils/token-tracker";
import { JINA_API_KEY } from "../config";
// Endpoint of the Jina classification API; classifyText POSTs its request here.
const JINA_API_URL = 'https://api.jina.ai/v1/classify';
// Types for Jina Classification API
/** JSON body that classifyText posts to the Jina classify endpoint. */
interface JinaClassifyRequest {
  /** Identifier of the classifier that should label the inputs. */
  classifier_id: string;
  /** Texts to classify; classifyText always sends exactly one. */
  input: Array<string>;
}
/** Shape of the JSON payload returned by the Jina classify endpoint. */
interface JinaClassifyResponse {
  /** Token accounting for the request; classifyText forwards it to the token tracker. */
  usage: { total_tokens: number };
  /** Classification results; classifyText only reads the first entry. */
  data: {
    object: string;
    /** Presumably the position of the matching text in the request's `input` — confirm against the API docs. */
    index: number;
    /** Winning label; classifyText compares it with the string "true". */
    prediction: string;
    score: number;
    /** Label/score pairs; presumably one per candidate label — confirm against the API docs. */
    predictions: { label: string; score: number }[];
  }[];
}
/**
 * Classifies a piece of text with the Jina classification API and reports
 * whether the prediction for it is the string "true".
 *
 * This is a best-effort helper: it never throws. A missing API key, an HTTP
 * failure, a timeout or a malformed response is logged with `console.error`
 * and reported as `false`.
 *
 * @param text - Text to classify; sent as the only element of the request's `input`.
 * @param classifierId - Classifier to use; defaults to the spam classifier.
 * @param timeoutMs - Maximum time to wait for the API, in milliseconds.
 * @param tracker - Receives the token usage reported by the API. When omitted,
 *   usage is recorded on a throwaway `TokenTracker` instance.
 * @returns `true` only when the first result's `prediction` equals "true";
 *   `false` for any other prediction, an empty result, an error or a timeout.
 */
export async function classifyText(
  text: string,
  classifierId: string = "4a27dea0-381e-407c-bc67-250de45763dd", // Default spam classifier ID
  timeoutMs: number = 5000, // Default timeout of 5 seconds
  tracker?: TokenTracker
): Promise<boolean> {
  // Handle of the guard timer, kept so it can be cancelled once the race is decided.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

  try {
    if (!JINA_API_KEY) {
      throw new Error('JINA_API_KEY is not set');
    }

    const request: JinaClassifyRequest = {
      classifier_id: classifierId,
      input: [text]
    };

    // Guard timer: rejects if the request has not settled within timeoutMs.
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(
        () => reject(new Error(`Classification request timed out after ${timeoutMs}ms`)),
        timeoutMs
      );
    });

    // Make the API request with axios
    const apiRequestPromise = axios.post<JinaClassifyResponse>(
      JINA_API_URL,
      request,
      {
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${JINA_API_KEY}`
        },
        timeout: timeoutMs // Also set axios timeout
      }
    );

    // Race the API request against the timeout. The timeout side is
    // Promise<never>, so the result keeps the typed axios response.
    const response = await Promise.race([apiRequestPromise, timeoutPromise]);

    // Track token usage from the API
    const totalTokens = response.data.usage.total_tokens;
    (tracker ?? new TokenTracker()).trackUsage('classify', {
      promptTokens: totalTokens,
      completionTokens: 0,
      totalTokens
    });

    // Extract the prediction field and convert string "true"/"false" to a boolean
    const results = response.data.data;
    if (results && results.length > 0) {
      return results[0].prediction === "true";
    }
    return false; // Default to false if no prediction is available
  } catch (error) {
    if (error instanceof Error && error.message.includes('timed out')) {
      console.error('Classification request timed out:', error.message);
    } else {
      console.error('Error in classifying text:', error);
    }
    return false; // Default to false in case of error or timeout
  } finally {
    // Cancel the guard timer so it does not keep the event loop alive (or fire
    // a pointless rejection) after the request has already settled.
    if (timeoutHandle !== undefined) {
      clearTimeout(timeoutHandle);
    }
  }
}

View File

@ -5,7 +5,8 @@
"read_for": "Let me read ${urls} to gather more information.",
"read_for_verify": "Let me fetch the source content to verify the answer.",
"late_chunk": "Content of ${url} is too long, let me cherry-pick the relevant parts.",
"final_answer": "Let me finalize the answer."
"final_answer": "Let me finalize the answer.",
"blocked_content": "Hmm...the content of ${url} doesn't look right, I might be blocked."
},
"zh-CN": {
"eval_first": "等等,让我先自己评估一下答案。",
@ -13,7 +14,8 @@
"read_for": "让我读取网页 ${urls} 来获取更多信息。",
"read_for_verify": "让我读取源网页内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。"
"final_answer": "我来整理一下答案。",
"blocked_content": "额…这个 ${url} 的内容不太对啊,我是不是被屏蔽了啊。"
},
"zh-TW": {
"eval_first": "等等,讓我先評估一下答案。",
@ -21,7 +23,8 @@
"read_for": "讓我閱讀 ${urls} 來獲取更多信息。",
"read_for_verify": "讓我獲取源內容來驗證答案。",
"late_chunk": "網頁 ${url} 內容太長,我正在挑選相關部分。",
"final_answer": "我來整理一下答案。"
"final_answer": "我來整理一下答案。",
"blocked_content": "咦...奇怪了,${url} 好像把我擋在門外了。有夠麻煩!"
},
"ja": {
"eval_first": "ちょっと待って、まず答えを評価します。",
@ -29,7 +32,8 @@
"read_for": "${urls} を読んで、情報を集めます。",
"read_for_verify": "答えを確認するために、ソースコンテンツを取得します。",
"late_chunk": "${url} のコンテンツが長すぎるため、関連部分を選択します。",
"final_answer": "答えをまとめます。"
"final_answer": "答えをまとめます。",
"blocked_content": "あれ?${url}にアクセスできないみたいです。壁にぶつかってしまいました。申し訳ありません。"
},
"ko": {
"eval_first": "잠시만요, 먼저 답변을 평가해 보겠습니다.",
@ -37,7 +41,8 @@
"read_for": "${urls} 을 읽어 더 많은 정보를 수집하겠습니다.",
"read_for_verify": "답변을 확인하기 위해 소스 콘텐츠를 가져오겠습니다.",
"late_chunk": "${url} 의 콘텐츠가 너무 길어, 관련 부분을 선택하겠습니다.",
"final_answer": "답변을 마무리하겠습니다."
"final_answer": "답변을 마무리하겠습니다.",
"blocked_content": "어라? ${url}에서 문전박대를 당했네요. 참 황당하네요!"
},
"fr": {
"eval_first": "Un instant, je vais d'abord évaluer la réponse.",
@ -45,7 +50,8 @@
"read_for": "Je vais lire ${urls} pour obtenir plus d'informations.",
"read_for_verify": "Je vais récupérer le contenu source pour vérifier la réponse.",
"late_chunk": "Le contenu de ${url} est trop long, je vais sélectionner les parties pertinentes.",
"final_answer": "Je vais finaliser la réponse."
"final_answer": "Je vais finaliser la réponse.",
"blocked_content": "Zut alors ! ${url} me met à la porte. C'est la galère !"
},
"de": {
"eval_first": "Einen Moment, ich werde die Antwort zuerst evaluieren.",
@ -53,7 +59,8 @@
"read_for": "Ich werde ${urls} lesen, um weitere Informationen zu sammeln.",
"read_for_verify": "Ich werde den Quellinhalt abrufen, um die Antwort zu überprüfen.",
"late_chunk": "Der Inhalt von ${url} ist zu lang, ich werde die relevanten Teile auswählen.",
"final_answer": "Ich werde die Antwort abschließen."
"final_answer": "Ich werde die Antwort abschließen.",
"blocked_content": "Mist! ${url} lässt mich nicht rein."
},
"es": {
"eval_first": "Un momento, voy a evaluar la respuesta primero.",
@ -61,7 +68,8 @@
"read_for": "Voy a leer ${urls} para recopilar más información.",
"read_for_verify": "Voy a obtener el contenido fuente para verificar la respuesta.",
"late_chunk": "El contenido de ${url} es demasiado largo, voy a seleccionar las partes relevantes.",
"final_answer": "Voy a finalizar la respuesta."
"final_answer": "Voy a finalizar la respuesta.",
"blocked_content": "¡Oh no! Estoy bloqueado por ${url}, ¡no es genial!"
},
"it": {
"eval_first": "Un attimo, valuterò prima la risposta.",
@ -77,7 +85,8 @@
"read_for": "Vou ler ${urls} para reunir mais informações.",
"read_for_verify": "Vou buscar o conteúdo da fonte para verificar a resposta.",
"late_chunk": "O conteúdo de ${url} é muito longo, vou selecionar as partes relevantes.",
"final_answer": "Vou finalizar a resposta."
"final_answer": "Vou finalizar a resposta.",
"blocked_content": "Ah não! Estou bloqueado por ${url}, não é legal!"
},
"ru": {
"eval_first": "Подождите, я сначала оценю ответ.",
@ -85,14 +94,16 @@
"read_for": "Дайте мне прочитать ${urls} для сбора дополнительной информации.",
"read_for_verify": "Дайте мне получить исходный контент для проверки ответа.",
"late_chunk": "Содержимое ${url} слишком длинное, я выберу только значимые части.",
"final_answer": "Дайте мне завершить ответ."
"final_answer": "Дайте мне завершить ответ.",
"blocked_content": "Ой! Меня заблокировал ${url}, не круто!"
},
"ar": {
"eval_first": "لكن انتظر، دعني أقوم بتقييم الإجابة أولاً.",
"search_for": "دعني أبحث عن ${keywords} لجمع المزيد من المعلومات.",
"read_for": "دعني أقرأ ${urls} لجمع المزيد من المعلومات.",
"read_for_verify": "دعني أحضر محتوى المصدر للتحقق من الإجابة.",
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة."
"late_chunk": "محتوى ${url} طويل جدًا، سأختار الأجزاء ذات الصلة.",
"blocked_content": "أوه لا! أنا محظور من ${url}، ليس جيدًا!"
},
"nl": {
"eval_first": "Een moment, ik zal het antwoord eerst evalueren.",
@ -100,7 +111,8 @@
"read_for": "Ik zal ${urls} lezen om meer informatie te verzamelen.",
"read_for_verify": "Ik zal de broninhoud ophalen om het antwoord te verifiëren.",
"late_chunk": "De inhoud van ${url} is te lang, ik zal de relevante delen selecteren.",
"final_answer": "Ik zal het antwoord afronden."
"final_answer": "Ik zal het antwoord afronden.",
"blocked_content": "Verdorie! Ik word geblokkeerd door ${url}."
},
"zh": {
"eval_first": "等等,让我先评估一下答案。",
@ -108,6 +120,7 @@
"read_for": "让我阅读 ${urls} 来获取更多信息。",
"read_for_verify": "让我获取源内容来验证答案。",
"late_chunk": "网页 ${url} 内容太长,我正在筛选精华部分。",
"final_answer": "我来整理一下答案。"
"final_answer": "我来整理一下答案。",
"blocked_content": "额…这个内容不太对啊,我感觉被 ${url} 屏蔽了。"
}
}

View File

@ -5,6 +5,7 @@ import {readUrl} from "../tools/read";
import {Schemas} from "./schemas";
import {cherryPick} from "../tools/jina-latechunk";
import {formatDateBasedOnType} from "./date-tools";
import {classifyText} from "../tools/jina-classify-spam";
export function normalizeUrl(urlString: string, debug = false, options = {
removeAnchors: true,
@ -430,10 +431,10 @@ export async function processURLs(
badURLs: string[],
schemaGen: Schemas,
question: string
): Promise<{ urlResults: any[], success: boolean, badURLs: string[] }> {
): Promise<{ urlResults: any[], success: boolean }> {
// Skip if no URLs to process
if (urls.length === 0) {
return {urlResults: [], success: false, badURLs: []};
return {urlResults: [], success: false};
}
const badHostnames: string[] = [];
@ -470,6 +471,16 @@ export async function processURLs(
throw new Error('No content found');
}
// check if content is likely a blocked msg from paywall, bot detection, etc.
// only classify content of at most spamDetectLength chars, as most blocking messages are short
const spamDetectLength = 1000;
const isGoodContent = data.content.length > spamDetectLength || await classifyText(data.content);
if (!isGoodContent) {
console.error(`Blocked content ${data.content.length}:`, url, data.content.slice(0, spamDetectLength));
context.actionTracker.trackThink('blocked_content', schemaGen.languageCode, {url});
throw new Error(`Blocked content ${url}`);
}
// Add to knowledge base
allKnowledge.push({
question: `What do expert say about "${question}"?`,
@ -543,7 +554,6 @@ export async function processURLs(
return {
urlResults: validResults,
success: validResults.length > 0,
badURLs
};
}