mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
feat: filter out blocked content
This commit is contained in:
parent
ef5820729d
commit
92f1a15f8c
@ -11,7 +11,8 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
removeAnchors: true,
|
||||
removeSessionIDs: true,
|
||||
removeUTMParams: true,
|
||||
removeTrackingParams: true
|
||||
removeTrackingParams: true,
|
||||
removeXAnalytics: true // New option to control x.com /analytics removal
|
||||
}) {
|
||||
try {
|
||||
urlString = urlString.replace(/\s+/g, '').trim();
|
||||
@ -28,6 +29,20 @@ export function normalizeUrl(urlString: string, debug = false, options = {
|
||||
throw new Error('Example URL');
|
||||
}
|
||||
|
||||
// Handle x.com and twitter.com URLs with /analytics
|
||||
if (options.removeXAnalytics) {
|
||||
// Match with or without query parameters and fragments
|
||||
const xComPattern = /^(https?:\/\/(www\.)?(x\.com|twitter\.com)\/([^/]+)\/status\/(\d+))\/analytics(\/)?(\?.*)?(#.*)?$/i;
|
||||
const xMatch = urlString.match(xComPattern);
|
||||
if (xMatch) {
|
||||
// Preserve query parameters and fragments if present
|
||||
let cleanUrl = xMatch[1]; // Base URL without /analytics
|
||||
if (xMatch[7]) cleanUrl += xMatch[7]; // Add query parameters if present
|
||||
if (xMatch[8]) cleanUrl += xMatch[8]; // Add fragment if present
|
||||
urlString = cleanUrl;
|
||||
}
|
||||
}
|
||||
|
||||
const url = new URL(urlString);
|
||||
if (url.protocol !== 'http:' && url.protocol !== 'https:') {
|
||||
throw new Error('Unsupported protocol');
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user