diff --git a/README.md b/README.md index 4f56edc..3038328 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,8 @@ async def example(): # 实例对象 work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径 folder_name = "Download" # 作品文件储存文件夹名称(自动创建),默认值:Download + name_format = "作品标题 作品描述" + user_agent = "" # User-Agent cookie = "" # 小红书网页版 Cookie,无需登录,必需参数,登录状态对数据采集有影响 proxy = None # 网络代理 timeout = 5 # 请求数据超时限制,单位:秒,默认值:10 @@ -163,6 +165,8 @@ async def example(): pass # 使用默认参数 async with XHS(work_path=work_path, folder_name=folder_name, + name_format=name_format, + user_agent=user_agent, cookie=cookie, proxy=proxy, timeout=timeout, @@ -212,6 +216,12 @@ async def example(): 发布时间 作者昵称 作品标题 +user_agent +str +浏览器 User-Agent +内置 chrome user-agent + + cookie str 小红书网页版 Cookie,无需登录 @@ -219,7 +229,7 @@ async def example(): proxy -str +str|dict 设置程序代理 null @@ -349,7 +359,8 @@ async def example(): # 💡 代码参考 -* https://docs.aiohttp.org/en/stable/ +* https://github.com/encode/httpx/ +* https://github.com/tiangolo/fastapi * https://textual.textualize.io/ * https://aiosqlite.omnilib.dev/en/stable/ * https://click.palletsprojects.com/en/8.1.x/ diff --git a/locale/en_GB/LC_MESSAGES/xhs.mo b/locale/en_GB/LC_MESSAGES/xhs.mo index 860ea0d..398ddee 100644 Binary files a/locale/en_GB/LC_MESSAGES/xhs.mo and b/locale/en_GB/LC_MESSAGES/xhs.mo differ diff --git a/locale/en_GB/LC_MESSAGES/xhs.po b/locale/en_GB/LC_MESSAGES/xhs.po index 4ae6595..2b9a049 100644 --- a/locale/en_GB/LC_MESSAGES/xhs.po +++ b/locale/en_GB/LC_MESSAGES/xhs.po @@ -270,3 +270,18 @@ msgstr "Web API server has been shut down!" 
msgid "服务器主机及端口: {0}" msgstr "Server host and port: {0}" + +msgid "内置 Chrome User-Agent" +msgstr "Built-in Chrome User-Agent" + +msgid "proxy 参数 {0} 设置错误,程序将不会使用代理" +msgstr "The proxy parameter {0} is set incorrectly, and the program will not use the proxy" + +msgid "代理 {0} 测试成功" +msgstr "Proxy {0} tested successfully" + +msgid "代理 {0} 测试超时" +msgstr "Proxy {0} test timed out" + +msgid "代理 {0} 测试失败:{1}" +msgstr "Proxy {0} test failed: {1}" diff --git a/locale/zh_CN/LC_MESSAGES/xhs.po b/locale/zh_CN/LC_MESSAGES/xhs.po index 770e6b3..c3f4b28 100644 --- a/locale/zh_CN/LC_MESSAGES/xhs.po +++ b/locale/zh_CN/LC_MESSAGES/xhs.po @@ -270,3 +270,18 @@ msgstr "" msgid "服务器主机及端口: {0}" msgstr "" + +msgid "内置 Chrome User-Agent" +msgstr "" + +msgid "proxy 参数 {0} 设置错误,程序将不会使用代理" +msgstr "" + +msgid "代理 {0} 测试成功" +msgstr "" + +msgid "代理 {0} 测试超时" +msgstr "" + +msgid "代理 {0} 测试失败:{1}" +msgstr "" diff --git a/main.py b/main.py index b814ebc..ae60a5f 100644 --- a/main.py +++ b/main.py @@ -16,6 +16,8 @@ async def example(): # 实例对象 work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径 folder_name = "Download" # 作品文件储存文件夹名称(自动创建),默认值:Download + name_format = "作品标题 作品描述" + user_agent = "" # User-Agent cookie = "" # 小红书网页版 Cookie,无需登录,必需参数,登录状态对数据采集有影响 proxy = None # 网络代理 timeout = 5 # 请求数据超时限制,单位:秒,默认值:10 @@ -28,6 +30,8 @@ async def example(): pass # 使用默认参数 async with XHS(work_path=work_path, folder_name=folder_name, + name_format=name_format, + user_agent=user_agent, cookie=cookie, proxy=proxy, timeout=timeout, @@ -60,6 +64,7 @@ if __name__ == '__main__': if len(argv) == 1: run(app()) elif argv[1] == "server": - run(server()) + print("该模式重构中!") + # run(server()) else: cli() diff --git a/requirements.txt b/requirements.txt index 02f4adf..25b8099 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -aiohttp>=3.9.1 textual>=0.47.1 pyperclip>=1.8.2 lxml>=5.1.0 @@ -6,3 +5,6 @@ PyYAML>=6.0.1 aiosqlite>=0.20.0 click>=8.1.7 browser_cookie3>=0.19.1 +httpx>=0.27.0 +fastapi>=0.110.0 
+uvicorn>=0.24.0 diff --git a/source/TUI/app.py b/source/TUI/app.py index 2cf7844..6de0422 100644 --- a/source/TUI/app.py +++ b/source/TUI/app.py @@ -43,7 +43,11 @@ class XHSDownloader(App): def __initialization(self) -> None: self.parameter = self.SETTINGS.run() self.message = Translate(self.parameter["language"]).message() - self.APP = XHS(**self.parameter, transition=self.message) + self.APP = XHS( + **self.parameter, + transition=self.message, + _print=False, + ) async def on_mount(self) -> None: self.install_screen( diff --git a/source/TUI/index.py b/source/TUI/index.py index 944e977..1168bcf 100644 --- a/source/TUI/index.py +++ b/source/TUI/index.py @@ -87,6 +87,7 @@ class Index(Screen): ">" * 50}", style=MASTER), scroll_end=False) + self.xhs.manager.print_proxy_tip(log=self.tip, ) @on(Button.Pressed, "#deal") async def deal_button(self): diff --git a/source/TUI/server.py b/source/TUI/server.py deleted file mode 100644 index e69de29..0000000 diff --git a/source/TUI/setting.py b/source/TUI/setting.py index 836f7dd..f95e451 100644 --- a/source/TUI/setting.py +++ b/source/TUI/setting.py @@ -39,6 +39,9 @@ class Setting(Screen): Label(self.message("作品文件名称格式"), classes="params", ), Input(self.data["name_format"], placeholder=self.message("发布时间 作者昵称 作品标题"), valid_empty=True, id="name_format", ), + Label(self.message("User-Agent"), classes="params", ), + Input(self.data["user_agent"], placeholder=self.message("内置 Chrome User-Agent"), valid_empty=True, + id="user_agent", ), Label(self.message("小红书网页版 Cookie"), classes="params", ), Input(placeholder=self.__check_cookie(), valid_empty=True, id="cookie", ), Label(self.message("网络代理"), classes="params", ), @@ -98,6 +101,7 @@ class Setting(Screen): "work_path": self.query_one("#work_path").value, "folder_name": self.query_one("#folder_name").value, "name_format": self.query_one("#name_format").value, + "user_agent": self.query_one("#user_agent").value, "cookie": self.query_one("#cookie").value or self.data["cookie"], 
"proxy": self.query_one("#proxy").value or None, "timeout": int(self.query_one("#timeout").value), diff --git a/source/TUI/update.py b/source/TUI/update.py index 89490bd..36cd5a8 100644 --- a/source/TUI/update.py +++ b/source/TUI/update.py @@ -1,6 +1,5 @@ from typing import Callable -from aiohttp import ClientTimeout from rich.text import Text from textual import work from textual.app import ComposeResult @@ -39,7 +38,7 @@ class Update(ModalScreen): @work() async def check_update(self) -> None: try: - url = await self.xhs.html.request_url(RELEASES, False, None, timeout=ClientTimeout(connect=5)) + url = await self.xhs.html.request_url(RELEASES, False, None, timeout=5, ) version = url.split("/")[-1] match self.compare_versions(f"{VERSION_MAJOR}.{VERSION_MINOR}", version, VERSION_BETA): case 4: diff --git a/source/application/app.py b/source/application/app.py index 763fd13..55256c7 100644 --- a/source/application/app.py +++ b/source/application/app.py @@ -1,4 +1,4 @@ -from asyncio import CancelledError +# from asyncio import CancelledError from asyncio import Event from asyncio import Queue from asyncio import QueueEmpty @@ -10,7 +10,7 @@ from re import compile from typing import Callable from urllib.parse import urlparse -from aiohttp import web +# from aiohttp import web from pyperclip import paste from source.expansion import BrowserCookie @@ -24,7 +24,7 @@ from source.module import ( ERROR, WARNING, MASTER, - REPOSITORY, + # REPOSITORY, ) from source.module import Translate from source.module import logging @@ -55,7 +55,7 @@ class XHS: name_format="发布时间 作者昵称 作品标题", user_agent: str = None, cookie: str = None, - proxy: str = None, + proxy: str | dict = None, timeout=10, chunk=1024 * 1024, max_retry=5, @@ -69,6 +69,7 @@ class XHS: # server=False, transition: Callable[[str], str] = None, read_cookie: int | str = None, + _print: bool = True, *args, **kwargs, ): @@ -78,8 +79,8 @@ class XHS: work_path, folder_name, name_format, - # user_agent, chunk, + user_agent, 
self.read_browser_cookie(read_cookie) or cookie, proxy, timeout, @@ -92,6 +93,7 @@ class XHS: folder_mode, # server, self.message, + _print, ) self.html = Html(self.manager) self.image = Image() @@ -104,8 +106,8 @@ class XHS: self.clipboard_cache: str = "" self.queue = Queue() self.event = Event() - self.runner = self.init_server() - self.site = None + # self.runner = self.init_server() + # self.site = None def __extract_image(self, container: dict, data: Namespace): container["下载地址"], container["动图地址"] = self.image.get_image_link( @@ -232,7 +234,8 @@ class XHS: values.append(data[key]) return self.manager.SEPARATE.join(values) - def __get_name_time(self, data: dict) -> str: + @staticmethod + def __get_name_time(data: dict) -> str: return data["发布时间"].replace(":", ".") def __get_name_author(self, data: dict) -> str: @@ -290,49 +293,49 @@ class XHS: return BrowserCookie.get( value, domain="xiaohongshu.com") if value else "" - @staticmethod - async def index(request): - return web.HTTPFound(REPOSITORY) + # @staticmethod + # async def index(request): + # return web.HTTPFound(REPOSITORY) - async def handle(self, request): - data = await request.post() - url = data.get("url") - download = data.get("download", False) - index = data.get("index") - skip = data.get("skip", False) - url = await self.__extract_links(url, None) - if not url: - msg = self.message("提取小红书作品链接失败") - data = None - else: - if data := await self.__deal_extract(url[0], download, index, None, None, not skip, ): - msg = self.message("获取小红书作品数据成功") - else: - msg = self.message("获取小红书作品数据失败") - data = None - return web.json_response(dict(message=msg, url=url[0], data=data)) + # async def handle(self, request): + # data = await request.post() + # url = data.get("url") + # download = data.get("download", False) + # index = data.get("index") + # skip = data.get("skip", False) + # url = await self.__extract_links(url, None) + # if not url: + # msg = self.message("提取小红书作品链接失败") + # data = None + # else: + # if 
data := await self.__deal_extract(url[0], download, index, None, None, not skip, ): + # msg = self.message("获取小红书作品数据成功") + # else: + # msg = self.message("获取小红书作品数据失败") + # data = None + # return web.json_response(dict(message=msg, url=url[0], data=data)) - def init_server(self, ): - app = web.Application(debug=True) - app.router.add_get('/', self.index) - app.router.add_post('/xhs/', self.handle) - return web.AppRunner(app) + # def init_server(self, ): + # app = web.Application(debug=True) + # app.router.add_get('/', self.index) + # app.router.add_post('/xhs/', self.handle) + # return web.AppRunner(app) - async def run_server(self, log=None, ): - try: - await self.start_server(log) - while True: - await sleep(3600) # 保持服务器运行 - except (CancelledError, KeyboardInterrupt): - await self.close_server(log) + # async def run_server(self, log=None, ): + # try: + # await self.start_server(log) + # while True: + # await sleep(3600) # 保持服务器运行 + # except (CancelledError, KeyboardInterrupt): + # await self.close_server(log) - async def start_server(self, log=None, ): - await self.runner.setup() - self.site = web.TCPSite(self.runner, "0.0.0.0") - await self.site.start() - logging(log, self.message("Web API 服务器已启动!")) - logging(log, self.message("服务器主机及端口: {0}".format(self.site.name, ))) + # async def start_server(self, log=None, ): + # await self.runner.setup() + # self.site = web.TCPSite(self.runner, "0.0.0.0") + # await self.site.start() + # logging(log, self.message("Web API 服务器已启动!")) + # logging(log, self.message("服务器主机及端口: {0}".format(self.site.name, ))) - async def close_server(self, log=None, ): - await self.runner.cleanup() - logging(log, self.message("Web API 服务器已关闭!")) + # async def close_server(self, log=None, ): + # await self.runner.cleanup() + # logging(log, self.message("Web API 服务器已关闭!")) diff --git a/source/application/download.py b/source/application/download.py index 9ec80d5..5af2b59 100644 --- a/source/application/download.py +++ 
b/source/application/download.py @@ -1,7 +1,7 @@ from asyncio import gather from pathlib import Path -from aiohttp import ClientError +from httpx import HTTPError from source.module import ERROR from source.module import Manager @@ -25,9 +25,8 @@ class Download: self.manager = manager self.folder = manager.folder self.temp = manager.temp - self.proxy = manager.proxy self.chunk = manager.chunk - self.session = manager.download_session + self.client = manager.download_client self.retry = manager.retry self.message = manager.message self.folder_mode = manager.folder_mode @@ -117,11 +116,11 @@ class Download: async def __download(self, url: str, path: Path, name: str, format_: str, log, bar): temp = self.temp.joinpath(f"{name}.{format_}") try: - async with self.session.get(url, proxy=self.proxy) as response: - if response.status != 200: + async with self.client.stream("GET", url, ) as response: + if response.status_code != 200: logging( log, self.message("链接 {0} 请求失败,响应码 {1}").format( - url, response.status), style=ERROR) + url, response.status_code), style=ERROR) return False suffix = self.__extract_type( response.headers.get("Content-Type")) or format_ @@ -131,14 +130,14 @@ class Download: # response.headers.get( # 'content-length', 0)) or None) with temp.open("wb") as f: - async for chunk in response.content.iter_chunked(self.chunk): + async for chunk in response.aiter_bytes(self.chunk): f.write(chunk) # self.__update_progress(bar, len(chunk)) self.manager.move(temp, real) # self.__create_progress(bar, None) logging(log, self.message("文件 {0} 下载成功").format(real.name)) return True - except ClientError as error: + except HTTPError as error: self.manager.delete(temp) # self.__create_progress(bar, None) logging(log, str(error), ERROR) diff --git a/source/application/request.py b/source/application/request.py index d1b83fc..42e1779 100644 --- a/source/application/request.py +++ b/source/application/request.py @@ -1,4 +1,4 @@ -from aiohttp import ClientError +from httpx 
import HTTPError from source.module import ERROR from source.module import Manager @@ -10,10 +10,9 @@ __all__ = ["Html"] class Html: def __init__(self, manager: Manager, ): - self.proxy = manager.proxy self.retry = manager.retry self.message = manager.message - self.session = manager.request_session + self.client = manager.request_client @retry async def request_url( @@ -24,15 +23,14 @@ class Html: **kwargs, ) -> str: try: - async with self.session.get( - url, - proxy=self.proxy, - **kwargs, - ) as response: - if response.status != 200: - return "" - return await response.text() if content else str(response.url) - except ClientError as error: + response = await self.client.get( + url, + **kwargs, + ) + if response.status_code != 200: + return "" + return response.text if content else str(response.url) + except HTTPError as error: logging(log, str(error), ERROR) logging( log, self.message("网络异常,请求 {0} 失败").format(url), ERROR) diff --git a/source/module/__init__.py b/source/module/__init__.py index e03f620..74793c6 100644 --- a/source/module/__init__.py +++ b/source/module/__init__.py @@ -21,6 +21,7 @@ from .static import ( USERSCRIPT, HEADERS, PROJECT, + USERAGENT, ) from .tools import ( retry, @@ -54,4 +55,5 @@ __all__ = [ "PROJECT", "Translate", "DataRecorder", + "USERAGENT", ] diff --git a/source/module/manager.py b/source/module/manager.py index 98851ca..80b4ecd 100644 --- a/source/module/manager.py +++ b/source/module/manager.py @@ -5,10 +5,15 @@ from shutil import move from shutil import rmtree from typing import Callable -from aiohttp import ClientSession -from aiohttp import ClientTimeout +from httpx import AsyncClient +from httpx import RequestError +from httpx import TimeoutException +from httpx import get from .static import HEADERS +from .static import USERAGENT +from .static import WARNING +from .tools import logging __all__ = ["Manager"] @@ -30,6 +35,10 @@ class Manager: '作者昵称', '作者ID', ) + NO_PROXY = { + "http://": None, + "https://": None, + } 
SEPARATE = "_" def __init__( @@ -39,8 +48,9 @@ class Manager: folder: str, name_format: str, chunk: int, + user_agent: str, cookie: str, - proxy: str, + proxy: str | dict, timeout: int, retry: int, record_data: bool, @@ -51,12 +61,14 @@ class Manager: folder_mode: bool, # server: bool, transition: Callable[[str], str], + _print: bool, ): self.root = root self.temp = root.joinpath("./temp") self.path = self.__check_path(path) self.folder = self.__check_folder(folder) - self.blank_headers = HEADERS + self.message = transition + self.blank_headers = HEADERS | {"User-Agent": user_agent or USERAGENT} self.headers = self.blank_headers | {"Cookie": cookie} self.retry = retry self.chunk = chunk @@ -64,16 +76,20 @@ class Manager: self.record_data = self.check_bool(record_data, False) self.image_format = self.__check_image_format(image_format) self.folder_mode = self.check_bool(folder_mode, False) - self.proxy = proxy - self.request_session = ClientSession( + self.proxy_tip = None + self.proxy = self.__check_proxy(proxy) + self.print_proxy_tip(_print, ) + self.request_client = AsyncClient( headers=self.headers | { "Referer": "https://www.xiaohongshu.com/explore", }, - timeout=ClientTimeout(connect=timeout), + timeout=timeout, + **self.proxy, ) - self.download_session = ClientSession( + self.download_client = AsyncClient( headers=self.blank_headers, - timeout=ClientTimeout(connect=timeout)) - self.message = transition + timeout=timeout, + **self.proxy, + ) self.image_download = self.check_bool(image_download, True) self.video_download = self.check_bool(video_download, True) self.live_download = self.check_bool(live_download, True) @@ -134,8 +150,8 @@ class Manager: return value if isinstance(value, bool) else default async def close(self): - await self.request_session.close() - await self.download_session.close() + await self.request_client.aclose() + await self.download_client.aclose() self.__clean() def __check_name_format(self, format_: str) -> str: @@ -148,3 +164,38 @@ 
class Manager: ), format_, ) + + def __check_proxy( + self, + proxy: str | dict, + url="https://www.baidu.com/", + ) -> dict: + if not proxy: + return {"proxies": self.NO_PROXY} + if isinstance(proxy, str): + kwarg = {"proxy": proxy} + elif isinstance(proxy, dict): + kwarg = {"proxies": proxy} + else: + self.proxy_tip = ( + self.message("proxy 参数 {0} 设置错误,程序将不会使用代理").format(proxy), WARNING,) + return {"proxies": self.NO_PROXY} + try: + response = get( + url, + **kwarg, ) + if response.status_code < 400: + self.proxy_tip = (self.message("代理 {0} 测试成功").format(proxy),) + return kwarg + except TimeoutException: + self.proxy_tip = ( + self.message("代理 {0} 测试超时").format(proxy), WARNING,) + except RequestError as e: + self.proxy_tip = ( + self.message("代理 {0} 测试失败:{1}").format( + proxy, e), WARNING,) + return {"proxies": self.NO_PROXY} + + def print_proxy_tip(self, _print: bool = True, log=None, ) -> None: + if _print and self.proxy_tip: + logging(log, *self.proxy_tip) diff --git a/source/module/settings.py b/source/module/settings.py index 6fe88e0..34cb5f0 100644 --- a/source/module/settings.py +++ b/source/module/settings.py @@ -4,6 +4,7 @@ from pathlib import Path from platform import system from .static import ROOT +from .static import USERAGENT __all__ = ['Settings'] @@ -13,6 +14,7 @@ class Settings: "work_path": "", "folder_name": "Download", "name_format": "发布时间 作者昵称 作品标题", + "user_agent": USERAGENT, "cookie": "", "proxy": None, "timeout": 10, diff --git a/source/module/static.py b/source/module/static.py index 0dd7932..8b473c0 100644 --- a/source/module/static.py +++ b/source/module/static.py @@ -18,11 +18,12 @@ __all__ = [ "USERSCRIPT", "HEADERS", "PROJECT", + "USERAGENT", ] VERSION_MAJOR = 2 -VERSION_MINOR = 0 -VERSION_BETA = False +VERSION_MINOR = 1 +VERSION_BETA = True ROOT = Path(__file__).resolve().parent.parent.parent PROJECT = f"XHS-Downloader V{VERSION_MAJOR}.{ VERSION_MINOR}{" Beta" if VERSION_BETA else ""}" @@ -33,22 +34,27 @@ RELEASES = 
"https://github.com/JoeanAmier/XHS-Downloader/releases/latest" USERSCRIPT = "https://raw.githubusercontent.com/JoeanAmier/XHS-Downloader/master/static/XHS-Downloader.js" +USERAGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 " + "Safari/537.36") + HEADERS = { - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,' - 'application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'zh-SG,zh-CN;q=0.9,zh;q=0.8', - 'dnt': '1', - 'priority': 'u=0, i', - 'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'document', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-site': 'none', - 'sec-fetch-user': '?1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 ' - 'Safari/537.36', + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Accept-Language": "zh-SG,zh-CN;q=0.9,zh;q=0.8", + "Cookie": "", + "Dnt": "1", + # "Priority": "u=0, i", + # "Sec-Ch-Ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"", + "Sec-Ch-Ua-Mobile": "?0", + # "Sec-Ch-Ua-Platform": "\"Windows\"", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + "Upgrade-Insecure-Requests": "1", + "User-Agent": USERAGENT, } MASTER = "b #fff200"