使用 HTTPX 替代 AIOHTTP

This commit is contained in:
JoeanAmier 2024-06-27 18:45:14 +08:00
parent 01711be950
commit e7c4d6daee
18 changed files with 222 additions and 105 deletions

View File

@ -151,6 +151,8 @@ async def example():
# 实例对象
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称自动创建默认值Download
name_format = "作品标题 作品描述"
user_agent = "" # User-Agent
cookie = "" # 小红书网页版 Cookie无需登录必需参数登录状态对数据采集有影响
proxy = None # 网络代理
timeout = 5 # 请求数据超时限制单位默认值10
@ -163,6 +165,8 @@ async def example():
pass # 使用默认参数
async with XHS(work_path=work_path,
folder_name=folder_name,
name_format=name_format,
user_agent=user_agent,
cookie=cookie,
proxy=proxy,
timeout=timeout,
@ -212,6 +216,12 @@ async def example():
<td align="center"><code>发布时间 作者昵称 作品标题</code></td>
</tr>
<tr>
<td align="center">user_agent</td>
<td align="center">str</td>
<td align="center">浏览器 User-Agent</td>
<td align="center">内置 Chrome User-Agent</td>
</tr>
<tr>
<td align="center">cookie</td>
<td align="center">str</td>
<td align="center">小红书网页版 Cookie<b>无需登录</b></td>
@ -219,7 +229,7 @@ async def example():
</tr>
<tr>
<td align="center">proxy</td>
<td align="center">str</td>
<td align="center">str|dict</td>
<td align="center">设置程序代理</td>
<td align="center">null</td>
</tr>
@ -349,7 +359,8 @@ async def example():
# 💡 代码参考
* https://docs.aiohttp.org/en/stable/
* https://github.com/encode/httpx/
* https://github.com/tiangolo/fastapi
* https://textual.textualize.io/
* https://aiosqlite.omnilib.dev/en/stable/
* https://click.palletsprojects.com/en/8.1.x/

Binary file not shown.

View File

@ -270,3 +270,18 @@ msgstr "Web API server has been shut down!"
msgid "服务器主机及端口: {0}"
msgstr "Server host and port: {0}"
msgid "内置 Chrome User-Agent"
msgstr "Built-in Chrome User-Agent"
msgid "proxy 参数 {0} 设置错误,程序将不会使用代理"
msgstr "The proxy parameter {0} is set incorrectly, and the program will not use the proxy"
msgid "代理 {0} 测试成功"
msgstr "Proxy {0} tested successfully"
msgid "代理 {0} 测试超时"
msgstr "Proxy {0} test timed out"
msgid "代理 {0} 测试失败:{1}"
msgstr "Proxy {0} test failed: {1}"

View File

@ -270,3 +270,18 @@ msgstr ""
msgid "服务器主机及端口: {0}"
msgstr ""
msgid "内置 Chrome User-Agent"
msgstr ""
msgid "proxy 参数 {0} 设置错误,程序将不会使用代理"
msgstr ""
msgid "代理 {0} 测试成功"
msgstr ""
msgid "代理 {0} 测试超时"
msgstr ""
msgid "代理 {0} 测试失败:{1}"
msgstr ""

View File

@ -16,6 +16,8 @@ async def example():
# 实例对象
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称自动创建默认值Download
name_format = "作品标题 作品描述"
user_agent = "" # User-Agent
cookie = "" # 小红书网页版 Cookie无需登录必需参数登录状态对数据采集有影响
proxy = None # 网络代理
timeout = 5 # 请求数据超时限制单位默认值10
@ -28,6 +30,8 @@ async def example():
pass # 使用默认参数
async with XHS(work_path=work_path,
folder_name=folder_name,
name_format=name_format,
user_agent=user_agent,
cookie=cookie,
proxy=proxy,
timeout=timeout,
@ -60,6 +64,7 @@ if __name__ == '__main__':
if len(argv) == 1:
run(app())
elif argv[1] == "server":
run(server())
print("该模式重构中!")
# run(server())
else:
cli()

View File

@ -1,4 +1,3 @@
aiohttp>=3.9.1
textual>=0.47.1
pyperclip>=1.8.2
lxml>=5.1.0
@ -6,3 +5,6 @@ PyYAML>=6.0.1
aiosqlite>=0.20.0
click>=8.1.7
browser_cookie3>=0.19.1
httpx>=0.27.0
fastapi>=0.110.0
uvicorn>=0.24.0

View File

@ -43,7 +43,11 @@ class XHSDownloader(App):
def __initialization(self) -> None:
    """Load persisted settings, build the translator, and create the XHS core.

    ``_print=False`` suppresses the proxy-test tip during construction; the
    TUI surfaces it later through ``manager.print_proxy_tip``.
    """
    params = self.parameter = self.SETTINGS.run()
    self.message = Translate(params["language"]).message()
    self.APP = XHS(**params, transition=self.message, _print=False)
async def on_mount(self) -> None:
self.install_screen(

View File

@ -87,6 +87,7 @@ class Index(Screen):
">" *
50}",
style=MASTER), scroll_end=False)
self.xhs.manager.print_proxy_tip(log=self.tip, )
@on(Button.Pressed, "#deal")
async def deal_button(self):

View File

View File

@ -39,6 +39,9 @@ class Setting(Screen):
Label(self.message("作品文件名称格式"), classes="params", ),
Input(self.data["name_format"], placeholder=self.message("发布时间 作者昵称 作品标题"), valid_empty=True,
id="name_format", ),
Label(self.message("User-Agent"), classes="params", ),
Input(self.data["user_agent"], placeholder=self.message("内置 Chrome User-Agent"), valid_empty=True,
id="user_agent", ),
Label(self.message("小红书网页版 Cookie"), classes="params", ),
Input(placeholder=self.__check_cookie(), valid_empty=True, id="cookie", ),
Label(self.message("网络代理"), classes="params", ),
@ -98,6 +101,7 @@ class Setting(Screen):
"work_path": self.query_one("#work_path").value,
"folder_name": self.query_one("#folder_name").value,
"name_format": self.query_one("#name_format").value,
"user_agent": self.query_one("#user_agent").value,
"cookie": self.query_one("#cookie").value or self.data["cookie"],
"proxy": self.query_one("#proxy").value or None,
"timeout": int(self.query_one("#timeout").value),

View File

@ -1,6 +1,5 @@
from typing import Callable
from aiohttp import ClientTimeout
from rich.text import Text
from textual import work
from textual.app import ComposeResult
@ -39,7 +38,7 @@ class Update(ModalScreen):
@work()
async def check_update(self) -> None:
try:
url = await self.xhs.html.request_url(RELEASES, False, None, timeout=ClientTimeout(connect=5))
url = await self.xhs.html.request_url(RELEASES, False, None, timeout=5, )
version = url.split("/")[-1]
match self.compare_versions(f"{VERSION_MAJOR}.{VERSION_MINOR}", version, VERSION_BETA):
case 4:

View File

@ -1,4 +1,4 @@
from asyncio import CancelledError
# from asyncio import CancelledError
from asyncio import Event
from asyncio import Queue
from asyncio import QueueEmpty
@ -10,7 +10,7 @@ from re import compile
from typing import Callable
from urllib.parse import urlparse
from aiohttp import web
# from aiohttp import web
from pyperclip import paste
from source.expansion import BrowserCookie
@ -24,7 +24,7 @@ from source.module import (
ERROR,
WARNING,
MASTER,
REPOSITORY,
# REPOSITORY,
)
from source.module import Translate
from source.module import logging
@ -55,7 +55,7 @@ class XHS:
name_format="发布时间 作者昵称 作品标题",
user_agent: str = None,
cookie: str = None,
proxy: str = None,
proxy: str | dict = None,
timeout=10,
chunk=1024 * 1024,
max_retry=5,
@ -69,6 +69,7 @@ class XHS:
# server=False,
transition: Callable[[str], str] = None,
read_cookie: int | str = None,
_print: bool = True,
*args,
**kwargs,
):
@ -78,8 +79,8 @@ class XHS:
work_path,
folder_name,
name_format,
# user_agent,
chunk,
user_agent,
self.read_browser_cookie(read_cookie) or cookie,
proxy,
timeout,
@ -92,6 +93,7 @@ class XHS:
folder_mode,
# server,
self.message,
_print,
)
self.html = Html(self.manager)
self.image = Image()
@ -104,8 +106,8 @@ class XHS:
self.clipboard_cache: str = ""
self.queue = Queue()
self.event = Event()
self.runner = self.init_server()
self.site = None
# self.runner = self.init_server()
# self.site = None
def __extract_image(self, container: dict, data: Namespace):
container["下载地址"], container["动图地址"] = self.image.get_image_link(
@ -232,7 +234,8 @@ class XHS:
values.append(data[key])
return self.manager.SEPARATE.join(values)
def __get_name_time(self, data: dict) -> str:
@staticmethod
def __get_name_time(data: dict) -> str:
    """Return the work's publish timestamp with ':' replaced by '.'.

    ':' is not a legal character in Windows file names; presumably the
    result is embedded into the saved file name — TODO confirm against
    the name-format caller.
    """
    published: str = data["发布时间"]
    return published.replace(":", ".")
def __get_name_author(self, data: dict) -> str:
@ -290,49 +293,49 @@ class XHS:
return BrowserCookie.get(
value, domain="xiaohongshu.com") if value else ""
@staticmethod
async def index(request):
return web.HTTPFound(REPOSITORY)
# @staticmethod
# async def index(request):
# return web.HTTPFound(REPOSITORY)
async def handle(self, request):
data = await request.post()
url = data.get("url")
download = data.get("download", False)
index = data.get("index")
skip = data.get("skip", False)
url = await self.__extract_links(url, None)
if not url:
msg = self.message("提取小红书作品链接失败")
data = None
else:
if data := await self.__deal_extract(url[0], download, index, None, None, not skip, ):
msg = self.message("获取小红书作品数据成功")
else:
msg = self.message("获取小红书作品数据失败")
data = None
return web.json_response(dict(message=msg, url=url[0], data=data))
# async def handle(self, request):
# data = await request.post()
# url = data.get("url")
# download = data.get("download", False)
# index = data.get("index")
# skip = data.get("skip", False)
# url = await self.__extract_links(url, None)
# if not url:
# msg = self.message("提取小红书作品链接失败")
# data = None
# else:
# if data := await self.__deal_extract(url[0], download, index, None, None, not skip, ):
# msg = self.message("获取小红书作品数据成功")
# else:
# msg = self.message("获取小红书作品数据失败")
# data = None
# return web.json_response(dict(message=msg, url=url[0], data=data))
def init_server(self, ):
app = web.Application(debug=True)
app.router.add_get('/', self.index)
app.router.add_post('/xhs/', self.handle)
return web.AppRunner(app)
# def init_server(self, ):
# app = web.Application(debug=True)
# app.router.add_get('/', self.index)
# app.router.add_post('/xhs/', self.handle)
# return web.AppRunner(app)
async def run_server(self, log=None, ):
try:
await self.start_server(log)
while True:
await sleep(3600) # 保持服务器运行
except (CancelledError, KeyboardInterrupt):
await self.close_server(log)
# async def run_server(self, log=None, ):
# try:
# await self.start_server(log)
# while True:
# await sleep(3600) # 保持服务器运行
# except (CancelledError, KeyboardInterrupt):
# await self.close_server(log)
async def start_server(self, log=None, ):
await self.runner.setup()
self.site = web.TCPSite(self.runner, "0.0.0.0")
await self.site.start()
logging(log, self.message("Web API 服务器已启动!"))
logging(log, self.message("服务器主机及端口: {0}".format(self.site.name, )))
# async def start_server(self, log=None, ):
# await self.runner.setup()
# self.site = web.TCPSite(self.runner, "0.0.0.0")
# await self.site.start()
# logging(log, self.message("Web API 服务器已启动!"))
# logging(log, self.message("服务器主机及端口: {0}".format(self.site.name, )))
async def close_server(self, log=None, ):
await self.runner.cleanup()
logging(log, self.message("Web API 服务器已关闭!"))
# async def close_server(self, log=None, ):
# await self.runner.cleanup()
# logging(log, self.message("Web API 服务器已关闭!"))

View File

@ -1,7 +1,7 @@
from asyncio import gather
from pathlib import Path
from aiohttp import ClientError
from httpx import HTTPError
from source.module import ERROR
from source.module import Manager
@ -25,9 +25,8 @@ class Download:
self.manager = manager
self.folder = manager.folder
self.temp = manager.temp
self.proxy = manager.proxy
self.chunk = manager.chunk
self.session = manager.download_session
self.client = manager.download_client
self.retry = manager.retry
self.message = manager.message
self.folder_mode = manager.folder_mode
@ -117,11 +116,11 @@ class Download:
async def __download(self, url: str, path: Path, name: str, format_: str, log, bar):
temp = self.temp.joinpath(f"{name}.{format_}")
try:
async with self.session.get(url, proxy=self.proxy) as response:
if response.status != 200:
async with self.client.stream("GET", url, ) as response:
if response.status_code != 200:
logging(
log, self.message("链接 {0} 请求失败,响应码 {1}").format(
url, response.status), style=ERROR)
url, response.status_code), style=ERROR)
return False
suffix = self.__extract_type(
response.headers.get("Content-Type")) or format_
@ -131,14 +130,14 @@ class Download:
# response.headers.get(
# 'content-length', 0)) or None)
with temp.open("wb") as f:
async for chunk in response.content.iter_chunked(self.chunk):
async for chunk in response.aiter_bytes(self.chunk):
f.write(chunk)
# self.__update_progress(bar, len(chunk))
self.manager.move(temp, real)
# self.__create_progress(bar, None)
logging(log, self.message("文件 {0} 下载成功").format(real.name))
return True
except ClientError as error:
except HTTPError as error:
self.manager.delete(temp)
# self.__create_progress(bar, None)
logging(log, str(error), ERROR)

View File

@ -1,4 +1,4 @@
from aiohttp import ClientError
from httpx import HTTPError
from source.module import ERROR
from source.module import Manager
@ -10,10 +10,9 @@ __all__ = ["Html"]
class Html:
def __init__(self, manager: Manager, ):
    """Cache the shared HTTPX client and helpers owned by *manager*."""
    # The client is created and closed by Manager; Html only borrows it.
    self.client = manager.request_client
    self.message = manager.message
    self.retry = manager.retry
@retry
async def request_url(
@ -24,15 +23,14 @@ class Html:
**kwargs,
) -> str:
try:
async with self.session.get(
url,
proxy=self.proxy,
**kwargs,
) as response:
if response.status != 200:
return ""
return await response.text() if content else str(response.url)
except ClientError as error:
response = await self.client.get(
url,
**kwargs,
)
if response.status_code != 200:
return ""
return response.text if content else str(response.url)
except HTTPError as error:
logging(log, str(error), ERROR)
logging(
log, self.message("网络异常,请求 {0} 失败").format(url), ERROR)

View File

@ -21,6 +21,7 @@ from .static import (
USERSCRIPT,
HEADERS,
PROJECT,
USERAGENT,
)
from .tools import (
retry,
@ -54,4 +55,5 @@ __all__ = [
"PROJECT",
"Translate",
"DataRecorder",
"USERAGENT",
]

View File

@ -5,10 +5,15 @@ from shutil import move
from shutil import rmtree
from typing import Callable
from aiohttp import ClientSession
from aiohttp import ClientTimeout
from httpx import AsyncClient
from httpx import RequestError
from httpx import TimeoutException
from httpx import get
from .static import HEADERS
from .static import USERAGENT
from .static import WARNING
from .tools import logging
__all__ = ["Manager"]
@ -30,6 +35,10 @@ class Manager:
'作者昵称',
'作者ID',
)
NO_PROXY = {
"http://": None,
"https://": None,
}
SEPARATE = "_"
def __init__(
@ -39,8 +48,9 @@ class Manager:
folder: str,
name_format: str,
chunk: int,
user_agent: str,
cookie: str,
proxy: str,
proxy: str | dict,
timeout: int,
retry: int,
record_data: bool,
@ -51,12 +61,14 @@ class Manager:
folder_mode: bool,
# server: bool,
transition: Callable[[str], str],
_print: bool,
):
self.root = root
self.temp = root.joinpath("./temp")
self.path = self.__check_path(path)
self.folder = self.__check_folder(folder)
self.blank_headers = HEADERS
self.message = transition
self.blank_headers = HEADERS | {"User-Agent": user_agent or USERAGENT}
self.headers = self.blank_headers | {"Cookie": cookie}
self.retry = retry
self.chunk = chunk
@ -64,16 +76,20 @@ class Manager:
self.record_data = self.check_bool(record_data, False)
self.image_format = self.__check_image_format(image_format)
self.folder_mode = self.check_bool(folder_mode, False)
self.proxy = proxy
self.request_session = ClientSession(
self.proxy_tip = None
self.proxy = self.__check_proxy(proxy)
self.print_proxy_tip(_print, )
self.request_client = AsyncClient(
headers=self.headers | {
"Referer": "https://www.xiaohongshu.com/explore", },
timeout=ClientTimeout(connect=timeout),
timeout=timeout,
**self.proxy,
)
self.download_session = ClientSession(
self.download_client = AsyncClient(
headers=self.blank_headers,
timeout=ClientTimeout(connect=timeout))
self.message = transition
timeout=timeout,
**self.proxy,
)
self.image_download = self.check_bool(image_download, True)
self.video_download = self.check_bool(video_download, True)
self.live_download = self.check_bool(live_download, True)
@ -134,8 +150,8 @@ class Manager:
return value if isinstance(value, bool) else default
async def close(self):
    """Shut down both shared HTTPX clients, then run temp-file cleanup."""
    for client in (self.request_client, self.download_client):
        await client.aclose()
    self.__clean()
def __check_name_format(self, format_: str) -> str:
@ -148,3 +164,38 @@ class Manager:
),
format_,
)
def __check_proxy(
        self,
        proxy: str | dict,
        url="https://www.baidu.com/",
) -> dict:
    """Validate the user-supplied proxy and build HTTPX client kwargs.

    Performs a blocking test request to *url* through the proxy.  On
    success, returns the kwargs to splat into ``AsyncClient(...)``; on
    any failure, records a tip in ``self.proxy_tip`` (printed later by
    ``print_proxy_tip``) and falls back to a direct connection.

    :param proxy: a single proxy URL (str) or an HTTPX proxies mapping (dict);
        any other type is rejected with a warning tip.
    :param url: endpoint used to probe connectivity through the proxy.
    :return: kwargs dict for ``AsyncClient`` — either the validated proxy
        settings or ``{"proxies": self.NO_PROXY}`` to disable proxying.
    """
    if not proxy:
        return {"proxies": self.NO_PROXY}
    # HTTPX distinguishes a single proxy URL ("proxy") from a
    # scheme-keyed mapping ("proxies"); pick the matching kwarg name.
    if isinstance(proxy, str):
        kwarg = {"proxy": proxy}
    elif isinstance(proxy, dict):
        kwarg = {"proxies": proxy}
    else:
        self.proxy_tip = (
            self.message("proxy 参数 {0} 设置错误,程序将不会使用代理").format(proxy), WARNING,)
        return {"proxies": self.NO_PROXY}
    try:
        # Synchronous probe: runs in __init__ before the async clients exist.
        response = get(
            url,
            **kwarg, )
        if response.status_code < 400:
            # Success tip is a 1-tuple: logged with the default style.
            self.proxy_tip = (self.message("代理 {0} 测试成功").format(proxy),)
            return kwarg
        # NOTE(review): a >=400 response falls through to the direct-connection
        # fallback without recording any tip — confirm this silence is intended.
    except TimeoutException:
        self.proxy_tip = (
            self.message("代理 {0} 测试超时").format(proxy), WARNING,)
    except RequestError as e:
        self.proxy_tip = (
            self.message("代理 {0} 测试失败:{1}").format(
                proxy, e), WARNING,)
    return {"proxies": self.NO_PROXY}
def print_proxy_tip(self, _print: bool = True, log=None, ) -> None:
    """Log the recorded proxy-check outcome, if any.

    :param _print: when False, stay silent (the TUI defers printing).
    :param log: optional sink forwarded to the project ``logging`` helper.
    """
    if not _print:
        return
    if tip := self.proxy_tip:
        logging(log, *tip)

View File

@ -4,6 +4,7 @@ from pathlib import Path
from platform import system
from .static import ROOT
from .static import USERAGENT
__all__ = ['Settings']
@ -13,6 +14,7 @@ class Settings:
"work_path": "",
"folder_name": "Download",
"name_format": "发布时间 作者昵称 作品标题",
"user_agent": USERAGENT,
"cookie": "",
"proxy": None,
"timeout": 10,

View File

@ -18,11 +18,12 @@ __all__ = [
"USERSCRIPT",
"HEADERS",
"PROJECT",
"USERAGENT",
]
VERSION_MAJOR = 2
VERSION_MINOR = 0
VERSION_BETA = False
VERSION_MINOR = 1
VERSION_BETA = True
ROOT = Path(__file__).resolve().parent.parent.parent
PROJECT = f"XHS-Downloader V{VERSION_MAJOR}.{
VERSION_MINOR}{" Beta" if VERSION_BETA else ""}"
@ -33,22 +34,27 @@ RELEASES = "https://github.com/JoeanAmier/XHS-Downloader/releases/latest"
USERSCRIPT = "https://raw.githubusercontent.com/JoeanAmier/XHS-Downloader/master/static/XHS-Downloader.js"
USERAGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 "
"Safari/537.36")
HEADERS = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-SG,zh-CN;q=0.9,zh;q=0.8',
'dnt': '1',
'priority': 'u=0, i',
'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 '
'Safari/537.36',
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
"application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-SG,zh-CN;q=0.9,zh;q=0.8",
"Cookie": "",
"Dnt": "1",
# "Priority": "u=0, i",
# "Sec-Ch-Ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
"Sec-Ch-Ua-Mobile": "?0",
# "Sec-Ch-Ua-Platform": "\"Windows\"",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": USERAGENT,
}
MASTER = "b #fff200"