新增 livePhoto 文件下载功能

This commit is contained in:
JoeanAmier
2024-06-08 11:23:52 +08:00
parent bd0780c344
commit 54b7cf86a5
16 changed files with 123 additions and 85 deletions

View File

@@ -18,6 +18,7 @@
<li>✅ 采集小红书作品信息</li>
<li>✅ 提取小红书作品下载地址</li>
<li>✅ 下载小红书无水印作品文件</li>
<li>✅ 下载小红书 livePhoto 文件(非无水印)</li>
<li>✅ 自动跳过已下载的作品文件</li>
<li>✅ 作品文件完整性处理机制</li>
<li>✅ 自定义图文作品文件下载格式</li>
@@ -148,8 +149,7 @@ async def example():
# 实例对象
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称(自动创建),默认值:Download
user_agent = "" # 请求头 User-Agent可选参数
cookie = "" # 小红书网页版 Cookie无需登录必需参数
cookie = "" # 小红书网页版 Cookie,无需登录,必需参数,登录状态对数据采集有影响
proxy = None # 网络代理
timeout = 5 # 请求数据超时限制,单位:秒,默认值:10
chunk = 1024 * 1024 * 10 # 下载文件时,每次从服务器获取的数据块大小,单位:字节
@@ -161,7 +161,6 @@ async def example():
pass # 使用默认参数
async with XHS(work_path=work_path,
folder_name=folder_name,
user_agent=user_agent,
cookie=cookie,
proxy=proxy,
timeout=timeout,
@@ -211,12 +210,6 @@ async def example():
<td align="center"><code>发布时间 作者昵称 作品标题</code></td>
</tr>
<tr>
<td align="center">user_agent</td>
<td align="center">str</td>
<td align="center">请求头 User-Agent</td>
<td align="center">默认 UA</td>
</tr>
<tr>
<td align="center">cookie</td>
<td align="center">str</td>
<td align="center">小红书网页版 Cookie<b>无需登录</b></td>
@@ -271,6 +264,12 @@ async def example():
<td align="center">true</td>
</tr>
<tr>
<td align="center">live_download</td>
<td align="center">bool</td>
<td align="center">图文动图文件下载开关</td>
<td align="center">false</td>
</tr>
<tr>
<td align="center">folder_mode</td>
<td align="center">bool</td>
<td align="center">是否将每个作品的文件储存至单独的文件夹;文件夹名称与文件名称保持一致</td>

Binary file not shown.

View File

@@ -109,9 +109,6 @@ msgstr "Xiaohongshu web version cookie, no login required, parameters have been
msgid "小红书网页版 Cookie无需登录参数未设置"
msgstr "Xiaohongshu web version cookie, no login required, parameters not set"
msgid "默认 User-Agent"
msgstr "Default User-Agent"
msgid "不使用代理"
msgstr "No proxy"
@@ -247,6 +244,9 @@ msgstr "Video works download switch"
msgid "图文作品下载开关"
msgstr "Image works download switch"
msgid "动图文件下载开关"
msgstr "Live photo download switch"
msgid "配置文件 settings.json 缺少必要的参数,请删除该文件,然后重新运行程序,自动生成默认配置文件!"
msgstr "The configuration file settings.json is missing necessary parameters. Please delete the file and run the program again to automatically generate the default configuration file!"

View File

@@ -109,9 +109,6 @@ msgstr ""
msgid "小红书网页版 Cookie无需登录参数未设置"
msgstr ""
msgid "默认 User-Agent"
msgstr ""
msgid "不使用代理"
msgstr ""
@@ -247,6 +244,9 @@ msgstr ""
msgid "图文作品下载开关"
msgstr ""
msgid "动图文件下载开关"
msgstr ""
msgid "配置文件 settings.json 缺少必要的参数,请删除该文件,然后重新运行程序,自动生成默认配置文件!"
msgstr ""

View File

@@ -16,8 +16,7 @@ async def example():
# 实例对象
work_path = "D:\\" # 作品数据/文件保存根路径,默认值:项目根路径
folder_name = "Download" # 作品文件储存文件夹名称自动创建默认值Download
user_agent = "" # 请求头 User-Agent可选参数
cookie = "" # 小红书网页版 Cookie无需登录必需参数
cookie = "" # 小红书网页版 Cookie无需登录必需参数登录状态对数据采集有影响
proxy = None # 网络代理
timeout = 5 # 请求数据超时限制单位默认值10
chunk = 1024 * 1024 * 10 # 下载文件时,每次从服务器获取的数据块大小,单位:字节
@@ -29,7 +28,6 @@ async def example():
pass # 使用默认参数
async with XHS(work_path=work_path,
folder_name=folder_name,
user_agent=user_agent,
cookie=cookie,
proxy=proxy,
timeout=timeout,

View File

@@ -123,7 +123,6 @@ class CLI:
("--work_path", "-wp", "str", _("作品数据 / 文件保存根路径")),
("--folder_name", "-fn", "str", _("作品文件储存文件夹名称")),
("--name_format", "-nf", "str", _("作品文件名称格式")),
("--user_agent", "-ua", "str", _("User-Agent")),
("--cookie", "-ck", "str", _("小红书网页版 Cookie无需登录")),
("--proxy", "-p", "str", _("网络代理")),
("--timeout", "-t", "int", _("请求数据超时限制,单位:秒")),
@@ -131,6 +130,7 @@ class CLI:
("--max_retry", "-mr", "int", _("请求数据失败时,重试的最大次数")),
("--record_data", "-rd", "bool", _("是否记录作品数据至文件")),
("--image_format", "-if", "choice", _("图文作品文件下载格式支持PNG、WEBP")),
("--live_download", "-ld", "bool", _("图文动图文件下载开关")),
("--folder_mode", "-fm", "bool", _("是否将每个作品的文件储存至单独的文件夹")),
("--language", "-l", "choice", _("设置程序语言目前支持zh_CN、en_GB")),
("--settings", "-s", "str", _("读取指定配置文件")),
@@ -163,7 +163,6 @@ class CLI:
)
@option("--folder_name", "-fn", )
@option("--name_format", "-nf", )
@option("--user_agent", "-ua", )
@option("--cookie", "-ck", )
@option("--proxy", "-p", )
@option("--timeout", "-t", type=int, )
@@ -171,6 +170,7 @@ class CLI:
@option("--max_retry", "-mr", type=int, )
@option("--record_data", "-rd", type=bool, )
@option("--image_format", "-if", type=Choice(["png", "PNG", "webp", "WEBP"]), )
@option("--live_download", "-ld", type=bool, )
@option("--folder_mode", "-fm", type=bool, )
@option("--language", "-l",
type=Choice(["zh_CN", "en_GB"]), )

View File

@@ -39,9 +39,6 @@ class Setting(Screen):
Label(self.message("作品文件名称格式"), classes="params", ),
Input(self.data["name_format"], placeholder=self.message("发布时间 作者昵称 作品标题"), valid_empty=True,
id="name_format", ),
Label(self.message("User-Agent"), classes="params", ),
Input(self.data["user_agent"], placeholder=self.message("默认 User-Agent"), valid_empty=True,
id="user_agent", ),
Label(self.message("小红书网页版 Cookie"), classes="params", ),
Input(placeholder=self.__check_cookie(), valid_empty=True, id="cookie", ),
Label(self.message("网络代理"), classes="params", ),
@@ -58,11 +55,16 @@ class Setting(Screen):
Checkbox(self.message("视频作品下载开关"), id="video_download", value=self.data["video_download"], ),
Checkbox(self.message("图文作品下载开关"), id="image_download", value=self.data["image_download"], ),
classes="horizontal-layout"),
Label(),
Container(
Checkbox(self.message("动图文件下载开关"), id="live_download", value=self.data["live_download"], ),
classes="horizontal-layout"),
Container(
Label(self.message("图片下载格式"), classes="params", ),
Label(self.message("程序语言"), classes="params", ),
classes="horizontal-layout",
),
Label(),
Container(
Select.from_values(
("PNG", "WEBP"),
@@ -95,7 +97,6 @@ class Setting(Screen):
"work_path": self.query_one("#work_path").value,
"folder_name": self.query_one("#folder_name").value,
"name_format": self.query_one("#name_format").value,
"user_agent": self.query_one("#user_agent").value,
"cookie": self.query_one("#cookie").value or self.data["cookie"],
"proxy": self.query_one("#proxy").value or None,
"timeout": int(self.query_one("#timeout").value),
@@ -107,6 +108,7 @@ class Setting(Screen):
"language": self.query_one("#language").value,
"image_download": self.query_one("#image_download").value,
"video_download": self.query_one("#video_download").value,
"live_download": self.query_one("#live_download").value,
# "server": False,
})

View File

@@ -40,26 +40,45 @@ class Update(ModalScreen):
async def check_update(self) -> None:
try:
url = await self.xhs.html.request_url(RELEASES, False, None, timeout=ClientTimeout(connect=5))
latest_major, latest_minor = map(
int, url.split("/")[-1].split(".", 1))
if latest_major > VERSION_MAJOR or latest_minor > VERSION_MINOR:
tip = Text(f"{self.message("检测到新版本:{0}.{1}").format(
VERSION_MAJOR, VERSION_MINOR)}\n{RELEASES}", style=WARNING)
elif latest_minor == VERSION_MINOR and VERSION_BETA:
tip = Text(
f"{self.message("当前版本为开发版, 可更新至正式版")}\n{RELEASES}",
style=WARNING)
elif VERSION_BETA:
tip = Text(
self.message("当前已是最新开发版"),
style=WARNING)
else:
tip = Text(
self.message("当前已是最新正式版"),
style=INFO)
version = url.split("/")[-1]
match self.compare_versions(f"{VERSION_MAJOR}.{VERSION_MINOR}", version, VERSION_BETA):
case 4:
tip = Text(f"{self.message("检测到新版本:{0}.{1}").format(
VERSION_MAJOR, VERSION_MINOR)}\n{RELEASES}", style=WARNING)
case 3:
tip = Text(
f"{self.message("当前版本为开发版, 可更新至正式版")}\n{RELEASES}",
style=WARNING)
case 2:
tip = Text(
self.message("当前已是最新开发版"),
style=WARNING)
case 1:
tip = Text(
self.message("当前已是最新正式版"),
style=INFO)
case _:
raise ValueError
except ValueError:
tip = Text(self.message("检测新版本失败"), style=ERROR)
self.dismiss(tip)
def on_mount(self) -> None:
self.check_update()
@staticmethod
def compare_versions(
current_version: str,
target_version: str,
is_development: bool) -> int:
current_major, current_minor = map(int, current_version.split('.'))
target_major, target_minor = map(int, target_version.split('.'))
if target_major > current_major:
return 4
if target_major == current_major:
if target_minor > current_minor:
return 4
if target_minor == current_minor:
return 3 if is_development else 1
return 2

View File

@@ -63,6 +63,7 @@ class XHS:
image_format="PNG",
image_download=True,
video_download=True,
live_download=False,
folder_mode=False,
language="zh_CN",
# server=False,
@@ -77,7 +78,7 @@ class XHS:
work_path,
folder_name,
name_format,
user_agent,
# user_agent,
chunk,
self.read_browser_cookie(read_cookie) or cookie,
proxy,
@@ -87,6 +88,7 @@ class XHS:
image_format,
image_download,
video_download,
live_download,
folder_mode,
# server,
self.message,
@@ -106,7 +108,7 @@ class XHS:
self.site = None
def __extract_image(self, container: dict, data: Namespace):
container["下载地址"] = self.image.get_image_link(
container["下载地址"], container["动图地址"] = self.image.get_image_link(
data, self.manager.image_format)
def __extract_video(self, container: dict, data: Namespace):
@@ -119,7 +121,8 @@ class XHS:
logging(
log, self.message("作品 {0} 存在下载记录,跳过下载").format(i))
else:
path, result = await self.download.run(u, index, name, container["作品类型"], log, bar)
path, result = await self.download.run(u, container["动图地址"], index, name, container["作品类型"],
log, bar)
await self.__add_record(i, result)
elif not u:
logging(log, self.message("提取作品文件下载地址失败"), ERROR)
@@ -128,6 +131,7 @@ class XHS:
async def save_data(self, data: dict, ):
data["采集时间"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
data["下载地址"] = " ".join(data["下载地址"])
data["动图地址"] = " ".join(i or "NaN" for i in data["动图地址"])
await self.data_recorder.add(**data)
async def __add_record(self, id_: str, result: tuple) -> None:

View File

@@ -32,18 +32,21 @@ class Download:
self.message = manager.message
self.folder_mode = manager.folder_mode
self.video_format = "mp4"
self.live_format = "mp4"
self.image_format = manager.image_format
self.image_download = manager.image_download
self.video_download = manager.video_download
self.live_download = manager.live_download
async def run(self, urls: list, index: list | tuple | None, name: str, type_: str, log, bar) -> tuple[Path, tuple]:
async def run(self, urls: list, lives: list, index: list | tuple | None, name: str, type_: str, log, bar) -> tuple[
Path, tuple]:
path = self.__generate_path(name)
match type_:
case "视频":
tasks = self.__ready_download_video(urls, path, name, log)
case "图文":
tasks = self.__ready_download_image(
urls, index, path, name, log)
urls, lives, index, path, name, log)
case _:
raise ValueError
tasks = [
@@ -73,14 +76,14 @@ class Download:
if not self.video_download:
logging(log, self.message("视频作品下载功能已关闭,跳过下载"))
return []
if any(path.glob(f"{name}.*")):
logging(log, self.message("{0} 文件已存在,跳过下载").format(name))
if self.__check_exists(path, f"{name}.{self.video_format}", log):
return []
return [(urls[0], name, self.video_format)]
def __ready_download_image(
self,
urls: list[str],
lives: list[str],
index: list | tuple | None,
path: Path,
name: str,
@@ -89,21 +92,30 @@ class Download:
if not self.image_download:
logging(log, self.message("图文作品下载功能已关闭,跳过下载"))
return tasks
for i, j in enumerate(urls, start=1):
for i, j in enumerate(zip(urls, lives), start=1):
if index and i not in index:
continue
file = f"{name}_{i}"
if any(path.glob(f"{file}.*")):
logging(
log, self.message(
"{0} 文件已存在,跳过下载").format(name))
if not self.__check_exists(
path, f"{file}.{self.image_format}", log):
tasks.append([j[0], file, self.image_format])
if not self.live_download or not j[1] or self.__check_exists(
path, f"{file}.{self.live_format}", log):
continue
tasks.append([j, file, self.image_format])
tasks.append([j[1], file, self.live_format])
return tasks
def __check_exists(self, path: Path, name: str, log, ) -> bool:
    """Report whether an entry called ``name`` already exists inside ``path``.

    When the entry exists, a "file already exists, skipping download"
    message is written through ``logging`` and ``True`` is returned;
    otherwise nothing is logged and ``False`` is returned.

    Args:
        path: Directory expected to contain the downloaded file.
        name: Full file name including its extension.
        log: Log target forwarded unchanged to ``logging``.
    """
    # Check the literal file name instead of globbing: `name` is passed as
    # a pattern to glob, so a name containing "[", "]", "*" or "?" would
    # not match the existing file and it would be downloaded again.
    if not path.joinpath(name).exists():
        return False
    logging(
        log, self.message(
            "{0} 文件已存在,跳过下载").format(name))
    return True
@re_download
async def __download(self, url: str, path: Path, name: str, format_: str, log, bar):
temp = self.temp.joinpath(name)
temp = self.temp.joinpath(f"{name}.{format_}")
try:
async with self.session.get(url, proxy=self.proxy) as response:
if response.status != 200:
@@ -124,7 +136,7 @@ class Download:
# self.__update_progress(bar, len(chunk))
self.manager.move(temp, real)
# self.__create_progress(bar, None)
logging(log, self.message("文件 {0} 下载成功").format(name))
logging(log, self.message("文件 {0} 下载成功").format(real.name))
return True
except ClientError as error:
self.manager.delete(temp)

View File

@@ -6,8 +6,9 @@ __all__ = ['Image']
class Image:
@classmethod
def get_image_link(cls, data: Namespace, format_: str) -> list:
def get_image_link(cls, data: Namespace, format_: str) -> [list, list]:
images = data.safe_extract("imageList", [])
live_link = cls.__get_live_link(images)
token_list = [
cls.__extract_image_token(
Namespace.object_extract(
@@ -15,10 +16,10 @@ class Image:
match format_:
case "png":
return [Html.format_url(cls.__generate_png_link(i))
for i in token_list]
for i in token_list], live_link
case "webp":
return [Html.format_url(cls.__generate_webp_link(i))
for i in token_list]
for i in token_list], live_link
case _:
raise ValueError
@@ -33,3 +34,12 @@ class Image:
@staticmethod
def __extract_image_token(url: str) -> str:
return "/".join(url.split("/")[5:]).split("!")[0]
@staticmethod
def __get_live_link(items: list) -> list:
    """Build the list of live-photo stream links, one per image entry.

    For each entry of ``items`` the value at ``stream.h264[0].masterUrl``
    is looked up with ``Namespace.object_extract`` and passed through
    ``Html.format_url``. The result has the same length and order as
    ``items``.
    """
    return [
        Html.format_url(
            Namespace.object_extract(entry, "stream.h264[0].masterUrl"))
        for entry in items
    ]

View File

@@ -19,7 +19,6 @@ from .static import (
WARNING,
INFO,
USERSCRIPT,
USERAGENT,
HEADERS,
PROJECT,
)
@@ -49,7 +48,6 @@ __all__ = [
"WARNING",
"INFO",
"USERSCRIPT",
"USERAGENT",
"HEADERS",
"retry",
"logging",

View File

@@ -9,7 +9,6 @@ from aiohttp import ClientSession
from aiohttp import ClientTimeout
from .static import HEADERS
from .static import USERAGENT
__all__ = ["Manager"]
@@ -39,7 +38,6 @@ class Manager:
path: str,
folder: str,
name_format: str,
user_agent: str,
chunk: int,
cookie: str,
proxy: str,
@@ -49,6 +47,7 @@ class Manager:
image_format: str,
image_download: bool,
video_download: bool,
live_download: bool,
folder_mode: bool,
# server: bool,
transition: Callable[[str], str],
@@ -57,8 +56,7 @@ class Manager:
self.temp = root.joinpath("./temp")
self.path = self.__check_path(path)
self.folder = self.__check_folder(folder)
self.blank_headers = HEADERS | {
"User-Agent": user_agent or USERAGENT, }
self.blank_headers = HEADERS
self.headers = self.blank_headers | {"Cookie": cookie}
self.retry = retry
self.chunk = chunk
@@ -78,6 +76,7 @@ class Manager:
self.message = transition
self.image_download = self.check_bool(image_download, True)
self.video_download = self.check_bool(video_download, True)
self.live_download = self.check_bool(live_download, True)
# self.server = self.check_bool(server, False)
def __check_path(self, path: str) -> Path:

View File

@@ -67,10 +67,10 @@ class DataRecorder(IDRecorder):
("点赞数量", "TEXT"),
("作者昵称", "TEXT"),
("作者ID", "TEXT"),
# ("IP归属地", "TEXT"),
("作者链接", "TEXT"),
("作品链接", "TEXT"),
("下载地址", "TEXT"),
("动图地址", "TEXT"),
)
def __init__(self, manager: Manager):

View File

@@ -13,7 +13,6 @@ class Settings:
"work_path": "",
"folder_name": "Download",
"name_format": "发布时间 作者昵称 作品标题",
"user_agent": "",
"cookie": "",
"proxy": None,
"timeout": 10,
@@ -23,6 +22,7 @@ class Settings:
"image_format": "PNG",
"image_download": True,
"video_download": True,
"live_download": False,
"folder_mode": False,
"language": "zh_CN",
# "server": False,

View File

@@ -16,7 +16,6 @@ __all__ = [
"WARNING",
"INFO",
"USERSCRIPT",
"USERAGENT",
"HEADERS",
"PROJECT",
]
@@ -35,24 +34,22 @@ RELEASES = "https://github.com/JoeanAmier/XHS-Downloader/releases/latest"
USERSCRIPT = "https://raw.githubusercontent.com/JoeanAmier/XHS-Downloader/master/static/XHS-Downloader.js"
HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
"application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Dnt": "1",
"Sec-Ch-Ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Microsoft Edge\";v=\"120\"",
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": "\"Windows\"",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-SG,zh-CN;q=0.9,zh;q=0.8',
'dnt': '1',
'priority': 'u=0, i',
'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 '
'Safari/537.36',
}
USERAGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 "
"Safari/537.36 Edg/121.0.0.0")
MASTER = "b #fff200"
PROMPT = "b turquoise2"