perf(download.py): 优化已下载文件判断逻辑

1. 优化图片文件后缀名
2. 修改文件下载并发数
3. 优化项目代码格式
4. 更新项目使用说明
This commit is contained in:
JoeanAmier 2024-10-13 11:00:18 +08:00
parent 04557b074f
commit 58b2067c68
6 changed files with 53 additions and 19 deletions

View File

@ -15,6 +15,7 @@
<br>
<p>🔥 <b>小红书链接提取/作品采集工具</b>:提取账号发布、收藏、点赞、专辑作品链接;提取搜索结果作品链接、用户链接;采集小红书作品信息;提取小红书作品下载地址;下载小红书无水印作品文件!</p>
<p><strong>⚠️ 2024/9/24: 由于小红书规则更新,使用版本号低于 <code>1.7.1</code> 的用户脚本有封号风险,请及时更新用户脚本后再使用!</strong></p>
<p><strong>⚠️ 2024/10/13: 由于作品链接携带日期信息,使用先前日期获取的作品链接可能会被风控,建议下载作品文件时使用最新获取的作品链接!</strong></p>
<p>⭐ 本项目完全免费开源,无任何收费功能,请勿上当受骗!</p>
<h1>📑 项目功能</h1>
<ul><b>程序功能</b>

View File

@ -15,6 +15,7 @@
<br>
<p>🔥 <b>Xiaohongshu Link Extraction/Content Collection Tool</b>: Extract account-published, favorited, and liked content links; extract search result content links and user links; collect Xiaohongshu content information; extract Xiaohongshu content download addresses; download Xiaohongshu watermark-free content files!</p>
<p><strong>⚠️ 2024/9/24: Due to rule updates on Xiaohongshu, user scripts with a version number lower than <code>1.7.1</code> carry a risk of account suspension. Please update the user script promptly before using it again!</strong></p>
<p><strong>⚠️ 2024/10/13: Due to the date information carried in the links of Xiaohongshu works, using links obtained from previous dates may be subject to risk control. It is recommended to use the latest Xiaohongshu works links when downloading Xiaohongshu work files!</strong></p>
<p>⭐ This project is completely free and open-source, with no paid features. Please do not be deceived!</p>
<p>⭐ Due to the author's limited time, the English documentation is not always updated promptly and its content may be outdated. Parts of it are machine-translated and may contain errors, so please refer to the Chinese documentation when in doubt. If you would like to contribute to the translation, you are warmly welcome.</p>
<h1>📑 Project Features</h1>

View File

@ -6,16 +6,17 @@ from typing import TYPE_CHECKING, Any
from aiofiles import open
from httpx import HTTPError
from source.module import ERROR
from source.module import (
from ..module import ERROR
from ..module import (
FILE_SIGNATURES_LENGTH,
FILE_SIGNATURES,
)
# from source.module import WARNING
from source.module import Manager
from source.module import logging
from source.module import retry as re_download
from source.module import sleep_time
from ..module import MAX_WORKERS
# from ..module import WARNING
from ..module import Manager
from ..module import logging
from ..module import retry as re_download
from ..module import sleep_time
if TYPE_CHECKING:
from httpx import AsyncClient
@ -24,10 +25,10 @@ __all__ = ['Download']
class Download:
SEMAPHORE = Semaphore(4)
SEMAPHORE = Semaphore(MAX_WORKERS)
CONTENT_TYPE_MAP = {
"image/png": "png",
"image/jpeg": "jpg",
"image/jpeg": "jpeg",
"image/webp": "webp",
"application/octet-stream": "",
"video/mp4": "mp4",
@ -47,6 +48,13 @@ class Download:
self.video_format = "mp4"
self.live_format = "mp4"
self.image_format = manager.image_format
self.image_format_list = (
"jpeg",
"png",
"webp",
"avif",
"heic",
)
self.image_download = manager.image_download
self.video_download = manager.video_download
self.live_download = manager.live_download
@ -97,7 +105,7 @@ class Download:
if not self.video_download:
logging(log, self.message("视频作品下载功能已关闭,跳过下载"))
return []
if self.__check_exists(path, f"{name}.{self.video_format}", log):
if self.__check_exists_path(path, f"{name}.{self.video_format}", log):
return []
return [(urls[0], name, self.video_format)]
@ -117,16 +125,25 @@ class Download:
if index and i not in index:
continue
file = f"{name}_{i}"
if not self.__check_exists(
path, f"{file}.{self.image_format}", log):
if not any(
self.__check_exists_path(
path,
f"{file}.{s}",
log,
)
for s in self.image_format_list
):
tasks.append([j[0], file, self.image_format])
if not self.live_download or not j[1] or self.__check_exists(
path, f"{file}.{self.live_format}", log):
if not self.live_download or not j[1] or self.__check_exists_path(
path,
f"{file}.{self.live_format}",
log,
):
continue
tasks.append([j[1], file, self.live_format])
return tasks
def __check_exists(self, path: Path, name: str, log, ) -> bool:
def __check_exists_glob(self, path: Path, name: str, log, ) -> bool:
if any(path.glob(name)):
logging(
log, self.message(
@ -134,6 +151,14 @@ class Download:
return True
return False
def __check_exists_path(self, path: Path, name: str, log, ) -> bool:
    """Tell whether an entry called ``name`` already exists under ``path``.

    Args:
        path: Directory that is joined with ``name`` to form the target.
        name: File name (including suffix) to look for.
        log: Passed through unchanged as the first argument of ``logging``.

    Returns:
        ``True`` after emitting the "already exists, skip download" notice
        when the joined path exists; ``False`` (without logging) otherwise.
    """
    target = path.joinpath(name)
    # Guard clause: nothing on disk means the caller should download.
    if not target.exists():
        return False
    notice = self.message("{0} 文件已存在,跳过下载").format(name)
    logging(log, notice)
    return True
@re_download
async def __download(
self,

View File

@ -30,6 +30,7 @@ from .static import (
SEC_CH_UA_PLATFORM,
FILE_SIGNATURES,
FILE_SIGNATURES_LENGTH,
MAX_WORKERS,
)
from .tools import (
retry,

View File

@ -48,7 +48,7 @@ FILE_SIGNATURES: tuple[tuple[int, bytes, str,], ...] = (
# 分别为偏移量(字节)、十六进制签名、后缀
# 参考https://en.wikipedia.org/wiki/List_of_file_signatures
# 参考https://www.garykessler.net/library/file_sigs.html
(0, b"\xFF\xD8\xFF", "jpg"),
(0, b"\xFF\xD8\xFF", "jpeg"),
(0, b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", "png"),
(4, b"\x66\x74\x79\x70\x61\x76\x69\x66", "avif"),
(4, b"\x66\x74\x79\x70\x68\x65\x69\x63", "heic"),
@ -64,3 +64,5 @@ FILE_SIGNATURES: tuple[tuple[int, bytes, str,], ...] = (
(8, b"\x41\x56\x49\x20", "avi"),
)
# Largest end position (offset + signature length) over all entries of
# FILE_SIGNATURES; presumably the number of leading bytes that must be read
# so every signature can be compared — confirm against the reader.
FILE_SIGNATURES_LENGTH = max(
    start + len(magic) for start, magic, _ in FILE_SIGNATURES
)
# Integer worker limit.
# NOTE(review): appears to size the download semaphore — confirm at call sites.
MAX_WORKERS: int = 3

View File

@ -2,9 +2,11 @@
1. 优化从浏览器读取 Cookie 功能
2. 修复文件名称过长报错的问题
3. 新增文件名称长度限制
4. 优化文件后缀处理逻辑
5. 优化代理测试逻辑
3. 优化已下载文件判断逻辑
4. 更新文件下载并发数量
5. 新增文件名称长度限制
6. 优化文件后缀处理逻辑
7. 优化代理测试逻辑
<p><strong>旧版本升级后首次运行请删除配置文件 <code>settings.json</code>,删除后重新运行程序会自动生成新的默认配置文件!</strong></p>
@ -15,3 +17,5 @@
1. 重构作品链接提取功能
<p><strong>⚠️ 由于小红书规则更新,使用版本号低于 <code>1.7.1</code> 的用户脚本有封号风险,请及时更新用户脚本后再使用!</strong></p>
<p><strong>⚠️ 由于作品链接携带日期信息,使用先前日期获取的作品链接可能会被风控,建议下载作品文件时使用最新获取的作品链接!</strong></p>