更新项目代码

This commit is contained in:
JoeamAmier
2023-12-05 22:55:13 +08:00
parent 395d81c522
commit 413168f122
7 changed files with 92 additions and 84 deletions

View File

@@ -11,10 +11,11 @@
<h1>📑 功能清单</h1> <h1>📑 功能清单</h1>
<ul> <ul>
<li>✅ 采集小红书图文/视频作品信息</li> <li>✅ 采集小红书图文/视频作品信息</li>
<li>✅ 提取小红书图文/视频作品文件下载地址</li> <li>✅ 提取小红书图文/视频作品文件下载地址</li>
<li>✅ 下载小红书无水印图文/视频作品文件</li> <li>✅ 下载小红书无水印图文/视频作品文件</li>
<li>✅ 自动跳过已下载的作品文件</li> <li>✅ 自动跳过已下载的作品文件</li>
<li>✅ 作品文件完整性处理机制</li> <li>✅ 作品文件完整性处理机制</li>
<li>☑️ 采集作品信息储存至文件</li>
</ul> </ul>
<h1>📸 程序截图</h1> <h1>📸 程序截图</h1>
<br> <br>
@@ -25,7 +26,7 @@
<li><code>https://www.xiaohongshu.com/discovery/item/作品ID</code></li> <li><code>https://www.xiaohongshu.com/discovery/item/作品ID</code></li>
<li><code>https://xhslink.com/分享码</code></li> <li><code>https://xhslink.com/分享码</code></li>
<br/> <br/>
<p><b>可以单次输入多个作品链接,链接之间使用空格分隔。</b></p> <p><b>支持单次输入多个作品链接,链接之间使用空格分隔。</b></p>
</ul> </ul>
<h1>🪟 关于终端</h1> <h1>🪟 关于终端</h1>
<p>⭐ 推荐使用 <a href="https://learn.microsoft.com/zh-cn/windows/terminal/install">Windows 终端</a> (Windows 11 自带默认终端)运行程序以便获得最佳显示效果!</p> <p>⭐ 推荐使用 <a href="https://learn.microsoft.com/zh-cn/windows/terminal/install">Windows 终端</a> (Windows 11 自带默认终端)运行程序以便获得最佳显示效果!</p>
@@ -41,7 +42,7 @@
<li>运行 <code>main.py</code> 即可使用</li> <li>运行 <code>main.py</code> 即可使用</li>
</ol> </ol>
<h2>💻 二次开发</h2> <h2>💻 二次开发</h2>
<p>如果要获取小红书图文/视频作品信息,可以根据 <code>main.py</code> 的注释提示进行代码调用。</p> <p>如果要获取小红书图文/视频作品信息,可以根据 <code>main.py</code> 的注释提示进行代码调用。</p>
<pre> <pre>
# 测试链接 # 测试链接
error_demo = "https://github.com/JoeanAmier/XHS_Downloader" error_demo = "https://github.com/JoeanAmier/XHS_Downloader"
@@ -49,24 +50,26 @@ image_demo = "https://www.xiaohongshu.com/explore/63b275a30000000019020185"
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc" video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}" multiple_demo = f"{image_demo} {video_demo}"
# 实例对象 # 实例对象
path = "D:\\" # 作品下载储存根路径,默认值:当前路径 path = "" # 作品下载储存根路径,默认值:当前路径
folder = "Download" # 作品下载文件夹名称(自动创建),默认值:Download folder = "Download" # 作品下载文件夹名称(自动创建),默认值:Download
proxies = None # 网络代理 user_agent = "" # 请求头 User-Agent
proxy = None # 网络代理
timeout = 5 # 网络请求超时限制,默认值:10 timeout = 5 # 网络请求超时限制,默认值:10
chunk = 1024 * 1024 # 下载文件时,每次从服务器获取的数据块大小,单位字节 chunk = 1024 * 1024 # 下载文件时,每次从服务器获取的数据块大小,单位字节
# with XHS() as xhs: # async with XHS() as xhs:
# pass # 使用默认参数 # pass # 使用默认参数
with XHS(path=path, async with XHS(path=path,
folder=folder, folder=folder,
proxies=proxies, user_agent=user_agent,
timeout=timeout, proxy=proxy,
chunk=chunk) as xhs: # 使用自定义参数 timeout=timeout,
chunk=chunk) as xhs: # 使用自定义参数
download = True # 是否下载作品文件,默认值:False download = True # 是否下载作品文件,默认值:False
# 返回作品详细信息,包括下载地址 # 返回作品详细信息,包括下载地址
print(xhs.extract(error_demo)) # 获取数据失败时返回空字典 print(await xhs.extract(error_demo, download=download)) # 获取数据失败时返回空字典
print(xhs.extract(image_demo, download=download)) print(await xhs.extract(image_demo, download=download))
print(xhs.extract(video_demo, download=download)) print(await xhs.extract(video_demo, download=download))
print(xhs.extract(multiple_demo, download=download)) print(await xhs.extract(multiple_demo, download=download)) # 支持传入多个作品链接
</pre> </pre>
<h1>⚙️ 配置文件</h1> <h1>⚙️ 配置文件</h1>
<p>项目根目录下的 <code>settings.json</code> 文件,首次运行自动生成,可以自定义部分运行参数。</p> <p>项目根目录下的 <code>settings.json</code> 文件,首次运行自动生成,可以自定义部分运行参数。</p>
@@ -83,7 +86,7 @@ with XHS(path=path,
<tr> <tr>
<td align="center">path</td> <td align="center">path</td>
<td align="center">str</td> <td align="center">str</td>
<td align="center">作品文件存根路径</td> <td align="center">作品数据 / 文件存根路径</td>
<td align="center">项目根路径</td> <td align="center">项目根路径</td>
</tr> </tr>
<tr> <tr>
@@ -93,6 +96,12 @@ with XHS(path=path,
<td align="center">Download</td> <td align="center">Download</td>
</tr> </tr>
<tr> <tr>
<td align="center">user_agent</td>
<td align="center">str</td>
<td align="center">请求头 User-Agent</td>
<td align="center">内置 UA</td>
</tr>
<tr>
<td align="center">proxy</td> <td align="center">proxy</td>
<td align="center">str</td> <td align="center">str</td>
<td align="center">设置代理</td> <td align="center">设置代理</td>

18
main.py
View File

@@ -11,24 +11,26 @@ async def example():
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc" video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}" multiple_demo = f"{image_demo} {video_demo}"
# 实例对象 # 实例对象
path = "D:\\" # 作品下载储存根路径,默认值:当前路径 path = "" # 作品下载储存根路径,默认值:当前路径
folder = "Download" # 作品下载文件夹名称(自动创建),默认值:Download folder = "Download" # 作品下载文件夹名称(自动创建),默认值:Download
proxies = None # 网络代理 user_agent = "" # 请求头 User-Agent
proxy = None # 网络代理
timeout = 5 # 网络请求超时限制,默认值:10 timeout = 5 # 网络请求超时限制,默认值:10
chunk = 1024 * 1024 # 下载文件时,每次从服务器获取的数据块大小,单位字节 chunk = 1024 * 1024 # 下载文件时,每次从服务器获取的数据块大小,单位字节
async with XHS() as xhs: # async with XHS() as xhs:
pass # 使用默认参数 # pass # 使用默认参数
async with XHS(path=path, async with XHS(path=path,
folder=folder, folder=folder,
proxy=proxies, user_agent=user_agent,
proxy=proxy,
timeout=timeout, timeout=timeout,
chunk=chunk) as xhs: # 使用自定义参数 chunk=chunk) as xhs: # 使用自定义参数
download = False # 是否下载作品文件,默认值:False download = True # 是否下载作品文件,默认值:False
# 返回作品详细信息,包括下载地址 # 返回作品详细信息,包括下载地址
print(await xhs.extract(error_demo)) # 获取数据失败时返回空字典 print(await xhs.extract(error_demo, download=download)) # 获取数据失败时返回空字典
print(await xhs.extract(image_demo, download=download)) print(await xhs.extract(image_demo, download=download))
print(await xhs.extract(video_demo, download=download)) print(await xhs.extract(video_demo, download=download))
print(await xhs.extract(multiple_demo, download=download)) print(await xhs.extract(multiple_demo, download=download)) # 支持传入多个作品链接
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -1,11 +1,8 @@
from pathlib import Path from pathlib import Path
from aiohttp import ClientConnectionError
from aiohttp import ClientProxyConnectionError
from aiohttp import ClientSSLError
from aiohttp import ClientSession from aiohttp import ClientSession
from aiohttp import ClientTimeout
# from aiohttp import ClientTimeout from aiohttp import ServerTimeoutError
__all__ = ['Download'] __all__ = ['Download']
@@ -26,8 +23,9 @@ class Download:
self.root = self.__init_root(root, path, folder) self.root = self.__init_root(root, path, folder)
self.proxy = proxy self.proxy = proxy
self.chunk = chunk self.chunk = chunk
# self.timeout = ClientTimeout(total=timeout) self.session = ClientSession(
self.session = ClientSession(headers=manager.headers) headers=manager.headers,
timeout=ClientTimeout(connect=timeout))
def __init_root(self, root: Path, path: str, folder: str) -> Path: def __init_root(self, root: Path, path: str, folder: str) -> Path:
if path and (r := Path(path)).is_dir(): if path and (r := Path(path)).is_dir():
@@ -42,8 +40,10 @@ class Download:
if type_ == 0: if type_ == 0:
await self.__download(urls[0], f"{name}.mp4", log, bar) await self.__download(urls[0], f"{name}.mp4", log, bar)
elif type_ == 1: elif type_ == 1:
for index, url in enumerate(urls): for index, url in enumerate(urls, start=1):
await self.__download(url, f"{name}_{index + 1}.png", log, bar) await self.__download(url, f"{name}_{index}.png", log, bar)
else:
raise ValueError
async def __download(self, url: str, name: str, log, bar): async def __download(self, url: str, name: str, log, bar):
temp = self.temp.joinpath(name) temp = self.temp.joinpath(name)
@@ -52,32 +52,26 @@ class Download:
return return
try: try:
async with self.session.get(url, proxy=self.proxy) as response: async with self.session.get(url, proxy=self.proxy) as response:
# self.__create_progress(bar, int(response.headers.get('content-length', 0))) self.__create_progress(
bar, int(
response.headers.get(
'content-length', 0)) or None)
with temp.open("wb") as f: with temp.open("wb") as f:
async for chunk in response.content.iter_chunked(self.chunk): async for chunk in response.content.iter_chunked(self.chunk):
f.write(chunk) f.write(chunk)
# self.__update_progress(bar, len(chunk)) self.__update_progress(bar, len(chunk))
# self.__remove_progress(bar)
self.manager.move(temp, file) self.manager.move(temp, file)
except ( self.__create_progress(bar, None)
ClientProxyConnectionError, except ServerTimeoutError:
ClientSSLError,
ClientConnectionError,
TimeoutError,
):
self.manager.delete(temp) self.manager.delete(temp)
# self.__remove_progress(bar) self.__create_progress(bar, None)
# @staticmethod @staticmethod
# def __create_progress(bar, total: int | None): def __create_progress(bar, total: int | None):
# if bar: if bar:
# bar.update(total=total) bar.update(total=total)
#
# @staticmethod @staticmethod
# def __update_progress(bar, advance: int): def __update_progress(bar, advance: int):
# if bar: if bar:
# bar.advance(advance) bar.advance(advance)
#
# @staticmethod
# def __remove_progress(bar):
# pass

View File

@@ -1,9 +1,6 @@
from aiohttp import ClientConnectionError
from aiohttp import ClientProxyConnectionError
from aiohttp import ClientSSLError
from aiohttp import ClientSession from aiohttp import ClientSession
from aiohttp import ClientTimeout
# from aiohttp import ClientTimeout from aiohttp import ServerTimeoutError
__all__ = ['Html'] __all__ = ['Html']
@@ -18,7 +15,9 @@ class Html:
self.proxy = proxy self.proxy = proxy
self.session = ClientSession( self.session = ClientSession(
headers=headers | { headers=headers | {
"Referer": "https://www.xiaohongshu.com/", }) "Referer": "https://www.xiaohongshu.com/", },
timeout=ClientTimeout(connect=timeout),
)
async def request_url( async def request_url(
self, self,
@@ -30,11 +29,7 @@ class Html:
proxy=self.proxy, proxy=self.proxy,
) as response: ) as response:
return await response.text() if text else response.url return await response.text() if text else response.url
except ( except ServerTimeoutError:
ClientProxyConnectionError,
ClientSSLError,
ClientConnectionError,
):
return "" return ""
@staticmethod @staticmethod

View File

@@ -6,12 +6,11 @@ __all__ = ["Manager"]
class Manager: class Manager:
headers = { def __init__(self, root: Path, ua: str):
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/119.0.0.0 Safari/537.36", }
def __init__(self, root: Path):
self.temp = root.joinpath("./temp") self.temp = root.joinpath("./temp")
self.headers = {
"User-Agent": ua or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0", }
@staticmethod @staticmethod
def is_exists(path: Path) -> bool: def is_exists(path: Path) -> bool:

View File

@@ -9,7 +9,8 @@ class Settings:
default = { default = {
"path": "", "path": "",
"folder": "Download", "folder": "Download",
"proxies": None, "user_agent": "",
"proxy": "",
"timeout": 10, "timeout": 10,
"chunk": 1024 * 1024, "chunk": 1024 * 1024,
} }

View File

@@ -30,20 +30,27 @@ __all__ = ['XHS', 'XHSDownloader']
class XHS: class XHS:
ROOT = Path(__file__).resolve().parent.parent ROOT = Path(__file__).resolve().parent.parent
link = compile(r"https://www\.xiaohongshu\.com/explore/[a-z0-9]+") LINK = compile(r"https://www\.xiaohongshu\.com/explore/[a-z0-9]+")
share = compile(r"https://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+") SHARE = compile(r"https://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+")
short = compile(r"https://xhslink\.com/[A-Za-z0-9]+") SHORT = compile(r"https://xhslink\.com/[A-Za-z0-9]+")
__INSTANCE = None
def __new__(cls, *args, **kwargs):
if not cls.__INSTANCE:
cls.__INSTANCE = super().__new__(cls)
return cls.__INSTANCE
def __init__( def __init__(
self, self,
path="", path="",
folder="Download", folder="Download",
proxy=None, user_agent: str = None,
proxy: str = None,
timeout=10, timeout=10,
chunk=1024 * 1024, chunk=1024 * 1024,
**kwargs, **kwargs,
): ):
self.manager = Manager(self.ROOT) self.manager = Manager(self.ROOT, user_agent)
self.html = Html(self.manager.headers, proxy, timeout) self.html = Html(self.manager.headers, proxy, timeout)
self.image = Image() self.image = Image()
self.video = Video() self.video = Video()
@@ -81,12 +88,12 @@ class XHS:
async def __deal_links(self, url: str) -> list: async def __deal_links(self, url: str) -> list:
urls = [] urls = []
for i in url.split(): for i in url.split():
if u := self.short.search(i): if u := self.SHORT.search(i):
i = await self.html.request_url( i = await self.html.request_url(
u.group(), False) u.group(), False)
if u := self.share.search(i): if u := self.SHARE.search(i):
urls.append(u.group()) urls.append(u.group())
elif u := self.link.search(i): elif u := self.LINK.search(i):
urls.append(u.group()) urls.append(u.group())
return urls return urls
@@ -118,18 +125,19 @@ class XHS:
await self.html.session.close() await self.html.session.close()
await self.download.session.close() await self.download.session.close()
def rich_log(self, log, text, style="b bright_green"): @staticmethod
def rich_log(log, text, style="b bright_green"):
if log: if log:
log.write(Text(text, style=style)) log.write(Text(text, style=style))
else: else:
self.console.print(text, style=style) print(text)
class XHSDownloader(App): class XHSDownloader(App):
VERSION = 1.6 VERSION = 1.6
BETA = True BETA = True
ROOT = Path(__file__).resolve().parent.parent ROOT = Path(__file__).resolve().parent.parent
APP = XHS(**Settings(ROOT).run()) # APP = XHS(**Settings(ROOT).run())
CSS_PATH = ROOT.joinpath( CSS_PATH = ROOT.joinpath(
"static/XHS-Downloader.tcss") "static/XHS-Downloader.tcss")
BINDINGS = [ BINDINGS = [