新增保存作品数据功能

This commit is contained in:
JoeamAmier
2023-12-11 22:31:46 +08:00
parent 5f1e4a6657
commit af8e3cabb1
3 changed files with 65 additions and 34 deletions

View File

@@ -1,5 +1,3 @@
from pathlib import Path
from aiohttp import ClientSession
from aiohttp import ClientTimeout
from aiohttp import ServerDisconnectedError
@@ -16,31 +14,18 @@ class Download:
def __init__(
self,
manager,
root: Path,
path: str,
folder: str,
proxy: str = "",
chunk=1024 * 1024,
timeout=10, timeout=10):
retry_=5, ):
self.manager = manager
self.folder = manager.folder
self.temp = manager.temp
self.root = self.__init_root(root, path, folder)
self.proxy = proxy
self.chunk = chunk
self.session = ClientSession(
headers={"User-Agent": manager.headers["User-Agent"]},
timeout=ClientTimeout(connect=timeout))
self.retry = retry_ self.retry = manager.retry
def __init_root(self, root: Path, path: str, folder: str) -> Path:
if path and (r := Path(path)).is_dir():
root = r.joinpath(folder or "Download")
else:
root = root.joinpath(folder or "Download")
root.mkdir(exist_ok=True)
self.temp.mkdir(exist_ok=True)
return root
async def run(self, urls: list, name: str, type_: int, log, bar):
if type_ == 0:
@@ -54,7 +39,7 @@ class Download:
@retry
async def __download(self, url: str, name: str, log, bar):
temp = self.temp.joinpath(name)
file = self.root.joinpath(name) file = self.folder.joinpath(name)
if self.manager.is_exists(file):
self.rich_log(log, f"{name} 已存在,跳过下载")
return True

View File

@@ -1,4 +1,8 @@
from datetime import datetime
from json import dumps
from pathlib import Path
from re import compile
from re import sub
from shutil import move
from shutil import rmtree
@@ -6,11 +10,22 @@ __all__ = ["Manager"]
class Manager:
def __init__(self, root: Path, ua: str, cookie: str, retry: int): NAME = compile(r"[^\u4e00-\u9fa5a-zA-Z0-9_]")
def __init__(
self,
root: Path,
path: str,
folder: str,
user_agent: str,
cookie: str,
retry: int):
self.root = root
self.temp = root.joinpath("./temp")
self.folder = self.__init_root(root, path, folder)
self.headers = {
"User-Agent": ua or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "User-Agent": user_agent or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gec"
"Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0", "ko) Chrome/120.0.0.0 Safari/537.36",
"Cookie": cookie or "abRequestId=54c534bb-a2c6-558f-8e03-5b4c5c45635c; xsecappid=xhs-pc-web; a1=18c286a400"
"4jy56qvzejvp631col0hd3032h4zjez50000106381; webId=779c977da3a15b5623015be94bdcc9e9; g"
"id=yYSJYK0qDW8KyYSJYK048quV84Vv2KAhudVhJduUKqySlx2818xfq4888y8KqYy8y2y2f8Jy; web_sess"
@@ -20,6 +35,15 @@ class Manager:
"e9ef2000000003801ff3d%22%2C%22uc%22:29}; cache_feeds=[]"}
self.retry = retry
def __init_root(self, root: Path, path: str, folder: str) -> Path:
if path and (r := Path(path)).is_dir():
root = r.joinpath(folder or "Download")
else:
root = root.joinpath(folder or "Download")
root.mkdir(exist_ok=True)
self.temp.mkdir(exist_ok=True)
return root
@staticmethod
def is_exists(path: Path) -> bool:
return path.exists()
@@ -34,3 +58,20 @@ class Manager:
def clean(self):
rmtree(self.temp.resolve())
def filter_name(self, name: str) -> str:
name = self.NAME.sub("_", name)
return sub(r"_+", "_", name)
def save_data(self, name: str, data: dict):
with self.folder.joinpath(f"{name}.txt").open("a", encoding="utf-8") as f:
time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
content = f"{
time.center(
50,
"=")}\n{
dumps(
data,
indent=4,
ensure_ascii=False)}\n"
f.write(content)

View File

@@ -56,7 +56,13 @@ class XHS:
max_retry=5,
**kwargs,
):
self.manager = Manager(ROOT, user_agent, cookie, max_retry) self.manager = Manager(
ROOT,
path,
folder_name,
user_agent,
cookie,
max_retry)
self.html = Html(
self.manager.headers,
proxy,
@@ -67,28 +73,28 @@ class XHS:
self.explore = Explore()
self.download = Download(
self.manager,
ROOT,
path,
folder_name,
proxy,
chunk,
timeout, timeout, )
self.manager.retry, )
self.rich_log = self.download.rich_log
async def __get_image(self, container: dict, html: str, download, log, bar):
urls = self.image.get_image_link(html)
# self.rich_log(log, urls) # 调试代码
name = self.__naming_rules(container)
if download:
await self.download.run(urls, self.__naming_rules(container), 1, log, bar) await self.download.run(urls, name, 1, log, bar)
container["下载地址"] = urls
self.manager.save_data(name, container)
async def __get_video(self, container: dict, html: str, download, log, bar):
url = self.video.get_video_link(html)
# self.rich_log(log, url) # 调试代码
name = self.__naming_rules(container)
if download:
await self.download.run(url, self.__naming_rules(container), 0, log, bar) await self.download.run(url, name, 0, log, bar)
container["下载地址"] = url
self.manager.save_data(name, container)
async def extract(self, url: str, download=False, log=None, bar=None) -> list[dict]:
# return # 调试代码
@@ -131,10 +137,9 @@ class XHS:
self.rich_log(log, f"完成处理:{url}")
return data
@staticmethod def __naming_rules(self, data: dict) -> str:
def __naming_rules(data: dict) -> str: """下载文件默认使用 作品标题 或 作品 ID 作为文件名称,可修改此方法自定义文件名称格式"""
"""下载文件默认使用作品 ID 作为文件名,可修改此方法自定义文件名格式""" return self.manager.filter_name(data["作品标题"]) or data["作品ID"]
return data["作品ID"]
async def __aenter__(self):
return self