mirror of
https://github.com/JoeanAmier/XHS-Downloader.git
synced 2025-12-26 04:48:05 +08:00
196 lines
6.5 KiB
Python
from asyncio import Event
|
|
from asyncio import Queue
|
|
from asyncio import QueueEmpty
|
|
from asyncio import gather
|
|
from asyncio import sleep
|
|
from contextlib import suppress
|
|
from re import compile
|
|
|
|
from pyperclip import paste
|
|
|
|
from source.expansion import Converter
|
|
from source.expansion import Namespace
|
|
from source.module import Manager
|
|
from source.module import (
|
|
ROOT,
|
|
ERROR,
|
|
WARNING,
|
|
)
|
|
from source.module import logging
|
|
from source.module import wait
|
|
from source.translator import (
|
|
LANGUAGE,
|
|
Chinese,
|
|
English,
|
|
)
|
|
from .download import Download
|
|
from .explore import Explore
|
|
from .image import Image
|
|
from .request import Html
|
|
from .video import Video
|
|
|
|
__all__ = ["XHS"]
|
|
|
|
|
|
class XHS:
    """Core orchestrator: extracts XiaoHongShu post data and downloads files."""

    # Canonical explore-page post URL.
    LINK = compile(r"https?://www\.xiaohongshu\.com/explore/[a-z0-9]+")
    # "Share" style post URL (discovery/item path).
    SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+")
    # Shortened share link; must be resolved to a full URL before matching.
    SHORT = compile(r"https?://xhslink\.com/[A-Za-z0-9]+")
    # Cached singleton instance managed by __new__.
    __INSTANCE = None

    def __new__(cls, *args, **kwargs):
        # Singleton: every XHS(...) call returns the first instance created.
        # NOTE(review): __init__ still runs on each call even though the
        # instance is shared — confirm re-initialisation is intended.
        if not cls.__INSTANCE:
            cls.__INSTANCE = super().__new__(cls)
        return cls.__INSTANCE
|
|
|
|
def __init__(
        self,
        work_path: str = "",
        folder_name: str = "Download",
        user_agent: str | None = None,
        cookie: str | None = None,
        proxy: str | None = None,
        timeout: int = 10,
        chunk: int = 1024 * 1024,
        max_retry: int = 5,
        record_data: bool = False,
        image_format: str = "PNG",
        folder_mode: bool = False,
        language: str = "zh-CN",
        language_object: Chinese | English | None = None,
):
    """Initialise the shared components used for extraction and download.

    Args:
        work_path: Working directory; empty string lets Manager pick a default.
        folder_name: Name of the download folder.
        user_agent: Optional User-Agent header override.
        cookie: Optional cookie string for requests.
        proxy: Optional proxy address.
        timeout: Request timeout in seconds.
        chunk: Download chunk size in bytes (default 1 MiB).
        max_retry: Maximum request retry count.
        record_data: Whether to persist extracted post data.
        image_format: Image file format for image posts (e.g. "PNG").
        folder_mode: Whether each post gets its own folder.
        language: Language code used to select prompt texts.
        language_object: Explicit translator instance; overrides `language`.
    """
    # Prefer an explicit translator; otherwise look the code up, falling
    # back to Chinese for unknown codes.
    self.prompt = language_object or LANGUAGE.get(language, Chinese)
    self.manager = Manager(
        ROOT,
        work_path,
        folder_name,
        user_agent,
        chunk,
        cookie,
        proxy,
        timeout,
        max_retry,
        record_data,
        image_format,
        folder_mode,
        self.prompt,
    )
    self.html = Html(self.manager)
    self.image = Image()
    self.video = Video()
    self.explore = Explore()
    self.convert = Converter()
    self.download = Download(self.manager)
    # Last clipboard text seen by the monitor; avoids reprocessing.
    self.clipboard_cache: str = ""
    # Links found by the monitor, consumed by __receive_link.
    self.queue = Queue()
    # Set to stop the monitor loops.
    self.event = Event()
|
|
|
def __extract_image(self, container: dict, data: Namespace):
    """Fill in the download links for an image post."""
    links = self.image.get_image_link(data, self.manager.image_format)
    container["下载地址"] = links
|
|
|
|
def __extract_video(self, container: dict, data: Namespace):
    """Fill in the download link for a video post."""
    link = self.video.get_video_link(data)
    container["下载地址"] = link
|
|
|
|
async def __download_files(self, container: dict, download: bool, log, bar):
    """Download the post's files (when requested) and persist its data."""
    name = self.__naming_rules(container)
    path = self.manager.folder
    links = container["下载地址"]
    if not links:
        # No links means the extraction failed or the post type is unsupported.
        logging(log, self.prompt.download_link_error, ERROR)
    elif download:
        path = await self.download.run(links, name, container["作品类型"], log, bar)
    self.manager.save_data(path, name, container)
|
|
|
|
async def extract(self, url: str, download=False, efficient=False, log=None, bar=None) -> list[dict]:
    """Extract data for every post link found in *url*.

    Args:
        url: Free-form text possibly containing one or more post links.
        download: Whether to also download the posts' files.
        efficient: Skip the politeness delay between posts when True.
        log: Optional log sink passed through to helpers.
        bar: Optional progress bar passed through to the downloader.

    Returns:
        One dict per extracted link (empty dict for failed links).
    """
    links = await self.__extract_links(url, log)
    if links:
        logging(log, self.prompt.pending_processing(len(links)))
    else:
        logging(log, self.prompt.extract_link_failure, WARNING)
    results = []
    for link in links:
        results.append(await self.__deal_extract(link, download, efficient, log, bar))
    return results
|
|
|
|
async def __extract_links(self, url: str, log) -> list:
    """Collect canonical post URLs from free-form text, resolving short links."""
    collected = []
    for token in url.split():
        short = self.SHORT.search(token)
        if short:
            # Short share links redirect; fetch to obtain the real URL.
            token = await self.html.request_url(short.group(), False, log)
        matched = self.SHARE.search(token) or self.LINK.search(token)
        if matched:
            collected.append(matched.group())
    return collected
|
|
|
|
async def __deal_extract(self, url: str, download: bool, efficient: bool, log, bar):
    """Fetch one post page, parse its data, and optionally download files.

    Returns the extracted data dict, or an empty dict on failure.
    """
    logging(log, self.prompt.start_processing(url))
    html = await self.html.request_url(url, log=log)
    namespace = self.__generate_data_object(html)
    if not namespace:
        logging(log, self.prompt.get_data_failure(url), ERROR)
        return {}
    # Politeness pause between posts (skipped in efficient mode).
    await self.__suspend(efficient)
    data = self.explore.run(namespace)
    if not data:
        logging(log, self.prompt.extract_data_failure(url), ERROR)
        return {}
    kind = data["作品类型"]
    if kind == "视频":
        self.__extract_video(data, namespace)
    elif kind == "图文":
        self.__extract_image(data, namespace)
    else:
        # Unknown post type: nothing downloadable.
        data["下载地址"] = []
    await self.__download_files(data, download, log, bar)
    logging(log, self.prompt.processing_completed(url))
    return data
|
|
|
|
def __generate_data_object(self, html: str) -> Namespace:
    """Wrap the data embedded in the page HTML in a Namespace."""
    return Namespace(self.convert.run(html))
|
|
|
|
def __naming_rules(self, data: dict) -> str:
    """Build the file name: publish-time_author_title (title capped at 64 chars)."""
    parts = [
        # ":" is not filesystem-safe; swap for ".".
        data["发布时间"].replace(":", "."),
        self.manager.filter_name(data["作者昵称"]) or data["作者ID"],
        (self.manager.filter_name(data["作品标题"]) or data["作品ID"])[:64],
    ]
    return "_".join(parts)
|
|
|
|
async def monitor(self, delay=1, download=False, efficient=False, log=None, bar=None) -> None:
    """Watch the clipboard for post links until stop_monitor() is called."""
    self.event.clear()
    producer = self.__push_link(delay)
    consumer = self.__receive_link(delay, download, efficient, log, bar)
    await gather(producer, consumer)
|
|
|
|
async def __push_link(self, delay: int):
    """Poll the clipboard every *delay* seconds and enqueue post links found.

    Pasting the text "close" (case-insensitive) stops monitoring.
    """
    while not self.event.is_set():
        if (text := paste()).lower() == "close":
            self.stop_monitor()
        elif text != self.clipboard_cache:
            # Remember the text so identical clipboard content is not reprocessed.
            self.clipboard_cache = text
            # Plain loop instead of a side-effect list comprehension, which
            # built and discarded a throwaway list of None values.
            for link in await self.__extract_links(text, None):
                await self.queue.put(link)
        await sleep(delay)
|
|
|
|
async def __receive_link(self, delay: int, *args, **kwargs):
    """Process queued links until monitoring stops and the queue drains."""
    while not self.event.is_set() or self.queue.qsize() > 0:
        try:
            link = self.queue.get_nowait()
        except QueueEmpty:
            # Nothing queued yet; just wait for the next poll.
            pass
        else:
            await self.__deal_extract(link, *args, **kwargs)
        await sleep(delay)
|
|
|
|
def stop_monitor(self):
    """Signal the monitor loops to stop; already-queued links still get processed."""
    self.event.set()
|
|
|
|
@staticmethod
async def __suspend(efficient: bool) -> None:
    """Pause between posts unless running in efficient mode."""
    if not efficient:
        await wait()
|
|
|
|
async def __aenter__(self):
    """Support `async with XHS(...) as xhs:` usage."""
    return self
|
|
|
|
async def __aexit__(self, exc_type, exc_value, traceback):
    """Release resources when leaving the async context manager."""
    await self.close()
|
|
|
|
async def close(self):
    """Close the underlying Manager (and whatever resources it holds)."""
    await self.manager.close()
|