mirror of
https://github.com/JoeanAmier/XHS-Downloader.git
synced 2025-12-25 20:36:47 +08:00
294 lines
8.4 KiB
Python
from pathlib import Path
|
||
from re import compile, sub
|
||
from shutil import move, rmtree
|
||
from os import utime
|
||
from httpx import (
|
||
AsyncClient,
|
||
AsyncHTTPTransport,
|
||
HTTPStatusError,
|
||
RequestError,
|
||
TimeoutException,
|
||
get,
|
||
)
|
||
|
||
from source.expansion import remove_empty_directories
|
||
|
||
from ..translation import _
|
||
from .static import HEADERS, USERAGENT, WARNING
|
||
from .tools import logging
|
||
from typing import TYPE_CHECKING
|
||
|
||
if TYPE_CHECKING:
|
||
from ..expansion import Cleaner
|
||
|
||
__all__ = ["Manager"]
|
||
|
||
|
||
class Manager:
    """Central runtime state for the downloader.

    Owns the output paths, validated configuration flags, and the two shared
    ``httpx.AsyncClient`` instances (one for API requests carrying the cookie,
    one for bare file downloads). Construction has side effects: it probes the
    filesystem, migrates a legacy download folder, tests the proxy with a live
    HTTP request, and creates the output/temp directories.
    """

    # Matches every character NOT allowed in a filename component
    # (CJK ideographs, ASCII alphanumerics, and selected punctuation survive);
    # used by filter_name() to replace disallowed characters with "_".
    NAME = compile(r"[^\u4e00-\u9fffa-zA-Z0-9-_!?,。;:“”()《》]")
    # The only tokens accepted in a user-supplied file-name format string;
    # values are Chinese metadata field names (e.g. like counts, work ID/title).
    NAME_KEYS = (
        "收藏数量",
        "评论数量",
        "分享数量",
        "点赞数量",
        "作品标签",
        "作品ID",
        "作品标题",
        "作品描述",
        "作品类型",
        "发布时间",
        "最后更新时间",
        "作者昵称",
        "作者ID",
    )
    # httpx mounts mapping that disables proxying for both schemes.
    NO_PROXY = {
        "http://": None,
        "https://": None,
    }
    # Separator joining name-format fields in generated filenames.
    SEPARATE = "_"
    # Regex patterns locating the webId / web_session cookie entries
    # (with their leading "; " delimiter) for removal by delete_cookie().
    WEB_ID = r"(?:^|; )webId=[^;]+"
    WEB_SESSION = r"(?:^|; )web_session=[^;]+"

    def __init__(
        self,
        root: Path,
        path: str,
        folder: str,
        name_format: str,
        chunk: int,
        user_agent: str,
        cookie: str,
        proxy: str | dict,
        timeout: int,
        retry: int,
        record_data: bool,
        image_format: str,
        image_download: bool,
        video_download: bool,
        live_download: bool,
        download_record: bool,
        folder_mode: bool,
        author_archive: bool,
        write_mtime: bool,
        script_server: bool,
        cleaner: "Cleaner",
        print_object,
    ):
        """Validate configuration and build the shared HTTP clients.

        Note the ordering below is significant: path/folder resolution must
        precede compatible() (legacy-folder migration), and the proxy must be
        tested before the clients that mount it are constructed.

        :param root: project root directory; fallback for invalid ``path``
        :param path: user-configured output directory (may be empty/invalid)
        :param folder: download sub-folder name (sanitized via ``cleaner``)
        :param name_format: space-separated NAME_KEYS tokens for filenames
        :param chunk: download stream chunk size in bytes
        :param user_agent: browser UA override; falls back to USERAGENT
        :param cookie: cookie string sent on API requests only
        :param proxy: proxy URL (tested with a live request before use)
        :param timeout: request timeout passed to both httpx clients
        :param retry: download retry count
        :param cleaner: filename sanitizer from source.expansion
        :param print_object: console-like object used by logging()
        """
        self.print = print_object
        self.root = root
        self.cleaner = cleaner
        self.temp = root.joinpath("Temp")
        self.path = self.__check_path(path)
        self.folder = self.__check_folder(folder)
        # Migrate a legacy download folder location before creating new dirs.
        self.compatible()
        # Headers without the cookie — used for plain file downloads.
        self.blank_headers = HEADERS | {
            "user-agent": user_agent or USERAGENT,
        }
        # Cookie-bearing headers — used for API requests.
        self.headers = self.blank_headers | {
            "cookie": cookie,
        }
        self.retry = retry
        self.chunk = chunk
        self.name_format = self.__check_name_format(name_format)
        self.record_data = self.check_bool(record_data, False)
        self.image_format = self.__check_image_format(image_format)
        self.folder_mode = self.check_bool(folder_mode, False)
        self.download_record = self.check_bool(download_record, True)
        # __check_proxy() stores a (message[, WARNING]) tuple here for display.
        self.proxy_tip = None
        self.proxy = self.__check_proxy(proxy)
        self.print_proxy_tip()
        self.timeout = timeout
        # NOTE(review): verify=False disables TLS certificate verification
        # for both clients — presumably deliberate for proxy/debug use.
        self.request_client = AsyncClient(
            headers=self.headers
            | {
                "referer": "https://www.xiaohongshu.com/",
            },
            timeout=timeout,
            verify=False,
            follow_redirects=True,
            mounts={
                "http://": AsyncHTTPTransport(proxy=self.proxy),
                "https://": AsyncHTTPTransport(proxy=self.proxy),
            },
        )
        self.download_client = AsyncClient(
            headers=self.blank_headers,
            timeout=timeout,
            verify=False,
            follow_redirects=True,
            mounts={
                "http://": AsyncHTTPTransport(proxy=self.proxy),
                "https://": AsyncHTTPTransport(proxy=self.proxy),
            },
        )
        self.image_download = self.check_bool(image_download, True)
        self.video_download = self.check_bool(video_download, True)
        self.live_download = self.check_bool(live_download, True)
        self.author_archive = self.check_bool(author_archive, False)
        self.write_mtime = self.check_bool(write_mtime, False)
        self.script_server = self.check_bool(script_server, False)
        self.create_folder()

    def __check_path(self, path: str) -> Path:
        """Resolve the configured output directory, falling back to root.

        Accepts an existing directory as-is; otherwise tries to create it
        (only when its parent exists); otherwise returns ``self.root``.
        """
        if not path:
            return self.root
        if (r := Path(path)).is_dir():
            return r
        # __check_root_again returns the created Path or False; the walrus
        # rebinds r so a successful creation is returned, else the root.
        return r if (r := self.__check_root_again(r)) else self.root

    def __check_folder(self, folder: str) -> Path:
        """Sanitize the folder name and anchor it under the output path."""
        folder = self.cleaner.filter_name(folder, default="Download")
        return self.path.joinpath(folder)

    @staticmethod
    def __check_root_again(root: Path) -> bool | Path:
        """Create ``root`` if its parent exists; return it, else False."""
        if root.parent.is_dir():
            root.mkdir(exist_ok=True)
            return root
        return False

    @staticmethod
    def __check_image_format(image_format: str) -> str:
        """Normalize the image format, defaulting to "png" when unsupported."""
        if (i := image_format.lower()) in {
            "auto",
            "png",
            "webp",
            "jpeg",
            "heic",
            "avif",
        }:
            return i
        return "png"

    @staticmethod
    def is_exists(path: Path) -> bool:
        """Return True when ``path`` exists on disk."""
        return path.exists()

    @staticmethod
    def delete(path: Path):
        """Remove the file at ``path`` if it exists (no-op otherwise)."""
        if path.exists():
            path.unlink()

    @staticmethod
    def archive(root: Path, name: str, folder_mode: bool) -> Path:
        """Return the per-work sub-folder when folder_mode is on, else root."""
        return root.joinpath(name) if folder_mode else root

    @classmethod
    def move(
        cls,
        temp: Path,
        path: Path,
        mtime: int | None = None,
        rewrite: bool = False,
    ):
        """Move a finished temp file into place, optionally restoring mtime.

        :param temp: source file in the temp directory
        :param path: final destination path
        :param mtime: Unix timestamp to stamp on the file (applied only
            when ``rewrite`` is true and ``mtime`` is truthy)
        :param rewrite: whether to overwrite the file's modification time
        """
        move(temp.resolve(), path.resolve())
        if rewrite and mtime:
            cls.update_mtime(path.resolve(), mtime)

    @staticmethod
    def update_mtime(file: Path, mtime: int):
        """Set both atime and mtime of ``file`` to ``mtime`` (Unix seconds)."""
        utime(file, (mtime, mtime))

    def __clean(self):
        # Remove the whole temp tree; currently unused (call in close()
        # is commented out).
        rmtree(self.temp.resolve())

    def filter_name(self, name: str) -> str:
        """Sanitize ``name`` for use as a filename.

        Disallowed characters become "_", runs of "_" collapse to one,
        and leading/trailing "_" are stripped.
        """
        name = self.NAME.sub("_", name)
        return sub(r"_+", "_", name).strip("_")

    @staticmethod
    def check_bool(value: bool, default: bool) -> bool:
        """Return ``value`` if it is a real bool, otherwise ``default``."""
        return value if isinstance(value, bool) else default

    async def close(self):
        """Close both HTTP clients and prune empty output directories."""
        await self.request_client.aclose()
        await self.download_client.aclose()
        # self.__clean()
        remove_empty_directories(self.root)
        remove_empty_directories(self.folder)

    def __check_name_format(self, format_: str) -> str:
        """Validate the filename format string.

        If any whitespace-separated token is not in NAME_KEYS, fall back to
        the default format ("发布时间 作者昵称 作品标题"); otherwise return
        ``format_`` unchanged.
        """
        keys = format_.split()
        # next() yields the default on the first invalid key, else format_.
        return next(
            ("发布时间 作者昵称 作品标题" for key in keys if key not in self.NAME_KEYS),
            format_,
        )

    def __check_proxy(
        self,
        proxy: str,
        url: str = "https://www.xiaohongshu.com/explore",
    ) -> str | None:
        """Test ``proxy`` with a blocking GET; return it only if usable.

        Stores a user-facing status tuple in ``self.proxy_tip`` (message,
        optionally with WARNING styling) for print_proxy_tip() to display.
        Returns None when no proxy is configured or the test fails.
        """
        if proxy:
            try:
                response = get(
                    url,
                    proxy=proxy,
                    timeout=10,
                    headers={
                        "User-Agent": USERAGENT,
                    },
                )
                response.raise_for_status()
                self.proxy_tip = (_("代理 {0} 测试成功").format(proxy),)
                return proxy
            # TimeoutException must be caught before RequestError
            # (it is a subclass) to give the specific timeout message.
            except TimeoutException:
                self.proxy_tip = (
                    _("代理 {0} 测试超时").format(proxy),
                    WARNING,
                )
            except (
                RequestError,
                HTTPStatusError,
            ) as e:
                self.proxy_tip = (
                    _("代理 {0} 测试失败:{1}").format(
                        proxy,
                        e,
                    ),
                    WARNING,
                )
        return None

    def print_proxy_tip(
        self,
    ) -> None:
        """Log the stored proxy test result, if any."""
        if self.proxy_tip:
            logging(self.print, *self.proxy_tip)

    @classmethod
    def clean_cookie(cls, cookie_string: str) -> str:
        """Strip the webId and web_session entries from a cookie string."""
        return cls.delete_cookie(
            cookie_string,
            (
                cls.WEB_ID,
                cls.WEB_SESSION,
            ),
        )

    @classmethod
    def delete_cookie(cls, cookie_string: str, patterns: list | tuple) -> str:
        """Remove every cookie entry matching ``patterns`` and tidy delimiters.

        :param cookie_string: raw "k=v; k=v" cookie header value
        :param patterns: regex patterns whose matches are deleted
        :return: cleaned cookie string without dangling separators
        """
        for pattern in patterns:
            # Replace each matched entry with an empty string.
            cookie_string = sub(pattern, "", cookie_string)
        # Tidy up leftover semicolons and whitespace:
        cookie_string = sub(r";\s*$", "", cookie_string)  # drop trailing "; "
        cookie_string = sub(r";\s*;", ";", cookie_string)  # collapse ";;" runs
        return cookie_string.strip("; ")

    def create_folder(
        self,
    ):
        """Ensure the download folder and temp directory exist."""
        self.folder.mkdir(exist_ok=True)
        self.temp.mkdir(exist_ok=True)

    def compatible(
        self,
    ):
        """Migrate a legacy download folder from the old layout.

        When output path is the project root and a same-named folder exists
        one level above (the pre-refactor location) while the new location
        does not, move it into place.
        """
        if (
            self.path == self.root
            and (old := self.path.parent.joinpath(self.folder.name)).exists()
            and not self.folder.exists()
        ):
            move(old, self.folder)