diff --git a/locale/en_GB/LC_MESSAGES/xhs.mo b/locale/en_GB/LC_MESSAGES/xhs.mo index c8c46d6..830c8eb 100644 Binary files a/locale/en_GB/LC_MESSAGES/xhs.mo and b/locale/en_GB/LC_MESSAGES/xhs.mo differ diff --git a/locale/en_GB/LC_MESSAGES/xhs.po b/locale/en_GB/LC_MESSAGES/xhs.po index d2ff7ec..2849281 100644 --- a/locale/en_GB/LC_MESSAGES/xhs.po +++ b/locale/en_GB/LC_MESSAGES/xhs.po @@ -85,7 +85,7 @@ msgstr "Xiaohongshu Web Cookie" msgid "网络代理" msgstr "Network proxy" -msgid "记录作品数据" +msgid "记录作品详细数据" msgstr "Record works data" msgid "图片下载格式" @@ -291,3 +291,6 @@ msgstr "Agent {0} test failed: {1}" msgid "浏览器名称或序号输入错误!" msgstr "Browser name or serial number input error!" + +msgid "作品下载记录开关" +msgstr "Works download record switch" diff --git a/locale/zh_CN/LC_MESSAGES/xhs.po b/locale/zh_CN/LC_MESSAGES/xhs.po index b1fc6da..a5bf704 100644 --- a/locale/zh_CN/LC_MESSAGES/xhs.po +++ b/locale/zh_CN/LC_MESSAGES/xhs.po @@ -85,7 +85,7 @@ msgstr "" msgid "网络代理" msgstr "" -msgid "记录作品数据" +msgid "记录作品详细数据" msgstr "" msgid "图片下载格式" @@ -291,3 +291,6 @@ msgstr "" msgid "浏览器名称或序号输入错误!" msgstr "" + +msgid "作品下载记录开关" +msgstr "" diff --git a/source/TUI/setting.py b/source/TUI/setting.py index 1e1da44..c76a7c8 100644 --- a/source/TUI/setting.py +++ b/source/TUI/setting.py @@ -61,7 +61,7 @@ class Setting(Screen): Input(str(self.data["max_retry"]), placeholder="5", type="integer", id="max_retry", ), Label(), Container( - Checkbox(self.message("记录作品数据"), id="record_data", value=self.data["record_data"], ), + Checkbox(self.message("记录作品详细数据"), id="record_data", value=self.data["record_data"], ), Checkbox(self.message("作品文件夹归档模式"), id="folder_mode", value=self.data["folder_mode"], ), Checkbox(self.message("视频作品下载开关"), id="video_download", value=self.data["video_download"], ), Checkbox(self.message("图文作品下载开关"), id="image_download", value=self.data["image_download"], ), @@ -69,6 +69,7 @@ class Setting(Screen): Label(), Container( Checkbox(self.message("动图文件下载开关"), id="live_download", value=self.data["live_download"], ), + Checkbox(self.message("作品下载记录开关"), id="download_record", value=self.data["download_record"], ), classes="horizontal-layout"), Container( Label(self.message("图片下载格式"), classes="params", ), @@ -123,6 +124,7 @@ class Setting(Screen): "image_download": self.query_one("#image_download").value, "video_download": self.query_one("#video_download").value, "live_download": self.query_one("#live_download").value, + "download_record": self.query_one("#download_record").value, # "server": False, }) diff --git a/source/TUI/update.py b/source/TUI/update.py index 36cd5a8..51f86b8 100644 --- a/source/TUI/update.py +++ b/source/TUI/update.py @@ -10,9 +10,6 @@ from textual.widgets import LoadingIndicator from source.application import XHS from source.module import ( - VERSION_MAJOR, - VERSION_MINOR, - VERSION_BETA, ERROR, WARNING, INFO, @@ -40,10 +37,10 @@ class Update(ModalScreen): try: url = await self.xhs.html.request_url(RELEASES, False, None, timeout=5, ) version = url.split("/")[-1] - match self.compare_versions(f"{VERSION_MAJOR}.{VERSION_MINOR}", version, VERSION_BETA): + match self.compare_versions(f"{XHS.VERSION_MAJOR}.{XHS.VERSION_MINOR}", version, XHS.VERSION_BETA): case 4: tip = Text(f"{self.message("检测到新版本:{0}.{1}").format( - VERSION_MAJOR, VERSION_MINOR)}\n{RELEASES}", style=WARNING) + XHS.VERSION_MAJOR, XHS.VERSION_MINOR)}\n{RELEASES}", style=WARNING) case 3: tip = Text( f"{self.message("当前版本为开发版, 可更新至正式版")}\n{RELEASES}", diff --git a/source/application/app.py b/source/application/app.py index 37f15a5..6d263c0 100644 --- a/source/application/app.py +++ b/source/application/app.py @@ -32,6 +32,7 @@ from source.module import ( REPOSITORY, VERSION_MAJOR, VERSION_MINOR, + VERSION_BETA, ) from source.module import Translate from source.module import logging @@ -44,10 +45,25 @@ from .video import Video __all__ = ["XHS"] +def _data_cache(function): + async def inner(self, data: dict, ): + if self.manager.record_data: + download = data["下载地址"] + lives = data["动图地址"] + await function(self, data, ) + data["下载地址"] = download + data["动图地址"] = lives + + return inner + + class XHS: - LINK = compile(r"https?://www\.xiaohongshu\.com/explore/[a-z0-9]+") - SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+") - SHORT = compile(r"https?://xhslink\.com/[A-Za-z0-9]+") + VERSION_MAJOR = VERSION_MAJOR + VERSION_MINOR = VERSION_MINOR + VERSION_BETA = VERSION_BETA + LINK = compile(r"https?://www\.xiaohongshu\.com/explore/\S+") + SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/\S+") + SHORT = compile(r"https?://xhslink\.com/\S+") __INSTANCE = None def __new__(cls, *args, **kwargs): @@ -74,6 +90,7 @@ class XHS: video_download=True, live_download=False, folder_mode=False, + download_record=True, language="zh_CN", # server=False, transition: Callable[[str], str] = None, @@ -101,6 +118,7 @@ class XHS: image_download, video_download, live_download, + download_record, folder_mode, # server, self.message, @@ -127,7 +145,7 @@ class XHS: def __extract_video(self, container: dict, data: Namespace): container["下载地址"] = self.video.get_video_link(data) - container["动图地址"] = "" + container["动图地址"] = [None, ] async def __download_files(self, container: dict, download: bool, index, log, bar): name = self.__naming_rules(container) @@ -136,13 +154,21 @@ class XHS: logging( log, self.message("作品 {0} 存在下载记录,跳过下载").format(i)) else: - path, result = await self.download.run(u, container["动图地址"], index, name, container["作品类型"], - log, bar) + path, result = await self.download.run( + u, + container["动图地址"], + index, + name, + container["作品类型"], + log, + bar, + ) await self.__add_record(i, result) elif not u: logging(log, self.message("提取作品文件下载地址失败"), ERROR) await self.save_data(container) + @_data_cache async def save_data(self, data: dict, ): data["采集时间"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") data["下载地址"] = " ".join(data["下载地址"]) @@ -196,20 +222,20 @@ class XHS: return urls async def __deal_extract(self, url: str, download: bool, index: list | tuple | None, log, bar, data: bool, ): - if not data and await self.skip_download(i := self.__extract_link_id(url)): + if await self.skip_download(i := self.__extract_link_id(url)) and not data: msg = self.message("作品 {0} 存在下载记录,跳过处理").format(i) logging(log, msg) return {"message": msg} - logging(log, self.message("开始处理作品:{0}").format(url)) + logging(log, self.message("开始处理作品:{0}").format(i)) html = await self.html.request_url(url, log=log) namespace = self.__generate_data_object(html) if not namespace: - logging(log, self.message("{0} 获取数据失败").format(url), ERROR) + logging(log, self.message("{0} 获取数据失败").format(i), ERROR) return {} data = self.explore.run(namespace) # logging(log, data) # 调试代码 if not data: - logging(log, self.message("{0} 提取数据失败").format(url), ERROR) + logging(log, self.message("{0} 提取数据失败").format(i), ERROR) return {} match data["作品类型"]: case "视频": @@ -219,7 +245,7 @@ class XHS: case _: data["下载地址"] = [] await self.__download_files(data, download, index, log, bar) - logging(log, self.message("作品处理完成:{0}").format(url)) + logging(log, self.message("作品处理完成:{0}").format(i)) return data @staticmethod @@ -352,7 +378,7 @@ class XHS: # await self.runner.cleanup() # logging(log, self.message("Web API 服务器已关闭!")) - async def run_server(self, host="127.0.0.1", port=8000, log_level="info", ): + async def run_server(self, host="0.0.0.0", port=8000, log_level="info", ): self.server = FastAPI( title="XHS-Downloader", version=f"{VERSION_MAJOR}.{VERSION_MINOR}") diff --git a/source/application/download.py b/source/application/download.py index 62ad406..af5e418 100644 --- a/source/application/download.py +++ b/source/application/download.py @@ -37,8 +37,16 @@ class Download: self.video_download = manager.video_download self.live_download = manager.live_download - async def run(self, urls: list, lives: list, index: list | tuple | None, name: str, type_: str, log, bar) -> tuple[ - Path, tuple]: + async def run( + self, + urls: list, + lives: list, + index: list | tuple | None, + name: str, + type_: str, + log, + bar, + ) -> tuple[Path, tuple]: path = self.__generate_path(name) match type_: case "视频": diff --git a/source/application/explore.py b/source/application/explore.py index 53bff27..6f3044c 100644 --- a/source/application/explore.py +++ b/source/application/explore.py @@ -40,8 +40,7 @@ class Explore: def __extract_info(self, container: dict, data: Namespace): container["作品ID"] = data.safe_extract("noteId") - container["作品链接"] = f"https://www.xiaohongshu.com/explore/{ - container["作品ID"]}" + container["作品链接"] = f"https://www.xiaohongshu.com/explore/{container["作品ID"]}" container["作品标题"] = data.safe_extract("title") container["作品描述"] = data.safe_extract("desc") container["作品类型"] = self.explore_type.get( diff --git a/source/application/image.py b/source/application/image.py index 2a493e3..a2f0737 100644 --- a/source/application/image.py +++ b/source/application/image.py @@ -37,9 +37,12 @@ class Image: @staticmethod def __get_live_link(items: list) -> list: - links = [] - for item in items: - links.append( - Html.format_url(Namespace.object_extract( - item, "stream.h264[0].masterUrl"))) - return links + return [ + ( + Html.format_url( + Namespace.object_extract(item, "stream.h264[0].masterUrl") + ) + or None + ) + for item in items + ] diff --git a/source/application/request.py b/source/application/request.py index 1f8001f..4807e7c 100644 --- a/source/application/request.py +++ b/source/application/request.py @@ -13,6 +13,8 @@ class Html: self.retry = manager.retry self.message = manager.message self.client = manager.request_client + self.headers = manager.headers + self.blank_headers = manager.blank_headers @retry async def request_url( @@ -25,6 +27,7 @@ class Html: try: response = await self.client.get( url, + headers=self.select_headers(url, ), **kwargs, ) response.raise_for_status() @@ -38,3 +41,6 @@ class Html: @staticmethod def format_url(url: str) -> str: return bytes(url, "utf-8").decode("unicode_escape") + + def select_headers(self, url: str) -> dict: + return self.blank_headers if "discovery/item" in url else self.headers diff --git a/source/expansion/__init__.py b/source/expansion/__init__.py index 2f5741e..fe019e8 100644 --- a/source/expansion/__init__.py +++ b/source/expansion/__init__.py @@ -1,5 +1,6 @@ from .browser import BrowserCookie from .converter import Converter from .namespace import Namespace - -__all__ = ["Converter", "Namespace", "BrowserCookie", ] +from .truncate import beautify_string +from .truncate import trim_string +from .truncate import truncate_string diff --git a/source/expansion/truncate.py b/source/expansion/truncate.py new file mode 100644 index 0000000..96efced --- /dev/null +++ b/source/expansion/truncate.py @@ -0,0 +1,35 @@ +from unicodedata import name + + +def is_chinese_char(char: str) -> bool: + return 'CJK' in name(char, "") + + +def truncate_string(s: str, length: int = 64) -> str: + count = 0 + result = "" + for char in s: + count += 2 if is_chinese_char(char) else 1 + if count > length: + break + result += char + return result + + +def trim_string(s: str, length: int = 64) -> str: + length = length // 2 - 2 + return f"{s[:length]}...{s[-length:]}" if len(s) > length else s + + +def beautify_string(s: str, length: int = 64) -> str: + count = 0 + for char in s: + count += 2 if is_chinese_char(char) else 1 + if count > length: + break + else: + return s + length //= 2 + start = truncate_string(s, length) + end = truncate_string(s[::-1], length)[::-1] + return f"{start}...{end}" diff --git a/source/module/manager.py b/source/module/manager.py index f7745a7..e72a35b 100644 --- a/source/module/manager.py +++ b/source/module/manager.py @@ -6,6 +6,7 @@ from shutil import rmtree from typing import Callable from httpx import AsyncClient +from httpx import HTTPStatusError from httpx import RequestError from httpx import TimeoutException from httpx import get @@ -64,6 +65,7 @@ class Manager: image_download: bool, video_download: bool, live_download: bool, + download_record: bool, folder_mode: bool, # server: bool, transition: Callable[[str], str], @@ -80,7 +82,7 @@ class Manager: "Sec-Ch-Ua-Platform": sec_ch_ua_platform or SEC_CH_UA_PLATFORM, } self.headers = self.blank_headers | { - "Cookie": self.clean_cookie(cookie), + "Cookie": cookie, } self.retry = retry self.chunk = chunk @@ -88,6 +90,7 @@ class Manager: self.record_data = self.check_bool(record_data, False) self.image_format = self.__check_image_format(image_format) self.folder_mode = self.check_bool(folder_mode, False) + self.download_record = self.check_bool(download_record, True) self.proxy_tip = None self.proxy = self.__check_proxy(proxy) self.print_proxy_tip(_print, ) @@ -200,13 +203,16 @@ class Manager: response = get( url, **kwarg, ) - if response.status_code < 400: - self.proxy_tip = (self.message("代理 {0} 测试成功").format(proxy),) - return kwarg + response.raise_for_status() + self.proxy_tip = (self.message("代理 {0} 测试成功").format(proxy),) + return kwarg except TimeoutException: self.proxy_tip = ( self.message("代理 {0} 测试超时").format(proxy), WARNING,) - except RequestError as e: + except ( + RequestError, + HTTPStatusError, + ) as e: self.proxy_tip = ( self.message("代理 {0} 测试失败:{1}").format( proxy, e), WARNING,) @@ -218,17 +224,19 @@ class Manager: @classmethod def clean_cookie(cls, cookie_string: str) -> str: - for i in ( + return cls.delete_cookie( + cookie_string, + ( cls.WEB_ID, cls.WEB_SESSION, - ): - cookie_string = cls.delete_cookie(cookie_string, i) - return cookie_string + ), + ) @classmethod - def delete_cookie(cls, cookie_string: str, pattern) -> str: - # 使用空字符串替换匹配到的部分 - cookie_string = sub(pattern, "", cookie_string) + def delete_cookie(cls, cookie_string: str, patterns: list | tuple) -> str: + for pattern in patterns: + # 使用空字符串替换匹配到的部分 + cookie_string = sub(pattern, "", cookie_string) # 去除多余的分号和空格 cookie_string = sub(r';\s*$', "", cookie_string) # 删除末尾的分号和空格 cookie_string = sub(r';\s*;', ";", cookie_string) # 删除中间多余分号后的空格 diff --git a/source/module/recorder.py b/source/module/recorder.py index f65c209..e1234df 100644 --- a/source/module/recorder.py +++ b/source/module/recorder.py @@ -1,3 +1,5 @@ +from asyncio import CancelledError +from contextlib import suppress from re import compile from aiosqlite import connect @@ -12,6 +14,7 @@ class IDRecorder: def __init__(self, manager: Manager): self.file = manager.root.joinpath("ExploreID.db") + self.switch = manager.download_record self.database = None self.cursor = None @@ -22,12 +25,14 @@ class IDRecorder: await self.database.commit() async def select(self, id_: str): - await self.cursor.execute("SELECT ID FROM explore_id WHERE ID=?", (id_,)) - return await self.cursor.fetchone() + if self.switch: + await self.cursor.execute("SELECT ID FROM explore_id WHERE ID=?", (id_,)) + return await self.cursor.fetchone() async def add(self, id_: str) -> None: - await self.database.execute("REPLACE INTO explore_id VALUES (?);", (id_,)) - await self.database.commit() + if self.switch: + await self.database.execute("REPLACE INTO explore_id VALUES (?);", (id_,)) + await self.database.commit() async def __delete(self, id_: str) -> None: if id_: @@ -35,19 +40,22 @@ class IDRecorder: await self.database.commit() async def delete(self, ids: str): - ids = [i.group(1) for i in self.URL.finditer(ids)] - [await self.__delete(i) for i in ids] + if self.switch: + ids = [i.group(1) for i in self.URL.finditer(ids)] + [await self.__delete(i) for i in ids] async def all(self): - await self.cursor.execute("SELECT ID FROM explore_id") - return [i[0] for i in await self.cursor.fetchmany()] + if self.switch: + await self.cursor.execute("SELECT ID FROM explore_id") + return [i[0] for i in await self.cursor.fetchmany()] async def __aenter__(self): await self._connect_database() return self async def __aexit__(self, exc_type, exc_value, traceback): - await self.cursor.close() + with suppress(CancelledError): + await self.cursor.close() await self.database.close() @@ -76,6 +84,7 @@ class DataRecorder(IDRecorder): def __init__(self, manager: Manager): super().__init__(manager) self.file = manager.folder.joinpath("ExploreData.db") + self.switch = manager.record_data async def _connect_database(self): self.database = await connect(self.file) @@ -89,12 +98,13 @@ class DataRecorder(IDRecorder): pass async def add(self, **kwargs) -> None: - await self.database.execute(f"""REPLACE INTO explore_data ( + if self.switch: + await self.database.execute(f"""REPLACE INTO explore_data ( {", ".join(i[0] for i in self.DATA_TABLE)} ) VALUES ( {", ".join("?" for _ in kwargs)} );""", self.__generate_values(kwargs)) - await self.database.commit() + await self.database.commit() async def __delete(self, id_: str) -> None: pass diff --git a/source/module/settings.py b/source/module/settings.py index 0f20d3e..681115a 100644 --- a/source/module/settings.py +++ b/source/module/settings.py @@ -30,6 +30,7 @@ class Settings: "video_download": True, "live_download": False, "folder_mode": False, + "download_record": True, "language": "zh_CN", # "server": False, } diff --git a/static/XHS-Downloader.js b/static/XHS-Downloader.js index 7a73e84..e109db1 100644 --- a/static/XHS-Downloader.js +++ b/static/XHS-Downloader.js @@ -1,7 +1,7 @@ // ==UserScript== // @name XHS-Downloader // @namespace https://github.com/JoeanAmier/XHS-Downloader -// @version 1.5.2 +// @version 1.6.0 // @description 提取小红书作品/用户链接,下载小红书无水印图文/视频作品文件 // @author JoeanAmier // @match http*://xhslink.com/* @@ -304,43 +304,43 @@ const extractNotesInfo = order => { const notesRawValue = unsafeWindow.__INITIAL_STATE__.user.notes._rawValue[order]; - return new Set(notesRawValue.map(({id}) => id)); + return notesRawValue.map(item => [item.id, item.xsecToken]); }; const extractFeedInfo = () => { const notesRawValue = unsafeWindow.__INITIAL_STATE__.feed.feeds._rawValue; - return new Set(notesRawValue.map(({id}) => id)); + return notesRawValue.map(item => [item.id, item.xsecToken]); }; const extractSearchNotes = () => { const notesRawValue = unsafeWindow.__INITIAL_STATE__.search.feeds._rawValue; - return new Set(notesRawValue.map(({id}) => id)); + return notesRawValue.map(item => [item.id, item.xsecToken]); } const extractSearchUsers = () => { const notesRawValue = unsafeWindow.__INITIAL_STATE__.search.userLists._rawValue; - return new Set(notesRawValue.map(({id}) => id)); + return notesRawValue.map(item => item.id); } - const generateNoteUrls = ids => [...ids].map(id => `https://www.xiaohongshu.com/explore/${id}`).join(" "); + const generateNoteUrls = data => data.map(([id, token]) => `https://www.xiaohongshu.com/explore/${id}?xsec_token=${token}&xsec_source=pc_feed`).join(" "); - const generateUserUrls = ids => [...ids].map(id => `https://www.xiaohongshu.com/user/profile/${id}`).join(" "); + const generateUserUrls = data => data.map(id => `https://www.xiaohongshu.com/user/profile/${id}`).join(" "); const extractAllLinks = (callback, order) => { scrollScreen(() => { - let ids; + let data; if (order >= 0 && order <= 2) { - ids = extractNotesInfo(order); + data = extractNotesInfo(order); } else if (order === 3) { - ids = extractSearchNotes(); + data = extractSearchNotes(); } else if (order === 4) { - ids = extractSearchUsers(); + data = extractSearchUsers(); } else if (order === -1) { - ids = extractFeedInfo() + data = extractFeedInfo() } else { - ids = []; + data = []; } - let urlsString = order !== 4 ? generateNoteUrls(ids) : generateUserUrls(ids); + let urlsString = order !== 4 ? generateNoteUrls(data) : generateUserUrls(data); callback(urlsString); }, order === -1, [3, 4].includes(order)) };