From e2eee03feb2900b3ebc838f1b88b07aa27a9eda6 Mon Sep 17 00:00:00 2001 From: JoeanAmier Date: Fri, 9 Aug 2024 18:35:04 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=9D=9E=E6=B3=95?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=90=8D=E7=A7=B0=E6=8A=A5=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 替换文件名称包含的非法字符 2. 替换文件名称包含的 Emoji 表情 --- README.md | 1 + README_EN.md | 1 + requirements.txt | 1 + source/application/app.py | 16 ++++++- source/expansion/__init__.py | 1 + source/expansion/cleaner.py | 89 ++++++++++++++++++++++++++++++++++++ 6 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 source/expansion/cleaner.py diff --git a/README.md b/README.md index 9154d46..7624f25 100644 --- a/README.md +++ b/README.md @@ -426,3 +426,4 @@ async def example(): * https://aiosqlite.omnilib.dev/en/stable/ * https://click.palletsprojects.com/en/8.1.x/ * https://github.com/thewh1teagle/rookie +* https://github.com/carpedm20/emoji/ diff --git a/README_EN.md b/README_EN.md index 472648b..665c955 100644 --- a/README_EN.md +++ b/README_EN.md @@ -427,3 +427,4 @@ async def example(): * https://aiosqlite.omnilib.dev/en/stable/ * https://click.palletsprojects.com/en/8.1.x/ * https://github.com/thewh1teagle/rookie +* https://github.com/carpedm20/emoji/ diff --git a/requirements.txt b/requirements.txt index a8f12c0..7ece4b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ httpx>=0.27.0 fastapi>=0.111.0 uvicorn>=0.30.1 aiofiles>=24.1.0 +emoji>=2.12.1 diff --git a/source/application/app.py b/source/application/app.py index 43ee81d..47a425d 100644 --- a/source/application/app.py +++ b/source/application/app.py @@ -17,8 +17,10 @@ from uvicorn import Config from uvicorn import Server from source.expansion import BrowserCookie +from source.expansion import Cleaner from source.expansion import Converter from source.expansion import Namespace +from source.expansion import beautify_string from source.module import DataRecorder from source.module import ExtractData from source.module import ExtractParams @@ -65,6 +67,7 @@ class XHS: SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/\S+") SHORT = compile(r"https?://xhslink\.com/\S+") __INSTANCE = None + CLEANER = Cleaner() def __new__(cls, *args, **kwargs): if not cls.__INSTANCE: @@ -270,7 +273,13 @@ class XHS: values.append(self.__get_name_title(data)) case _: values.append(data[key]) - return self.manager.SEPARATE.join(values) + return self.CLEANER.filter_name( + self.manager.SEPARATE.join(values), + default=self.manager.SEPARATE.join(( + data["作者ID"], + data["作品ID"], + )), + ) @staticmethod def __get_name_time(data: dict) -> str: @@ -280,7 +289,10 @@ class XHS: return self.manager.filter_name(data["作者昵称"]) or data["作者ID"] def __get_name_title(self, data: dict) -> str: - return self.manager.filter_name(data["作品标题"])[:64] or data["作品ID"] + return beautify_string( + self.manager.filter_name(data["作品标题"]), + 64, + ) or data["作品ID"] async def monitor(self, delay=1, download=False, log=None, bar=None, data=True, ) -> None: logging( diff --git a/source/expansion/__init__.py b/source/expansion/__init__.py index 271fca5..5852359 100644 --- a/source/expansion/__init__.py +++ b/source/expansion/__init__.py @@ -6,3 +6,4 @@ from .truncate import trim_string from .truncate import truncate_string from .file_folder import file_switch from .file_folder import remove_empty_directories +from .cleaner import Cleaner diff --git a/source/expansion/cleaner.py b/source/expansion/cleaner.py new file mode 100644 index 0000000..f75da6f --- /dev/null +++ b/source/expansion/cleaner.py @@ -0,0 +1,89 @@ +from platform import system +from string import whitespace +from emoji import replace_emoji +from warnings import warn + + +class Cleaner: + def __init__(self): + """ + 替换字符串中包含的非法字符,默认根据系统类型生成对应的非法字符字典,也可以自行设置非法字符字典 + """ + self.rule = self.default_rule() # 默认非法字符字典 + + @staticmethod + def default_rule(): + """根据系统类型生成默认非法字符字典""" + if (s := system()) in ("Windows", "Darwin"): + rule = { + "/": "", + "\\": "", + "|": "", + "<": "", + ">": "", + "\"": "", + "?": "", + ":": "", + "*": "", + "\x00": "", + } # Windows 系统和 Mac 系统 + elif s == "Linux": + rule = { + "/": "", + "\x00": "", + } # Linux 系统 + else: + warn("不受支持的操作系统类型,可能无法正常去除非法字符!") + rule = {} + cache = {i: "" for i in whitespace[1:]} # 补充换行符等非法字符 + return rule | cache + + def set_rule(self, rule: dict[str, str], update=True): + """ + 设置非法字符字典 + + :param rule: 替换规则,字典格式,键为非法字符,值为替换后的内容 + :param update: 如果是 True,则与原有规则字典合并,否则替换原有规则字典 + """ + self.rule = {**self.rule, **rule} if update else rule + + def filter(self, text: str) -> str: + """ + 去除非法字符 + + :param text: 待处理的字符串 + :return: 替换后的字符串,如果替换后字符串为空,则返回 None + """ + for i in self.rule: + text = text.replace(i, self.rule[i]) + return text + + def filter_name( + self, + text: str, + replace: str = "", + default: str = "", + ) -> str: + """过滤文件夹名称中的非法字符""" + text = text.replace(":", ".") + + text = self.filter(text) + + text = replace_emoji(text, replace, ) + + text = self.clear_spaces(text) + + text = text.strip().strip(".").strip("_") + + return text or default + + @staticmethod + def clear_spaces(string: str): + """将连续的空格转换为单个空格""" + return " ".join(string.split()) + + +if __name__ == "__main__": + demo = Cleaner() + print(demo.rule) + print(demo.filter_name(""))