From e2eee03feb2900b3ebc838f1b88b07aa27a9eda6 Mon Sep 17 00:00:00 2001
From: JoeanAmier <yonglelolu@gmail.com>
Date: Fri, 9 Aug 2024 18:35:04 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=9D=9E=E6=B3=95?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=90=8D=E7=A7=B0=E6=8A=A5=E9=94=99=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 替换文件名称包含的非法字符
2. 替换文件名称包含的 Emoji 表情
---
 README.md                    |  1 +
 README_EN.md                 |  1 +
 requirements.txt             |  1 +
 source/application/app.py    | 16 ++++++-
 source/expansion/__init__.py |  1 +
 source/expansion/cleaner.py  | 89 ++++++++++++++++++++++++++++++++++++
 6 files changed, 107 insertions(+), 2 deletions(-)
 create mode 100644 source/expansion/cleaner.py

diff --git a/README.md b/README.md
index 9154d46..7624f25 100644
--- a/README.md
+++ b/README.md
@@ -426,3 +426,4 @@ async def example():
 * https://aiosqlite.omnilib.dev/en/stable/
 * https://click.palletsprojects.com/en/8.1.x/
 * https://github.com/thewh1teagle/rookie
+* https://github.com/carpedm20/emoji/
diff --git a/README_EN.md b/README_EN.md
index 472648b..665c955 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -427,3 +427,4 @@ async def example():
 * https://aiosqlite.omnilib.dev/en/stable/
 * https://click.palletsprojects.com/en/8.1.x/
 * https://github.com/thewh1teagle/rookie
+* https://github.com/carpedm20/emoji/
diff --git a/requirements.txt b/requirements.txt
index a8f12c0..7ece4b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ httpx>=0.27.0
 fastapi>=0.111.0
 uvicorn>=0.30.1
 aiofiles>=24.1.0
+emoji>=2.12.1
diff --git a/source/application/app.py b/source/application/app.py
index 43ee81d..47a425d 100644
--- a/source/application/app.py
+++ b/source/application/app.py
@@ -17,8 +17,10 @@ from uvicorn import Config
 from uvicorn import Server
 
 from source.expansion import BrowserCookie
+from source.expansion import Cleaner
 from source.expansion import Converter
 from source.expansion import Namespace
+from source.expansion import beautify_string
 from source.module import DataRecorder
 from source.module import ExtractData
 from source.module import ExtractParams
@@ -65,6 +67,7 @@ class XHS:
     SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/\S+")
     SHORT = compile(r"https?://xhslink\.com/\S+")
     __INSTANCE = None
+    CLEANER = Cleaner()
 
     def __new__(cls, *args, **kwargs):
         if not cls.__INSTANCE:
@@ -270,7 +273,13 @@ class XHS:
                     values.append(self.__get_name_title(data))
                 case _:
                     values.append(data[key])
-        return self.manager.SEPARATE.join(values)
+        return self.CLEANER.filter_name(
+            self.manager.SEPARATE.join(values),
+            default=self.manager.SEPARATE.join((
+                data["作者ID"],
+                data["作品ID"],
+            )),
+        )
 
     @staticmethod
     def __get_name_time(data: dict) -> str:
@@ -280,7 +289,10 @@ class XHS:
         return self.manager.filter_name(data["作者昵称"]) or data["作者ID"]
 
     def __get_name_title(self, data: dict) -> str:
-        return self.manager.filter_name(data["作品标题"])[:64] or data["作品ID"]
+        return beautify_string(
+            self.manager.filter_name(data["作品标题"]),
+            64,
+        ) or data["作品ID"]
 
     async def monitor(self, delay=1, download=False, log=None, bar=None, data=True, ) -> None:
         logging(
diff --git a/source/expansion/__init__.py b/source/expansion/__init__.py
index 271fca5..5852359 100644
--- a/source/expansion/__init__.py
+++ b/source/expansion/__init__.py
@@ -6,3 +6,4 @@ from .truncate import trim_string
 from .truncate import truncate_string
 from .file_folder import file_switch
 from .file_folder import remove_empty_directories
+from .cleaner import Cleaner
diff --git a/source/expansion/cleaner.py b/source/expansion/cleaner.py
new file mode 100644
index 0000000..f75da6f
--- /dev/null
+++ b/source/expansion/cleaner.py
@@ -0,0 +1,89 @@
+from platform import system
+from string import whitespace
+from emoji import replace_emoji
+from warnings import warn
+
+
+class Cleaner:
+    def __init__(self):
+        """
+        Replace illegal characters in strings; the rule mapping defaults to one built for the current OS and can be changed with set_rule.
+        """
+        self.rule = self.default_rule()  # default illegal-character mapping
+
+    @staticmethod
+    def default_rule():
+        """Build the default illegal-character mapping for the OS reported by platform.system()."""
+        if (s := system()) in ("Windows", "Darwin"):
+            rule = {
+                "/": "",
+                "\\": "",
+                "|": "",
+                "<": "",
+                ">": "",
+                "\"": "",
+                "?": "",
+                ":": "",
+                "*": "",
+                "\x00": "",
+            }  # Windows and macOS
+        elif s == "Linux":
+            rule = {
+                "/": "",
+                "\x00": "",
+            }  # Linux
+        else:
+            warn("不受支持的操作系统类型，可能无法正常去除非法字符！")
+            rule = {}
+        cache = {i: "" for i in whitespace[1:]}  # also drop tab/newline-style whitespace; [1:] skips the leading space
+        return rule | cache
+
+    def set_rule(self, rule: dict[str, str], update=True):
+        """
+        Set the illegal-character mapping.
+
+        :param rule: replacement rules as a dict; keys are the substrings to replace, values are their replacements
+        :param update: if True, merge into the existing mapping (entries in rule win); otherwise replace the mapping
+        """
+        self.rule = {**self.rule, **rule} if update else rule
+
+    def filter(self, text: str) -> str:
+        """
+        Apply every replacement rule in self.rule to the text.
+
+        :param text: the string to process
+        :return: the string after all replacements; it may be empty, but it is always a str (never None)
+        """
+        for i in self.rule:
+            text = text.replace(i, self.rule[i])
+        return text
+
+    def filter_name(
+            self,
+            text: str,
+            replace: str = "",
+            default: str = "",
+    ) -> str:
+        """Sanitize a file or folder name; return default when nothing is left after cleaning."""
+        text = text.replace(":", ".")  # colons become dots before the rule pass runs
+
+        text = self.filter(text)
+
+        text = replace_emoji(text, replace, )  # substitute each emoji with the replace string
+
+        text = self.clear_spaces(text)
+
+        text = text.strip().strip(".").strip("_")  # trim edge whitespace, then dots, then underscores
+
+        return text or default
+
+    @staticmethod
+    def clear_spaces(string: str):
+        """Collapse every run of whitespace into a single space and trim both ends."""
+        return " ".join(string.split())
+
+
+if __name__ == "__main__":
+    demo = Cleaner()  # manual smoke test when the module is run directly
+    print(demo.rule)  # show the default mapping built for this OS
+    print(demo.filter_name(""))  # empty input falls back to the (empty) default