fix: 修复非法文件名称报错的问题

1. 替换文件名称包含的非法字符
2. 替换文件名称包含的 Emoji 表情
This commit is contained in:
JoeanAmier
2024-08-09 18:35:04 +08:00
parent ce01a7b2d7
commit e2eee03feb
6 changed files with 107 additions and 2 deletions

View File

@@ -426,3 +426,4 @@ async def example():
* https://aiosqlite.omnilib.dev/en/stable/ * https://aiosqlite.omnilib.dev/en/stable/
* https://click.palletsprojects.com/en/8.1.x/ * https://click.palletsprojects.com/en/8.1.x/
* https://github.com/thewh1teagle/rookie * https://github.com/thewh1teagle/rookie
* https://github.com/carpedm20/emoji/

View File

@@ -427,3 +427,4 @@ async def example():
* https://aiosqlite.omnilib.dev/en/stable/ * https://aiosqlite.omnilib.dev/en/stable/
* https://click.palletsprojects.com/en/8.1.x/ * https://click.palletsprojects.com/en/8.1.x/
* https://github.com/thewh1teagle/rookie * https://github.com/thewh1teagle/rookie
* https://github.com/carpedm20/emoji/

View File

@@ -9,3 +9,4 @@ httpx>=0.27.0
fastapi>=0.111.0 fastapi>=0.111.0
uvicorn>=0.30.1 uvicorn>=0.30.1
aiofiles>=24.1.0 aiofiles>=24.1.0
emoji>=2.12.1

View File

@@ -17,8 +17,10 @@ from uvicorn import Config
from uvicorn import Server from uvicorn import Server
from source.expansion import BrowserCookie from source.expansion import BrowserCookie
from source.expansion import Cleaner
from source.expansion import Converter from source.expansion import Converter
from source.expansion import Namespace from source.expansion import Namespace
from source.expansion import beautify_string
from source.module import DataRecorder from source.module import DataRecorder
from source.module import ExtractData from source.module import ExtractData
from source.module import ExtractParams from source.module import ExtractParams
@@ -65,6 +67,7 @@ class XHS:
SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/\S+") SHARE = compile(r"https?://www\.xiaohongshu\.com/discovery/item/\S+")
SHORT = compile(r"https?://xhslink\.com/\S+") SHORT = compile(r"https?://xhslink\.com/\S+")
__INSTANCE = None __INSTANCE = None
CLEANER = Cleaner()
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
if not cls.__INSTANCE: if not cls.__INSTANCE:
@@ -270,7 +273,13 @@ class XHS:
values.append(self.__get_name_title(data)) values.append(self.__get_name_title(data))
case _: case _:
values.append(data[key]) values.append(data[key])
return self.manager.SEPARATE.join(values) return self.CLEANER.filter_name(
self.manager.SEPARATE.join(values),
default=self.manager.SEPARATE.join((
data["作者ID"],
data["作品ID"],
)),
)
@staticmethod @staticmethod
def __get_name_time(data: dict) -> str: def __get_name_time(data: dict) -> str:
@@ -280,7 +289,10 @@ class XHS:
return self.manager.filter_name(data["作者昵称"]) or data["作者ID"] return self.manager.filter_name(data["作者昵称"]) or data["作者ID"]
def __get_name_title(self, data: dict) -> str: def __get_name_title(self, data: dict) -> str:
return self.manager.filter_name(data["作品标题"])[:64] or data["作品ID"] return beautify_string(
self.manager.filter_name(data["作品标题"]),
64,
) or data["作品ID"]
async def monitor(self, delay=1, download=False, log=None, bar=None, data=True, ) -> None: async def monitor(self, delay=1, download=False, log=None, bar=None, data=True, ) -> None:
logging( logging(

View File

@@ -6,3 +6,4 @@ from .truncate import trim_string
from .truncate import truncate_string from .truncate import truncate_string
from .file_folder import file_switch from .file_folder import file_switch
from .file_folder import remove_empty_directories from .file_folder import remove_empty_directories
from .cleaner import Cleaner

View File

@@ -0,0 +1,89 @@
from platform import system
from string import whitespace
from emoji import replace_emoji
from warnings import warn
class Cleaner:
    """Remove or replace characters that are not allowed in file names.

    The replacement table ``rule`` maps each illegal string to the text that
    should take its place. A default table is generated from the host
    operating system; callers may extend or replace it with ``set_rule``.
    """

    def __init__(self):
        """Create a cleaner that uses the default table for this system."""
        self.rule = self.default_rule()  # default illegal-character table

    @staticmethod
    def default_rule():
        """Build the default illegal-character table for the current system.

        :return: dict whose keys are illegal characters and whose values are
            their replacements (always the empty string here)
        """
        if (s := system()) in ("Windows", "Darwin"):
            rule = {
                "/": "",
                "\\": "",
                "|": "",
                "<": "",
                ">": "",
                "\"": "",
                "?": "",
                ":": "",
                "*": "",
                "\x00": "",
            }  # Windows and macOS
        elif s == "Linux":
            rule = {
                "/": "",
                "\x00": "",
            }  # Linux
        else:
            warn("不受支持的操作系统类型,可能无法正常去除非法字符!")
            rule = {}
        # string.whitespace starts with the plain space, which is legal in
        # file names; every remaining entry (tab, newline, ...) is removed.
        cache = {i: "" for i in whitespace[1:]}
        return rule | cache

    def set_rule(self, rule: dict[str, str], update=True):
        """Set the illegal-character table.

        :param rule: replacement table; keys are the illegal strings and
            values are the text that replaces them
        :param update: when True, merge ``rule`` into the current table
            (entries in ``rule`` win); when False, replace the table
        """
        # Always store a fresh dict so that later changes to the caller's
        # mapping cannot silently alter this cleaner's behaviour.
        self.rule = {**self.rule, **rule} if update else dict(rule)

    def filter(self, text: str) -> str:
        """Apply every entry of the replacement table to ``text``.

        :param text: string to process
        :return: the processed string; it may be empty, but it is always a
            ``str``
        """
        for illegal, substitute in self.rule.items():
            text = text.replace(illegal, substitute)
        return text

    def filter_name(
            self,
            text: str,
            replace: str = "",
            default: str = "",
    ) -> str:
        """Turn ``text`` into a string that is safe to use as a file name.

        :param text: raw name
        :param replace: text substituted for every emoji found in ``text``
        :param default: value returned when nothing is left after cleaning
        :return: the cleaned name, or ``default`` if the result is empty
        """
        # Colons become dots instead of being dropped by the table below.
        text = text.replace(":", ".")
        text = self.filter(text)
        text = replace_emoji(text, replace)
        text = self.clear_spaces(text)
        # Strip spaces, dots and underscores from both ends in one pass.
        # Stripping them one after another leaves mixed runs behind, e.g.
        # "_._" would become "." and "name ." would keep a trailing space.
        text = text.strip(" ._")
        return text or default

    @staticmethod
    def clear_spaces(string: str):
        """Collapse every run of whitespace into a single space.

        Leading and trailing whitespace is removed as a side effect of
        splitting and re-joining the string.
        """
        return " ".join(string.split())
if __name__ == "__main__":
    # Manual smoke check: show the table chosen for this system, then the
    # result of cleaning an empty name (which falls back to the default "").
    cleaner = Cleaner()
    print(cleaner.rule)
    print(cleaner.filter_name(""))