优化无水印图片链接提取逻辑

2026-03-22 06:57:16 +08:00 · 2023-09-17 15:55:25 +08:00
parent ca0f71b493
commit 38c93a551b
1 changed file with 22 additions and 2 deletions
--- a/source/Image.py
+++ b/source/Image.py
@@ -1,3 +1,4 @@
+from json import loads
 from re import compile

 from .Html import Html
@@ -6,7 +7,26 @@ __all__ = ['Image']


 class Image:
-    IMAGE_URL = compile(r'"CRD_WM_[A-Z]{3,4}","url":"(.*?_wm_1)"')
+    IMAGE_INFO = compile(r'("infoList":\[\{.*?}])')

    def get_image_link(self, html: str) -> list:
-        return [Html.format_url(i) for i in self.IMAGE_URL.findall(html)]
+        data = self.__extract_image_data(html)
+        data = self.__format_image_data(data)
+        return self.__extract_image_urls(data)
+
+    def __extract_image_data(self, html: str) -> list[str]:
+        return self.IMAGE_INFO.findall(html)
+
+    @staticmethod
+    def __format_image_data(data: list[str]) -> list[dict]:
+        return [loads(f"{{{i}}}") for i in data]
+
+    @staticmethod
+    def __extract_image_urls(data: list[dict]) -> list[str]:
+        urls = []
+        for i in data:
+            for j in i.get("infoList", []):
+                if j.get("imageScene", "").startswith("CRD_WM_"):
+                    urls.append(j.get("url", ""))
+                    break
+        return [Html.format_url(i) for i in urls if i]