From bba6940cc80a25fd045d4da2a82a4bbc2e4998e6 Mon Sep 17 00:00:00 2001 From: JoeamAmier Date: Sat, 26 Aug 2023 23:33:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 5 +++-- source/Download.py | 3 ++- source/Explore.py | 13 +++++++++---- source/__init__.py | 15 ++++++++++++--- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index 87179b6..d5c5d42 100644 --- a/main.py +++ b/main.py @@ -6,8 +6,9 @@ def example(): image_demo = "https://www.xiaohongshu.com/explore/64d1b406000000000103ee8d" video_demo = "https://www.xiaohongshu.com/explore/64c05652000000000c0378e7" xhs = XHS() - print(xhs.get_image(image_demo)) - print(xhs.get_video(video_demo)) + # print(xhs.get_image(image_demo)) + # print(xhs.get_video(video_demo)) + print(xhs.extract(video_demo)) if __name__ == '__main__': diff --git a/source/Download.py b/source/Download.py index ffeba86..58179b0 100644 --- a/source/Download.py +++ b/source/Download.py @@ -5,9 +5,10 @@ class Download: def __init__( self, path, + folder, headers: dict, proxies=None, ): - self.root = Path(path) + self.root = Path(path).joinpath(folder) self.headers = headers self.proxies = { "http": proxies, diff --git a/source/Explore.py b/source/Explore.py index 3bee102..b133dd9 100644 --- a/source/Explore.py +++ b/source/Explore.py @@ -1,9 +1,14 @@ +from json import loads from re import compile class Explore: - explore_data = compile(r'"noteDetailMap": (\{.*?})') + explore_data = compile( + r'"currentTime":\d{13},"note":(.*?)}},"serverRequestInfo"') - def __init__(self, html, url: str): - self.html = html - self.url = url + def run(self, html: str): + data = self.get_json_data(html) + + def get_json_data(self, html: str) -> dict: + data = self.explore_data.findall(html) + return {} if len(data) != 1 else loads(data[0]) diff --git a/source/__init__.py b/source/__init__.py index a97be84..79740bd 100644 --- a/source/__init__.py +++ b/source/__init__.py @@ -1,4 +1,5 @@ from .Download import Download +from .Explore import Explore from .Html import Html from .Image import Image from .Video import Video @@ -13,15 +14,17 @@ class XHS: def __init__( self, path="./", + folder="Download", headers=None, proxies=None, timeout=10, cookie=None): self.set_cookie(cookie) self.html = Html(headers or self.headers, proxies, timeout) - self.image = Image(self.html) - self.video = Video(self.html) - self.download = Download(path, self.html.headers, proxies) + self.image = Image() + self.video = Video() + self.explore = Explore() + self.download = Download(path, folder, self.html.headers, proxies) def set_cookie(self, cookie: str): if cookie: @@ -38,3 +41,9 @@ class XHS: if download: self.download.run([url]) return url + + def extract(self, url: str): + html = self.html.get_html(url) + if not html: + return None + self.explore.run(html)