Fix empty-document bug caused by the html.fromstring function

2025-04-12 11:37:11 +08:00 · 2024-12-11 23:17:21 +08:00 · 2024-12-11 23:17:21 +08:00 · b4d7ddf5cb
commit b4d7ddf5cb
parent 2031b09297
2 changed files with 9 additions and 1 deletion
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
            "justMyCode": false,
            //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
            // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--ids", "[5]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
+            "args": ["--ids", "[35]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
        "--read_type", "remote", 
    ]
            // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -1148,6 +1148,14 @@ class BrowserThread(Thread):
        self.history["handle"] = thisHandle
        thisHistoryURL = self.browser.current_url
        # 快速提取处理
+        # start = time.time()
+        try:
+            tree = html.fromstring(self.browser.page_source)
+        except Exception as e:
+            self.print_and_log("解析页面时出错，将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
+            node["parameters"]["quickExtractable"] = False
+        # end = time.time()
+        # print("解析页面秒数：", end - start)
        if node["parameters"]["quickExtractable"]:
            self.browser.switch_to.default_content() # 切换到主页面
            tree = html.fromstring(self.browser.page_source)