From b4d7ddf5cbd283bd5c72896334be952270a78995 Mon Sep 17 00:00:00 2001 From: Naibo_Mac_M2 Date: Wed, 11 Dec 2024 23:17:21 +0800 Subject: [PATCH] Fix empty-document error raised by html.fromstring when parsing the page --- ExecuteStage/.vscode/launch.json | 2 +- ExecuteStage/easyspider_executestage.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ExecuteStage/.vscode/launch.json b/ExecuteStage/.vscode/launch.json index 4d1baa3..02d5a3a 100644 --- a/ExecuteStage/.vscode/launch.json +++ b/ExecuteStage/.vscode/launch.json @@ -12,7 +12,7 @@ "justMyCode": false, // "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] - "args": ["--ids", "[5]", "--headless", "0", "--user_data", "0", "--keyboard", "0", + "args": ["--ids", "[35]", "--headless", "0", "--user_data", "0", "--keyboard", "0", "--read_type", "remote", ] // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name" diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index ba0b06c..c3a4a28 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -1148,6 +1148,14 @@ class BrowserThread(Thread): self.history["handle"] = thisHandle thisHistoryURL = self.browser.current_url # 快速提取处理 + # start = time.time() + try: + tree = html.fromstring(self.browser.page_source) + except Exception as e: + self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode") + node["parameters"]["quickExtractable"] = False + # end = time.time() + # print("解析页面秒数:", end - start) if node["parameters"]["quickExtractable"]: self.browser.switch_to.default_content() # 切换到主页面 tree = html.fromstring(self.browser.page_source)