From 76fd4bad55ef2a12a67605eab64f70b295e7278a Mon Sep 17 00:00:00 2001
From: touero
Date: Thu, 25 Apr 2024 23:44:44 +0800
Subject: [PATCH] fix: format string and using enum class defined constants

Introduce a WriteMode IntEnum for the write-mode constants and use it in
BrowserThread instead of bare integers, replace string concatenation with
f-strings, and replace the bare try/except blocks around the "version" and
"links" lookups with dict.get().

The default write mode stays "append" (as before this change); it is only
switched to "create" when the csv/txt/xlsx output file does not exist yet.

---
 ExecuteStage/constants.py               |  9 ++++
 ExecuteStage/easyspider_executestage.py | 66 ++++++++++++-------------
 2 files changed, 42 insertions(+), 33 deletions(-)
 create mode 100644 ExecuteStage/constants.py

diff --git a/ExecuteStage/constants.py b/ExecuteStage/constants.py
new file mode 100644
index 0000000..f6fea8c
--- /dev/null
+++ b/ExecuteStage/constants.py
@@ -0,0 +1,9 @@
+from enum import unique, IntEnum
+
+
+@unique
+class WriteMode(IntEnum):
+    Create_Mode = 0  # 新建模式|Create Mode
+    Append_Mode = 1  # 追加模式|Append Mode
+    Mysql_Mode = 2  # Mysql模式|Mysql Mode
+    Json_Mode = 3  # Json模式|Json Mode
diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py
index 2c56410..45d4199 100644
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -9,6 +9,7 @@ import threading
 # import undetected_chromedriver as uc
 from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
     on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
+from constants import WriteMode
 from myChrome import MyChrome
 from threading import Thread, Event
 from PIL import Image
@@ -132,13 +133,12 @@ class BrowserThread(Thread):
         with open(stealth_path, 'r') as f:
             js = f.read()
             self.print_and_log("Loading stealth.min.js")
-        self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
-            'source': js})  # TMALL 反扒
+        self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js})  # TMALL anti-scraping
         self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-          "source": """
-            Object.defineProperty(navigator, 'webdriver', {
-              get: () => undefined
-            })
+            "source": """
+                Object.defineProperty(navigator, 'webdriver', {
+                  get: () => undefined
+                })
               """
         })
         WebDriverWait(self.browser, 10)
@@ -155,27 +155,26 @@ class BrowserThread(Thread):
         self.outputFormat = service.get("outputFormat", "csv")  # 输出格式
         self.save_threshold = service.get("saveThreshold", 10)  # 保存最低阈值
         self.dataWriteMode = service.get("dataWriteMode", 1)  # 数据写入模式,1为追加,2为覆盖,3为重命名文件
+        self.task_version = service.get("version", "")  # task version; empty when the task file has no "version" field
 
-        try:
-            self.task_version = service["version"]  # 任务版本
-            if service["version"] >= "0.3.1":  # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
-                pass
-            else:  # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
-                if service["version"] != version:
-                    self.print_and_log("版本不一致,请使用" +
-                                       service["version"] + "版本的EasySpider运行该任务!")
-                    self.print_and_log("Version not match, please use EasySpider " +
-                                       service["version"] + " to run this task!")
-                    self.browser.quit()
-                    sys.exit()
-        except:  # 0.2.0版本没有version字段,所以直接退出
+        if not self.task_version:
             self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
             self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
             self.browser.quit()
             sys.exit()
-        try:
-            self.links = list(filter(isnotnull, service["links"].split("\n")))  # 要执行的link的列表
-        except:
+
+        if self.task_version >= "0.3.1":  # tasks from 0.3.1 on run on any EasySpider >= 0.3.1 (NOTE: lexicographic string comparison, unchanged from the old code)
+            pass
+        elif self.task_version != version:  # tasks older than 0.3.1 must be run with exactly the same EasySpider version
+            self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务!")
+            self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
+            self.browser.quit()
+            sys.exit()
+
+        service_links = service.get("links")
+        if service_links:
+            self.links = list(filter(isnotnull, service_links.split("\n")))  # list of links to execute
+        else:
             self.links = list(filter(isnotnull, service["url"]))  # 要执行的link
         self.OUTPUT = []  # 采集的数据
         if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
@@ -188,24 +187,25 @@ class BrowserThread(Thread):
                         i = i + 1
                     self.saveName = self.saveName + '_' + str(i)
                     self.print_and_log("文件已存在,已重命名为", self.saveName)
-        self.writeMode = 1  # 写入模式,0为新建,1为追加
-        if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
-            if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
+        self.writeMode = WriteMode.Append_Mode.value  # write mode: append by default, switched to create below when the output file does not exist
+        if self.outputFormat in ['csv', 'txt', 'xlsx']:
+            if not os.path.exists(f"Data/Task_{self.id}/{self.saveName}.{self.outputFormat}"):
                 self.OUTPUT.append([])  # 添加表头
-                self.writeMode = 0
+                self.writeMode = WriteMode.Create_Mode.value
         elif self.outputFormat == "json":
-            self.writeMode = 3  # JSON模式无需判断是否存在文件
+            self.writeMode = WriteMode.Json_Mode.value  # JSON mode does not need the file-existence check
         elif self.outputFormat == "mysql":
             self.mysql = myMySQL(config["mysql_config_path"])
             self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
-            self.writeMode = 2
-        if self.writeMode == 0:
+            self.writeMode = WriteMode.Mysql_Mode.value  # MySQL mode
+
+        if self.writeMode == WriteMode.Create_Mode.value:
             self.print_and_log("新建模式|Create Mode")
-        elif self.writeMode == 1:
+        elif self.writeMode == WriteMode.Append_Mode.value:
             self.print_and_log("追加模式|Append Mode")
-        elif self.writeMode == 2:
+        elif self.writeMode == WriteMode.Mysql_Mode.value:
             self.print_and_log("MySQL模式|MySQL Mode")
-        elif self.writeMode == 3:
+        elif self.writeMode == WriteMode.Json_Mode.value:
             self.print_and_log("JSON模式|JSON Mode")
         self.containJudge = service["containJudge"]  # 是否含有判断语句
         self.outputParameters = {}
@@ -222,7 +222,7 @@ class BrowserThread(Thread):
                 self.outputParametersTypes.append(param.get("type", "text"))
                 self.outputParametersRecord.append(bool(param.get("recordASField", True)))
                 # 文件叠加的时候不添加表头
-                if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == 0:
+                if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create_Mode.value:
                     self.OUTPUT[0].append(param["name"])
         self.urlId = 0  # 全局记录变量
         self.preprocess()  # 预处理,优化提取数据流程