MacOS Test

This commit is contained in:
Naibo_Mac_M2 2023-12-07 05:06:09 +08:00
parent e4037e221d
commit 5376aa37b0
12 changed files with 508 additions and 248 deletions

View File

@ -1,6 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# import atexit # import atexit
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel, write_to_json import atexit
import copy
import shutil
import string
import undetected_chromedriver as uc
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, write_to_csv, write_to_excel, write_to_json
from myChrome import MyChrome from myChrome import MyChrome
from threading import Thread, Event from threading import Thread, Event
from PIL import Image from PIL import Image
@ -22,6 +28,7 @@ from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from pynput.keyboard import Key, Listener from pynput.keyboard import Key, Listener
from datetime import datetime from datetime import datetime
import io # 遇到错误退出时应执行的代码 import io # 遇到错误退出时应执行的代码
@ -40,8 +47,8 @@ from ddddocr import DdddOcr
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml import etree from lxml import etree
import onnxruntime import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志 onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
import undetected_chromedriver as uc
# import pandas as pd # import pandas as pd
# import numpy # import numpy
# import pytesseract # import pytesseract
@ -53,7 +60,7 @@ desired_capabilities["pageLoadStrategy"] = "none"
class BrowserThread(Thread): class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config): def __init__(self, browser_t, id, service, version, event, saveName, config, option):
Thread.__init__(self) Thread.__init__(self)
self.logs = io.StringIO() self.logs = io.StringIO()
try: try:
@ -61,6 +68,7 @@ class BrowserThread(Thread):
except: except:
self.log = True self.log = True
self.browser = browser_t self.browser = browser_t
self.option = option
self.config = config self.config = config
self.version = version self.version = version
self.totalSteps = 0 self.totalSteps = 0
@ -76,25 +84,32 @@ class BrowserThread(Thread):
self.SAVED = False self.SAVED = False
self.BREAK = False self.BREAK = False
self.CONTINUE = False self.CONTINUE = False
try:
maximizeWindow = service["maximizeWindow"]
except:
maximizeWindow = 0
if maximizeWindow == 1:
self.browser.maximize_window()
# 名称设定 # 名称设定
if saveName != "": # 命令行覆盖保存名称 if saveName != "": # 命令行覆盖保存名称
self.saveName = saveName # 保存文件的名字 self.saveName = saveName # 保存文件的名字
now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
self.saveName = self.saveName.replace("current_time", now) self.saveName = self.saveName.replace("current_time", now)
self.print_and_log("任务ID", i, "的保存文件名为:", self.saveName) self.print_and_log("任务ID", id, "的保存文件名为:", self.saveName)
self.print_and_log("Save Name for task ID", i, "is:", self.saveName) self.print_and_log("Save Name for task ID", id, "is:", self.saveName)
if not os.path.exists("Data/Task_" + str(i)): if not os.path.exists("Data/Task_" + str(id)):
os.mkdir("Data/Task_" + str(i)) os.mkdir("Data/Task_" + str(id))
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName): if not os.path.exists("Data/Task_" + str(id) + "/" + self.saveName):
os.mkdir("Data/Task_" + str(i) + "/" + os.mkdir("Data/Task_" + str(id) + "/" +
self.saveName) # 创建保存文件夹用来保存截图 self.saveName) # 创建保存文件夹用来保存截图
self.getDataStep = 0 self.getDataStep = 0
self.startSteps = 0 self.startSteps = 0
try: try:
startFromExit = service["startFromExit"] # 从上次退出的步骤开始 startFromExit = service["startFromExit"] # 从上次退出的步骤开始
if startFromExit == 1: if startFromExit == 1:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r', encoding='utf-8-sig') as file_obj: with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
encoding='utf-8-sig') as file_obj:
self.startSteps = int(file_obj.read()) # 读取已执行步数 self.startSteps = int(file_obj.read()) # 读取已执行步数
except: except:
pass pass
@ -116,6 +131,11 @@ class BrowserThread(Thread):
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': js}) # TMALL 反扒 'source': js}) # TMALL 反扒
WebDriverWait(self.browser, 10) WebDriverWait(self.browser, 10)
self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id))
self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
self.browser.execute("send_command", self.paramss) # 下载地址改变
# self.browser.get('about:blank') # self.browser.get('about:blank')
self.procedure = service["graph"] # 程序执行流程 self.procedure = service["graph"] # 程序执行流程
try: try:
@ -148,8 +168,11 @@ class BrowserThread(Thread):
self.save_threshold = service["saveThreshold"] # 保存最低阈值 self.save_threshold = service["saveThreshold"] # 保存最低阈值
except: except:
self.save_threshold = 10 self.save_threshold = 10
try:
self.links = list( self.links = list(
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表 filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
except:
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
self.OUTPUT = [] # 采集的数据 self.OUTPUT = [] # 采集的数据
self.writeMode = 1 # 写入模式0为新建1为追加 self.writeMode = 1 # 写入模式0为新建1为追加
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx": if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
@ -228,7 +251,11 @@ class BrowserThread(Thread):
cookies = node["parameters"]["cookies"] cookies = node["parameters"]["cookies"]
except: except:
node["parameters"]["cookies"] = "" node["parameters"]["cookies"] = ""
if node["option"] == 2: # 点击操作 elif node["option"] == 2: # 点击操作
try:
alertHandleType = node["parameters"]["alertHandleType"]
except:
node["parameters"]["alertHandleType"] = 0
if node["parameters"]["useLoop"]: if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5": if self.task_version <= "0.3.5":
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
@ -263,8 +290,10 @@ class BrowserThread(Thread):
if para["contentType"] == 8: if para["contentType"] == 8:
self.print_and_log( self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。") "默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
self.print_and_log("If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.") self.print_and_log(
if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para["nodeType"] <= 2: "If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para[
"nodeType"] <= 2:
para["optimizable"] = True para["optimizable"] = True
else: else:
para["optimizable"] = False para["optimizable"] = False
@ -289,6 +318,13 @@ class BrowserThread(Thread):
node["parameters"]["xpath"] = "" node["parameters"]["xpath"] = ""
self.print_and_log("您的任务版本号为" + self.task_version + self.print_and_log("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath") "循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 8: # 循环操作
try:
exitElement = node["parameters"]["exitElement"]
if exitElement == "":
node["parameters"]["exitElement"] = "//body"
except:
node["parameters"]["exitElement"] = "//body"
self.print_and_log("预处理完成|Preprocess completed") self.print_and_log("预处理完成|Preprocess completed")
def readFromExcel(self): def readFromExcel(self):
@ -299,7 +335,8 @@ class BrowserThread(Thread):
except: except:
self.print_and_log("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确", self.print_and_log("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确",
os.path.abspath(self.inputExcel)) os.path.abspath(self.inputExcel))
self.print_and_log("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ", self.print_and_log(
"Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ",
os.path.abspath(self.inputExcel)) os.path.abspath(self.inputExcel))
time.sleep(5) time.sleep(5)
return 0 return 0
@ -329,7 +366,7 @@ class BrowserThread(Thread):
if "urlList_0" in data.keys(): if "urlList_0" in data.keys():
self.links = data["urlList_0"] self.links = data["urlList_0"]
except: except:
pass self.links = "about:blank"
task = self.service task = self.service
for key, value in data.items(): for key, value in data.items():
for i in range(len(task["inputParameters"])): for i in range(len(task["inputParameters"])):
@ -367,6 +404,20 @@ class BrowserThread(Thread):
self.saveData(exit=True) self.saveData(exit=True)
if self.outputFormat == "mysql": if self.outputFormat == "mysql":
self.mysql.close() self.mysql.close()
try:
quitWaitTime = self.service["quitWaitTime"]
except:
quitWaitTime = 60
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
time.sleep(quitWaitTime)
self.browser.quit()
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
try:
shutil.rmtree(self.option["tmp_user_data_folder"])
except:
pass
self.print_and_log("清理完成!|Clean up completed!")
def recordLog(self, *args, **kwargs): def recordLog(self, *args, **kwargs):
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") now = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
@ -387,11 +438,13 @@ class BrowserThread(Thread):
# 写入日志 # 写入日志
# self.recordLog("持久化存储数据/Persistently store data") # self.recordLog("持久化存储数据/Persistently store data")
if self.log: if self.log:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '.log', 'a', encoding='utf-8-sig') as file_obj: with open("Data/Task_" + str(self.id) + "/" + self.saveName + '.log', 'a',
encoding='utf-8-sig') as file_obj:
file_obj.write(self.logs.getvalue()) file_obj.write(self.logs.getvalue())
file_obj.close() file_obj.close()
# 写入已执行步数 # 写入已执行步数
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'w', encoding='utf-8-sig') as file_obj: with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'w',
encoding='utf-8-sig') as file_obj:
file_obj.write(str(self.totalSteps + 1)) file_obj.write(str(self.totalSteps + 1))
file_obj.close() file_obj.close()
# 写入数据 # 写入数据
@ -409,7 +462,8 @@ class BrowserThread(Thread):
elif self.outputFormat == "json": elif self.outputFormat == "json":
file_name = "Data/Task_" + \ file_name = "Data/Task_" + \
str(self.id) + "/" + self.saveName + '.json' str(self.id) + "/" + self.saveName + '.json'
write_to_json(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord, self.outputParameters.keys()) write_to_json(file_name, self.OUTPUT, self.outputParametersTypes,
self.outputParametersRecord, self.outputParameters.keys())
elif self.outputFormat == "mysql": elif self.outputFormat == "mysql":
self.mysql.write_to_mysql( self.mysql.write_to_mysql(
self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes) self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
@ -549,6 +603,7 @@ class BrowserThread(Thread):
self.recordLog("JavaScript execution failed") self.recordLog("JavaScript execution failed")
elif int(codeMode) == 5: elif int(codeMode) == 5:
try: try:
code = readCode(code)
output = exec(code) output = exec(code)
self.recordLog("执行下面的代码:" + code) self.recordLog("执行下面的代码:" + code)
self.recordLog("Execute the following code:" + code) self.recordLog("Execute the following code:" + code)
@ -558,6 +613,7 @@ class BrowserThread(Thread):
code, ", error is:", e) code, ", error is:", e)
elif int(codeMode) == 6: elif int(codeMode) == 6:
try: try:
code = readCode(code)
output = eval(code) output = eval(code)
self.recordLog("获得下面的代码返回值:" + code) self.recordLog("获得下面的代码返回值:" + code)
self.recordLog( self.recordLog(
@ -618,6 +674,13 @@ class BrowserThread(Thread):
elif codeMode == 4: elif codeMode == 4:
self.CONTINUE = True self.CONTINUE = True
self.recordLog("跳过本次循环|Skip this loop") self.recordLog("跳过本次循环|Skip this loop")
elif codeMode == 7: # 暂停程序执行
self.event.clear()
self.print_and_log(
f"根据设置的自定义操作,任务已暂停,长按{self.service['pauseKey']}键继续执行...|Task paused according to custom operation, long press '{self.service['pauseKey']}' to continue...")
elif codeMode == 8: # 刷新页面
self.browser.refresh()
self.print_and_log("根据设置的自定义操作,任务已刷新页面|Task refreshed page according to custom operation")
else: # 0 1 5 6 else: # 0 1 5 6
output = self.execute_code( output = self.execute_code(
codeMode, code, max_wait_time, iframe=paras["iframe"]) codeMode, code, max_wait_time, iframe=paras["iframe"])
@ -647,7 +710,8 @@ class BrowserThread(Thread):
optionValue = loopValue optionValue = loopValue
optionMode = 1 optionMode = 1
try: try:
xpath = replace_field_values(para["xpath"], self.outputParameters, self) xpath = replace_field_values(
para["xpath"], self.outputParameters, self)
dropdown = Select(self.browser.find_element( dropdown = Select(self.browser.find_element(
By.XPATH, xpath, iframe=para["iframe"])) By.XPATH, xpath, iframe=para["iframe"]))
try: try:
@ -678,7 +742,8 @@ class BrowserThread(Thread):
def moveToElement(self, para, loopElement=None, loopPath="", index=0): def moveToElement(self, para, loopElement=None, loopPath="", index=0):
time.sleep(0.1) # 移动之前等待0.1秒 time.sleep(0.1) # 移动之前等待0.1秒
loopPath = replace_field_values(loopPath, self.outputParameters, self) loopPath = replace_field_values(loopPath, self.outputParameters, self)
xpath = replace_field_values(para["xpath"], self.outputParameters, self) xpath = replace_field_values(
para["xpath"], self.outputParameters, self)
if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath
if xpath == "": if xpath == "":
path = loopPath path = loopPath
@ -746,7 +811,7 @@ class BrowserThread(Thread):
for i in node["sequence"]: # 从根节点开始向下读取 for i in node["sequence"]: # 从根节点开始向下读取
self.executeNode(i, loopValue, loopPath, index) self.executeNode(i, loopValue, loopPath, index)
elif node["option"] == 1: # 打开网页操作 elif node["option"] == 1: # 打开网页操作
if not (nodeId == 1 and self.service["cloudflare"] == 1): # if not (nodeId == 1 and self.service["cloudflare"] == 1):
self.openPage(node["parameters"], loopValue) self.openPage(node["parameters"], loopValue)
elif node["option"] == 2: # 点击元素 elif node["option"] == 2: # 点击元素
self.clickElement(node["parameters"], loopValue, loopPath, index) self.clickElement(node["parameters"], loopValue, loopPath, index)
@ -842,16 +907,20 @@ class BrowserThread(Thread):
elif tType <= 8: # JS命令返回值 elif tType <= 8: # JS命令返回值
if tType == 5: # JS命令返回值等于 if tType == 5: # JS命令返回值等于
output = self.execute_code( output = self.execute_code(
0, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], iframe=cnode["parameters"]["iframe"]) 0, cnode["parameters"]["code"], cnode["parameters"]["waitTime"],
iframe=cnode["parameters"]["iframe"])
elif tType == 6: # System elif tType == 6: # System
output = self.execute_code( output = self.execute_code(
1, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], iframe=cnode["parameters"]["iframe"]) 1, cnode["parameters"]["code"], cnode["parameters"]["waitTime"],
iframe=cnode["parameters"]["iframe"])
elif tType == 7: # 针对当前循环项的JS命令返回值 elif tType == 7: # 针对当前循环项的JS命令返回值
output = self.execute_code( output = self.execute_code(
2, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement, iframe=cnode["parameters"]["iframe"]) 2, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement,
iframe=cnode["parameters"]["iframe"])
elif tType == 8: # 针对当前循环项的System命令返回值 elif tType == 8: # 针对当前循环项的System命令返回值
output = self.execute_code( output = self.execute_code(
6, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement, iframe=cnode["parameters"]["iframe"]) 6, cnode["parameters"]["code"], cnode["parameters"]["waitTime"], loopElement,
iframe=cnode["parameters"]["iframe"])
try: try:
if output.find("rue") != -1: # 如果返回值中包含true if output.find("rue") != -1: # 如果返回值中包含true
code = 1 code = 1
@ -869,14 +938,52 @@ class BrowserThread(Thread):
self.recordLog( self.recordLog(
"判断条件内所有条件分支的条件都不满足|None of the conditions in the judgment condition are met") "判断条件内所有条件分支的条件都不满足|None of the conditions in the judgment condition are met")
def handleHistory(self, node, xpath, thisHitoryURL, thisHistoryLength, index, element=None, elements=None):
    """Rewind the browser history after one loop iteration and return the loop target.

    When the history length stored in ``self.history["index"]`` differs from
    ``thisHistoryLength`` and the browser is still on the window handle stored
    in ``self.history["handle"]``, the browser is moved by the difference with
    ``history.go(...)``, the method sleeps ``node["parameters"]["historyWait"]``
    seconds and then tries ``window.stop()``.

    If the browser then sits on a ``data:`` URL, it steps forward with
    ``history.go(1)`` until the current URL equals ``thisHitoryURL`` (giving up
    after more than ``thisHistoryLength`` attempts), re-locates the loop
    target by ``xpath`` and decrements ``index`` so the item is retried.

    Args:
        node: Loop node; ``node["parameters"]["historyWait"]`` and
            ``node["parameters"]["iframe"]`` are read.
        xpath: XPath used to re-locate the loop target.
        thisHitoryURL: URL recorded when the loop started.
        thisHistoryLength: ``history.length`` recorded when the loop started.
        index: Current loop index.
        element: Current element for a fixed-element loop, else ``None``.
        elements: Current element list for a non-fixed loop, else ``None``.

    Returns:
        tuple: ``(index, element)``. When ``element`` was not supplied and was
        not re-located, ``elements`` is returned in its place.
    """
    # NOTE(review): block nesting was reconstructed from a view without
    # indentation -- confirm against the real file before merging.
    history_changed = (
        self.history["index"] != thisHistoryLength
        and self.history["handle"] == self.browser.current_window_handle
    )
    if history_changed:
        # Negative when the iteration pushed new entries, i.e. go back.
        difference = thisHistoryLength - self.history["index"]
        self.browser.execute_script('history.go(' + str(difference) + ')')
        time.sleep(node["parameters"]["historyWait"])  # wait after navigating back
        try:
            self.browser.execute_script('window.stop()')
        except Exception:
            pass  # best effort: stopping the page load may fail
        if self.browser.current_url.startswith("data:"):
            attempts = 0
            while self.browser.current_url != thisHitoryURL:
                try:
                    # On a data: URL, step forward one entry at a time.
                    self.browser.execute_script("history.go(1)")
                except Exception:
                    pass  # e.g. script timeout; retry on the next pass
                attempts += 1
                if self.browser.current_url == thisHitoryURL or attempts > thisHistoryLength:
                    break
                time.sleep(2)
            # `iframe=` is not a stock Selenium argument; presumably handled by
            # the project's browser wrapper -- verify.
            if element is None:  # non-fixed element list
                element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
            else:  # fixed element list
                element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
            if index > 0:
                index -= 1  # retry the same item after recovering from a data: URL
    # Fix: always fall back to the caller's list. Previously this fallback ran
    # only when the history was unchanged, so a rewind on a normal (non-data:)
    # URL returned None and callers taking len() of the result crashed.
    if element is None:
        element = elements
    return index, element
# 对循环的处理 # 对循环的处理
def loopExecute(self, node, loopValue, clickPath="", index=0): def loopExecute(self, node, loopValue, clickPath="", index=0):
time.sleep(0.1) # 第一次执行循环的时候强制等待1秒 time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
try:
thisHistoryLength = self.browser.execute_script( thisHistoryLength = self.browser.execute_script(
'return history.length') # 记录本次循环内的history的length 'return history.length') # 记录本次循环内的history的length
except:
thisHistoryLength = 0
self.history["index"] = thisHistoryLength self.history["index"] = thisHistoryLength
self.history["handle"] = thisHandle self.history["handle"] = thisHandle
thisHitoryURL = self.browser.current_url
if int(node["parameters"]["loopType"]) == 0: # 单个元素循环 if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
# 无跳转标签页操作 # 无跳转标签页操作
count = 0 # 执行次数 count = 0 # 执行次数
@ -887,8 +994,7 @@ class BrowserThread(Thread):
# newBodyText = self.browser.page_source # newBodyText = self.browser.page_source
# newBodyText = self.browser.find_element(By.XPATH, "//body").text # newBodyText = self.browser.find_element(By.XPATH, "//body").text
if node["parameters"]["exitCount"] == 0: if node["parameters"]["exitCount"] == 0:
newBodyText = self.browser.find_element( newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
By.CSS_SELECTOR, "body", iframe=False).text
if node["parameters"]["iframe"]: # 如果标记了iframe if node["parameters"]["iframe"]: # 如果标记了iframe
iframes = self.browser.find_elements( iframes = self.browser.find_elements(
By.CSS_SELECTOR, "iframe", iframe=False) By.CSS_SELECTOR, "iframe", iframe=False)
@ -955,7 +1061,9 @@ class BrowserThread(Thread):
break break
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
node["parameters"]["breakCodeWaitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
@ -970,7 +1078,8 @@ class BrowserThread(Thread):
self.print_and_log("Loop element not found: ", self.print_and_log("Loop element not found: ",
xpath) xpath)
self.print_and_log("找不到循环元素: ", xpath) self.print_and_log("找不到循环元素: ", xpath)
for index in range(len(elements)): index = 0
while index < len(elements):
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作 for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
self.executeNode(i, elements[index], self.executeNode(i, elements[index],
xpath, index) xpath, index)
@ -998,29 +1107,16 @@ class BrowserThread(Thread):
self.print_and_log("关闭标签页发生错误:", e) self.print_and_log("关闭标签页发生错误:", e)
self.print_and_log( self.print_and_log(
"Error occurred while closing tab: ", e) "Error occurred while closing tab: ", e)
if self.history["index"] != thisHistoryLength and self.history[ index, elements = self.handleHistory(node, xpath, thisHitoryURL, thisHistoryLength, index, elements=elements)
"handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
difference = thisHistoryLength - \
self.history["index"] # 计算历史记录变化差值
self.browser.execute_script(
'history.go(' + str(difference) + ')') # 回退历史记录
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
# else:
# time.sleep(2)
# 切换历史记录等待:
self.recordLog("Change history back time or: ",
node["parameters"]["historyWait"])
try:
self.browser.execute_script('window.stop()')
except:
pass
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
node["parameters"]["breakCodeWaitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
index = index + 1
except NoSuchElementException: except NoSuchElementException:
self.print_and_log("Loop element not found: ", xpath) self.print_and_log("Loop element not found: ", xpath)
self.print_and_log("找不到循环元素: ", xpath) self.print_and_log("找不到循环元素: ", xpath)
@ -1028,9 +1124,14 @@ class BrowserThread(Thread):
raise raise
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
# 千万不要忘了分割!! # 千万不要忘了分割!!
for path in node["parameters"]["pathList"].split("\n"): paths = node["parameters"]["pathList"].split("\n")
# for path in node["parameters"]["pathList"].split("\n"):
index = 0
while index < len(paths):
path = paths[index]
try: try:
path = replace_field_values(path, self.outputParameters, self) path = replace_field_values(
path, self.outputParameters, self)
element = self.browser.find_element( element = self.browser.find_element(
By.XPATH, path, iframe=node["parameters"]["iframe"]) By.XPATH, path, iframe=node["parameters"]["iframe"])
# self.recordLog("循环元素|Loop element:", path) # self.recordLog("循环元素|Loop element:", path)
@ -1060,34 +1161,23 @@ class BrowserThread(Thread):
self.print_and_log("关闭标签页发生错误:", e) self.print_and_log("关闭标签页发生错误:", e)
self.print_and_log( self.print_and_log(
"Error occurred while closing tab: ", e) "Error occurred while closing tab: ", e)
if self.history["index"] != thisHistoryLength and self.history[ index, element = self.handleHistory(node, path, thisHitoryURL, thisHistoryLength, index, element=element)
"handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
difference = thisHistoryLength - \
self.history["index"] # 计算历史记录变化差值
self.browser.execute_script(
'history.go(' + str(difference) + ')') # 回退历史记录
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
# else:
# time.sleep(2)
self.recordLog("Change history back time or: ",
node["parameters"]["historyWait"])
try:
self.browser.execute_script('window.stop()')
except:
pass
except NoSuchElementException: except NoSuchElementException:
self.print_and_log("Loop element not found: ", path) self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素: ", path) self.print_and_log("找不到循环元素: ", path)
index += 1
continue # 循环中找不到元素就略过操作 continue # 循环中找不到元素就略过操作
except Exception as e: except Exception as e:
raise raise
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
node["parameters"]["breakCodeWaitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
index = index + 1
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表 elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
textList = node["parameters"]["textList"].split("\n") textList = node["parameters"]["textList"].split("\n")
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量 if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量
@ -1106,7 +1196,9 @@ class BrowserThread(Thread):
break break
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
node["parameters"]["breakCodeWaitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
@ -1134,7 +1226,9 @@ class BrowserThread(Thread):
break break
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
node["parameters"]["breakCodeWaitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
@ -1142,13 +1236,16 @@ class BrowserThread(Thread):
while True: # do while循环 while True: # do while循环
if int(node["parameters"]["loopType"]) == 5: # JS if int(node["parameters"]["loopType"]) == 5: # JS
output = self.execute_code( output = self.execute_code(
0, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"]) 0, node["parameters"]["code"], node["parameters"]["waitTime"],
iframe=node["parameters"]["iframe"])
elif int(node["parameters"]["loopType"]) == 6: # System elif int(node["parameters"]["loopType"]) == 6: # System
output = self.execute_code( output = self.execute_code(
1, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"]) 1, node["parameters"]["code"], node["parameters"]["waitTime"],
iframe=node["parameters"]["iframe"])
elif int(node["parameters"]["loopType"]) == 7: # Python elif int(node["parameters"]["loopType"]) == 7: # Python
output = self.execute_code( output = self.execute_code(
6, node["parameters"]["code"], node["parameters"]["waitTime"], iframe=node["parameters"]["iframe"]) 6, node["parameters"]["code"], node["parameters"]["waitTime"],
iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
@ -1224,13 +1321,17 @@ class BrowserThread(Thread):
"return history.length") "return history.length")
except: except:
self.history["index"] = 0 self.history["index"] = 0
except Exception as e:
self.print_and_log("History Length Error")
self.history["index"] = 0
self.scrollDown(para) # 控制屏幕向下滚动 self.scrollDown(para) # 控制屏幕向下滚动
# 键盘输入事件 # 键盘输入事件
def inputInfo(self, para, loopValue): def inputInfo(self, para, loopValue):
time.sleep(0.1) # 输入之前等待0.1秒 time.sleep(0.1) # 输入之前等待0.1秒
try: try:
xpath = replace_field_values(para["xpath"], self.outputParameters, self) xpath = replace_field_values(
para["xpath"], self.outputParameters, self)
textbox = self.browser.find_element( textbox = self.browser.find_element(
By.XPATH, xpath, iframe=para["iframe"]) By.XPATH, xpath, iframe=para["iframe"])
# textbox.send_keys(Keys.CONTROL, 'a') # textbox.send_keys(Keys.CONTROL, 'a')
@ -1289,8 +1390,10 @@ class BrowserThread(Thread):
try: try:
# element = self.browser.find_element( # element = self.browser.find_element(
# By.XPATH, path, iframe=para["iframe"]) # By.XPATH, path, iframe=para["iframe"])
clickPath = replace_field_values(clickPath, self.outputParameters, self) clickPath = replace_field_values(
xpath = replace_field_values(para["xpath"], self.outputParameters, self) clickPath, self.outputParameters, self)
xpath = replace_field_values(
para["xpath"], self.outputParameters, self)
if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath
if xpath == "": if xpath == "":
path = clickPath path = clickPath
@ -1342,6 +1445,22 @@ class BrowserThread(Thread):
self.print_and_log("Failed to click element:" + path, self.print_and_log("Failed to click element:" + path,
", please try to change the click type to JavaScript Click.") ", please try to change the click type to JavaScript Click.")
self.print_and_log(e) self.print_and_log(e)
# 弹窗处理
if para["alertHandleType"] > 0:
try:
time.sleep(1.5)
alert = self.browser.switch_to.alert
alertHandleType = int(para["alertHandleType"])
if alertHandleType == 1:
alert.accept()
self.print_and_log("已点击确认|Clicked OK")
elif alertHandleType == 2:
alert.dismiss()
self.print_and_log("已点击取消|Clicked Cancel")
except Exception as e:
self.print_and_log("找不到弹窗|Cannot find alert")
# 点击后对该元素执行一段JavaScript代码 # 点击后对该元素执行一段JavaScript代码
try: try:
if para["afterJS"] != "": if para["afterJS"] != "":
@ -1375,6 +1494,9 @@ class BrowserThread(Thread):
pass pass
self.history["index"] = self.browser.execute_script( self.history["index"] = self.browser.execute_script(
"return history.length") "return history.length")
except Exception as e:
self.print_and_log("History Length Error")
self.history["index"] = 0
else: else:
try: try:
self.history["index"] = self.browser.execute_script( self.history["index"] = self.browser.execute_script(
@ -1387,6 +1509,9 @@ class BrowserThread(Thread):
self.history["index"] = self.browser.execute_script( self.history["index"] = self.browser.execute_script(
"return history.length") "return history.length")
# 如果打开了新窗口,切换到新窗口 # 如果打开了新窗口,切换到新窗口
except Exception as e:
self.print_and_log("History Length Error")
self.history["index"] = 0
self.scrollDown(para) # 根据参数配置向下滚动 self.scrollDown(para) # 根据参数配置向下滚动
# rt.end() # rt.end()
@ -1556,7 +1681,8 @@ class BrowserThread(Thread):
# 提取数据事件 # 提取数据事件
def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0): def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
parentPath = replace_field_values(parentPath, self.outputParameters, self) parentPath = replace_field_values(
parentPath, self.outputParameters, self)
if para["clear"] == 1: if para["clear"] == 1:
self.clearOutputParameters() self.clearOutputParameters()
try: try:
@ -1591,7 +1717,8 @@ class BrowserThread(Thread):
# relativeXPath = lowercase_tags_in_xpath(relativeXPath) # relativeXPath = lowercase_tags_in_xpath(relativeXPath)
# 已经有text()或@href了不需要再加 # 已经有text()或@href了不需要再加
content_type = "" content_type = ""
if relativeXPath.find("/@href") >= 0 or relativeXPath.find("/text()") >= 0 or relativeXPath.find("::text()") >= 0: if relativeXPath.find("/@href") >= 0 or relativeXPath.find("/text()") >= 0 or relativeXPath.find(
"::text()") >= 0:
content_type = "" content_type = ""
elif p["nodeType"] == 2: elif p["nodeType"] == 2:
content_type = "//@href" content_type = "//@href"
@ -1642,20 +1769,26 @@ class BrowserThread(Thread):
else: else:
content = p["default"] content = p["default"]
if not self.dataNotFoundKeys[p["name"]]: if not self.dataNotFoundKeys[p["name"]]:
self.print_and_log('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % ( self.print_and_log(
'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
relativeXPath, p["name"])) relativeXPath, p["name"]))
self.print_and_log("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % ( self.print_and_log(
"提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
p["name"], relativeXPath)) p["name"], relativeXPath))
self.dataNotFoundKeys[p["name"]] = True self.dataNotFoundKeys[p["name"]] = True
except Exception as e: except Exception as e:
if not self.dataNotFoundKeys[p["name"]]: if not self.dataNotFoundKeys[p["name"]]:
self.print_and_log('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % ( self.print_and_log(
'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
relativeXPath, p["name"])) relativeXPath, p["name"]))
self.print_and_log("提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % ( self.print_and_log(
"提取数据操作时,字段名 %s 对应XPath %s 未找到(请查看原因,如是否翻页太快页面元素未加载出来),使用默认值,本字段将不再重复报错" % (
p["name"], relativeXPath)) p["name"], relativeXPath))
self.dataNotFoundKeys[p["name"]] = True self.dataNotFoundKeys[p["name"]] = True
try:
self.outputParameters[p["name"]] = content self.outputParameters[p["name"]] = content
except:
self.outputParameters[p["name"]] = p["default"]
# 对于不能优化的操作使用selenium执行 # 对于不能优化的操作使用selenium执行
for p in para["paras"]: for p in para["paras"]:
if not p["optimizable"]: if not p["optimizable"]:
@ -1686,7 +1819,8 @@ class BrowserThread(Thread):
else: else:
element = self.browser.find_element( element = self.browser.find_element(
By.XPATH, relativeXPath, iframe=p["iframe"]) By.XPATH, relativeXPath, iframe=p["iframe"])
except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值 except (
NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
# self.print_and_log(p) # self.print_and_log(p)
try: try:
content = p["default"] content = p["default"]
@ -1695,9 +1829,11 @@ class BrowserThread(Thread):
self.outputParameters[p["name"]] = content self.outputParameters[p["name"]] = content
try: try:
if not self.dataNotFoundKeys[p["name"]]: if not self.dataNotFoundKeys[p["name"]]:
self.print_and_log('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % ( self.print_and_log(
'Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (
relativeXPath, p["name"])) relativeXPath, p["name"]))
self.print_and_log("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % ( self.print_and_log(
"提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
p["name"], relativeXPath)) p["name"], relativeXPath))
except: except:
pass pass
@ -1759,15 +1895,13 @@ class BrowserThread(Thread):
self.maxViewLength, self.outputParametersRecord) self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line) self.OUTPUT.append(line)
if __name__ == '__main__': if __name__ == '__main__':
from multiprocessing import freeze_support from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开 freeze_support() # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度 # 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed # If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = { config = {
"id": [0], "ids": [0],
"saved_file_name": "", "saved_file_name": "",
"user_data": False, "user_data": False,
"config_folder": "", "config_folder": "",
@ -1776,23 +1910,24 @@ if __name__ == '__main__':
"headless": False, "headless": False,
"server_address": "http://localhost:8074", "server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入 "keyboard": True, # 是否监听键盘输入
"version": "0.5.0", "version": "0.6.0",
} }
c = Config(config) c = Config(config)
print(c) print(c)
options = Options() options = webdriver.ChromeOptions()
driver_path = "chromedriver.exe" driver_path = "chromedriver.exe"
import platform import platform
print(sys.platform, platform.architecture()) print(sys.platform, platform.architecture())
option = webdriver.ChromeOptions() # option = webdriver.ChromeOptions()
if not os.path.exists(os.getcwd()+"/Data"): if not os.path.exists(os.getcwd() + "/Data"):
os.mkdir(os.getcwd()+"/Data") os.mkdir(os.getcwd() + "/Data")
if sys.platform == "darwin" and platform.architecture()[0] == "64bit": if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome" options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# MacOS需要用option而不是options # MacOS需要用option而不是options
option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome" # option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
option.add_extension( # option.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx") # "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension( options.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx") "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64" driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
@ -1805,32 +1940,32 @@ if __name__ == '__main__':
c.config_folder = os.path.expanduser( c.config_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/") "~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder) # print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径 elif os.path.exists(os.getcwd() + "/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider", print("Finding chromedriver in EasySpider",
os.getcwd()+"/EasySpider") os.getcwd() + "/EasySpider")
if sys.platform == "win32" and platform.architecture()[0] == "32bit": if sys.platform == "win32" and platform.architecture()[0] == "32bit":
options.binary_location = os.path.join( options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置 os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
option.binary_location = os.path.join( # option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置 # os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os.path.join( driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe") os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx") # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx") options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "win32" and platform.architecture()[0] == "64bit": elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
options.binary_location = os.path.join( options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe") os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
option.binary_location = os.path.join( # option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe") # os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os.path.join( driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe") os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx") # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx") options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit": elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome" options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome" # option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64" driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
option.add_extension("EasySpider/resources/app/XPathHelper.crx") # option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx") options.add_extension("EasySpider/resources/app/XPathHelper.crx")
else: else:
print("Unsupported platform") print("Unsupported platform")
@ -1843,20 +1978,25 @@ if __name__ == '__main__':
# options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置 # options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
# # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe" # # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
# driver_path = "./Chrome/chromedriver.exe" # driver_path = "./Chrome/chromedriver.exe"
elif os.path.exists(os.getcwd()+"/../ElectronJS"): elif os.path.exists(os.getcwd() + "/../ElectronJS"):
# 软件dev用 # 软件dev用
print("Finding chromedriver in EasySpider", print("Finding chromedriver in EasySpider",
os.getcwd()+"/ElectronJS") os.getcwd() + "/ElectronJS")
option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置 # option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置 options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe" driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
option.add_extension("../ElectronJS/XPathHelper.crx") # option.add_extension("../ElectronJS/XPathHelper.crx")
options.add_extension("../ElectronJS/XPathHelper.crx")
else: else:
options.binary_location = "./chrome.exe" # 指定chrome位置 options.binary_location = "./chrome.exe" # 指定chrome位置
# option.binary_location = "./chrome.exe" # 指定chrome位置
driver_path = "./chromedriver.exe" driver_path = "./chromedriver.exe"
option.add_extension("XPathHelper.crx") # option.add_extension("XPathHelper.crx")
options.add_extension("XPathHelper.crx")
option.add_experimental_option( # option.add_experimental_option(
# 'excludeSwitches', ['enable-automation']) # 以开发者模式
options.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式 'excludeSwitches', ['enable-automation']) # 以开发者模式
# user_data_dir = r'' # 注意没有Default # user_data_dir = r'' # 注意没有Default
@ -1875,97 +2015,147 @@ if __name__ == '__main__':
print("Config file path: " + print("Config file path: " +
c.config_folder + c.config_file_name) c.config_folder + c.config_file_name)
absolute_user_data_folder = config["absolute_user_data_folder"] absolute_user_data_folder = config["absolute_user_data_folder"]
print("\nAbsolute_user_data_folder:",
absolute_user_data_folder, "\n")
except: except:
pass pass
if c.user_data:
option.add_argument(
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
option.add_argument("--profile-directory=Default")
options.add_argument(
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
if c.headless:
print("Headless mode")
print("无头模式")
option.add_argument("--headless")
options.add_argument("--headless")
# options.add_argument( # options.add_argument(
# '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒 # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
option.add_argument( # option.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒 # "--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument( options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒 "--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument('-ignore-certificate-errors') options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors') options.add_argument('-ignore -ssl-errors')
option.add_argument('-ignore-certificate-errors') # option.add_argument('-ignore-certificate-errors')
option.add_argument('-ignore -ssl-errors') # option.add_argument('-ignore -ssl-errors')
if c.headless:
print("Headless mode")
print("无头模式")
# option.add_argument("--headless")
options.add_argument("--headless")
tmp_options = []
for id in c.ids:
tmp_options.append({"options": copy.deepcopy(options), "tmp_user_data_folder": ""})
if c.user_data:
tmp_user_folder_parent = os.path.join(os.getcwd(), "TempUserDataFolder")
if not os.path.exists(tmp_user_folder_parent):
os.mkdir(tmp_user_folder_parent)
characters = string.ascii_letters + string.digits
for i in range(len(c.ids)):
id = c.ids[i]
# 从字符集中随机选择字符构成字符串
random_string = ''.join(random.choice(characters) for i in range(10))
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
if os.path.exists(tmp_user_data_folder):
shutil.rmtree(tmp_user_data_folder)
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
# option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
# option.add_argument(
# f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
# option.add_argument("--profile-directory=Default")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
print(
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally说明有之前运行的Chrome实例没有正常关闭请关闭之前打开的所有Chrome实例后再运行程序即可。")
print(
"If you get an error Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally, it means that there is a Chrome instance that was not closed properly before, please close all Chrome instances that were opened before running the program.")
threads = [] threads = []
for i in c.id: for i in range(len(c.ids)):
# print(options) id = c.ids[i]
print("id: ", i) # option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
print("id: ", id)
if c.read_type == "remote": if c.read_type == "remote":
print("remote") print("remote")
content = requests.get( content = requests.get(
c.server_address + "/queryExecutionInstance?id=" + str(i)) c.server_address + "/queryExecutionInstance?id=" + str(id))
service = json.loads(content.text) # 加载服务信息 service = json.loads(content.text) # 加载服务信息
else: else:
print("local") print("local")
with open("execution_instances/" + str(i) + ".json", 'r', encoding='utf-8') as f: with open("execution_instances/" + str(id) + ".json", 'r', encoding='utf-8') as f:
content = f.read() content = f.read()
service = json.loads(content) # 加载服务信息 service = json.loads(content) # 加载服务信息
try:
print("Task Name:", service["name"]) print("Task Name:", service["name"])
print("任务名称:", service["name"]) print("任务名称:", service["name"])
except:
print(f"Cannot find task with id: {str(id)}, please check whether {str(id)}.json exists in the 'execution_instances' folder.")
print(f"未找到id为{str(id)}的任务,请检查'execution_instances'文件夹中是否存在{str(id)}.json文件。")
continue
try: try:
cloudflare = service["cloudflare"] cloudflare = service["cloudflare"]
except: except:
cloudflare = 0 cloudflare = 0
if cloudflare == 0: if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志 options.add_argument('log-level=3') # 隐藏日志
option.add_argument('log-level=3') # 隐藏日志 # option.add_argument('log-level=3') # 隐藏日志
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
print("Data path:", path)
options.add_experimental_option("prefs", { options.add_experimental_option("prefs", {
# 设置文件下载路径 # 设置文件下载路径
"download.default_directory": "Data/Task_" + str(i), "download.default_directory": path,
"download.prompt_for_download": False, # 禁止下载提示框 "download.prompt_for_download": False, # 禁止下载提示框
"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.directory_upgrade": True, "download.directory_upgrade": True,
"download.extensions_to_open": "applications/pdf", "download.extensions_to_open": "applications/pdf",
"plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
}) "safebrowsing_for_trusted_sources_enabled": False,
option.add_experimental_option("prefs", { "safebrowsing.enabled": False,
# 设置文件下载路径 'safebrowsing.disable_download_protection': True,
"download.default_directory": "Data/Task_" + str(i), 'profile.default_content_settings.popups': 0,
"download.prompt_for_download": False, # 禁止下载提示框
"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.directory_upgrade": True,
"download.extensions_to_open": "applications/pdf",
"plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF
}) })
# option.add_experimental_option("prefs", {
# # 设置文件下载路径
# "download.default_directory": path,
# "download.prompt_for_download": False, # 禁止下载提示框
# "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
# "download.directory_upgrade": True,
# "download.extensions_to_open": "applications/pdf",
# "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
# "safebrowsing_for_trusted_sources_enabled": False,
# "safebrowsing.enabled": False,
# 'safebrowsing.enabled': False,
# 'safebrowsing.disable_download_protection': True,
# 'profile.default_content_settings.popups': 0,
# })
try: try:
if service["environment"] == 1: if service["environment"] == 1:
option.add_experimental_option( # option.add_experimental_option(
'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览 # 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
options.add_experimental_option( options.add_experimental_option(
'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
except: except:
pass pass
browser_t = MyChrome( # browser_t = MyChrome(
options=options, chrome_options=option, executable_path=driver_path) # options=options, chrome_options=option, executable_path=driver_path)
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
elif cloudflare == 1: elif cloudflare == 1:
if sys.platform == "win32": if sys.platform == "win32":
options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器 options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
# options.add_argument("--auto-open-devtools-for-tabs") # options.add_argument("--auto-open-devtools-for-tabs")
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器 # options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
browser_t = MyUCChrome(options=options, driver_executable_path=driver_path) browser_t = MyUCChrome(
options=options, driver_executable_path=driver_path)
links = list(filter(isnotnull, service["links"].split("\n"))) links = list(filter(isnotnull, service["links"].split("\n")))
browser_t.execute_script('window.open("'+ links[0] +'","_blank");') # open page in new tab # open page in new tab
browser_t.execute_script(
'window.open("' + links[0] + '","_blank");')
time.sleep(5) # wait until page has loaded time.sleep(5) # wait until page has loaded
browser_t.switch_to.window(browser_t.window_handles[1]) # switch to new tab browser_t.switch_to.window(
browser_t.window_handles[1]) # switch to new tab
# browser_t = uc.Chrome() # browser_t = uc.Chrome()
else: else:
print("Cloudflare模式只支持Windows x64平台。") print("Cloudflare模式只支持Windows x64平台。")
@ -1974,27 +2164,27 @@ if __name__ == '__main__':
sys.exit() sys.exit()
event = Event() event = Event()
event.set() event.set()
thread = BrowserThread(browser_t, i, service, thread = BrowserThread(browser_t, id, service,
c.version, event, c.saved_file_name, config=config) c.version, event, c.saved_file_name, config=config, option=tmp_options[i])
print("Thread with task id: ", i, " is created") print("Thread with task id: ", id, " is created")
threads.append(thread) threads.append(thread)
thread.start() thread.start()
# Set the pause operation # Set the pause operation
# if sys.platform != "linux": # if sys.platform != "linux":
# time.sleep(3) # time.sleep(3)
# print("\n\n----------------------------------")
# print("正在运行任务长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码如果想恢复任务的执行请再次长按p键。")
# print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
# print("----------------------------------\n\n")
# Thread(target=check_pause, args=("p", event)).start() # Thread(target=check_pause, args=("p", event)).start()
# else: # else:
time.sleep(3) time.sleep(3)
press_time = {"duration": 0, "is_pressed": False} try:
pause_key = service["pauseKey"]
except:
pause_key = "p"
press_time = {"duration": 0, "is_pressed": False, "pause_key": pause_key}
print("\n\n----------------------------------") print("\n\n----------------------------------")
print( print(
"正在运行任务长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码如果想恢复任务的执行请再次长按p键。") "正在运行任务,长按键盘" + pause_key + "键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按" + pause_key + "键。")
print( print(
"Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.") "Running task, long press '" + pause_key + "' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press '" + pause_key + "' again.")
print("----------------------------------\n\n") print("----------------------------------\n\n")
# if cloudflare: # if cloudflare:
# print("过Cloudflare验证模式有时候会不稳定如果无法通过验证则需要隔几分钟重试一次或者可以更换新的用户信息文件夹再执行任务。") # print("过Cloudflare验证模式有时候会不稳定如果无法通过验证则需要隔几分钟重试一次或者可以更换新的用户信息文件夹再执行任务。")
@ -2002,7 +2192,8 @@ if __name__ == '__main__':
# 使用监听器监听键盘输入 # 使用监听器监听键盘输入
try: try:
if c.keyboard: if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener: with Listener(on_press=on_press_creator(press_time, event),
on_release=on_release_creator(event, press_time)) as listener:
listener.join() listener.join()
except: except:
pass pass
@ -2014,10 +2205,3 @@ if __name__ == '__main__':
for thread in threads: for thread in threads:
print() print()
thread.join() thread.join()
for thread in threads:
thread.browser.quit()
# print("Thread with task id: ", thread.id, " is closed")
print("程序已运行完成,请手动关闭此窗口。")
print(
"The program has finished running, please manually close this window.")

View File

@ -37,19 +37,21 @@ class MyChrome(webdriver.Chrome):
except Exception as e: except Exception as e:
print(e) print(e)
find_element = False find_element = False
# 遍历所有的 iframe 并点击里面的元素 # 遍历所有的 iframe 并查找里面的元素
for iframe in iframes: for iframe in iframes:
# 切换到 iframe # 切换到 iframe
super().switch_to.default_content() super().switch_to.default_content()
super().switch_to.frame(iframe) super().switch_to.frame(iframe)
self.iframe_env = True self.iframe_env = True
try: try:
# 在 iframe 中查找并点击元素 # 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value) element = super().find_element(by=by, value=value)
find_element = True find_element = True
except: except NoSuchElementException as e:
print("No such element found in the iframe") print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
# 完成操作后切回主文档 # 完成操作后切回主文档
# super().switch_to.default_content() # super().switch_to.default_content()
if find_element: if find_element:
@ -68,14 +70,14 @@ class MyChrome(webdriver.Chrome):
# 获取所有的 iframe # 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe") iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False find_element = False
# 遍历所有的 iframe 并点击里面的元素 # 遍历所有的 iframe 并找到里面的元素
for iframe in iframes: for iframe in iframes:
# 切换到 iframe # 切换到 iframe
try: try:
super().switch_to.default_content() super().switch_to.default_content()
super().switch_to.frame(iframe) super().switch_to.frame(iframe)
self.iframe_env = True self.iframe_env = True
# 在 iframe 中查找并点击元素 # 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value) elements = super().find_elements(by=by, value=value)
if len(elements) > 0: if len(elements) > 0:
@ -84,8 +86,10 @@ class MyChrome(webdriver.Chrome):
# super().switch_to.default_content() # super().switch_to.default_content()
if find_element: if find_element:
return elements return elements
except: except NoSuchElementException as e:
print("No such element found in the iframe") print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
if not find_element: if not find_element:
raise NoSuchElementException raise NoSuchElementException
else: else:
@ -117,19 +121,21 @@ if sys.platform != "darwin":
except Exception as e: except Exception as e:
print(e) print(e)
find_element = False find_element = False
# 遍历所有的 iframe 并点击里面的元素 # 遍历所有的 iframe 并找到里面的元素
for iframe in iframes: for iframe in iframes:
# 切换到 iframe # 切换到 iframe
super().switch_to.default_content() super().switch_to.default_content()
super().switch_to.frame(iframe) super().switch_to.frame(iframe)
self.iframe_env = True self.iframe_env = True
try: try:
# 在 iframe 中查找并点击元素 # 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value) element = super().find_element(by=by, value=value)
find_element = True find_element = True
except: except NoSuchElementException as e:
print("No such element found in the iframe") print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
# 完成操作后切回主文档 # 完成操作后切回主文档
# super().switch_to.default_content() # super().switch_to.default_content()
if find_element: if find_element:
@ -148,14 +154,14 @@ if sys.platform != "darwin":
# 获取所有的 iframe # 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe") iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False find_element = False
# 遍历所有的 iframe 并点击里面的元素 # 遍历所有的 iframe 并查找里面的元素
for iframe in iframes: for iframe in iframes:
# 切换到 iframe # 切换到 iframe
try: try:
super().switch_to.default_content() super().switch_to.default_content()
super().switch_to.frame(iframe) super().switch_to.frame(iframe)
self.iframe_env = True self.iframe_env = True
# 在 iframe 中查找并点击元素 # 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素 # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value) elements = super().find_elements(by=by, value=value)
if len(elements) > 0: if len(elements) > 0:
@ -164,8 +170,10 @@ if sys.platform != "darwin":
# super().switch_to.default_content() # super().switch_to.default_content()
if find_element: if find_element:
return elements return elements
except: except NoSuchElementException as e:
print("No such element found in the iframe") print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
if not find_element: if not find_element:
raise NoSuchElementException raise NoSuchElementException
else: else:

View File

@ -31,7 +31,7 @@ def lowercase_tags_in_xpath(xpath):
def on_press_creator(press_time, event): def on_press_creator(press_time, event):
def on_press(key): def on_press(key):
try: try:
if key.char == 'p': if key.char == press_time["pause_key"]:
if press_time["is_pressed"] == False: # 没按下p键时记录按下p键的时间 if press_time["is_pressed"] == False: # 没按下p键时记录按下p键的时间
press_time["duration"] = time.time() press_time["duration"] = time.time()
press_time["is_pressed"] = True press_time["is_pressed"] = True
@ -39,14 +39,14 @@ def on_press_creator(press_time, event):
duration = time.time() - press_time["duration"] duration = time.time() - press_time["duration"]
if duration > 2: if duration > 2:
if event._flag == False: if event._flag == False:
print("任务执行中,长按p键暂停执行。") print("任务执行中,长按" + press_time["pause_key"] + "键暂停执行。")
print("Task is running, long press 'p' to pause.") print("Task is running, long press '" + press_time["pause_key"] + "' to pause.")
# 设置Event的值为True使得线程b可以继续执行 # 设置Event的值为True使得线程b可以继续执行
event.set() event.set()
else: else:
# 设置Event的值为False使得线程b暂停执行 # 设置Event的值为False使得线程b暂停执行
print("任务已暂停,长按p键继续执行...") print("任务已暂停,长按" + press_time["pause_key"] + "键继续执行...")
print("Task paused, long press 'p' to continue...") print("Task paused, long press '" + press_time["pause_key"] + "' to continue...")
event.clear() event.clear()
press_time["duration"] = time.time() press_time["duration"] = time.time()
press_time["is_pressed"] = False press_time["is_pressed"] = False
@ -176,26 +176,36 @@ def write_to_csv(file_name, data, record):
f_csv.writerow(to_write) f_csv.writerow(to_write)
f.close() f.close()
def eval_repl(matchobj):
    """Evaluate the first capture group of a regex match and return it as text.

    The signature (match object in, string out) has the shape of a
    ``re.sub`` replacement callback.

    Args:
        matchobj: A regex match whose group 1 holds a Python expression.

    Returns:
        ``str()`` of the value the expression evaluates to.

    NOTE(review): this runs ``eval`` on text taken from the match; make sure
    the matched text never comes from an untrusted source.
    """
    # Echo the expression that is about to be evaluated.
    print(matchobj[1])
    # Deliberately no extra local names here: locals() is handed to eval(),
    # so the expression sees exactly ``matchobj`` and the module globals.
    return str(eval(matchobj[1], globals(), locals()))
def replace_field_values(orginal_text, outputParameters, browser=None):
    """Fill ``Field["name"]`` placeholders and ``eval("...")`` snippets in a text.

    Processing happens in two steps:

    1. Every ``Field["name"]`` is replaced by ``outputParameters["name"]``
       (an empty string when the key is missing or the value is ``None``).
    2. If the result contains ``eval(`` (case-insensitive), every
       ``eval("expr")`` / ``EVAL("expr")`` occurrence is replaced by
       ``str(eval(expr))``. Before that, every ``self.`` in the text is
       rewritten to ``browser.`` so expressions can use the ``browser``
       argument.

    Args:
        orginal_text: Text that may contain placeholders.
        outputParameters: Mapping from field name to its extracted value.
        browser: Object that expressions may reach as ``browser`` (or
            ``self``); only needed when the expressions use it.

    Returns:
        The substituted text, or ``orginal_text`` unchanged when any step
        raises (a hint is printed in that case).

    NOTE(review): ``eval`` executes whatever is inside ``eval("...")``; the
    text must come from a trusted task definition.
    """
    pattern = r'Field\["([^"]+)"\]'

    def field_value(match):
        # re.sub requires the callback to return str; field values may be
        # numbers or None, which previously raised TypeError and discarded
        # the whole substitution.
        value = outputParameters.get(match.group(1), '')
        return '' if value is None else str(value)

    try:
        replaced_text = re.sub(pattern, field_value, orginal_text)
        if re.search(r'eval\(', replaced_text, re.IGNORECASE):
            replaced_text = replaced_text.replace("self.", "browser.")
            pattern = re.compile(r'(?i)eval\("(.+?)"\)')
            # Replace the eval statements one at a time. eval() is called
            # directly in this function (not inside a lambda) so that the
            # expression can see the local name ``browser``.
            while True:
                match = pattern.search(replaced_text)
                if not match:
                    break
                # Evaluate the expression and turn the result into text.
                eval_replaced_text = str(eval(match.group(1)))
                # Swap the whole eval("...") statement for its result.
                replaced_text = replaced_text.replace(match.group(0), eval_replaced_text)
    except Exception:
        print("eval替换失败请检查eval语句是否正确。| Failed to replace eval, please check if the eval statement is correct.")
        replaced_text = orginal_text
    return replaced_text
def readCode(code):
    """Return the code to run, loading it from a file for ``outside:`` references.

    Args:
        code: Either literal code, or ``outside:<file>`` where ``<file>`` is
            resolved against the current working directory (an absolute
            path is used as is).

    Returns:
        The file's content (read as UTF-8, a leading BOM is dropped) for an
        ``outside:`` reference, otherwise ``code`` unchanged.

    Raises:
        OSError: If the referenced file cannot be opened.
    """
    prefix = "outside:"
    if code.startswith(prefix):
        # Strip surrounding whitespace so that "outside: myCode.py" or a
        # trailing newline from a text input still resolves to the file.
        relative_name = code[len(prefix):].strip()
        file_name = os.path.join(os.path.abspath("./"), relative_name)
        with open(file_name, 'r', encoding='utf-8-sig') as file_obj:
            code = file_obj.read()
    return code
def write_to_json(file_name, data, types, record, keys): def write_to_json(file_name, data, types, record, keys):
keys = list(keys) keys = list(keys)
# Prepare empty list for data # Prepare empty list for data

View File

@ -0,0 +1,57 @@
"""
这是一个示例代码文件,可以直接在这里写Python代码,然后在程序中的exec操作中调用。如果此文件名称为myCode.py,请将此文件放置在EasySpider程序目录下(和Data/文件夹同级),那么在程序中的exec操作中可以直接写outside:myCode.py来调用此文件中的代码。示例:
1. 用self.browser表示当前操作的浏览器,可直接用selenium的API进行操作,如self.browser.find_element(By.CSS_SELECTOR, "body").send_keys(Keys.END)即可滚动到页面最下方。
2. 自定义一个全局变量:self.myVar = 1
3. 操纵上面定义的全局变量:self.myVar = self.myVar + 1
4. 打印上面定义的全局变量:print(self.myVar)
5. 将自定义变量的值赋值为某个字段提取的值:self.myVar = self.outputParameters["字段名"]
6. 修改某个字段提取的值:self.outputParameters["字段名"] = "新值"
对于更加复杂的操作,请直接下载源代码并编译执行。
This is a sample code snippet file. You can directly write Python code here, and then call it in the program using an `exec` operation. If this file is named myCode.py, please place this file under the EasySpider program directory (at the same level as the Data/ folder). Then, in the program's `exec` operation, you can directly write outside:myCode.py to invoke the code from this file. Examples:
1. Use self.browser to refer to the current browser being operated on. You can directly utilize the selenium API to perform actions. For instance, self.browser.find_element(By.CSS_SELECTOR, "body").send_keys(Keys.END) will scroll to the bottom of the page.
2. Define a global variable: self.myVar = 1
3. Manipulate the above-defined global variable: self.myVar = self.myVar + 1
4. Print the above-defined global variable: print(self.myVar)
5. Assign a value to the custom variable from a value extracted for some field: self.myVar = self.outputParameters["field name"]
6. Modify the value extracted for some field: self.outputParameters["field name"] = "new value"
For more complex operations, please download the source code and compile it for execution.
"""
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
# 导包 | Import packages
from selenium.common.exceptions import ElementClickInterceptedException
# 定义一个函数 | Define a function
def test(n = 0):
    """Print every even number in ``range(0, n)`` and return ``"test"``.

    Args:
        n: Exclusive upper bound of the numbers to look at (default 0,
            which prints nothing).

    Returns:
        The literal string ``"test"``.
    """
    # Stepping by 2 from 0 visits exactly the even values below n.
    for even in range(0, n, 2):
        print(even)
    return "test"
# Exception handling demo
try:
    # Locate an element by XPath and click it in the browser.
    # An IDE may flag `self` and `By` as undefined here; that is expected,
    # because this snippet is embedded into the host program, which supplies them.
    element = self.browser.find_element(
        By.XPATH,
        "//*[contains(@class, 'LeftSide_menu_list__qXCeM')]/div[1]/a[1]",
    )
    element.click()
    print("点击成功|Click success")
except ElementClickInterceptedException:
    # The element is covered by something else, so the click failed.
    print("元素被遮挡,无法点击|The element is blocked and cannot be clicked")
except Exception as err:
    # Report any other exception.
    print("发生了一个异常|An exception occurred", err)
finally:
    # Runs whether or not the click succeeded: exercise a custom attribute
    # on `self`, dump the extracted fields and call the helper defined above.
    self.a = 1
    print("a = ", self.a)
    self.a += 1
    print("a = ", self.a)
    print("All parameters:", self.outputParameters)
    print(test(3))
    print("执行完毕|Execution completed")

Binary file not shown.

Binary file not shown.

View File

@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"} {"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}

View File

@ -15,6 +15,7 @@
"formidable": "^3.5.0", "formidable": "^3.5.0",
"http": "^0.0.1-security", "http": "^0.0.1-security",
"multer": "^1.4.5-lts.1", "multer": "^1.4.5-lts.1",
"node-abi": "^3.52.0",
"node-window-manager": "^2.2.4", "node-window-manager": "^2.2.4",
"selenium-webdriver": "^4.16.0", "selenium-webdriver": "^4.16.0",
"ws": "^8.12.0", "ws": "^8.12.0",
@ -3914,9 +3915,9 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/node-abi": { "node_modules/node-abi": {
"version": "3.45.0", "version": "3.52.0",
"dev": true, "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.52.0.tgz",
"license": "MIT", "integrity": "sha512-JJ98b02z16ILv7859irtXn4oUaFWADtvkzy2c0IAatNVX2Mc9Yoh8z6hZInn3QwvMEYhHuQloYi+TTQy67SIdQ==",
"dependencies": { "dependencies": {
"semver": "^7.3.5" "semver": "^7.3.5"
}, },
@ -4814,7 +4815,6 @@
}, },
"node_modules/semver": { "node_modules/semver": {
"version": "7.5.3", "version": "7.5.3",
"dev": true,
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"lru-cache": "^6.0.0" "lru-cache": "^6.0.0"
@ -4834,7 +4834,6 @@
}, },
"node_modules/semver/node_modules/lru-cache": { "node_modules/semver/node_modules/lru-cache": {
"version": "6.0.0", "version": "6.0.0",
"dev": true,
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"yallist": "^4.0.0" "yallist": "^4.0.0"
@ -5665,7 +5664,6 @@
}, },
"node_modules/yallist": { "node_modules/yallist": {
"version": "4.0.0", "version": "4.0.0",
"dev": true,
"license": "ISC" "license": "ISC"
}, },
"node_modules/yargs": { "node_modules/yargs": {

View File

@ -37,6 +37,7 @@
"formidable": "^3.5.0", "formidable": "^3.5.0",
"http": "^0.0.1-security", "http": "^0.0.1-security",
"multer": "^1.4.5-lts.1", "multer": "^1.4.5-lts.1",
"node-abi": "^3.52.0",
"node-window-manager": "^2.2.4", "node-window-manager": "^2.2.4",
"selenium-webdriver": "^4.16.0", "selenium-webdriver": "^4.16.0",
"ws": "^8.12.0", "ws": "^8.12.0",

View File

@ -0,0 +1 @@
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

View File

@ -48,7 +48,7 @@ def copy_folder(source_folder, destination_folder):
def get_chrome_version(): def get_chrome_version():
version = "115" version = "120"
if sys.platform == "win32": if sys.platform == "win32":
version_re = re.compile(r"^[1-9]\d*\.\d*.\d*") version_re = re.compile(r"^[1-9]\d*\.\d*.\d*")
try: try:

View File

@ -12,7 +12,8 @@
"justMyCode": false, "justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[52]", "--headless", "0", "--user_data", "1", "--keyboard", "1"] // "args": ["--ids", "[1]", "--headless", "0", "--user_data", "1", "--keyboard", "1"]
"args": "--ids '[3]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
} }
] ]
} }