执行操作前可设置等待指定元素出现

This commit is contained in:
NaiboWang-Alienware 2023-07-16 02:45:26 +08:00
parent 92d6315a22
commit 528500f795
12 changed files with 311 additions and 223 deletions

View File

@ -85,6 +85,10 @@ let flowchart_window = null;
let current_handle = null;
let old_handles = [];
let handle_pairs = {};
let socket_window = null;
let socket_start = null;
let socket_flowchart = null;
let invoke_window = null;
// var ffi = require('ffi-napi');
// var libm = ffi.Library('libm', {
@ -233,27 +237,7 @@ async function beginInvoke(msg, ws) {
break;
}
}
// .then(function (element) {
// console.log("element", element, handles);
// element.sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
// exit = true;
// }, function (error) {
// console.log("error", error);
// len = len - 1;
// if (len == 0) {
// exit = true;
// }
// }
// );
}
// let handles = driver.getAllWindowHandles();
// driver.switchTo().window(handles[handles.length - 1]);
// driver.findElement(By.xpath(msg.message.xpath)).sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
// robot.keyTap("a", "control");
// robot.keyTap("backspace");
// robot.typeString(keyInfo);
// robot.keyTap("shift");
// robot.keyTap("shift");
} else if (msg.type == 3) {
try {
if (msg.from == 0) {
@ -368,6 +352,11 @@ async function beginInvoke(msg, ws) {
} catch {
console.log("open devtools error");
}
try{
invoke_window.openDevTools();
} catch {
console.log("open devtools error");
}
} else if (msg.type == 7) {
// 获得当前页面Cookies
try{
@ -383,9 +372,6 @@ async function beginInvoke(msg, ws) {
const WebSocket = require('ws');
const {all} = require("express/lib/application");
let socket_window = null;
let socket_start = null;
let socket_flowchart = null;
let wss = new WebSocket.Server({port: websocket_port});
wss.on('connection', function (ws) {
ws.on('message', async function (message, isBinary) {
@ -521,7 +507,7 @@ function handleOpenBrowser(event, lang = "en", user_data_folder = "", mobile = f
}
function handleOpenInvoke(event, lang = "en") {
const window = new BrowserWindow({icon: iconPath});
invoke_window = new BrowserWindow({icon: iconPath});
let url = "";
language = lang;
if (lang == "en") {
@ -530,10 +516,10 @@ function handleOpenInvoke(event, lang = "en") {
url = server_address + `/taskGrid/taskList.html?type=1&wsport=${websocket_port}&backEndAddressServiceWrapper=` + server_address + "&lang=zh";
}
// and load the index.html of the app.
window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' });
window.maximize();
invoke_window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' });
invoke_window.maximize();
mainWindow.hide();
window.on('close', function (event) {
invoke_window.on('close', function (event) {
mainWindow.show();
});
}

View File

@ -8,6 +8,10 @@
margin-top: 10px;
}
label{
margin-left: 2px;
}
div.node {
height: 45px;
width: 150px;

View File

@ -580,7 +580,14 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
</div>
</div>
<div style="margin-top:5px">
<label>Seconds <b>after executed</b> (Can be set to a decimal, such as 0.5):</label>
<label>Wait for the following elements to appear <b>before</b> executing:</label>
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
placeholder="Enter the XPath of the element to wait for, leave blank to skip waiting"></textarea>
<label style="margin-top:5px">In which iframe is the element located? Set to 0 if the element is not inside an iframe:</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
<label style="margin-top:5px">Maximum waiting time for element appearance (in seconds):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
<label style="margin-top:5px">Wait seconds <b>after</b> execution (can set decimal values, e.g., 0.5):</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
<label>Wait Type</label>
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">

View File

@ -625,31 +625,7 @@ document.oncontextmenu = function() {
return false;
} //屏蔽右键菜单
//删除元素
/**
 * Page-wide keyboard shortcuts.
 *
 *  - Delete (keyCode 46): remove the currently selected node via
 *    deleteElement(), only when `nowNode` is set.
 *  - Ctrl/Cmd + S (83): click the #save button.
 *  - F5 (116): reload the page.
 *  - F12 (123): send a type-6 message to the local WebSocket backend
 *    (logged here as "open devtools").
 *
 * @param {KeyboardEvent} e - the keydown event.
 * @returns {boolean|undefined} true after triggering save, otherwise undefined.
 */
document.onkeydown = function (e) {
    if (nowNode != null && e.keyCode === 46) {
        deleteElement();
    } else {
        // Fall back through the legacy key properties, as the original did.
        const currKey = e.keyCode || e.which || e.charCode;
        if (currKey === 83 && (e.ctrlKey || e.metaKey)) {
            // Ctrl+S / Cmd+S saves the service.
            $('#save').click();
            return true;
        } else if (currKey === 116) {
            location.reload();
        } else if (currKey === 123) {
            console.log("打开devtools");
            const command = new WebSocket("ws://localhost:" + getUrlParam("wsport"));
            command.onopen = function () {
                // Message type 6: request to open DevTools.
                command.send(JSON.stringify({ type: 6 }));
                // One-shot command socket: close it once the message is queued
                // so repeated F12 presses do not accumulate open connections.
                command.close();
            };
        }
    }
};
function inputDelete(e) {
if (e.keyCode == 46) {

View File

@ -580,9 +580,16 @@
</div>
</div>
<div style="margin-top:5px">
<label><b>执行后</b>等待秒数所有等待时间均可设置为小数如0.5</label>
<label>操作<b>执行前</b>等待以下元素出现:</label>
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
placeholder="填写要等待出现元素的XPath不填写则不等待"></textarea>
<label style="margin-top:5px">要等待的元素在页面第几个iframe中0表示元素不在iframe中</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
<label style="margin-top:5px">元素出现的最长等待时间(秒):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
<label style="margin-top:5px">操作<b>执行后</b>等待秒数所有等待时间均可设置为小数如0.5</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
<label>等待类型</label>
<label style="margin-top:5px">等待类型</label>
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">
<option value = 0>固定等待设置等10秒就等10秒</option>
<option value = 1>随机等待设置等10秒会随机等10×0.5 - 10 × 1.5 秒)</option>

View File

@ -75,3 +75,34 @@ function isValidMySQLTableName(tableName) {
return pattern.test(tableName);
}
/**
 * Page-wide keyboard shortcuts.
 *
 *  - Delete (keyCode 46): remove the currently selected node via
 *    deleteElement(), only when a `nowNode` variable exists and is set.
 *  - Ctrl/Cmd + S (83): click the #save button.
 *  - F5 (116): reload the page.
 *  - F12 (123): send a type-6 message to the local WebSocket backend
 *    (logged here as "open devtools").
 *
 * @param {KeyboardEvent} e - the keydown event.
 * @returns {boolean|undefined} true after triggering save, otherwise undefined.
 */
document.onkeydown = function (e) {
    // `nowNode` may not be declared on every page that installs this handler,
    // so reading it can throw a ReferenceError. Treat that as "nothing
    // selected" instead of logging an error on every single key press.
    let selectedNode = null;
    try {
        selectedNode = nowNode;
    } catch (err) {
        if (!(err instanceof ReferenceError)) {
            console.log(err);
        }
    }
    if (selectedNode && e.keyCode === 46) {
        deleteElement();
    } else {
        // Fall back through the legacy key properties, as the original did.
        const currKey = e.keyCode || e.which || e.charCode;
        if (currKey === 83 && (e.ctrlKey || e.metaKey)) {
            // Ctrl+S / Cmd+S saves the service.
            $('#save').click();
            return true;
        } else if (currKey === 116) {
            location.reload();
        } else if (currKey === 123) {
            console.log("打开devtools");
            const command = new WebSocket("ws://localhost:" + getUrlParam("wsport"));
            command.onopen = function () {
                // Message type 6: request to open DevTools.
                command.send(JSON.stringify({ type: 6 }));
                // One-shot command socket: close it once the message is queued
                // so repeated F12 presses do not accumulate open connections.
                command.close();
            };
        }
    }
};

View File

@ -146,6 +146,9 @@ function addParameters(t) {
beforeJSWaitTime: 0, //执行前js等待时间
afterJS: "", //执行后执行的js
afterJSWaitTime: 0, //执行后js等待时间
waitElement: "", //等待元素
waitElementTime: 10, //等待元素时间
waitElementIframeIndex: 0, //等待元素在第几个iframe中
}; //公共参数处理
if (t.option == 1) {
t["parameters"]["url"] = "about:blank";

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":183,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/16/2023, 1:55:02 AM","update_time":"7/16/2023, 2:02:09 AM","version":"0.3.6","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n  ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"//*[@id=\"service-2017\"]/div[1]/ol/li[1]","waitElementTime":10,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"/html/body/div[4]/div[1]","allXPaths":["/html/body/div[4]/div[1]","//div[contains(., 
'')]","//DIV[@class='w']","/html/body/div[last()-6]/div"],"exampleValues":[{"num":0,"value":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n  ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"unique_index":"7c04qey9fkllk4b56jd","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//div[123]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[]}}]}

View File

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[84]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
"args": ["--id", "[54]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
}
]
}

View File

@ -1,5 +1,28 @@
# -*- coding: utf-8 -*-
# import atexit
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
from commandline_config import Config
import os
import csv
from openpyxl import load_workbook, Workbook
import random
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from pynput.keyboard import Key, Listener
from datetime import datetime
import io # 遇到错误退出时应执行的代码
import json
@ -19,39 +42,16 @@ from lxml import etree
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
# import undetected_chromedriver as uc
from pynput.keyboard import Key, Listener
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import random
# import pandas as pd
from openpyxl import load_workbook, Workbook
# import numpy
import csv
import os
from commandline_config import Config
# import pytesseract
from PIL import Image
# import uuid
from threading import Thread, Event
from myChrome import MyChrome
if sys.platform != "darwin":
from myChrome import MyUCChrome
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config):
Thread.__init__(self)
@ -83,7 +83,8 @@ class BrowserThread(Thread):
if not os.path.exists("Data/Task_" + str(i)):
os.mkdir("Data/Task_" + str(i))
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName):
os.mkdir("Data/Task_" + str(i) + "/" + self.saveName) # 创建保存文件夹用来保存截图
os.mkdir("Data/Task_" + str(i) + "/" +
self.saveName) # 创建保存文件夹用来保存截图
self.getDataStep = 0
self.startSteps = 0
try:
@ -94,11 +95,15 @@ class BrowserThread(Thread):
except:
pass
if self.startSteps != 0:
print("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为", self.startSteps, "条。")
print("In this mode, task ID", self.id, "will start from the last step, before we already collected", self.startSteps, " items.")
print("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
self.startSteps, "条。")
print("In this mode, task ID", self.id,
"will start from the last step, before we already collected", self.startSteps, " items.")
else:
print("此模式下任务ID", self.id, "将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
print("In this mode, task ID", self.id, "will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
print("此模式下任务ID", self.id,
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
print("In this mode, task ID", self.id,
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
stealth_path = driver_path[:driver_path.find(
"chromedriver")] + "stealth.min.js"
with open(stealth_path, 'r') as f:
@ -181,7 +186,8 @@ class BrowserThread(Thread):
except:
self.outputParametersTypes.append("text")
try:
self.outputParametersRecord.append(bool(para["recordASField"]))
self.outputParametersRecord.append(
bool(para["recordASField"]))
except:
self.outputParametersRecord.append(True)
# 文件叠加的时候不添加表头
@ -203,11 +209,19 @@ class BrowserThread(Thread):
iframe = node["parameters"]["iframe"]
except:
node["parameters"]["iframe"] = False
try:
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
node["parameters"]["xpath"])
except:
pass
# Normalize the "wait for element" parameters. Each one is defaulted on its
# own: a task saved without waitElementIframeIndex (or with an invalid value)
# must keep its configured waitElement XPath instead of having it wiped.
wait_params = node["parameters"]
wait_params.setdefault("waitElement", "")
try:
    wait_params["waitElementTime"] = float(wait_params["waitElementTime"])
except (KeyError, TypeError, ValueError):
    wait_params["waitElementTime"] = 10  # default timeout in seconds
try:
    wait_params["waitElementIframeIndex"] = int(
        wait_params["waitElementIframeIndex"])
except (KeyError, TypeError, ValueError):
    wait_params["waitElementIframeIndex"] = 0  # 0 = element is not inside an iframe
if node["option"] == 1: # 打开网页操作
try:
cookies = node["parameters"]["cookies"]
@ -216,8 +230,10 @@ class BrowserThread(Thread):
if node["option"] == 2: # 点击操作
if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5":
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
print("您的任务版本号为" + self.task_version + "循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
print("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 3: # 提取数据操作
node["parameters"]["recordASField"] = 0
paras = node["parameters"]["paras"]
@ -231,7 +247,8 @@ class BrowserThread(Thread):
except:
para["iframe"] = False
try:
para["relativeXPath"] = lowercase_tags_in_xpath(para["relativeXPath"])
para["relativeXPath"] = lowercase_tags_in_xpath(
para["relativeXPath"])
except:
pass
try:
@ -258,8 +275,10 @@ class BrowserThread(Thread):
elif node["option"] == 7: # 移动到元素
if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5":
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
print("您的任务版本号为" + self.task_version + "循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
node["parameters"]["xpath"] = ""
print("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
def readFromExcel(self):
if self.inputExcel == "":
@ -267,8 +286,10 @@ class BrowserThread(Thread):
try:
workbook = load_workbook(self.inputExcel)
except:
print("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确", os.path.abspath(self.inputExcel))
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ", os.path.abspath(self.inputExcel))
print("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确",
os.path.abspath(self.inputExcel))
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ",
os.path.abspath(self.inputExcel))
time.sleep(5)
return 0
@ -365,14 +386,18 @@ class BrowserThread(Thread):
# 写入数据
if self.outputFormat == "csv" or self.outputFormat == "txt":
file_name = "Data/Task_" + \
str(self.id) + "/" + self.saveName + '.' + self.outputFormat
write_to_csv(file_name, self.OUTPUT, self.outputParametersRecord)
str(self.id) + "/" + self.saveName + \
'.' + self.outputFormat
write_to_csv(file_name, self.OUTPUT,
self.outputParametersRecord)
elif self.outputFormat == "xlsx":
file_name = "Data/Task_" + \
str(self.id) + "/" + self.saveName + '.xlsx'
write_to_excel(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
write_to_excel(
file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
elif self.outputFormat == "mysql":
self.mysql.write_to_mysql(self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
self.mysql.write_to_mysql(
self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
self.OUTPUT = []
self.log = ""
@ -403,10 +428,12 @@ class BrowserThread(Thread):
i = 0
while True:
# newBodyText = self.browser.page_source
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=para["iframe"]).text
newBodyText = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=para["iframe"]).text
if newBodyText == bodyText:
print("页面已检测不到新内容,停止滚动。")
print("No new content detected on the page, stop scrolling.")
print(
"No new content detected on the page, stop scrolling.")
break
else:
bodyText = newBodyText
@ -493,13 +520,15 @@ class BrowserThread(Thread):
output = exec(code)
except Exception as e:
print("执行下面的代码时出错:" + code, ",错误为:", e)
print("Error executing the following code:" + code, ", error is:", e)
print("Error executing the following code:" +
code, ", error is:", e)
elif int(codeMode) == 6:
try:
output = eval(code)
except Exception as e:
print("获得下面的代码返回值时出错:" + code, ",错误为:", e)
print("Error executing and getting return value the following code:" + code, ", error is:", e)
print(
"Error executing and getting return value the following code:" + code, ", error is:", e)
elif int(codeMode) == 1:
self.recordLog("Execute System Call:" + code)
self.recordLog("执行系统命令:" + code)
@ -531,7 +560,8 @@ class BrowserThread(Thread):
max_wait_time = int(paras["waitTime"])
if codeMode == 2: # 使用循环的情况下传入的clickPath就是实际的xpath
try:
loopPath = replace_field_values(loopPath, self.outputParameters)
loopPath = replace_field_values(
loopPath, self.outputParameters)
elements = self.browser.find_elements(
By.XPATH, loopPath, iframe=paras["iframe"])
element = elements[index]
@ -553,7 +583,8 @@ class BrowserThread(Thread):
# print("The return value of operation <" + node["title"] + "> is: " + output)
self.outputParameters[node["title"]] = output
if recordASField:
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
line = new_line(self.outputParameters,
self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line)
def switchSelect(self, para, loopValue):
@ -566,7 +597,8 @@ class BrowserThread(Thread):
optionValue = loopValue.split("~")[index - 1]
except:
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
print("Failed to get value, maybe because the index is out of range, will use the entire text value")
print(
"Failed to get value, maybe because the index is out of range, will use the entire text value")
else:
optionValue = loopValue
optionMode = 1
@ -632,10 +664,35 @@ class BrowserThread(Thread):
def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
node = self.procedure[nodeId]
WebDriverWait(self.browser, 10).until
# 等待元素出现才进行操作10秒内未出现则报错
(EC.visibility_of_element_located(
(By.XPATH, node["parameters"]["xpath"])))
# WebDriverWait(self.browser, 10).until
# # 等待元素出现才进行操作10秒内未出现则报错
# (EC.visibility_of_element_located(
# (By.XPATH, node["parameters"]["xpath"])))
# Optional pre-step: wait until the configured element is present in the page
# (or inside the N-th iframe) before running this node's operation. Any failure
# is reported and execution continues with the operation itself.
waitElement = ""  # bound before the try so the except handler can always read it
waitSwitchedToIframe = False
try:
    if node["parameters"].get("waitElement", "") != "":
        waitElement = replace_field_values(
            node["parameters"]["waitElement"], self.outputParameters)
        waitElementTime = float(
            node["parameters"].get("waitElementTime", 10))
        # The parameter is stored under "waitElementIframeIndex"; reading the
        # non-existent "waitElementInIframe" key raised KeyError on every
        # call, so the wait was never actually performed.
        waitElementIframeIndex = int(
            node["parameters"].get("waitElementIframeIndex", 0))
        print("等待元素出现:", waitElement)
        print("Waiting for element to appear:", waitElement)
        if waitElementIframeIndex > 0:
            # Iframe indexes are 1-based in the task definition (0 = no iframe).
            iframes = self.browser.find_elements(
                By.CSS_SELECTOR, "iframe", iframe=False)
            self.browser.switch_to.frame(iframes[waitElementIframeIndex - 1])
            waitSwitchedToIframe = True
        WebDriverWait(self.browser, waitElementTime).until(
            EC.presence_of_element_located((By.XPATH, waitElement))
        )
except Exception as e:
    if waitElement != "":
        print("等待元素出现超时:", waitElement, ",将继续执行。")
        print("Timeout waiting for element to appear:",
              waitElement, ", will continue to execute.")
        print(e)
        self.recordLog("Wait element not found")
    else:
        # Failure happened before the XPath was resolved; still surface it.
        print(e)
finally:
    # Always leave the iframe again, including when the wait timed out, so the
    # following operation starts from the top-level document.
    if waitSwitchedToIframe:
        try:
            self.browser.switch_to.default_content()
        except Exception as e:
            print(e)
# 根据不同选项执行不同操作
if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作
@ -714,7 +771,8 @@ class BrowserThread(Thread):
continue
elif tType == 2: # 当前页面包含元素
try:
xpath = replace_field_values(cnode["parameters"]["value"], self.outputParameters)
xpath = replace_field_values(
cnode["parameters"]["value"], self.outputParameters)
if self.browser.find_element(By.XPATH, xpath, iframe=cnode["parameters"]["iframe"]):
executeBranchId = i
break
@ -722,7 +780,8 @@ class BrowserThread(Thread):
continue
elif tType == 3: # 当前循环元素包括文本
try:
value = replace_field_values(cnode["parameters"]["value"], self.outputParameters)
value = replace_field_values(
cnode["parameters"]["value"], self.outputParameters)
if loopElement.text.find(value) >= 0:
executeBranchId = i
break
@ -730,7 +789,8 @@ class BrowserThread(Thread):
continue
elif tType == 4: # 当前循环元素包括元素
try:
xpath = replace_field_values(cnode["parameters"]["value"][1:], self.outputParameters)
xpath = replace_field_values(
cnode["parameters"]["value"][1:], self.outputParameters)
if loopElement.find_element(By.XPATH, xpath):
executeBranchId = i
break
@ -782,7 +842,8 @@ class BrowserThread(Thread):
finished = False
# newBodyText = self.browser.page_source
# newBodyText = self.browser.find_element(By.XPATH, "//body").text
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
newBodyText = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
if node["parameters"]["exitCount"] == 0:
if newBodyText == bodyText: # 如果页面内容无变化
print("页面已检测不到新内容,停止循环。")
@ -790,7 +851,6 @@ class BrowserThread(Thread):
finished = True
break
else:
if node["parameters"]["exitCount"] == 0:
print("检测到页面变化,继续循环。")
print("Page changed detected, continue loop.")
bodyText = newBodyText
@ -1150,7 +1210,8 @@ class BrowserThread(Thread):
replaced_text = replaced_text.split("~")[index - 1]
except:
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
print("Failed to get value, maybe because the index is out of range, will use the entire text value")
print(
"Failed to get value, maybe because the index is out of range, will use the entire text value")
textbox.send_keys(replaced_text)
if value.lower().find("<enter>") >= 0:
textbox.send_keys(Keys.ENTER)
@ -1226,7 +1287,8 @@ class BrowserThread(Thread):
pass
except Exception as e:
print("点击元素失败:" + path, "请尝试将点击类型改为JavaScript点击后重试。")
print("Failed to click element:" + path, ", please try to change the click type to JavaScript Click.")
print("Failed to click element:" + path,
", please try to change the click type to JavaScript Click.")
print(e)
self.Log(e)
self.recordLog(str(e))
@ -1374,7 +1436,8 @@ class BrowserThread(Thread):
# 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L')
temp_name = "OCR_" + str(time.time()) + ".png"
location = "Data/Task_" + str(self.id) + "/" + self.saveName + "/" + temp_name
location = "Data/Task_" + \
str(self.id) + "/" + self.saveName + "/" + temp_name
image.save(location)
ocr = DdddOcr()
with open(location, 'rb') as f:
@ -1498,7 +1561,8 @@ class BrowserThread(Thread):
content = pageHTML.xpath(full_path)
except:
content = []
elif not relativeXPath.startswith("/"): # 如果是id()这种形式,不需要包/html/body
# 如果是id()这种形式,不需要包/html/body
elif not relativeXPath.startswith("/"):
try:
content = loopElementHTML.xpath(xpath)
except:
@ -1507,7 +1571,8 @@ class BrowserThread(Thread):
content = loopElementHTML.xpath(
"/html/body/" + loopElementHTML[0][0].tag + xpath)
else:
if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body
# 如果是id()或(//div)[1]这种形式,不需要包/html/body
if xpath.find("/body") < 0 and xpath.startswith("/"):
xpath = "/html/body" + xpath
content = pageHTML.xpath(xpath)
if len(content) > 0:
@ -1517,7 +1582,8 @@ class BrowserThread(Thread):
for result in content if result.strip())
if p["nodeType"] == 2:
base_url = self.browser.current_url
content = urljoin(base_url, content) # 合并链接相对路径为绝对路径
# 合并链接相对路径为绝对路径
content = urljoin(base_url, content)
else:
content = p["default"]
if not self.dataNotFoundKeys[p["name"]]:
@ -1642,7 +1708,8 @@ class BrowserThread(Thread):
self.execute_code(
2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS
if para["recordASField"] > 0:
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
line = new_line(self.outputParameters,
self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line)
# rt.end()
@ -1675,8 +1742,10 @@ if __name__ == '__main__':
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# MacOS需要用option而不是options
option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
option.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# # MacOS需要用option而不是options
@ -1684,7 +1753,8 @@ if __name__ == '__main__':
# driver_path = os.getcwd()+ "/chromedriver_mac64"
print(driver_path)
if c.config_folder == "":
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
c.config_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider",
@ -1780,6 +1850,11 @@ if __name__ == '__main__':
options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒
# Ask Chrome to ignore TLS certificate errors, for both option objects used by
# this script. The original '-ignore -ssl-errors' contained a space and was
# therefore handed to Chrome as a single malformed switch.
# NOTE(review): confirm that Chrome honors --ignore-ssl-errors at all;
# --ignore-certificate-errors is the switch that is known to take effect.
for chrome_opts in (options, option):
    chrome_opts.add_argument('--ignore-certificate-errors')
    chrome_opts.add_argument('--ignore-ssl-errors')
threads = []
for i in c.id:
# print(options)
@ -1876,7 +1951,6 @@ if __name__ == '__main__':
# print("您的操作系统不支持暂停功能。")
# print("Your operating system does not support the pause function.")
# print("线程长度:", len(threads) )
for thread in threads:

View File

@ -3,7 +3,6 @@ requests==2.31.0
selenium==4.5.0
pyinstaller==5.9.0
Pillow==9.4.0
pytesseract==0.3.10
openpyxl==3.1.2
pymysql==1.1.0
lxml==4.9.2