执行操作前可设置等待指定元素出现

This commit is contained in:
NaiboWang-Alienware 2023-07-16 02:45:26 +08:00
parent 92d6315a22
commit 528500f795
12 changed files with 311 additions and 223 deletions

View File

@ -85,6 +85,10 @@ let flowchart_window = null;
let current_handle = null; let current_handle = null;
let old_handles = []; let old_handles = [];
let handle_pairs = {}; let handle_pairs = {};
let socket_window = null;
let socket_start = null;
let socket_flowchart = null;
let invoke_window = null;
// var ffi = require('ffi-napi'); // var ffi = require('ffi-napi');
// var libm = ffi.Library('libm', { // var libm = ffi.Library('libm', {
@ -233,27 +237,7 @@ async function beginInvoke(msg, ws) {
break; break;
} }
} }
// .then(function (element) {
// console.log("element", element, handles);
// element.sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
// exit = true;
// }, function (error) {
// console.log("error", error);
// len = len - 1;
// if (len == 0) {
// exit = true;
// }
// }
// );
} }
// let handles = driver.getAllWindowHandles();
// driver.switchTo().window(handles[handles.length - 1]);
// driver.findElement(By.xpath(msg.message.xpath)).sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
// robot.keyTap("a", "control");
// robot.keyTap("backspace");
// robot.typeString(keyInfo);
// robot.keyTap("shift");
// robot.keyTap("shift");
} else if (msg.type == 3) { } else if (msg.type == 3) {
try { try {
if (msg.from == 0) { if (msg.from == 0) {
@ -368,6 +352,11 @@ async function beginInvoke(msg, ws) {
} catch { } catch {
console.log("open devtools error"); console.log("open devtools error");
} }
try{
invoke_window.openDevTools();
} catch {
console.log("open devtools error");
}
} else if (msg.type == 7) { } else if (msg.type == 7) {
// 获得当前页面Cookies // 获得当前页面Cookies
try{ try{
@ -383,9 +372,6 @@ async function beginInvoke(msg, ws) {
const WebSocket = require('ws'); const WebSocket = require('ws');
const {all} = require("express/lib/application"); const {all} = require("express/lib/application");
let socket_window = null;
let socket_start = null;
let socket_flowchart = null;
let wss = new WebSocket.Server({port: websocket_port}); let wss = new WebSocket.Server({port: websocket_port});
wss.on('connection', function (ws) { wss.on('connection', function (ws) {
ws.on('message', async function (message, isBinary) { ws.on('message', async function (message, isBinary) {
@ -521,7 +507,7 @@ function handleOpenBrowser(event, lang = "en", user_data_folder = "", mobile = f
} }
function handleOpenInvoke(event, lang = "en") { function handleOpenInvoke(event, lang = "en") {
const window = new BrowserWindow({icon: iconPath}); invoke_window = new BrowserWindow({icon: iconPath});
let url = ""; let url = "";
language = lang; language = lang;
if (lang == "en") { if (lang == "en") {
@ -530,10 +516,10 @@ function handleOpenInvoke(event, lang = "en") {
url = server_address + `/taskGrid/taskList.html?type=1&wsport=${websocket_port}&backEndAddressServiceWrapper=` + server_address + "&lang=zh"; url = server_address + `/taskGrid/taskList.html?type=1&wsport=${websocket_port}&backEndAddressServiceWrapper=` + server_address + "&lang=zh";
} }
// and load the index.html of the app. // and load the index.html of the app.
window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' }); invoke_window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' });
window.maximize(); invoke_window.maximize();
mainWindow.hide(); mainWindow.hide();
window.on('close', function (event) { invoke_window.on('close', function (event) {
mainWindow.show(); mainWindow.show();
}); });
} }

View File

@ -8,6 +8,10 @@
margin-top: 10px; margin-top: 10px;
} }
label{
margin-left: 2px;
}
div.node { div.node {
height: 45px; height: 45px;
width: 150px; width: 150px;
@ -19,38 +23,38 @@
padding: 5px; padding: 5px;
margin: 10px auto; margin: 10px auto;
} }
.arrow { .arrow {
margin: 10px auto; margin: 10px auto;
text-align: center; text-align: center;
font-size: 23px!important; font-size: 23px!important;
color: black; color: black;
} }
.arrow:hover { .arrow:hover {
background-color: deepskyblue!important; background-color: deepskyblue!important;
cursor: pointer; cursor: pointer;
color: white; color: white;
} }
.branchAdd { .branchAdd {
margin: 10px auto; margin: 10px auto;
text-align: center; text-align: center;
font-size: 18px!important; font-size: 18px!important;
color: black; color: black;
} }
.branchAdd:hover { .branchAdd:hover {
background-color: deepskyblue; background-color: deepskyblue;
cursor: pointer; cursor: pointer;
color: white; color: white;
} }
div.node:hover { div.node:hover {
cursor: pointer; cursor: pointer;
background: navy; background: navy;
} }
.loop { .loop {
border: skyblue solid; border: skyblue solid;
text-align: center; text-align: center;
@ -61,11 +65,11 @@
margin: 10px auto; margin: 10px auto;
border-radius: 7px; border-radius: 7px;
} }
.options { .options {
height: 35px; height: 35px;
} }
.judge { .judge {
/* display: flex; */ /* display: flex; */
text-align: center; text-align: center;
@ -76,7 +80,7 @@
justify-content: center; justify-content: center;
border-radius: 7px; border-radius: 7px;
} }
.branch { .branch {
display: inline-block; display: inline-block;
margin: 5px; margin: 5px;
@ -88,11 +92,11 @@
margin: 10px; margin: 10px;
border-radius: 7px; border-radius: 7px;
} }
.sequence { .sequence {
display: block; display: block;
} }
.toolbox button { .toolbox button {
margin-top: 5px; margin-top: 5px;
margin-left: 3px; margin-left: 3px;
@ -100,35 +104,35 @@
width: 80%; width: 80%;
font-size: 15px!important; font-size: 15px!important;
} }
.Modify { .Modify {
margin: 20px; margin: 20px;
} }
.Modify input { .Modify input {
font-size: 17px!important; font-size: 17px!important;
} }
.elements { .elements {
font-size: 17px!important; font-size: 17px!important;
margin: 10px 0; margin: 10px 0;
} }
.elements p { .elements p {
margin: 5px 0; margin: 5px 0;
} }
.elements input[type=checkbox] { .elements input[type=checkbox] {
width: 20px; width: 20px;
height: 20px; height: 20px;
vertical-align: sub; vertical-align: sub;
margin-right: 5px; margin-right: 5px;
} }
.elements textarea { .elements textarea {
min-height: 100px; min-height: 100px;
} }
.elements label { .elements label {
font-size: 17px!important; font-size: 17px!important;
margin: 10px 0; margin: 10px 0;
@ -143,7 +147,7 @@
border: 1px solid rgb(78, 78, 78); border: 1px solid rgb(78, 78, 78);
padding-left: 2px; padding-left: 2px;
} }
table { table {
table-layout: fixed; table-layout: fixed;
word-break: break-all; word-break: break-all;
@ -153,7 +157,7 @@
white-space: nowrap; white-space: nowrap;
width: 100%; width: 100%;
} */ } */
.toolkitcontain { .toolkitcontain {
border: 1px solid #cdd!important; border: 1px solid #cdd!important;
width: 100%; width: 100%;
@ -163,7 +167,7 @@
margin-top: 10px; margin-top: 10px;
position: relative; position: relative;
} }
.toolkitcontain table { .toolkitcontain table {
table-layout: fixed; table-layout: fixed;
word-break: break-all; word-break: break-all;
@ -173,7 +177,7 @@
white-space: nowrap; white-space: nowrap;
width: 100%; width: 100%;
} }
.toolkitcontain th, .toolkitcontain th,
.toolkitcontain td, .toolkitcontain td,
.toolkitcontain tr { .toolkitcontain tr {
@ -187,7 +191,7 @@
padding-left: 1px; padding-left: 1px;
-webkit-user-select: none; -webkit-user-select: none;
} }
.toolkitcontain .toolkittb2 { .toolkitcontain .toolkittb2 {
position: sticky; position: sticky;
top: 0px; top: 0px;
@ -195,13 +199,13 @@
background-color: azure; background-color: azure;
z-index: 1000; z-index: 1000;
} }
.toolkitcontain .toolkittb4 { .toolkitcontain .toolkittb4 {
position: absolute; position: absolute;
} }
.toolkitcontain a { .toolkitcontain a {
font-size: 13px!important; font-size: 13px!important;
cursor: pointer; cursor: pointer;
color: blue; color: blue;
} }

View File

@ -580,7 +580,14 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
</div> </div>
</div> </div>
<div style="margin-top:5px"> <div style="margin-top:5px">
<label>Seconds <b>after executed</b> (Can be set to a decimal, such as 0.5):</label> <label>Wait for the following elements to appear <b>before</b> executing:</label>
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
placeholder="Enter the XPath of the element to wait for, leave blank to skip waiting"></textarea>
<label style="margin-top:5px">In which iframe is the element located? Set to 0 if the element is not inside an iframe:</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
<label style="margin-top:5px">Maximum waiting time for element appearance (in seconds):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
<label style="margin-top:5px">Wait seconds <b>after</b> execution (can set decimal values, e.g., 0.5):</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input> <input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
<label>Wait Type</label> <label>Wait Type</label>
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control"> <select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">

View File

@ -625,31 +625,7 @@ document.oncontextmenu = function() {
return false; return false;
} //屏蔽右键菜单 } //屏蔽右键菜单
//删除元素 //删除元素
// Page-wide keydown handler: the Delete key (keyCode 46) deletes the current
// node when `nowNode` is set; every other key falls through to the shortcut
// handling below (Ctrl/Cmd+S, keyCode 116, keyCode 123).
document.onkeydown = function(e) {
if (nowNode != null && e.keyCode == 46) {
// if (confirm("Are you sure you want to delete the element?")) {
deleteElement();
// }
} else { // Ctrl+S (or Cmd+S): save the service
let currKey = 0;
// Fall back through the legacy key-code properties of the event.
currKey = e.keyCode || e.which || e.charCode;
if (currKey == 83 && (e.ctrlKey || e.metaKey)) {
// Trigger the element with id "save" as if it had been clicked.
$('#save').click();
return true;
} else if (currKey == 116) {
// keyCode 116 (F5): reload the page.
location.reload();
} else if (currKey == 123) {
// keyCode 123 (F12): send a message to the local WebSocket server whose
// port is taken from the "wsport" URL parameter.
console.log("打开devtools")
let command = new WebSocket("ws://localhost:"+getUrlParam("wsport"))
command.onopen = function() {
let message = {
type: 6, // message type; the original note says type 0 means "connect", but 6 is what is sent here
};
this.send(JSON.stringify(message));
};
}
}
}
function inputDelete(e) { function inputDelete(e) {
if (e.keyCode == 46) { if (e.keyCode == 46) {

View File

@ -580,9 +580,16 @@
</div> </div>
</div> </div>
<div style="margin-top:5px"> <div style="margin-top:5px">
<label><b>执行后</b>等待秒数所有等待时间均可设置为小数如0.5</label> <label>操作<b>执行前</b>等待以下元素出现:</label>
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
placeholder="填写要等待出现元素的XPath不填写则不等待"></textarea>
<label style="margin-top:5px">要等待的元素在页面第几个iframe中0表示元素不在iframe中</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
<label style="margin-top:5px">元素出现的最长等待时间(秒):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
<label style="margin-top:5px">操作<b>执行后</b>等待秒数所有等待时间均可设置为小数如0.5</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input> <input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
<label>等待类型</label> <label style="margin-top:5px">等待类型</label>
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control"> <select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">
<option value = 0>固定等待设置等10秒就等10秒</option> <option value = 0>固定等待设置等10秒就等10秒</option>
<option value = 1>随机等待设置等10秒会随机等10×0.5 - 10 × 1.5 秒)</option> <option value = 1>随机等待设置等10秒会随机等10×0.5 - 10 × 1.5 秒)</option>

View File

@ -75,3 +75,34 @@ function isValidMySQLTableName(tableName) {
return pattern.test(tableName); return pattern.test(tableName);
} }
/**
 * Page-wide keyboard shortcuts.
 *
 * - Delete (keyCode 46): calls `deleteElement()` when the page defines a
 *   truthy `nowNode`.
 * - Ctrl/Cmd + S (keyCode 83): clicks the `#save` element and returns `true`.
 * - F5 (keyCode 116): reloads the page.
 * - F12 (keyCode 123): sends a `{type: 6}` message to the local WebSocket
 *   server on the port given by the `wsport` URL parameter (logged as the
 *   "open devtools" request).
 *
 * @param {KeyboardEvent} e - The keydown event.
 * @returns {true|undefined} `true` for the save shortcut, otherwise nothing.
 */
document.onkeydown = function(e) {
    // This script is shared by pages that may not define `nowNode` at all;
    // `typeof` probes for it without throwing (and without logging a
    // ReferenceError on every key press).
    const hasCurrentNode = typeof nowNode !== 'undefined' && Boolean(nowNode);
    if (hasCurrentNode && e.keyCode === 46) {
        deleteElement();
        return;
    }

    // Fall back through the legacy key-code properties of the event.
    const currKey = e.keyCode || e.which || e.charCode;
    if (currKey === 83 && (e.ctrlKey || e.metaKey)) {
        // Ctrl+S / Cmd+S: save the service.
        $('#save').click();
        return true;
    }
    if (currKey === 116) {
        location.reload();
        return;
    }
    if (currKey === 123) {
        console.log("打开devtools")
        const command = new WebSocket("ws://localhost:" + getUrlParam("wsport"));
        command.onopen = function() {
            // Message type 6 is the request issued for the DevTools shortcut.
            command.send(JSON.stringify({ type: 6 }));
            // One-shot connection: close it so repeated F12 presses do not
            // leak open sockets. Queued data is flushed before the closing
            // handshake starts.
            command.close();
        };
    }
}

View File

@ -146,6 +146,9 @@ function addParameters(t) {
beforeJSWaitTime: 0, //执行前js等待时间 beforeJSWaitTime: 0, //执行前js等待时间
afterJS: "", //执行后执行的js afterJS: "", //执行后执行的js
afterJSWaitTime: 0, //执行后js等待时间 afterJSWaitTime: 0, //执行后js等待时间
waitElement: "", //等待元素
waitElementTime: 10, //等待元素时间
waitElementIframeIndex: 0, //等待元素在第几个iframe中
}; //公共参数处理 }; //公共参数处理
if (t.option == 1) { if (t.option == 1) {
t["parameters"]["url"] = "about:blank"; t["parameters"]["url"] = "about:blank";
@ -518,4 +521,4 @@ function LANG(zh, en) {
} else { } else {
return en; return en;
} }
} }

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":183,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/16/2023, 1:55:02 AM","update_time":"7/16/2023, 2:02:09 AM","version":"0.3.6","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n  ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"//*[@id=\"service-2017\"]/div[1]/ol/li[1]","waitElementTime":10,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"/html/body/div[4]/div[1]","allXPaths":["/html/body/div[4]/div[1]","//div[contains(., 
'')]","//DIV[@class='w']","/html/body/div[last()-6]/div"],"exampleValues":[{"num":0,"value":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n  ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"unique_index":"7c04qey9fkllk4b56jd","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//div[123]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[]}}]}

View File

@ -12,7 +12,7 @@
"justMyCode": false, "justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[84]", "--headless", "0", "--user_data", "1", "--keyboard", "0"] "args": ["--id", "[54]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
} }
] ]
} }

View File

@ -1,5 +1,28 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# import atexit # import atexit
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
from commandline_config import Config
import os
import csv
from openpyxl import load_workbook, Workbook
import random
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from pynput.keyboard import Key, Listener
from datetime import datetime from datetime import datetime
import io # 遇到错误退出时应执行的代码 import io # 遇到错误退出时应执行的代码
import json import json
@ -17,41 +40,18 @@ from ddddocr import DdddOcr
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml import etree from lxml import etree
import onnxruntime import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志 onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
# import undetected_chromedriver as uc # import undetected_chromedriver as uc
from pynput.keyboard import Key, Listener
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import random
# import pandas as pd # import pandas as pd
from openpyxl import load_workbook, Workbook
# import numpy # import numpy
import csv
import os
from commandline_config import Config
# import pytesseract # import pytesseract
from PIL import Image
# import uuid # import uuid
from threading import Thread, Event
from myChrome import MyChrome
if sys.platform != "darwin": if sys.platform != "darwin":
from myChrome import MyUCChrome from myChrome import MyUCChrome
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
desired_capabilities = DesiredCapabilities.CHROME desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none" desired_capabilities["pageLoadStrategy"] = "none"
class BrowserThread(Thread): class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config): def __init__(self, browser_t, id, service, version, event, saveName, config):
Thread.__init__(self) Thread.__init__(self)
@ -73,7 +73,7 @@ class BrowserThread(Thread):
self.BREAK = False self.BREAK = False
self.CONTINUE = False self.CONTINUE = False
# 名称设定 # 名称设定
if saveName != "": # 命令行覆盖保存名称 if saveName != "": # 命令行覆盖保存名称
self.saveName = saveName # 保存文件的名字 self.saveName = saveName # 保存文件的名字
now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
self.saveName = self.saveName.replace("current_time", now) self.saveName = self.saveName.replace("current_time", now)
@ -83,22 +83,27 @@ class BrowserThread(Thread):
if not os.path.exists("Data/Task_" + str(i)): if not os.path.exists("Data/Task_" + str(i)):
os.mkdir("Data/Task_" + str(i)) os.mkdir("Data/Task_" + str(i))
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName): if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName):
os.mkdir("Data/Task_" + str(i) + "/" + self.saveName) # 创建保存文件夹用来保存截图 os.mkdir("Data/Task_" + str(i) + "/" +
self.saveName) # 创建保存文件夹用来保存截图
self.getDataStep = 0 self.getDataStep = 0
self.startSteps = 0 self.startSteps = 0
try: try:
startFromExit = service["startFromExit"] # 从上次退出的步骤开始 startFromExit = service["startFromExit"] # 从上次退出的步骤开始
if startFromExit == 1: if startFromExit == 1:
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r', encoding='utf-8-sig') as file_obj: with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r', encoding='utf-8-sig') as file_obj:
self.startSteps = int(file_obj.read()) # 读取已执行步数 self.startSteps = int(file_obj.read()) # 读取已执行步数
except: except:
pass pass
if self.startSteps != 0: if self.startSteps != 0:
print("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为", self.startSteps, "条。") print("此模式下任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
print("In this mode, task ID", self.id, "will start from the last step, before we already collected", self.startSteps, " items.") self.startSteps, "条。")
print("In this mode, task ID", self.id,
"will start from the last step, before we already collected", self.startSteps, " items.")
else: else:
print("此模式下任务ID", self.id, "将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。") print("此模式下任务ID", self.id,
print("In this mode, task ID", self.id, "will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.") "将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
print("In this mode, task ID", self.id,
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
stealth_path = driver_path[:driver_path.find( stealth_path = driver_path[:driver_path.find(
"chromedriver")] + "stealth.min.js" "chromedriver")] + "stealth.min.js"
with open(stealth_path, 'r') as f: with open(stealth_path, 'r') as f:
@ -167,12 +172,12 @@ class BrowserThread(Thread):
self.outputParameters = {} self.outputParameters = {}
self.service = service self.service = service
self.outputParametersTypes = [] self.outputParametersTypes = []
self.outputParametersRecord = [] # 字段是否被记录 self.outputParametersRecord = [] # 字段是否被记录
self.dataNotFoundKeys = {} # 记录没有找到数据的key self.dataNotFoundKeys = {} # 记录没有找到数据的key
self.log = "" # 记下现在总共开了多少个标签页 self.log = "" # 记下现在总共开了多少个标签页
self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置 self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
self.SAVED = False # 记录是否已经存储了 self.SAVED = False # 记录是否已经存储了
for para in service["outputParameters"]: # 初始化输出参数 for para in service["outputParameters"]: # 初始化输出参数
if para["name"] not in self.outputParameters.keys(): if para["name"] not in self.outputParameters.keys():
self.outputParameters[para["name"]] = "" self.outputParameters[para["name"]] = ""
self.dataNotFoundKeys[para["name"]] = False self.dataNotFoundKeys[para["name"]] = False
@ -181,7 +186,8 @@ class BrowserThread(Thread):
except: except:
self.outputParametersTypes.append("text") self.outputParametersTypes.append("text")
try: try:
self.outputParametersRecord.append(bool(para["recordASField"])) self.outputParametersRecord.append(
bool(para["recordASField"]))
except: except:
self.outputParametersRecord.append(True) self.outputParametersRecord.append(True)
# 文件叠加的时候不添加表头 # 文件叠加的时候不添加表头
@ -203,11 +209,19 @@ class BrowserThread(Thread):
iframe = node["parameters"]["iframe"] iframe = node["parameters"]["iframe"]
except: except:
node["parameters"]["iframe"] = False node["parameters"]["iframe"] = False
try: try:
node["parameters"]["xpath"] = lowercase_tags_in_xpath( node["parameters"]["xpath"] = lowercase_tags_in_xpath(
node["parameters"]["xpath"]) node["parameters"]["xpath"])
except: except:
pass pass
try:
node["parameters"]["waitElementIframeIndex"] = int(
node["parameters"]["waitElementIframeIndex"])
except:
node["parameters"]["waitElement"] = ""
node["parameters"]["waitElementTime"] = 10
node["parameters"]["waitElementIframeIndex"] = 0
if node["option"] == 1: # 打开网页操作 if node["option"] == 1: # 打开网页操作
try: try:
cookies = node["parameters"]["cookies"] cookies = node["parameters"]["cookies"]
@ -216,8 +230,10 @@ class BrowserThread(Thread):
if node["option"] == 2: # 点击操作 if node["option"] == 2: # 点击操作
if node["parameters"]["useLoop"]: if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5": if self.task_version <= "0.3.5":
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
print("您的任务版本号为" + self.task_version + "循环点击不支持相对XPath写法已自动切换为纯循环的XPath") node["parameters"]["xpath"] = ""
print("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
elif node["option"] == 3: # 提取数据操作 elif node["option"] == 3: # 提取数据操作
node["parameters"]["recordASField"] = 0 node["parameters"]["recordASField"] = 0
paras = node["parameters"]["paras"] paras = node["parameters"]["paras"]
@ -231,7 +247,8 @@ class BrowserThread(Thread):
except: except:
para["iframe"] = False para["iframe"] = False
try: try:
para["relativeXPath"] = lowercase_tags_in_xpath(para["relativeXPath"]) para["relativeXPath"] = lowercase_tags_in_xpath(
para["relativeXPath"])
except: except:
pass pass
try: try:
@ -247,7 +264,7 @@ class BrowserThread(Thread):
para["optimizable"] = False para["optimizable"] = False
elif node["option"] == 4: # 输入文字 elif node["option"] == 4: # 输入文字
try: try:
index = node["parameters"]["index"] # 索引值 index = node["parameters"]["index"] # 索引值
except: except:
node["parameters"]["index"] = 0 node["parameters"]["index"] = 0
elif node["option"] == 5: # 自定义操作 elif node["option"] == 5: # 自定义操作
@ -255,23 +272,27 @@ class BrowserThread(Thread):
clear = node["parameters"]["clear"] clear = node["parameters"]["clear"]
except: except:
node["parameters"]["clear"] = 0 node["parameters"]["clear"] = 0
elif node["option"] == 7: # 移动到元素 elif node["option"] == 7: # 移动到元素
if node["parameters"]["useLoop"]: if node["parameters"]["useLoop"]:
if self.task_version <= "0.3.5": if self.task_version <= "0.3.5":
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
print("您的任务版本号为" + self.task_version + "循环点击不支持相对XPath写法已自动切换为纯循环的XPath") node["parameters"]["xpath"] = ""
print("您的任务版本号为" + self.task_version +
"循环点击不支持相对XPath写法已自动切换为纯循环的XPath")
def readFromExcel(self): def readFromExcel(self):
if self.inputExcel == "": if self.inputExcel == "":
return 0 return 0
try: try:
workbook = load_workbook(self.inputExcel) workbook = load_workbook(self.inputExcel)
except: except:
print("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确", os.path.abspath(self.inputExcel)) print("读取Excel失败将会使用默认参数执行任务请检查文件路径是否正确",
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ", os.path.abspath(self.inputExcel)) os.path.abspath(self.inputExcel))
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ",
os.path.abspath(self.inputExcel))
time.sleep(5) time.sleep(5)
return 0 return 0
sheet_name_list = workbook.sheetnames sheet_name_list = workbook.sheetnames
sheet = workbook[sheet_name_list[0]] sheet = workbook[sheet_name_list[0]]
data = [] data = []
@ -285,7 +306,7 @@ class BrowserThread(Thread):
key = row[0] key = row[0]
values = [str(val) for val in row[1:] if val is not None] values = [str(val) for val in row[1:] if val is not None]
result_dict.setdefault(key, []).extend([values]) result_dict.setdefault(key, []).extend([values])
data = {} data = {}
for key, arr in result_dict.items(): for key, arr in result_dict.items():
result = [] result = []
@ -365,15 +386,19 @@ class BrowserThread(Thread):
# 写入数据 # 写入数据
if self.outputFormat == "csv" or self.outputFormat == "txt": if self.outputFormat == "csv" or self.outputFormat == "txt":
file_name = "Data/Task_" + \ file_name = "Data/Task_" + \
str(self.id) + "/" + self.saveName + '.' + self.outputFormat str(self.id) + "/" + self.saveName + \
write_to_csv(file_name, self.OUTPUT, self.outputParametersRecord) '.' + self.outputFormat
write_to_csv(file_name, self.OUTPUT,
self.outputParametersRecord)
elif self.outputFormat == "xlsx": elif self.outputFormat == "xlsx":
file_name = "Data/Task_" + \ file_name = "Data/Task_" + \
str(self.id) + "/" + self.saveName + '.xlsx' str(self.id) + "/" + self.saveName + '.xlsx'
write_to_excel(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord) write_to_excel(
file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
elif self.outputFormat == "mysql": elif self.outputFormat == "mysql":
self.mysql.write_to_mysql(self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes) self.mysql.write_to_mysql(
self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
self.OUTPUT = [] self.OUTPUT = []
self.log = "" self.log = ""
@ -403,10 +428,12 @@ class BrowserThread(Thread):
i = 0 i = 0
while True: while True:
# newBodyText = self.browser.page_source # newBodyText = self.browser.page_source
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=para["iframe"]).text newBodyText = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=para["iframe"]).text
if newBodyText == bodyText: if newBodyText == bodyText:
print("页面已检测不到新内容,停止滚动。") print("页面已检测不到新内容,停止滚动。")
print("No new content detected on the page, stop scrolling.") print(
"No new content detected on the page, stop scrolling.")
break break
else: else:
bodyText = newBodyText bodyText = newBodyText
@ -493,13 +520,15 @@ class BrowserThread(Thread):
output = exec(code) output = exec(code)
except Exception as e: except Exception as e:
print("执行下面的代码时出错:" + code, ",错误为:", e) print("执行下面的代码时出错:" + code, ",错误为:", e)
print("Error executing the following code:" + code, ", error is:", e) print("Error executing the following code:" +
code, ", error is:", e)
elif int(codeMode) == 6: elif int(codeMode) == 6:
try: try:
output = eval(code) output = eval(code)
except Exception as e: except Exception as e:
print("获得下面的代码返回值时出错:" + code, ",错误为:", e) print("获得下面的代码返回值时出错:" + code, ",错误为:", e)
print("Error executing and getting return value the following code:" + code, ", error is:", e) print(
"Error executing and getting return value the following code:" + code, ", error is:", e)
elif int(codeMode) == 1: elif int(codeMode) == 1:
self.recordLog("Execute System Call:" + code) self.recordLog("Execute System Call:" + code)
self.recordLog("执行系统命令:" + code) self.recordLog("执行系统命令:" + code)
@ -531,7 +560,8 @@ class BrowserThread(Thread):
max_wait_time = int(paras["waitTime"]) max_wait_time = int(paras["waitTime"])
if codeMode == 2: # 使用循环的情况下传入的clickPath就是实际的xpath if codeMode == 2: # 使用循环的情况下传入的clickPath就是实际的xpath
try: try:
loopPath = replace_field_values(loopPath, self.outputParameters) loopPath = replace_field_values(
loopPath, self.outputParameters)
elements = self.browser.find_elements( elements = self.browser.find_elements(
By.XPATH, loopPath, iframe=paras["iframe"]) By.XPATH, loopPath, iframe=paras["iframe"])
element = elements[index] element = elements[index]
@ -544,7 +574,7 @@ class BrowserThread(Thread):
self.BREAK = True self.BREAK = True
elif codeMode == 4: elif codeMode == 4:
self.CONTINUE = True self.CONTINUE = True
else: # 0 1 5 6 else: # 0 1 5 6
output = self.execute_code( output = self.execute_code(
codeMode, code, max_wait_time, iframe=paras["iframe"]) codeMode, code, max_wait_time, iframe=paras["iframe"])
recordASField = bool(paras["recordASField"]) recordASField = bool(paras["recordASField"])
@ -553,7 +583,8 @@ class BrowserThread(Thread):
# print("The return value of operation <" + node["title"] + "> is: " + output) # print("The return value of operation <" + node["title"] + "> is: " + output)
self.outputParameters[node["title"]] = output self.outputParameters[node["title"]] = output
if recordASField: if recordASField:
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord) line = new_line(self.outputParameters,
self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line) self.OUTPUT.append(line)
def switchSelect(self, para, loopValue): def switchSelect(self, para, loopValue):
@ -566,7 +597,8 @@ class BrowserThread(Thread):
optionValue = loopValue.split("~")[index - 1] optionValue = loopValue.split("~")[index - 1]
except: except:
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值") print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
print("Failed to get value, maybe because the index is out of range, will use the entire text value") print(
"Failed to get value, maybe because the index is out of range, will use the entire text value")
else: else:
optionValue = loopValue optionValue = loopValue
optionMode = 1 optionMode = 1
@ -605,11 +637,11 @@ class BrowserThread(Thread):
if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath
if xpath == "": if xpath == "":
path = loopPath path = loopPath
else: else:
path = "(" + loopPath + ")" + \ path = "(" + loopPath + ")" + \
"[" + str(index + 1) + "]" + \ "[" + str(index + 1) + "]" + \
xpath xpath
index = 0 # 如果是相对循环内元素的点击在定位到元素后index应该重置为0 index = 0 # 如果是相对循环内元素的点击在定位到元素后index应该重置为0
# element = loopElement # element = loopElement
else: else:
index = 0 index = 0
@ -632,10 +664,35 @@ class BrowserThread(Thread):
def executeNode(self, nodeId, loopValue="", loopPath="", index=0): def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
node = self.procedure[nodeId] node = self.procedure[nodeId]
WebDriverWait(self.browser, 10).until # WebDriverWait(self.browser, 10).until
# 等待元素出现才进行操作10秒内未出现则报错 # # 等待元素出现才进行操作10秒内未出现则报错
(EC.visibility_of_element_located( # (EC.visibility_of_element_located(
(By.XPATH, node["parameters"]["xpath"]))) # (By.XPATH, node["parameters"]["xpath"])))
try:
if node["parameters"]["waitElement"] != "":
waitElement = replace_field_values(
node["parameters"]["waitElement"], self.outputParameters)
waitElementTime = float(node["parameters"]["waitElementTime"])
waitElementIframeIndex = node["parameters"]["waitElementInIframe"]
print("等待元素出现:", waitElement)
print("Waiting for element to appear:", waitElement)
if waitElementIframeIndex > 0:
iframes = self.browser.find_elements(
By.CSS_SELECTOR, "iframe", iframe=False)
iframe = iframes[waitElementIframeIndex - 1]
self.browser.switch_to.frame(iframe)
WebDriverWait(self.browser, waitElementTime).until(
EC.presence_of_element_located((By.XPATH, waitElement))
)
if waitElementIframeIndex > 0:
self.browser.switch_to.default_content()
except Exception as e:
if waitElement != "":
print("等待元素出现超时:", waitElement, ",将继续执行。")
print("Timeout waiting for element to appear:",
waitElement, ", will continue to execute.")
print(e)
self.recordLog("Wait element not found")
# 根据不同选项执行不同操作 # 根据不同选项执行不同操作
if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作 if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作
@ -652,7 +709,7 @@ class BrowserThread(Thread):
if self.totalSteps >= self.startSteps: if self.totalSteps >= self.startSteps:
self.recordLog("getData") self.recordLog("getData")
self.getData(node["parameters"], loopValue, node["isInLoop"], self.getData(node["parameters"], loopValue, node["isInLoop"],
parentPath=loopPath, index=index) parentPath=loopPath, index=index)
self.saveData() self.saveData()
else: else:
# self.getDataStep += 1 # self.getDataStep += 1
@ -714,7 +771,8 @@ class BrowserThread(Thread):
continue continue
elif tType == 2: # 当前页面包含元素 elif tType == 2: # 当前页面包含元素
try: try:
xpath = replace_field_values(cnode["parameters"]["value"], self.outputParameters) xpath = replace_field_values(
cnode["parameters"]["value"], self.outputParameters)
if self.browser.find_element(By.XPATH, xpath, iframe=cnode["parameters"]["iframe"]): if self.browser.find_element(By.XPATH, xpath, iframe=cnode["parameters"]["iframe"]):
executeBranchId = i executeBranchId = i
break break
@ -722,7 +780,8 @@ class BrowserThread(Thread):
continue continue
elif tType == 3: # 当前循环元素包括文本 elif tType == 3: # 当前循环元素包括文本
try: try:
value = replace_field_values(cnode["parameters"]["value"], self.outputParameters) value = replace_field_values(
cnode["parameters"]["value"], self.outputParameters)
if loopElement.text.find(value) >= 0: if loopElement.text.find(value) >= 0:
executeBranchId = i executeBranchId = i
break break
@ -730,7 +789,8 @@ class BrowserThread(Thread):
continue continue
elif tType == 4: # 当前循环元素包括元素 elif tType == 4: # 当前循环元素包括元素
try: try:
xpath = replace_field_values(cnode["parameters"]["value"][1:], self.outputParameters) xpath = replace_field_values(
cnode["parameters"]["value"][1:], self.outputParameters)
if loopElement.find_element(By.XPATH, xpath): if loopElement.find_element(By.XPATH, xpath):
executeBranchId = i executeBranchId = i
break break
@ -782,7 +842,8 @@ class BrowserThread(Thread):
finished = False finished = False
# newBodyText = self.browser.page_source # newBodyText = self.browser.page_source
# newBodyText = self.browser.find_element(By.XPATH, "//body").text # newBodyText = self.browser.find_element(By.XPATH, "//body").text
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text newBodyText = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
if node["parameters"]["exitCount"] == 0: if node["parameters"]["exitCount"] == 0:
if newBodyText == bodyText: # 如果页面内容无变化 if newBodyText == bodyText: # 如果页面内容无变化
print("页面已检测不到新内容,停止循环。") print("页面已检测不到新内容,停止循环。")
@ -790,9 +851,8 @@ class BrowserThread(Thread):
finished = True finished = True
break break
else: else:
if node["parameters"]["exitCount"] == 0: print("检测到页面变化,继续循环。")
print("检测到页面变化,继续循环。") print("Page changed detected, continue loop.")
print("Page changed detected, continue loop.")
bodyText = newBodyText bodyText = newBodyText
xpath = replace_field_values( xpath = replace_field_values(
node["parameters"]["xpath"], self.outputParameters) node["parameters"]["xpath"], self.outputParameters)
@ -801,10 +861,10 @@ class BrowserThread(Thread):
for i in node["sequence"]: # 挨个执行操作 for i in node["sequence"]: # 挨个执行操作
self.executeNode( self.executeNode(
i, element, xpath, 0) i, element, xpath, 0)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: # 如果有break操作退出循环 if self.BREAK: # 如果有break操作退出循环
self.BREAK = False self.BREAK = False
finished = True finished = True
break break
@ -864,7 +924,7 @@ class BrowserThread(Thread):
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作 for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
self.executeNode(i, elements[index], self.executeNode(i, elements[index],
xpath, index) xpath, index)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: if self.BREAK:
@ -872,9 +932,9 @@ class BrowserThread(Thread):
break break
try: try:
changed_handle = self.browser.current_window_handle != thisHandle changed_handle = self.browser.current_window_handle != thisHandle
except: # 如果网页被意外关闭了的情况下 except: # 如果网页被意外关闭了的情况下
self.browser.switch_to.window( self.browser.switch_to.window(
self.browser.window_handles[-1]) self.browser.window_handles[-1])
changed_handle = self.browser.window_handles[-1] != thisHandle changed_handle = self.browser.window_handles[-1] != thisHandle
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化 if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
try: try:
@ -925,7 +985,7 @@ class BrowserThread(Thread):
By.XPATH, path, iframe=node["parameters"]["iframe"]) By.XPATH, path, iframe=node["parameters"]["iframe"])
for i in node["sequence"]: # 挨个执行操作 for i in node["sequence"]: # 挨个执行操作
self.executeNode(i, element, path, 0) self.executeNode(i, element, path, 0)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: if self.BREAK:
@ -933,9 +993,9 @@ class BrowserThread(Thread):
break break
try: try:
changed_handle = self.browser.current_window_handle != thisHandle changed_handle = self.browser.current_window_handle != thisHandle
except: # 如果网页被意外关闭了的情况下 except: # 如果网页被意外关闭了的情况下
self.browser.switch_to.window( self.browser.switch_to.window(
self.browser.window_handles[-1]) self.browser.window_handles[-1])
changed_handle = self.browser.window_handles[-1] != thisHandle changed_handle = self.browser.window_handles[-1] != thisHandle
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化 if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
try: try:
@ -984,7 +1044,7 @@ class BrowserThread(Thread):
self.recordLog("input: " + text) self.recordLog("input: " + text)
for i in node["sequence"]: # 挨个执行操作 for i in node["sequence"]: # 挨个执行操作
self.executeNode(i, text, "", 0) self.executeNode(i, text, "", 0)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: if self.BREAK:
@ -1009,7 +1069,7 @@ class BrowserThread(Thread):
self.recordLog("input: " + url) self.recordLog("input: " + url)
for i in node["sequence"]: for i in node["sequence"]:
self.executeNode(i, url, "", 0) self.executeNode(i, url, "", 0)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: if self.BREAK:
@ -1037,7 +1097,7 @@ class BrowserThread(Thread):
break break
for i in node["sequence"]: # 挨个执行操作 for i in node["sequence"]: # 挨个执行操作
self.executeNode(i, code, node["parameters"]["xpath"], 0) self.executeNode(i, code, node["parameters"]["xpath"], 0)
if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行 if self.BREAK or self.CONTINUE: # 如果有break操作下面的操作不执行
self.CONTINUE = False self.CONTINUE = False
break break
if self.BREAK: if self.BREAK:
@ -1067,7 +1127,7 @@ class BrowserThread(Thread):
# clear output parameters # clear output parameters
for key in self.outputParameters: for key in self.outputParameters:
self.outputParameters[key] = "" self.outputParameters[key] = ""
else: # 在流程图其他位置设置了打开网页的操作,读取的应该是第一个网址,如打开网页后登录,再打开第二个网页 else: # 在流程图其他位置设置了打开网页的操作,读取的应该是第一个网址,如打开网页后登录,再打开第二个网页
url = list(filter(isnotnull, para["links"].split("\n")))[0] url = list(filter(isnotnull, para["links"].split("\n")))[0]
# 将value中的Field[""]替换为outputParameters中的键值 # 将value中的Field[""]替换为outputParameters中的键值
url = replace_field_values(url, self.outputParameters) url = replace_field_values(url, self.outputParameters)
@ -1150,7 +1210,8 @@ class BrowserThread(Thread):
replaced_text = replaced_text.split("~")[index - 1] replaced_text = replaced_text.split("~")[index - 1]
except: except:
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值") print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
print("Failed to get value, maybe because the index is out of range, will use the entire text value") print(
"Failed to get value, maybe because the index is out of range, will use the entire text value")
textbox.send_keys(replaced_text) textbox.send_keys(replaced_text)
if value.lower().find("<enter>") >= 0: if value.lower().find("<enter>") >= 0:
textbox.send_keys(Keys.ENTER) textbox.send_keys(Keys.ENTER)
@ -1180,11 +1241,11 @@ class BrowserThread(Thread):
if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath if para["useLoop"]: # 使用循环的情况下传入的clickPath就是实际的xpath
if xpath == "": if xpath == "":
path = clickPath path = clickPath
else: else:
path = "(" + clickPath + ")" + \ path = "(" + clickPath + ")" + \
"[" + str(index + 1) + "]" + \ "[" + str(index + 1) + "]" + \
xpath xpath
index = 0 # 如果是相对循环内元素的点击在定位到元素后index应该重置为0 index = 0 # 如果是相对循环内元素的点击在定位到元素后index应该重置为0
# element = loopElement # element = loopElement
else: else:
index = 0 index = 0
@ -1226,7 +1287,8 @@ class BrowserThread(Thread):
pass pass
except Exception as e: except Exception as e:
print("点击元素失败:" + path, "请尝试将点击类型改为JavaScript点击后重试。") print("点击元素失败:" + path, "请尝试将点击类型改为JavaScript点击后重试。")
print("Failed to click element:" + path, ", please try to change the click type to JavaScript Click.") print("Failed to click element:" + path,
", please try to change the click type to JavaScript Click.")
print(e) print(e)
self.Log(e) self.Log(e)
self.recordLog(str(e)) self.recordLog(str(e))
@ -1374,14 +1436,15 @@ class BrowserThread(Thread):
# 使用Pillow库打开截图并转换为灰度图像 # 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L') image = Image.open(screenshot_stream).convert('L')
temp_name = "OCR_" + str(time.time()) + ".png" temp_name = "OCR_" + str(time.time()) + ".png"
location = "Data/Task_" + str(self.id) + "/" + self.saveName + "/" + temp_name location = "Data/Task_" + \
str(self.id) + "/" + self.saveName + "/" + temp_name
image.save(location) image.save(location)
ocr = DdddOcr() ocr = DdddOcr()
with open(location, 'rb') as f: with open(location, 'rb') as f:
image_bytes = f.read() image_bytes = f.read()
content = ocr.classification(image_bytes) content = ocr.classification(image_bytes)
os.remove(location) os.remove(location)
# 使用Tesseract OCR引擎识别图像中的文本 # 使用Tesseract OCR引擎识别图像中的文本
# content = pytesseract.image_to_string(image, lang='chi_sim+eng') # content = pytesseract.image_to_string(image, lang='chi_sim+eng')
except Exception as e: except Exception as e:
# try: # try:
@ -1392,30 +1455,30 @@ class BrowserThread(Thread):
# screenshot_stream = io.BytesIO(screenshot) # screenshot_stream = io.BytesIO(screenshot)
# # 使用Pillow库打开截图并转换为灰度图像 # # 使用Pillow库打开截图并转换为灰度图像
# image = Image.open(screenshot_stream).convert('L') # image = Image.open(screenshot_stream).convert('L')
# # 使用Tesseract OCR引擎识别图像中的文本 # # 使用Tesseract OCR引擎识别图像中的文本
# # content = pytesseract.image_to_string(image, lang='eng') # # content = pytesseract.image_to_string(image, lang='eng')
# except Exception as e: # except Exception as e:
content = "OCR Error" content = "OCR Error"
print(e) print(e)
# if sys.platform == "win32": # if sys.platform == "win32":
# print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501") # print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501")
# print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/") # print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
# elif sys.platform == "darwin": # elif sys.platform == "darwin":
# print( # print(
# "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/146044810") # "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/146044810")
# elif sys.platform == "linux": # elif sys.platform == "linux":
# print( # print(
# "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/420259031") # "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/420259031")
# else: # else:
# print("注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501") # print("注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501")
# print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/") # print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
# print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html") # print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
elif p["contentType"] == 9: elif p["contentType"] == 9:
content = self.execute_code( content = self.execute_code(
2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"]) 2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
elif p["contentType"] == 12: # 系统命令返回值 elif p["contentType"] == 12: # 系统命令返回值
content = self.execute_code(1, p["JS"], p["JSWaitTime"]) content = self.execute_code(1, p["JS"], p["JSWaitTime"])
elif p["contentType"] == 13: # eval返回值 elif p["contentType"] == 13: # eval返回值
content = self.execute_code(6, p["JS"], p["JSWaitTime"]) content = self.execute_code(6, p["JS"], p["JSWaitTime"])
elif p["contentType"] == 10: # 下拉框选中的值 elif p["contentType"] == 10: # 下拉框选中的值
try: try:
@ -1488,17 +1551,18 @@ class BrowserThread(Thread):
# else: # else:
# 如果字串里有//即子孙查找,则不动语句 # 如果字串里有//即子孙查找,则不动语句
if relativeXPath.find("//") >= 0: if relativeXPath.find("//") >= 0:
if xpath.startswith("/"): if xpath.startswith("/"):
full_path = "(" + parentPath + ")" + \ full_path = "(" + parentPath + ")" + \
"[" + str(index + 1) + "]"+ \ "[" + str(index + 1) + "]" + \
relativeXPath + content_type relativeXPath + content_type
else: # 如果是id()这种形式不需要包parentPath else: # 如果是id()这种形式不需要包parentPath
full_path = xpath full_path = xpath
try: try:
content = pageHTML.xpath(full_path) content = pageHTML.xpath(full_path)
except: except:
content = [] content = []
elif not relativeXPath.startswith("/"): # 如果是id()这种形式,不需要包/html/body # 如果是id()这种形式,不需要包/html/body
elif not relativeXPath.startswith("/"):
try: try:
content = loopElementHTML.xpath(xpath) content = loopElementHTML.xpath(xpath)
except: except:
@ -1507,7 +1571,8 @@ class BrowserThread(Thread):
content = loopElementHTML.xpath( content = loopElementHTML.xpath(
"/html/body/" + loopElementHTML[0][0].tag + xpath) "/html/body/" + loopElementHTML[0][0].tag + xpath)
else: else:
if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body # 如果是id()或(//div)[1]这种形式,不需要包/html/body
if xpath.find("/body") < 0 and xpath.startswith("/"):
xpath = "/html/body" + xpath xpath = "/html/body" + xpath
content = pageHTML.xpath(xpath) content = pageHTML.xpath(xpath)
if len(content) > 0: if len(content) > 0:
@ -1517,7 +1582,8 @@ class BrowserThread(Thread):
for result in content if result.strip()) for result in content if result.strip())
if p["nodeType"] == 2: if p["nodeType"] == 2:
base_url = self.browser.current_url base_url = self.browser.current_url
content = urljoin(base_url, content) # 合并链接相对路径为绝对路径 # 合并链接相对路径为绝对路径
content = urljoin(base_url, content)
else: else:
content = p["default"] content = p["default"]
if not self.dataNotFoundKeys[p["name"]]: if not self.dataNotFoundKeys[p["name"]]:
@ -1544,7 +1610,7 @@ class BrowserThread(Thread):
if not p["optimizable"]: if not p["optimizable"]:
content = "" content = ""
relativeXPath = replace_field_values( relativeXPath = replace_field_values(
p["relativeXPath"], self.outputParameters) p["relativeXPath"], self.outputParameters)
if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL去找元素 if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL去找元素
try: try:
# relativeXPath = relativeXPath.lower() # relativeXPath = relativeXPath.lower()
@ -1640,9 +1706,10 @@ class BrowserThread(Thread):
continue # 再出现类似问题直接跳过 continue # 再出现类似问题直接跳过
self.outputParameters[p["name"]] = content self.outputParameters[p["name"]] = content
self.execute_code( self.execute_code(
2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS 2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS
if para["recordASField"] > 0: if para["recordASField"] > 0:
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord) line = new_line(self.outputParameters,
self.maxViewLength, self.outputParametersRecord)
self.OUTPUT.append(line) self.OUTPUT.append(line)
# rt.end() # rt.end()
@ -1659,7 +1726,7 @@ if __name__ == '__main__':
"read_type": "remote", "read_type": "remote",
"headless": False, "headless": False,
"server_address": "http://localhost:8074", "server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入 "keyboard": True, # 是否监听键盘输入
"version": "0.3.6", "version": "0.3.6",
} }
c = Config(config) c = Config(config)
@ -1675,8 +1742,10 @@ if __name__ == '__main__':
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome" options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# MacOS需要用option而不是options # MacOS需要用option而不是options
option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome" option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx") option.add_extension(
options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx") "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64" driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome" # options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# # MacOS需要用option而不是options # # MacOS需要用option而不是options
@ -1684,7 +1753,8 @@ if __name__ == '__main__':
# driver_path = os.getcwd()+ "/chromedriver_mac64" # driver_path = os.getcwd()+ "/chromedriver_mac64"
print(driver_path) print(driver_path)
if c.config_folder == "": if c.config_folder == "":
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/") c.config_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder) # print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径 elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider", print("Finding chromedriver in EasySpider",
@ -1727,7 +1797,7 @@ if __name__ == '__main__':
elif os.path.exists(os.getcwd()+"/../ElectronJS"): elif os.path.exists(os.getcwd()+"/../ElectronJS"):
# 软件dev用 # 软件dev用
print("Finding chromedriver in EasySpider", print("Finding chromedriver in EasySpider",
os.getcwd()+"/ElectronJS") os.getcwd()+"/ElectronJS")
option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置 option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置 options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe" driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
@ -1739,7 +1809,7 @@ if __name__ == '__main__':
option.add_experimental_option( option.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式 'excludeSwitches', ['enable-automation']) # 以开发者模式
# user_data_dir = r'' # 注意没有Default # user_data_dir = r'' # 注意没有Default
# options.add_argument('--user-data-dir='+p) # options.add_argument('--user-data-dir='+p)
@ -1780,6 +1850,11 @@ if __name__ == '__main__':
options.add_argument( options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒 "--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors')
option.add_argument('-ignore-certificate-errors')
option.add_argument('-ignore -ssl-errors')
threads = [] threads = []
for i in c.id: for i in c.id:
# print(options) # print(options)
@ -1833,10 +1908,10 @@ if __name__ == '__main__':
options=options, chrome_options=option, executable_path=driver_path) options=options, chrome_options=option, executable_path=driver_path)
elif cloudflare == 1: elif cloudflare == 1:
if sys.platform == "win32": if sys.platform == "win32":
options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器 options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器 # options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
browser_t = MyUCChrome( browser_t = MyUCChrome(
options=options, driver_executable_path=driver_path) options=options, driver_executable_path=driver_path)
else: else:
print("Cloudflare模式只支持Windows x64平台。") print("Cloudflare模式只支持Windows x64平台。")
print("Cloudflare Mode only support on Windows x64 platform.") print("Cloudflare Mode only support on Windows x64 platform.")
@ -1849,7 +1924,7 @@ if __name__ == '__main__':
threads.append(thread) threads.append(thread)
thread.start() thread.start()
# Set the pause operation # Set the pause operation
# if sys.platform != "linux": # if sys.platform != "linux":
# time.sleep(3) # time.sleep(3)
# print("\n\n----------------------------------") # print("\n\n----------------------------------")
# print("正在运行任务长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码如果想恢复任务的执行请再次长按p键。") # print("正在运行任务长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码如果想恢复任务的执行请再次长按p键。")
@ -1868,17 +1943,16 @@ if __name__ == '__main__':
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.") # print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入 # 使用监听器监听键盘输入
try: try:
if c.keyboard: if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener: with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:
listener.join() listener.join()
except: except:
pass pass
# print("您的操作系统不支持暂停功能。") # print("您的操作系统不支持暂停功能。")
# print("Your operating system does not support the pause function.") # print("Your operating system does not support the pause function.")
# print("线程长度:", len(threads) ) # print("线程长度:", len(threads) )
for thread in threads: for thread in threads:
print() print()
thread.join() thread.join()

View File

@ -3,7 +3,6 @@ requests==2.31.0
selenium==4.5.0 selenium==4.5.0
pyinstaller==5.9.0 pyinstaller==5.9.0
Pillow==9.4.0 Pillow==9.4.0
pytesseract==0.3.10
openpyxl==3.1.2 openpyxl==3.1.2
pymysql==1.1.0 pymysql==1.1.0
lxml==4.9.2 lxml==4.9.2