mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-22 23:24:22 +08:00
执行操作前可设置等待指定元素出现
This commit is contained in:
parent
92d6315a22
commit
528500f795
@ -85,6 +85,10 @@ let flowchart_window = null;
|
||||
let current_handle = null;
|
||||
let old_handles = [];
|
||||
let handle_pairs = {};
|
||||
let socket_window = null;
|
||||
let socket_start = null;
|
||||
let socket_flowchart = null;
|
||||
let invoke_window = null;
|
||||
|
||||
// var ffi = require('ffi-napi');
|
||||
// var libm = ffi.Library('libm', {
|
||||
@ -233,27 +237,7 @@ async function beginInvoke(msg, ws) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// .then(function (element) {
|
||||
// console.log("element", element, handles);
|
||||
// element.sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
|
||||
// exit = true;
|
||||
// }, function (error) {
|
||||
// console.log("error", error);
|
||||
// len = len - 1;
|
||||
// if (len == 0) {
|
||||
// exit = true;
|
||||
// }
|
||||
// }
|
||||
// );
|
||||
}
|
||||
// let handles = driver.getAllWindowHandles();
|
||||
// driver.switchTo().window(handles[handles.length - 1]);
|
||||
// driver.findElement(By.xpath(msg.message.xpath)).sendKeys(Key.HOME, Key.chord(Key.SHIFT, Key.END), keyInfo);
|
||||
// robot.keyTap("a", "control");
|
||||
// robot.keyTap("backspace");
|
||||
// robot.typeString(keyInfo);
|
||||
// robot.keyTap("shift");
|
||||
// robot.keyTap("shift");
|
||||
} else if (msg.type == 3) {
|
||||
try {
|
||||
if (msg.from == 0) {
|
||||
@ -368,6 +352,11 @@ async function beginInvoke(msg, ws) {
|
||||
} catch {
|
||||
console.log("open devtools error");
|
||||
}
|
||||
try{
|
||||
invoke_window.openDevTools();
|
||||
} catch {
|
||||
console.log("open devtools error");
|
||||
}
|
||||
} else if (msg.type == 7) {
|
||||
// 获得当前页面Cookies
|
||||
try{
|
||||
@ -383,9 +372,6 @@ async function beginInvoke(msg, ws) {
|
||||
|
||||
const WebSocket = require('ws');
|
||||
const {all} = require("express/lib/application");
|
||||
let socket_window = null;
|
||||
let socket_start = null;
|
||||
let socket_flowchart = null;
|
||||
let wss = new WebSocket.Server({port: websocket_port});
|
||||
wss.on('connection', function (ws) {
|
||||
ws.on('message', async function (message, isBinary) {
|
||||
@ -521,7 +507,7 @@ function handleOpenBrowser(event, lang = "en", user_data_folder = "", mobile = f
|
||||
}
|
||||
|
||||
function handleOpenInvoke(event, lang = "en") {
|
||||
const window = new BrowserWindow({icon: iconPath});
|
||||
invoke_window = new BrowserWindow({icon: iconPath});
|
||||
let url = "";
|
||||
language = lang;
|
||||
if (lang == "en") {
|
||||
@ -530,10 +516,10 @@ function handleOpenInvoke(event, lang = "en") {
|
||||
url = server_address + `/taskGrid/taskList.html?type=1&wsport=${websocket_port}&backEndAddressServiceWrapper=` + server_address + "&lang=zh";
|
||||
}
|
||||
// and load the index.html of the app.
|
||||
window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' });
|
||||
window.maximize();
|
||||
invoke_window.loadURL(url, { extraHeaders: 'pragma: no-cache\n' });
|
||||
invoke_window.maximize();
|
||||
mainWindow.hide();
|
||||
window.on('close', function (event) {
|
||||
invoke_window.on('close', function (event) {
|
||||
mainWindow.show();
|
||||
});
|
||||
}
|
||||
|
@ -8,6 +8,10 @@
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
label{
|
||||
margin-left: 2px;
|
||||
}
|
||||
|
||||
div.node {
|
||||
height: 45px;
|
||||
width: 150px;
|
||||
|
@ -580,7 +580,14 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
|
||||
</div>
|
||||
</div>
|
||||
<div style="margin-top:5px">
|
||||
<label>Seconds <b>after executed</b> (Can be set to a decimal, such as 0.5):</label>
|
||||
<label>Wait for the following elements to appear <b>before</b> executing:</label>
|
||||
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
|
||||
placeholder="Enter the XPath of the element to wait for, leave blank to skip waiting"></textarea>
|
||||
<label style="margin-top:5px">In which iframe is the element located? Set to 0 if the element is not inside an iframe:</label>
|
||||
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
|
||||
<label style="margin-top:5px">Maximum waiting time for element appearance (in seconds):</label>
|
||||
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
|
||||
<label style="margin-top:5px">Wait seconds <b>after</b> execution (can set decimal values, e.g., 0.5):</label>
|
||||
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
||||
<label>Wait Type</label>
|
||||
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">
|
||||
|
@ -625,31 +625,7 @@ document.oncontextmenu = function() {
|
||||
return false;
|
||||
} //屏蔽右键菜单
|
||||
/**
 * Page-wide keyboard shortcuts.
 *
 *   Delete (46)        - delete the currently selected node, if there is one
 *   Ctrl/Cmd + S (83)  - click the #save button
 *   F5 (116)           - reload the page
 *   F12 (123)          - ask the backend, over WebSocket, to open DevTools
 *
 * @param {KeyboardEvent} e - the keydown event.
 * @returns {boolean|undefined} true after triggering a save, otherwise undefined.
 */
document.onkeydown = function(e) {
    // `typeof` never throws, so this handler is safe even when the page has
    // not declared the global `nowNode`.
    const hasSelectedNode = typeof nowNode !== "undefined" && nowNode != null;
    if (hasSelectedNode && e.keyCode === 46) {
        // Delete key: remove the selected element (no confirmation dialog).
        deleteElement();
        return;
    }

    const currKey = e.keyCode || e.which || e.charCode;
    if (currKey === 83 && (e.ctrlKey || e.metaKey)) {
        // Ctrl+S / Cmd+S: save the task.
        $('#save').click();
        return true;
    }
    if (currKey === 116) {
        // F5: reload.
        location.reload();
        return;
    }
    if (currKey === 123) {
        // F12: the page cannot open DevTools itself, so it sends a request to
        // the backend WebSocket server whose port is given in the URL.
        console.log("打开devtools")
        const command = new WebSocket("ws://localhost:" + getUrlParam("wsport"));
        command.onopen = function() {
            const message = {
                type: 6, // message type 6: request to open DevTools
            };
            this.send(JSON.stringify(message));
        };
    }
};
|
||||
|
||||
|
||||
function inputDelete(e) {
|
||||
if (e.keyCode == 46) {
|
||||
|
@ -580,9 +580,16 @@
|
||||
</div>
|
||||
</div>
|
||||
<div style="margin-top:5px">
|
||||
<label><b>执行后</b>等待秒数(所有等待时间均可设置为小数,如0.5):</label>
|
||||
<label>操作<b>执行前</b>等待以下元素出现:</label>
|
||||
<textarea onkeydown="inputDelete(event)" class="form-control" style="min-height: 30px" v-model='nowNode["parameters"]["waitElement"]'
|
||||
placeholder="填写要等待出现元素的XPath,不填写则不等待"></textarea>
|
||||
<label style="margin-top:5px">要等待的元素在页面第几个iframe中,0表示元素不在iframe中:</label>
|
||||
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementIframeIndex']" type="number" required></input>
|
||||
<label style="margin-top:5px">元素出现的最长等待时间(秒):</label>
|
||||
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['waitElementTime']" type="number" required></input>
|
||||
<label style="margin-top:5px">操作<b>执行后</b>等待秒数(所有等待时间均可设置为小数,如0.5):</label>
|
||||
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
||||
<label>等待类型</label>
|
||||
<label style="margin-top:5px">等待类型</label>
|
||||
<select v-model='list.nl[index.nowNodeIndex]["parameters"]["waitType"]' class="form-control">
|
||||
<option value = 0>固定等待(设置等10秒就等10秒)</option>
|
||||
<option value = 1>随机等待(设置等10秒会随机等10×0.5 - 10 × 1.5 秒)</option>
|
||||
|
@ -75,3 +75,34 @@ function isValidMySQLTableName(tableName) {
|
||||
return pattern.test(tableName);
|
||||
}
|
||||
|
||||
/**
 * Page-wide keyboard shortcuts shared by pages that load this script.
 *
 *   Delete (46)        - delete the currently selected node, if there is one
 *   Ctrl/Cmd + S (83)  - click the #save button
 *   F5 (116)           - reload the page
 *   F12 (123)          - ask the backend, over WebSocket, to open DevTools
 *
 * @param {KeyboardEvent} e - the keydown event.
 * @returns {boolean|undefined} true after triggering a save, otherwise undefined.
 */
document.onkeydown = function(e) {
    // Not every page declares the global `nowNode`. `typeof` never throws, so
    // it replaces the previous try/catch probe, whose catch binding shadowed
    // the event object and logged a ReferenceError on every key press.
    const hasSelectedNode = typeof nowNode !== "undefined" && Boolean(nowNode);
    if (hasSelectedNode && e.keyCode === 46) {
        // Delete key: remove the selected element (no confirmation dialog).
        deleteElement();
        return;
    }

    const currKey = e.keyCode || e.which || e.charCode;
    if (currKey === 83 && (e.ctrlKey || e.metaKey)) {
        // Ctrl+S / Cmd+S: save the task.
        $('#save').click();
        return true;
    }
    if (currKey === 116) {
        // F5: reload.
        location.reload();
        return;
    }
    if (currKey === 123) {
        // F12: the page cannot open DevTools itself, so it sends a request to
        // the backend WebSocket server whose port is given in the URL.
        console.log("打开devtools")
        const command = new WebSocket("ws://localhost:" + getUrlParam("wsport"));
        command.onopen = function() {
            const message = {
                type: 6, // message type 6: request to open DevTools
            };
            this.send(JSON.stringify(message));
        };
    }
};
|
||||
|
@ -146,6 +146,9 @@ function addParameters(t) {
|
||||
beforeJSWaitTime: 0, //执行前js等待时间
|
||||
afterJS: "", //执行后执行的js
|
||||
afterJSWaitTime: 0, //执行后js等待时间
|
||||
waitElement: "", //等待元素
|
||||
waitElementTime: 10, //等待元素时间
|
||||
waitElementIframeIndex: 0, //等待元素在第几个iframe中
|
||||
}; //公共参数处理
|
||||
if (t.option == 1) {
|
||||
t["parameters"]["url"] = "about:blank";
|
||||
|
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/183.json
Normal file
1
ElectronJS/tasks/183.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":183,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/16/2023, 1:55:02 AM","update_time":"7/16/2023, 2:02:09 AM","version":"0.3.6","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"//*[@id=\"service-2017\"]/div[1]/ol/li[1]","waitElementTime":10,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"/html/body/div[4]/div[1]","allXPaths":["/html/body/div[4]/div[1]","//div[contains(., 
'')]","//DIV[@class='w']","/html/body/div[last()-6]/div"],"exampleValues":[{"num":0,"value":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n ;0\n 我的购物车\n \n \n \n 平板電腦爆款耳機手機數據線年貨節\n \n 领券中心今日推荐\n "}],"unique_index":"7c04qey9fkllk4b56jd","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//div[123]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[]}}]}
|
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
||||
"justMyCode": false,
|
||||
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||
"args": ["--id", "[84]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
|
||||
"args": ["--id", "[54]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
|
||||
}
|
||||
]
|
||||
}
|
@ -1,5 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# import atexit
|
||||
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
|
||||
from myChrome import MyChrome
|
||||
from threading import Thread, Event
|
||||
from PIL import Image
|
||||
from commandline_config import Config
|
||||
import os
|
||||
import csv
|
||||
from openpyxl import load_workbook, Workbook
|
||||
import random
|
||||
from selenium.webdriver import ActionChains
|
||||
from selenium.webdriver.support.ui import Select
|
||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.common.exceptions import NoSuchElementException
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from pynput.keyboard import Key, Listener
|
||||
from datetime import datetime
|
||||
import io # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
@ -19,39 +42,16 @@ from lxml import etree
|
||||
import onnxruntime
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
# import undetected_chromedriver as uc
|
||||
from pynput.keyboard import Key, Listener
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import NoSuchElementException
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
|
||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
from selenium.webdriver.support.ui import Select
|
||||
from selenium.webdriver import ActionChains
|
||||
from selenium.webdriver.common.by import By
|
||||
import random
|
||||
# import pandas as pd
|
||||
from openpyxl import load_workbook, Workbook
|
||||
# import numpy
|
||||
import csv
|
||||
import os
|
||||
from commandline_config import Config
|
||||
# import pytesseract
|
||||
from PIL import Image
|
||||
# import uuid
|
||||
from threading import Thread, Event
|
||||
from myChrome import MyChrome
|
||||
if sys.platform != "darwin":
|
||||
from myChrome import MyUCChrome
|
||||
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
|
||||
desired_capabilities = DesiredCapabilities.CHROME
|
||||
desired_capabilities["pageLoadStrategy"] = "none"
|
||||
|
||||
|
||||
class BrowserThread(Thread):
|
||||
def __init__(self, browser_t, id, service, version, event, saveName, config):
|
||||
Thread.__init__(self)
|
||||
@ -83,7 +83,8 @@ class BrowserThread(Thread):
|
||||
if not os.path.exists("Data/Task_" + str(i)):
|
||||
os.mkdir("Data/Task_" + str(i))
|
||||
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName):
|
||||
os.mkdir("Data/Task_" + str(i) + "/" + self.saveName) # 创建保存文件夹用来保存截图
|
||||
os.mkdir("Data/Task_" + str(i) + "/" +
|
||||
self.saveName) # 创建保存文件夹用来保存截图
|
||||
self.getDataStep = 0
|
||||
self.startSteps = 0
|
||||
try:
|
||||
@ -94,11 +95,15 @@ class BrowserThread(Thread):
|
||||
except:
|
||||
pass
|
||||
if self.startSteps != 0:
|
||||
print("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为", self.startSteps, "条。")
|
||||
print("In this mode, task ID", self.id, "will start from the last step, before we already collected", self.startSteps, " items.")
|
||||
print("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
|
||||
self.startSteps, "条。")
|
||||
print("In this mode, task ID", self.id,
|
||||
"will start from the last step, before we already collected", self.startSteps, " items.")
|
||||
else:
|
||||
print("此模式下,任务ID", self.id, "将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
print("In this mode, task ID", self.id, "will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||
print("此模式下,任务ID", self.id,
|
||||
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
print("In this mode, task ID", self.id,
|
||||
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||
stealth_path = driver_path[:driver_path.find(
|
||||
"chromedriver")] + "stealth.min.js"
|
||||
with open(stealth_path, 'r') as f:
|
||||
@ -181,7 +186,8 @@ class BrowserThread(Thread):
|
||||
except:
|
||||
self.outputParametersTypes.append("text")
|
||||
try:
|
||||
self.outputParametersRecord.append(bool(para["recordASField"]))
|
||||
self.outputParametersRecord.append(
|
||||
bool(para["recordASField"]))
|
||||
except:
|
||||
self.outputParametersRecord.append(True)
|
||||
# 文件叠加的时候不添加表头
|
||||
@ -203,11 +209,19 @@ class BrowserThread(Thread):
|
||||
iframe = node["parameters"]["iframe"]
|
||||
except:
|
||||
node["parameters"]["iframe"] = False
|
||||
|
||||
try:
|
||||
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
|
||||
node["parameters"]["xpath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["waitElementIframeIndex"] = int(
|
||||
node["parameters"]["waitElementIframeIndex"])
|
||||
except:
|
||||
node["parameters"]["waitElement"] = ""
|
||||
node["parameters"]["waitElementTime"] = 10
|
||||
node["parameters"]["waitElementIframeIndex"] = 0
|
||||
if node["option"] == 1: # 打开网页操作
|
||||
try:
|
||||
cookies = node["parameters"]["cookies"]
|
||||
@ -216,8 +230,10 @@ class BrowserThread(Thread):
|
||||
if node["option"] == 2: # 点击操作
|
||||
if node["parameters"]["useLoop"]:
|
||||
if self.task_version <= "0.3.5":
|
||||
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
print("您的任务版本号为" + self.task_version + ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
print("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 3: # 提取数据操作
|
||||
node["parameters"]["recordASField"] = 0
|
||||
paras = node["parameters"]["paras"]
|
||||
@ -231,7 +247,8 @@ class BrowserThread(Thread):
|
||||
except:
|
||||
para["iframe"] = False
|
||||
try:
|
||||
para["relativeXPath"] = lowercase_tags_in_xpath(para["relativeXPath"])
|
||||
para["relativeXPath"] = lowercase_tags_in_xpath(
|
||||
para["relativeXPath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
@ -258,8 +275,10 @@ class BrowserThread(Thread):
|
||||
elif node["option"] == 7: # 移动到元素
|
||||
if node["parameters"]["useLoop"]:
|
||||
if self.task_version <= "0.3.5":
|
||||
node["parameters"]["xpath"] = "" # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
print("您的任务版本号为" + self.task_version + ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
print("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
|
||||
def readFromExcel(self):
|
||||
if self.inputExcel == "":
|
||||
@ -267,8 +286,10 @@ class BrowserThread(Thread):
|
||||
try:
|
||||
workbook = load_workbook(self.inputExcel)
|
||||
except:
|
||||
print("读取Excel失败,将会使用默认参数执行任务,请检查文件路径是否正确:", os.path.abspath(self.inputExcel))
|
||||
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ", os.path.abspath(self.inputExcel))
|
||||
print("读取Excel失败,将会使用默认参数执行任务,请检查文件路径是否正确:",
|
||||
os.path.abspath(self.inputExcel))
|
||||
print("Failed to read Excel, will execute the task with default parameters, please check if the file path is correct: ",
|
||||
os.path.abspath(self.inputExcel))
|
||||
time.sleep(5)
|
||||
return 0
|
||||
|
||||
@ -365,14 +386,18 @@ class BrowserThread(Thread):
|
||||
# 写入数据
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt":
|
||||
file_name = "Data/Task_" + \
|
||||
str(self.id) + "/" + self.saveName + '.' + self.outputFormat
|
||||
write_to_csv(file_name, self.OUTPUT, self.outputParametersRecord)
|
||||
str(self.id) + "/" + self.saveName + \
|
||||
'.' + self.outputFormat
|
||||
write_to_csv(file_name, self.OUTPUT,
|
||||
self.outputParametersRecord)
|
||||
elif self.outputFormat == "xlsx":
|
||||
file_name = "Data/Task_" + \
|
||||
str(self.id) + "/" + self.saveName + '.xlsx'
|
||||
write_to_excel(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
|
||||
write_to_excel(
|
||||
file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
|
||||
elif self.outputFormat == "mysql":
|
||||
self.mysql.write_to_mysql(self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
|
||||
self.mysql.write_to_mysql(
|
||||
self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
|
||||
|
||||
self.OUTPUT = []
|
||||
self.log = ""
|
||||
@ -403,10 +428,12 @@ class BrowserThread(Thread):
|
||||
i = 0
|
||||
while True:
|
||||
# newBodyText = self.browser.page_source
|
||||
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=para["iframe"]).text
|
||||
newBodyText = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=para["iframe"]).text
|
||||
if newBodyText == bodyText:
|
||||
print("页面已检测不到新内容,停止滚动。")
|
||||
print("No new content detected on the page, stop scrolling.")
|
||||
print(
|
||||
"No new content detected on the page, stop scrolling.")
|
||||
break
|
||||
else:
|
||||
bodyText = newBodyText
|
||||
@ -493,13 +520,15 @@ class BrowserThread(Thread):
|
||||
output = exec(code)
|
||||
except Exception as e:
|
||||
print("执行下面的代码时出错:" + code, ",错误为:", e)
|
||||
print("Error executing the following code:" + code, ", error is:", e)
|
||||
print("Error executing the following code:" +
|
||||
code, ", error is:", e)
|
||||
elif int(codeMode) == 6:
|
||||
try:
|
||||
output = eval(code)
|
||||
except Exception as e:
|
||||
print("获得下面的代码返回值时出错:" + code, ",错误为:", e)
|
||||
print("Error executing and getting return value the following code:" + code, ", error is:", e)
|
||||
print(
|
||||
"Error executing and getting return value the following code:" + code, ", error is:", e)
|
||||
elif int(codeMode) == 1:
|
||||
self.recordLog("Execute System Call:" + code)
|
||||
self.recordLog("执行系统命令:" + code)
|
||||
@ -531,7 +560,8 @@ class BrowserThread(Thread):
|
||||
max_wait_time = int(paras["waitTime"])
|
||||
if codeMode == 2: # 使用循环的情况下,传入的clickPath就是实际的xpath
|
||||
try:
|
||||
loopPath = replace_field_values(loopPath, self.outputParameters)
|
||||
loopPath = replace_field_values(
|
||||
loopPath, self.outputParameters)
|
||||
elements = self.browser.find_elements(
|
||||
By.XPATH, loopPath, iframe=paras["iframe"])
|
||||
element = elements[index]
|
||||
@ -553,7 +583,8 @@ class BrowserThread(Thread):
|
||||
# print("The return value of operation <" + node["title"] + "> is: " + output)
|
||||
self.outputParameters[node["title"]] = output
|
||||
if recordASField:
|
||||
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
|
||||
line = new_line(self.outputParameters,
|
||||
self.maxViewLength, self.outputParametersRecord)
|
||||
self.OUTPUT.append(line)
|
||||
|
||||
def switchSelect(self, para, loopValue):
|
||||
@ -566,7 +597,8 @@ class BrowserThread(Thread):
|
||||
optionValue = loopValue.split("~")[index - 1]
|
||||
except:
|
||||
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
|
||||
print("Failed to get value, maybe because the index is out of range, will use the entire text value")
|
||||
print(
|
||||
"Failed to get value, maybe because the index is out of range, will use the entire text value")
|
||||
else:
|
||||
optionValue = loopValue
|
||||
optionMode = 1
|
||||
@ -632,10 +664,35 @@ class BrowserThread(Thread):
|
||||
|
||||
def executeNode(self, nodeId, loopValue="", loopPath="", index=0):
|
||||
node = self.procedure[nodeId]
|
||||
WebDriverWait(self.browser, 10).until
|
||||
# 等待元素出现才进行操作,10秒内未出现则报错
|
||||
(EC.visibility_of_element_located(
|
||||
(By.XPATH, node["parameters"]["xpath"])))
|
||||
# WebDriverWait(self.browser, 10).until
|
||||
# # 等待元素出现才进行操作,10秒内未出现则报错
|
||||
# (EC.visibility_of_element_located(
|
||||
# (By.XPATH, node["parameters"]["xpath"])))
|
||||
# Optionally wait for a user-specified element to appear before this
# operation runs. The wait is best-effort: on any failure the problem is
# reported and the operation still executes.
waitElement = ""  # bound up front so the except handler can always read it
switchedToIframe = False
try:
    waitElement = node["parameters"].get("waitElement", "")
    if waitElement != "":
        waitElement = replace_field_values(
            waitElement, self.outputParameters)
        waitElementTime = float(
            node["parameters"].get("waitElementTime", 10))
        # The key must be "waitElementIframeIndex": that is the name the
        # task designer saves and the preprocessing step normalizes.
        # Reading any other key raises KeyError and skips the wait.
        waitElementIframeIndex = int(
            node["parameters"].get("waitElementIframeIndex", 0))
        print("等待元素出现:", waitElement)
        print("Waiting for element to appear:", waitElement)
        if waitElementIframeIndex > 0:
            # The index is 1-based; 0 means the element is not in an iframe.
            iframes = self.browser.find_elements(
                By.CSS_SELECTOR, "iframe", iframe=False)
            iframe = iframes[waitElementIframeIndex - 1]
            self.browser.switch_to.frame(iframe)
            switchedToIframe = True
        WebDriverWait(self.browser, waitElementTime).until(
            EC.presence_of_element_located((By.XPATH, waitElement))
        )
except Exception as e:
    if waitElement != "":
        print("等待元素出现超时:", waitElement, ",将继续执行。")
        print("Timeout waiting for element to appear:",
              waitElement, ", will continue to execute.")
    print(e)
    self.recordLog("Wait element not found")
finally:
    # Always return to the top-level document, including after a timeout,
    # so the operation that follows does not run inside the iframe.
    if switchedToIframe:
        self.browser.switch_to.default_content()
|
||||
|
||||
# 根据不同选项执行不同操作
|
||||
if node["option"] == 0 or node["option"] == 10: # root操作,条件分支操作
|
||||
@ -714,7 +771,8 @@ class BrowserThread(Thread):
|
||||
continue
|
||||
elif tType == 2: # 当前页面包含元素
|
||||
try:
|
||||
xpath = replace_field_values(cnode["parameters"]["value"], self.outputParameters)
|
||||
xpath = replace_field_values(
|
||||
cnode["parameters"]["value"], self.outputParameters)
|
||||
if self.browser.find_element(By.XPATH, xpath, iframe=cnode["parameters"]["iframe"]):
|
||||
executeBranchId = i
|
||||
break
|
||||
@ -722,7 +780,8 @@ class BrowserThread(Thread):
|
||||
continue
|
||||
elif tType == 3: # 当前循环元素包括文本
|
||||
try:
|
||||
value = replace_field_values(cnode["parameters"]["value"], self.outputParameters)
|
||||
value = replace_field_values(
|
||||
cnode["parameters"]["value"], self.outputParameters)
|
||||
if loopElement.text.find(value) >= 0:
|
||||
executeBranchId = i
|
||||
break
|
||||
@ -730,7 +789,8 @@ class BrowserThread(Thread):
|
||||
continue
|
||||
elif tType == 4: # 当前循环元素包括元素
|
||||
try:
|
||||
xpath = replace_field_values(cnode["parameters"]["value"][1:], self.outputParameters)
|
||||
xpath = replace_field_values(
|
||||
cnode["parameters"]["value"][1:], self.outputParameters)
|
||||
if loopElement.find_element(By.XPATH, xpath):
|
||||
executeBranchId = i
|
||||
break
|
||||
@ -782,7 +842,8 @@ class BrowserThread(Thread):
|
||||
finished = False
|
||||
# newBodyText = self.browser.page_source
|
||||
# newBodyText = self.browser.find_element(By.XPATH, "//body").text
|
||||
newBodyText = self.browser.find_element(By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
|
||||
newBodyText = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=node["parameters"]["iframe"]).text
|
||||
if node["parameters"]["exitCount"] == 0:
|
||||
if newBodyText == bodyText: # 如果页面内容无变化
|
||||
print("页面已检测不到新内容,停止循环。")
|
||||
@ -790,7 +851,6 @@ class BrowserThread(Thread):
|
||||
finished = True
|
||||
break
|
||||
else:
|
||||
if node["parameters"]["exitCount"] == 0:
|
||||
print("检测到页面变化,继续循环。")
|
||||
print("Page changed detected, continue loop.")
|
||||
bodyText = newBodyText
|
||||
@ -1150,7 +1210,8 @@ class BrowserThread(Thread):
|
||||
replaced_text = replaced_text.split("~")[index - 1]
|
||||
except:
|
||||
print("取值失败,可能是因为取值索引超出范围,将使用整个文本值")
|
||||
print("Failed to get value, maybe because the index is out of range, will use the entire text value")
|
||||
print(
|
||||
"Failed to get value, maybe because the index is out of range, will use the entire text value")
|
||||
textbox.send_keys(replaced_text)
|
||||
if value.lower().find("<enter>") >= 0:
|
||||
textbox.send_keys(Keys.ENTER)
|
||||
@ -1226,7 +1287,8 @@ class BrowserThread(Thread):
|
||||
pass
|
||||
except Exception as e:
|
||||
print("点击元素失败:" + path, ",请尝试将点击类型改为JavaScript点击后重试。")
|
||||
print("Failed to click element:" + path, ", please try to change the click type to JavaScript Click.")
|
||||
print("Failed to click element:" + path,
|
||||
", please try to change the click type to JavaScript Click.")
|
||||
print(e)
|
||||
self.Log(e)
|
||||
self.recordLog(str(e))
|
||||
@ -1374,7 +1436,8 @@ class BrowserThread(Thread):
|
||||
# 使用Pillow库打开截图,并转换为灰度图像
|
||||
image = Image.open(screenshot_stream).convert('L')
|
||||
temp_name = "OCR_" + str(time.time()) + ".png"
|
||||
location = "Data/Task_" + str(self.id) + "/" + self.saveName + "/" + temp_name
|
||||
location = "Data/Task_" + \
|
||||
str(self.id) + "/" + self.saveName + "/" + temp_name
|
||||
image.save(location)
|
||||
ocr = DdddOcr()
|
||||
with open(location, 'rb') as f:
|
||||
@ -1498,7 +1561,8 @@ class BrowserThread(Thread):
|
||||
content = pageHTML.xpath(full_path)
|
||||
except:
|
||||
content = []
|
||||
elif not relativeXPath.startswith("/"): # 如果是id()这种形式,不需要包/html/body
|
||||
# 如果是id()这种形式,不需要包/html/body
|
||||
elif not relativeXPath.startswith("/"):
|
||||
try:
|
||||
content = loopElementHTML.xpath(xpath)
|
||||
except:
|
||||
@ -1507,7 +1571,8 @@ class BrowserThread(Thread):
|
||||
content = loopElementHTML.xpath(
|
||||
"/html/body/" + loopElementHTML[0][0].tag + xpath)
|
||||
else:
|
||||
if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body
|
||||
# 如果是id()或(//div)[1]这种形式,不需要包/html/body
|
||||
if xpath.find("/body") < 0 and xpath.startswith("/"):
|
||||
xpath = "/html/body" + xpath
|
||||
content = pageHTML.xpath(xpath)
|
||||
if len(content) > 0:
|
||||
@ -1517,7 +1582,8 @@ class BrowserThread(Thread):
|
||||
for result in content if result.strip())
|
||||
if p["nodeType"] == 2:
|
||||
base_url = self.browser.current_url
|
||||
content = urljoin(base_url, content) # 合并链接相对路径为绝对路径
|
||||
# 合并链接相对路径为绝对路径
|
||||
content = urljoin(base_url, content)
|
||||
else:
|
||||
content = p["default"]
|
||||
if not self.dataNotFoundKeys[p["name"]]:
|
||||
@ -1642,7 +1708,8 @@ class BrowserThread(Thread):
|
||||
self.execute_code(
|
||||
2, p["afterJS"], p["afterJSWaitTime"], element, iframe=p["iframe"]) # 执行后置JS
|
||||
if para["recordASField"] > 0:
|
||||
line = new_line(self.outputParameters, self.maxViewLength, self.outputParametersRecord)
|
||||
line = new_line(self.outputParameters,
|
||||
self.maxViewLength, self.outputParametersRecord)
|
||||
self.OUTPUT.append(line)
|
||||
# rt.end()
|
||||
|
||||
@ -1675,8 +1742,10 @@ if __name__ == '__main__':
|
||||
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||
# MacOS需要用option而不是options!
|
||||
option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||
option.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
|
||||
options.add_extension("EasySpider.app/Contents/Resources/app/XPathHelper.crx")
|
||||
option.add_extension(
|
||||
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
|
||||
options.add_extension(
|
||||
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
|
||||
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
|
||||
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||
# # MacOS需要用option而不是options!
|
||||
@ -1684,7 +1753,8 @@ if __name__ == '__main__':
|
||||
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
||||
print(driver_path)
|
||||
if c.config_folder == "":
|
||||
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
|
||||
c.config_folder = os.path.expanduser(
|
||||
"~/Library/Application Support/EasySpider/")
|
||||
# print("Config folder for MacOS:", c.config_folder)
|
||||
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
||||
print("Finding chromedriver in EasySpider",
|
||||
@ -1780,6 +1850,11 @@ if __name__ == '__main__':
|
||||
options.add_argument(
|
||||
"--disable-blink-features=AutomationControlled") # TMALL 反扒
|
||||
|
||||
options.add_argument('-ignore-certificate-errors')
|
||||
options.add_argument('-ignore -ssl-errors')
|
||||
option.add_argument('-ignore-certificate-errors')
|
||||
option.add_argument('-ignore -ssl-errors')
|
||||
|
||||
threads = []
|
||||
for i in c.id:
|
||||
# print(options)
|
||||
@ -1876,7 +1951,6 @@ if __name__ == '__main__':
|
||||
# print("您的操作系统不支持暂停功能。")
|
||||
# print("Your operating system does not support the pause function.")
|
||||
|
||||
|
||||
# print("线程长度:", len(threads) )
|
||||
|
||||
for thread in threads:
|
||||
|
@ -3,7 +3,6 @@ requests==2.31.0
|
||||
selenium==4.5.0
|
||||
pyinstaller==5.9.0
|
||||
Pillow==9.4.0
|
||||
pytesseract==0.3.10
|
||||
openpyxl==3.1.2
|
||||
pymysql==1.1.0
|
||||
lxml==4.9.2
|
||||
|
Loading…
x
Reference in New Issue
Block a user