加快采集速度+键盘硬回车模拟

This commit is contained in:
naibo 2023-07-01 21:28:26 +08:00
parent 71c751a8cd
commit 88691b9e4e
23 changed files with 1508 additions and 388 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -235,7 +235,7 @@
<option :value = 11>Selected text of the current select box</option>
</select>
<div v-if='paras.parameters[paraIndex]["contentType"] == 9'>
<label>JavaScript Code: </label>
<label>JavaScript Code (Use Field["FieldName"] to input the last extracted value of a field): </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2"
placeholder='The element should be represented by arguments[0]. Here is an example: return arguments[0].innerText + "US Dollar". This code extracts the innerText of the element and appends "US Dollar" to it.' v-model='paras.parameters[paraIndex]["JS"]'></textarea>
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
@ -277,7 +277,7 @@
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>Use text inside the Loop (If not checked, the text entered each time will be the text inside the "Input Value" textbox below. If checked, the text set within the loop will be used)</p>
</div>
<div v-if='!useLoop'>
<label>Input Value (Use Field["FieldName"] to input the last extracted value of a field):</label>
<label>Input Value (Use Field["FieldName"] to input the last extracted value of a field, Use &lt;enter&gt; or &lt;ENTER&gt; to simulate pressing the Enter key):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
</div>
@ -313,7 +313,7 @@
</select>
<div>
<label>Code: </label>
<label>Code (Use Field["FieldName"] to input the last extracted value of a field): </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="Please input a JavaScript command or a system command. For example, document.body.innerText = '1' is an example of a JavaScript command, and python D:/test.py is an example of a system command. If you choose to execute a JavaScript script for the current iteration, you can represent the element of the current iteration using arguments[0]. For instance, arguments[0].style.color = 'blue' sets the color of the element in the current iteration to blue."></textarea>
<p style="margin-top: 15px">Whether to record the output/return value of the execution as a field: </p>
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
@ -387,7 +387,7 @@
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="One text/URL per line" v-model='nowNode["parameters"]["textList"]'></textarea>
</div>
<div v-else-if='parseInt(loopType) < 7'>
<label>Code:</label>
<label>Code (Use Field["FieldName"] to input the last extracted value of a field):</label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Continue the loop if the command return value is greater than 0 or evaluates to true; otherwise, stop the loop. For example, return document.body.scrollWidth > 1000 is an example of a JavaScript command return value, and python D:/test.py is an example of a system command return value."></textarea>
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
@ -463,7 +463,7 @@
</div>
<div style="margin-top:5px">
<label>Seconds <b>after executed</b>:</label>
<label>Seconds <b>after executed</b> (Can be set to a decimal, such as 0.5):</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
</div>
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">Confirm</button>

View File

@ -277,7 +277,7 @@
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>使用循环内的文本(不勾选则每次输入的文本为下方“输入值”文本框内的文本,勾选后会使用所在循环内设置的文本)</p>
</div>
<div v-if='!useLoop'>
<label>输入值用Field["字段名"]来输入某字段最后一次提取到的值):</label>
<label>输入值用Field["字段名"]来输入某字段提取到的最新,用&lt;enter&gt;&lt;ENTER&gt;表示模拟按下回车键</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
</div>
@ -313,7 +313,7 @@
</select>
<div>
<label>代码/脚本内容: </label>
<label>代码/脚本内容用Field["字段名"]来输入某字段提取到的最新值) </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="输入JS或系统命令document.body.innerText = '1' 或 python D:/test.py分别为JS命令和系统命令示例。如选择针对当前循环项的JS脚本则循环项元素用arguments[0]表示如arguments[0].style.color = 'blue'"></textarea>
<p style="margin-top: 15px">是否将执行后的输出/返回值作为字段记录:</p>
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
@ -383,11 +383,11 @@
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个XPath" v-model='nowNode["parameters"]["pathList"]'></textarea>
</div>
<div v-else-if='parseInt(loopType) < 5'>
<label>内容列表用Field["字段名"]来输入某字段最后一次提取到的值):</label>
<label>内容列表用Field["字段名"]来输入某字段提取到的最新值):</label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个文本/网址" v-model='nowNode["parameters"]["textList"]'></textarea>
</div>
<div v-else-if='parseInt(loopType) < 7'>
<label>代码/脚本内容: </label>
<label>代码/脚本内容用Field["字段名"]来输入某字段提取到的最新值) </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则继续循环否则停止循环。如return document.body.scrollWidth > 1000 或 python D:/test.py分别为JS命令和系统命令返回值示例。"></textarea>
<label>最长等待脚本执行时间0代表无限等待 </label>
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
@ -449,7 +449,7 @@
<textarea onkeydown="inputDelete(event)" required placeholder="如果是当前循环包含元素则输入相对元素的xpath如/div[2]/div[1]/img如果写相对路径需要写成/*//img即检测当前循环项所有的子孙元素是否存在img标签。" class="form-control" rows="3" v-model='nowNode["parameters"]["value"]'></textarea>
</div>
<div v-else-if='TClass > 0 && TClass < 7'>
<label>代码/脚本内容: </label>
<label>代码/脚本内容用Field["字段名"]来输入某字段提取到的最新值) </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则执行此分支内操作否则不执行。如return document.body.scrollWidth > 1000 或 python D:/test.py分别为JS命令和系统命令返回值示例。"></textarea>
<label>最长等待脚本执行时间0代表无限等待 </label>
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
@ -462,7 +462,7 @@
</div>
</div>
<div style="margin-top:5px">
<label><b>执行后</b>等待秒数:</label>
<label><b>执行后</b>等待秒数所有等待时间均可设置为小数如0.5</label>
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
</div>
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">确定</button>

View File

@ -1 +1 @@
{"id":104,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"6/28/2023, 12:13:15 AM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":
"","loopType":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../../div[1]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
{"id":104,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/1/2023, 3:58:56 PM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0.5,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0.5,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths
":"","loopType":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../../div[1]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

View File

@ -1 +1 @@
{"id":105,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"6/28/2023, 12:22:26 AM","version":"0.3.3","containJudge":false,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":"","loopType"
:1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../a/span[2]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
{"id":105,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"7/1/2023, 3:55:40 PM","version":"0.3.3","containJudge":false,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":1.4,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1.1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":"","loopTyp
e":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../a/span[2]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

View File

@ -0,0 +1 @@
{"id":106,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/1/2023, 9:20:10 PM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"string","exampleValue":"iphone<enter>","value":"iphone<enter>"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"iphone<enter>","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text defcolor']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}}]}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -36,7 +36,6 @@ mac_chrome_path = "/Applications/Google Chrome.app/Contents/MacOS"
linux_chrome_path = "/opt/google/chrome"
if __name__ == "__main__":
driver_downloads = []
response = requests.get(chrome_driver_url)
if response.status_code == 200:

View File

@ -10,7 +10,7 @@
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args": ["--id", "[47]", "--read_type", "local", "--headless", "0"]
"args": ["--id", "[104]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[44]", "--headless", "0", "--user_data", "1"]
}
]

View File

@ -11,6 +11,7 @@ import base64
import hashlib
import time
import requests
from lxml import etree
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
@ -161,7 +162,19 @@ class BrowserThread(Thread):
self.dataNotFoundKeys[para["name"]] = False
self.OUTPUT[0].append(para["name"])
self.urlId = 0 # 全局记录变量
self.preprocess() # 预处理,优化提取数据流程
# 检测如果没有复杂的操作,优化提取数据流程
def preprocess(self):
    """Flag every extraction parameter as optimizable or not.

    Walks ``self.procedure`` and, for each node whose ``option`` is 3
    (the extract-data operation), stores a boolean under the
    ``"optimizable"`` key of every entry in ``parameters["paras"]``.
    An entry is optimizable only when it has no before/after JS
    (both are empty strings), its ``contentType`` is at most 1 and
    its ``nodeType`` is at most 2.  Nodes with any other ``option``
    are left untouched.  The dicts are modified in place; nothing is
    returned.
    """
    for step in self.procedure:
        if step["option"] != 3:  # only extract-data operations carry paras to flag
            continue
        for field in step["parameters"]["paras"]:
            # Same checks, same order, as a single boolean expression.
            field["optimizable"] = (
                field["beforeJS"] == ""
                and field["afterJS"] == ""
                and field["contentType"] <= 1
                and field["nodeType"] <= 2
            )
def run(self):
# 挨个执行程序
for i in range(len(self.links)):
@ -242,6 +255,12 @@ class BrowserThread(Thread):
if max_wait_time == 0:
max_wait_time = 999999
# print(codeMode, code)
pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
try:
replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), code)
except:
replaced_text = code
code = replaced_text
if int(codeMode) == 0:
self.recordLog("Execute JavaScript:" + code)
self.recordLog("执行JavaScript:" + code)
@ -391,9 +410,9 @@ class BrowserThread(Thread):
self.judgeExecute(node, loopValue, loopPath, index)
# 执行完之后进行等待
if node["option"] != 0 and node["option"] != 2:
if node["option"] != 0 and node["option"] != 2: # 点击元素操作单独定义等待时间操作
waitTime = 0.01 # 默认等待0.01秒
if node["parameters"]["wait"] >= 1:
if node["parameters"]["wait"] >= 0:
waitTime = node["parameters"]["wait"]
time.sleep(waitTime)
self.Log("Wait seconds after node executing: ", waitTime)
@ -536,10 +555,10 @@ class BrowserThread(Thread):
self.history["index"] # 计算历史记录变化差值
self.browser.execute_script(
'history.go(' + str(difference) + ')') # 回退历史记录
if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
else:
time.sleep(2)
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
# else:
# time.sleep(2)
# 切换历史记录等待2秒或者
self.Log("Change history back time or:",
node["parameters"]["historyWait"])
@ -573,10 +592,10 @@ class BrowserThread(Thread):
self.history["index"] # 计算历史记录变化差值
self.browser.execute_script(
'history.go(' + str(difference) + ')') # 回退历史记录
if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
else:
time.sleep(2)
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
time.sleep(node["parameters"]["historyWait"])
# else:
# time.sleep(2)
self.Log("Change history back time or:",
node["parameters"]["historyWait"])
self.browser.execute_script('window.stop()')
@ -638,7 +657,7 @@ class BrowserThread(Thread):
# 打开网页事件
def openPage(self, para, loopValue):
time.sleep(2) # 打开网页后强行等待至少2
time.sleep(1) # 打开网页后强行等待至少1
if len(self.browser.window_handles) > 1:
self.browser.switch_to.window(self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
self.browser.close()
@ -700,7 +719,7 @@ class BrowserThread(Thread):
# 键盘输入事件
def inputInfo(self, para, loopValue):
time.sleep(0.1) # 输入之前等待0.1秒
self.Log("Wait 1 second before input")
self.Log("Wait 0.1 second before input")
try:
textbox = self.browser.find_element(By.XPATH, para["xpath"])
# textbox.send_keys(Keys.CONTROL, 'a')
@ -720,10 +739,12 @@ class BrowserThread(Thread):
pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
try:
replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
replaced_text = re.sub('<enter>', '', replaced_text, flags=re.IGNORECASE)
except:
replaced_text = value
value = replaced_text
textbox.send_keys(value)
textbox.send_keys(replaced_text)
if value.lower().find("<enter>") >= 0:
textbox.send_keys(Keys.ENTER)
self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], textbox) # 执行后置js
# global bodyText # 每次执行点击输入元素和打开网页操作后需要更新bodyText
self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
@ -928,81 +949,130 @@ class BrowserThread(Thread):
# 提取数据事件
def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
for p in para["paras"]: # 加并行处理
content = ""
if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL去找元素
pageHTML = etree.HTML(self.browser.page_source)
loopElementOuterHTML = loopElement.get_attribute('outerHTML')
loopElementHTML = etree.HTML(loopElementOuterHTML)
for p in para["paras"]:
if p["optimizable"]:
try:
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
p["relativeXPath"] = p["relativeXPath"].lower()
if p["nodeType"] == 2:
xpath = p["relativeXPath"] + "/@href"
elif p["contentType"] == 1:
xpath = p["relativeXPath"] + "/text()"
elif p["contentType"] == 0:
xpath = p["relativeXPath"] + "//text()"
if p["relative"]:
# if p["relativeXPath"] == "":
# content = [loopElementHTML]
# else:
if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
full_path = "(" + parentPath + \
xpath + ")" + \
"[" + str(index + 1) + "]"
content = pageHTML.xpath(full_path)
else:
if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
full_path = "(" + parentPath + \
p["relativeXPath"] + ")" + \
"[" + str(index + 1) + "]"
element = self.browser.find_element(By.XPATH, full_path)
content = loopElementHTML.xpath("/html/body/" + loopElementHTML[0][0].tag + xpath)
else:
if xpath.find("/html/body") < 0:
xpath = "/html/body" + xpath
content = pageHTML.xpath(xpath)
if len(content) > 0:
# html = etree.tostring(content[0], encoding='utf-8').decode('utf-8')
# 拼接所有文本内容并去掉两边的空白
content = ' '.join(result.strip() for result in content if result.strip())
else:
content = p["default"]
try:
if not self.dataNotFoundKeys[p["name"]]:
print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
self.dataNotFoundKeys[p["name"]] = True
self.recordLog('Element %s not found, use default' % p["relativeXPath"])
except:
pass
except Exception as e:
print(e)
self.outputParameters[p["name"]] = content
# 对于不能优化的操作使用selenium执行
for p in para["paras"]:
if not p["optimizable"]:
content = ""
if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL去找元素
try:
p["relativeXPath"] = p["relativeXPath"].lower()
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
else:
if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
full_path = "(" + parentPath + \
p["relativeXPath"] + ")" + \
"[" + str(index + 1) + "]"
element = self.browser.find_element(By.XPATH, full_path)
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
else:
element = self.browser.find_element(By.XPATH, p["relativeXPath"])
except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
# print(p)
try:
content = p["default"]
except Exception as e:
content = ""
self.outputParameters[p["name"]] = content
try:
if not self.dataNotFoundKeys[p["name"]]:
print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
self.dataNotFoundKeys[p["name"]] = True
self.recordLog('Element %s not found, use default' % p["relativeXPath"])
except:
pass
continue
except TimeoutException: # 超时的时候设置超时值
self.Log('time out after set seconds when getting data')
self.recordLog('time out after set seconds when getting data')
self.browser.execute_script('window.stop()')
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
else:
element = self.browser.find_element(By.XPATH, p["relativeXPath"])
except (NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
# print(p)
try:
content = p["default"]
except Exception as e:
content = ""
self.outputParameters[p["name"]] = content
try:
if not self.dataNotFoundKeys[p["name"]]:
print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
self.dataNotFoundKeys[p["name"]] = True
self.recordLog('Element %s not found, use default' % p["relativeXPath"])
except:
pass
continue
except TimeoutException: # 超时的时候设置超时值
self.Log('time out after set seconds when getting data')
self.recordLog('time out after set seconds when getting data')
self.browser.execute_script('window.stop()')
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
else:
element = self.browser.find_element(By.XPATH, p["relativeXPath"])
# rt.end()
else:
element = self.browser.find_element(By.XPATH, "//body")
try:
self.execute_code(2, p["beforeJS"], p["beforeJSWaitTime"], element) # 执行前置js
content = self.get_content(p, element)
except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
self.recordLog('StaleElementReferenceException: '+p["relativeXPath"])
time.sleep(3)
element = self.browser.find_element(By.XPATH, p["relativeXPath"])
# rt.end()
else:
element = self.browser.find_element(By.XPATH, "//body")
try:
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
self.recordLog('StaleElementReferenceException: loopElement')
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
self.recordLog(
'StaleElementReferenceException: loopElement+relativeXPath')
else:
element = self.browser.find_element(
By.XPATH, p["relativeXPath"])
self.recordLog('StaleElementReferenceException: relativeXPath')
self.execute_code(2, p["beforeJS"], p["beforeJSWaitTime"], element) # 执行前置js
content = self.get_content(p, element)
except StaleElementReferenceException:
except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
self.recordLog('StaleElementReferenceException: '+p["relativeXPath"])
continue # 再出现类似问题直接跳过
self.outputParameters[p["name"]] = content
self.execute_code(2, p["afterJS"], p["afterJSWaitTime"], element) # 执行后置JS
time.sleep(3)
try:
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
self.recordLog('StaleElementReferenceException: loopElement')
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
self.recordLog(
'StaleElementReferenceException: loopElement+relativeXPath')
else:
element = self.browser.find_element(
By.XPATH, p["relativeXPath"])
self.recordLog('StaleElementReferenceException: relativeXPath')
content = self.get_content(p, element)
except StaleElementReferenceException:
self.recordLog('StaleElementReferenceException: '+p["relativeXPath"])
continue # 再出现类似问题直接跳过
self.outputParameters[p["name"]] = content
self.execute_code(2, p["afterJS"], p["afterJSWaitTime"], element) # 执行后置JS
line = []
for value in self.outputParameters.values():
line.append(value)

35
ExecuteStage/test.py Normal file
View File

@ -0,0 +1,35 @@
from lxml import etree
# Scratch script: compare an element's direct text nodes ("/text()")
# with the text of the element and all of its descendants ("//text()")
# when queried through lxml XPath.
SAMPLE_HTML = """
<div>
123
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
</ul>
456
<div></div>
789
</div>
"""


def joined_text(tree, xpath):
    """Return the text nodes selected by *xpath*, stripped and space-joined.

    Text nodes that are empty after stripping are dropped.
    """
    return ' '.join(piece.strip() for piece in tree.xpath(xpath) if piece.strip())


# Parse the HTML; the sample's outer <div> is then reachable as tree[0][0],
# which is why the queries below are rooted at /html/body/<tag>.
tree = etree.HTML(SAMPLE_HTML)
container_path = "/html/body/" + tree[0][0].tag

# Direct text nodes of the outer element only.
print(joined_text(tree, container_path + "/text()"))
# Text nodes of the outer element and every descendant.
print(joined_text(tree, container_path + "//text()"))