mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-22 23:24:22 +08:00
加快采集速度+键盘硬回车模拟
This commit is contained in:
parent
71c751a8cd
commit
88691b9e4e
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_windows_x64/tasks/49.json
Normal file
1
.temp_to_pub/EasySpider_windows_x64/tasks/49.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_windows_x64/tasks/50.json
Normal file
1
.temp_to_pub/EasySpider_windows_x64/tasks/50.json
Normal file
File diff suppressed because one or more lines are too long
@ -235,7 +235,7 @@
|
|||||||
<option :value = 11>Selected text of the current select box</option>
|
<option :value = 11>Selected text of the current select box</option>
|
||||||
</select>
|
</select>
|
||||||
<div v-if='paras.parameters[paraIndex]["contentType"] == 9'>
|
<div v-if='paras.parameters[paraIndex]["contentType"] == 9'>
|
||||||
<label>JavaScript Code: </label>
|
<label>JavaScript Code (Use Field["FieldName"] to input the last extracted value of a field): </label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2"
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2"
|
||||||
placeholder='The element should be represented by arguments[0]. Here is an example: return arguments[0].innerText + "US Dollar". This code extracts the innerText of the element and appends "US Dollar" to it.' v-model='paras.parameters[paraIndex]["JS"]'></textarea>
|
placeholder='The element should be represented by arguments[0]. Here is an example: return arguments[0].innerText + "US Dollar". This code extracts the innerText of the element and appends "US Dollar" to it.' v-model='paras.parameters[paraIndex]["JS"]'></textarea>
|
||||||
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
|
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
|
||||||
@ -277,7 +277,7 @@
|
|||||||
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>Use text inside the Loop (If not checked, the text entered each time will be the text inside the "Input Value" textbox below. If checked, the text set within the loop will be used)</p>
|
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>Use text inside the Loop (If not checked, the text entered each time will be the text inside the "Input Value" textbox below. If checked, the text set within the loop will be used)</p>
|
||||||
</div>
|
</div>
|
||||||
<div v-if='!useLoop'>
|
<div v-if='!useLoop'>
|
||||||
<label>Input Value (Use Field["FieldName"] to input the last extracted value of a field):</label>
|
<label>Input Value (Use Field["FieldName"] to input the last extracted value of a field, Use <enter> or <ENTER> to simulate pressing the Enter key):</label>
|
||||||
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
|
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@ -313,7 +313,7 @@
|
|||||||
</select>
|
</select>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<label>Code: </label>
|
<label>Code (Use Field["FieldName"] to input the last extracted value of a field): </label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="Please input a JavaScript command or a system command. For example, document.body.innerText = '1' is an example of a JavaScript command, and python D:/test.py is an example of a system command. If you choose to execute a JavaScript script for the current iteration, you can represent the element of the current iteration using arguments[0]. For instance, arguments[0].style.color = 'blue' sets the color of the element in the current iteration to blue."></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="Please input a JavaScript command or a system command. For example, document.body.innerText = '1' is an example of a JavaScript command, and python D:/test.py is an example of a system command. If you choose to execute a JavaScript script for the current iteration, you can represent the element of the current iteration using arguments[0]. For instance, arguments[0].style.color = 'blue' sets the color of the element in the current iteration to blue."></textarea>
|
||||||
<p style="margin-top: 15px">Whether to record the output/return value of the execution as a field: </p>
|
<p style="margin-top: 15px">Whether to record the output/return value of the execution as a field: </p>
|
||||||
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
|
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
|
||||||
@ -387,7 +387,7 @@
|
|||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="One text/URL per line" v-model='nowNode["parameters"]["textList"]'></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="One text/URL per line" v-model='nowNode["parameters"]["textList"]'></textarea>
|
||||||
</div>
|
</div>
|
||||||
<div v-else-if='parseInt(loopType) < 7'>
|
<div v-else-if='parseInt(loopType) < 7'>
|
||||||
<label>Code:</label>
|
<label>Code (Use Field["FieldName"] to input the last extracted value of a field):</label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Continue the loop if the command return value is greater than 0 or evaluates to true; otherwise, stop the loop. For example, return document.body.scrollWidth > 1000 is an example of a JavaScript command return value, and python D:/test.py is an example of a system command return value."></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Continue the loop if the command return value is greater than 0 or evaluates to true; otherwise, stop the loop. For example, return document.body.scrollWidth > 1000 is an example of a JavaScript command return value, and python D:/test.py is an example of a system command return value."></textarea>
|
||||||
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
|
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
|
||||||
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
||||||
@ -463,7 +463,7 @@
|
|||||||
|
|
||||||
</div>
|
</div>
|
||||||
<div style="margin-top:5px">
|
<div style="margin-top:5px">
|
||||||
<label>Seconds <b>after executed</b>:</label>
|
<label>Seconds <b>after executed</b> (Can be set to a decimal, such as 0.5):</label>
|
||||||
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
||||||
</div>
|
</div>
|
||||||
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">Confirm</button>
|
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">Confirm</button>
|
||||||
|
@ -277,7 +277,7 @@
|
|||||||
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>使用循环内的文本(不勾选则每次输入的文本为下方“输入值”文本框内的文本,勾选后会使用所在循环内设置的文本)</p>
|
<p><input onkeydown="inputDelete(event)" type="checkbox" v-model='useLoop'></input>使用循环内的文本(不勾选则每次输入的文本为下方“输入值”文本框内的文本,勾选后会使用所在循环内设置的文本)</p>
|
||||||
</div>
|
</div>
|
||||||
<div v-if='!useLoop'>
|
<div v-if='!useLoop'>
|
||||||
<label>输入值(用Field["字段名"]来输入某字段最后一次提取到的值):</label>
|
<label>输入值(用Field["字段名"]来输入某字段提取到的最新值,用<enter>或<ENTER>表示模拟按下回车键):</label>
|
||||||
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
|
<input onkeydown="inputDelete(event)" class="form-control" v-model='nowNode["parameters"]["value"]'></input>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@ -313,7 +313,7 @@
|
|||||||
</select>
|
</select>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<label>代码/脚本内容: </label>
|
<label>代码/脚本内容(用Field["字段名"]来输入某字段提取到的最新值): </label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="输入JS或系统命令,如:document.body.innerText = '1' 或 python D:/test.py,分别为JS命令和系统命令示例。如选择针对当前循环项的JS脚本,则循环项元素用arguments[0]表示,如arguments[0].style.color = 'blue'"></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="2" v-model='nowNode["parameters"]["code"]' placeholder="输入JS或系统命令,如:document.body.innerText = '1' 或 python D:/test.py,分别为JS命令和系统命令示例。如选择针对当前循环项的JS脚本,则循环项元素用arguments[0]表示,如arguments[0].style.color = 'blue'"></textarea>
|
||||||
<p style="margin-top: 15px">是否将执行后的输出/返回值作为字段记录:</p>
|
<p style="margin-top: 15px">是否将执行后的输出/返回值作为字段记录:</p>
|
||||||
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
|
<p><select v-model='nowNode["parameters"]["recordASField"]' class="form-control">
|
||||||
@ -383,11 +383,11 @@
|
|||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个XPath" v-model='nowNode["parameters"]["pathList"]'></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个XPath" v-model='nowNode["parameters"]["pathList"]'></textarea>
|
||||||
</div>
|
</div>
|
||||||
<div v-else-if='parseInt(loopType) < 5'>
|
<div v-else-if='parseInt(loopType) < 5'>
|
||||||
<label>内容列表(用Field["字段名"]来输入某字段最后一次提取到的值):</label>
|
<label>内容列表(用Field["字段名"]来输入某字段提取到的最新值):</label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个文本/网址" v-model='nowNode["parameters"]["textList"]'></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" placeholder="每行一个文本/网址" v-model='nowNode["parameters"]["textList"]'></textarea>
|
||||||
</div>
|
</div>
|
||||||
<div v-else-if='parseInt(loopType) < 7'>
|
<div v-else-if='parseInt(loopType) < 7'>
|
||||||
<label>代码/脚本内容: </label>
|
<label>代码/脚本内容(用Field["字段名"]来输入某字段提取到的最新值): </label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则继续循环,否则停止循环。如:return document.body.scrollWidth > 1000 或 python D:/test.py,分别为JS命令和系统命令返回值示例。"></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则继续循环,否则停止循环。如:return document.body.scrollWidth > 1000 或 python D:/test.py,分别为JS命令和系统命令返回值示例。"></textarea>
|
||||||
<label>最长等待脚本执行时间(0代表无限等待): </label>
|
<label>最长等待脚本执行时间(0代表无限等待): </label>
|
||||||
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
||||||
@ -449,7 +449,7 @@
|
|||||||
<textarea onkeydown="inputDelete(event)" required placeholder="如果是当前循环包含元素,则输入相对元素的xpath(如/div[2]/div[1]/img,如果写相对路径,需要写成/*//img,即检测当前循环项所有的子孙元素是否存在img标签)。" class="form-control" rows="3" v-model='nowNode["parameters"]["value"]'></textarea>
|
<textarea onkeydown="inputDelete(event)" required placeholder="如果是当前循环包含元素,则输入相对元素的xpath(如/div[2]/div[1]/img,如果写相对路径,需要写成/*//img,即检测当前循环项所有的子孙元素是否存在img标签)。" class="form-control" rows="3" v-model='nowNode["parameters"]["value"]'></textarea>
|
||||||
</div>
|
</div>
|
||||||
<div v-else-if='TClass > 0 && TClass < 7'>
|
<div v-else-if='TClass > 0 && TClass < 7'>
|
||||||
<label>代码/脚本内容: </label>
|
<label>代码/脚本内容(用Field["字段名"]来输入某字段提取到的最新值): </label>
|
||||||
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则执行此分支内操作,否则不执行。如:return document.body.scrollWidth > 1000 或 python D:/test.py,分别为JS命令和系统命令返回值示例。"></textarea>
|
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="命令返回值大于0或为真则执行此分支内操作,否则不执行。如:return document.body.scrollWidth > 1000 或 python D:/test.py,分别为JS命令和系统命令返回值示例。"></textarea>
|
||||||
<label>最长等待脚本执行时间(0代表无限等待): </label>
|
<label>最长等待脚本执行时间(0代表无限等待): </label>
|
||||||
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
|
||||||
@ -462,7 +462,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div style="margin-top:5px">
|
<div style="margin-top:5px">
|
||||||
<label><b>执行后</b>等待秒数:</label>
|
<label><b>执行后</b>等待秒数(所有等待时间均可设置为小数,如0.5):</label>
|
||||||
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
<input onkeydown="inputDelete(event)" required type="number" class="form-control" v-model.number='list.nl[index.nowNodeIndex]["parameters"]["wait"]'></input>
|
||||||
</div>
|
</div>
|
||||||
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">确定</button>
|
<button class="btn btn-outline-primary" style="margin-top: 20px;" id="confirm">确定</button>
|
||||||
|
@ -1 +1 @@
|
|||||||
{"id":104,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"6/28/2023, 12:13:15 AM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":
"","loopType":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../../div[1]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
{"id":104,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/1/2023, 3:58:56 PM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0.5,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0.5,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths
":"","loopType":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../../div[1]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
@ -1 +1 @@
|
|||||||
{"id":105,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"6/28/2023, 12:22:26 AM","version":"0.3.3","containJudge":false,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":"","loopType"
:1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../a/span[2]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
{"id":105,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"7/1/2023, 3:55:40 PM","version":"0.3.3","containJudge":false,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"自定义参数_0","desc":"","type":"string","exampleValue":"自定义字段"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[4,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":4,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","wait":1.4,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1.1,"clickWay":0,"maxWaitTime":10,"paras":[],"allXPaths":"","loopTyp
e":1}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"自定义参数_0","desc":"","extractType":0,"relativeXPath":"*../a/span[2]","allXPaths":[],"exampleValues":[{"num":0,"value":"自定义字段"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
1
ElectronJS/tasks/106.json
Normal file
1
ElectronJS/tasks/106.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id":106,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/1/2023, 9:20:10 PM","version":"0.3.3","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"string","exampleValue":"iphone<enter>","value":"iphone<enter>"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"iphone<enter>","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text defcolor']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}}]}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@ -36,7 +36,6 @@ mac_chrome_path = "/Applications/Google Chrome.app/Contents/MacOS"
|
|||||||
linux_chrome_path = "/opt/google/chrome"
|
linux_chrome_path = "/opt/google/chrome"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
driver_downloads = []
|
driver_downloads = []
|
||||||
response = requests.get(chrome_driver_url)
|
response = requests.get(chrome_driver_url)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
|
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -10,7 +10,7 @@
|
|||||||
"program": "${file}",
|
"program": "${file}",
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": true,
|
"justMyCode": true,
|
||||||
"args": ["--id", "[47]", "--read_type", "local", "--headless", "0"]
|
"args": ["--id", "[104]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--id", "[44]", "--headless", "0", "--user_data", "1"]
|
// "args": ["--id", "[44]", "--headless", "0", "--user_data", "1"]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -11,6 +11,7 @@ import base64
|
|||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
import requests
|
import requests
|
||||||
|
from lxml import etree
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
@ -161,6 +162,18 @@ class BrowserThread(Thread):
|
|||||||
self.dataNotFoundKeys[para["name"]] = False
|
self.dataNotFoundKeys[para["name"]] = False
|
||||||
self.OUTPUT[0].append(para["name"])
|
self.OUTPUT[0].append(para["name"])
|
||||||
self.urlId = 0 # 全局记录变量
|
self.urlId = 0 # 全局记录变量
|
||||||
|
self.preprocess() # 预处理,优化提取数据流程
|
||||||
|
|
||||||
|
# 检测如果没有复杂的操作,优化提取数据流程
|
||||||
|
def preprocess(self):
    """Flag every extraction parameter that qualifies for the optimized path.

    Walks ``self.procedure`` and, for each "extract data" node
    (``option == 3``), stores a boolean under ``para["optimizable"]`` for
    every entry of ``node["parameters"]["paras"]``.  A parameter is
    optimizable only when it has no ``beforeJS`` / ``afterJS`` script,
    ``contentType <= 1`` and ``nodeType <= 2``.

    A parameter that lacks any of those keys is conservatively marked as
    not optimizable instead of raising ``KeyError``, so one incomplete
    parameter cannot abort the whole preprocessing pass.
    NOTE(review): presumably such parameters come from task files written
    by older versions -- confirm against the task schema.

    Returns:
        None. The parameter dicts are updated in place.
    """
    for node in self.procedure:
        if node["option"] != 3:  # only "extract data" nodes are inspected
            continue
        for para in node["parameters"]["paras"]:
            try:
                para["optimizable"] = (
                    para["beforeJS"] == ""
                    and para["afterJS"] == ""
                    and para["contentType"] <= 1
                    and para["nodeType"] <= 2
                )
            except KeyError:
                # Missing field: fall back to the regular extraction path.
                para["optimizable"] = False
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
# 挨个执行程序
|
# 挨个执行程序
|
||||||
@ -242,6 +255,12 @@ class BrowserThread(Thread):
|
|||||||
if max_wait_time == 0:
|
if max_wait_time == 0:
|
||||||
max_wait_time = 999999
|
max_wait_time = 999999
|
||||||
# print(codeMode, code)
|
# print(codeMode, code)
|
||||||
|
pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
|
||||||
|
try:
|
||||||
|
replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), code)
|
||||||
|
except:
|
||||||
|
replaced_text = code
|
||||||
|
code = replaced_text
|
||||||
if int(codeMode) == 0:
|
if int(codeMode) == 0:
|
||||||
self.recordLog("Execute JavaScript:" + code)
|
self.recordLog("Execute JavaScript:" + code)
|
||||||
self.recordLog("执行JavaScript:" + code)
|
self.recordLog("执行JavaScript:" + code)
|
||||||
@ -391,9 +410,9 @@ class BrowserThread(Thread):
|
|||||||
self.judgeExecute(node, loopValue, loopPath, index)
|
self.judgeExecute(node, loopValue, loopPath, index)
|
||||||
|
|
||||||
# 执行完之后进行等待
|
# 执行完之后进行等待
|
||||||
if node["option"] != 0 and node["option"] != 2:
|
if node["option"] != 0 and node["option"] != 2: # 点击元素操作单独定义等待时间操作
|
||||||
waitTime = 0.01 # 默认等待0.01秒
|
waitTime = 0.01 # 默认等待0.01秒
|
||||||
if node["parameters"]["wait"] >= 1:
|
if node["parameters"]["wait"] >= 0:
|
||||||
waitTime = node["parameters"]["wait"]
|
waitTime = node["parameters"]["wait"]
|
||||||
time.sleep(waitTime)
|
time.sleep(waitTime)
|
||||||
self.Log("Wait seconds after node executing: ", waitTime)
|
self.Log("Wait seconds after node executing: ", waitTime)
|
||||||
@ -536,10 +555,10 @@ class BrowserThread(Thread):
|
|||||||
self.history["index"] # 计算历史记录变化差值
|
self.history["index"] # 计算历史记录变化差值
|
||||||
self.browser.execute_script(
|
self.browser.execute_script(
|
||||||
'history.go(' + str(difference) + ')') # 回退历史记录
|
'history.go(' + str(difference) + ')') # 回退历史记录
|
||||||
if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
|
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
|
||||||
time.sleep(node["parameters"]["historyWait"])
|
time.sleep(node["parameters"]["historyWait"])
|
||||||
else:
|
# else:
|
||||||
time.sleep(2)
|
# time.sleep(2)
|
||||||
# 切换历史记录等待2秒或者:
|
# 切换历史记录等待2秒或者:
|
||||||
self.Log("Change history back time or:",
|
self.Log("Change history back time or:",
|
||||||
node["parameters"]["historyWait"])
|
node["parameters"]["historyWait"])
|
||||||
@ -573,10 +592,10 @@ class BrowserThread(Thread):
|
|||||||
self.history["index"] # 计算历史记录变化差值
|
self.history["index"] # 计算历史记录变化差值
|
||||||
self.browser.execute_script(
|
self.browser.execute_script(
|
||||||
'history.go(' + str(difference) + ')') # 回退历史记录
|
'history.go(' + str(difference) + ')') # 回退历史记录
|
||||||
if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
|
# if node["parameters"]["historyWait"] > 2: # 回退后要等待的时间
|
||||||
time.sleep(node["parameters"]["historyWait"])
|
time.sleep(node["parameters"]["historyWait"])
|
||||||
else:
|
# else:
|
||||||
time.sleep(2)
|
# time.sleep(2)
|
||||||
self.Log("Change history back time or:",
|
self.Log("Change history back time or:",
|
||||||
node["parameters"]["historyWait"])
|
node["parameters"]["historyWait"])
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
@ -638,7 +657,7 @@ class BrowserThread(Thread):
|
|||||||
|
|
||||||
# 打开网页事件
|
# 打开网页事件
|
||||||
def openPage(self, para, loopValue):
|
def openPage(self, para, loopValue):
|
||||||
time.sleep(2) # 打开网页后强行等待至少2秒
|
time.sleep(1) # 打开网页后强行等待至少1秒
|
||||||
if len(self.browser.window_handles) > 1:
|
if len(self.browser.window_handles) > 1:
|
||||||
self.browser.switch_to.window(self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
|
self.browser.switch_to.window(self.browser.window_handles[-1]) # 打开网页操作从第1个页面开始
|
||||||
self.browser.close()
|
self.browser.close()
|
||||||
@ -700,7 +719,7 @@ class BrowserThread(Thread):
|
|||||||
# 键盘输入事件
|
# 键盘输入事件
|
||||||
def inputInfo(self, para, loopValue):
|
def inputInfo(self, para, loopValue):
|
||||||
time.sleep(0.1) # 输入之前等待0.1秒
|
time.sleep(0.1) # 输入之前等待0.1秒
|
||||||
self.Log("Wait 1 second before input")
|
self.Log("Wait 0.1 second before input")
|
||||||
try:
|
try:
|
||||||
textbox = self.browser.find_element(By.XPATH, para["xpath"])
|
textbox = self.browser.find_element(By.XPATH, para["xpath"])
|
||||||
# textbox.send_keys(Keys.CONTROL, 'a')
|
# textbox.send_keys(Keys.CONTROL, 'a')
|
||||||
@ -720,10 +739,12 @@ class BrowserThread(Thread):
|
|||||||
pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
|
pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值
|
||||||
try:
|
try:
|
||||||
replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
|
replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
|
||||||
|
replaced_text = re.sub('<enter>', '', replaced_text, flags=re.IGNORECASE)
|
||||||
except:
|
except:
|
||||||
replaced_text = value
|
replaced_text = value
|
||||||
value = replaced_text
|
textbox.send_keys(replaced_text)
|
||||||
textbox.send_keys(value)
|
if value.lower().find("<enter>") >= 0:
|
||||||
|
textbox.send_keys(Keys.ENTER)
|
||||||
self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], textbox) # 执行后置js
|
self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], textbox) # 执行后置js
|
||||||
# global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
|
# global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
|
||||||
self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
|
self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text
|
||||||
@ -928,10 +949,59 @@ class BrowserThread(Thread):
|
|||||||
|
|
||||||
# 提取数据事件
|
# 提取数据事件
|
||||||
def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
|
def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0):
|
||||||
for p in para["paras"]: # 加并行处理
|
pageHTML = etree.HTML(self.browser.page_source)
|
||||||
|
loopElementOuterHTML = loopElement.get_attribute('outerHTML')
|
||||||
|
loopElementHTML = etree.HTML(loopElementOuterHTML)
|
||||||
|
for p in para["paras"]:
|
||||||
|
if p["optimizable"]:
|
||||||
|
try:
|
||||||
|
p["relativeXPath"] = p["relativeXPath"].lower()
|
||||||
|
if p["nodeType"] == 2:
|
||||||
|
xpath = p["relativeXPath"] + "/@href"
|
||||||
|
elif p["contentType"] == 1:
|
||||||
|
xpath = p["relativeXPath"] + "/text()"
|
||||||
|
elif p["contentType"] == 0:
|
||||||
|
xpath = p["relativeXPath"] + "//text()"
|
||||||
|
if p["relative"]:
|
||||||
|
# if p["relativeXPath"] == "":
|
||||||
|
# content = [loopElementHTML]
|
||||||
|
# else:
|
||||||
|
if p["relativeXPath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
|
||||||
|
full_path = "(" + parentPath + \
|
||||||
|
xpath + ")" + \
|
||||||
|
"[" + str(index + 1) + "]"
|
||||||
|
content = pageHTML.xpath(full_path)
|
||||||
|
else:
|
||||||
|
content = loopElementHTML.xpath("/html/body/" + loopElementHTML[0][0].tag + xpath)
|
||||||
|
else:
|
||||||
|
if xpath.find("/html/body") < 0:
|
||||||
|
xpath = "/html/body" + xpath
|
||||||
|
content = pageHTML.xpath(xpath)
|
||||||
|
if len(content) > 0:
|
||||||
|
# html = etree.tostring(content[0], encoding='utf-8').decode('utf-8')
|
||||||
|
# 拼接所有文本内容并去掉两边的空白
|
||||||
|
content = ' '.join(result.strip() for result in content if result.strip())
|
||||||
|
else:
|
||||||
|
content = p["default"]
|
||||||
|
try:
|
||||||
|
if not self.dataNotFoundKeys[p["name"]]:
|
||||||
|
print('Element %s not found with parameter name %s when extracting data, use default, this error will only show once' % (p["relativeXPath"], p["name"]))
|
||||||
|
print("提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (p["name"], p["relativeXPath"]))
|
||||||
|
self.dataNotFoundKeys[p["name"]] = True
|
||||||
|
self.recordLog('Element %s not found, use default' % p["relativeXPath"])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
self.outputParameters[p["name"]] = content
|
||||||
|
|
||||||
|
# 对于不能优化的操作,使用selenium执行
|
||||||
|
for p in para["paras"]:
|
||||||
|
if not p["optimizable"]:
|
||||||
content = ""
|
content = ""
|
||||||
if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL,去找元素
|
if not (p["contentType"] == 5 or p["contentType"] == 6): # 如果不是页面标题或URL,去找元素
|
||||||
try:
|
try:
|
||||||
|
p["relativeXPath"] = p["relativeXPath"].lower()
|
||||||
if p["relative"]: # 是否相对xpath
|
if p["relative"]: # 是否相对xpath
|
||||||
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
|
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
|
||||||
element = loopElement
|
element = loopElement
|
||||||
|
35
ExecuteStage/test.py
Normal file
35
ExecuteStage/test.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# Scratch script: compare an element's direct text ("/text()") with the text
# of its whole subtree ("//text()") when queried through lxml XPath.

# Sample fragment: the outer <div> has three direct text nodes (123, 456, 789)
# interleaved with child elements that carry text of their own.
html_source = """
<div>
123
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
</ul>
456
<div></div>
789
</div>
"""


def _joined_text(tree, xpath):
    """Return the text nodes matched by *xpath*, stripped and space-joined.

    Whitespace-only nodes (e.g. the newlines between tags) are dropped so
    they do not produce empty segments in the result.
    """
    stripped = (node.strip() for node in tree.xpath(xpath))
    return ' '.join(piece for piece in stripped if piece)


# etree.HTML wraps the fragment in <html><body>...</body></html>, so html[0]
# is <body> and html[0][0] is the outer <div> of the sample.
html = etree.HTML(html_source)
root_path = "/html/body/" + html[0][0].tag

# "/text()" selects only the element's own text nodes; "//text()" selects the
# text nodes of the element and all of its descendants.
direct_text = root_path + "/text()"
all_text = root_path + "//text()"

# Direct text only -> "123 456 789"
results = html.xpath(direct_text)
text = _joined_text(html, direct_text)
print(text)

# Text of the whole subtree -> "123 first item second item 456 789"
results = html.xpath(all_text)
text = _joined_text(html, all_text)
print(text)
|
Loading…
x
Reference in New Issue
Block a user