mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-20 10:05:00 +08:00
Speed UP!!!
This commit is contained in:
parent
30feb2ad8a
commit
5b1d653626
@ -505,14 +505,14 @@ Please note that this feature does not support assigning values to variables. In
|
||||
<!-- 循环选项 -->
|
||||
<label>Loop Type:</label>
|
||||
<select v-model='loopType' class="form-control">
|
||||
<option value = 0>Single Element</option>
|
||||
<option value = 1>Unfixed Element List</option>
|
||||
<option value = 2>Fixed Element List</option>
|
||||
<option value = 3>Text List</option>
|
||||
<option value = 4>Weblink List</option>
|
||||
<option value = 5>Return value of JavaScript command (start with 'return ')</option>
|
||||
<option value = 6>Return value of system command</option>
|
||||
<option value = 7>Return value of Python code under current environment</option>
|
||||
<option :value = 0>Single Element</option>
|
||||
<option :value = 1>Unfixed Element List</option>
|
||||
<option :value = 2>Fixed Element List</option>
|
||||
<option :value = 3>Text List</option>
|
||||
<option :value = 4>Weblink List</option>
|
||||
<option :value = 5>Return value of JavaScript command (start with 'return ')</option>
|
||||
<option :value = 6>Return value of system command</option>
|
||||
<option :value = 7>Return value of Python code under current environment</option>
|
||||
</select>
|
||||
<div v-if='parseInt(loopType) < 2'>
|
||||
<label>XPath: <span style="font-size: 30px!important;" title="Relative XPATH writing: start with /, e.g. the loop item XPATH is /html/body/div[1], your input is /*[@id='tab-customer'], then the final addressed xpath is: /html/body/div[1]/*[@id='tab-customer']">☺</span></label>
|
||||
@ -545,7 +545,7 @@ If the expression returns a value greater than 0 or evaluates to True, the loop
|
||||
<div v-if='parseInt(loopType) == 0'>
|
||||
<label>Maximum number of loop iterations (0 represents an infinite loop until no more elements are found or no changes in page content are detected):</label>
|
||||
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["exitCount"]'></input>
|
||||
<label>Exit the loop when the content of the following page element does not change:</label>
|
||||
<label>Exit the loop when the content of the following page element does not change (only works when loop iterations are set to 0):</label>
|
||||
<input onkeydown="inputDelete(event)" required class="form-control" type="text" v-model='nowNode["parameters"]["exitElement"]'></input>
|
||||
</div>
|
||||
|
||||
|
@ -147,6 +147,7 @@ let app = new Vue({
|
||||
"relative": false,
|
||||
"name": LANG("自定义参数_" + parameterNum.toString(), "Custom_Field_" + parameterNum.toString()),
|
||||
"desc": "",
|
||||
"iframe": false,
|
||||
"extractType": 0,
|
||||
"relativeXPath": "//body",
|
||||
"recordASField": 1,
|
||||
|
@ -505,14 +505,14 @@ print(emotlib.emoji()) # 使用其中的函数。
|
||||
<!-- 循环选项 -->
|
||||
<label>循环类型:</label>
|
||||
<select v-model='loopType' class="form-control">
|
||||
<option value = 0>单个元素(多用于循环点击下一页)</option>
|
||||
<option value = 1>不固定元素列表</option>
|
||||
<option value = 2>固定元素列表</option>
|
||||
<option value = 3>文本列表(多用于循环在文本框输入文本)</option>
|
||||
<option value = 4>网址列表(多用于循环打开网页)</option>
|
||||
<option value = 5>JavaScript命令返回值(需以return 开头)</option>
|
||||
<option value = 6>系统命令返回值</option>
|
||||
<option value = 7>执行环境下的Python表达式值(eval操作)</option>
|
||||
<option :value = 0>单个元素(多用于循环点击下一页)</option>
|
||||
<option :value = 1>不固定元素列表</option>
|
||||
<option :value = 2>固定元素列表</option>
|
||||
<option :value = 3>文本列表(多用于循环在文本框输入文本)</option>
|
||||
<option :value = 4>网址列表(多用于循环打开网页)</option>
|
||||
<option :value = 5>JavaScript命令返回值(需以return 开头)</option>
|
||||
<option :value = 6>系统命令返回值</option>
|
||||
<option :value = 7>执行环境下的Python表达式值(eval操作)</option>
|
||||
</select>
|
||||
<div v-if='parseInt(loopType) < 2'>
|
||||
<label>XPath: <span style="font-size: 30px!important;" title="相对XPATH写法:以/开头,如循环项XPATH为/html/body/div[1],您的输入为/*[@id='tab-customer'],则最终寻址的xpath为:/html/body/div[1]/*[@id='tab-customer']">☺</span></label>
|
||||
@ -543,9 +543,9 @@ print(emotlib.emoji()) # 使用其中的函数。
|
||||
</div>
|
||||
<!-- 这里添加退出循环条件,找不到元素肯定退出循环 -->
|
||||
<div v-if='parseInt(loopType) == 0'>
|
||||
<label>最多执行循环次数(0代表无限循环直到找不到元素或检测不到页面内容变化为止):</label>
|
||||
<label>最多执行循环次数(0代表无限循环直到找不到元素或检测不到页面元素内容变化为止):</label>
|
||||
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["exitCount"]'></input>
|
||||
<label>检测页面以下元素内容不变化时退出循环:</label>
|
||||
<label>检测页面以下元素内容不变化时退出循环(次数为0时生效):</label>
|
||||
<input onkeydown="inputDelete(event)" required class="form-control" type="text" v-model='nowNode["parameters"]["exitElement"]'></input>
|
||||
</div>
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/235.json
Normal file
1
ElectronJS/tasks/235.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/236.json
Normal file
1
ElectronJS/tasks/236.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/237.json
Normal file
1
ElectronJS/tasks/237.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/238.json
Normal file
1
ElectronJS/tasks/238.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":238,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"12/9/2023, 3:45:18 AM","update_time":"12/9/2023, 3:45:18 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"wai
tElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"paras":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"pekgp2r81lpx1dvwo","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"pekgp2r81lpx1dvwo","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
1
ElectronJS/tasks/239.json
Normal file
1
ElectronJS/tasks/239.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":239,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"12/9/2023, 3:46:06 AM","update_time":"12/9/2023, 3:46:06 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":
1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"si40asxrsglpx1ev64","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
1
ElectronJS/tasks/240.json
Normal file
1
ElectronJS/tasks/240.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/241.json
Normal file
1
ElectronJS/tasks/241.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/242.json
Normal file
1
ElectronJS/tasks/242.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":242,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"12/9/2023, 4:32:43 AM","update_time":"12/9/2023, 4:32:43 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"家用电器"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://search.jd.com/Search?keyword=%E5%B0%8F%E5%AE%B6%E7%94%B5&enc=utf-8&wq=%E5%B0%8F%E5%AE%B6%E7%94%B5&pvid=261a350161304c979fa0e7ce95c05671"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitT
ime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":2,"pathList":"//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[2]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[3]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[4]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[5]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[6]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[7]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[8]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[9]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[10]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[11]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[12]/a[1]\n//*[contains(@class, \"LeftSide_menu_list__qXCeM\")]/div[13]/a[1]","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"paras":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"家用电器"}],"unique_index":"99rt6679v9blpx32v39","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exam
pleValues":[{"num":0,"value":"https://search.jd.com/Search?keyword=%E5%B0%8F%E5%AE%B6%E7%94%B5&enc=utf-8&wq=%E5%B0%8F%E5%AE%B6%E7%94%B5&pvid=261a350161304c979fa0e7ce95c05671"}],"unique_index":"99rt6679v9blpx32v39","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":2}}]}
|
1
ElectronJS/tasks/243.json
Normal file
1
ElectronJS/tasks/243.json
Normal file
File diff suppressed because one or more lines are too long
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
||||
"justMyCode": false,
|
||||
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||
"args": ["--ids", "[37]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
|
||||
"args": ["--ids", "[89]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
|
||||
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
||||
}
|
||||
]
|
||||
|
@ -6,7 +6,7 @@ import platform
|
||||
import shutil
|
||||
import string
|
||||
import undetected_chromedriver as uc
|
||||
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
on_press_creator, on_release_creator, readCode, replace_field_values, write_to_csv, write_to_excel, write_to_json
|
||||
from myChrome import MyChrome
|
||||
from threading import Thread, Event
|
||||
@ -46,7 +46,7 @@ import time
|
||||
import requests
|
||||
from ddddocr import DdddOcr
|
||||
from urllib.parse import urljoin
|
||||
from lxml import etree
|
||||
from lxml import etree, html
|
||||
import onnxruntime
|
||||
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
@ -280,8 +280,7 @@ class BrowserThread(Thread):
|
||||
except:
|
||||
para["iframe"] = False
|
||||
try:
|
||||
para["relativeXPath"] = lowercase_tags_in_xpath(
|
||||
para["relativeXPath"])
|
||||
para["relativeXPath"] = lowercase_tags_in_xpath(para["relativeXPath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
@ -293,11 +292,7 @@ class BrowserThread(Thread):
|
||||
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log(
|
||||
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
|
||||
if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para[
|
||||
"nodeType"] <= 2:
|
||||
para["optimizable"] = True
|
||||
else:
|
||||
para["optimizable"] = False
|
||||
para["optimizable"] = detect_optimizable(para)
|
||||
elif node["option"] == 4: # 输入文字
|
||||
try:
|
||||
index = node["parameters"]["index"] # 索引值
|
||||
@ -326,6 +321,56 @@ class BrowserThread(Thread):
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
except:
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
|
||||
# 如果循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3:
|
||||
paras = self.procedure[node["sequence"][0]]["parameters"]["paras"]
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
|
||||
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
|
||||
for para in paras:
|
||||
optimizable = detect_optimizable(para, ignoreWaitElement=False, waitElement=waitElement, includePicture=True)
|
||||
if para["iframe"]: # 如果是iframe,那么不可以快速提取
|
||||
optimizable = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
break
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
|
||||
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
|
||||
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
|
||||
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
|
||||
node["parameters"]["quickParas"] = []
|
||||
for para in paras:
|
||||
content_type = ""
|
||||
if para["relativeXPath"].find("/@href") >= 0 or para["relativeXPath"].find("/text()") >= 0 or para["relativeXPath"].find(
|
||||
"::text()") >= 0:
|
||||
content_type = ""
|
||||
elif para["nodeType"] == 2:
|
||||
content_type = "//@href"
|
||||
elif para["nodeType"] == 4: # 图片链接
|
||||
content_type = "//@src"
|
||||
elif para["contentType"] == 1:
|
||||
content_type = "/text()"
|
||||
elif para["contentType"] == 0:
|
||||
content_type = "//text()"
|
||||
if para["relative"]: # 如果是相对XPath
|
||||
xpath = "." + para["relativeXPath"] + content_type
|
||||
else:
|
||||
xpath = para["relativeXPath"] + content_type
|
||||
# 如果是id()或(//div)[1]这种形式,不需要包/html/body
|
||||
# if xpath.find("/body") < 0 and xpath.startswith("/"):
|
||||
# xpath = "/html/body" + xpath
|
||||
node["parameters"]["quickParas"].append({
|
||||
"name": para["name"],
|
||||
"relative": para["relative"],
|
||||
"xpath": xpath,
|
||||
"nodeType": para["nodeType"],
|
||||
"default": para["default"],
|
||||
})
|
||||
self.print_and_log("预处理完成|Preprocess completed")
|
||||
|
||||
def readFromExcel(self):
|
||||
@ -984,15 +1029,50 @@ class BrowserThread(Thread):
|
||||
self.history["index"] = thisHistoryLength
|
||||
self.history["handle"] = thisHandle
|
||||
thisHitoryURL = self.browser.current_url
|
||||
if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
|
||||
# 快速提取处理
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.browser.switch_to.default_content() # 切换到主页面
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
baseXPath = replace_field_values(node["parameters"]["baseXPath"], self.outputParameters, self)
|
||||
rows = tree.xpath(baseXPath)
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
rows = []
|
||||
for path in node["parameters"]["baseXPath"].split("\n"):
|
||||
baseXPath = replace_field_values(path, self.outputParameters, self)
|
||||
rows.extend(tree.xpath(baseXPath))
|
||||
|
||||
for row in rows:
|
||||
if node["parameters"]["clear"] == 1:
|
||||
self.clearOutputParameters()
|
||||
for para in node["parameters"]["quickParas"]:
|
||||
xpath = replace_field_values(para["xpath"], self.outputParameters, self)
|
||||
content = row.xpath(xpath)
|
||||
try:
|
||||
content = ' '.join(result.strip()
|
||||
for result in content if result.strip())
|
||||
# 链接或者图片的情况下,合并链接相对路径为绝对路径
|
||||
if para["nodeType"] == 2 or para["nodeType"] == 4:
|
||||
base_url = self.browser.current_url
|
||||
# 合并链接相对路径为绝对路径
|
||||
content = urljoin(base_url, content)
|
||||
if len(content) == 0:
|
||||
content = para["default"]
|
||||
except:
|
||||
content = para["default"]
|
||||
self.outputParameters[para["name"]] = content
|
||||
if node["parameters"]["newLine"]:
|
||||
line = new_line(self.outputParameters,
|
||||
self.maxViewLength, self.outputParametersRecord)
|
||||
self.OUTPUT.append(line)
|
||||
self.saveData()
|
||||
elif int(node["parameters"]["loopType"]) == 0: # 单个元素循环
|
||||
# 无跳转标签页操作
|
||||
count = 0 # 执行次数
|
||||
bodyText = "-"
|
||||
while True: # do while循环
|
||||
try:
|
||||
finished = False
|
||||
# newBodyText = self.browser.page_source
|
||||
# newBodyText = self.browser.find_element(By.XPATH, "//body").text
|
||||
if node["parameters"]["exitCount"] == 0:
|
||||
newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
|
||||
if node["parameters"]["iframe"]: # 如果标记了iframe
|
||||
|
@ -94,6 +94,25 @@ def on_release_creator(event, press_time):
|
||||
# event.clear()
|
||||
# time.sleep(1) # 每秒检查一次
|
||||
|
||||
def detect_optimizable(para, ignoreWaitElement=True, waitElement="", includePicture=False):
    """Report whether an extraction parameter qualifies for the optimized path.

    A parameter qualifies only when it has no ``beforeJS``/``afterJS`` hook and
    its ``contentType`` is at most 1. On top of that:

    * ``nodeType`` <= 2 qualifies when wait elements are ignored or no wait
      element is configured;
    * ``nodeType`` == 4 (image) qualifies only when ``includePicture`` is set
      and the parameter does not request a picture download.

    Every other combination yields ``False``. The function always returns a
    real ``bool``; no path falls through to an implicit ``None``.

    Args:
        para: Mapping with the keys ``beforeJS``, ``afterJS``, ``contentType``
            and ``nodeType``; ``downloadPic`` is read only for image nodes
            when ``includePicture`` is true.
        ignoreWaitElement: When true, ``waitElement`` is not taken into account.
        waitElement: Wait-element expression of the owning operation; an empty
            string means none is configured.
        includePicture: Whether image nodes (``nodeType`` 4) may qualify.

    Returns:
        bool: ``True`` if the parameter qualifies, ``False`` otherwise.

    Raises:
        KeyError: If a key that has to be read is missing from ``para``.
    """
    # A custom JS hook or a content type above 1 rules the optimized path out.
    if para["beforeJS"] != "" or para["afterJS"] != "" or para["contentType"] > 1:
        return False

    node_type = para["nodeType"]
    if node_type <= 2:
        # A configured wait element only matters when the caller asks for it.
        return bool(ignoreWaitElement or waitElement == "")
    if node_type == 4:  # image
        # Images qualify only on request, and never when they must be downloaded.
        return bool(includePicture and not para["downloadPic"])

    # Any other node type is not optimizable; say so explicitly rather than
    # falling off the end of the function.
    return False
|
||||
|
||||
|
||||
|
||||
def download_image(browser, url, save_directory):
|
||||
# 定义浏览器头信息
|
||||
@ -195,6 +214,7 @@ def replace_field_values(orginal_text, outputParameters, browser=None):
|
||||
replaced_text = replaced_text.replace(match.group(0), eval_replaced_text)
|
||||
except Exception as e:
|
||||
print("eval替换失败,请检查eval语句是否正确。| Failed to replace eval, please check if the eval statement is correct.")
|
||||
print(e)
|
||||
replaced_text = orginal_text
|
||||
return replaced_text
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user