新增下载图片功能

This commit is contained in:
naibo 2023-05-20 20:44:34 +08:00
parent 9774592910
commit 42db55deb8
23 changed files with 579 additions and 153 deletions

View File

@ -10,3 +10,7 @@ user_data/
Data/
Chrome/
execution_instances/*
EasySpider_en.crx
EasySpider_zh.crx
.DS_Store
npminstall-debug.log

Binary file not shown.

Binary file not shown.

View File

@ -235,6 +235,13 @@
<option :value = 3>表单值</option>
<option :value = 4>图片地址</option>
</select>
<div v-if='paras.parameters[paraIndex]["nodeType"] == 4'>
<label>提取图片地址后是否同时下载图片</label>
<select v-model='paras.parameters[paraIndex]["downloadPic"]' class="form-control">
<option :value = 0></option>
<option :value = 1></option>
</select>
</div>
<!-- <label>提取方式</label>-->
<!-- <select v-model='paras.parameters[paraIndex]["extractType"]' class="form-control">-->
<!-- <option :value = 0>普通提取</option>-->
@ -390,8 +397,8 @@
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
</div>
<div v-else-if='TClass == 7'>
<label>代码/脚本内容: </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令该循环项用arguments[0]表示返回值大于0或为真则执行此分支内操作否则不执行。如return arguments[0].innerText.indexOf('123') >=0 即判断当前循环项的文本是否包含123,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
<label>代码/脚本内容<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例): </label>
<textarea onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令该循环项用arguments[0]表示返回值大于0或为真则执行此分支内操作否则不执行。如return arguments[0].innerText.length >=5 即判断当前循环项的文本长度是否大于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
<label>最长等待脚本执行时间0代表无限等待: </label>
<input onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
</div>

View File

@ -44,6 +44,7 @@ function changeGetDataParameters(msg, i) {
msg["parameters"][i]["JSWaitTime"] = 0; //JS等待时间
msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
}
function handleAddElement(msg) {

335
ElectronJS/tasks/57.json Normal file
View File

@ -0,0 +1,335 @@
{
"id": 57,
"name": "图片下载",
"url": "https://www.jd.com",
"links": "https://www.jd.com",
"create_time": "5/20/2023, 8:18:15 PM",
"containJudge": false,
"desc": "https://www.jd.com",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 1,
"nodeName": "打开网页",
"value": "https://www.jd.com",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://www.jd.com"
}
],
"outputParameters": [
{
"id": 0,
"name": "参数3_图片地址",
"desc": "",
"type": "string",
"exampleValue": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [
1,
4
],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": 1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "打开网页",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 0,
"beforeJS": "",
"beforeJSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"url": "https://www.jd.com",
"links": "https://www.jd.com",
"maxWaitTime": 10,
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": -1,
"index": 2,
"parentId": 0,
"type": 1,
"option": 8,
"title": "循环",
"sequence": [
3
],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 4,
"tabIndex": -1,
"useLoop": false,
"xpath": "/html/body/div[4]/div[1]/div[4]/a",
"wait": 0,
"beforeJS": "",
"beforeJSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"code": "",
"waitTime": 0,
"exitCount": 0,
"historyWait": 2,
"allXPaths": [
"/html/body/div[4]/div[1]/div[4]/a[1]",
"//a[contains(., '平板電腦')]"
]
}
},
{
"id": -1,
"index": 3,
"parentId": 2,
"type": 0,
"option": 3,
"title": "提取数据",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 4,
"tabIndex": -1,
"useLoop": false,
"xpath": "",
"wait": 0,
"beforeJS": "",
"beforeJSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"paras": [
{
"nodeType": 1,
"contentType": 0,
"relative": true,
"name": "参数1_链接文本",
"desc": "",
"extractType": 0,
"relativeXPath": "",
"allXPaths": "",
"exampleValues": [
{
"num": 0,
"value": "平板電腦"
},
{
"num": 1,
"value": "爆款耳機"
},
{
"num": 2,
"value": "手機"
},
{
"num": 3,
"value": "數據線"
},
{
"num": 4,
"value": "年貨節"
}
],
"default": "",
"beforeJS": "",
"beforeJSWaitTime": 0,
"JS": "",
"JSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"downloadPic": 0
},
{
"nodeType": 2,
"contentType": 0,
"relative": true,
"name": "参数2_链接地址",
"desc": "",
"relativeXPath": "",
"allXPaths": "",
"exampleValues": [
{
"num": 0,
"value": "https://search.jd.com/Search?keyword=%E5%B9%B3%E6%9D%BF%E7%94%B5%E8%84%91&enc=utf-8&wq=%E5%B9%B3%E6%9D%BF%E7%94%B5%E8%84%91&pvid=84c62205dccd43dfad1b6eb5fdf5077b"
},
{
"num": 1,
"value": "https://audio.jd.com/"
},
{
"num": 2,
"value": "https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&ev=exbrand_%E5%B0%8F%E7%B1%B3%EF%BC%88MI%EF%BC%89%5E&uc=0#J_searchWrap"
},
{
"num": 3,
"value": "https://mall.jd.com/index-1000007418.html"
},
{
"num": 4,
"value": "https://pro.jd.com/mall/active/22WyJjMqTCbvjj1YB3pSJssBonLR/index.html"
}
],
"default": "",
"beforeJS": "",
"beforeJSWaitTime": 0,
"JS": "",
"JSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"downloadPic": 0
}
],
"loopType": 1
}
},
{
"id": 2,
"index": 4,
"parentId": 0,
"type": 1,
"option": 8,
"title": "循环",
"sequence": [
5
],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 4,
"tabIndex": -1,
"useLoop": false,
"xpath": "/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div[1]/a[1]/img[1]",
"wait": 0,
"beforeJS": "",
"beforeJSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"code": "",
"waitTime": 0,
"exitCount": 0,
"historyWait": 2,
"allXPaths": [
"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/a[1]/img[1]",
"//img[contains(., '')]"
]
}
},
{
"id": 3,
"index": 5,
"parentId": 2,
"type": 0,
"option": 3,
"title": "提取数据",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 4,
"tabIndex": -1,
"useLoop": false,
"xpath": "",
"wait": 0,
"beforeJS": "",
"beforeJSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"paras": [
{
"nodeType": 4,
"contentType": 0,
"relative": true,
"name": "参数3_图片地址",
"desc": "",
"extractType": 0,
"relativeXPath": "",
"allXPaths": "",
"exampleValues": [
{
"num": 0,
"value": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
},
{
"num": 1,
"value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
},
{
"num": 2,
"value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/222655/28/27238/153145/644b858eF2cd1200f/e37bd7da42a814b0.jpg!q70.dpg"
},
{
"num": 3,
"value": "//m.360buyimg.com/babel/s710x370_jfs/t1/197659/30/31344/62825/640fd751F694963ed/a6e1ac2e5c27f160.jpg!q70.dpg"
},
{
"num": 4,
"value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
},
{
"num": 5,
"value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/222655/28/27238/153145/644b858eF2cd1200f/e37bd7da42a814b0.jpg!q70.dpg"
},
{
"num": 6,
"value": "//m.360buyimg.com/babel/s710x370_jfs/t1/197659/30/31344/62825/640fd751F694963ed/a6e1ac2e5c27f160.jpg!q70.dpg"
},
{
"num": 7,
"value": "//m.360buyimg.com/babel/jfs/t1/223646/1/18719/254758/6458a465F7a57af84/f44d7d983018d9ed.png"
},
{
"num": 8,
"value": "//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"
}
],
"default": "",
"beforeJS": "",
"beforeJSWaitTime": 0,
"JS": "",
"JSWaitTime": 0,
"afterJS": "",
"afterJSWaitTime": 0,
"downloadPic": 1
}
]
}
}
]
}

1
ElectronJS/tasks/58.json Normal file
View File

@ -0,0 +1 @@
{"id":58,"name":"新web采集任务","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/20/2023, 8:35:56 PM","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_图片地址","desc":"","type":"string","exampleValue":"//m.360buyimg.com/babel/jfs/t1/81488/28/23346/102165/63b41485F7ecc4f22/be5cee8cf04d7e16.png"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":0}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[3]/div[1]/div[1]/div[2]/ul[1]/li/a[1]/img[1]","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":0,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"allXPaths":["/html/body/div[5]/div[1]/div[3]/div[1]/div[1]/div[2]/ul[1]/li[1]/a[1]/img[1]","//img[contains(., 
'')]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":4,"contentType":0,"relative":true,"name":"参数1_图片地址","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/jfs/t1/81488/28/23346/102165/63b41485F7ecc4f22/be5cee8cf04d7e16.png"},{"num":1,"value":"//m.360buyimg.com/babel/jfs/t1/93200/23/34752/53589/63b4148cF5150739b/5d0dc855fe43ca85.png"},{"num":2,"value":"//m.360buyimg.com/babel/jfs/t1/54690/10/22629/29568/63b41496Fad92ac75/605f4fe1c473192c.png"},{"num":3,"value":"//m.360buyimg.com/babel/jfs/t1/53202/29/23281/117684/63b4149dF2beb8956/ac86841f42a75cd0.png"},{"num":4,"value":"//m.360buyimg.com/babel/jfs/t1/90039/40/25105/65438/63b414a5Fb09a6926/740fb4daeef82c57.png"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":1}],"loopType":1}}]}

1
ElectronJS/tasks/59.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -11,4 +11,5 @@ Data/
tasks/
Application/
.history
execution_instances/
execution_instances/
.DS_Store

View File

@ -12,7 +12,7 @@
"console": "integratedTerminal",
"justMyCode": true,
// "args": ["--id", "38", "--read_type", "local", "--headless", "1"]
"args": ["--id", "10", "--headless", "0"]
"args": ["--id", "15", "--headless", "0"]
}
]
}

View File

@ -30,7 +30,7 @@ from selenium.webdriver.common.by import By
from commandline_config import Config
import pytesseract
from PIL import Image
import uuid
saveName, log, OUTPUT, browser, SAVED = None, "", "", None, False
@ -65,6 +65,38 @@ def Log(text, text2=""):
# 屏幕滚动函数
def download_image(url, save_directory, timeout=30):
    """Download the image at ``url`` and store it inside ``save_directory``.

    The saved file is named ``<uuid4>_<basename of the URL path>`` so that
    repeated downloads of identically named images never overwrite each other.

    Args:
        url: Address of the image. Protocol-relative addresses (``//host/x.png``)
            are fetched over HTTPS. An empty value is reported as a failed
            download instead of raising.
        save_directory: Directory the image is written to; created if missing.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot block the whole task.

    Returns:
        None. Success or failure is reported on stdout (Chinese and English).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    def report_failure():
        print("下载图片失败,请检查此图片链接是否有效:", url)
        print("Failed to download image, please check if this image link is valid:", url)

    # Callers pass "" when the element has no src attribute.
    if not url:
        report_failure()
        return

    request_url = url
    if request_url.startswith("//"):
        # requests rejects scheme-less URLs, so give protocol-relative ones a scheme.
        request_url = "https:" + request_url

    # Browser-like headers: some image hosts refuse the default client UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(request_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers invalid/unsupported URLs (e.g. data: URIs), timeouts and
        # connection errors; a single bad image must not abort the crawl.
        report_failure()
        return

    if response.status_code != requests.codes.ok:
        report_failure()
        return

    # Use only the path component so query strings do not end up in the file
    # name, and replace characters that are not allowed in Windows file names.
    file_name = os.path.basename(urlsplit(request_url).path) or "image"
    file_name = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in file_name)
    # Prefix with a UUID to make the file name unique.
    new_file_name = str(uuid.uuid4()) + '_' + file_name

    os.makedirs(save_directory, exist_ok=True)
    save_path = os.path.join(save_directory, new_file_name)
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print("图片已成功下载到:", save_path)
    print("The image has been successfully downloaded to:", save_path)
def scrollDown(para, rt=""):
try:
if para["scrollType"] != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
@ -180,6 +212,7 @@ def executeNode(nodeId, loopValue="", clickPath="", index=0):
inputInfo(node["parameters"], loopValue)
elif node["option"] == 5: # 自定义操作
customOperation(node, loopValue)
saveData()
elif node["option"] == 8: # 循环
recordLog("loop")
loopExcute(node, loopValue, clickPath, index) # 执行循环
@ -644,8 +677,8 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
recordLog('Element %s not found, use default' % p["relativeXPath"])
continue
except TimeoutException: # 超时的时候设置超时值
Log('time out after 10 seconds when getting data')
recordLog('time out after 10 seconds when getting data')
Log('time out after set seconds when getting data')
recordLog('time out after set seconds when getting data')
browser.execute_script('window.stop()')
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
@ -660,104 +693,44 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
element = browser.find_element(By.XPATH, "//body")
try:
execute_code(2, p["beforeJS"], p["beforeJSWaitTime"], element) # 执行前置js
if p["contentType"] == 2:
content = element.get_attribute('innerHTML')
elif p["contentType"] == 3:
content = element.get_attribute('outerHTML')
elif p["contentType"] == 4:
# 获取元素的背景图片地址
bg_url = element.value_of_css_property('background-image')
# 清除背景图片地址中的多余字符
bg_url = bg_url.replace('url("', '').replace('")', '')
content = bg_url
elif p["contentType"] == 5:
content = browser.current_url
elif p["contentType"] == 6:
content = browser.title
elif p["contentType"] == 7:
# 获取整个网页的高度和宽度
height = browser.execute_script("return document.body.scrollHeight");
width = browser.execute_script("return document.body.scrollWidth");
# 调整浏览器窗口的大小
browser.set_window_size(width, height)
element.screenshot("Data/" +saveName + "/"+ str(time.time()) + ".png")
elif p["contentType"] == 8:
try:
screenshot = element.screenshot_as_png
screenshot_stream = io.BytesIO(screenshot)
# 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L')
# 使用Tesseract OCR引擎识别图像中的文本
text = pytesseract.image_to_string(image, lang='chi_sim+eng')
content = text
except Exception as e:
content = "OCR Error"
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中https://blog.csdn.net/u010454030/article/details/80515501")
elif p["contentType"] == 9:
content = execute_code(2, p["JS"], p["JSWaitTime"], element)
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
command = 'var arr = [];\
var content = arguments[0];\
for(var i = 0, len = content.childNodes.length; i < len; i++) {\
if(content.childNodes[i].nodeType === 3){ \
arr.push(content.childNodes[i].nodeValue);\
}\
}\
var str = arr.join(" "); \
return str;'
content = browser.execute_script(command, element).replace(
"\n", "").replace("\\s+", " ")
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
elif p["contentType"] == 0:
content = element.text
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
recordLog('StaleElementReferenceException'+p["relativeXPath"])
time.sleep(3)
try:
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
recordLog('StaleElementReferenceExceptionloopElement')
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
recordLog(
'StaleElementReferenceExceptionloopElement+relativeXPath')
# 先处理特殊节点类型
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
element = browser.find_element(
By.XPATH, p["relativeXPath"])
recordLog('StaleElementReferenceExceptionrelativeXPath')
if p["contentType"] == 2:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
try:
downloadPic = p["downloadPic"]
except:
downloadPic = 0
if downloadPic == 1:
download_image(content, "Data/" +saveName + "/")
else: # 普通节点
if p["contentType"] == 0:
content = element.text
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
command = 'var arr = [];\
var content = arguments[0];\
for(var i = 0, len = content.childNodes.length; i < len; i++) {\
if(content.childNodes[i].nodeType === 3){ \
arr.push(content.childNodes[i].nodeValue);\
}\
}\
var str = arr.join(" "); \
return str;'
content = browser.execute_script(command, element).replace(
"\n", "").replace("\\s+", " ")
elif p["contentType"] == 2:
content = element.get_attribute('innerHTML')
elif p["contentType"] == 3:
content = element.get_attribute('outerHTML')
@ -788,55 +761,101 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
text = pytesseract.image_to_string(image, lang='chi_sim+eng')
content = text
except Exception as e:
content = "OCR失败"
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable path: https://tesseract-ocr.github.io/tessdoc/Installation.html")
print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量path中")
content = "OCR Error"
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中https://blog.csdn.net/u010454030/article/details/80515501")
elif p["contentType"] == 9:
content = execute_code(2, p["JS"], p["JSWaitTime"], element)
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
command = 'var arr = [];\
var content = arguments[0];\
for(var i = 0, len = content.childNodes.length; i < len; i++) {\
if(content.childNodes[i].nodeType === 3){ \
arr.push(content.childNodes[i].nodeValue);\
except StaleElementReferenceException: # 发生找不到元素的异常后,等待几秒重新查找
recordLog('StaleElementReferenceException'+p["relativeXPath"])
time.sleep(3)
try:
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
recordLog('StaleElementReferenceExceptionloopElement')
else:
element = loopElement.find_element(By.XPATH,
p["relativeXPath"][1:])
recordLog(
'StaleElementReferenceExceptionloopElement+relativeXPath')
else:
element = browser.find_element(
By.XPATH, p["relativeXPath"])
recordLog('StaleElementReferenceExceptionrelativeXPath')
# 先处理特殊节点类型
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
try:
downloadPic = p["downloadPic"]
except:
downloadPic = 0
if downloadPic == 1:
download_image(content, "Data/" +saveName + "/")
else: # 普通节点
if p["contentType"] == 0:
content = element.text
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
command = 'var arr = [];\
var content = arguments[0];\
for(var i = 0, len = content.childNodes.length; i < len; i++) {\
if(content.childNodes[i].nodeType === 3){ \
arr.push(content.childNodes[i].nodeValue);\
}\
}\
}\
var str = arr.join(" "); \
return str;'
content = browser.execute_script(command, element).replace(
"\n", "").replace("\\s+", " ")
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
elif p["contentType"] == 0:
content = element.text
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
content = element.get_attribute("href")
else:
content = ""
elif p["nodeType"] == 3:
if element.get_attribute("value") != None:
content = element.get_attribute("value")
else:
content = ""
elif p["nodeType"] == 4: # 图片
if element.get_attribute("src") != None:
content = element.get_attribute("src")
else:
content = ""
var str = arr.join(" "); \
return str;'
content = browser.execute_script(command, element).replace(
"\n", "").replace("\\s+", " ")
elif p["contentType"] == 2:
content = element.get_attribute('innerHTML')
elif p["contentType"] == 3:
content = element.get_attribute('outerHTML')
elif p["contentType"] == 4:
# 获取元素的背景图片地址
bg_url = element.value_of_css_property('background-image')
# 清除背景图片地址中的多余字符
bg_url = bg_url.replace('url("', '').replace('")', '')
content = bg_url
elif p["contentType"] == 5:
content = browser.current_url
elif p["contentType"] == 6:
content = browser.title
elif p["contentType"] == 7:
# 获取整个网页的高度和宽度
height = browser.execute_script("return document.body.scrollHeight");
width = browser.execute_script("return document.body.scrollWidth");
# 调整浏览器窗口的大小
browser.set_window_size(width, height)
element.screenshot("Data/" +saveName + "/"+ str(time.time()) + ".png")
elif p["contentType"] == 8:
try:
screenshot = element.screenshot_as_png
screenshot_stream = io.BytesIO(screenshot)
# 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L')
# 使用Tesseract OCR引擎识别图像中的文本
text = pytesseract.image_to_string(image, lang='chi_sim+eng')
content = text
except Exception as e:
content = "OCR Error"
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH: https://tesseract-ocr.github.io/tessdoc/Installation.html")
print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中https://blog.csdn.net/u010454030/article/details/80515501")
elif p["contentType"] == 9:
content = execute_code(2, p["JS"], p["JSWaitTime"], element)
except StaleElementReferenceException:
recordLog('StaleElementReferenceException'+p["relativeXPath"])
continue # 再出现类似问题直接跳过
@ -859,7 +878,7 @@ def isnull(s):
def saveData(exit=False):
global saveName, log, OUTPUT, browser
if exit == True or len(OUTPUT) > 100: # 每100条保存一次
if exit == True or len(OUTPUT) >= 100: # 每100条保存一次
with open("Data/"+saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
file_obj.write(log)
file_obj.close()
@ -890,6 +909,7 @@ if __name__ == '__main__':
"config_folder": "",
"config_file_name": "config.json",
"headless": False,
"version": "0.3.0",
}
c = Config(config)
print(c)

View File

@ -4,3 +4,5 @@ dist
.env
EasySpider_en
EasySpider_zh
EasySpider_en.crx
EasySpider_zh.crx

View File

@ -1,6 +1,6 @@
{
"name": "EasySpider",
"version": "0.2",
"version": "0.3.0",
"description": "EasySpider's chrome extension",
"author": "Naibo Wang",
"manifest_version": 3,

View File

@ -0,0 +1,48 @@
https://github.com/NaiboWang/EasySpider/releases/tag/v0.3.0
### 强烈建议大家观看新特性讲解视频
B站最新版特性视频已上传,新视频非常有用,推荐大家观看。
[【重要】自定义条件判断之使用循环项内的JS命令返回值 - 第二弹](https://www.bilibili.com/video/BV1mu411x7Nn/)
[如何执行自己写的JS代码和系统代码 (自定义操作)](https://www.bilibili.com/video/BV1qs4y1z7Hc/)
[如何自定义循环和判断条件 - 第一弹](https://www.bilibili.com/video/BV1Ys4y1z777/)
[如何对元素和网页截图及(无头模式)命令行执行指南](https://www.bilibili.com/video/BV1dV4y1z764/)
[OCR识别元素内容功能](https://www.bilibili.com/video/BV1xz4y1b72D/)
注意:v0.3.0版本任务(task文件夹内`.json`文件)和v0.2.0版本不兼容,请重新设计v0.3.0版本任务。
## 更新说明
1. 高级操作:
- 可以在任务流程中**执行自定义脚本**,包括在浏览器中**执行Javascript指令**以及**操作系统级别的脚本调用**并可**得到命令返回值并记录**,大大扩展了可操作空间。
![image](https://github.com/NaiboWang/EasySpider/assets/30287768/06e63a06-328d-4339-b40b-2d57c94cee66)
- 在每一个操作执行前和执行后都可以指定执行一段针对当前定位元素的JavaScript指令。
<img src="https://github.com/NaiboWang/EasySpider/assets/30287768/dde64388-5668-40ff-951e-fb8f60655c49" height=50% width=50%>
2. **判断条件和循环条件**中同样增加了**执行自定义脚本**,并根据自定义脚本的返回值是否为真来作为条件判断和循环的判断条件,同样极大的增加了任务的可操作性。
![image](https://github.com/NaiboWang/EasySpider/assets/30287768/9dea0564-1a1c-487d-9fa4-427c5e284796)
3. 可同时生成多种XPath供用户选择并**预装了XPath Helper扩展**供大家调试XPath。
4. 增加采集元素背景图片地址、当前页面标题、当前页面URL地址功能。
5. 增加保存元素截图功能,如要截图某元素或整个网页页面,可以用此功能(配合无头模式效果更好)。
6. 增加下载图片功能(仅正式版提供,Beta版没有)。
7. 增加OCR识别元素功能,使用此功能需首先自行安装Tesseract库:[https://blog.csdn.net/u010454030/article/details/80515501](https://blog.csdn.net/u010454030/article/details/80515501)。
8. 可直接提取对元素执行JavaScript代码后的返回值,实现如正则表达式、获得元素背景颜色等功能。
<img src="https://github.com/NaiboWang/EasySpider/assets/30287768/f6a9b5ce-63c5-4348-8967-053c21d67ef9" width=50% height=50%>
9. 大幅增加使用提示和说明,使软件更易用,如增加了iframe标签的处理方式说明、各个选项的参数意义以及循环项XPath的修改说明等等。
10. 执行命令时增加了如何用命令行执行任务的提示:[https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction](https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction)。
![image](https://github.com/NaiboWang/EasySpider/assets/30287768/a9e774df-e345-4d51-b7c9-2c4dac0ec624)
11. 增加无头模式,即无浏览器界面模式配置。
12. 修复了使用用户配置浏览器模式下的中文路径不能正确识别的问题。
13. 修复了条件分支没有无条件分支时会卡死的问题。
14. 修复了保存任务后会输入框卡死的问题。
15. 打开网页操作和点击元素操作新增设置页面最长加载等待时间。
16. 增加版本更新提示。
17. 更新chrome版本为113。

View File

@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\Releases\\EasySpider_windows_amd64\\user_data1"}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data12","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\Releases\\EasySpider_windows_amd64\\user_data1"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long