Update Readme

This commit is contained in:
NaiboWang-Alienware 2023-02-04 00:04:36 +08:00
parent e9d0caea95
commit 9cb86d3b82
22 changed files with 180 additions and 2145 deletions

View File

@ -1,691 +0,0 @@
# -*- coding: utf-8 -*-
import atexit # 遇到错误退出时应执行的代码
import json
from lib2to3.pgen2 import driver
import re
import sys
from urllib import parse
import base64
import hashlib
import time
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
import numpy
import csv
import os
from selenium.webdriver.common.by import By
# Shared run state. The entry-point script rebinds these once a task is
# loaded, and clean() reads them when the interpreter exits.
saveName = None  # base name of the output files
log = ""         # accumulated run-log text
OUTPUT = ""      # collected rows (rebound to a list by the entry point)
browser = None   # WebDriver instance
SAVED = False    # True once the results were written normally

# `desired_capabilities` is an alias of Selenium's CHROME capability dict, so
# the item assignment below changes that shared dict in place.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
class Time:
    """Millisecond stopwatch that reports its elapsed time through Log()."""

    def __init__(self, type1=""):
        # Keep the label and the start instant (epoch time in milliseconds).
        self.type = type1
        self.t = int(round(time.time() * 1000))

    def end(self):
        """Log "<label>:<milliseconds elapsed since construction>"."""
        now = int(round(time.time() * 1000))
        elapsed = now - self.t
        Log("%s:%s" % (self.type, elapsed))
# Append one message to the in-memory run log.
def recordLog(str=""):
    """Append `str` followed by a newline to the module-level `log` text."""
    global log
    log = "".join((log, str, "\n"))
# Console debug output.
def Log(text, text2=""):
    """Print `text` and `text2` when the local switch is on.

    The switch is hard-coded to False, so as written nothing is printed.
    """
    switch = False
    if not switch:
        return
    print(text, text2)
# Page scrolling.
def _scrollPage(para):
    """Send the configured scroll key to <body> `para["scrollCount"]` times."""
    if para["scrollType"] == 0 or not (para["scrollCount"] > 0):
        return
    for _ in range(para["scrollCount"]):
        time.sleep(1)  # pause for one second before each key press
        Log("下拉完等待1秒")
        body = browser.find_element(By.CSS_SELECTOR, "body")
        # scrollType 1 sends Page Down; any other non-zero type sends End.
        body.send_keys(Keys.PGDN if para["scrollType"] == 1 else Keys.END)


def scrollDown(para, rt=""):
    """Scroll the current page as configured by `para`.

    Args:
        para: node parameters; reads "scrollType" (0 disables scrolling) and
            "scrollCount" (number of key presses).
        rt: optional timer object; its end() is called after a timeout was
            handled. The default "" means no timer.

    On TimeoutException the page load is stopped with window.stop() and the
    whole scroll sequence is run once more; a timeout during that second
    attempt propagates to the caller.
    """
    try:
        _scrollPage(para)
    except TimeoutException:
        Log('time out after 10 seconds when scrolling. ')
        recordLog('time out after 10 seconds when scrolling')
        browser.execute_script('window.stop()')
        _scrollPage(para)
        if rt != "":
            rt.end()
# Core dispatcher: run one node of the task graph.
def excuteNode(nodeId, loopValue="", clickPath="", index=0):
    """Execute `procedure[nodeId]`, then pause.

    Dispatch by node["option"]:
        0 / 10  run every child listed in node["sequence"]
        1       open a page
        2       click an element
        3       extract data
        4       type text
        8       loop
        9       conditional branch
    `loopValue`, `clickPath` and `index` are forwarded to the handlers.

    Every node except option 0 then sleeps: node["parameters"]["wait"]
    seconds when that value is greater than 1, otherwise 0.01 seconds.
    """
    node = procedure[nodeId]
    # NOTE(review): these are two separate expression statements, so `until`
    # is never called and no wait for the element happens here.
    WebDriverWait(browser, 10).until
    (EC.visibility_of_element_located((By.XPATH, node["parameters"]["xpath"])))
    params = node["parameters"]
    option = node["option"]

    if option in (0, 10):  # root node or a branch of a conditional
        for childId in node["sequence"]:
            excuteNode(childId, loopValue, clickPath, index)
    elif option == 1:  # open page
        recordLog("openPage")
        openPage(params, loopValue)
    elif option == 2:  # click element
        recordLog("Click")
        clickElement(params, loopValue, clickPath, index)
    elif option == 3:  # extract data
        recordLog("getData")
        getData(params, loopValue, node["isInLoop"])
    elif option == 4:  # type text
        inputInfo(params, loopValue)
    elif option == 8:  # loop
        recordLog("loop")
        loopExcute(node, loopValue, clickPath, index)
    elif option == 9:  # conditional
        recordLog("judge")
        judgeExcute(node, loopValue, clickPath, index)

    # Pause after the node has run (the root node is exempt).
    if option != 0:
        configured = params["wait"]
        waitTime = configured if configured > 1 else 0.01
        time.sleep(waitTime)
        Log("Node执行完后等待", waitTime)
# Conditional branching.
def judgeExcute(node, loopElement, clickPath="", index=0):
    """Run the first branch of `node` whose condition holds.

    Each id in node["sequence"] names a branch node; its
    parameters["class"] selects the condition type:
        0  unconditional
        1  the cached page text (`bodyText`) contains parameters["value"]
        2  the page contains an element matching the XPath in "value"
        3  the text of `loopElement` contains "value"
        4  `loopElement` contains an element matching "value" with its
           first character removed
    A condition that raises while being evaluated counts as not met.

    When no branch matches, nothing is executed. Previously the branch id
    defaulted to 0, which re-entered the root node of the task.
    """
    rt = Time("条件判断")
    excuteBranchId = None  # id of the branch to run; None = nothing matched
    for i in node["sequence"]:
        cnode = procedure[i]
        tType = int(cnode["parameters"]["class"])
        try:
            if tType == 0:
                matched = True
            elif tType == 1:
                matched = bodyText.find(cnode["parameters"]["value"]) >= 0
            elif tType == 2:
                matched = bool(browser.find_element(
                    By.XPATH, cnode["parameters"]["value"]))
            elif tType == 3:
                matched = loopElement.text.find(
                    cnode["parameters"]["value"]) >= 0
            elif tType == 4:
                matched = bool(loopElement.find_element(
                    By.XPATH, cnode["parameters"]["value"][1:]))
            else:
                matched = False
        except Exception:
            # Element missing, bad XPath, etc.: try the next branch.
            matched = False
        if matched:
            excuteBranchId = i
            break
    rt.end()
    if excuteBranchId is None:
        recordLog("judge: no branch condition matched")
        return
    excuteNode(excuteBranchId, loopElement, clickPath, index)
# Loop handling.
def _returnToLoopPage(para, thisHandle, thisHistoryLength):
    """Undo the tab and history changes made by one loop iteration."""
    if browser.current_window_handle != thisHandle:
        # Close tabs until the tab the loop started in is current again.
        while True:
            browser.close()
            browser.switch_to.window(browser.window_handles[-1])
            if browser.current_window_handle == thisHandle:
                break
    if (history["index"] != thisHistoryLength
            and history["handle"] == browser.current_window_handle):
        # Same tab, but the history grew: step back to where the loop began.
        difference = thisHistoryLength - history["index"]
        browser.execute_script('history.go(' + str(difference) + ')')
        # Wait at least 2 seconds after going back.
        if para["historyWait"] > 2:
            time.sleep(para["historyWait"])
        else:
            time.sleep(2)
        Log("切换历史记录等待2秒或者", para["historyWait"])
        browser.execute_script('window.stop()')


def loopExcute(node, loopValue, clickPath="", index=0):
    """Run the child nodes of a loop node.

    node["parameters"]["loopType"] selects the kind of loop:
        0  single element: look up "xpath" and run the children, repeating
           until the element is not found or "exitCount" rounds were done
        1  dynamic list: run the children once per element matching "xpath"
        2  fixed list: run the children once per XPath line in "pathList"
        3  text list: run the children once per line in "textList"
        4  URL list: not implemented
    For types 1 and 2 the original tab and history position are restored
    after every iteration. Afterwards the recorded history state is reset
    and the page is scrolled as configured on the node.

    `loopValue`, `clickPath` and `index` are accepted for signature
    compatibility with the other node handlers; they are not used here.
    """
    time.sleep(0.1)
    Log("循环执行前等待0.1秒")
    para = node["parameters"]
    thisHandle = browser.current_window_handle  # tab the loop runs in
    thisHistoryLength = browser.execute_script('return history.length')
    history["index"] = thisHistoryLength
    history["handle"] = thisHandle
    loopType = int(para["loopType"])

    if loopType == 0:  # single element
        count = 0  # completed rounds
        while True:
            try:
                element = browser.find_element(By.XPATH, para["xpath"])
                for i in node["sequence"]:
                    excuteNode(i, element, para["xpath"], 0)
                Log("click: ", para["xpath"])
                recordLog("click:" + para["xpath"])
            except NoSuchElementException:
                Log("clickNotFound: ", para["xpath"])
                recordLog("clickNotFound:" + para["xpath"])
                break  # element gone: leave the loop
            count = count + 1
            Log("页数:", count)
            recordLog("页数:" + str(count))
            if para["exitCount"] == count:
                break
    elif loopType == 1:  # dynamic element list
        try:
            elements = browser.find_elements(By.XPATH, para["xpath"])
            for position, element in enumerate(elements):
                for i in node["sequence"]:
                    excuteNode(i, element, para["xpath"], position)
                _returnToLoopPage(para, thisHandle, thisHistoryLength)
        except NoSuchElementException:
            # A missing element ends this loop without failing the task.
            Log("pathNotFound: ", para["xpath"])
            recordLog("pathNotFound: " + para["xpath"])
    elif loopType == 2:  # fixed element list, one XPath per line
        for path in para["pathList"].split("\n"):
            try:
                element = browser.find_element(By.XPATH, path)
                for i in node["sequence"]:
                    excuteNode(i, element, path, 0)
                _returnToLoopPage(para, thisHandle, thisHistoryLength)
            except NoSuchElementException:
                # Skip paths that do not resolve.
                Log("pathNotFound: ", path)
                recordLog("pathNotFound: " + path)
    elif loopType == 3:  # fixed text list, one value per line
        for text in para["textList"].split("\n"):
            recordLog("input: " + text)
            for i in node["sequence"]:
                excuteNode(i, text, "", 0)
    elif loopType == 4:  # fixed URL list
        pass  # not implemented yet

    history["index"] = thisHistoryLength
    history["handle"] = browser.current_window_handle
    scrollDown(para)
# Open-page operation.
def openPage(para, loopValue):
    """Load a URL in the first browser tab.

    Args:
        para: node parameters; "useLoop" selects the URL source and the
            scroll settings are forwarded to scrollDown().
        loopValue: URL to open when para["useLoop"] is truthy; otherwise
            links[urlId] is opened.

    After loading, the history length is recorded in `history`, the page is
    scrolled, and, when `containJudge` is set, the page text is cached in
    `bodyText`. A timeout at any step stops the page load with window.stop().
    """
    global bodyText
    rt = Time("打开网页")
    time.sleep(2)  # fixed two-second pause before the page is opened

    if len(browser.window_handles) > 1:
        # More than one tab is open: close the last one.
        browser.switch_to.window(browser.window_handles[-1])
        browser.close()
    # Always work from the first tab.
    browser.switch_to.window(browser.window_handles[0])
    history["handle"] = browser.current_window_handle

    url = loopValue if para["useLoop"] else links[urlId]
    try:
        browser.get(url)
    except TimeoutException:
        Log('time out after 10 seconds when loading page: ' + url)
        recordLog('time out after 10 seconds when loading page: ' + url)
        browser.execute_script('window.stop()')
        rt.end()

    historyScript = "return history.length"
    try:
        history["index"] = browser.execute_script(historyScript)
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script(historyScript)
        rt.end()

    scrollDown(para, rt)  # scroll as configured on the node

    if not containJudge:
        rt.end()
        return
    # Conditions may test the page text, so refresh the cached copy.
    try:
        bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
    except TimeoutException:
        Log('time out after 10 seconds when getting body text: ' + url)
        recordLog('time out after 10 seconds when getting body text:: ' + url)
        browser.execute_script('window.stop()')
        time.sleep(1)
        Log("获得bodytext等待1秒")
        # Second attempt after stopping the page load.
        bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
        rt.end()
    except Exception as e:
        Log(e)
        recordLog(str(e))
    rt.end()
# Keyboard input operation.
def inputInfo(para, loopValue):
    """Replace the content of a text box with new text.

    Args:
        para: node parameters; "xpath" locates the text box, "useLoop"
            selects the text source, "value" is the fixed text.
        loopValue: text to type when para["useLoop"] is truthy.

    Raises:
        SystemExit: when the text box cannot be located. sys.exit() is used
            instead of the `exit` builtin, which the `site` module provides
            and which may therefore be missing.
    """
    global bodyText
    time.sleep(1)  # pause for one second before typing
    Log("输入前等待1秒")
    rt = Time("输入文字")
    try:
        textbox = browser.find_element(By.XPATH, para["xpath"])
    except Exception:
        Log("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        recordLog("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        sys.exit()
    # Select everything in the box and delete it before typing.
    textbox.send_keys(Keys.CONTROL, 'a')
    textbox.send_keys(Keys.BACKSPACE)
    textbox.send_keys(loopValue if para["useLoop"] else para["value"])
    # The page text may have changed: refresh the cached copy.
    bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
    rt.end()
# Click operation.
def clickElement(para, loopElement=None, clickPath="", index=0):
    """Click the `index`-th element matching an XPath via JavaScript.

    Args:
        para: node parameters; "useLoop" chooses between `clickPath` and
            para["xpath"]; the scroll settings are forwarded to scrollDown().
        loopElement: unused; kept for signature compatibility.
        clickPath: XPath to use when para["useLoop"] is truthy.
        index: zero-based position among the nodes matching the XPath.

    The XPath is handed to the script as an argument rather than spliced
    into the script text, so backticks, "${" or backslashes in it cannot
    break or alter the script. If the click opened a new tab, the driver
    switches to it. The history length is then recorded, the page is
    scrolled, and, when `containJudge` is set, `bodyText` is refreshed.
    """
    global bodyText
    time.sleep(0.1)  # short pause before clicking
    rt = Time("点击元素")
    Log("点击之前等待1秒")
    path = clickPath if para["useLoop"] else para["xpath"]
    tempHandleNum = len(browser.window_handles)  # tab count before the click
    script = ('var result = document.evaluate(arguments[0], document, null, '
              'XPathResult.ANY_TYPE, null);'
              'for(let i=0;i<arguments[1];i++){result.iterateNext();} '
              'result.iterateNext().click();')
    try:
        browser.execute_script(script, path, index)
    except TimeoutException:
        Log('time out after 10 seconds when loading clicked page')
        recordLog('time out after 10 seconds when loading clicked page')
        browser.execute_script('window.stop()')
        rt.end()
    except Exception as e:
        # Best effort: a failed click is logged and the task goes on.
        Log(e)
        recordLog(str(e))
    time.sleep(0.5)  # give the click time to take effect
    Log("点击之后等待0.5秒")

    if tempHandleNum != len(browser.window_handles):
        # The click opened a new tab: continue there.
        browser.switch_to.window(browser.window_handles[-1])
        history["handle"] = browser.current_window_handle
    try:
        history["index"] = browser.execute_script("return history.length")
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script("return history.length")
        rt.end()

    scrollDown(para, rt)  # scroll as configured on the node

    if containJudge:
        # Conditions may test the page text, so refresh the cached copy.
        try:
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
        except TimeoutException:
            Log('time out after 10 seconds when getting body text')
            recordLog('time out after 10 seconds when getting body text')
            browser.execute_script('window.stop()')
            time.sleep(1)
            Log("bodytext等待1秒")
            # Second attempt after stopping the page load.
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
            rt.end()
        except Exception as e:
            Log(e)
            recordLog(str(e))
    rt.end()
# Data extraction operation.
def _locateDataElement(p, loopElement, staleRetry=False):
    """Return the element described by field definition `p`.

    With `staleRetry` set, the lookup path taken is written to the run log.
    """
    if not p["relative"]:
        element = browser.find_element(By.XPATH, p["relativeXpath"])
        if staleRetry:
            recordLog('StaleElementReferenceExceptionrelativeXpath')
        return element
    if p["relativeXpath"] == "":
        # An empty relative XPath means the loop element itself.
        if staleRetry:
            recordLog('StaleElementReferenceExceptionloopElement')
        return loopElement
    # The first character of the stored XPath is dropped before the lookup.
    element = loopElement.find_element(By.XPATH, p["relativeXpath"][1:])
    if staleRetry:
        recordLog('StaleElementReferenceExceptionloopElement+relativeXPath')
    return element


def _attributeOrEmpty(element, name):
    """Return the attribute `name` of `element`, or "" when it is None."""
    value = element.get_attribute(name)
    return value if value is not None else ""


def _readFieldContent(p, element):
    """Extract the content of `element` as configured by field `p`."""
    contentType = p["contentType"]
    if contentType == 2:
        return _attributeOrEmpty(element, 'innerHTML')
    if contentType == 3:
        return _attributeOrEmpty(element, 'outerHTML')
    if contentType == 1:
        # Text of the element's own text nodes only (child elements excluded).
        command = ('var arr = [];'
                   'var content = arguments[0];'
                   'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
                   'if(content.childNodes[i].nodeType === 3){ '
                   'arr.push(content.childNodes[i].nodeValue);'
                   '}'
                   '}'
                   'var str = arr.join(" "); '
                   'return str;')
        content = browser.execute_script(command, element).replace("\n", "")
        # Collapse whitespace runs. The former str.replace("\\s+", " ")
        # looked for the literal text "\s+" and so never matched.
        content = re.sub(r"\s+", " ", content)
    elif contentType == 0:
        content = element.text
    else:
        return ""
    # nodeType 2/3/4 replaces the text with the href/value/src attribute.
    attribute = {2: "href", 3: "value", 4: "src"}.get(p["nodeType"])
    if attribute is not None:
        content = _attributeOrEmpty(element, attribute)
    return content


def getData(para, loopElement, isInLoop=True):
    """Extract every field in para["paras"] and append one row to OUTPUT.

    Args:
        para: node parameters; "paras" is the list of field definitions and
            "wait" the configured wait of the node.
        loopElement: element that relative XPaths are resolved against.
        isInLoop: when False and para["wait"] is 0, sleep one second first.

    A field whose element is not found gets its "default" value ("" when
    absent). A stale element is looked up and read once more after three
    seconds; if it is stale again the field keeps its previous value. The
    row is built from all values in `outputParameters`, so fields not
    extracted by this node carry over what was stored earlier.
    """
    if not isInLoop and para["wait"] == 0:
        time.sleep(1)
        Log("提取数据等待1秒")
    rt = Time("提取数据")
    for p in para["paras"]:
        try:
            element = _locateDataElement(p, loopElement)
        except NoSuchElementException:
            outputParameters[p["name"]] = p.get("default", "")
            Log('Element %s not found,use default' % p["relativeXpath"])
            recordLog('Element %s not found, use default' % p["relativeXpath"])
            continue
        except TimeoutException:
            Log('time out after 10 seconds when getting data')
            recordLog('time out after 10 seconds when getting data')
            browser.execute_script('window.stop()')
            # Second attempt after stopping the page load.
            element = _locateDataElement(p, loopElement)
            rt.end()
        try:
            content = _readFieldContent(p, element)
        except StaleElementReferenceException:
            recordLog('StaleElementReferenceException' + p["relativeXpath"])
            time.sleep(3)
            try:
                element = _locateDataElement(p, loopElement, staleRetry=True)
                content = _readFieldContent(p, element)
            except StaleElementReferenceException:
                recordLog('StaleElementReferenceException' + p["relativeXpath"])
                continue  # still stale: give up on this field
        outputParameters[p["name"]] = content
    line = list(outputParameters.values())
    for value in line:
        # Console preview: the first 15 characters of each value.
        print(value[:15], " ", end="")
    print("")
    OUTPUT.append(line)
    rt.end()
# Emptiness predicate.
def isnull(s):
    """Return True when `s` has at least one item, False when it is empty.

    Despite the name, the result is True for NON-empty input.
    """
    return len(s) > 0
@atexit.register
def clean():
    """Exit hook: save the log and collected rows if that has not happened.

    Does nothing when SAVED is already True. When the interpreter exits
    before a save name was chosen there is nothing to write, so only the
    browser (if one was started) is closed. Otherwise the log is written to
    Data/<saveName>_log.txt and the rows to Data/<saveName>.csv, the browser
    is closed even if writing fails, and SystemExit is raised with the CSV
    file name as its argument.
    """
    if SAVED:
        return
    if saveName is None:
        if browser is not None:
            browser.quit()
        return
    print('清理环境保存数据')
    try:
        os.makedirs("Data", exist_ok=True)
        with open("Data/" + saveName + '_log.txt', 'w',
                  encoding='utf-8-sig') as file_obj:
            file_obj.write(log)
        with open("Data/" + saveName + '.csv', 'w', encoding='utf-8-sig',
                  newline="") as f:
            csv.writer(f).writerows(OUTPUT)
    finally:
        if browser is not None:
            browser.quit()
    sys.exit(saveName + '.csv')
# Script entry point.
# Usage: <script> [taskId] [saveNameSuffix] [backEndAddress]
# Loads tasks/<taskId>.json, runs the task graph once per link and writes
# Data/<saveName>_log.txt and Data/<saveName>.csv.
if __name__ == '__main__':
    options = Options()
    exe_path = "chromedriver.exe"
    # Pick the bundled Chrome / chromedriver based on the working directory.
    if os.path.exists(os.getcwd() + "/ServiceWrapper"):
        print("Finding chromedriver in ServiceWrapper",
              os.getcwd() + "/ServiceWrapper")
        options.binary_location = "ServiceWrapper/Chrome/chrome.exe"
        exe_path = "ServiceWrapper/Chrome/chromedriver.exe"
    elif os.path.exists(os.getcwd() + "/Debug"):
        print("Finding chromedriver in Debug", os.getcwd() + "/Debug")
        options.binary_location = "Debug/Chrome/chrome.exe"
        exe_path = "Debug/Chrome/chromedriver.exe"
    elif "ExecuteStage" in os.getcwd():  # run directly from the source folder
        print("Finding chromedriver in Chrome", os.getcwd() + "/Chrome")
        options.binary_location = "./Chrome/chrome.exe"
        exe_path = "./Chrome/chromedriver.exe"
    else:
        options.binary_location = "chrome.exe"
    # NOTE(review): newer Selenium 4 releases expect a Service object instead
    # of `executable_path` -- confirm against the Selenium version in use.
    browser = webdriver.Chrome(options=options, executable_path=exe_path)
    browser.get('about:blank')
    browser.set_page_load_timeout(10)  # maximum page-load time in seconds
    browser.set_script_timeout(10)

    # Task id: first command-line argument, 7 by default.
    taskId = int(sys.argv[1]) if len(sys.argv) > 1 else 7
    print("id", taskId)
    # Output base name: task_<id>_<second argument or a random number>.
    if len(sys.argv) > 2:
        saveName = "task_" + str(taskId) + "_" + sys.argv[2]
    else:
        saveName = "task_" + str(taskId) + "_" + \
            str(random.randint(0, 999999999))
    print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
    if len(sys.argv) > 3:
        backEndAddress = sys.argv[3]
    else:
        backEndAddress = "http://servicewrapper.naibo.wang"

    # Read with an explicit encoding so non-ASCII task text does not depend
    # on the platform's default encoding.
    with open("tasks/" + str(taskId) + ".json", 'r', encoding='utf-8') as f:
        service = json.load(f)
    print("name:", service["name"])
    procedure = service["graph"]  # the task graph
    # One URL per line. splitlines() plus strip() also handles "\r\n"
    # separators, which split("\n") left as a trailing "\r" on every link.
    links = list(filter(
        isnull, (line.strip() for line in service["links"].splitlines())))
    OUTPUT = [[]]  # collected rows; the first row is the header
    containJudge = service["containJudge"]  # whether the task has conditions
    bodyText = ""  # cached text of the current page
    tOut = service["outputParameters"]
    outputParameters = {}  # current value of every output field
    log = ""  # accumulated run-log text
    history = {"index": 0, "handle": None}  # history length / tab in use
    SAVED = False  # set once the results have been written
    for para in tOut:
        outputParameters[para["name"]] = ""
        OUTPUT[0].append(para["name"])

    # Run the whole graph once per link; openPage() reads links[urlId].
    urlId = 0
    for _ in links:
        excuteNode(0)
        urlId = urlId + 1
    print("执行完成!")
    recordLog("Done!")

    os.makedirs("Data", exist_ok=True)
    with open("Data/" + saveName + '_log.txt', 'w',
              encoding='utf-8-sig') as file_obj:
        file_obj.write(log)
    with open("Data/" + saveName + '.csv', 'w', encoding='utf-8-sig',
              newline="") as f:
        csv.writer(f).writerows(OUTPUT)
    SAVED = True  # tells clean() not to save again at exit
    browser.quit()

View File

@ -1,42 +0,0 @@
# -*- coding: utf-8 -*-
from base64 import encode
import json
import os
def queryService(id):
    """Load and return the service definition stored in services/<id>.json."""
    path = "services/%d.json" % id
    with open(path, "r", encoding='utf-8') as handle:
        return json.load(handle)
def invokeService(id, data):
    """Create a task file from service `id` with the inputs in `data`.

    Args:
        id: service id; the definition is loaded with queryService().
        data: mapping of input-parameter name to value. "urlList_0", when
            present, also replaces the service's top-level "links".

    Each key of `data` matching an entry of service["inputParameters"] is
    written into the graph node named by that entry's "nodeId":
        option 1 (open page)       -> parameters["links"]
        option 4 (input text)      -> parameters["value"]
        option 8 with loopType 0   -> parameters["exitCount"] (as int)
        option 8 otherwise         -> parameters["textList"]

    Returns:
        The new task id; the task is saved as tasks/<id>.json. The id starts
        at (number of entries in tasks/) + 1 and is increased until no file
        with that name exists, so an existing task is never overwritten.
    """
    service = queryService(id)
    if "urlList_0" in data:
        service["links"] = data["urlList_0"]
    for key, value in data.items():
        for definition in service["inputParameters"]:
            if key != definition["name"]:
                continue
            node = service["graph"][int(definition["nodeId"])]
            parameters = node["parameters"]
            if node["option"] == 1:
                parameters["links"] = value
            elif node["option"] == 4:
                parameters["value"] = value
            elif node["option"] == 8 and parameters["loopType"] == 0:
                parameters["exitCount"] = int(value)
            elif node["option"] == 8:
                parameters["textList"] = value
            break  # only the first definition with this name is used
    os.makedirs("tasks", exist_ok=True)
    count = len(os.listdir("tasks")) + 1
    while os.path.exists("tasks/%d.json" % count):
        count += 1
    service["id"] = count  # the task carries its own id
    print(count)
    with open("tasks/%d.json" % count, "w", encoding='utf-8') as f:
        json.dump(service, f, ensure_ascii=False)
    return count

View File

@ -1,218 +0,0 @@
{
"id": 6,
"name": "toutiao_authors",
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 2,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"
},
{
"id": 1,
"name": "loopTimes_Loop_Click_1",
"nodeId": 5,
"nodeName": "Loop_Click",
"desc": "循环Loop_Click执行的次数0代表无限循环",
"type": "int",
"exampleValue": 5,
"value": 5
}
],
"outputParameters": [
{
"id": 0,
"name": "参数1_链接文本",
"desc": "",
"type": "string",
"exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": -1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 0,
"url": "https://www.jd.com",
"links": "https://www.jd.com",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 1,
"index": 2,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 5,
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 3,
"index": 3,
"parentId": 2,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": 5,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 1,
"contentType": 0,
"relative": true,
"name": "参数1_链接文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "经常戴耳机危害 - 语音科普 - 博禾医生"
},
{
"num": 1,
"value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"
},
{
"num": 2,
"value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"
},
{
"num": 3,
"value": "长时间戴耳机听歌或致耳聋-名医在线网"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 2,
"index": 5,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop_Click",
"sequence": [3, 6],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//span[contains(text(), \"下一页\")]/../..",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 0,
"pathList": "",
"textList": "",
"exitCount": 5,
"historyWait": 2
}
},
{
"id": 4,
"index": 6,
"parentId": 2,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": true,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": true,
"xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": [],
"loopType": 0
}
}
]
}

View File

@ -1,278 +0,0 @@
{
"id": 7,
"name": "toutiao_author_desc",
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 1,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
}
],
"outputParameters": [
{
"id": 0,
"name": "参数2_文本",
"desc": "",
"type": "string",
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
},
{
"id": 1,
"name": "参数3_文本",
"desc": "",
"type": "string",
"exampleValue": "63.3万获赞"
},
{
"id": 2,
"name": "参数1_文本",
"desc": "",
"type": "string",
"exampleValue": "小荷医典"
},
{
"id": 3,
"name": "参数6_链接地址",
"desc": "",
"type": "string",
"exampleValue": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [1, 6, 2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": 1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 1,
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 3,
"index": 2,
"parentId": 0,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//a[contains(@class, 'flex-row')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": []
}
},
{
"id": -1,
"index": 3,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": -1,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": true,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
},
{
"num": 1,
"value": "10.3万粉丝"
},
{
"num": 2,
"value": "5关注"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 4,
"index": 5,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 3,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数2_文本",
"desc": "",
"relativeXpath": "//p[contains(@class, 'user-desc')]",
"exampleValues": [
{
"num": 0,
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数3_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
"exampleValues": [
{
"num": 0,
"value": "小荷医典"
}
]
}
]
}
},
{
"id": 2,
"index": 6,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数6_链接地址",
"desc": "",
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
"exampleValues": [
{
"num": 0,
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"default": ""
}
]
}
}
]
}

View File

@ -1,292 +0,0 @@
{
"id": 7,
"name": "toutiao_author_desc",
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 1,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
}
],
"outputParameters": [
{
"id": 0,
"name": "参数2_文本",
"desc": "",
"type": "string",
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
},
{
"id": 1,
"name": "参数3_文本",
"desc": "",
"type": "string",
"exampleValue": "63.3万获赞"
},
{
"id": 2,
"name": "参数1_文本",
"desc": "",
"type": "string",
"exampleValue": "小荷医典"
},
{
"id": 3,
"name": "参数1_链接地址",
"desc": "",
"type": "string",
"exampleValue": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [1, 2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": 1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 1,
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 2,
"index": 2,
"parentId": 0,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//a[contains(@class, 'flex-row')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": []
}
},
{
"id": -1,
"index": 3,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": -1,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": true,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
},
{
"num": 1,
"value": "10.3万粉丝"
},
{
"num": 2,
"value": "5关注"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 3,
"index": 5,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数2_文本",
"desc": "",
"relativeXpath": "//p[contains(@class, 'user-desc')]",
"exampleValues": [
{
"num": 0,
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数3_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
"exampleValues": [
{
"num": 0,
"value": "小荷医典"
}
]
},
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数1_链接地址",
"desc": "",
"relativeXpath": "//a[@class=\"avatar\"]",
"exampleValues": [
{
"num": 0,
"value": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
}
]
}
]
}
},
{
"id": -1,
"index": 6,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数6_链接地址",
"desc": "",
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
"exampleValues": [
{
"num": 0,
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"default": ""
}
]
}
}
]
}

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{'id': 12, 'name': 'toutiao_authors', 'url': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=?&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'containJudge': False, 'desc': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=?&pd=synthesis&action_type=pagination&page_num=0', 'inputParameters': [{'id': 0, 'name': 'urlList_0', 'nodeId': 2, 'nodeName': 'Open Page', 'value': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword= ?&pd=synthesis&action_type=pagination&page_num=0', 'desc': ',\\n', 'type': 'string', 'exampleValue': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=? &pd=synthesis&action_type=pagination&page_num=0'}, {'id': 1, 'name': 'loopTimes_Loop_Click_1', 'nodeId': 5, 'nodeName': 'Loop_Click', 'desc': 'Loop_Click 0', 'type': 'int', 'exampleValue': 5, 'value': 5}], 'outputParameters': [{'id': 0, 'name': '1_', 'desc': '', 'type': 'string', 'exampleValue': ' - - '}], 'graph': [{'index': 0, 'id': 0, 'parentId': 0, 'type': -1, 'option': 0, 'title': 'root', 'sequence': [2, 5], 'parameters': {'history': 1, 'tabIndex': 0, 'useLoop': False, 'xpath': '', 'wait': 0}, 'isInLoop': False}, {'id': -1, 'index': 1, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 0, 'url': 'https://www.jd.com', 'links': 'https://www.jd.com', 'scrollType': 0, 'scrollCount': 0}}, {'id': 1, 'index': 2, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 5, 'url': 
'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword= ?&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'scrollType': 0, 'scrollCount': 0}}, {'id': 3, 'index': 3, 'parentId': 2, 'type': 1, 'option': 8, 'title': 'Loop', 'sequence': [4], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//div[contains(@class, "cs-source")]//span[contains(@class, \'text-ellipsis\')][1]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 1, 'pathList': '', 'textList': '', 'exitCount': 0, 'historyWait': 2}}, {'id': 5, 'index': 4, 'parentId': 3, 'type': 0, 'option': 3, 'title': 'Extract Data', 'sequence': [], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '', 'wait': 0, 'paras': [{'nodeType': 1, 'contentType': 0, 'relative': True, 'name': '1_', 'desc': '', 'relativeXpath': '', 'exampleValues': [{'num': 0, 'value': ' - - '}, {'num': 1, 'value': ' - - '}, {'num': 2, 'value': ' - - '}, {'num': 3, 'value': '-线'}], 'default': ''}], 'loopType': 1}}, {'id': 2, 'index': 5, 'parentId': 0, 'type': 1, 'option': 8, 'title': 'Loop_Click', 'sequence': [3, 6], 'isInLoop': False, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//span[contains(text(), "下一页")]/../..', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 0, 'pathList': '', 'textList': '', 'exitCount': 5, 'historyWait': 2}}, {'id': 4, 'index': 6, 'parentId': 2, 'type': 0, 'option': 2, 'title': 'Click Element', 'sequence': [], 'isInLoop': True, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': True, 'xpath': 
'//*[@id="s-dom-f0607f20"]/div[1]/div[1]/a[7]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'paras': [], 'loopType': 0}}]}

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{"id": 12, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "desc": "要采集的网址列表,多行以\\n分开", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "循环Loop_Click执行的次数0代表无限循环", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "参数1_链接文本", "desc": "", "type": "string", "exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, "xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open 
Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "参数1_链接文本", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "经常戴耳机危害 - 语音科普 - 博禾医生"}, {"num": 1, "value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"}, {"num": 2, "value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"}, {"num": 3, "value": "长时间戴耳机听歌或致耳聋-名医在线网"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"下一页\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, 
{"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +0,0 @@
{"id": 4, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "desc": "\u8981\u91c7\u96c6\u7684\u7f51\u5740\u5217\u8868,\u591a\u884c\u4ee5\\n\u5206\u5f00", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "\u5faa\u73afLoop_Click\u6267\u884c\u7684\u6b21\u6570\uff080\u4ee3\u8868\u65e0\u9650\u5faa\u73af\uff09", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "type": "string", "exampleValue": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, 
"xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}, {"num": 1, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u6709\u4ec0\u4e48\u5371\u5bb3 - \u535a\u79be\u5fae\u89c6 - \u535a\u79be\u533b\u751f"}, {"num": 2, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u8033\u6735\u4f1a\u4e0d\u4f1a\u804b - 
\u4e13\u5bb6\u6587\u7ae0 - \u535a\u79be\u533b\u751f"}, {"num": 3, "value": "\u957f\u65f6\u95f4\u6234\u8033\u673a\u542c\u6b4c\u6216\u81f4\u8033\u804b-\u540d\u533b\u5728\u7ebf\u7f51"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"\u4e0b\u4e00\u9875\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, {"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

196
Readme.md
View File

@ -1,190 +1,34 @@
# 请您Star Please Star
<!-- # 请您Star Please Star
如果你觉得此工具不错,请轻轻点击此页面右上角**Star**按钮增加项目曝光度,谢谢!
If you think this tool is good, please click the **Star** button in the upper right corner of this page to increase the project's exposure. Thank you!
If you think this tool is good, please click the **Star** button in the upper right corner of this page to increase the project's exposure. Thank you! -->
# 无代码服务可视化Web数据采集爬虫器 Code-Free Visual Web Data Crawler/Spider (Service Wrapper)
# EasySpider: Visual Code-Free Web Crawler/Spider
一个可以可视化无代码设计和执行的面向服务架构的爬虫软件(服务包装器)。
A service oriented architecture GUI visual code-free web crawler/spider (service wrapper).
A visual code-free/no-code web crawler/spider.
## Download
# 发布版本
## Windows版本可执行程序<https://github.com/NaiboWang/ServiceWrapper/releases/download/v0.5.0/ServiceWrapper.7z>
打开压缩包内的ServiceWrapper.exe即可在Windows10/11或以上系统执行无需配置环境其余Windows系统需手动安装.net Framework 4.5)。
数据存储后放在Data/文件夹内
## 中文视频教程:<https://github.com/NaiboWang/ServiceWrapper/releases/download/v0.5.0/tutorial_CN.mp4>
Refer to the [Releases Page](https://github.com/NaiboWang/EasySpider/releases) on GitHub to download the latest version of the software.
## 版权声明 Copyright Declarationc
## Video Tutorials
该工具/软件已获得中华人民共和国国家知识产权局授权发明专利证书,因此知识产权受中国法律保护。本着科研和开源社区的开放精神,经浙江大学和相关人员授权,该工具代码公开且可供科研人员和其他相关人员免费使用。但**如需商业使用**,请通过邮件联系作者或浙江大学相关负责人。
Refer to the [YouTube playlist](https://youtube.com/playlist?list=PL0kEFEkWrT7mt9MUlEBV2DTo1QsaanUTp) for video tutorials on this software.
This tool/software has been granted a patent for invention by the State Intellectual Property Office of the People's Republic of China, and thus the intellectual property is protected by Chinese law. In the spirit of openness in research and open source community, the code of this tool is open and available for free use by researchers and other related persons with the authorization of Zhejiang University and related persons. However, if you need to use it **commercially**, please contact the author or the person in charge of Zhejiang University by email.
## Documentation
![pic](media/patent.png){:height="10%" width="10%"}
Documentation can be found in the [GitHub Wiki](https://github.com/NaiboWang/EasySpider/wiki).
专利主要发明人信息:
## Copyright Declaration
Main Inventor Information:
See the [Copyright Declaration](https://github.com/NaiboWang/EasySpider/releases) here.
王: 新加坡国立大学在读博士生,个人主页:<https://naibo.wang>
Wang: Ph.D. student at National University of Singapore, personal homepage: <https://naibo.wang>
尹: 浙江大学计算机学院教授,副院长,个人主页:<https://person.zju.edu.cn/0001038>
Yin: Professor and deputy dean of the College of Computer Science and Technology of Zhejiang University, personal page: <https://person.zju.edu.cn/0001038>
吴: 计算机科学家,中国科学院院士,其余信息详见:<https://baike.baidu.com/item/%E5%90%B4%E6%9C%9D%E6%99%96/41009>
Wu: Computer Scientist, Academician of the Chinese Academy of Sciences. For more information, see: <https://en.wikipedia.org/wiki/Wu_Zhaohui>
Any user (inside and outside China) can use this tool freely, but
## 免责声明
## 可能的错误
- System.Net.Sockets.SocketException 以一种访问权限不允许的方式做了一个访问套接字的尝试
重启电脑一般可以解决
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
# 目录
注:文档待完善
- [请您Star Please Star](#请您star-please-star)
- [无代码服务可视化Web数据采集爬虫器 Code-Free Visual Web Data Crawler/Spider (Service Wrapper)](#无代码服务可视化web数据采集爬虫器-code-free-visual-web-data-crawlerspider-service-wrapper)
- [发布版本](#发布版本)
- [Windows版本可执行程序https://github.com/NaiboWang/ServiceWrapper/releases/download/v0.5.0/ServiceWrapper.7z](#windows版本可执行程序httpsgithubcomnaibowangservicewrapperreleasesdownloadv050servicewrapper7z)
- [中文视频教程https://github.com/NaiboWang/ServiceWrapper/releases/download/v0.5.0/tutorial\_CN.mp4](#中文视频教程httpsgithubcomnaibowangservicewrapperreleasesdownloadv050tutorial_cnmp4)
- [版权声明 Copyright Declarationc](#版权声明-copyright-declarationc)
- [免责声明](#免责声明)
- [可能的错误](#可能的错误)
- [目录](#目录)
- [界面截图](#界面截图)
- [软件界面示例](#软件界面示例)
- [块和子块及表单定义](#块和子块及表单定义)
- [已选中和待选择示例](#已选中和待选择示例)
- [京东商品块选择示例:](#京东商品块选择示例)
- [京东商品标题自动匹配选择示例](#京东商品标题自动匹配选择示例)
- [分块选择所有子元素示例](#分块选择所有子元素示例)
- [同类型元素自动和手动匹配示例](#同类型元素自动和手动匹配示例)
- [四种选择方式示例](#四种选择方式示例)
- [输入文字示例](#输入文字示例)
- [循环点击58同城房屋标题以进入详情页采集示例](#循环点击58同城房屋标题以进入详情页采集示例)
- [采集元素文本示例](#采集元素文本示例)
- [流程图界面介绍](#流程图界面介绍)
- [循环选项示例](#循环选项示例)
- [循环点击下一页示例](#循环点击下一页示例)
- [条件分支示例](#条件分支示例)
- [完整采集流程图示例](#完整采集流程图示例)
- [完整采集流程图转换为常规流程图示例](#完整采集流程图转换为常规流程图示例)
- [服务信息示例](#服务信息示例)
- [服务调用示例](#服务调用示例)
- [58 同城房源信息采集服务部分采集结果展示](#58-同城房源信息采集服务部分采集结果展示)
- [服务包装手动版程序结构](#服务包装手动版程序结构)
- [Chrome插件部分](#chrome插件部分)
- [后台流程图部分](#后台流程图部分)
- [服务展示部分](#服务展示部分)
- [C#部分](#c部分)
- [后台服务页面](#后台服务页面)
- [服务执行](#服务执行)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
## 界面截图
#### 软件界面示例
![pic](media/Picture.png)
#### 块和子块及表单定义
![pic](media/Picture2.png)
#### 已选中和待选择示例
![pic](media/Picture7.png)
#### 京东商品块选择示例:
![pic](media/Picture1.png)
#### 京东商品标题自动匹配选择示例
![pic](media/Picture5.png)
#### 分块选择所有子元素示例
![pic](media/Picture6.png)
#### 同类型元素自动和手动匹配示例
![pic](media/Picture8.png)
#### 四种选择方式示例
![pic](media/Picture90.png)
#### 输入文字示例
![pic](media/Picture10.png)
#### 循环点击58同城房屋标题以进入详情页采集示例
![pic](media/Picture12.png)
#### 采集元素文本示例
![pic](media/Picture14.png)
#### 流程图界面介绍
![pic](media/Picture4.png)
#### 循环选项示例
![pic](media/Picture9.png)
#### 循环点击下一页示例
![pic](media/Picture11.png)
#### 条件分支示例
![pic](media/Picture13.png)
#### 完整采集流程图示例
![pic](media/Picture16.png)
#### 完整采集流程图转换为常规流程图示例
![pic](media/Picture91.png)
#### 服务信息示例
![pic](media/Picture15.png)
#### 服务调用示例
![pic](media/Picture17.png)
#### 58 同城房源信息采集服务部分采集结果展示
![pic](media/Picture18.png)
## 服务包装手动版程序结构
### Chrome插件部分
* Extension/app内的文件
### 后台流程图部分
* ServiceGrid/frontEnd/FlowChart.html
* ServiceGrid/frontEnd/FlowChart.js
* ServiceGrid/frontEnd/FlowChart.css
* ServiceGrid/frontEnd/logic.css
### 服务展示部分
* 服务列表ServiceGrid/frontEnd/serviceList.html
* 服务信息ServiceGrid/frontEnd/serviceInfo.html
* 新服务ServiceGrid/frontEnd/newService.html
* 调用服务ServiceGrid/frontEnd/invokeService.html
### C#部分
* C#/内的文件
### 后台服务页面
* Django后台ServiceGrid/backEnd/*
### 服务执行
* ExcuteStage/ServiceWrapper_ExcuteStage.py
## Ethics Discussion
Various fields can benefit from web crawlers due to their open-access nature.
Inevitably, there will be some risk of malicious use or data infringement issues,
e.g., automatic order swiping and ticket grabbing,
but this is contrary to our expectations.
As tool developers, we only hope that it can be used for legitimate purposes.
We advocate the reasonable and legal utilization of our system,
respecting and protecting data security and privacy.

160
media/readme_back.md Normal file
View File

@ -0,0 +1,160 @@
## Copyright Declaration 版权声明
This tool/software has been granted a patent for invention by the State Intellectual Property Office of the People's Republic of China, and thus the intellectual property is protected by Chinese law. In the spirit of openness in research and open source community, the code of this tool is open and available for free use by researchers and other related persons with the authorization of Zhejiang University and related persons. However, if you need to use it **commercially**, please contact the author or the person in charge of Zhejiang University by email.
该工具/软件已获得中华人民共和国国家知识产权局授权发明专利证书,因此知识产权受中国法律保护。本着科研和开源社区的开放精神,经浙江大学和相关人员授权,该工具代码公开且可供科研人员和其他相关人员免费使用。但**如需商业使用**,请通过邮件联系作者或浙江大学相关负责人。
![pic](patent.png)
Main Inventor Information:
专利主要发明人信息:
Wang: Ph.D. student at National University of Singapore, personal homepage: <https://naibo.wang>
王: 新加坡国立大学在读博士生,个人主页:<https://naibo.wang>
Yin: Professor and deputy dean of the College of Computer Science and Technology of Zhejiang University, personal page: <https://person.zju.edu.cn/0001038>
尹: 浙江大学计算机学院教授,副院长,个人主页:<https://person.zju.edu.cn/0001038>
Wu: Computer Scientist, Academician of the Chinese Academy of Sciences. For more information, see: <https://en.wikipedia.org/wiki/Wu_Zhaohui>
吴: 计算机科学家,中国科学院院士,其余信息详见:<https://baike.baidu.com/item/%E5%90%B4%E6%9C%9D%E6%99%96/41009>
## 可能的错误
- System.Net.Sockets.SocketException 以一种访问权限不允许的方式做了一个访问套接字的尝试
重启电脑一般可以解决
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
# 目录
注:文档待完善
- [目录](#目录)
- [界面截图](#界面截图)
- [软件界面示例](#软件界面示例)
- [块和子块及表单定义](#块和子块及表单定义)
- [已选中和待选择示例](#已选中和待选择示例)
- [京东商品块选择示例:](#京东商品块选择示例)
- [京东商品标题自动匹配选择示例](#京东商品标题自动匹配选择示例)
- [分块选择所有子元素示例](#分块选择所有子元素示例)
- [同类型元素自动和手动匹配示例](#同类型元素自动和手动匹配示例)
- [四种选择方式示例](#四种选择方式示例)
- [输入文字示例](#输入文字示例)
- [循环点击58同城房屋标题以进入详情页采集示例](#循环点击58同城房屋标题以进入详情页采集示例)
- [采集元素文本示例](#采集元素文本示例)
- [流程图界面介绍](#流程图界面介绍)
- [循环选项示例](#循环选项示例)
- [循环点击下一页示例](#循环点击下一页示例)
- [条件分支示例](#条件分支示例)
- [完整采集流程图示例](#完整采集流程图示例)
- [完整采集流程图转换为常规流程图示例](#完整采集流程图转换为常规流程图示例)
- [服务信息示例](#服务信息示例)
- [服务调用示例](#服务调用示例)
- [58 同城房源信息采集服务部分采集结果展示](#58-同城房源信息采集服务部分采集结果展示)
- [服务包装手动版程序结构](#服务包装手动版程序结构)
- [Chrome插件部分](#chrome插件部分)
- [后台流程图部分](#后台流程图部分)
- [服务展示部分](#服务展示部分)
- [C#部分](#c部分)
- [后台服务页面](#后台服务页面)
- [服务执行](#服务执行)
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
## 界面截图
#### 软件界面示例
![pic](Picture.png)
#### 块和子块及表单定义
![pic](Picture2.png)
#### 已选中和待选择示例
![pic](Picture7.png)
#### 京东商品块选择示例:
![pic](Picture1.png)
#### 京东商品标题自动匹配选择示例
![pic](Picture5.png)
#### 分块选择所有子元素示例
![pic](Picture6.png)
#### 同类型元素自动和手动匹配示例
![pic](Picture8.png)
#### 四种选择方式示例
![pic](Picture90.png)
#### 输入文字示例
![pic](Picture10.png)
#### 循环点击58同城房屋标题以进入详情页采集示例
![pic](Picture12.png)
#### 采集元素文本示例
![pic](Picture14.png)
#### 流程图界面介绍
![pic](Picture4.png)
#### 循环选项示例
![pic](Picture9.png)
#### 循环点击下一页示例
![pic](Picture11.png)
#### 条件分支示例
![pic](Picture13.png)
#### 完整采集流程图示例
![pic](Picture16.png)
#### 完整采集流程图转换为常规流程图示例
![pic](Picture91.png)
#### 服务信息示例
![pic](Picture15.png)
#### 服务调用示例
![pic](Picture17.png)
#### 58 同城房源信息采集服务部分采集结果展示
![pic](Picture18.png)
## 服务包装手动版程序结构
### Chrome插件部分
* Extension/app内的文件
### 后台流程图部分
* ServiceGrid/frontEnd/FlowChart.html
* ServiceGrid/frontEnd/FlowChart.js
* ServiceGrid/frontEnd/FlowChart.css
* ServiceGrid/frontEnd/logic.css
### 服务展示部分
* 服务列表ServiceGrid/frontEnd/serviceList.html
* 服务信息ServiceGrid/frontEnd/serviceInfo.html
* 新服务ServiceGrid/frontEnd/newService.html
* 调用服务ServiceGrid/frontEnd/invokeService.html
### C#部分
* C#/内的文件
### 后台服务页面
* Django后台ServiceGrid/backEnd/*
### 服务执行
* ExcuteStage/ServiceWrapper_ExcuteStage.py