New version

This commit is contained in:
NaiboWang-Alienware 2022-10-19 15:33:12 +08:00
parent 3646513d5b
commit f125db1f8e
33 changed files with 2398 additions and 18 deletions

View File

@ -0,0 +1,712 @@
# -*- coding: utf-8 -*-
import atexit # 遇到错误退出时应执行的代码
import json
from lib2to3.pgen2 import driver
import re
import sys
from urllib import parse
import base64
import hashlib
import time
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
import numpy
import csv
import os
from selenium.webdriver.common.by import By
# Module-level run state.  These are placeholders: the ``__main__`` section
# assigns the working values before any node is executed.
saveName = None  # base name of the output files ("task_<id>_<suffix>")
log = ""         # text accumulated by recordLog()
OUTPUT = ""      # placeholder; replaced by a list of CSV rows at start-up
browser = None   # Selenium WebDriver instance
SAVED = False    # becomes True once the results were written to disk

# NOTE(review): this is an alias of the shared DesiredCapabilities.CHROME
# dict, so the page-load strategy is changed on that shared object.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities.update({"pageLoadStrategy": "none"})

# Current value of every output field, keyed by field name.
outputParameters = dict()
class Time:
    """Millisecond stopwatch used to report how long one operation took."""

    def __init__(self, type1=""):
        # Creation time in whole milliseconds since the epoch.
        self.t = int(round(time.time() * 1000))
        # Label that prefixes the elapsed time in the log line.
        self.type = type1

    def end(self):
        """Pass "<label>:<elapsed milliseconds>" to Log()."""
        now_ms = int(round(time.time() * 1000))
        elapsed_ms = now_ms - self.t
        Log("{}:{}".format(self.type, elapsed_ms))
# Append one line to the in-memory run log
def recordLog(str=""):
    """Append ``str`` followed by a newline to the global ``log`` text.

    The parameter name shadows the builtin ``str``; it is kept because it is
    part of the function's signature.
    """
    global log
    log = "".join((log, str, "\n"))
# Console debug output
def Log(text, text2=""):
    """Print ``text`` and ``text2`` when the local debug switch is on.

    The switch is hard-coded to False, so the call currently prints nothing.
    """
    enabled = False
    if not enabled:
        return
    print(text, text2)
def _scrollPage(para):
    """Send the configured scroll key to <body> ``para["scrollCount"]`` times.

    Does nothing when ``para["scrollType"]`` is 0 or the count is not
    positive.  Scroll type 1 sends PAGE DOWN, any other non-zero type sends END.
    """
    if para["scrollType"] == 0 or para["scrollCount"] <= 0:
        return
    key = Keys.PGDN if para["scrollType"] == 1 else Keys.END
    for _ in range(para["scrollCount"]):
        time.sleep(1)  # one second pause before every key press
        Log("下拉完等待1秒")
        browser.find_element(By.CSS_SELECTOR, "body").send_keys(key)


# Page scrolling
def scrollDown(para, rt=""):
    """Scroll the current page according to ``para``.

    :param para: node parameters; reads ``scrollType`` and ``scrollCount``.
    :param rt: optional ``Time`` stopwatch; ``""`` means "no stopwatch".

    On a ``TimeoutException`` the page load is stopped with ``window.stop()``
    and the whole scroll sequence is attempted once more; a timeout during
    that second attempt propagates to the caller.
    """
    try:
        _scrollPage(para)
    except TimeoutException:
        Log('time out after 10 seconds when scrolling. ')
        recordLog('time out after 10 seconds when scrolling')
        browser.execute_script('window.stop()')
        _scrollPage(para)
        if rt != "":
            rt.end()
# Core dispatcher: execute one node of the task graph
def excuteNode(nodeId, loopValue="", clickPath="", index=0):
    """Run the node ``procedure[nodeId]`` and then pause.

    :param nodeId: key of the node inside the global ``procedure``.
    :param loopValue: value handed down by the enclosing loop (element, text
        or URL, depending on the loop type).
    :param clickPath: XPath of the enclosing loop, forwarded to the handlers.
    :param index: position of the current element inside the enclosing loop.
    """
    node = procedure[nodeId]
    # NOTE(review): ``until`` is only referenced, never called, and the
    # parenthesised expression below is a separate statement, so no wait
    # happens here.  The statements are kept unchanged to preserve behaviour
    # (they still require node["parameters"]["xpath"] to exist).
    WebDriverWait(browser, 10).until
    (EC.visibility_of_element_located((By.XPATH, node["parameters"]["xpath"])))

    option = node["option"]
    params = node["parameters"]
    if option in (0, 10):  # root node / branch body: run the children in order
        for childId in node["sequence"]:
            excuteNode(childId, loopValue, clickPath, index)
    elif option == 1:  # open a page
        recordLog("openPage")
        openPage(params, loopValue)
    elif option == 2:  # click an element
        recordLog("Click")
        clickElement(params, loopValue, clickPath, index)
    elif option == 3:  # extract data
        recordLog("getData")
        getData(params, loopValue, node["isInLoop"],
                parentPath=clickPath, index=index)
    elif option == 4:  # type text
        inputInfo(params, loopValue)
    elif option == 8:  # loop
        recordLog("loop")
        loopExcute(node, loopValue, clickPath, index)
    elif option == 9:  # conditional
        recordLog("judge")
        judgeExcute(node, loopValue, clickPath, index)

    # Every node except the root pauses after it ran: the configured "wait"
    # when it is greater than 1, otherwise 0.01 seconds.
    if option == 0:
        return
    waitTime = params["wait"] if params["wait"] > 1 else 0.01
    time.sleep(waitTime)
    Log("Node执行完后等待", waitTime)
def _branchConditionHolds(cnode, loopElement):
    """Return True when the condition of branch node ``cnode`` is satisfied.

    Condition classes (``cnode["parameters"]["class"]``):
    0 unconditional, 1 page text contains value, 2 page contains the XPath,
    3 loop element text contains value, 4 loop element contains the XPath
    (the leading character of the XPath is dropped for the relative lookup).
    Any error while evaluating a condition counts as "not satisfied".
    """
    tType = int(cnode["parameters"]["class"])
    if tType == 0:
        return True
    try:
        value = cnode["parameters"]["value"]
        if tType == 1:
            return bodyText.find(value) >= 0
        if tType == 2:
            return bool(browser.find_element(By.XPATH, value))
        if tType == 3:
            return loopElement.text.find(value) >= 0
        if tType == 4:
            return bool(loopElement.find_element(By.XPATH, value[1:]))
    except Exception:
        # Element missing, malformed XPath, no loop element, ...: try the
        # next branch instead of aborting the run.
        return False
    return False


# Conditional node: run the first branch whose condition holds
def judgeExcute(node, loopElement, clickPath="", index=0):
    """Evaluate the branches in ``node["sequence"]`` in order and execute the
    first one that matches.

    When no branch matches nothing is executed.  (Previously the branch id
    defaulted to 0, which re-ran the root node and recursed without bound.)
    """
    rt = Time("条件判断")
    excuteBranchId = None
    for i in node["sequence"]:
        if _branchConditionHolds(procedure[i], loopElement):
            excuteBranchId = i
            break
    rt.end()
    if excuteBranchId is not None:
        excuteNode(excuteBranchId, loopElement, clickPath, index)
def _restoreLoopPage(node, thisHandle, thisHistoryLength):
    """Bring the browser back to the tab/page the loop started on.

    Closes tabs until the current handle is ``thisHandle`` again, then, if the
    recorded history position changed in that tab, navigates back by the
    difference and waits ``historyWait`` seconds (at least 2).
    """
    if browser.current_window_handle != thisHandle:
        while True:  # keep closing tabs until the loop's own tab is current
            browser.close()
            browser.switch_to.window(browser.window_handles[-1])
            if browser.current_window_handle == thisHandle:
                break
    if (history["index"] != thisHistoryLength
            and history["handle"] == browser.current_window_handle):
        difference = thisHistoryLength - history["index"]
        browser.execute_script('history.go(' + str(difference) + ')')
        if node["parameters"]["historyWait"] > 2:
            time.sleep(node["parameters"]["historyWait"])
        else:
            time.sleep(2)
        Log("切换历史记录等待2秒或者", node["parameters"]["historyWait"])
        browser.execute_script('window.stop()')


# Loop node
def loopExcute(node, loopValue, clickPath="", index=0):
    """Execute the children of loop ``node`` once per loop item.

    ``node["parameters"]["loopType"]`` selects the kind of loop:
    0 repeat on a single element until it disappears or ``exitCount``
    iterations ran, 1 every element matching ``xpath``, 2 every XPath in
    ``pathList``, 3 every line of ``textList`` as text, 4 every non-empty
    line of ``textList`` as URL.
    """
    time.sleep(0.1)
    Log("循环执行前等待0.1秒")
    thisHandle = browser.current_window_handle  # tab the loop runs in
    thisHistoryLength = browser.execute_script('return history.length')
    history["index"] = thisHistoryLength
    history["handle"] = thisHandle
    loopType = int(node["parameters"]["loopType"])

    if loopType == 0:  # single element, e.g. a "next page" button
        xpath = node["parameters"]["xpath"]
        count = 0  # completed iterations
        while True:
            try:
                element = browser.find_element(By.XPATH, xpath)
                for i in node["sequence"]:
                    excuteNode(i, element, xpath, 0)
                Log("click: ", xpath)
                recordLog("click:" + xpath)
            except Exception:
                # ``Exception`` instead of a bare except so SystemExit and
                # KeyboardInterrupt are no longer swallowed here.
                Log("clickNotFound: ", xpath)
                recordLog("clickNotFound:" + xpath)
                # Run the children one last time without a loop element.
                # NOTE(review): this tests the loop node's own option, not
                # the child's, so it is always true for a loop node; the
                # intent was presumably to skip click children - confirm.
                for i in node["sequence"]:
                    if node["option"] != 2:
                        excuteNode(i, None, xpath, 0)
                break
            count = count + 1
            Log("页数:", count)
            recordLog("页数:" + str(count))
            if node["parameters"]["exitCount"] == count:
                break
    elif loopType == 1:  # all elements matching one XPath
        xpath = node["parameters"]["xpath"]
        try:
            elements = browser.find_elements(By.XPATH, xpath)
            for index, element in enumerate(elements):
                for i in node["sequence"]:
                    excuteNode(i, element, xpath, index)
                _restoreLoopPage(node, thisHandle, thisHistoryLength)
        except NoSuchElementException:
            # A missing element ends this loop; other errors propagate.
            Log("pathNotFound: ", xpath)
            recordLog("pathNotFound: " + xpath)
    elif loopType == 2:  # fixed list of XPaths, one per line
        for path in node["parameters"]["pathList"].split("\n"):
            try:
                element = browser.find_element(By.XPATH, path)
                for i in node["sequence"]:
                    excuteNode(i, element, path, 0)
                _restoreLoopPage(node, thisHandle, thisHistoryLength)
            except NoSuchElementException:
                Log("pathNotFound: ", path)
                recordLog("pathNotFound: " + path)
                continue  # skip this path and go on with the next one
    elif loopType == 3:  # fixed list of texts, one per line
        for text in node["parameters"]["textList"].split("\n"):
            recordLog("input: " + text)
            for i in node["sequence"]:
                excuteNode(i, text, "", 0)
    elif loopType == 4:  # fixed list of URLs, blank lines removed
        urlList = list(filter(isnull, node["parameters"]["textList"].split("\n")))
        for url in urlList:
            recordLog("input: " + url)
            for i in node["sequence"]:
                excuteNode(i, url, "", 0)

    history["index"] = thisHistoryLength
    history["handle"] = browser.current_window_handle
    scrollDown(node["parameters"])
# Open-page action
def openPage(para, loopValue):
    """Load a URL in the first browser tab and reset the output fields.

    :param para: node parameters; ``useLoop`` selects the URL source and the
        scroll settings are forwarded to scrollDown().
    :param loopValue: URL supplied by the enclosing loop (used when
        ``para["useLoop"]`` is true); otherwise ``links[urlId]`` is opened.
    """
    global bodyText
    timer = Time("打开网页")
    time.sleep(2)  # fixed two second pause before the page is opened

    # Close the most recently opened extra tab, if any, and continue in the
    # first tab.
    if len(browser.window_handles) > 1:
        browser.switch_to.window(browser.window_handles[-1])
        browser.close()
    browser.switch_to.window(browser.window_handles[0])
    history["handle"] = browser.current_window_handle

    url = loopValue if para["useLoop"] else links[urlId]
    try:
        browser.get(url)
        Log('Loading page: ' + url)
        recordLog('Loading page: ' + url)
    except TimeoutException:
        Log('time out after 10 seconds when loading page: ' + url)
        recordLog('time out after 10 seconds when loading page: ' + url)
        browser.execute_script('window.stop()')
        timer.end()

    history_script = "return history.length"
    try:
        history["index"] = browser.execute_script(history_script)
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script(history_script)
        timer.end()

    scrollDown(para, timer)

    # The cached page text is only needed when the task contains conditions.
    if containJudge:
        try:
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
            Log('URL Page: ' + url)
            recordLog('URL Page: ' + url)
        except TimeoutException:
            Log('time out after 10 seconds when getting body text: ' + url)
            recordLog('time out after 10 seconds when getting body text:: ' + url)
            browser.execute_script('window.stop()')
            time.sleep(1)
            Log("获得bodytext等待1秒")
            # second attempt after stopping the page load
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
            timer.end()
        except Exception as err:
            Log(err)
            recordLog(str(err))

    # A new page starts with empty output fields.
    outputParameters.update(dict.fromkeys(outputParameters, ""))
    timer.end()
# Keyboard-input action
def inputInfo(para, loopValue):
    """Replace the content of the text box at ``para["xpath"]``.

    The text typed is ``loopValue`` when ``para["useLoop"]`` is true,
    otherwise ``para["value"]``.  The cached page text (``bodyText``) is
    refreshed afterwards.

    :raises SystemExit: when the text box cannot be located.
    """
    global bodyText
    time.sleep(1)  # one second pause before typing
    Log("输入前等待1秒")
    rt = Time("输入文字")
    try:
        textbox = browser.find_element(By.XPATH, para["xpath"])
    except Exception:
        Log("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        recordLog("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        # sys.exit() rather than the interactive helper exit(), which is
        # injected by the ``site`` module and is not always available.
        sys.exit()
    # Select everything and delete it before typing the new value.
    textbox.send_keys(Keys.CONTROL, 'a')
    textbox.send_keys(Keys.BACKSPACE)
    textbox.send_keys(loopValue if para["useLoop"] else para["value"])
    bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
    rt.end()
# Click-element action
def clickElement(para, loopElement=None, clickPath="", index=0):
    """Click the ``index``-th node matching an XPath through JavaScript.

    :param para: node parameters (``useLoop``, ``xpath``, scroll settings).
    :param loopElement: unused; kept for signature compatibility.
    :param clickPath: XPath of the enclosing loop, used when
        ``para["useLoop"]`` is true.
    :param index: zero-based position of the match to click.

    After the click the function switches to a newly opened tab (if any),
    records the history position, scrolls, and refreshes ``bodyText`` when
    the task contains conditions.  Errors raised by the click itself are
    logged and not propagated.
    """
    global bodyText
    time.sleep(0.1)
    rt = Time("点击元素")
    Log("点击之前等待1秒")
    path = clickPath if para["useLoop"] else para["xpath"]
    tempHandleNum = len(browser.window_handles)  # tab count before the click
    # The XPath and the index are passed as script arguments instead of being
    # spliced into the source, so quotes, backticks, "${" or backslashes in
    # the XPath cannot break (or inject into) the script.
    script = ('var result = document.evaluate(arguments[1], document, null, '
              'XPathResult.ANY_TYPE, null);'
              'for(let i=0;i<arguments[0];i++){result.iterateNext();} '
              'result.iterateNext().click();')
    try:
        browser.execute_script(script, index, path)
    except TimeoutException:
        Log('time out after 10 seconds when loading clicked page')
        recordLog('time out after 10 seconds when loading clicked page')
        browser.execute_script('window.stop()')
        rt.end()
    except Exception as e:
        Log(e)
        recordLog(str(e))
    time.sleep(0.5)
    Log("点击之后等待0.5秒")

    if tempHandleNum != len(browser.window_handles):
        # The click opened a new tab: continue there.
        browser.switch_to.window(browser.window_handles[-1])
        history["handle"] = browser.current_window_handle
    try:
        history["index"] = browser.execute_script("return history.length")
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script("return history.length")
        rt.end()

    scrollDown(para, rt)
    if containJudge:
        try:
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
        except TimeoutException:
            Log('time out after 10 seconds when getting body text')
            recordLog('time out after 10 seconds when getting body text')
            browser.execute_script('window.stop()')
            time.sleep(1)
            Log("bodytext等待1秒")
            # second attempt after stopping the page load
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
            rt.end()
        except Exception as e:
            Log(e)
            recordLog(str(e))
    rt.end()
# JavaScript returning the text nodes that are direct children of
# arguments[0], joined with single spaces (text of child elements excluded).
_OWN_TEXT_SCRIPT = (
    'var arr = [];'
    'var content = arguments[0];'
    'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
    'if(content.childNodes[i].nodeType === 3){ '
    'arr.push(content.childNodes[i].nodeValue);'
    '}'
    '}'
    'var str = arr.join(" "); '
    'return str;'
)

# nodeType -> attribute whose value replaces the text (link, input, image)
_NODE_TYPE_ATTRIBUTE = {2: "href", 3: "value", 4: "src"}


def _findDataElement(p, loopElement, parentPath, index):
    """Locate the element described by field definition ``p``.

    Absolute fields are searched in the whole page.  Relative fields use the
    loop element itself (empty XPath), a page-wide
    ``(parentPath + relativeXpath)[index + 1]`` lookup when the XPath
    contains ``//``, or a lookup below the loop element with the leading
    character of the XPath dropped.

    :raises NoSuchElementException: also when a loop element is required but
        ``loopElement`` is None.
    """
    xpath = p["relativeXpath"]
    if not p["relative"]:
        return browser.find_element(By.XPATH, xpath)
    if xpath.find("//") >= 0:
        full_path = "(" + parentPath + xpath + ")" + "[" + str(index + 1) + "]"
        return browser.find_element(By.XPATH, full_path)
    if loopElement is None:
        raise NoSuchElementException("no loop element for " + xpath)
    if xpath == "":
        return loopElement
    return loopElement.find_element(By.XPATH, xpath[1:])


def _extractContent(p, element):
    """Read the value of field ``p`` from ``element``.

    contentType 2/3 return inner/outer HTML.  contentType 1 (own text only)
    and 0 (full text) return text unless ``nodeType`` selects an attribute,
    in which case the attribute value, or "" when absent, is returned.
    Unknown content types yield "".
    """
    contentType = p["contentType"]
    if contentType == 2:
        return element.get_attribute('innerHTML')
    if contentType == 3:
        return element.get_attribute('outerHTML')
    if contentType == 1:
        raw = browser.execute_script(_OWN_TEXT_SCRIPT, element)
        # Collapse whitespace runs; the previous str.replace("\\s+", " ")
        # treated the pattern as literal text and never matched.
        content = re.sub(r"\s+", " ", raw.replace("\n", ""))
    elif contentType == 0:
        content = element.text
    else:
        return ""
    attribute = _NODE_TYPE_ATTRIBUTE.get(p["nodeType"])
    if attribute is not None:
        value = element.get_attribute(attribute)
        content = value if value is not None else ""
    return content


# Extract-data action
def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
    """Extract every field in ``para["paras"]`` and append one output row.

    :param para: node parameters (``wait`` and the field list ``paras``).
    :param loopElement: element of the enclosing loop, or None.
    :param isInLoop: False when the node is not inside a loop; then a one
        second pause is inserted if ``para["wait"]`` is 0.
    :param parentPath: XPath of the enclosing loop.
    :param index: position of ``loopElement`` inside the loop.

    A field whose element is not found gets ``p["default"]`` (or "").  A
    stale element is looked up and read once more after three seconds; if it
    is stale again the field keeps its previous value.  The current values
    of ``outputParameters`` are printed (first 15 characters each) and
    appended to ``OUTPUT``.
    """
    if not isInLoop and para["wait"] == 0:
        time.sleep(1)
        Log("提取数据等待1秒")
    rt = Time("提取数据")
    for p in para["paras"]:
        content = ""
        try:
            try:
                element = _findDataElement(p, loopElement, parentPath, index)
            except TimeoutException:
                Log('time out after 10 seconds when getting data')
                recordLog('time out after 10 seconds when getting data')
                browser.execute_script('window.stop()')
                element = _findDataElement(p, loopElement, parentPath, index)
                rt.end()
        except NoSuchElementException:
            outputParameters[p["name"]] = p.get("default", "")
            Log('Element %s not found,use default' % p["relativeXpath"])
            recordLog('Element %s not found, use default' % p["relativeXpath"])
            continue
        try:
            content = _extractContent(p, element)
        except StaleElementReferenceException:
            recordLog('StaleElementReferenceException'+p["relativeXpath"])
            time.sleep(3)  # give the page time to settle before retrying
            try:
                element = _findDataElement(p, loopElement, parentPath, index)
                content = _extractContent(p, element)
            except StaleElementReferenceException:
                recordLog('StaleElementReferenceException'+p["relativeXpath"])
                continue  # give up on this field
        outputParameters[p["name"]] = content
    line = list(outputParameters.values())
    for value in line:
        print(value[:15], " ", end="")
    print("")
    OUTPUT.append(line)
    rt.end()
# Filter predicate for dropping empty entries
def isnull(s):
    """Return True when ``s`` has at least one item.

    Despite its name the function is true for NON-empty values, which makes
    ``filter(isnull, lines)`` remove the empty ones.
    """
    length = len(s)
    return length > 0
@atexit.register
def clean():
    """Exit hook: save log and data if the run ended before saving them.

    Does nothing when ``SAVED`` is already true.  Otherwise the log and the
    collected rows are written to ``Data/<saveName>_log.txt`` and
    ``Data/<saveName>.csv``, the browser is closed and ``sys.exit`` is called
    with the CSV file name.  The hook also runs when start-up failed before
    ``saveName`` or ``browser`` were assigned; those steps are then skipped
    instead of raising a second error from inside the exit hook.
    """
    if SAVED:
        return
    print('清理环境保存数据')
    if saveName is not None:
        os.makedirs("Data", exist_ok=True)
        with open("Data/"+saveName + '_log.txt', 'w', encoding='utf-8-sig') as file_obj:
            file_obj.write(log)
        with open("Data/"+saveName + '.csv', 'w', encoding='utf-8-sig', newline="") as f:
            csv.writer(f).writerows(OUTPUT)
    if browser is not None:
        browser.quit()
    if saveName is not None:
        sys.exit(saveName + '.csv')
if __name__ == '__main__':
    # Usage: <script> [taskId] [saveNameSuffix] [backEndAddress]
    options = Options()
    # Pick the bundled Chrome + chromedriver depending on where we run from.
    cwd = os.getcwd()
    if os.path.exists(cwd + "/ServiceWrapper"):
        chromeDir = "ServiceWrapper/Chrome"
    elif os.path.exists(cwd + "/Debug"):
        chromeDir = "Debug/Chrome"
    elif cwd.find("ExecuteStage") >= 0:  # started directly from the source dir
        chromeDir = "./Chrome"
    else:
        chromeDir = None
    if chromeDir is None:
        options.binary_location = "chrome.exe"
        exe_path = "chromedriver.exe"
    else:
        # Report the directory actually used (the old messages always named
        # "ServiceWrapper"/"Debug", whichever branch was taken).
        print("Finding chromedriver in", chromeDir)
        options.binary_location = chromeDir + "/chrome.exe"
        exe_path = chromeDir + "/chromedriver.exe"
    browser = webdriver.Chrome(options=options, executable_path=exe_path)
    browser.get('about:blank')
    browser.set_page_load_timeout(10)  # page loads give up after 10 seconds
    browser.set_script_timeout(10)

    id = int(sys.argv[1]) if len(sys.argv) > 1 else 7  # task id, default 7
    print("id", id)
    if len(sys.argv) > 2:
        saveName = "task_" + str(id) + "_" + sys.argv[2]
    else:
        saveName = "task_" + str(id) + "_" + \
            str(random.randint(0, 999999999))
    print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
    if len(sys.argv) > 3:
        backEndAddress = sys.argv[3]
    else:
        backEndAddress = "http://servicewrapper.naibo.wang"

    # Fetch the task description.  The timeout keeps an unreachable back end
    # from hanging the run, and the status check turns an HTTP error into a
    # clear failure instead of a JSON decoding error.
    content = requests.get(
        backEndAddress + "/backEnd/queryTask?id=" + str(id), timeout=60)
    content.raise_for_status()
    service = json.loads(content.text)
    print("name", service["name"])
    procedure = service["graph"]  # task graph, indexed by node id
    links = list(filter(isnull, service["links"].split("\n")))  # start URLs
    OUTPUT = [[]]  # collected rows; the first row is the header
    containJudge = service["containJudge"]  # task contains conditional nodes
    bodyText = ""  # cached text of the current page
    tOut = service["outputParameters"]  # output field definitions
    outputParameters = {}
    log = ""  # run log text
    history = {"index": 0, "handle": None}  # history position and tab handle
    SAVED = False  # results not written yet
    for para in tOut:
        outputParameters[para["name"]] = ""
        OUTPUT[0].append(para["name"])

    # Run the whole graph once per start URL; openPage() reads urlId.
    urlId = 0
    for _ in links:
        excuteNode(0)
        urlId = urlId + 1
    print("执行完成!")
    recordLog("Done!")

    os.makedirs("Data", exist_ok=True)
    with open("Data/"+saveName + '_log.txt', 'w', encoding='utf-8-sig') as file_obj:
        file_obj.write(log)
    with open("Data/"+saveName + '.csv', 'w', encoding='utf-8-sig', newline="") as f:
        csv.writer(f).writerows(OUTPUT)
    SAVED = True  # tells the atexit hook that nothing is left to save
    browser.quit()

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import atexit # 遇到错误退出时应执行的代码
import json
from lib2to3.pgen2 import driver
import re
import sys
from urllib import parse
@ -64,7 +65,7 @@ def scrollDown(para, rt=""):
for i in range(para["scrollCount"]):
time.sleep(1) # 下拉完等1秒
Log("下拉完等待1秒")
body = browser.find_element_by_css_selector("body")
body = browser.find_element(By.CSS_SELECTOR, "body")
if para["scrollType"] == 1:
body.send_keys(Keys.PGDN)
else:
@ -77,7 +78,7 @@ def scrollDown(para, rt=""):
for i in range(para["scrollCount"]):
time.sleep(1) # 下拉完等1秒
Log("下拉完等待1秒")
body = browser.find_element_by_css_selector("body")
body = browser.find_element(By.CSS_SELECTOR, "body")
if para["scrollType"] == 1:
body.send_keys(Keys.PGDN)
else:
@ -283,6 +284,18 @@ def openPage(para, loopValue):
global links
global urlId
global history
# try:
# firstTime = True
# for handle in browser.window_handles:
# browser.switch_to.window(handle)
# if (not firstTime):
# browser.close()
# firstTime = False
# except:
# return
if len(browser.window_handles) > 1:
browser.switch_to.window(browser.window_handles[-1]) # 打开网页操作从第1个页面开始
browser.close()
browser.switch_to.window(browser.window_handles[0]) # 打开网页操作从第1个页面开始
history["handle"] = browser.current_window_handle
if para["useLoop"]:
@ -306,7 +319,7 @@ def openPage(para, loopValue):
if containJudge:
global bodyText # 每次执行点击输入元素和打开网页操作后需要更新bodyText
try:
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
except TimeoutException:
Log('time out after 10 seconds when getting body text: ' + url)
recordLog('time out after 10 seconds when getting body text:: ' + url)
@ -314,7 +327,7 @@ def openPage(para, loopValue):
time.sleep(1)
Log("获得bodytext等待1秒")
# 再执行一遍
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()
except Exception as e:
Log(e)
@ -340,7 +353,7 @@ def inputInfo(para, loopValue):
else:
textbox.send_keys(para["value"])
global bodyText # 每次执行点击输入元素和打开网页操作后需要更新bodyText
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()
@ -391,7 +404,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
if containJudge: # 有判断语句才执行以下操作
global bodyText # 每次执行点击输入元素和打开网页操作后需要更新bodyText
try:
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
except TimeoutException:
Log('time out after 10 seconds when getting body text')
recordLog('time out after 10 seconds when getting body text')
@ -399,7 +412,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
time.sleep(1)
Log("bodytext等待1秒")
# 再执行一遍
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()
except Exception as e:
Log(e)
@ -594,18 +607,25 @@ def clean():
f_csv.writerow(line)
f.close()
browser.quit()
sys.exit(saveName + '.csv')
if __name__ == '__main__':
options = Options()
exe_path = "chromedriver.exe"
if os.path.exists(os.getcwd()+"/ServiceWrapper"):
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/ServiceWrapper")
options.binary_location = "ServiceWrapper/Chrome/chrome.exe" # 指定chrome位置
exe_path = "ServiceWrapper/Chrome/chromedriver.exe"
elif os.path.exists(os.getcwd()+"/Debug"):
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/Debug")
options.binary_location = "Debug/Chrome/chrome.exe" # 指定chrome位置
exe_path = "Debug/Chrome/chromedriver.exe"
elif os.getcwd().find("ExcuteStage") >= 0: # 如果直接执行
elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/Debug")
options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
exe_path = "./Chrome/chromedriver.exe"
else:
@ -620,17 +640,23 @@ if __name__ == '__main__':
id = 7 # 设置默认值
print("id", id)
if len(sys.argv) > 2:
backEndAddress = sys.argv[2]
else:
backEndAddress = "http://servicewrapper.naibo.wang"
if len(sys.argv) > 3:
saveName = "task_" + str(id) + "_" + sys.argv[3] # 保存文件的名字
saveName = "task_" + str(id) + "_" + sys.argv[2] # 保存文件的名字
else:
saveName = "task_" + str(id) + "_" + \
str(random.randint(0, 999999999)) # 保存文件的名字
content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
service = json.loads(content.text) # 加载服务信息
print("name", service["name"])
print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
if len(sys.argv) > 3:
backEndAddress = sys.argv[3]
else:
backEndAddress = "http://servicewrapper.naibo.wang"
# content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
with open("tasks/" + str(id) + ".json", 'r') as f:
content = f.read()
service = json.loads(content)
# print(service)
# service = json.loads() # 加载服务信息
print("name:", service["name"])
procedure = service["graph"] # 程序执行流程
links = list(filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
OUTPUT = [] # 采集的数据
@ -663,4 +689,3 @@ if __name__ == '__main__':
f.close()
SAVED = True
browser.quit()
sys.exit(0)

BIN
ExecuteStage/all_data.xls Normal file

Binary file not shown.

View File

@ -0,0 +1,151 @@
# _*_coding:utf-8_*_
from hashlib import new
import json
import os
import sys
import time
from multiprocessing import Process
import time
from datetime import datetime, timedelta
import os
import pickle
import calendar
import re
from copy import deepcopy
import requests
import csv
from commandline_config import Config
from service_invoke import invokeService
class TimeUtil(object):
    """Helpers for shifting datetimes by a whole-hour offset such as "+8"."""

    @classmethod
    def parse_timezone(cls, timezone):
        """Parse an offset string that starts with a sign and digits.

        :param timezone: str, e.g. "+8" or "-5"
        :return: dict with ``symbol`` ("+" or "-") and ``offset`` (int hours)
        :raises ValueError: when ``timezone`` does not start with a sign
            followed by at least one digit
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        if result is None:
            # Previously this surfaced as AttributeError on ``None``.
            raise ValueError('dont parse timezone format: %r' % (timezone,))
        return {
            'symbol': result.group('symbol'),
            'offset': int(result.group('offset'))
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Return ``dt`` shifted by the hour offset given in ``timezone``.

        ``dt`` is presumably a UTC datetime; the method only adds or
        subtracts the offset and attaches no tzinfo.

        :raises ValueError: when ``timezone`` cannot be parsed
        """
        result = cls.parse_timezone(timezone)
        hours = timedelta(hours=result['offset'])
        if result['symbol'] == '+':
            return dt + hours
        return dt - hours
def generate_timestamp():
    """Print and return the current time shifted to UTC+8 as a string.

    The result is ``str()`` of a naive datetime, i.e. it contains a space
    between date and time and colons in the time part.
    """
    current_time = datetime.utcnow()
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)
def main():
    """Collect author names for the configured keywords.

    Steps: read the keywords from ``./关键词.csv``, build one search URL per
    keyword, register a task through ``invokeService``, run
    ``ServiceWrapper_ExecuteStage_local.py`` on it, read the first column of
    the produced ``./Data/task_<id>_<timestamp>.csv`` and append the authors
    not yet listed in ``./author_list.csv`` to that file.
    """
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    # First column of every non-blank row is a keyword; the first
    # ``test_pages`` rows are echoed.
    keywords = []
    with open("./关键词.csv", encoding='utf-8') as keywordFile:
        for rowNumber, line in enumerate(csv.reader(keywordFile)):
            if rowNumber < c.test_pages:
                print(line)
            if line:  # a blank row used to raise IndexError
                keywords.append(line[0])

    urls = []
    for count, keyword in enumerate(keywords, start=1):
        urls.append(
            "https://so.toutiao.com/search?dvpf=pc&source=pagination&filter_vendor=site&keyword=%s&pd=synthesis&filter_vendor=site&action_type=pagination&page_num=0\r\n" % keyword)
        if c.test and count > c.test_pages:
            break
    urlList = "".join(urls)
    print(urlList)

    authorTaskID = invokeService(
        0, {"loopTimes_Loop_Click_1": c.pages, "urlList_0": urlList})
    print("authorTaskID: " + str(authorTaskID))

    # The timestamp becomes part of the output file name, so blanks and
    # colons are removed/replaced.
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)
    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(authorTaskID) + ' ' + filename
    os.system(command)

    file_name = "task_" + str(authorTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)
    with open("./Data/"+file_name, encoding='utf-8') as dataFile:
        rows = csv.reader(dataFile)
        next(rows, None)  # skip the header row
        new_author_list = [line[0] for line in rows if line]
    # De-duplicate while keeping first-seen order, so the appended rows are
    # deterministic.
    new_author_list = list(dict.fromkeys(new_author_list))

    with open("./author_list.csv", encoding='utf-8') as authorFile:
        author_list = [line[0] for line in csv.reader(authorFile) if line]
    author_list = list(dict.fromkeys(author_list))
    print("author_list:", author_list)
    print("new_author_list:", new_author_list)

    known_authors = set(author_list)
    real_new_author_list = [
        author for author in new_author_list if author not in known_authors]
    print("real_new_author_list:", real_new_author_list)
    with open("author_list.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in real_new_author_list:
            writer.writerow([row])
# Run the crawl only when the file is executed as a script.
if __name__ == "__main__":
    main()

202
ExecuteStage/desc_crawl.py Normal file
View File

@ -0,0 +1,202 @@
# _*_coding:utf-8_*_
from hashlib import new
import json
import os
import sys
import time
from multiprocessing import Process
import time
from datetime import datetime, timedelta
import os
import pickle
import calendar
import re
from copy import deepcopy
import requests
import csv
from commandline_config import Config
from service_invoke import invokeService
class TimeUtil(object):
    """Helpers for shifting datetimes by a whole-hour offset such as "+8"."""

    @classmethod
    def parse_timezone(cls, timezone):
        """Parse an offset string that starts with a sign and digits.

        :param timezone: str, e.g. "+8" or "-5"
        :return: dict with ``symbol`` ("+" or "-") and ``offset`` (int hours)
        :raises ValueError: when ``timezone`` does not start with a sign
            followed by at least one digit
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        if result is None:
            # Previously this surfaced as AttributeError on ``None``.
            raise ValueError('dont parse timezone format: %r' % (timezone,))
        return {
            'symbol': result.group('symbol'),
            'offset': int(result.group('offset'))
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Return ``dt`` shifted by the hour offset given in ``timezone``.

        ``dt`` is presumably a UTC datetime; the method only adds or
        subtracts the offset and attaches no tzinfo.

        :raises ValueError: when ``timezone`` cannot be parsed
        """
        result = cls.parse_timezone(timezone)
        hours = timedelta(hours=result['offset'])
        if result['symbol'] == '+':
            return dt + hours
        return dt - hours
def generate_timestamp():
    """Print and return the current time shifted to UTC+8 as a string.

    The result is ``str()`` of a naive datetime, i.e. it contains a space
    between date and time and colons in the time part.
    """
    current_time = datetime.utcnow()
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)
def _read_csv_rows(path):
    """Return all rows of the UTF-8 CSV file at ``path``, closing the file."""
    with open(path, encoding='utf-8') as csvfile:
        return list(csv.reader(csvfile))


def _extract_profiles(rows):
    """
    Convert raw crawl rows into 6-column records for raw_data.csv.

    A row is dropped when its column 2 equals column 2 of the row directly
    before it (consecutive duplicate) or when column 2 is empty. Column 1 is
    split on the markers "获赞", "粉丝" and "关注" into three separate values.
    Presumably the columns are description, stats text, author name and
    profile link -- verify against the service definition that is invoked.

    :param rows: list of CSV rows (lists of str), header already removed
    :return: list of [row[0], likes, fans, following, row[2], row[3]]
    """
    profiles = []
    previous = None
    for row in rows:
        prior, previous = previous, row
        try:
            if prior is not None and row[2] == prior[2]:
                continue
            if row[2] != "":
                stats = row[1]
                zan = stats.split("获赞")[0]
                fans = stats.split("粉丝")[0].split("获赞")[1]
                follow = stats.split("关注")[0].split("粉丝")[1]
                profiles.append(
                    [row[0], zan, fans, follow, row[2], row[3]])
        except IndexError:
            # Short row or stats text without the expected markers: skip it,
            # but say so instead of swallowing every possible error.
            print("skip malformed row:", row)
    return profiles


def main():
    """
    Crawl profile data for authors that are not yet in raw_data.csv.

    Steps, all relative to the current working directory:

    1. Read the names already stored in column 4 of ./raw_data.csv.
    2. Read ./author_list.csv and keep the names (column 0) not stored yet.
    3. Build one search URL per name, create a task via
       ``invokeService(1, ...)`` and run
       ``ServiceWrapper_ExecuteStage_local.py`` on it with ``os.system``.
    4. Read ./Data/task_<id>_<timestamp>.csv (presumably written by that
       script), clean the rows and append them to raw_data.csv; names with
       no result get a placeholder row so they are not searched again.
    5. Export the whole raw_data.csv to all_data.xls with xlwt.

    When ``c.test`` is true the input loops stop early, bounded by
    ``c.test_pages``.
    """
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    # A set keeps the "already stored?" check O(1) per keyword. Rows too
    # short to carry a name column are ignored rather than crashing.
    known_authors = {
        row[4] for row in _read_csv_rows("./raw_data.csv") if len(row) > 4}

    keywords = []
    i = 0
    for line in _read_csv_rows("./author_list.csv"):
        if not line:
            continue  # blank CSV line
        if line[0] not in known_authors:
            keywords.append(line[0])
        else:
            # The keyword used to be passed as a second print() argument,
            # which left the literal "%s" in the output.
            print("Will not append keyword %s" % line[0])
        i += 1
        if c.test and i > c.test_pages * 100:
            break

    # NOTE(review): the keyword is inserted into the URL unescaped, as
    # before; confirm the crawler copes with '&' or '#' inside a name.
    urls = []
    for i, keyword in enumerate(keywords, start=1):
        urls.append(
            "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword)
        if c.test and i > c.test_pages:
            break
    urlList = "".join(urls)
    print(urlList)

    descTaskID = invokeService(
        1, {"urlList_0": urlList})
    print("descTaskID: " + str(descTaskID))

    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)
    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(descTaskID) + ' ' + filename
    result = os.system(command)
    if result != 0:
        print("crawler exited with status:", result)

    file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)
    # The first row of the task output is skipped (treated as a header).
    crawled_rows = _read_csv_rows("./Data/" + file_name)[1:]

    after_remove_duplicate = _extract_profiles(crawled_rows)
    print("after_remove_duplicate", after_remove_duplicate)

    all_collected = [author[4] for author in after_remove_duplicate]
    print("all_collected:", all_collected)
    collected_names = set(all_collected)
    for keyword in keywords:
        if keyword not in collected_names:
            print("keyword not collected:", keyword)
            # Placeholder row: only the name column is filled in.
            after_remove_duplicate.append(['', '', '', '', keyword, ''])
    new_descTaskID = after_remove_duplicate
    print("new_descTaskID:", new_descTaskID)

    with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in new_descTaskID:
            writer.writerow(row)

    # xlwt is imported here, as in the original, so it is only required
    # once the export step is reached.
    import xlwt
    all_data = _read_csv_rows("./raw_data.csv")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet")
    for i, row in enumerate(all_data):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)
    workbook.save("all_data.xls")
# Script entry point: run the description crawl only when this file is
# executed directly; importing the module does not start it.
if __name__ == "__main__":
main()

View File

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

BIN
ExecuteStage/list.xlsx Normal file

Binary file not shown.

7
ExecuteStage/main.py Normal file
View File

@ -0,0 +1,7 @@
import author_crawl
import desc_crawl
# Script entry point: run both crawl stages in order -- author_crawl.main()
# first, then desc_crawl.main().
if __name__ == "__main__":
author_crawl.main()
desc_crawl.main()

View File

@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
from base64 import encode
import json
import os
def queryService(id):
    """
    Load a service definition from ``services/<id>.json``.

    :param id: integer service id; it is formatted with ``%d`` into the
        file name
    :return: the parsed JSON content of that file
    """
    service_path = "services/%d.json" % id
    with open(service_path, "r", encoding='utf-8') as handle:
        return json.load(handle)
def invokeService(id, data):
    """
    Create a task file from a service definition and caller parameters.

    The service ``services/<id>.json`` is loaded through ``queryService``.
    For every ``data`` key that matches the ``name`` of an entry in
    ``service["inputParameters"]``, the value is written into the graph node
    at index ``nodeId``, depending on that node's ``option``:

    * 1 -> ``parameters["links"]``
    * 4 -> ``parameters["value"]``
    * 8 with ``loopType == 0`` -> ``parameters["exitCount"]`` (as int)
    * 8 otherwise -> ``parameters["textList"]``

    The result is saved as ``tasks/<task id>.json`` and the task id printed.

    :param id: integer id of the service to instantiate
    :param data: dict mapping input-parameter names to values; a
        ``"urlList_0"`` entry is also copied to ``service["links"]``
    :return: int, the id of the newly written task
    """
    service = queryService(id)
    # An explicit membership test replaces the former bare try/except.
    if "urlList_0" in data:
        service["links"] = data["urlList_0"]

    for key, value in data.items():
        for parameter in service["inputParameters"]:
            if key != parameter["name"]:
                continue
            # nodeId is used as a position in the graph list.
            node = service["graph"][int(parameter["nodeId"])]
            if node["option"] == 1:
                node["parameters"]["links"] = value
            elif node["option"] == 4:
                node["parameters"]["value"] = value
            elif node["option"] == 8 and node["parameters"]["loopType"] == 0:
                node["parameters"]["exitCount"] = int(value)
            elif node["option"] == 8:
                node["parameters"]["textList"] = value
            break  # only the first parameter with this name is applied

    os.makedirs("tasks", exist_ok=True)
    count = len(os.listdir("tasks")) + 1
    # Entry count + 1 can collide with an existing file once something has
    # been deleted from tasks/; move on to the next free id in that case.
    while os.path.exists("tasks/%d.json" % count):
        count += 1
    service["id"] = count  # the stored task carries its own id
    print(count)
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # fixed; the platform default made the file content system-dependent.
    with open("tasks/%d.json" % count, "w", encoding='utf-8') as f:
        f.write(json.dumps(service, ensure_ascii=False))
    return count

View File

@ -0,0 +1,218 @@
{
"id": 6,
"name": "toutiao_authors",
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 2,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"
},
{
"id": 1,
"name": "loopTimes_Loop_Click_1",
"nodeId": 5,
"nodeName": "Loop_Click",
"desc": "循环Loop_Click执行的次数0代表无限循环",
"type": "int",
"exampleValue": 5,
"value": 5
}
],
"outputParameters": [
{
"id": 0,
"name": "参数1_链接文本",
"desc": "",
"type": "string",
"exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": -1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 0,
"url": "https://www.jd.com",
"links": "https://www.jd.com",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 1,
"index": 2,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 5,
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 3,
"index": 3,
"parentId": 2,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": 5,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 1,
"contentType": 0,
"relative": true,
"name": "参数1_链接文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "经常戴耳机危害 - 语音科普 - 博禾医生"
},
{
"num": 1,
"value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"
},
{
"num": 2,
"value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"
},
{
"num": 3,
"value": "长时间戴耳机听歌或致耳聋-名医在线网"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 2,
"index": 5,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop_Click",
"sequence": [3, 6],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//span[contains(text(), \"下一页\")]/../..",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 0,
"pathList": "",
"textList": "",
"exitCount": 5,
"historyWait": 2
}
},
{
"id": 4,
"index": 6,
"parentId": 2,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": true,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": true,
"xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": [],
"loopType": 0
}
}
]
}

View File

@ -0,0 +1,278 @@
{
"id": 7,
"name": "toutiao_author_desc",
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 1,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
}
],
"outputParameters": [
{
"id": 0,
"name": "参数2_文本",
"desc": "",
"type": "string",
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
},
{
"id": 1,
"name": "参数3_文本",
"desc": "",
"type": "string",
"exampleValue": "63.3万获赞"
},
{
"id": 2,
"name": "参数1_文本",
"desc": "",
"type": "string",
"exampleValue": "小荷医典"
},
{
"id": 3,
"name": "参数6_链接地址",
"desc": "",
"type": "string",
"exampleValue": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [1, 6, 2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": 1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 1,
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 3,
"index": 2,
"parentId": 0,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//a[contains(@class, 'flex-row')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": []
}
},
{
"id": -1,
"index": 3,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": -1,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": true,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
},
{
"num": 1,
"value": "10.3万粉丝"
},
{
"num": 2,
"value": "5关注"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 4,
"index": 5,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 3,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数2_文本",
"desc": "",
"relativeXpath": "//p[contains(@class, 'user-desc')]",
"exampleValues": [
{
"num": 0,
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数3_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
"exampleValues": [
{
"num": 0,
"value": "小荷医典"
}
]
}
]
}
},
{
"id": 2,
"index": 6,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数6_链接地址",
"desc": "",
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
"exampleValues": [
{
"num": 0,
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"default": ""
}
]
}
}
]
}

View File

@ -0,0 +1,292 @@
{
"id": 7,
"name": "toutiao_author_desc",
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"containJudge": false,
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"inputParameters": [
{
"id": 0,
"name": "urlList_0",
"nodeId": 1,
"nodeName": "Open Page",
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"desc": "要采集的网址列表,多行以\\n分开",
"type": "string",
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
}
],
"outputParameters": [
{
"id": 0,
"name": "参数2_文本",
"desc": "",
"type": "string",
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
},
{
"id": 1,
"name": "参数3_文本",
"desc": "",
"type": "string",
"exampleValue": "63.3万获赞"
},
{
"id": 2,
"name": "参数1_文本",
"desc": "",
"type": "string",
"exampleValue": "小荷医典"
},
{
"id": 3,
"name": "参数1_链接地址",
"desc": "",
"type": "string",
"exampleValue": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
}
],
"graph": [
{
"index": 0,
"id": 0,
"parentId": 0,
"type": -1,
"option": 0,
"title": "root",
"sequence": [1, 2, 5],
"parameters": {
"history": 1,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0
},
"isInLoop": false
},
{
"id": 1,
"index": 1,
"parentId": 0,
"type": 0,
"option": 1,
"title": "Open Page",
"sequence": [],
"isInLoop": false,
"position": 0,
"parameters": {
"useLoop": false,
"xpath": "",
"wait": 1,
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
"scrollType": 0,
"scrollCount": 0
}
},
{
"id": 2,
"index": 2,
"parentId": 0,
"type": 0,
"option": 2,
"title": "Click Element",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": -1,
"useLoop": false,
"xpath": "//a[contains(@class, 'flex-row')][1]",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"paras": []
}
},
{
"id": -1,
"index": 3,
"parentId": 0,
"type": 1,
"option": 8,
"title": "Loop",
"sequence": [4],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
"wait": 0,
"scrollType": 0,
"scrollCount": 0,
"loopType": 1,
"pathList": "",
"textList": "",
"exitCount": 0,
"historyWait": 2
}
},
{
"id": -1,
"index": 4,
"parentId": 3,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": true,
"position": 0,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": true,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
},
{
"num": 1,
"value": "10.3万粉丝"
},
{
"num": 2,
"value": "5关注"
}
],
"default": ""
}
],
"loopType": 1
}
},
{
"id": 3,
"index": 5,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 2,
"parameters": {
"history": 1,
"tabIndex": 1,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数2_文本",
"desc": "",
"relativeXpath": "//p[contains(@class, 'user-desc')]",
"exampleValues": [
{
"num": 0,
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数3_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
"exampleValues": [
{
"num": 0,
"value": "63.3万获赞"
}
],
"default": ""
},
{
"nodeType": 0,
"contentType": 0,
"relative": false,
"name": "参数1_文本",
"desc": "",
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
"exampleValues": [
{
"num": 0,
"value": "小荷医典"
}
]
},
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数1_链接地址",
"desc": "",
"relativeXpath": "//a[@class=\"avatar\"]",
"exampleValues": [
{
"num": 0,
"value": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
}
]
}
]
}
},
{
"id": -1,
"index": 6,
"parentId": 0,
"type": 0,
"option": 3,
"title": "Extract Data",
"sequence": [],
"isInLoop": false,
"position": 1,
"parameters": {
"history": 5,
"tabIndex": 0,
"useLoop": false,
"xpath": "",
"wait": 0,
"paras": [
{
"nodeType": 2,
"contentType": 0,
"relative": false,
"name": "参数6_链接地址",
"desc": "",
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
"exampleValues": [
{
"num": 0,
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
}
],
"default": ""
}
]
}
}
]
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{'id': 12, 'name': 'toutiao_authors', 'url': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=?&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'containJudge': False, 'desc': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=?&pd=synthesis&action_type=pagination&page_num=0', 'inputParameters': [{'id': 0, 'name': 'urlList_0', 'nodeId': 2, 'nodeName': 'Open Page', 'value': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword= ?&pd=synthesis&action_type=pagination&page_num=0', 'desc': ',\\n', 'type': 'string', 'exampleValue': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=? &pd=synthesis&action_type=pagination&page_num=0'}, {'id': 1, 'name': 'loopTimes_Loop_Click_1', 'nodeId': 5, 'nodeName': 'Loop_Click', 'desc': 'Loop_Click 0', 'type': 'int', 'exampleValue': 5, 'value': 5}], 'outputParameters': [{'id': 0, 'name': '1_', 'desc': '', 'type': 'string', 'exampleValue': ' - - '}], 'graph': [{'index': 0, 'id': 0, 'parentId': 0, 'type': -1, 'option': 0, 'title': 'root', 'sequence': [2, 5], 'parameters': {'history': 1, 'tabIndex': 0, 'useLoop': False, 'xpath': '', 'wait': 0}, 'isInLoop': False}, {'id': -1, 'index': 1, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 0, 'url': 'https://www.jd.com', 'links': 'https://www.jd.com', 'scrollType': 0, 'scrollCount': 0}}, {'id': 1, 'index': 2, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 5, 'url': 
'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword= ?&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'scrollType': 0, 'scrollCount': 0}}, {'id': 3, 'index': 3, 'parentId': 2, 'type': 1, 'option': 8, 'title': 'Loop', 'sequence': [4], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//div[contains(@class, "cs-source")]//span[contains(@class, \'text-ellipsis\')][1]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 1, 'pathList': '', 'textList': '', 'exitCount': 0, 'historyWait': 2}}, {'id': 5, 'index': 4, 'parentId': 3, 'type': 0, 'option': 3, 'title': 'Extract Data', 'sequence': [], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '', 'wait': 0, 'paras': [{'nodeType': 1, 'contentType': 0, 'relative': True, 'name': '1_', 'desc': '', 'relativeXpath': '', 'exampleValues': [{'num': 0, 'value': ' - - '}, {'num': 1, 'value': ' - - '}, {'num': 2, 'value': ' - - '}, {'num': 3, 'value': '-线'}], 'default': ''}], 'loopType': 1}}, {'id': 2, 'index': 5, 'parentId': 0, 'type': 1, 'option': 8, 'title': 'Loop_Click', 'sequence': [3, 6], 'isInLoop': False, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//span[contains(text(), "下一页")]/../..', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 0, 'pathList': '', 'textList': '', 'exitCount': 5, 'historyWait': 2}}, {'id': 4, 'index': 6, 'parentId': 2, 'type': 0, 'option': 2, 'title': 'Click Element', 'sequence': [], 'isInLoop': True, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': True, 'xpath': 
'//*[@id="s-dom-f0607f20"]/div[1]/div[1]/a[7]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'paras': [], 'loopType': 0}}]}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id": 12, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "desc": "要采集的网址列表,多行以\\n分开", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "循环Loop_Click执行的次数0代表无限循环", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "参数1_链接文本", "desc": "", "type": "string", "exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, "xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open 
Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "参数1_链接文本", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "经常戴耳机危害 - 语音科普 - 博禾医生"}, {"num": 1, "value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"}, {"num": 2, "value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"}, {"num": 3, "value": "长时间戴耳机听歌或致耳聋-名医在线网"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"下一页\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, 
{"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id": 4, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "desc": "\u8981\u91c7\u96c6\u7684\u7f51\u5740\u5217\u8868,\u591a\u884c\u4ee5\\n\u5206\u5f00", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "\u5faa\u73afLoop_Click\u6267\u884c\u7684\u6b21\u6570\uff080\u4ee3\u8868\u65e0\u9650\u5faa\u73af\uff09", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "type": "string", "exampleValue": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, 
"xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}, {"num": 1, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u6709\u4ec0\u4e48\u5371\u5bb3 - \u535a\u79be\u5fae\u89c6 - \u535a\u79be\u533b\u751f"}, {"num": 2, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u8033\u6735\u4f1a\u4e0d\u4f1a\u804b - 
\u4e13\u5bb6\u6587\u7ae0 - \u535a\u79be\u533b\u751f"}, {"num": 3, "value": "\u957f\u65f6\u95f4\u6234\u8033\u673a\u542c\u6b4c\u6216\u81f4\u8033\u804b-\u540d\u533b\u5728\u7ebf\u7f51"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"\u4e0b\u4e00\u9875\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, {"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}

218
ExecuteStage/tasks/5.json Normal file

File diff suppressed because one or more lines are too long

View File

218
ExecuteStage/tasks/7.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,6 @@
import pandas as pd
if __name__ == "__main__":
    # Ad-hoc sanity check of the spreadsheet: load the worksheet named
    # "sheet2" from list.xlsx (path is relative to the current working
    # directory) and print the row count together with the first row.
    df = pd.read_excel('list.xlsx', sheet_name="sheet2")
    data = df.values  # worksheet contents as a 2-D NumPy array of rows
    # An empty worksheet has no data[0]; print only the count in that case
    # instead of failing with IndexError.
    if len(data):
        print(len(data), data[0])
    else:
        print(0)

View File

@ -1,4 +1,3 @@
from django.http import HttpResponse
import pymongo
import json