mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-12 11:37:11 +08:00
New version
This commit is contained in:
parent
3646513d5b
commit
f125db1f8e
712
ExecuteStage/ServiceWrapper_ExecuteStage.py
Normal file
712
ExecuteStage/ServiceWrapper_ExecuteStage.py
Normal file
@ -0,0 +1,712 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import atexit # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
from lib2to3.pgen2 import driver
|
||||
import re
|
||||
import sys
|
||||
from urllib import parse
|
||||
import base64
|
||||
import hashlib
|
||||
import time
|
||||
import requests
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import NoSuchElementException
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.common.exceptions import StaleElementReferenceException
|
||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
import random
|
||||
import numpy
|
||||
import csv
|
||||
import os
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
|
||||
# Module-level state shared by the functions below (clean() reads all five names).
# The __main__ section reassigns them before the task starts.
#   saveName - base name of the output files (None until chosen)
#   log      - accumulated log text, appended to by recordLog()
#   OUTPUT   - collected rows (rebound to a list in __main__)
#   browser  - the WebDriver instance (None until created)
#   SAVED    - True once the results have been written by the normal exit path
saveName, log, OUTPUT, browser, SAVED = None, "", "", None, False

# This binds the DesiredCapabilities.CHROME dict itself (no copy is made), so the
# assignment below mutates that shared dict in place.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
# NOTE(review): desired_capabilities is never passed to webdriver.Chrome in the
# visible code - presumably the in-place mutation above is what is relied on; confirm.

# Maps each output field name to the most recently extracted value.
outputParameters = {}
|
||||
|
||||
class Time:
    """Millisecond stopwatch whose elapsed time is reported through Log().

    The start time is captured on construction; ``end()`` may be called any
    number of times and always measures from that same start.
    """

    def __init__(self, type1=""):
        # Label that prefixes the elapsed time in the log line.
        self.type = type1
        # Start timestamp in whole milliseconds.
        self.t = int(round(time.time() * 1000))

    def end(self):
        """Log ``<label>:<elapsed milliseconds>`` for the time since construction."""
        elapsed = int(round(time.time() * 1000)) - self.t
        Log(str(self.type) + ":" + str(elapsed))
|
||||
|
||||
# 记录log
|
||||
|
||||
|
||||
def recordLog(str=""):
    """Append ``str`` followed by a newline to the module-level ``log`` buffer.

    The parameter name shadows the builtin ``str``; it is kept because it is
    part of the function's keyword interface.
    """
    global log
    log += str + "\n"
|
||||
|
||||
|
||||
# 控制台打印log函数
|
||||
def Log(text, text2=""):
    """Print a debug message to the console when console logging is enabled.

    Console logging is controlled by the hard-coded flag below, which is
    currently off, so the call prints nothing.
    """
    enabled = False
    if not enabled:
        return
    print(text, text2)
|
||||
|
||||
# 屏幕滚动函数
|
||||
|
||||
|
||||
def _scrollPage(para):
    """Send the configured scroll key to <body> ``scrollCount`` times, pausing 1 s before each press."""
    if para["scrollType"] == 0 or para["scrollCount"] <= 0:
        return
    # scrollType 1 scrolls one page at a time; any other non-zero value jumps to the end.
    # Selenium's key constant is PAGE_DOWN (the Keys class defines no ``PGDN``).
    key = Keys.PAGE_DOWN if para["scrollType"] == 1 else Keys.END
    for _ in range(para["scrollCount"]):
        time.sleep(1)  # Wait 1 second before each scroll step
        Log("下拉完等待1秒")
        browser.find_element(By.CSS_SELECTOR, "body").send_keys(key)


def scrollDown(para, rt=""):
    """Scroll the current page according to ``para``.

    Args:
        para: Node parameters. ``scrollType`` 0 disables scrolling, 1 sends
            Page Down, any other value sends End; ``scrollCount`` is the
            number of key presses.
        rt: Optional ``Time`` stopwatch. When given (anything other than the
            empty string) its ``end()`` is called after a timeout has been
            handled.

    If the browser raises ``TimeoutException`` while scrolling, page loading
    is stopped with ``window.stop()`` and the whole scroll sequence is run
    once more.
    """
    try:
        _scrollPage(para)
    except TimeoutException:
        Log('time out after 10 seconds when scrolling. ')
        recordLog('time out after 10 seconds when scrolling')
        browser.execute_script('window.stop()')
        _scrollPage(para)
        if rt != "":
            rt.end()
|
||||
|
||||
|
||||
# 执行节点关键函数部分
|
||||
def excuteNode(nodeId, loopValue="", clickPath="", index=0):
    """Execute one node of the task graph, dispatching on its ``option`` code.

    Args:
        nodeId: Key of the node inside the global ``procedure``.
        loopValue: Current loop item, forwarded unchanged to the handler.
        clickPath: XPath of the enclosing loop, forwarded to the handler.
        index: Position of the current loop item, forwarded to the handler.

    Option codes handled here: 0 and 10 run the node's ``sequence`` in order,
    1 opens a page, 2 clicks, 3 extracts data, 4 types text, 8 runs a loop and
    9 evaluates a conditional. Every node except option 0 then sleeps for
    ``parameters["wait"]`` seconds when that value is greater than 1, and for
    0.01 seconds otherwise.
    """
    node = procedure[nodeId]
    # NOTE(review): the next two statements do NOT wait for anything. ``.until`` is only
    # referenced, never called, and the parenthesised expression after the comment is a
    # separate statement that merely builds the expected-condition object. Presumably one
    # ``WebDriverWait(browser, 10).until(...)`` call was intended - confirm before changing,
    # since a real wait would also apply to nodes whose xpath is empty.
    WebDriverWait(browser, 10).until
    # Original intent: wait for the element to appear before acting; fail after 10 seconds
    (EC.visibility_of_element_located((By.XPATH, node["parameters"]["xpath"])))

    # Run the handler that matches the node's option code
    if node["option"] == 0 or node["option"] == 10:  # Root node / body of a condition branch
        for i in node["sequence"]:  # Walk the children in order
            excuteNode(i, loopValue, clickPath, index)
    elif node["option"] == 1:  # Open a web page
        recordLog("openPage")
        openPage(node["parameters"], loopValue)
    elif node["option"] == 2:  # Click an element
        recordLog("Click")
        clickElement(node["parameters"], loopValue, clickPath, index)
    elif node["option"] == 3:  # Extract data
        recordLog("getData")
        getData(node["parameters"], loopValue, node["isInLoop"], parentPath = clickPath, index = index)
    elif node["option"] == 4:  # Type text
        inputInfo(node["parameters"], loopValue)
    elif node["option"] == 8:  # Loop
        recordLog("loop")
        loopExcute(node, loopValue, clickPath, index)  # Run the loop
    elif node["option"] == 9:  # Conditional
        recordLog("judge")
        judgeExcute(node, loopValue, clickPath, index)

    # Pause after the node has run
    if node["option"] != 0:
        waitTime = 0.01  # Default pause of 0.01 seconds
        if node["parameters"]["wait"] > 1:
            waitTime = node["parameters"]["wait"]
        time.sleep(waitTime)
        Log("Node执行完后等待:", waitTime)
|
||||
|
||||
|
||||
# 对判断条件的处理
|
||||
def _branchConditionMet(cnode, loopElement):
    """Return True when the condition stored on branch node ``cnode`` currently holds."""
    conditionType = int(cnode["parameters"]["class"])
    if conditionType == 0:  # Unconditional branch
        return True
    try:
        value = cnode["parameters"]["value"]
        if conditionType == 1:  # Current page contains the text
            return bodyText.find(value) >= 0
        if conditionType == 2:  # Current page contains the element
            return bool(browser.find_element(By.XPATH, value))
        if conditionType == 3:  # Current loop element contains the text
            return loopElement.text.find(value) >= 0
        if conditionType == 4:  # Current loop element contains the element
            # The first character of the stored XPath is dropped before the relative lookup.
            return bool(loopElement.find_element(By.XPATH, value[1:]))
    except Exception:
        # Element not found, malformed XPath, no loop element, ...: the condition is not met.
        return False
    return False


def judgeExcute(node, loopElement, clickPath="", index=0):
    """Evaluate a conditional node and execute its first matching branch.

    Args:
        node: The conditional node; ``node["sequence"]`` lists the branch ids
            in priority order.
        loopElement: Current loop item, used by the loop-element conditions
            and forwarded to the chosen branch.
        clickPath: XPath of the enclosing loop, forwarded to the branch.
        index: Position of the current loop item, forwarded to the branch.

    Branches are tested in order and the first one whose condition holds is
    executed. When no branch matches, nothing is executed; the previous
    behaviour of falling back to node 0 re-ran the root node recursively.
    """
    rt = Time("条件判断")
    matchedBranchId = None
    for i in node["sequence"]:
        if _branchConditionMet(procedure[i], loopElement):
            matchedBranchId = i
            break
    rt.end()
    if matchedBranchId is None:
        recordLog("judge: no branch condition matched")
        return
    excuteNode(matchedBranchId, loopElement, clickPath, index)
|
||||
|
||||
|
||||
# 对循环的处理
|
||||
def loopExcute(node, loopValue, clickPath="", index=0):
    """Run the children of a loop node once per loop item.

    Args:
        node: The loop node. ``node["parameters"]["loopType"]`` selects the mode:
            0 repeats on a single element until it disappears or ``exitCount``
            iterations are done, 1 iterates over all elements matching ``xpath``,
            2 iterates over the XPaths listed in ``pathList``, 3 over the lines
            of ``textList`` and 4 over the non-empty lines of ``textList``
            treated as URLs.
        loopValue: Not read in this function.
        clickPath: Not read in this function.
        index: Overwritten by the loop variable in mode 1; otherwise not read.

    Modes 1 and 2 restore the browser after each item: extra tabs are closed
    until the original tab is current again, and the tab's history is rewound
    when ``history["index"]`` differs from the length recorded at the start.
    """
    time.sleep(0.1)  # Forced 0.1-second pause before the loop starts
    Log("循环执行前等待0.1秒")
    global history
    thisHandle = browser.current_window_handle  # Handle of the tab this loop runs in
    thisHistoryLength = browser.execute_script(
        'return history.length')  # history.length of that tab when the loop starts
    history["index"] = thisHistoryLength
    history["handle"] = thisHandle

    if int(node["parameters"]["loopType"]) == 0:  # Single-element loop
        # No tab-switch handling in this mode
        count = 0  # Number of completed iterations
        while True:  # do-while style loop
            try:
                element = browser.find_element(By.XPATH,
                                               node["parameters"]["xpath"])
                for i in node["sequence"]:  # Run each child operation in order
                    excuteNode(i, element, node["parameters"]["xpath"], 0)
                Log("click: ", node["parameters"]["xpath"])
                recordLog("click:" + node["parameters"]["xpath"])
            # NOTE(review): bare except - any error raised by a child node is also reported
            # as "clickNotFound", not only a missing element.
            except:
                Log("clickNotFound: ", node["parameters"]["xpath"])
                recordLog("clickNotFound:" + node["parameters"]["xpath"])
                for i in node["sequence"]:  # Intended: run the remaining non-click children (e.g. data extraction) once more
                    # NOTE(review): this tests the loop node's own option rather than the
                    # child's, so every child is executed here - confirm the intent.
                    if node["option"] != 2:
                        excuteNode(i, None, node["parameters"]["xpath"], 0)
                break  # Leave the loop once the element can no longer be found
            count = count + 1
            Log("页数:", count)
            recordLog("页数:" + str(count))
            if node["parameters"]["exitCount"] == count:  # Configured iteration limit reached
                break
    elif int(node["parameters"]["loopType"]) == 1:  # Variable element list
        try:
            elements = browser.find_elements(By.XPATH,
                                             node["parameters"]["xpath"])
            # The loop variable reuses the name of the ``index`` parameter and overwrites it.
            for index in range(len(elements)):
                for i in node["sequence"]:  # Run every child operation in order
                    excuteNode(i, elements[index],
                               node["parameters"]["xpath"], index)
                if browser.current_window_handle != thisHandle:  # The iteration ended on a different tab
                    while True:  # Close tabs until the original one is current again
                        browser.close()  # Close the tab that is no longer needed
                        browser.switch_to.window(browser.window_handles[-1])
                        if browser.current_window_handle == thisHandle:
                            break
                if history["index"] != thisHistoryLength and history[
                        "handle"] == browser.current_window_handle:  # History changed during the iteration while staying on this tab
                    difference = thisHistoryLength - \
                        history["index"]  # Number of history entries to step by
                    browser.execute_script(
                        'history.go(' + str(difference) + ')')  # Rewind the history
                    if node["parameters"]["historyWait"] > 2:  # Time to wait after rewinding
                        time.sleep(node["parameters"]["historyWait"])
                    else:
                        time.sleep(2)
                    Log("切换历史记录等待2秒或者:", node["parameters"]["historyWait"])
                    browser.execute_script('window.stop()')
        except NoSuchElementException:
            Log("pathNotFound: ", node["parameters"]["xpath"])
            recordLog("pathNotFound: " + node["parameters"]["xpath"])
            pass  # A missing element inside the loop is skipped
        except Exception as e:
            raise
    elif int(node["parameters"]["loopType"]) == 2:  # Fixed element list
        for path in node["parameters"]["pathList"].split("\n"):  # One XPath per line
            try:
                element = browser.find_element(By.XPATH, path)
                for i in node["sequence"]:  # Run each child operation in order
                    excuteNode(i, element, path, 0)
                if browser.current_window_handle != thisHandle:  # The iteration ended on a different tab
                    while True:  # Close tabs until the original one is current again
                        browser.close()  # Close the tab that is no longer needed
                        browser.switch_to.window(browser.window_handles[-1])
                        if browser.current_window_handle == thisHandle:
                            break
                if history["index"] != thisHistoryLength and history[
                        "handle"] == browser.current_window_handle:  # History changed during the iteration while staying on this tab
                    difference = thisHistoryLength - \
                        history["index"]  # Number of history entries to step by
                    browser.execute_script(
                        'history.go(' + str(difference) + ')')  # Rewind the history
                    if node["parameters"]["historyWait"] > 2:  # Time to wait after rewinding
                        time.sleep(node["parameters"]["historyWait"])
                    else:
                        time.sleep(2)
                    Log("切换历史记录等待2秒或者:", node["parameters"]["historyWait"])
                    browser.execute_script('window.stop()')
            except NoSuchElementException:
                Log("pathNotFound: ", path)
                recordLog("pathNotFound: " + path)
                continue  # A missing element inside the loop is skipped
            except Exception as e:
                raise
    elif int(node["parameters"]["loopType"]) == 3:  # Fixed text list
        textList = node["parameters"]["textList"].split("\n")
        for text in textList:
            recordLog("input: " + text)
            for i in node["sequence"]:  # Run each child operation in order
                excuteNode(i, text, "", 0)
    elif int(node["parameters"]["loopType"]) == 4:  # Fixed URL list
        urlList = list(filter(isnull, node["parameters"]["textList"].split("\n")))  # Drop empty lines
        for url in urlList:
            recordLog("input: " + url)
            for i in node["sequence"]:
                excuteNode(i, url, "", 0)
    history["index"] = thisHistoryLength
    history["handle"] = browser.current_window_handle
    scrollDown(node["parameters"])
|
||||
|
||||
|
||||
# 打开网页事件
|
||||
def openPage(para, loopValue):
    """Load a URL in the first browser tab and refresh the page-level state.

    Args:
        para: The node's ``parameters`` dict. ``useLoop`` is read here and the
            dict is also handed to scrollDown().
        loopValue: URL to open when ``para["useLoop"]`` is truthy; otherwise
            ``links[urlId]`` is opened.

    After loading, ``history`` is updated, the page is scrolled as configured,
    ``bodyText`` is refreshed when the task contains conditionals, and every
    value in ``outputParameters`` is reset to the empty string.
    """
    rt = Time("打开网页")
    time.sleep(2)  # Unconditional 2-second pause before the page is opened
    global links
    global urlId
    global history
    global outputParameters
    if len(browser.window_handles) > 1:
        browser.switch_to.window(browser.window_handles[-1])  # Select the last tab so it can be closed
        browser.close()
    browser.switch_to.window(browser.window_handles[0])  # Page opening always happens in the first tab
    history["handle"] = browser.current_window_handle
    if para["useLoop"]:
        url = loopValue
    else:
        url = links[urlId]
    try:
        browser.get(url)
        Log('Loading page: ' + url)
        recordLog('Loading page: ' + url)
    except TimeoutException:
        Log('time out after 10 seconds when loading page: ' + url)
        recordLog('time out after 10 seconds when loading page: ' + url)
        browser.execute_script('window.stop()')  # Abort loading and continue with what is there
        rt.end()
    try:
        history["index"] = browser.execute_script("return history.length")
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script("return history.length")
        rt.end()
    scrollDown(para, rt)  # Scroll the page as configured
    if containJudge:
        global bodyText  # bodyText must be refreshed after every click, input and page-open operation
        try:
            bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
            Log('URL Page: ' + url)
            recordLog('URL Page: ' + url)
        except TimeoutException:
            Log('time out after 10 seconds when getting body text: ' + url)
            recordLog('time out after 10 seconds when getting body text:: ' + url)
            browser.execute_script('window.stop()')
            time.sleep(1)
            Log("获得bodytext等待1秒")
            # Try once more after stopping the load
            bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
            rt.end()
        except Exception as e:
            Log(e)
            recordLog(str(e))

    # Clear the output parameters
    for key in outputParameters:
        outputParameters[key] = ""

    rt.end()
|
||||
|
||||
|
||||
# 键盘输入事件
|
||||
def inputInfo(para, loopValue):
    """Replace the content of a text box and refresh ``bodyText``.

    Args:
        para: Node parameters. ``xpath`` locates the text box; ``useLoop``
            chooses between ``loopValue`` and ``para["value"]`` as the text.
        loopValue: Text to type when ``para["useLoop"]`` is truthy.

    Raises:
        SystemExit: When the text box cannot be located. The failure is
            written to the log first.
    """
    global bodyText  # bodyText must be refreshed after every click, input and page-open operation
    time.sleep(1)  # Wait 1 second before typing
    Log("输入前等待1秒")
    rt = Time("输入文字")
    try:
        textbox = browser.find_element(By.XPATH, para["xpath"])
    except Exception:
        Log("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        recordLog("找不到输入框元素:" + para["xpath"] + "请尝试执行前等待")
        # sys.exit() instead of the site-provided exit(), which is not guaranteed to exist.
        sys.exit()
    # Select everything and delete it so the new text replaces the old content.
    textbox.send_keys(Keys.CONTROL, 'a')
    textbox.send_keys(Keys.BACKSPACE)
    if para["useLoop"]:
        textbox.send_keys(loopValue)
    else:
        textbox.send_keys(para["value"])
    bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
    rt.end()
|
||||
|
||||
|
||||
# 点击元素事件
|
||||
def clickElement(para, loopElement=None, clickPath="", index=0):
    """Click an element through JavaScript and update the browsing state.

    Args:
        para: Node parameters. ``useLoop`` selects ``clickPath`` instead of
            ``para["xpath"]``; the dict is also handed to scrollDown().
        loopElement: Not read in this function.
        clickPath: XPath of the enclosing loop, used when ``useLoop`` is truthy.
        index: Number of XPath matches to skip before the one that is clicked.

    When the click opens a new tab the browser switches to it. Afterwards
    ``history`` is refreshed, the page is scrolled as configured and
    ``bodyText`` is reloaded when the task contains conditionals.
    """
    global history, bodyText
    time.sleep(0.1)  # Short pause before clicking
    rt = Time("点击元素")
    Log("点击之前等待1秒")
    # Inside a loop the caller-supplied clickPath is the effective XPath.
    path = clickPath if para["useLoop"] else para["xpath"]
    handleCountBefore = len(browser.window_handles)  # Tab count before the click
    try:
        script = (
            'var result = document.evaluate(`' + path +
            '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
        )
        browser.execute_script(script, str(index))  # Click via JavaScript
    except TimeoutException:
        Log('time out after 10 seconds when loading clicked page')
        recordLog('time out after 10 seconds when loading clicked page')
        browser.execute_script('window.stop()')
        rt.end()
    except Exception as e:
        Log(e)
        recordLog(str(e))
    time.sleep(0.5)  # Wait half a second after the click
    Log("点击之后等待0.5秒")
    if len(browser.window_handles) != handleCountBefore:
        # The click opened a new tab: move to it and remember its handle.
        browser.switch_to.window(browser.window_handles[-1])
        history["handle"] = browser.current_window_handle
    # The history length is refreshed the same way whether or not a tab was opened.
    try:
        history["index"] = browser.execute_script("return history.length")
    except TimeoutException:
        browser.execute_script('window.stop()')
        history["index"] = browser.execute_script("return history.length")
        rt.end()
    scrollDown(para, rt)  # Scroll as configured
    if containJudge:  # Only needed when the task has conditionals
        try:
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
        except TimeoutException:
            Log('time out after 10 seconds when getting body text')
            recordLog('time out after 10 seconds when getting body text')
            browser.execute_script('window.stop()')
            time.sleep(1)
            Log("bodytext等待1秒")
            # Try once more after stopping the load
            bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
            rt.end()
        except Exception as e:
            Log(e)
            recordLog(str(e))
    rt.end()
|
||||
|
||||
|
||||
# 提取数据事件
|
||||
# JavaScript that joins the text of an element's direct text-node children
# (nodeType 3) with single spaces, ignoring text inside child elements.
_OWN_TEXT_SCRIPT = (
    'var arr = [];'
    'var content = arguments[0];'
    'for(var i = 0, len = content.childNodes.length; i < len; i++) {'
    'if(content.childNodes[i].nodeType === 3){'
    'arr.push(content.childNodes[i].nodeValue);'
    '}'
    '}'
    'var str = arr.join(" ");'
    'return str;'
)

# nodeType code -> attribute that replaces the text content for that kind of field.
_NODE_TYPE_ATTRIBUTE = {2: "href", 3: "value", 4: "src"}


def _locateDataElement(p, loopElement, parentPath, index):
    """Return the element that field definition ``p`` refers to."""
    xpath = p["relativeXpath"]
    if not p["relative"]:
        return browser.find_element(By.XPATH, xpath)
    if xpath == "":  # The field is the loop element itself; no second lookup needed
        return loopElement
    if xpath.find("//") >= 0:
        # Descendant search: combine with the loop XPath and pick the match for this iteration.
        full_path = "(" + parentPath + xpath + ")" + "[" + str(index + 1) + "]"
        return browser.find_element(By.XPATH, full_path)
    # The first character of the stored XPath is dropped before the relative lookup.
    return loopElement.find_element(By.XPATH, xpath[1:])


def _extractContent(p, element):
    """Read the value of field ``p`` from ``element`` according to contentType/nodeType."""
    contentType = p["contentType"]
    if contentType == 2:
        return element.get_attribute('innerHTML')
    if contentType == 3:
        return element.get_attribute('outerHTML')
    if contentType == 1:  # Only the element's own text, without child elements
        ownText = browser.execute_script(_OWN_TEXT_SCRIPT, element).replace("\n", "")
        # Collapse whitespace runs; the former str.replace("\\s+", " ") only matched
        # the literal characters backslash, "s", "+" and therefore never did this.
        content = re.sub(r"\s+", " ", ownText)
    elif contentType == 0:
        content = element.text
    else:
        return ""
    attribute = _NODE_TYPE_ATTRIBUTE.get(p["nodeType"])
    if attribute is None:
        return content
    value = element.get_attribute(attribute)
    return value if value is not None else ""


def getData(para, loopElement, isInLoop=True, parentPath="", index = 0):
    """Extract every configured field and append one row to ``OUTPUT``.

    Args:
        para: Node parameters; ``para["paras"]`` is the list of field
            definitions and ``para["wait"]`` the configured wait time.
        loopElement: Current loop element, used for relative lookups.
        isInLoop: Whether the node sits inside a loop. Outside a loop with a
            wait of 0 the function pauses for 1 second first.
        parentPath: XPath of the enclosing loop, used for ``//`` lookups.
        index: Zero-based position of the current loop item.

    For each field the element is located and its value stored in
    ``outputParameters``. A missing element stores the field's ``default``
    (or the empty string). A timeout stops page loading and retries the
    lookup once. A stale element is retried once after 3 seconds; if the
    retry fails as well, the field keeps its previous value.
    """
    if not isInLoop and para["wait"] == 0:
        time.sleep(1)  # Default 1-second wait for extraction outside a loop with no configured wait
        Log("提取数据等待1秒")
    rt = Time("提取数据")
    for p in para["paras"]:
        try:
            try:
                element = _locateDataElement(p, loopElement, parentPath, index)
            except TimeoutException:
                Log('time out after 10 seconds when getting data')
                recordLog('time out after 10 seconds when getting data')
                browser.execute_script('window.stop()')
                element = _locateDataElement(p, loopElement, parentPath, index)
                rt.end()
        except NoSuchElementException:  # Element missing: fall back to the default value
            outputParameters[p["name"]] = p.get("default", "")
            Log('Element %s not found,use default' % p["relativeXpath"])
            recordLog('Element %s not found, use default' % p["relativeXpath"])
            continue
        try:
            content = _extractContent(p, element)
        except StaleElementReferenceException:  # Element went stale: wait, locate again and retry once
            recordLog('StaleElementReferenceException:'+p["relativeXpath"])
            time.sleep(3)
            try:
                element = _locateDataElement(p, loopElement, parentPath, index)
                if not p["relative"]:
                    recordLog('StaleElementReferenceException:relativeXpath')
                elif p["relativeXpath"] == "":
                    recordLog('StaleElementReferenceException:loopElement')
                else:
                    recordLog('StaleElementReferenceException:loopElement+relativeXPath')
                content = _extractContent(p, element)
            except (StaleElementReferenceException, NoSuchElementException):
                recordLog('StaleElementReferenceException:'+p["relativeXpath"])
                continue  # Give up on this field if the retry fails too
        outputParameters[p["name"]] = content
    line = list(outputParameters.values())
    for value in line:
        print(str(value)[:15], " ", end="")  # Short console preview of each value
    print("")
    OUTPUT.append(line)
    rt.end()
|
||||
|
||||
|
||||
# 判断字段是否为空
|
||||
def isnull(s):
    """Return True when ``s`` has at least one item.

    Despite its name this reports NON-empty values, which is what the
    ``filter(isnull, ...)`` call sites use to drop empty lines.
    """
    return len(s) > 0
|
||||
|
||||
|
||||
@atexit.register
def clean():
    """Exit hook: save the log and collected rows if the normal path did not.

    Does nothing when ``SAVED`` is already true. Otherwise writes
    ``Data/<saveName>_log.txt`` and ``Data/<saveName>.csv``, closes the
    browser and calls ``sys.exit`` with the CSV file name. If the run ended
    before ``saveName`` was chosen there is nothing to write, so only the
    browser (if one was started) is closed.
    """
    global saveName, log, OUTPUT, browser, SAVED
    if SAVED:
        return
    if saveName is None:
        # No output name yet, so no file can be written; still release the browser.
        if browser is not None:
            browser.quit()
        return
    print('清理环境保存数据')
    os.makedirs("Data", exist_ok=True)  # Do not lose the data because the folder is missing
    with open("Data/"+saveName + '_log.txt', 'w', encoding='utf-8-sig') as file_obj:
        file_obj.write(log)
    with open("Data/"+saveName + '.csv', 'w', encoding='utf-8-sig', newline="") as f:
        csv.writer(f).writerows(OUTPUT)
    if browser is not None:
        browser.quit()
    # Kept from the original implementation.
    sys.exit(saveName + '.csv')
|
||||
|
||||
|
||||
# Script entry point. Command-line arguments: [1] task id (default 7),
# [2] suffix of the output file name (default: a random number),
# [3] back-end address (default: the public service URL below).
if __name__ == '__main__':
    options = Options()
    exe_path = "chromedriver.exe"
    # Choose the Chrome / chromedriver location from the layout of the working directory.
    # The three branches print the same "ServiceWrapper" message text.
    if os.path.exists(os.getcwd()+"/ServiceWrapper"):
        print("Finding chromedriver in ServiceWrapper",
              os.getcwd()+"/ServiceWrapper")
        options.binary_location = "ServiceWrapper/Chrome/chrome.exe"  # Location of chrome
        exe_path = "ServiceWrapper/Chrome/chromedriver.exe"
    elif os.path.exists(os.getcwd()+"/Debug"):
        print("Finding chromedriver in ServiceWrapper",
              os.getcwd()+"/Debug")
        options.binary_location = "Debug/Chrome/chrome.exe"  # Location of chrome
        exe_path = "Debug/Chrome/chromedriver.exe"
    elif os.getcwd().find("ExecuteStage") >= 0:  # Started directly from the ExecuteStage folder
        print("Finding chromedriver in ServiceWrapper",
              os.getcwd()+"/Debug")
        options.binary_location = "./Chrome/chrome.exe"  # Location of chrome
        exe_path = "./Chrome/chromedriver.exe"
    else:
        options.binary_location = "chrome.exe"  # Location of chrome
    # NOTE(review): ``executable_path`` is a deprecated keyword in Selenium 4 - verify it is
    # still accepted by the Selenium version this project pins.
    browser = webdriver.Chrome(options=options, executable_path=exe_path)
    browser.get('about:blank')
    browser.set_page_load_timeout(10)  # Maximum page-load time in seconds
    browser.set_script_timeout(10)
    # NOTE(review): ``id`` shadows the builtin of the same name for the rest of the module.
    if len(sys.argv) > 1:
        id = int(sys.argv[1])  # Task id from the command line
    else:
        id = 7  # Default task id
    print("id:", id)
    if len(sys.argv) > 2:
        saveName = "task_" + str(id) + "_" + sys.argv[2]  # Base name of the output files
    else:
        saveName = "task_" + str(id) + "_" + \
            str(random.randint(0, 999999999))  # Base name of the output files
    print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
    if len(sys.argv) > 3:
        backEndAddress = sys.argv[3]
    else:
        backEndAddress = "http://servicewrapper.naibo.wang"

    # NOTE(review): no timeout is passed to requests.get here.
    content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
    service = json.loads(content.text)  # Task description
    print("name:", service["name"])
    procedure = service["graph"]  # Node graph that drives execution
    links = list(filter(isnull, service["links"].split("\n")))  # Start URLs, empty lines removed
    OUTPUT = []  # Collected rows
    OUTPUT.append([])  # First row is the header
    containJudge = service["containJudge"]  # Whether the task contains conditional nodes
    bodyText = ""  # Text of the current page body
    tOut = service["outputParameters"]  # Definitions of the output fields
    outputParameters = {}
    log = ""  # Log buffer written to the *_log.txt file
    history = {"index": 0, "handle": None}  # Recorded history.length and tab handle of the current page
    SAVED = False  # Whether the results have been written already
    for para in tOut:
        outputParameters[para["name"]] = ""
        OUTPUT[0].append(para["name"])
    # Run the whole task once per start URL
    urlId = 0  # Index of the start URL currently being processed
    for i in range(len(links)):
        excuteNode(0)
        urlId = urlId + 1
    print("执行完成!")
    recordLog("Done!")
    with open("Data/"+saveName + '_log.txt', 'w', encoding='utf-8-sig') as file_obj:
        file_obj.write(log)
        file_obj.close()  # Explicit close inside the with block
    with open("Data/"+saveName + '.csv', 'w', encoding='utf-8-sig', newline="") as f:
        f_csv = csv.writer(f)
        for line in OUTPUT:
            f_csv.writerow(line)
        f.close()  # Explicit close inside the with block
    SAVED = True  # Tells the atexit hook clean() that nothing is left to save
    browser.quit()
|
@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import atexit # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
from lib2to3.pgen2 import driver
|
||||
import re
|
||||
import sys
|
||||
from urllib import parse
|
||||
@ -64,7 +65,7 @@ def scrollDown(para, rt=""):
|
||||
for i in range(para["scrollCount"]):
|
||||
time.sleep(1) # 下拉完等1秒
|
||||
Log("下拉完等待1秒")
|
||||
body = browser.find_element_by_css_selector("body")
|
||||
body = browser.find_element(By.CSS_SELECTOR, "body")
|
||||
if para["scrollType"] == 1:
|
||||
body.send_keys(Keys.PGDN)
|
||||
else:
|
||||
@ -77,7 +78,7 @@ def scrollDown(para, rt=""):
|
||||
for i in range(para["scrollCount"]):
|
||||
time.sleep(1) # 下拉完等1秒
|
||||
Log("下拉完等待1秒")
|
||||
body = browser.find_element_by_css_selector("body")
|
||||
body = browser.find_element(By.CSS_SELECTOR, "body")
|
||||
if para["scrollType"] == 1:
|
||||
body.send_keys(Keys.PGDN)
|
||||
else:
|
||||
@ -283,6 +284,18 @@ def openPage(para, loopValue):
|
||||
global links
|
||||
global urlId
|
||||
global history
|
||||
# try:
|
||||
# firstTime = True
|
||||
# for handle in browser.window_handles:
|
||||
# browser.switch_to.window(handle)
|
||||
# if (not firstTime):
|
||||
# browser.close()
|
||||
# firstTime = False
|
||||
# except:
|
||||
# return
|
||||
if len(browser.window_handles) > 1:
|
||||
browser.switch_to.window(browser.window_handles[-1]) # 打开网页操作从第1个页面开始
|
||||
browser.close()
|
||||
browser.switch_to.window(browser.window_handles[0]) # 打开网页操作从第1个页面开始
|
||||
history["handle"] = browser.current_window_handle
|
||||
if para["useLoop"]:
|
||||
@ -306,7 +319,7 @@ def openPage(para, loopValue):
|
||||
if containJudge:
|
||||
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
|
||||
try:
|
||||
bodyText = browser.find_element_by_css_selector("body").text
|
||||
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
|
||||
except TimeoutException:
|
||||
Log('time out after 10 seconds when getting body text: ' + url)
|
||||
recordLog('time out after 10 seconds when getting body text:: ' + url)
|
||||
@ -314,7 +327,7 @@ def openPage(para, loopValue):
|
||||
time.sleep(1)
|
||||
Log("获得bodytext等待1秒")
|
||||
# 再执行一遍
|
||||
bodyText = browser.find_element_by_css_selector("body").text
|
||||
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
|
||||
rt.end()
|
||||
except Exception as e:
|
||||
Log(e)
|
||||
@ -340,7 +353,7 @@ def inputInfo(para, loopValue):
|
||||
else:
|
||||
textbox.send_keys(para["value"])
|
||||
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
|
||||
bodyText = browser.find_element_by_css_selector("body").text
|
||||
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
|
||||
rt.end()
|
||||
|
||||
|
||||
@ -391,7 +404,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
|
||||
if containJudge: # 有判断语句才执行以下操作
|
||||
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
|
||||
try:
|
||||
bodyText = browser.find_element_by_css_selector("body").text
|
||||
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
|
||||
except TimeoutException:
|
||||
Log('time out after 10 seconds when getting body text')
|
||||
recordLog('time out after 10 seconds when getting body text')
|
||||
@ -399,7 +412,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
|
||||
time.sleep(1)
|
||||
Log("bodytext等待1秒")
|
||||
# 再执行一遍
|
||||
bodyText = browser.find_element_by_css_selector("body").text
|
||||
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
|
||||
rt.end()
|
||||
except Exception as e:
|
||||
Log(e)
|
||||
@ -594,18 +607,25 @@ def clean():
|
||||
f_csv.writerow(line)
|
||||
f.close()
|
||||
browser.quit()
|
||||
sys.exit(saveName + '.csv')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
options = Options()
|
||||
exe_path = "chromedriver.exe"
|
||||
if os.path.exists(os.getcwd()+"/ServiceWrapper"):
|
||||
print("Finding chromedriver in ServiceWrapper",
|
||||
os.getcwd()+"/ServiceWrapper")
|
||||
options.binary_location = "ServiceWrapper/Chrome/chrome.exe" # 指定chrome位置
|
||||
exe_path = "ServiceWrapper/Chrome/chromedriver.exe"
|
||||
elif os.path.exists(os.getcwd()+"/Debug"):
|
||||
print("Finding chromedriver in ServiceWrapper",
|
||||
os.getcwd()+"/Debug")
|
||||
options.binary_location = "Debug/Chrome/chrome.exe" # 指定chrome位置
|
||||
exe_path = "Debug/Chrome/chromedriver.exe"
|
||||
elif os.getcwd().find("ExcuteStage") >= 0: # 如果直接执行
|
||||
elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
|
||||
print("Finding chromedriver in ServiceWrapper",
|
||||
os.getcwd()+"/Debug")
|
||||
options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
|
||||
exe_path = "./Chrome/chromedriver.exe"
|
||||
else:
|
||||
@ -620,17 +640,23 @@ if __name__ == '__main__':
|
||||
id = 7 # 设置默认值
|
||||
print("id:", id)
|
||||
if len(sys.argv) > 2:
|
||||
backEndAddress = sys.argv[2]
|
||||
else:
|
||||
backEndAddress = "http://servicewrapper.naibo.wang"
|
||||
if len(sys.argv) > 3:
|
||||
saveName = "task_" + str(id) + "_" + sys.argv[3] # 保存文件的名字
|
||||
saveName = "task_" + str(id) + "_" + sys.argv[2] # 保存文件的名字
|
||||
else:
|
||||
saveName = "task_" + str(id) + "_" + \
|
||||
str(random.randint(0, 999999999)) # 保存文件的名字
|
||||
content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
print("name:", service["name"])
|
||||
print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
|
||||
if len(sys.argv) > 3:
|
||||
backEndAddress = sys.argv[3]
|
||||
else:
|
||||
backEndAddress = "http://servicewrapper.naibo.wang"
|
||||
|
||||
# content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
|
||||
with open("tasks/" + str(id) + ".json", 'r') as f:
|
||||
content = f.read()
|
||||
service = json.loads(content)
|
||||
# print(service)
|
||||
# service = json.loads() # 加载服务信息
|
||||
print("name:", service["name"])
|
||||
procedure = service["graph"] # 程序执行流程
|
||||
links = list(filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
|
||||
OUTPUT = [] # 采集的数据
|
||||
@ -663,4 +689,3 @@ if __name__ == '__main__':
|
||||
f.close()
|
||||
SAVED = True
|
||||
browser.quit()
|
||||
sys.exit(0)
|
BIN
ExecuteStage/all_data.xls
Normal file
BIN
ExecuteStage/all_data.xls
Normal file
Binary file not shown.
151
ExecuteStage/author_crawl.py
Normal file
151
ExecuteStage/author_crawl.py
Normal file
@ -0,0 +1,151 @@
|
||||
# _*_coding:utf-8_*_
|
||||
from hashlib import new
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Process
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
import os
|
||||
import pickle
|
||||
import calendar
|
||||
import re
|
||||
from copy import deepcopy
|
||||
import requests
|
||||
import csv
|
||||
from commandline_config import Config
|
||||
from service_invoke import invokeService
|
||||
|
||||
|
||||
class TimeUtil(object):
    """Helpers for shifting a datetime by a signed whole-hour offset."""

    @classmethod
    def parse_timezone(cls, timezone):
        """Parse a signed whole-hour offset string.

        Only the leading sign and the digits that follow it are read, so
        trailing text after the digits is ignored.

        :param timezone: str, e.g. "+8" or "-3"
        :return: dict with 'symbol' ('+' or '-') and 'offset' (int hours)
        :raises ValueError: if the string does not start with a sign
            followed by at least one digit
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        if result is None:
            # re.match returns None when nothing matches; fail with a clear
            # message instead of an AttributeError on None.
            raise ValueError('dont parse timezone format: %r' % (timezone,))

        return {
            'symbol': result.group('symbol'),
            'offset': int(result.group('offset')),
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Return ``dt`` shifted by the given whole-hour offset.

        The input is presumed to be a UTC datetime (per the original note);
        this is not checked here.

        :param dt: datetime to shift
        :param timezone: offset string understood by parse_timezone
        :return: a new datetime, ``dt`` plus or minus the offset in hours
        :raises ValueError: if ``timezone`` cannot be parsed
        """
        result = cls.parse_timezone(timezone)
        shift = timedelta(hours=result['offset'])

        if result['symbol'] == '+':
            return dt + shift
        elif result['symbol'] == '-':
            return dt - shift
        else:
            # Defensive: only reachable if parse_timezone is overridden to
            # return a symbol other than '+' or '-'.
            raise Exception('dont parse timezone format')
|
||||
|
||||
|
||||
def generate_timestamp():
    """Print and return the current time shifted to UTC+8, as a string.

    :return: ``str()`` of a naive datetime, e.g.
        "2022-10-17 23:35:40.881448"
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    # Aware "now" in UTC, made naive again so the returned string keeps the
    # same format (no "+00:00" suffix) that datetime.utcnow() produced.
    current_time = datetime.now(timezone.utc).replace(tzinfo=None)
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)
|
||||
|
||||
|
||||
def main():
    """Crawl author names for every keyword and record the new ones.

    Steps, in order:
      1. Read keywords from the first column of ./关键词.csv.
      2. Build one search URL per keyword (each terminated by CRLF).
      3. Create a local task from service 0 via invokeService.
      4. Run ServiceWrapper_ExecuteStage_local.py on that task.
      5. Read ./Data/task_<id>_<timestamp>.csv, skip its header row and
         take the first column as author names.
      6. Append the names not yet present in ./author_list.csv to it.
    """
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    keywords = []
    with open("./关键词.csv", encoding='utf-8') as keyword_file:
        for i, line in enumerate(csv.reader(keyword_file)):
            if i < c.test_pages:
                print(line)
            if line:  # blank CSV lines come back as [] and have no column 0
                keywords.append(line[0])

    urls = []
    for keyword in keywords:
        urls.append(
            "https://so.toutiao.com/search?dvpf=pc&source=pagination&filter_vendor=site&keyword=%s&pd=synthesis&filter_vendor=site&action_type=pagination&page_num=0\r\n" % keyword)
        if c.test and len(urls) > c.test_pages:
            break
    urlList = "".join(urls)
    print(urlList)

    # invokeService writes the task locally; it replaces the earlier HTTP
    # call to the remote backEnd/invokeService endpoint.
    authorTaskID = invokeService(
        0, {"loopTimes_Loop_Click_1": c.pages, "urlList_0": urlList})
    print("authorTaskID: " + str(authorTaskID))

    # Spaces are removed and ':' replaced so the timestamp can be used in a
    # file name and as a single command-line argument.
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)

    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(authorTaskID) + ' ' + filename
    exit_status = os.system(command)
    if exit_status != 0:
        # Make a failed crawl visible before the result file is opened.
        print("crawler exited with status:", exit_status)

    file_name = "task_" + str(authorTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)

    with open("./Data/" + file_name, encoding='utf-8') as data_file:
        rows = csv.reader(data_file)
        next(rows, None)  # the first row is skipped (header)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        new_author_list = list(dict.fromkeys(row[0] for row in rows if row))

    with open("./author_list.csv", encoding='utf-8') as author_file:
        author_list = list(dict.fromkeys(
            row[0] for row in csv.reader(author_file) if row))

    print("author_list:", author_list)
    print("new_author_list:", new_author_list)

    known_authors = set(author_list)
    real_new_author_list = [
        author for author in new_author_list if author not in known_authors]
    print("real_new_author_list:", real_new_author_list)

    with open("author_list.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in real_new_author_list:
            writer.writerow([row])


if __name__ == '__main__':
    main()
|
202
ExecuteStage/desc_crawl.py
Normal file
202
ExecuteStage/desc_crawl.py
Normal file
@ -0,0 +1,202 @@
|
||||
# _*_coding:utf-8_*_
|
||||
from hashlib import new
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Process
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
import os
|
||||
import pickle
|
||||
import calendar
|
||||
import re
|
||||
from copy import deepcopy
|
||||
import requests
|
||||
import csv
|
||||
from commandline_config import Config
|
||||
from service_invoke import invokeService
|
||||
|
||||
|
||||
class TimeUtil(object):
    """Helpers for shifting a datetime by a signed whole-hour offset."""

    @classmethod
    def parse_timezone(cls, timezone):
        """Parse a signed whole-hour offset string.

        Only the leading sign and the digits that follow it are read, so
        trailing text after the digits is ignored.

        :param timezone: str, e.g. "+8" or "-3"
        :return: dict with 'symbol' ('+' or '-') and 'offset' (int hours)
        :raises ValueError: if the string does not start with a sign
            followed by at least one digit
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        if result is None:
            # re.match returns None when nothing matches; fail with a clear
            # message instead of an AttributeError on None.
            raise ValueError('dont parse timezone format: %r' % (timezone,))

        return {
            'symbol': result.group('symbol'),
            'offset': int(result.group('offset')),
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """Return ``dt`` shifted by the given whole-hour offset.

        The input is presumed to be a UTC datetime (per the original note);
        this is not checked here.

        :param dt: datetime to shift
        :param timezone: offset string understood by parse_timezone
        :return: a new datetime, ``dt`` plus or minus the offset in hours
        :raises ValueError: if ``timezone`` cannot be parsed
        """
        result = cls.parse_timezone(timezone)
        shift = timedelta(hours=result['offset'])

        if result['symbol'] == '+':
            return dt + shift
        elif result['symbol'] == '-':
            return dt - shift
        else:
            # Defensive: only reachable if parse_timezone is overridden to
            # return a symbol other than '+' or '-'.
            raise Exception('dont parse timezone format')
|
||||
|
||||
|
||||
def generate_timestamp():
    """Print and return the current time shifted to UTC+8, as a string.

    :return: ``str()`` of a naive datetime, e.g.
        "2022-10-17 23:35:40.881448"
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    # Aware "now" in UTC, made naive again so the returned string keeps the
    # same format (no "+00:00" suffix) that datetime.utcnow() produced.
    current_time = datetime.now(timezone.utc).replace(tzinfo=None)
    convert_now = TimeUtil.convert_timezone(current_time, '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)
|
||||
|
||||
|
||||
def main():
    """Crawl profile details for authors that have no row in raw_data.csv.

    Steps, in order:
      1. Read the names already collected (fifth column of ./raw_data.csv).
      2. Take every name in ./author_list.csv that is not collected yet.
      3. Build one user-search URL per name and create a local task from
         service 1 via invokeService.
      4. Run ServiceWrapper_ExecuteStage_local.py on that task.
      5. Read ./Data/task_<id>_<timestamp>.csv, drop consecutive duplicates,
         and split the combined statistics text into three fields.
      6. Append the rows (plus an empty placeholder row for every name that
         produced no result) to raw_data.csv.
      7. Export the whole raw_data.csv to all_data.xls.
    """
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    with open("./raw_data.csv", encoding='utf-8') as raw_file:
        author_list = [line[4] for line in csv.reader(raw_file)]
    known_authors = set(author_list)  # built once for O(1) lookups

    keywords = []
    with open("./author_list.csv", encoding='utf-8') as author_file:
        for i, line in enumerate(csv.reader(author_file), start=1):
            if line[0] not in known_authors:
                keywords.append(line[0])
            else:
                # print() does not interpolate "%s" itself; format explicitly.
                print("Will not append keyword %s" % line[0])
            if c.test and i > c.test_pages * 100:
                break

    urls = []
    for keyword in keywords:
        urls.append(
            "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword)
        if c.test and len(urls) > c.test_pages:
            break
    urlList = "".join(urls)
    print(urlList)

    # invokeService writes the task locally; it replaces the earlier HTTP
    # call to the remote backEnd/invokeService endpoint.
    descTaskID = invokeService(
        1, {"urlList_0": urlList})
    print("descTaskID: " + str(descTaskID))

    # Spaces are removed and ':' replaced so the timestamp can be used in a
    # file name and as a single command-line argument.
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)

    command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
        str(descTaskID) + ' ' + filename
    exit_status = os.system(command)
    if exit_status != 0:
        # Make a failed crawl visible before the result file is opened.
        print("crawler exited with status:", exit_status)

    file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)

    with open("./Data/" + file_name, encoding='utf-8') as data_file:
        rows = csv.reader(data_file)
        next(rows, None)  # the first row is skipped (header)
        new_descTaskID = list(rows)

    # NOTE(review): the column layout is presumably [intro, stats, name,
    # link], following the service's output parameters - confirm.
    after_remove_duplicate = []
    for index, row in enumerate(new_descTaskID):
        try:
            # Skip a row whose third column repeats the previous raw row's.
            if index > 0 and row[2] == new_descTaskID[index - 1][2]:
                continue
            if row[2] != "":
                stats = row[1]
                zan = stats.split("获赞")[0]
                fans = stats.split("粉丝")[0].split("获赞")[1]
                follow = stats.split("关注")[0].split("粉丝")[1]
                after_remove_duplicate.append(
                    [row[0], zan, fans, follow, row[2], row[3]])
        except IndexError:
            # Best effort: a short row, or stats text missing one of the
            # markers, is dropped rather than aborting the whole run.
            continue

    print("after_remove_duplicate", after_remove_duplicate)

    all_collected = [author[4] for author in after_remove_duplicate]
    print("all_collected:", all_collected)

    collected_names = set(all_collected)
    for keyword in keywords:
        if keyword not in collected_names:
            print("keyword not collected:", keyword)
            # Placeholder row so this name counts as processed next time.
            after_remove_duplicate.append(['', '', '', '', keyword, ''])

    new_descTaskID = after_remove_duplicate

    print("new_descTaskID:", new_descTaskID)

    with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in new_descTaskID:
            writer.writerow(row)

    import xlwt

    with open("./raw_data.csv", encoding='utf-8') as raw_file:
        all_data = list(csv.reader(raw_file))

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet")

    for i, row in enumerate(all_data):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)

    workbook.save("all_data.xls")


if __name__ == "__main__":
    main()
|
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 17 KiB |
BIN
ExecuteStage/list.xlsx
Normal file
BIN
ExecuteStage/list.xlsx
Normal file
Binary file not shown.
7
ExecuteStage/main.py
Normal file
7
ExecuteStage/main.py
Normal file
@ -0,0 +1,7 @@
|
||||
import author_crawl
|
||||
import desc_crawl
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Order matters: author_crawl.main() appends to author_list.csv, which
    # desc_crawl.main() then reads.
    for stage in (author_crawl, desc_crawl):
        stage.main()
|
42
ExecuteStage/service_invoke.py
Normal file
42
ExecuteStage/service_invoke.py
Normal file
@ -0,0 +1,42 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from base64 import encode
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def queryService(id):
    """Load the service definition stored in services/<id>.json.

    :param id: integer service id (formatted into the file name with %d)
    :return: the parsed JSON content of that file
    """
    path = "services/%d.json" % id
    with open(path, "r", encoding='utf-8') as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def invokeService(id, data):
    """Create a task file from service ``id`` with the given inputs applied.

    For each entry of ``data`` whose key equals the name of one of the
    service's input parameters, the value is written into the graph node
    that the parameter points to (``graph[nodeId]``):

      * option 1 -> ``parameters["links"]``
      * option 4 -> ``parameters["value"]``
      * option 8 with loopType 0 -> ``parameters["exitCount"]`` (as int)
      * any other option 8 -> ``parameters["textList"]``

    :param id: service id passed to queryService
    :param data: dict mapping input-parameter names to their values; a
        "urlList_0" entry also replaces the service-level "links"
    :return: int id of the new task, saved as tasks/<id>.json
    """
    service = queryService(id)
    if "urlList_0" in data:
        service["links"] = data["urlList_0"]

    for key, value in data.items():
        for parameter in service["inputParameters"]:
            if key != parameter["name"]:
                continue
            # The parameter can be applied: patch the node it refers to.
            node = service["graph"][int(parameter["nodeId"])]
            if node["option"] == 1:
                node["parameters"]["links"] = value
            elif node["option"] == 4:
                node["parameters"]["value"] = value
            elif node["option"] == 8 and node["parameters"]["loopType"] == 0:
                node["parameters"]["exitCount"] = int(value)
            elif node["option"] == 8:
                node["parameters"]["textList"] = value
            break  # a key is applied to the first matching parameter only

    os.makedirs("tasks", exist_ok=True)
    count = len(os.listdir("tasks")) + 1
    # The directory size is only a starting guess: if files were deleted or
    # unrelated files exist, step forward so no existing task is overwritten.
    while os.path.exists("tasks/%d.json" % count):
        count += 1
    service["id"] = count  # the task carries its own id
    print(count)
    # NOTE(review): no explicit encoding here, so non-ASCII text is written
    # in the locale's default encoding; whatever reads tasks/*.json must use
    # the same - confirm before changing.
    with open("tasks/%d.json" % count, "w", ) as f:
        s = json.dumps(service, ensure_ascii=False)
        f.write(s)
    return count
|
218
ExecuteStage/services/0.json
Normal file
218
ExecuteStage/services/0.json
Normal file
@ -0,0 +1,218 @@
|
||||
{
|
||||
"id": 6,
|
||||
"name": "toutiao_authors",
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"containJudge": false,
|
||||
"desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"inputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "urlList_0",
|
||||
"nodeId": 2,
|
||||
"nodeName": "Open Page",
|
||||
"value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"desc": "要采集的网址列表,多行以\\n分开",
|
||||
"type": "string",
|
||||
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "loopTimes_Loop_Click_1",
|
||||
"nodeId": 5,
|
||||
"nodeName": "Loop_Click",
|
||||
"desc": "循环Loop_Click执行的次数(0代表无限循环)",
|
||||
"type": "int",
|
||||
"exampleValue": 5,
|
||||
"value": 5
|
||||
}
|
||||
],
|
||||
"outputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "参数1_链接文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"
|
||||
}
|
||||
],
|
||||
"graph": [
|
||||
{
|
||||
"index": 0,
|
||||
"id": 0,
|
||||
"parentId": 0,
|
||||
"type": -1,
|
||||
"option": 0,
|
||||
"title": "root",
|
||||
"sequence": [2, 5],
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 0,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0
|
||||
},
|
||||
"isInLoop": false
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 1,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 1,
|
||||
"title": "Open Page",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"url": "https://www.jd.com",
|
||||
"links": "https://www.jd.com",
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"index": 2,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 1,
|
||||
"title": "Open Page",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 5,
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0",
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"index": 3,
|
||||
"parentId": 2,
|
||||
"type": 1,
|
||||
"option": 8,
|
||||
"title": "Loop",
|
||||
"sequence": [4],
|
||||
"isInLoop": true,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": false,
|
||||
"xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"loopType": 1,
|
||||
"pathList": "",
|
||||
"textList": "",
|
||||
"exitCount": 0,
|
||||
"historyWait": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"index": 4,
|
||||
"parentId": 3,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": true,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 1,
|
||||
"contentType": 0,
|
||||
"relative": true,
|
||||
"name": "参数1_链接文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "经常戴耳机危害 - 语音科普 - 博禾医生"
|
||||
},
|
||||
{
|
||||
"num": 1,
|
||||
"value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"
|
||||
},
|
||||
{
|
||||
"num": 2,
|
||||
"value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"
|
||||
},
|
||||
{
|
||||
"num": 3,
|
||||
"value": "长时间戴耳机听歌或致耳聋-名医在线网"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
}
|
||||
],
|
||||
"loopType": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"index": 5,
|
||||
"parentId": 0,
|
||||
"type": 1,
|
||||
"option": 8,
|
||||
"title": "Loop_Click",
|
||||
"sequence": [3, 6],
|
||||
"isInLoop": false,
|
||||
"position": 1,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": false,
|
||||
"xpath": "//span[contains(text(), \"下一页\")]/../..",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"loopType": 0,
|
||||
"pathList": "",
|
||||
"textList": "",
|
||||
"exitCount": 5,
|
||||
"historyWait": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"index": 6,
|
||||
"parentId": 2,
|
||||
"type": 0,
|
||||
"option": 2,
|
||||
"title": "Click Element",
|
||||
"sequence": [],
|
||||
"isInLoop": true,
|
||||
"position": 1,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": true,
|
||||
"xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"paras": [],
|
||||
"loopType": 0
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
278
ExecuteStage/services/1 - Copy.json
Normal file
278
ExecuteStage/services/1 - Copy.json
Normal file
@ -0,0 +1,278 @@
|
||||
{
|
||||
"id": 7,
|
||||
"name": "toutiao_author_desc",
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"containJudge": false,
|
||||
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"inputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "urlList_0",
|
||||
"nodeId": 1,
|
||||
"nodeName": "Open Page",
|
||||
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"desc": "要采集的网址列表,多行以\\n分开",
|
||||
"type": "string",
|
||||
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
|
||||
}
|
||||
],
|
||||
"outputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "参数2_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "参数3_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "63.3万获赞"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "小荷医典"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "参数6_链接地址",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
|
||||
}
|
||||
],
|
||||
"graph": [
|
||||
{
|
||||
"index": 0,
|
||||
"id": 0,
|
||||
"parentId": 0,
|
||||
"type": -1,
|
||||
"option": 0,
|
||||
"title": "root",
|
||||
"sequence": [1, 6, 2, 5],
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 0,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0
|
||||
},
|
||||
"isInLoop": false
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"index": 1,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 1,
|
||||
"title": "Open Page",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 1,
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"index": 2,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 2,
|
||||
"title": "Click Element",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 2,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": false,
|
||||
"xpath": "//a[contains(@class, 'flex-row')][1]",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"paras": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 3,
|
||||
"parentId": 0,
|
||||
"type": 1,
|
||||
"option": 8,
|
||||
"title": "Loop",
|
||||
"sequence": [4],
|
||||
"isInLoop": false,
|
||||
"position": 2,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"loopType": 1,
|
||||
"pathList": "",
|
||||
"textList": "",
|
||||
"exitCount": 0,
|
||||
"historyWait": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 4,
|
||||
"parentId": 3,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": true,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": true,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "63.3万获赞"
|
||||
},
|
||||
{
|
||||
"num": 1,
|
||||
"value": "10.3万粉丝"
|
||||
},
|
||||
{
|
||||
"num": 2,
|
||||
"value": "5关注"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
}
|
||||
],
|
||||
"loopType": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"index": 5,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 3,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数2_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//p[contains(@class, 'user-desc')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
},
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数3_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "63.3万获赞"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
},
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "小荷医典"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"index": 6,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 1,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": 0,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 2,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数6_链接地址",
|
||||
"desc": "",
|
||||
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
292
ExecuteStage/services/1.json
Normal file
292
ExecuteStage/services/1.json
Normal file
@ -0,0 +1,292 @@
|
||||
{
|
||||
"id": 7,
|
||||
"name": "toutiao_author_desc",
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"containJudge": false,
|
||||
"desc": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"inputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "urlList_0",
|
||||
"nodeId": 1,
|
||||
"nodeName": "Open Page",
|
||||
"value": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"desc": "要采集的网址列表,多行以\\n分开",
|
||||
"type": "string",
|
||||
"exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media"
|
||||
}
|
||||
],
|
||||
"outputParameters": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "参数2_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "参数3_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "63.3万获赞"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "小荷医典"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "参数1_链接地址",
|
||||
"desc": "",
|
||||
"type": "string",
|
||||
"exampleValue": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
|
||||
}
|
||||
],
|
||||
"graph": [
|
||||
{
|
||||
"index": 0,
|
||||
"id": 0,
|
||||
"parentId": 0,
|
||||
"type": -1,
|
||||
"option": 0,
|
||||
"title": "root",
|
||||
"sequence": [1, 2, 5],
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 0,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0
|
||||
},
|
||||
"isInLoop": false
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"index": 1,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 1,
|
||||
"title": "Open Page",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 1,
|
||||
"url": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=小荷医典&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=国家反诈中心&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=光明网&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media",
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"index": 2,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 2,
|
||||
"title": "Click Element",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 1,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": -1,
|
||||
"useLoop": false,
|
||||
"xpath": "//a[contains(@class, 'flex-row')][1]",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"paras": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 3,
|
||||
"parentId": 0,
|
||||
"type": 1,
|
||||
"option": 8,
|
||||
"title": "Loop",
|
||||
"sequence": [4],
|
||||
"isInLoop": false,
|
||||
"position": 2,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/button",
|
||||
"wait": 0,
|
||||
"scrollType": 0,
|
||||
"scrollCount": 0,
|
||||
"loopType": 1,
|
||||
"pathList": "",
|
||||
"textList": "",
|
||||
"exitCount": 0,
|
||||
"historyWait": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 4,
|
||||
"parentId": 3,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": true,
|
||||
"position": 0,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": true,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "63.3万获赞"
|
||||
},
|
||||
{
|
||||
"num": 1,
|
||||
"value": "10.3万粉丝"
|
||||
},
|
||||
{
|
||||
"num": 2,
|
||||
"value": "5关注"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
}
|
||||
],
|
||||
"loopType": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"index": 5,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 2,
|
||||
"parameters": {
|
||||
"history": 1,
|
||||
"tabIndex": 1,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数2_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//p[contains(@class, 'user-desc')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "简介:关注小荷医典,获取“权威、科学、实用、易懂”的医学信息"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
},
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数3_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//div[contains(@class, 'relation-stat')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "63.3万获赞"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
},
|
||||
{
|
||||
"nodeType": 0,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数1_文本",
|
||||
"desc": "",
|
||||
"relativeXpath": "//div[contains(@class, 'detail')]//span[contains(@class, 'name')]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "小荷医典"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"nodeType": 2,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数1_链接地址",
|
||||
"desc": "",
|
||||
"relativeXpath": "//a[@class=\"avatar\"]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "https://www.toutiao.com/c/user/token/MS4wLjABAAAAMpLV_1BmiyKp0yLcLZb1xJjVxmOnwObqydIzTC2ngoQ/?source=profile"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": -1,
|
||||
"index": 6,
|
||||
"parentId": 0,
|
||||
"type": 0,
|
||||
"option": 3,
|
||||
"title": "Extract Data",
|
||||
"sequence": [],
|
||||
"isInLoop": false,
|
||||
"position": 1,
|
||||
"parameters": {
|
||||
"history": 5,
|
||||
"tabIndex": 0,
|
||||
"useLoop": false,
|
||||
"xpath": "",
|
||||
"wait": 0,
|
||||
"paras": [
|
||||
{
|
||||
"nodeType": 2,
|
||||
"contentType": 0,
|
||||
"relative": false,
|
||||
"name": "参数6_链接地址",
|
||||
"desc": "",
|
||||
"relativeXpath": "//a[contains(@class, 'flex-row')][1]",
|
||||
"exampleValues": [
|
||||
{
|
||||
"num": 0,
|
||||
"value": "https://so.toutiao.com/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fc%2Fuser%2F95347840570%2F&aid=4916&jtoken=804fb410ca0c16fb0980b0a082d0c707d7ae7fb539575218f9e98534f7a9a598c00b8d749a4eb8859d5df7b1b0a122356b0b56810577c9f74a5faf9d6c18f340"
|
||||
}
|
||||
],
|
||||
"default": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
1
ExecuteStage/tasks/0.json
Normal file
1
ExecuteStage/tasks/0.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/10.json
Normal file
1
ExecuteStage/tasks/10.json
Normal file
@ -0,0 +1 @@
|
||||
{'id': 12, 'name': 'toutiao_authors', 'url': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'containJudge': False, 'desc': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0', 'inputParameters': [{'id': 0, 'name': 'urlList_0', 'nodeId': 2, 'nodeName': 'Open Page', 'value': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期 戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0', 'desc': '要采集的网址列表,多行以\\n分开', 'type': 'string', 'exampleValue': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成 耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0'}, {'id': 1, 'name': 'loopTimes_Loop_Click_1', 'nodeId': 5, 'nodeName': 'Loop_Click', 'desc': '循环Loop_Click执行的 次数(0代表无限循环)', 'type': 'int', 'exampleValue': 5, 'value': 5}], 'outputParameters': [{'id': 0, 'name': '参数1_链接文本', 'desc': '', 'type': 'string', 'exampleValue': '经常戴耳机危害 - 语音科普 - 博禾医生'}], 'graph': [{'index': 0, 'id': 0, 'parentId': 0, 'type': -1, 'option': 0, 'title': 'root', 'sequence': [2, 5], 'parameters': {'history': 1, 'tabIndex': 0, 'useLoop': False, 'xpath': '', 'wait': 0}, 'isInLoop': False}, {'id': -1, 'index': 1, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 0, 'url': 'https://www.jd.com', 'links': 'https://www.jd.com', 'scrollType': 0, 'scrollCount': 0}}, {'id': 1, 'index': 2, 'parentId': 0, 'type': 0, 'option': 1, 'title': 'Open 
Page', 'sequence': [], 'isInLoop': False, 'position': 0, 'parameters': {'useLoop': False, 'xpath': '', 'wait': 5, 'url': 'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴 耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0', 'links': 'https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n', 'scrollType': 0, 'scrollCount': 0}}, {'id': 3, 'index': 3, 'parentId': 2, 'type': 1, 'option': 8, 'title': 'Loop', 'sequence': [4], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//div[contains(@class, "cs-source")]//span[contains(@class, \'text-ellipsis\')][1]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 1, 'pathList': '', 'textList': '', 'exitCount': 0, 'historyWait': 2}}, {'id': 5, 'index': 4, 'parentId': 3, 'type': 0, 'option': 3, 'title': 'Extract Data', 'sequence': [], 'isInLoop': True, 'position': 0, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '', 'wait': 0, 'paras': [{'nodeType': 1, 'contentType': 0, 'relative': True, 'name': '参数1_链接文本', 'desc': '', 'relativeXpath': '', 'exampleValues': [{'num': 0, 'value': '经常 戴耳机危害 - 语音科普 - 博禾医生'}, {'num': 1, 'value': '经常戴耳机有什么危害 - 博禾微视 - 博禾医生'}, {'num': 2, 'value': '经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生'}, {'num': 3, 'value': '长时间戴耳机听歌或致耳聋-名医在线网'}], 'default': ''}], 'loopType': 1}}, {'id': 2, 'index': 5, 'parentId': 0, 'type': 1, 'option': 8, 'title': 'Loop_Click', 'sequence': [3, 6], 'isInLoop': False, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': False, 'xpath': '//span[contains(text(), "下一页")]/../..', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'loopType': 0, 'pathList': '', 'textList': '', 'exitCount': 5, 'historyWait': 2}}, 
{'id': 4, 'index': 6, 'parentId': 2, 'type': 0, 'option': 2, 'title': 'Click Element', 'sequence': [], 'isInLoop': True, 'position': 1, 'parameters': {'history': 5, 'tabIndex': -1, 'useLoop': True, 'xpath': '//*[@id="s-dom-f0607f20"]/div[1]/div[1]/a[7]', 'wait': 0, 'scrollType': 0, 'scrollCount': 0, 'paras': [], 'loopType': 0}}]}
|
1
ExecuteStage/tasks/11.json
Normal file
1
ExecuteStage/tasks/11.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/12.json
Normal file
1
ExecuteStage/tasks/12.json
Normal file
@ -0,0 +1 @@
|
||||
{"id": 12, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "desc": "要采集的网址列表,多行以\\n分开", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "循环Loop_Click执行的次数(0代表无限循环)", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "参数1_链接文本", "desc": "", "type": "string", "exampleValue": "经常戴耳机危害 - 语音科普 - 博禾医生"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, "xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open 
Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=长期戴耳机听音乐会怎样?或许会造成耳聋,且很难恢复&pd=synthesis&action_type=pagination&page_num=0", "links": "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=www.caifujiao.com&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\nhttps://so.toutiao.com/search?dvpf=pc&source=input&keyword=博客园&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "参数1_链接文本", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "经常戴耳机危害 - 语音科普 - 博禾医生"}, {"num": 1, "value": "经常戴耳机有什么危害 - 博禾微视 - 博禾医生"}, {"num": 2, "value": "经常戴耳机耳朵会不会聋 - 专家文章 - 博禾医生"}, {"num": 3, "value": "长时间戴耳机听歌或致耳聋-名医在线网"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"下一页\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, 
{"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}
|
1
ExecuteStage/tasks/13.json
Normal file
1
ExecuteStage/tasks/13.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/14.json
Normal file
1
ExecuteStage/tasks/14.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/15.json
Normal file
1
ExecuteStage/tasks/15.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/2.json
Normal file
1
ExecuteStage/tasks/2.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/3.json
Normal file
1
ExecuteStage/tasks/3.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/4.json
Normal file
1
ExecuteStage/tasks/4.json
Normal file
@ -0,0 +1 @@
|
||||
{"id": 4, "name": "toutiao_authors", "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "containJudge": false, "desc": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "inputParameters": [{"id": 0, "name": "urlList_0", "nodeId": 2, "nodeName": "Open Page", "value": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "desc": "\u8981\u91c7\u96c6\u7684\u7f51\u5740\u5217\u8868,\u591a\u884c\u4ee5\\n\u5206\u5f00", "type": "string", "exampleValue": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0"}, {"id": 1, "name": "loopTimes_Loop_Click_1", "nodeId": 5, "nodeName": "Loop_Click", "desc": "\u5faa\u73afLoop_Click\u6267\u884c\u7684\u6b21\u6570\uff080\u4ee3\u8868\u65e0\u9650\u5faa\u73af\uff09", "type": "int", "exampleValue": 5, "value": 5}], "outputParameters": [{"id": 0, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "type": "string", "exampleValue": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}], "graph": [{"index": 0, "id": 0, "parentId": 0, "type": -1, "option": 0, "title": "root", "sequence": [2, 5], "parameters": {"history": 1, "tabIndex": 0, "useLoop": false, 
"xpath": "", "wait": 0}, "isInLoop": false}, {"id": -1, "index": 1, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 0, "url": "https://www.jd.com", "links": "https://www.jd.com", "scrollType": 0, "scrollCount": 0}}, {"id": 1, "index": 2, "parentId": 0, "type": 0, "option": 1, "title": "Open Page", "sequence": [], "isInLoop": false, "position": 0, "parameters": {"useLoop": false, "xpath": "", "wait": 5, "url": "https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=\u957f\u671f\u6234\u8033\u673a\u542c\u97f3\u4e50\u4f1a\u600e\u6837?\u6216\u8bb8\u4f1a\u9020\u6210\u8033\u804b\uff0c\u4e14\u5f88\u96be\u6062\u590d&pd=synthesis&action_type=pagination&page_num=0", "links": "", "scrollType": 0, "scrollCount": 0}}, {"id": 3, "index": 3, "parentId": 2, "type": 1, "option": 8, "title": "Loop", "sequence": [4], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//div[contains(@class, \"cs-source\")]//span[contains(@class, 'text-ellipsis')][1]", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 1, "pathList": "", "textList": "", "exitCount": 0, "historyWait": 2}}, {"id": 5, "index": 4, "parentId": 3, "type": 0, "option": 3, "title": "Extract Data", "sequence": [], "isInLoop": true, "position": 0, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "", "wait": 0, "paras": [{"nodeType": 1, "contentType": 0, "relative": true, "name": "\u53c2\u65701_\u94fe\u63a5\u6587\u672c", "desc": "", "relativeXpath": "", "exampleValues": [{"num": 0, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u5371\u5bb3 - \u8bed\u97f3\u79d1\u666e - \u535a\u79be\u533b\u751f"}, {"num": 1, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u6709\u4ec0\u4e48\u5371\u5bb3 - \u535a\u79be\u5fae\u89c6 - \u535a\u79be\u533b\u751f"}, {"num": 2, "value": "\u7ecf\u5e38\u6234\u8033\u673a\u8033\u6735\u4f1a\u4e0d\u4f1a\u804b - 
\u4e13\u5bb6\u6587\u7ae0 - \u535a\u79be\u533b\u751f"}, {"num": 3, "value": "\u957f\u65f6\u95f4\u6234\u8033\u673a\u542c\u6b4c\u6216\u81f4\u8033\u804b-\u540d\u533b\u5728\u7ebf\u7f51"}], "default": ""}], "loopType": 1}}, {"id": 2, "index": 5, "parentId": 0, "type": 1, "option": 8, "title": "Loop_Click", "sequence": [3, 6], "isInLoop": false, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": false, "xpath": "//span[contains(text(), \"\u4e0b\u4e00\u9875\")]/../..", "wait": 0, "scrollType": 0, "scrollCount": 0, "loopType": 0, "pathList": "", "textList": "", "exitCount": 5, "historyWait": 2}}, {"id": 4, "index": 6, "parentId": 2, "type": 0, "option": 2, "title": "Click Element", "sequence": [], "isInLoop": true, "position": 1, "parameters": {"history": 5, "tabIndex": -1, "useLoop": true, "xpath": "//*[@id=\"s-dom-f0607f20\"]/div[1]/div[1]/a[7]", "wait": 0, "scrollType": 0, "scrollCount": 0, "paras": [], "loopType": 0}}]}
|
218
ExecuteStage/tasks/5.json
Normal file
218
ExecuteStage/tasks/5.json
Normal file
File diff suppressed because one or more lines are too long
0
ExecuteStage/tasks/6.json
Normal file
0
ExecuteStage/tasks/6.json
Normal file
218
ExecuteStage/tasks/7.json
Normal file
218
ExecuteStage/tasks/7.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/8.json
Normal file
1
ExecuteStage/tasks/8.json
Normal file
File diff suppressed because one or more lines are too long
1
ExecuteStage/tasks/9.json
Normal file
1
ExecuteStage/tasks/9.json
Normal file
File diff suppressed because one or more lines are too long
6
ExecuteStage/tianyancha.py
Normal file
6
ExecuteStage/tianyancha.py
Normal file
@ -0,0 +1,6 @@
|
||||
import pandas as pd
|
||||
|
||||
if __name__ == "__main__":
|
||||
df = pd.read_excel('list.xlsx', "sheet2")
|
||||
data = df.values
|
||||
print(len(data), data[0])
|
@ -1,4 +1,3 @@
|
||||
|
||||
from django.http import HttpResponse
|
||||
import pymongo
|
||||
import json
|
||||
|
Loading…
x
Reference in New Issue
Block a user