mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-22 08:27:27 +08:00
911 lines
54 KiB
Plaintext
911 lines
54 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 服务包装手动版工具执行阶段"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"pycharm": {
|
||
"name": "#%% md\n"
|
||
}
|
||
},
|
||
"source": [
|
||
"## 导入包"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# -*- coding: utf-8 -*-\n",
|
||
"import json\n",
|
||
"import re\n",
|
||
"from urllib import parse\n",
|
||
"import base64\n",
|
||
"import hashlib\n",
|
||
"import time\n",
|
||
"import requests\n",
|
||
"from selenium.webdriver.common.keys import Keys\n",
|
||
"from selenium.webdriver.common.action_chains import ActionChains\n",
|
||
"from selenium import webdriver\n",
|
||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||
"from selenium.webdriver.common.by import By\n",
|
||
"from selenium.common.exceptions import NoSuchElementException\n",
|
||
"from selenium.common.exceptions import TimeoutException\n",
|
||
"from selenium.common.exceptions import StaleElementReferenceException\n",
|
||
"import random\n",
|
||
"import numpy\n",
|
||
"import csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"pycharm": {
|
||
"name": "#%% md\n"
|
||
}
|
||
},
|
||
"source": [
|
||
"## 核心函数处理部分"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 记录log\n",
|
||
"def recordLog(str=\"\"):\n",
|
||
" global log\n",
|
||
" log = log + str + \"\\n\"\n",
|
||
" \n",
|
||
"#控制台打印log函数\n",
|
||
"def Log(text,text2=\"\"):\n",
|
||
" switch = False\n",
|
||
" if switch:\n",
|
||
" print(text,text2)\n",
|
||
"\n",
|
||
"# 执行节点关键函数部分\n",
|
||
"def excuteNode(nodeId, loopValue=\"\", clickPath=\"\", index=0):\n",
|
||
" node = procedure[nodeId]\n",
|
||
" WebDriverWait(browser, 10).until\n",
|
||
" (EC.visibility_of_element_located((By.XPATH, node[\"parameters\"][\"xpath\"]))) # 等待元素出现才进行操作,10秒内未出现则报错\n",
|
||
"\n",
|
||
" # 根据不同选项执行不同操作\n",
|
||
" if node[\"option\"] == 0 or node[\"option\"] == 10: # root操作,条件分支操作\n",
|
||
" for i in node[\"sequence\"]: # 从根节点开始向下读取\n",
|
||
" excuteNode(i, loopValue)\n",
|
||
" elif node[\"option\"] == 1: # 打开网页操作\n",
|
||
" recordLog(\"openPage\")\n",
|
||
" openPage(node[\"parameters\"], loopValue)\n",
|
||
" elif node[\"option\"] == 2: # 点击元素\n",
|
||
" recordLog(\"Click\")\n",
|
||
" clickElement(node[\"parameters\"], loopValue, clickPath, index)\n",
|
||
" elif node[\"option\"] == 3: # 提取数据\n",
|
||
" recordLog(\"getData\")\n",
|
||
" getData(node[\"parameters\"], loopValue, node[\"isInLoop\"])\n",
|
||
" elif node[\"option\"] == 4: # 输入文字\n",
|
||
" inputInfo(node[\"parameters\"], loopValue)\n",
|
||
" elif node[\"option\"] == 8: # 循环\n",
|
||
" recordLog(\"loop\")\n",
|
||
" loopExcute(node, loopValue) # 执行循环\n",
|
||
" elif node[\"option\"] == 9: # 条件分支\n",
|
||
" recordLog(\"judge\")\n",
|
||
" judgeExcute(node, loopValue)\n",
|
||
"\n",
|
||
" # 执行完之后进行等待\n",
|
||
" if node[\"option\"] != 0:\n",
|
||
" waitTime = 0.01 # 默认等待0.01秒\n",
|
||
" if node[\"parameters\"][\"wait\"] > 1:\n",
|
||
" waitTime = node[\"parameters\"][\"wait\"]\n",
|
||
" time.sleep(waitTime)\n",
|
||
" Log(\"Node执行完后等待:\",waitTime)\n",
|
||
"\n",
|
||
"\n",
|
||
"# 对判断条件的处理\n",
|
||
"def judgeExcute(node, loopElement):\n",
|
||
" global bodyText # 引入bodyText\n",
|
||
" excuteBranchId = 0 # 要执行的BranchId\n",
|
||
" for i in node[\"sequence\"]:\n",
|
||
" cnode = procedure[i] # 获得条件分支\n",
|
||
" tType = int(cnode[\"parameters\"][\"class\"]) # 获得判断条件类型\n",
|
||
" if tType == 0: # 什么条件都没有\n",
|
||
" excuteBranchId = i\n",
|
||
" break\n",
|
||
" elif tType == 1: # 当前页面包含文本\n",
|
||
" try:\n",
|
||
" if bodyText.find(cnode[\"parameters\"][\"value\"]) >= 0:\n",
|
||
" excuteBranchId = i\n",
|
||
" break\n",
|
||
" except: # 找不到元素下一个条件\n",
|
||
" continue\n",
|
||
" elif tType == 2: # 当前页面包含元素\n",
|
||
" try:\n",
|
||
" if browser.find_element_by_xpath(cnode[\"parameters\"][\"value\"]):\n",
|
||
" excuteBranchId = i\n",
|
||
" break\n",
|
||
" except: # 找不到元素或者xpath写错了,下一个条件\n",
|
||
" continue\n",
|
||
" elif tType == 3: # 当前循环元素包括文本\n",
|
||
" try:\n",
|
||
" if loopElement.text.find(cnode[\"parameters\"][\"value\"]) >= 0:\n",
|
||
" excuteBranchId = i\n",
|
||
" break\n",
|
||
" except: # 找不到元素或者xpath写错了,下一个条件\n",
|
||
" continue\n",
|
||
" elif tType == 4: # 当前循环元素包括元素\n",
|
||
" try:\n",
|
||
" if loopElement.find_element_by_xpath(cnode[\"parameters\"][\"value\"][1:]):\n",
|
||
" excuteBranchId = i\n",
|
||
" break\n",
|
||
" except: # 找不到元素或者xpath写错了,下一个条件\n",
|
||
" continue\n",
|
||
" excuteNode(excuteBranchId, loopElement)\n",
|
||
"\n",
|
||
"\n",
|
||
"# 对循环的处理\n",
|
||
"def loopExcute(node, loopValue):\n",
|
||
" time.sleep(0.1) # 第一次执行循环的时候强制等待0.1秒\n",
|
||
" Log(\"循环执行前等待0.1秒\")\n",
|
||
" global history\n",
|
||
" thisHandle = browser.current_window_handle # 记录本次循环内的标签页的ID\n",
|
||
" thisHistoryLength = browser.execute_script('return history.length') # 记录本次循环内的history的length\n",
|
||
"\n",
|
||
" if int(node[\"parameters\"][\"loopType\"]) == 0: # 单个元素循环\n",
|
||
" # 无跳转标签页操作\n",
|
||
" count = 0 # 执行次数\n",
|
||
" while True: # do while循环\n",
|
||
" try:\n",
|
||
" element = browser.find_element_by_xpath(node[\"parameters\"][\"xpath\"])\n",
|
||
" for i in node[\"sequence\"]: # 挨个执行操作\n",
|
||
" excuteNode(i, element, node[\"parameters\"][\"xpath\"])\n",
|
||
" Log(\"click: \", node[\"parameters\"][\"xpath\"])\n",
|
||
" recordLog(\"click:\" + node[\"parameters\"][\"xpath\"])\n",
|
||
" except NoSuchElementException:\n",
|
||
" break # 如果找不到元素,退出循环\n",
|
||
" except Exception as e:\n",
|
||
" raise\n",
|
||
" count = count + 1\n",
|
||
" Log(\"页数:\", count)\n",
|
||
" recordLog(\"页数:\" + str(count))\n",
|
||
" if node[\"parameters\"][\"exitCount\"] == count: # 如果达到设置的退出循环条件的话\n",
|
||
" break\n",
|
||
" elif int(node[\"parameters\"][\"loopType\"]) == 1: # 不固定元素列表\n",
|
||
" try:\n",
|
||
" elements = browser.find_elements_by_xpath(node[\"parameters\"][\"xpath\"])\n",
|
||
" for index in range(len(elements)):\n",
|
||
" for i in node[\"sequence\"]: # 挨个执行操作\n",
|
||
" excuteNode(i, elements[index], node[\"parameters\"][\"xpath\"], index)\n",
|
||
" if browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化\n",
|
||
" while True: # 一直关闭窗口直到当前标签页\n",
|
||
" browser.close() # 关闭使用完的标签页\n",
|
||
" browser.switch_to.window(browser.window_handles[-1])\n",
|
||
" if browser.current_window_handle == thisHandle:\n",
|
||
" break\n",
|
||
" if history[\"index\"] != thisHistoryLength and history[\"handle\"] == browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断\n",
|
||
" difference = thisHistoryLength - history[\"index\"] # 计算历史记录变化差值\n",
|
||
" browser.execute_script('history.go(' + str(difference) + ')') # 回退历史记录\n",
|
||
" if node[\"parameters\"][\"historyWait\"] > 2: # 回退后要等待的时间\n",
|
||
" time.sleep(node[\"parameters\"][\"historyWait\"])\n",
|
||
" else:\n",
|
||
" time.sleep(2)\n",
|
||
" Log(\"切换历史记录等待2秒或者:\",node[\"parameters\"][\"historyWait\"])\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" except NoSuchElementException:\n",
|
||
" Log(\"pathNotFound: \", node[\"parameters\"][\"xpath\"])\n",
|
||
" recordLog(\"pathNotFound: \" + node[\"parameters\"][\"xpath\"])\n",
|
||
" pass # 循环中找不到元素就略过操作\n",
|
||
" except Exception as e:\n",
|
||
" raise\n",
|
||
" elif int(node[\"parameters\"][\"loopType\"]) == 2: # 固定元素列表\n",
|
||
" for path in node[\"parameters\"][\"pathList\"].split(\"\\n\"): # 千万不要忘了分割!!\n",
|
||
" try:\n",
|
||
" element = browser.find_element_by_xpath(path)\n",
|
||
" for i in node[\"sequence\"]: # 挨个执行操作\n",
|
||
" excuteNode(i, element, path,0)\n",
|
||
" if browser.current_window_handle != thisHandle: # 如果执行完一次循环之后标签页的位置发生了变化\n",
|
||
" while True: # 一直关闭窗口直到当前标签页\n",
|
||
" browser.close() # 关闭使用完的标签页\n",
|
||
" browser.switch_to.window(browser.window_handles[-1])\n",
|
||
" if browser.current_window_handle == thisHandle:\n",
|
||
" break\n",
|
||
" if history[\"index\"] != thisHistoryLength and history[\"handle\"] == browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断\n",
|
||
" difference = thisHistoryLength - history[\"index\"] # 计算历史记录变化差值\n",
|
||
" browser.execute_script('history.go(' + str(difference) + ')') # 回退历史记录\n",
|
||
" if node[\"parameters\"][\"historyWait\"] > 2: # 回退后要等待的时间\n",
|
||
" time.sleep(node[\"parameters\"][\"historyWait\"])\n",
|
||
" else:\n",
|
||
" time.sleep(2)\n",
|
||
" Log(\"切换历史记录等待2秒或者:\",node[\"parameters\"][\"historyWait\"])\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" except NoSuchElementException:\n",
|
||
" Log(\"pathNotFound: \", path)\n",
|
||
" recordLog(\"pathNotFound: \" + path)\n",
|
||
" continue # 循环中找不到元素就略过操作\n",
|
||
" except Exception as e:\n",
|
||
" raise\n",
|
||
" elif int(node[\"parameters\"][\"loopType\"]) == 3: # 固定文本列表\n",
|
||
" textList = node[\"parameters\"][\"textList\"].split(\"\\n\")\n",
|
||
" for text in textList:\n",
|
||
" recordLog(\"input: \" + text)\n",
|
||
" for i in node[\"sequence\"]: # 挨个执行操作\n",
|
||
" excuteNode(i, text, \"\")\n",
|
||
" elif int(node[\"parameters\"][\"loopType\"]) == 4: # 固定网址列表\n",
|
||
" pass # 以后再做\n",
|
||
" history[\"index\"] = thisHistoryLength\n",
|
||
" history[\"handle\"] = browser.current_window_handle\n",
|
||
" \n",
|
||
"# 打开网页事件\n",
|
||
"def openPage(para, loopValue):\n",
|
||
" global links\n",
|
||
" global urlId\n",
|
||
" global history\n",
|
||
" browser.switch_to.window(browser.window_handles[0]) # 打开网页操作从第1个页面开始\n",
|
||
" history[\"handle\"] = browser.current_window_handle\n",
|
||
" if para[\"useLoop\"]:\n",
|
||
" url = loopValue\n",
|
||
" else:\n",
|
||
" url = links[urlId]\n",
|
||
" try:\n",
|
||
" browser.get(url)\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when loading page: ' + url)\n",
|
||
" recordLog('time out after 10 seconds when loading page: ' + url)\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" try:\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" except TimeoutException:\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" try:\n",
|
||
" if para[\"scrollType\"] != 0 and para[\"scrollCount\"] > 0: # 控制屏幕向下滚动\n",
|
||
" for i in range(para[\"scrollCount\"]):\n",
|
||
" time.sleep(1) # 下拉完等1秒\n",
|
||
" Log(\"下拉等待1秒\")\n",
|
||
" body = browser.find_element_by_css_selector(\"body\")\n",
|
||
" body.send_keys(Keys.END)\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when loading page: ' + url)\n",
|
||
" recordLog('time out after 10 seconds when loading page: ' + url)\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" if containJudge:\n",
|
||
" global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText\n",
|
||
" try:\n",
|
||
" bodyText = browser.find_element_by_css_selector(\"body\").text\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when getting body text: ' + url)\n",
|
||
" recordLog('time out after 10 seconds when getting body text:: ' + url)\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" time.sleep(1)\n",
|
||
" Log(\"获得bodytext等待1秒\")\n",
|
||
" # 再执行一遍\n",
|
||
" bodyText = browser.find_element_by_css_selector(\"body\").text\n",
|
||
" except Exception as e:\n",
|
||
" Log(e)\n",
|
||
" recordLog(str(e))\n",
|
||
"\n",
|
||
"\n",
|
||
"# 键盘输入事件\n",
|
||
"def inputInfo(para, loopValue):\n",
|
||
" time.sleep(1) # 输入之前等待1秒\n",
|
||
" Log(\"输入前等待1秒\")\n",
|
||
" try:\n",
|
||
" textbox = browser.find_element_by_xpath(para[\"xpath\"])\n",
|
||
" except:\n",
|
||
" Log(\"找不到输入框元素:\" + para[\"xpath\"] + \"请尝试执行前等待\")\n",
|
||
" recordLog(\"找不到输入框元素:\" + para[\"xpath\"] + \"请尝试执行前等待\")\n",
|
||
" exit()\n",
|
||
" textbox.send_keys(Keys.CONTROL, 'a')\n",
|
||
" textbox.send_keys(Keys.BACKSPACE)\n",
|
||
" if para[\"useLoop\"]:\n",
|
||
" textbox.send_keys(loopValue)\n",
|
||
" else:\n",
|
||
" textbox.send_keys(para[\"value\"])\n",
|
||
" global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText\n",
|
||
" bodyText = browser.find_element_by_css_selector(\"body\").text\n",
|
||
"\n",
|
||
"\n",
|
||
"# 点击元素事件\n",
|
||
"def clickElement(para, loopElement=None, clickPath=\"\", index=0):\n",
|
||
" global history\n",
|
||
" time.sleep(1) # 点击之前等待1秒\n",
|
||
" Log(\"点击之前等待1秒\")\n",
|
||
" if para[\"useLoop\"]: #使用循环的情况下,传入的clickPath就是实际的xpath\n",
|
||
" path = clickPath\n",
|
||
" else:\n",
|
||
" path = clickPath + para[\"xpath\"] #不然使用元素定义的xpath\n",
|
||
" tempHandleNum = len(browser.window_handles) #记录点击之前的窗口数量\n",
|
||
" try:\n",
|
||
" script = 'var result = document.evaluate(`' + path + '`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'\n",
|
||
" browser.execute_script(script,str(index))# 用js的点击方法\n",
|
||
"\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when loading clicked page')\n",
|
||
" recordLog('time out after 10 seconds when loading clicked page')\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" except Exception as e:\n",
|
||
" Log(e)\n",
|
||
" recordLog(str(e))\n",
|
||
" time.sleep(0.5) # 点击之后等半秒\n",
|
||
" Log(\"点击之后等待0.5秒\")\n",
|
||
" if tempHandleNum != len(browser.window_handles): # 如果有新标签页的行为发生\n",
|
||
" browser.switch_to.window(browser.window_handles[-1]) # 跳转到新的标签页\n",
|
||
" history[\"handle\"] = browser.current_window_handle\n",
|
||
" try:\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" except TimeoutException:\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" else:\n",
|
||
" try:\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" except TimeoutException:\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" history[\"index\"] = browser.execute_script(\"return history.length\")\n",
|
||
" # 如果打开了新窗口,切换到新窗口\n",
|
||
" try:\n",
|
||
" if para[\"scrollType\"] != 0 and para[\"scrollCount\"] > 0: # 控制屏幕向下滚动\n",
|
||
" for i in range(para[\"scrollCount\"]):\n",
|
||
" time.sleep(1) # 下拉完等1秒\n",
|
||
" Log(\"下拉完等待1秒\")\n",
|
||
" body = browser.find_element_by_css_selector(\"body\")\n",
|
||
" body.send_keys(Keys.END)\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when scrolling. ')\n",
|
||
" recordLog('time out after 10 seconds when scrolling')\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" if para[\"scrollType\"] != 0 and para[\"scrollCount\"] > 0: # 控制屏幕向下滚动\n",
|
||
" for i in range(para[\"scrollCount\"]):\n",
|
||
" time.sleep(1) # 下拉完等1秒\n",
|
||
" Log(\"下拉完等待1秒\")\n",
|
||
" body = browser.find_element_by_css_selector(\"body\")\n",
|
||
" body.send_keys(Keys.END)\n",
|
||
" if containJudge: #有判断语句才执行以下操作\n",
|
||
" global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText\n",
|
||
" try:\n",
|
||
" bodyText = browser.find_element_by_css_selector(\"body\").text\n",
|
||
" except TimeoutException:\n",
|
||
" Log('time out after 10 seconds when getting body text')\n",
|
||
" recordLog('time out after 10 seconds when getting body text')\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" time.sleep(1)\n",
|
||
" Log(\"bodytext等待1秒\")\n",
|
||
" # 再执行一遍\n",
|
||
" bodyText = browser.find_element_by_css_selector(\"body\").text\n",
|
||
" except Exception as e:\n",
|
||
" Log(e)\n",
|
||
" recordLog(str(e))\n",
|
||
"\n",
|
||
"\n",
|
||
"# 提取数据事件\n",
|
||
"def getData(para, loopElement, isInLoop=True):\n",
|
||
" if not isInLoop and para[\"wait\"] == 0:\n",
|
||
" time.sleep(1) # 如果提取数据字段不在循环内而且设置的等待时间为0,默认等待1秒\n",
|
||
" Log(\"提取数据等待1秒\")\n",
|
||
" for p in para[\"paras\"]:\n",
|
||
" content = \"\"\n",
|
||
" try:\n",
|
||
" if p[\"relative\"]: # 是否相对xpath\n",
|
||
" if p[\"relativeXpath\"] == \"\": # 相对xpath有时候就是元素本身,不需要二次查找\n",
|
||
" element = loopElement\n",
|
||
" else:\n",
|
||
" element = loopElement.find_element_by_xpath(p[\"relativeXpath\"][1:])\n",
|
||
" else:\n",
|
||
" element = browser.find_element_by_xpath(p[\"relativeXpath\"])\n",
|
||
" except NoSuchElementException: # 找不到元素的时候,使用默认值\n",
|
||
" outputParameters[p[\"name\"]] = p[\"default\"]\n",
|
||
" Log('Element not found,use default')\n",
|
||
" recordLog('Element not found,use default')\n",
|
||
" continue\n",
|
||
" except TimeoutException: #超时的时候停止页面加载后重新查找元素\n",
|
||
" Log('time out after 10 seconds when getting data')\n",
|
||
" recordLog('time out after 10 seconds when getting data')\n",
|
||
" browser.execute_script('window.stop()')\n",
|
||
" if p[\"relative\"]: # 是否相对xpath\n",
|
||
" if p[\"relativeXpath\"] == \"\": # 相对xpath有时候就是元素本身,不需要二次查找\n",
|
||
" element = loopElement\n",
|
||
" else:\n",
|
||
" element = loopElement.find_element_by_xpath(p[\"relativeXpath\"][1:])\n",
|
||
" else:\n",
|
||
" element = browser.find_element_by_xpath(p[\"relativeXpath\"])\n",
|
||
" if p[\"contentType\"] == 2:\n",
|
||
" content = element.get_attribute('innerHTML')\n",
|
||
" elif p[\"contentType\"] == 3:\n",
|
||
" content = element.get_attribute('outerHTML')\n",
|
||
" elif p[\"contentType\"] == 1: # 只采集当前元素下的文本,不包括子元素\n",
|
||
" command = 'var arr = [];\\\n",
|
||
" var content = arguments[0];\\\n",
|
||
" for(var i = 0, len = content.childNodes.length; i < len; i++) {\\\n",
|
||
" if(content.childNodes[i].nodeType === 3){ \\\n",
|
||
" arr.push(content.childNodes[i].nodeValue);\\\n",
|
||
" }\\\n",
|
||
" }\\\n",
|
||
" var str = arr.join(\"\"); \\\n",
|
||
" return str;'\n",
|
||
" content = browser.execute_script(command, element).replace(\" \", \"\").replace(\"\\n\", \"\")\n",
|
||
" if p[\"nodeType\"] == 2:\n",
|
||
" if element.get_attribute(\"href\") != None:\n",
|
||
" content = element.get_attribute(\"href\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" elif p[\"nodeType\"] == 3:\n",
|
||
" if element.get_attribute(\"value\") != None:\n",
|
||
" content = element.get_attribute(\"value\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" elif p[\"nodeType\"] == 4: # 图片\n",
|
||
" if element.get_attribute(\"src\") != None:\n",
|
||
" content = element.get_attribute(\"src\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" elif p[\"contentType\"] == 0:\n",
|
||
" content = element.text\n",
|
||
" if p[\"nodeType\"] == 2:\n",
|
||
" if element.get_attribute(\"href\") != None:\n",
|
||
" content = element.get_attribute(\"href\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" elif p[\"nodeType\"] == 3:\n",
|
||
" if element.get_attribute(\"value\") != None:\n",
|
||
" content = element.get_attribute(\"value\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" elif p[\"nodeType\"] == 4: # 图片\n",
|
||
" if element.get_attribute(\"src\") != None:\n",
|
||
" content = element.get_attribute(\"src\")\n",
|
||
" else:\n",
|
||
" content = \"\"\n",
|
||
" outputParameters[p[\"name\"]] = content\n",
|
||
" global OUTPUT\n",
|
||
" line = []\n",
|
||
" for value in outputParameters.values():\n",
|
||
" line.append(value)\n",
|
||
" print(value[:15], \" \", end=\"\")\n",
|
||
" print(\"\")\n",
|
||
" OUTPUT.append(line)\n",
|
||
"\n",
|
||
"\n",
|
||
"# 判断字段是否非空(非空返回True,供filter过滤空行使用)\n",
|
||
"def isnull(s):\n",
|
||
" return len(s) != 0"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"source": [
|
||
"## 核心代码执行部分,只需要修改id为taskid即可"
|
||
],
|
||
"metadata": {
|
||
"collapsed": false
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"outputs": [],
|
||
"source": [
|
||
"if __name__ == '__main__':\n",
|
||
" browser = webdriver.Chrome();\n",
|
||
" browser.get('about:blank')\n",
|
||
" browser.set_page_load_timeout(10) # 加载页面最大超时时间\n",
|
||
"\n",
|
||
" id = 4 #taskId这里修改\n",
|
||
"\n",
|
||
" saveName = \"task_\" + str(id) + \"_\" + str(random.randint(0, 999999999)) # 保存文件的名字\n",
|
||
" content = requests.get(\"http://183.129.170.180:8041/backEnd/queryTask?id=\" + str(id))\n",
|
||
" service = json.loads(content.text) # 加载服务信息\n",
|
||
" procedure = service[\"graph\"] # 程序执行流程\n",
|
||
" links = list(filter(isnull, service[\"links\"].split(\"\\n\"))) # 要执行的link的列表\n",
|
||
" OUTPUT = [] # 采集的数据\n",
|
||
" OUTPUT.append([]) # 添加表头\n",
|
||
" containJudge = service[\"containJudge\"] #是否含有判断语句\n",
|
||
" bodyText = \"\" # 记录bodyText\n",
|
||
" tOut = service[\"outputParameters\"] # 生成输出参数对象\n",
|
||
" outputParameters = {}\n",
|
||
" log = \"\" # 记录执行过程中的日志文本\n",
|
||
" history = {\"index\":0,\"handle\":None} #记录页面现在所在的历史记录的位置\n",
|
||
" for para in tOut:\n",
|
||
" outputParameters[para[\"name\"]] = \"\"\n",
|
||
" OUTPUT[0].append(para[\"name\"])\n",
|
||
" # 挨个执行程序\n",
|
||
" urlId = 0 # 全局记录变量\n",
|
||
" for i in range(len(links)):\n",
|
||
" excuteNode(0)\n",
|
||
" urlId = urlId + 1\n",
|
||
" print(\"执行完成!\")\n",
|
||
" recordLog(\"Done!\")\n",
|
||
" with open(saveName + '_log.txt', 'w',encoding='utf-8-sig') as file_obj:\n",
|
||
" file_obj.write(log)\n",
|
||
" file_obj.close()\n",
|
||
" with open(saveName + '.csv', 'w', encoding='utf-8-sig', newline=\"\") as f:\n",
|
||
" f_csv = csv.writer(f)\n",
|
||
" for line in OUTPUT:\n",
|
||
" f_csv.writerow(line)\n",
|
||
" f.close()"
|
||
],
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"pycharm": {
|
||
"name": "#%%\n"
|
||
}
|
||
}
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# node = procedure[9]\n",
|
||
"# excuteOnce(node)\n",
|
||
"OUTPUT"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<div class=\"blo \n",
|
||
"执行完成!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"if __name__ == '__main__':\n",
|
||
" browser = webdriver.Chrome();\n",
|
||
" browser.get('about:blank')\n",
|
||
" browser.set_page_load_timeout(10) # 加载页面最大超时时间\n",
|
||
"\n",
|
||
" id = 4 #taskId这里修改\n",
|
||
"\n",
|
||
" saveName = \"task_\" + str(id) + \"_\" + str(random.randint(0, 999999999)) # 保存文件的名字\n",
|
||
" content = requests.get(\"http://183.129.170.180:8041/backEnd/queryTask?id=\" + str(id))\n",
|
||
" service = json.loads(content.text) # 加载服务信息\n",
|
||
" procedure = service[\"graph\"] # 程序执行流程\n",
|
||
" links = list(filter(isnull, service[\"links\"].split(\"\\n\"))) # 要执行的link的列表\n",
|
||
" OUTPUT = [] # 采集的数据\n",
|
||
" OUTPUT.append([]) # 添加表头\n",
|
||
" containJudge = service[\"containJudge\"] #是否含有判断语句\n",
|
||
" bodyText = \"\" # 记录bodyText\n",
|
||
" tOut = service[\"outputParameters\"] # 生成输出参数对象\n",
|
||
" outputParameters = {}\n",
|
||
" log = \"\" # 记录执行过程中的日志文本\n",
|
||
" history = {\"index\":0,\"handle\":None} #记录页面现在所在的历史记录的位置\n",
|
||
" for para in tOut:\n",
|
||
" outputParameters[para[\"name\"]] = \"\"\n",
|
||
" OUTPUT[0].append(para[\"name\"])\n",
|
||
" # 挨个执行程序\n",
|
||
" urlId = 0 # 全局记录变量\n",
|
||
" for i in range(len(links)):\n",
|
||
" excuteNode(0)\n",
|
||
" urlId = urlId + 1\n",
|
||
" print(\"执行完成!\")\n",
|
||
" recordLog(\"Done!\")\n",
|
||
" with open(saveName + '_log.txt', 'w',encoding='utf-8-sig') as file_obj:\n",
|
||
" file_obj.write(log)\n",
|
||
" file_obj.close()\n",
|
||
" with open(saveName + '.csv', 'w', encoding='utf-8-sig', newline=\"\") as f:\n",
|
||
" f_csv = csv.writer(f)\n",
|
||
" for line in OUTPUT:\n",
|
||
" f_csv.writerow(line)\n",
|
||
" f.close()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false,
|
||
"name": "#%% 测试单个函数\n"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[['参数1_outerHTML'],\n",
|
||
" ['<div class=\"blog-content-box\">\\n <div class=\"article-header-box\">\\n <div class=\"article-header\">\\n <div class=\"article-title-box\">\\n <h1 class=\"title-article\">该扩展程序未列在 Chrome 网上应用店中,并可能是在您不知情的情况下添加的解决办法</h1>\\n </div>\\n <div class=\"article-info-box\">\\n <div class=\"article-bar-top\">\\n <!--文章类型-->\\n <span class=\"article-type type-1 float-left\">原创</span> <a class=\"follow-nickName\" href=\"https://me.csdn.net/gexiaochao\" target=\"_blank\" rel=\"noopener\">葛小勺</a>\\n <span class=\"time\">最后发布于2019-03-22 17:28:27 </span>\\n <span class=\"read-count\">阅读数 17488</span>\\n <a id=\"blog_detail_zk_collection\" data-report-click=\"{"mod":"popu_823"}\">\\n <svg class=\"icon\">\\n <use xlink:href=\"#icon-csdnc-Collection-G\"></use>\\n </svg>\\n 收藏\\n </a>\\n </div>\\n <div class=\"up-time\">发布于2019-03-22 17:28:27</div>\\n <div class=\"slide-content-box\">\\n <div class=\"tags-box artic-tag-box\">\\n <span class=\"label\">分类专栏:</span>\\n <a class=\"tag-link\" target=\"_blank\" rel=\"noopener\" href=\"https://blog.csdn.net/gexiaochao/category_6923169.html\">\\n 计算机网络 </a>\\n </div>\\n <div class=\"article-copyright\">\\n <span class=\"creativecommons\">\\n <a rel=\"license\" href=\"http://creativecommons.org/licenses/by-sa/4.0/\"></a>\\n <span>\\n 版权声明:本文为博主原创文章,遵循<a href=\"http://creativecommons.org/licenses/by-sa/4.0/\" target=\"_blank\" rel=\"noopener\"> CC 4.0 BY-SA </a>版权协议,转载请附上原文出处链接和本声明。 </span>\\n <div class=\"article-source-link2222\">\\n 本文链接:<a href=\"https://blog.csdn.net/gexiaochao/article/details/88746278\">https://blog.csdn.net/gexiaochao/article/details/88746278</a>\\n </div>\\n </span> \\n </div>\\n </div>\\n <div class=\"operating\">\\n <a class=\"href-article-edit slide-toggle\">展开</a>\\n </div>\\n </div>\\n </div>\\n </div>\\n <article class=\"baidu_pl\">\\n <!--python安装手册开始-->\\n <!--python安装手册结束-->\\n <!--####专栏广告位图文切换开始-->\\n <!--####专栏广告位图文切换结束-->\\n <div id=\"article_content\" class=\"article_content clearfix\">\\n <link 
rel=\"stylesheet\" href=\"https://csdnimg.cn/release/phoenix/template/css/ck_htmledit_views-833878f763.css\">\\n <link rel=\"stylesheet\" href=\"https://csdnimg.cn/release/phoenix/template/css/ck_htmledit_views-833878f763.css\">\\n <div class=\"htmledit_views\" id=\"content_views\">\\n <p>如何解决该扩展程序未列在 Chrome 网上应用店中,并可能是在您不知情的情况下添加的</p>\\n\\n<p>在使用Google插件的时候,出现了这个问题,当时是直接下载的crx文件,然后拖拽到浏览器中进行安装的,过了不久,这个插件并不能进行使用了。<br>\\n出现:</p>\\n\\n<p>如何解决<br>\\n该扩展程序未列在 Chrome 网上应用店中,并可能是在您不知情的情况下添加的<br>\\n-------------------</p>\\n\\n<p>方法一</p>\\n\\n<p>1、首先把需要安装的第三方插件,后缀.crx 改成 .rar,然后解压,得到一个文件夹<br>\\n2、再打开chrome://extensions/谷歌扩展应用管理,点击右上角的开发者模式,就可以看到“加载正在开发的扩展程序”这一选项。<br>\\n3、选择刚才步骤1中解压好的文件夹,确定<br>\\n4、确认新增扩展程序,点击添加,成功添加应用程序。</p>\\n\\n<p>如出现如图情况</p>\\n\\n<p><img alt=\"\" class=\"has\" height=\"183\" src=\"https://img-blog.csdnimg.cn/20190322173309261.png\" width=\"642\"></p>\\n\\n<p>出现这种情况Chrome浏览器会提示无法加载以下来源的扩展程序: xxx路径(Chrome插件文件的解压位置)Cannot load extension with file or directory name _metadata. Filenames starting with \"_\" are reserved for use by the system.出现这种情况,是因为这款Chrome插件与新版的Chrome浏览器有些不兼容,这时候,用户可以打开刚刚解压的Chrome插件文件夹,并把其中_metadata文件夹的名字修改为metadata(把前面的下划线去掉),如图所示:</p>\\n\\n<p><img alt=\"\" class=\"has\" height=\"166\" src=\"https://img-blog.csdnimg.cn/20190322173431209.png\" width=\"630\"></p>\\n\\n<p>更新文件夹名称成功以后,点击该错误提示下方的“重试”按钮,就可以成功地把Chrome插件加载谷歌浏览器中了,如图所示</p>\\n\\n<p><img alt=\"\" class=\"has\" src=\"https://img-blog.csdnimg.cn/20190322173504497.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dleGlhb2NoYW8=,size_16,color_FFFFFF,t_70\"></p>\\n\\n<p>基于这种模式安装的chrome插件会因为用户启用了开发者模式而遭到谷歌的警告,用户可以选择忽略Chrome的警告<br>\\n--------------------- </p>\\n\\n<p>方法二</p>\\n\\n<p>运行中输入“gpedit.msc” ,打开 本地策略组 ,导入chrome.adm,再被禁用的插件ID复制下来,依次找到:Google Chrome→扩展程序→配置扩展程序白名单,将刚才的复制的ID粘贴进去,操作如图:<br><img alt=\"\" class=\"has\" 
src=\"https://img-blog.csdnimg.cn/20190322172648864.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dleGlhb2NoYW8=,size_16,color_FFFFFF,t_70\"></p>\\n\\n<p><img alt=\"\" class=\"has\" src=\"https://img-blog.csdnimg.cn/20190322172814111.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dleGlhb2NoYW8=,size_16,color_FFFFFF,t_70\"></p>\\n\\n<p>操作完后,再回到chrome扩展列表页面,可以看到被禁用的扩展,右侧启用的选项已变成可勾选状态,勾选启用该扩展即可!!</p>\\n </div>\\n <div class=\"more-toolbox\">\\n <div class=\"left-toolbox\">\\n <ul class=\"toolbox-list\">\\n \\n <li class=\"tool-item tool-active is-like \"><a href=\"javascript:;\"><svg class=\"icon\" aria-hidden=\"true\">\\n <use xlink:href=\"#csdnc-thumbsup\"></use>\\n </svg><span class=\"name\">点赞</span>\\n <span class=\"count\">4</span>\\n </a></li>\\n <li class=\"tool-item tool-active is-collection \"><a href=\"javascript:;\" data-report-click=\"{"mod":"popu_824"}\"><svg class=\"icon\" aria-hidden=\"true\">\\n <use xlink:href=\"#icon-csdnc-Collection-G\"></use>\\n </svg><span class=\"name\">收藏</span></a></li>\\n <li class=\"tool-item tool-active is-share\"><a href=\"javascript:;\" data-report-click=\"{"mod":"1582594662_002"}\"><svg class=\"icon\" aria-hidden=\"true\">\\n <use xlink:href=\"#icon-csdnc-fenxiang\"></use>\\n </svg>分享</a></li>\\n <!--打赏开始-->\\n <!--打赏结束-->\\n <li class=\"tool-item tool-more\">\\n <a>\\n <svg t=\"1575545411852\" class=\"icon\" viewBox=\"0 0 1024 1024\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" p-id=\"5717\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"200\" height=\"200\"><defs><style type=\"text/css\"></style></defs><path d=\"M179.176 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z\" p-id=\"5718\"></path><path d=\"M509.684 499.222m-113.245 0a113.245 113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z\" p-id=\"5719\"></path><path d=\"M846.175 499.222m-113.245 0a113.245 
113.245 0 1 0 226.49 0 113.245 113.245 0 1 0-226.49 0Z\" p-id=\"5720\"></path></svg>\\n </a>\\n <ul class=\"more-box\">\\n <li class=\"item\"><a class=\"article-report\">文章举报</a></li>\\n </ul>\\n </li>\\n </ul>\\n </div>\\n </div>\\n <div class=\"person-messagebox\">\\n <div class=\"left-message\"><a href=\"https://blog.csdn.net/gexiaochao\">\\n <img src=\"https://profile.csdnimg.cn/6/9/B/3_gexiaochao\" class=\"avatar_pic\" username=\"gexiaochao\">\\n <img src=\"https://g.csdnimg.cn/static/user-reg-year/1x/7.png\" class=\"user-years\">\\n </a></div>\\n <div class=\"middle-message\">\\n <div class=\"title\"><span class=\"tit\"><a href=\"https://blog.csdn.net/gexiaochao\" data-report-click=\"{"mod":"popu_379"}\" target=\"_blank\">葛小勺</a></span>\\n </div>\\n <div class=\"text\"><span>发布了6 篇原创文章</span> · <span>获赞 8</span> · <span>访问量 11万+</span></div>\\n </div>\\n <div class=\"right-message\">\\n <a href=\"https://im.csdn.net/im/main.html?userName=gexiaochao\" target=\"_blank\" class=\"btn btn-sm btn-red-hollow bt-button personal-letter\">私信\\n </a>\\n <a class=\"btn btn-sm bt-button personal-watch\" data-report-click=\"{"mod":"popu_379"}\">关注</a>\\n </div>\\n </div>\\n </div>\\n </article>\\n \\n</div>']]"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# node = procedure[9]\n",
|
||
"# excuteOnce(node)\n",
|
||
"OUTPUT"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 192,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"name": "#%%\n"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"参数1_链接文本 参数2_链接地址 参数3_图片地址 参数4_文本 参数5_文本 参数6_文本 \n",
|
||
"\n",
|
||
"通用新闻资讯接口\n",
|
||
"¥ 3.00 元/10 https://www.idataapi https://www.idataapi 通用新闻资讯接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(23909) \n",
|
||
"\n",
|
||
"新浪微博\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 新浪微博 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(12605) \n",
|
||
"\n",
|
||
"通用酒店数据接口\n",
|
||
"免费\n",
|
||
"使用人数(971 https://www.idataapi https://www.idataapi 通用酒店数据接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(9718) \n",
|
||
"\n",
|
||
"微信公众号\n",
|
||
"¥ 1.00 元/100 次 https://www.idataapi https://www.idataapi 微信公众号 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1137) \n",
|
||
"\n",
|
||
"天猫\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 天猫 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(601) \n",
|
||
"\n",
|
||
"今日头条\n",
|
||
"¥ 0.50 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 今日头条 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥0. 使用人数(520) \n",
|
||
"\n",
|
||
"小红书\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 小红书 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(470) \n",
|
||
"\n",
|
||
"京东商城\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 京东商城 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(275) \n",
|
||
"\n",
|
||
"携程\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 携程 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(250) \n",
|
||
"\n",
|
||
"饿了么\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 饿了么 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(178) \n",
|
||
"\n",
|
||
"360新闻\n",
|
||
"免费\n",
|
||
"使用人数(3380)\n",
|
||
" https://www.idataapi https://www.idataapi 360新闻 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(3380) \n",
|
||
"\n",
|
||
"中文分词\n",
|
||
"免费\n",
|
||
"使用人数(1754)\n",
|
||
"文 https://www.idataapi https://www.idataapi 中文分词 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(1754) \n",
|
||
"\n",
|
||
"微信公众号文章link版\n",
|
||
"¥ 1.00 https://www.idataapi https://www.idataapi 微信公众号文章link版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1109) \n",
|
||
"\n",
|
||
"微信公众号文章高级版\n",
|
||
"¥ 3.00 元/ https://www.idataapi https://www.idataapi 微信公众号文章高级版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(1013) \n",
|
||
"\n",
|
||
"微信公众号文章专业版(关键词)\n",
|
||
"¥ 2. https://www.idataapi https://www.idataapi 微信公众号文章专业版(关键词) \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥2. 使用人数(946) \n",
|
||
"\n",
|
||
"餐饮类情感分析语料\n",
|
||
"¥ 0.01 元\n",
|
||
"使 https://www.idataapi https://www.idataapi 餐饮类情感分析语料 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥0. 使用人数(515) \n",
|
||
"\n",
|
||
"谷歌验证码识别训练集数据\n",
|
||
"¥ 0.01 https://www.idataapi https://www.idataapi 谷歌验证码识别训练集数据 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥0. 使用人数(301) \n",
|
||
"\n",
|
||
"微信20180320特定信息\n",
|
||
"¥ 500 https://www.idataapi https://www.idataapi 微信20180320特定信息 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥50 使用人数(4) \n",
|
||
"\n",
|
||
"微信公众号房地产3月份文章\n",
|
||
"¥ 800. https://www.idataapi https://www.idataapi 微信公众号房地产3月份文章 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥80 使用人数(3) \n",
|
||
"\n",
|
||
"专辑数据-2018\n",
|
||
"¥ 1000.00 https://www.idataapi https://www.idataapi 专辑数据-2018 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥10 使用人数(2) \n",
|
||
"\n",
|
||
"单曲数据-2018\n",
|
||
"¥ 2000.00 https://www.idataapi https://www.idataapi 单曲数据-2018 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥20 使用人数(2) \n",
|
||
"\n",
|
||
"天猫定制数据2018\n",
|
||
"¥ 3000.00 https://www.idataapi https://www.idataapi 天猫定制数据2018 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥30 使用人数(2) \n",
|
||
"\n",
|
||
"甜品店铺信息-2018.3.30\n",
|
||
"¥ 1 https://www.idataapi https://www.idataapi 甜品店铺信息-2018.3.30 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥15 使用人数(2) \n",
|
||
"\n",
|
||
"甜品店铺对应商品信息\n",
|
||
"¥ 1500.00 https://www.idataapi https://www.idataapi 甜品店铺对应商品信息 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥15 使用人数(2) \n",
|
||
"\n",
|
||
"电影,电视剧及图书短评语料\n",
|
||
"¥ 4000 https://www.idataapi https://www.idataapi 电影,电视剧及图书短评语料 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥40 使用人数(2) \n",
|
||
"\n",
|
||
"综艺数据-2018\n",
|
||
"¥ 1000.00 https://www.idataapi https://www.idataapi 综艺数据-2018 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥10 使用人数(2) \n",
|
||
"\n",
|
||
"创业数据库\n",
|
||
"¥ 190000.00 元\n",
|
||
" https://www.idataapi https://www.idataapi 创业数据库 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥19 使用人数(1) \n",
|
||
"\n",
|
||
"中国餐馆词库\n",
|
||
"¥ 20000.00 元\n",
|
||
" https://www.idataapi https://www.idataapi 中国餐馆词库 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥20 使用人数(0) \n",
|
||
"\n",
|
||
"通用新闻资讯接口\n",
|
||
"¥ 3.00 元/10 https://www.idataapi https://www.idataapi 通用新闻资讯接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(23909) \n",
|
||
"\n",
|
||
"新浪微博\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 新浪微博 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(12605) \n",
|
||
"\n",
|
||
"通用酒店数据接口\n",
|
||
"免费\n",
|
||
"使用人数(971 https://www.idataapi https://www.idataapi 通用酒店数据接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(9718) \n",
|
||
"\n",
|
||
"微信公众号\n",
|
||
"¥ 1.00 元/100 次 https://www.idataapi https://www.idataapi 微信公众号 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1137) \n",
|
||
"\n",
|
||
"天猫\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 天猫 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(600) \n",
|
||
"\n",
|
||
"今日头条\n",
|
||
"¥ 0.50 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 今日头条 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥0. 使用人数(520) \n",
|
||
"\n",
|
||
"小红书\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 小红书 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(470) \n",
|
||
"\n",
|
||
"京东商城\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 京东商城 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(275) \n",
|
||
"\n",
|
||
"携程\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 携程 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(250) \n",
|
||
"\n",
|
||
"饿了么\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 饿了么 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(178) \n",
|
||
"\n",
|
||
"360新闻\n",
|
||
"免费\n",
|
||
"使用人数(3380)\n",
|
||
" https://www.idataapi https://www.idataapi 360新闻 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(3380) \n",
|
||
"\n",
|
||
"中文分词\n",
|
||
"免费\n",
|
||
"使用人数(1754)\n",
|
||
"文 https://www.idataapi https://www.idataapi 中文分词 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(1754) \n",
|
||
"\n",
|
||
"微信公众号文章link版\n",
|
||
"¥ 1.00 https://www.idataapi https://www.idataapi 微信公众号文章link版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1109) \n",
|
||
"\n",
|
||
"微信公众号文章高级版\n",
|
||
"¥ 3.00 元/ https://www.idataapi https://www.idataapi 微信公众号文章高级版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(1013) \n",
|
||
"\n",
|
||
"微信公众号文章专业版(关键词)\n",
|
||
"¥ 2. https://www.idataapi https://www.idataapi 微信公众号文章专业版(关键词) \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥2. 使用人数(946) \n",
|
||
"\n",
|
||
"通用新闻资讯接口\n",
|
||
"¥ 3.00 元/10 https://www.idataapi https://www.idataapi 通用新闻资讯接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(23909) \n",
|
||
"\n",
|
||
"新浪微博\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 新浪微博 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(12605) \n",
|
||
"\n",
|
||
"通用酒店数据接口\n",
|
||
"免费\n",
|
||
"使用人数(971 https://www.idataapi https://www.idataapi 通用酒店数据接口 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(9718) \n",
|
||
"\n",
|
||
"微信公众号\n",
|
||
"¥ 1.00 元/100 次 https://www.idataapi https://www.idataapi 微信公众号 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1137) \n",
|
||
"\n",
|
||
"天猫\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 天猫 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(600) \n",
|
||
"\n",
|
||
"今日头条\n",
|
||
"¥ 0.50 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 今日头条 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥0. 使用人数(520) \n",
|
||
"\n",
|
||
"小红书\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 小红书 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(470) \n",
|
||
"\n",
|
||
"京东商城\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
" https://www.idataapi https://www.idataapi 京东商城 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(275) \n",
|
||
"\n",
|
||
"携程\n",
|
||
"¥ 1.00 元/100 次\n",
|
||
"使用 https://www.idataapi https://www.idataapi 携程 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(250) \n",
|
||
"\n",
|
||
"饿了么\n",
|
||
"¥ 3.00 元/100 次\n",
|
||
"使 https://www.idataapi https://www.idataapi 饿了么 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(178) \n",
|
||
"\n",
|
||
"360新闻\n",
|
||
"免费\n",
|
||
"使用人数(3380)\n",
|
||
" https://www.idataapi https://www.idataapi 360新闻 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(3380) \n",
|
||
"\n",
|
||
"中文分词\n",
|
||
"免费\n",
|
||
"使用人数(1754)\n",
|
||
"文 https://www.idataapi https://www.idataapi 中文分词 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t免费\t\t 使用人数(1754) \n",
|
||
"\n",
|
||
"微信公众号文章link版\n",
|
||
"¥ 1.00 https://www.idataapi https://www.idataapi 微信公众号文章link版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥1. 使用人数(1109) \n",
|
||
"\n",
|
||
"微信公众号文章高级版\n",
|
||
"¥ 3.00 元/ https://www.idataapi https://www.idataapi 微信公众号文章高级版 \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥3. 使用人数(1013) \n",
|
||
"\n",
|
||
"微信公众号文章专业版(关键词)\n",
|
||
"¥ 2. https://www.idataapi https://www.idataapi 微信公众号文章专业版(关键词) \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t¥2. 使用人数(946) \n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"for i in OUTPUT:\n",
|
||
" for j in i:\n",
|
||
" print(j[:20],\" \",end=\"\")\n",
|
||
" print(\"\\n\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.4"
|
||
},
|
||
"pycharm": {
|
||
"stem_cell": {
|
||
"cell_type": "raw",
|
||
"source": [],
|
||
"metadata": {
|
||
"collapsed": false
|
||
}
|
||
}
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
} |