mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-23 04:34:22 +08:00
URL pattern detect
This commit is contained in:
parent
a0dbaea00f
commit
f84eb673c8
@ -1,7 +1 @@
|
|||||||
{
|
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":true,"absolute_user_data_folder":"D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"}
|
||||||
"webserver_address": "http://localhost",
|
|
||||||
"webserver_port": 8074,
|
|
||||||
"user_data_folder": "./user_data",
|
|
||||||
"debug": true,
|
|
||||||
"absolute_user_data_folder": "D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"
|
|
||||||
}
|
|
@ -457,7 +457,7 @@ function handleOpenBrowser(event, lang = "en", user_data_folder = "", mobile = f
|
|||||||
runBrowser(lang, user_data_folder, mobile);
|
runBrowser(lang, user_data_folder, mobile);
|
||||||
let size = screen.getPrimaryDisplay().workAreaSize;
|
let size = screen.getPrimaryDisplay().workAreaSize;
|
||||||
let width = parseInt(size.width);
|
let width = parseInt(size.width);
|
||||||
let height = parseInt(size.height * 0.65);
|
let height = parseInt(size.height * 0.6);
|
||||||
flowchart_window = new BrowserWindow({
|
flowchart_window = new BrowserWindow({
|
||||||
x: 0,
|
x: 0,
|
||||||
y: 0,
|
y: 0,
|
||||||
|
@ -513,25 +513,33 @@
|
|||||||
<h4 class="modal-title" id="myModalLabel">保存任务</h4>
|
<h4 class="modal-title" id="myModalLabel">保存任务</h4>
|
||||||
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
|
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="modal-body">
|
<div class="modal-body" style="height:400px;overflow: auto">
|
||||||
<input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
|
<input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
|
||||||
<input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
|
<input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
|
||||||
<label>任务名称:</label>
|
<label>任务名称:</label>
|
||||||
<input onkeydown="inputDelete(event)" required name="serviceName" value="新web采集任务" id="serviceName" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" required name="serviceName" value="新web采集任务" id="serviceName" class="form-control"></input>
|
||||||
<label>任务描述:</label>
|
<label>任务描述:</label>
|
||||||
<input onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
|
||||||
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
|
<label>导出数据格式:</label>
|
||||||
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
<select id="outputFormat" class="form-control">
|
||||||
<label>是否为cloudflare等极端反爬网站:</label>
|
<option value = "csv">CSV</option>
|
||||||
<select id="cloudflare" name="cloudflare" class="form-control">
|
<option value = "xlsx">XLSX(EXCEL)</option>
|
||||||
<option value = 0>否</option>
|
<option value = "mysql">MySQL</option>
|
||||||
<option value = 1>是</option>
|
|
||||||
</select>
|
</select>
|
||||||
<label>浏览器模拟类型:</label>
|
<label>浏览器模拟类型:</label>
|
||||||
<select id="environment" name="environment" class="form-control">
|
<select id="environment" name="environment" class="form-control">
|
||||||
<option value = 0>电脑端</option>
|
<option value = 0>电脑端</option>
|
||||||
<option value = 1>手机端(Cloudflare模式下不支持)</option>
|
<option value = 1>手机端(Cloudflare模式下不支持)</option>
|
||||||
</select>
|
</select>
|
||||||
|
<label>是否为cloudflare等极端反爬网站:</label>
|
||||||
|
<select id="cloudflare" name="cloudflare" class="form-control">
|
||||||
|
<option value = 0>否</option>
|
||||||
|
<option value = 1>是</option>
|
||||||
|
</select>
|
||||||
|
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
|
||||||
|
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
||||||
|
<label>控制台预览时数据最大显示长度:</label>
|
||||||
|
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
<div class="modal-footer">
|
<div class="modal-footer">
|
||||||
|
@ -427,6 +427,8 @@ function saveService(type) {
|
|||||||
"saveThreshold": saveThreshold,
|
"saveThreshold": saveThreshold,
|
||||||
"cloudflare": cloudflare,
|
"cloudflare": cloudflare,
|
||||||
"environment": environment,
|
"environment": environment,
|
||||||
|
"maxViewLength": parseInt($("#maxViewLength").val()),
|
||||||
|
"outputFormat": $("#outputFormat").val(),
|
||||||
"containJudge": containJudge,
|
"containJudge": containJudge,
|
||||||
"desc": serviceDescription,
|
"desc": serviceDescription,
|
||||||
"inputParameters": inputParameters,
|
"inputParameters": inputParameters,
|
||||||
@ -460,10 +462,24 @@ if (sId != null && sId != -1) //加载任务
|
|||||||
$.get(backEndAddressServiceWrapper + "/queryTask?id=" + sId, function(result) {
|
$.get(backEndAddressServiceWrapper + "/queryTask?id=" + sId, function(result) {
|
||||||
nodeList = result["graph"];
|
nodeList = result["graph"];
|
||||||
app.$data.list.nl = nodeList;
|
app.$data.list.nl = nodeList;
|
||||||
|
for(let node of nodeList){ //兼容旧版本
|
||||||
|
if(node["option"] == 1){
|
||||||
|
if(!("cookies" in node["parameters"])) {
|
||||||
|
node["parameters"]["cookies"] = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
$("#serviceName").val(result["name"]);
|
$("#serviceName").val(result["name"]);
|
||||||
$("#serviceId").val(result["id"]);
|
$("#serviceId").val(result["id"]);
|
||||||
$("#url").val(result["url"]);
|
$("#url").val(result["url"]);
|
||||||
$("#serviceDescription").val(result["desc"]);
|
$("#serviceDescription").val(result["desc"]);
|
||||||
|
for(let key of Object.keys(result)){
|
||||||
|
try{
|
||||||
|
$("#"+key).val(result[key]);
|
||||||
|
} catch(e){
|
||||||
|
console.log(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
refresh();
|
refresh();
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
|||||||
{"id":142,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/6/2023, 3:38:35 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"string","exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":"test=123\nipLoc-djd=53283-53456-0-0\nareaId=53283\nmba_sid=16885856346417163685425076773.0\n__jdc=122270672\n__jdb=122270672.1.16885856346381587112207|1.1688585634\nmba_muid=16885856346381587112207\n__jdv=122270672%7Clocalhost%3A8074%7C-%7Creferral%7C-%7C1688585634639\n__jda=122270672.16885856346381587112207.1688585634.1688585634.1688585634.1"}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"p2h2i1dva8ljq4aje2","iframe":false,"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
{"id":142,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/6/2023, 4:08:31 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"string","exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":"test=123\nipLoc-djd=53283-53456-0-0\nareaId=53283\nmba_sid=16885856346417163685425076773.0\n__jdc=122270672\n__jdb=122270672.1.16885856346381587112207|1.1688585634\nmba_muid=16885856346381587112207\n__jdv=122270672%7Clocalhost%3A8074%7C-%7Creferral%7C-%7C1688585634639\n__jda=122270672.16885856346381587112207.1688585634.1688585634.1688585634.1"}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"p2h2i1dva8ljq4aje2","iframe":false,"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
1
ElectronJS/tasks/143.json
Normal file
1
ElectronJS/tasks/143.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id":143,"name":"中国知网","url":"https://chn.oversea.cnki.net/index/","links":"https://chn.oversea.cnki.net/index/","create_time":"7/6/2023, 4:50:52 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"csv","containJudge":false,"desc":"https://chn.oversea.cnki.net/index/","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://chn.oversea.cnki.net/index/","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://chn.oversea.cnki.net/index/"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://chn.oversea.cnki.net/index/","links":"https://chn.oversea.cnki.net/index/","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}
|
File diff suppressed because one or more lines are too long
6
ExecuteStage/.vscode/launch.json
vendored
6
ExecuteStage/.vscode/launch.json
vendored
@ -10,9 +10,9 @@
|
|||||||
"program": "${file}",
|
"program": "${file}",
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"justMyCode": true,
|
"justMyCode": true,
|
||||||
"args": ["--id", "[3]", "--read_type", "remote", "--headless", "0"]
|
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--id", "[2]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||||
// "args": ["--id", "[44]", "--headless", "0", "--user_data", "1"]
|
"args": ["--id", "[16]", "--headless", "0", "--user_data", "1"]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
@ -2,5 +2,5 @@
|
|||||||
"webserver_address": "http://localhost",
|
"webserver_address": "http://localhost",
|
||||||
"webserver_port": 8074,
|
"webserver_port": 8074,
|
||||||
"user_data_folder": "./user_data",
|
"user_data_folder": "./user_data",
|
||||||
"absolute_user_data_folder": "D:\\Documents\\Projects\\EasySpider\\.temp_to_pub\\EasySpider_windows_x64\\user_data"
|
"absolute_user_data_folder": "D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"
|
||||||
}
|
}
|
@ -31,6 +31,8 @@ from selenium.webdriver import ActionChains
|
|||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
import random
|
import random
|
||||||
|
# import pandas as pd
|
||||||
|
from openpyxl import load_workbook, Workbook
|
||||||
# import numpy
|
# import numpy
|
||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
@ -40,15 +42,16 @@ from PIL import Image
|
|||||||
# import uuid
|
# import uuid
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from myChrome import MyChrome
|
from myChrome import MyChrome
|
||||||
from utils import check_pause, download_image, get_output_code, isnull
|
from utils import check_pause, download_image, get_output_code, isnull, write_to_csv, write_to_excel
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
|
|
||||||
class BrowserThread(Thread):
|
class BrowserThread(Thread):
|
||||||
def __init__(self, browser_t, id, service, version, event):
|
def __init__(self, browser_t, id, service, version, event, config):
|
||||||
Thread.__init__(self)
|
Thread.__init__(self)
|
||||||
self.browser = browser_t
|
self.browser = browser_t
|
||||||
|
self.config = config
|
||||||
self.id = id
|
self.id = id
|
||||||
self.event = event
|
self.event = event
|
||||||
self.saveName = saveName
|
self.saveName = saveName
|
||||||
@ -65,6 +68,14 @@ class BrowserThread(Thread):
|
|||||||
WebDriverWait(self.browser, 10)
|
WebDriverWait(self.browser, 10)
|
||||||
self.browser.get('about:blank')
|
self.browser.get('about:blank')
|
||||||
self.procedure = service["graph"] # 程序执行流程
|
self.procedure = service["graph"] # 程序执行流程
|
||||||
|
try:
|
||||||
|
self.maxViewLength = service["maxViewLength"] # 最大显示长度
|
||||||
|
except:
|
||||||
|
self.maxViewLength = 15
|
||||||
|
try:
|
||||||
|
self.outputFormat = service["outputFormat"] # 输出格式
|
||||||
|
except:
|
||||||
|
self.outputFormat = "csv"
|
||||||
try:
|
try:
|
||||||
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||||
pass
|
pass
|
||||||
@ -88,6 +99,7 @@ class BrowserThread(Thread):
|
|||||||
self.links = list(
|
self.links = list(
|
||||||
filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
|
filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
|
||||||
self.OUTPUT = [] # 采集的数据
|
self.OUTPUT = [] # 采集的数据
|
||||||
|
self.OUTPUT.append([]) # 添加表头
|
||||||
self.containJudge = service["containJudge"] # 是否含有判断语句
|
self.containJudge = service["containJudge"] # 是否含有判断语句
|
||||||
tOut = service["outputParameters"] # 生成输出参数对象
|
tOut = service["outputParameters"] # 生成输出参数对象
|
||||||
self.outputParameters = {}
|
self.outputParameters = {}
|
||||||
@ -95,15 +107,19 @@ class BrowserThread(Thread):
|
|||||||
self.log = "" # 记下现在总共开了多少个标签页
|
self.log = "" # 记下现在总共开了多少个标签页
|
||||||
self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
|
self.history = {"index": 0, "handle": None} # 记录页面现在所以在的历史记录的位置
|
||||||
self.SAVED = False # 记录是否已经存储了
|
self.SAVED = False # 记录是否已经存储了
|
||||||
# 文件叠加的时候不添加表头
|
|
||||||
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
|
||||||
self.OUTPUT.append([]) # 添加表头
|
|
||||||
for para in tOut:
|
for para in tOut:
|
||||||
if para["name"] not in self.outputParameters.keys():
|
if para["name"] not in self.outputParameters.keys():
|
||||||
self.outputParameters[para["name"]] = ""
|
self.outputParameters[para["name"]] = ""
|
||||||
self.dataNotFoundKeys[para["name"]] = False
|
self.dataNotFoundKeys[para["name"]] = False
|
||||||
|
# 文件叠加的时候不添加表头
|
||||||
|
if self.outputFormat == "csv":
|
||||||
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv'):
|
||||||
self.OUTPUT[0].append(para["name"])
|
self.OUTPUT[0].append(para["name"])
|
||||||
|
elif self.outputFormat == "xlsx":
|
||||||
|
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.xlsx'):
|
||||||
|
self.OUTPUT[0].append(para["name"])
|
||||||
|
elif self.outputFormat == "mysql": # MySQL不需要表头
|
||||||
|
pass
|
||||||
self.urlId = 0 # 全局记录变量
|
self.urlId = 0 # 全局记录变量
|
||||||
self.preprocess() # 预处理,优化提取数据流程
|
self.preprocess() # 预处理,优化提取数据流程
|
||||||
|
|
||||||
@ -134,6 +150,8 @@ class BrowserThread(Thread):
|
|||||||
def run(self):
|
def run(self):
|
||||||
# 挨个执行程序
|
# 挨个执行程序
|
||||||
for i in range(len(self.links)):
|
for i in range(len(self.links)):
|
||||||
|
print("正在执行第", i + 1, "/ ", len(self.links), "个链接")
|
||||||
|
print("Executing link", i + 1, "/ ", len(self.links))
|
||||||
self.executeNode(0)
|
self.executeNode(0)
|
||||||
self.urlId = self.urlId + 1
|
self.urlId = self.urlId + 1
|
||||||
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
||||||
@ -167,11 +185,17 @@ class BrowserThread(Thread):
|
|||||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
|
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
|
||||||
file_obj.write(self.log)
|
file_obj.write(self.log)
|
||||||
file_obj.close()
|
file_obj.close()
|
||||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '.csv', 'a', encoding='utf-8-sig', newline="") as f:
|
if self.outputFormat == "csv":
|
||||||
f_csv = csv.writer(f)
|
file_name = "Data/Task_" + \
|
||||||
for line in self.OUTPUT:
|
str(self.id) + "/" + self.saveName + '.csv'
|
||||||
f_csv.writerow(line)
|
write_to_csv(file_name, self.OUTPUT)
|
||||||
f.close()
|
elif self.outputFormat == "xlsx":
|
||||||
|
file_name = "Data/Task_" + \
|
||||||
|
str(self.id) + "/" + self.saveName + '.xlsx'
|
||||||
|
write_to_excel(file_name, self.OUTPUT)
|
||||||
|
elif self.outputFormat == "mysql":
|
||||||
|
# write_to_mysql(self.config, )
|
||||||
|
pass
|
||||||
self.OUTPUT = []
|
self.OUTPUT = []
|
||||||
self.log = ""
|
self.log = ""
|
||||||
|
|
||||||
@ -302,7 +326,7 @@ class BrowserThread(Thread):
|
|||||||
line = []
|
line = []
|
||||||
for value in self.outputParameters.values():
|
for value in self.outputParameters.values():
|
||||||
line.append(value)
|
line.append(value)
|
||||||
print(value[:15], " ", end="")
|
print(value[:self.maxViewLength], " ", end="")
|
||||||
print("")
|
print("")
|
||||||
self.OUTPUT.append(line)
|
self.OUTPUT.append(line)
|
||||||
|
|
||||||
@ -728,6 +752,9 @@ class BrowserThread(Thread):
|
|||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print("Failed to load page: " + url)
|
||||||
|
self.recordLog('Failed to load page: ' + url)
|
||||||
try:
|
try:
|
||||||
self.history["index"] = self.browser.execute_script(
|
self.history["index"] = self.browser.execute_script(
|
||||||
"return history.length")
|
"return history.length")
|
||||||
@ -1184,7 +1211,7 @@ class BrowserThread(Thread):
|
|||||||
line = []
|
line = []
|
||||||
for value in self.outputParameters.values():
|
for value in self.outputParameters.values():
|
||||||
line.append(value)
|
line.append(value)
|
||||||
print(value[:15], " ", end="")
|
print(value[:self.maxViewLength], " ", end="")
|
||||||
print("")
|
print("")
|
||||||
self.OUTPUT.append(line)
|
self.OUTPUT.append(line)
|
||||||
# rt.end()
|
# rt.end()
|
||||||
@ -1279,12 +1306,15 @@ if __name__ == '__main__':
|
|||||||
# 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
|
# 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
|
||||||
# 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
|
# 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
|
||||||
# 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
|
# 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
|
||||||
if c.user_data:
|
try:
|
||||||
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
absolute_user_data_folder = config["absolute_user_data_folder"]
|
absolute_user_data_folder = config["absolute_user_data_folder"]
|
||||||
print("\nAbsolute_user_data_folder:",
|
print("\nAbsolute_user_data_folder:",
|
||||||
absolute_user_data_folder, "\n")
|
absolute_user_data_folder, "\n")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
if c.user_data:
|
||||||
option.add_argument(
|
option.add_argument(
|
||||||
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
||||||
option.add_argument("--profile-directory=Default")
|
option.add_argument("--profile-directory=Default")
|
||||||
@ -1371,7 +1401,8 @@ if __name__ == '__main__':
|
|||||||
print("过Cloudflare验证模式")
|
print("过Cloudflare验证模式")
|
||||||
event = Event()
|
event = Event()
|
||||||
event.set()
|
event.set()
|
||||||
thread = BrowserThread(browser_t, i, service, c.version, event)
|
thread = BrowserThread(browser_t, i, service,
|
||||||
|
c.version, event, config=config)
|
||||||
print("Thread with task id: ", i, " is created")
|
print("Thread with task id: ", i, " is created")
|
||||||
threads.append(thread)
|
threads.append(thread)
|
||||||
thread.start()
|
thread.start()
|
||||||
|
@ -6,3 +6,4 @@ Pillow
|
|||||||
pytesseract
|
pytesseract
|
||||||
keyboard
|
keyboard
|
||||||
undetected_chromedriver
|
undetected_chromedriver
|
||||||
|
openpyxl
|
@ -1,10 +1,21 @@
|
|||||||
# 控制流程的暂停和继续
|
# 控制流程的暂停和继续
|
||||||
|
|
||||||
|
import csv
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
import keyboard
|
import keyboard
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
import requests
|
import requests
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_url(url):
|
||||||
|
try:
|
||||||
|
result = urlparse(url)
|
||||||
|
return all([result.scheme, result.netloc])
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def check_pause(key, event):
|
def check_pause(key, event):
|
||||||
@ -28,7 +39,7 @@ def download_image(url, save_directory):
|
|||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||||
}
|
}
|
||||||
|
if is_valid_url(url):
|
||||||
# 发送 GET 请求获取图片数据
|
# 发送 GET 请求获取图片数据
|
||||||
response = requests.get(url, headers=headers)
|
response = requests.get(url, headers=headers)
|
||||||
|
|
||||||
@ -38,7 +49,8 @@ def download_image(url, save_directory):
|
|||||||
file_name = url.split('/')[-1].split("?")[0]
|
file_name = url.split('/')[-1].split("?")[0]
|
||||||
|
|
||||||
# 生成唯一的新文件名
|
# 生成唯一的新文件名
|
||||||
new_file_name = file_name + '_' + str(uuid.uuid4()) + '_' + file_name
|
new_file_name = file_name + '_' + \
|
||||||
|
str(uuid.uuid4()) + '_' + file_name
|
||||||
|
|
||||||
# 构建保存路径
|
# 构建保存路径
|
||||||
save_path = os.path.join(save_directory, new_file_name)
|
save_path = os.path.join(save_directory, new_file_name)
|
||||||
@ -49,6 +61,10 @@ def download_image(url, save_directory):
|
|||||||
|
|
||||||
print("图片已成功下载到:", save_path)
|
print("图片已成功下载到:", save_path)
|
||||||
print("The image has been successfully downloaded to:", save_path)
|
print("The image has been successfully downloaded to:", save_path)
|
||||||
|
else:
|
||||||
|
print("下载图片失败,请检查此图片链接是否有效:", url)
|
||||||
|
print(
|
||||||
|
"Failed to download image, please check if this image link is valid:", url)
|
||||||
else:
|
else:
|
||||||
print("下载图片失败,请检查此图片链接是否有效:", url)
|
print("下载图片失败,请检查此图片链接是否有效:", url)
|
||||||
print("Failed to download image, please check if this image link is valid:", url)
|
print("Failed to download image, please check if this image link is valid:", url)
|
||||||
@ -71,6 +87,29 @@ def isnull(s):
|
|||||||
return len(s) != 0
|
return len(s) != 0
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_csv(file_name, data):
|
||||||
|
with open(file_name, 'a', encoding='utf-8-sig', newline="") as f:
|
||||||
|
f_csv = csv.writer(f)
|
||||||
|
for line in data:
|
||||||
|
f_csv.writerow(line)
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_excel(file_name, data):
|
||||||
|
if os.path.exists(file_name):
|
||||||
|
# 加载现有的工作簿
|
||||||
|
wb = load_workbook(file_name)
|
||||||
|
ws = wb.active
|
||||||
|
else:
|
||||||
|
# 创建新的工作簿和工作表
|
||||||
|
wb = Workbook()
|
||||||
|
ws = wb.active
|
||||||
|
# 追加数据到工作表
|
||||||
|
for line in data:
|
||||||
|
ws.append(line)
|
||||||
|
# 保存工作簿
|
||||||
|
wb.save(file_name)
|
||||||
|
|
||||||
|
|
||||||
class Time:
|
class Time:
|
||||||
def __init__(self, type1=""):
|
def __init__(self, type1=""):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user