mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-19 18:59:52 +08:00
Deal with data:
This commit is contained in:
parent
d0822c805b
commit
4025e255a0
@ -0,0 +1 @@
|
|||||||
|
{"id":126,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_windows_x64/tasks/235.json
Normal file
1
.temp_to_pub/EasySpider_windows_x64/tasks/235.json
Normal file
File diff suppressed because one or more lines are too long
@ -323,8 +323,11 @@ exports.start = function(port = 8074) {
|
|||||||
task = JSON.parse(task);
|
task = JSON.parse(task);
|
||||||
try{
|
try{
|
||||||
task["links"] = data["urlList_0"];
|
task["links"] = data["urlList_0"];
|
||||||
}catch(error){
|
if (tasks["links"] == undefined) {
|
||||||
console.log(error);
|
task["links"] = "about:blank";
|
||||||
|
}
|
||||||
|
} catch(error) {
|
||||||
|
task["links"] = "about:blank";
|
||||||
}
|
}
|
||||||
for (const [key, value] of Object.entries(data)) {
|
for (const [key, value] of Object.entries(data)) {
|
||||||
for (let i = 0; i < task["inputParameters"].length; i++) {
|
for (let i = 0; i < task["inputParameters"].length; i++) {
|
||||||
|
@ -359,7 +359,7 @@ function saveService(type) {
|
|||||||
let outputNames = [];
|
let outputNames = [];
|
||||||
let inputIndex = 0;
|
let inputIndex = 0;
|
||||||
let outputIndex = 0;
|
let outputIndex = 0;
|
||||||
let links = ""; //记录所有的link
|
let links = "about:blank"; //记录所有的link
|
||||||
let containJudge = false; //是否含有判断语句
|
let containJudge = false; //是否含有判断语句
|
||||||
let saveThreshold = parseInt($("#saveThreshold").val());
|
let saveThreshold = parseInt($("#saveThreshold").val());
|
||||||
let cloudflare = parseInt($("#cloudflare").val());
|
let cloudflare = parseInt($("#cloudflare").val());
|
||||||
|
File diff suppressed because one or more lines are too long
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
|||||||
"justMyCode": false,
|
"justMyCode": false,
|
||||||
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||||
"args": ["--id", "[125]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
|
"args": ["--id", "[134]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
@ -337,7 +337,7 @@ class BrowserThread(Thread):
|
|||||||
if "urlList_0" in data.keys():
|
if "urlList_0" in data.keys():
|
||||||
self.links = data["urlList_0"]
|
self.links = data["urlList_0"]
|
||||||
except:
|
except:
|
||||||
pass
|
self.links = "about:blank"
|
||||||
task = self.service
|
task = self.service
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
for i in range(len(task["inputParameters"])):
|
for i in range(len(task["inputParameters"])):
|
||||||
@ -987,7 +987,8 @@ class BrowserThread(Thread):
|
|||||||
self.print_and_log("Loop element not found: ",
|
self.print_and_log("Loop element not found: ",
|
||||||
xpath)
|
xpath)
|
||||||
self.print_and_log("找不到循环元素: ", xpath)
|
self.print_and_log("找不到循环元素: ", xpath)
|
||||||
for index in range(len(elements)):
|
index = 0
|
||||||
|
while index < len(elements):
|
||||||
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
|
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
|
||||||
self.executeNode(i, elements[index],
|
self.executeNode(i, elements[index],
|
||||||
xpath, index)
|
xpath, index)
|
||||||
@ -1033,16 +1034,22 @@ class BrowserThread(Thread):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if self.browser.current_url.startswith("data:"):
|
if self.browser.current_url.startswith("data:"):
|
||||||
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
|
try:
|
||||||
|
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
|
||||||
|
except: # 超时的情况下
|
||||||
|
pass
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
elements = self.browser.find_elements(By.XPATH,
|
elements = self.browser.find_elements(By.XPATH,
|
||||||
xpath, iframe=node["parameters"]["iframe"])
|
xpath, iframe=node["parameters"]["iframe"])
|
||||||
|
if index > 0:
|
||||||
|
index -= 1 # 如果是data:开头的网址,就要重试一次
|
||||||
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
|
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
|
||||||
output = self.execute_code(int(
|
output = self.execute_code(int(
|
||||||
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
|
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
|
||||||
code = get_output_code(output)
|
code = get_output_code(output)
|
||||||
if code <= 0:
|
if code <= 0:
|
||||||
break
|
break
|
||||||
|
index = index + 1
|
||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.print_and_log("Loop element not found: ", xpath)
|
self.print_and_log("Loop element not found: ", xpath)
|
||||||
self.print_and_log("找不到循环元素: ", xpath)
|
self.print_and_log("找不到循环元素: ", xpath)
|
||||||
@ -1050,7 +1057,11 @@ class BrowserThread(Thread):
|
|||||||
raise
|
raise
|
||||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||||
# 千万不要忘了分割!!
|
# 千万不要忘了分割!!
|
||||||
for path in node["parameters"]["pathList"].split("\n"):
|
paths = node["parameters"]["pathList"].split("\n")
|
||||||
|
# for path in node["parameters"]["pathList"].split("\n"):
|
||||||
|
index = 0
|
||||||
|
while index < len(paths):
|
||||||
|
path = paths[index]
|
||||||
try:
|
try:
|
||||||
path = replace_field_values(
|
path = replace_field_values(
|
||||||
path, self.outputParameters, self)
|
path, self.outputParameters, self)
|
||||||
@ -1100,10 +1111,15 @@ class BrowserThread(Thread):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
if self.browser.current_url.startswith("data:"):
|
if self.browser.current_url.startswith("data:"):
|
||||||
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
|
try:
|
||||||
|
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
|
||||||
|
except: # 超时的情况下
|
||||||
|
pass
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
elements = self.browser.find_elements(By.XPATH,
|
elements = self.browser.find_elements(By.XPATH,
|
||||||
xpath, iframe=node["parameters"]["iframe"])
|
xpath, iframe=node["parameters"]["iframe"])
|
||||||
|
if index > 0:
|
||||||
|
index -= 1 # 如果是data:开头的网址,就要重试一次
|
||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.print_and_log("Loop element not found: ", path)
|
self.print_and_log("Loop element not found: ", path)
|
||||||
self.print_and_log("找不到循环元素: ", path)
|
self.print_and_log("找不到循环元素: ", path)
|
||||||
@ -1116,6 +1132,7 @@ class BrowserThread(Thread):
|
|||||||
code = get_output_code(output)
|
code = get_output_code(output)
|
||||||
if code <= 0:
|
if code <= 0:
|
||||||
break
|
break
|
||||||
|
index = index + 1
|
||||||
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
|
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
|
||||||
textList = node["parameters"]["textList"].split("\n")
|
textList = node["parameters"]["textList"].split("\n")
|
||||||
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量
|
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量
|
||||||
|
Loading…
x
Reference in New Issue
Block a user