Deal with data:

This commit is contained in:
naibo 2023-11-23 04:19:18 +08:00
parent d0822c805b
commit 4025e255a0
15 changed files with 40 additions and 10 deletions

View File

@ -0,0 +1 @@
{"id":126,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -323,8 +323,11 @@ exports.start = function(port = 8074) {
task = JSON.parse(task); task = JSON.parse(task);
try{ try{
task["links"] = data["urlList_0"]; task["links"] = data["urlList_0"];
}catch(error){ if (tasks["links"] == undefined) {
console.log(error); task["links"] = "about:blank";
}
} catch(error) {
task["links"] = "about:blank";
} }
for (const [key, value] of Object.entries(data)) { for (const [key, value] of Object.entries(data)) {
for (let i = 0; i < task["inputParameters"].length; i++) { for (let i = 0; i < task["inputParameters"].length; i++) {

View File

@ -359,7 +359,7 @@ function saveService(type) {
let outputNames = []; let outputNames = [];
let inputIndex = 0; let inputIndex = 0;
let outputIndex = 0; let outputIndex = 0;
let links = ""; //记录所有的link let links = "about:blank"; //记录所有的link
let containJudge = false; //是否含有判断语句 let containJudge = false; //是否含有判断语句
let saveThreshold = parseInt($("#saveThreshold").val()); let saveThreshold = parseInt($("#saveThreshold").val());
let cloudflare = parseInt($("#cloudflare").val()); let cloudflare = parseInt($("#cloudflare").val());

File diff suppressed because one or more lines are too long

View File

@ -12,7 +12,7 @@
"justMyCode": false, "justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[125]", "--headless", "0", "--user_data", "0", "--keyboard", "0"] "args": ["--id", "[134]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
} }
] ]
} }

View File

@ -337,7 +337,7 @@ class BrowserThread(Thread):
if "urlList_0" in data.keys(): if "urlList_0" in data.keys():
self.links = data["urlList_0"] self.links = data["urlList_0"]
except: except:
pass self.links = "about:blank"
task = self.service task = self.service
for key, value in data.items(): for key, value in data.items():
for i in range(len(task["inputParameters"])): for i in range(len(task["inputParameters"])):
@ -987,7 +987,8 @@ class BrowserThread(Thread):
self.print_and_log("Loop element not found: ", self.print_and_log("Loop element not found: ",
xpath) xpath)
self.print_and_log("找不到循环元素: ", xpath) self.print_and_log("找不到循环元素: ", xpath)
for index in range(len(elements)): index = 0
while index < len(elements):
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作 for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
self.executeNode(i, elements[index], self.executeNode(i, elements[index],
xpath, index) xpath, index)
@ -1033,16 +1034,22 @@ class BrowserThread(Thread):
except: except:
pass pass
if self.browser.current_url.startswith("data:"): if self.browser.current_url.startswith("data:"):
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步 try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2) time.sleep(2)
elements = self.browser.find_elements(By.XPATH, elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"]) xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件 if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int( output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"]) node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
index = index + 1
except NoSuchElementException: except NoSuchElementException:
self.print_and_log("Loop element not found: ", xpath) self.print_and_log("Loop element not found: ", xpath)
self.print_and_log("找不到循环元素: ", xpath) self.print_and_log("找不到循环元素: ", xpath)
@ -1050,7 +1057,11 @@ class BrowserThread(Thread):
raise raise
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
# 千万不要忘了分割!! # 千万不要忘了分割!!
for path in node["parameters"]["pathList"].split("\n"): paths = node["parameters"]["pathList"].split("\n")
# for path in node["parameters"]["pathList"].split("\n"):
index = 0
while index < len(paths):
path = paths[index]
try: try:
path = replace_field_values( path = replace_field_values(
path, self.outputParameters, self) path, self.outputParameters, self)
@ -1100,10 +1111,15 @@ class BrowserThread(Thread):
except: except:
pass pass
if self.browser.current_url.startswith("data:"): if self.browser.current_url.startswith("data:"):
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步 try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2) time.sleep(2)
elements = self.browser.find_elements(By.XPATH, elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"]) xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
except NoSuchElementException: except NoSuchElementException:
self.print_and_log("Loop element not found: ", path) self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素: ", path) self.print_and_log("找不到循环元素: ", path)
@ -1116,6 +1132,7 @@ class BrowserThread(Thread):
code = get_output_code(output) code = get_output_code(output)
if code <= 0: if code <= 0:
break break
index = index + 1
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表 elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
textList = node["parameters"]["textList"].split("\n") textList = node["parameters"]["textList"].split("\n")
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量 if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量