Deal with data:

This commit is contained in:
naibo 2023-11-23 04:19:18 +08:00
parent d0822c805b
commit 4025e255a0
15 changed files with 40 additions and 10 deletions

View File

@ -0,0 +1 @@
{"id":126,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"wa
itType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -323,8 +323,11 @@ exports.start = function(port = 8074) {
task = JSON.parse(task);
// Take the start URL(s) for the task from the caller-supplied parameters,
// falling back to a blank page when none were provided or the lookup fails.
try {
    task["links"] = data["urlList_0"];
    // Inspect the value that was just assigned on `task`. The previous check
    // read `tasks`, a different identifier, so it never tested this value and
    // could throw, which sent every request through the catch block and
    // replaced a valid link with "about:blank".
    if (task["links"] == undefined) {
        task["links"] = "about:blank";
    }
} catch (error) {
    console.log(error);
    task["links"] = "about:blank";
}
for (const [key, value] of Object.entries(data)) {
for (let i = 0; i < task["inputParameters"].length; i++) {

View File

@ -359,7 +359,7 @@ function saveService(type) {
let outputNames = [];
let inputIndex = 0;
let outputIndex = 0;
let links = ""; //记录所有的link
let links = "about:blank"; //记录所有的link
let containJudge = false; //是否含有判断语句
let saveThreshold = parseInt($("#saveThreshold").val());
let cloudflare = parseInt($("#cloudflare").val());

File diff suppressed because one or more lines are too long

View File

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[125]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
"args": ["--id", "[134]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
}
]
}

View File

@ -337,7 +337,7 @@ class BrowserThread(Thread):
if "urlList_0" in data.keys():
self.links = data["urlList_0"]
except:
pass
self.links = "about:blank"
task = self.service
for key, value in data.items():
for i in range(len(task["inputParameters"])):
@ -987,7 +987,8 @@ class BrowserThread(Thread):
self.print_and_log("Loop element not found: ",
xpath)
self.print_and_log("找不到循环元素: ", xpath)
for index in range(len(elements)):
index = 0
while index < len(elements):
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
self.executeNode(i, elements[index],
xpath, index)
@ -1033,16 +1034,22 @@ class BrowserThread(Thread):
except:
pass
if self.browser.current_url.startswith("data:"):
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
code = get_output_code(output)
if code <= 0:
break
index = index + 1
except NoSuchElementException:
self.print_and_log("Loop element not found: ", xpath)
self.print_and_log("找不到循环元素: ", xpath)
@ -1050,7 +1057,11 @@ class BrowserThread(Thread):
raise
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
# 千万不要忘了分割!!
for path in node["parameters"]["pathList"].split("\n"):
paths = node["parameters"]["pathList"].split("\n")
# for path in node["parameters"]["pathList"].split("\n"):
index = 0
while index < len(paths):
path = paths[index]
try:
path = replace_field_values(
path, self.outputParameters, self)
@ -1100,10 +1111,15 @@ class BrowserThread(Thread):
except:
pass
if self.browser.current_url.startswith("data:"):
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
except NoSuchElementException:
self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素: ", path)
@ -1116,6 +1132,7 @@ class BrowserThread(Thread):
code = get_output_code(output)
if code <= 0:
break
index = index + 1
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
textList = node["parameters"]["textList"].split("\n")
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量