Deal with data:

2025-04-19 18:59:52 +08:00 · 2023-11-23 04:19:18 +08:00 · 2023-11-23 04:19:18 +08:00 · 4025e255a0
commit 4025e255a0
parent d0822c805b
15 changed files with 40 additions and 10 deletions
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/126.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/126.json
@@ -0,0 +1 @@
 {"id":126,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表，多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"w
aitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/127.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/127.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/128.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/128.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/129.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/129.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/130.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/130.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/131.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/131.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/132.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/132.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/133.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/133.json
--- a/.temp_to_pub/EasySpider_windows_x64/execution_instances/134.json
+++ b/.temp_to_pub/EasySpider_windows_x64/execution_instances/134.json
--- a/.temp_to_pub/EasySpider_windows_x64/tasks/235.json
+++ b/.temp_to_pub/EasySpider_windows_x64/tasks/235.json
--- a/ElectronJS/server.js
+++ b/ElectronJS/server.js
@@ -323,8 +323,11 @@ exports.start = function(port = 8074) {
                task = JSON.parse(task);
                try{
                    task["links"] = data["urlList_0"];
-                }catch(error){
+                    if (tasks["links"] == undefined) {
-                    console.log(error);
+                        task["links"] = "about:blank";
+                    }
+                } catch(error) {
+                    task["links"] = "about:blank";
                }
                for (const [key, value] of Object.entries(data)) {
                    for (let i = 0; i < task["inputParameters"].length; i++) {
--- a/ElectronJS/src/taskGrid/logic.js
+++ b/ElectronJS/src/taskGrid/logic.js
@@ -359,7 +359,7 @@ function saveService(type) {
        let outputNames = [];
        let inputIndex = 0;
        let outputIndex = 0;
-        let links = ""; //记录所有的link
+        let links = "about:blank"; //记录所有的link
        let containJudge = false; //是否含有判断语句
        let saveThreshold = parseInt($("#saveThreshold").val());
        let cloudflare = parseInt($("#cloudflare").val());
--- a/ElectronJS/tasks/209.json
+++ b/ElectronJS/tasks/209.json
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
            "justMyCode": false,
            //  "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
            // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--id", "[125]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
+            "args": ["--id", "[134]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
        }
    ]
 }
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -337,7 +337,7 @@ class BrowserThread(Thread):
            if "urlList_0" in data.keys():
                self.links = data["urlList_0"]
        except:
-            pass
+            self.links = "about:blank"
        task = self.service
        for key, value in data.items():
            for i in range(len(task["inputParameters"])):
@@ -987,7 +987,8 @@ class BrowserThread(Thread):
                    self.print_and_log("Loop element not found: ",
                                       xpath)
                    self.print_and_log("找不到循环元素: ", xpath)
-                for index in range(len(elements)):
+                index = 0
+                while index < len(elements):
                    for i in node["sequence"]:  # 挨个顺序执行循环里所有的操作
                        self.executeNode(i, elements[index],
                                         xpath, index)
@@ -1033,16 +1034,22 @@ class BrowserThread(Thread):
                        except:
                            pass
                    if self.browser.current_url.startswith("data:"):
-                        self.browser.execute_script("history.go(1)") # 如果是data:开头的网址，就前进一步
+                        try:
+                            self.browser.execute_script("history.go(1)") # 如果是data:开头的网址，就前进一步
+                        except: # 超时的情况下
+                            pass
                        time.sleep(2)
                        elements = self.browser.find_elements(By.XPATH,
                                                      xpath, iframe=node["parameters"]["iframe"])
+                        if index > 0:
+                            index -= 1 # 如果是data:开头的网址，就要重试一次
                    if int(node["parameters"]["breakMode"]) > 0:  # 如果设置了退出循环的脚本条件
                        output = self.execute_code(int(
                            node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
                        code = get_output_code(output)
                        if code <= 0:
                            break
+                    index = index + 1
            except NoSuchElementException:
                self.print_and_log("Loop element not found: ", xpath)
                self.print_and_log("找不到循环元素: ", xpath)
@@ -1050,7 +1057,11 @@ class BrowserThread(Thread):
                raise
        elif int(node["parameters"]["loopType"]) == 2:  # 固定元素列表
            # 千万不要忘了分割！！
-            for path in node["parameters"]["pathList"].split("\n"):
+            paths = node["parameters"]["pathList"].split("\n")
+            # for path in node["parameters"]["pathList"].split("\n"):
+            index = 0
+            while index < len(paths):
+                path = paths[index]
                try:
                    path = replace_field_values(
                        path, self.outputParameters, self)
@@ -1100,10 +1111,15 @@ class BrowserThread(Thread):
                        except:
                            pass
                    if self.browser.current_url.startswith("data:"):
-                        self.browser.execute_script("history.go(1)") # 如果是data:开头的网址，就前进一步
+                        try:
+                            self.browser.execute_script("history.go(1)") # 如果是data:开头的网址，就前进一步
+                        except: # 超时的情况下
+                            pass
                        time.sleep(2)
                        elements = self.browser.find_elements(By.XPATH,
                                                      xpath, iframe=node["parameters"]["iframe"])
+                        if index > 0:
+                            index -= 1 # 如果是data:开头的网址，就要重试一次
                except NoSuchElementException:
                    self.print_and_log("Loop element not found: ", path)
                    self.print_and_log("找不到循环元素: ", path)
@@ -1116,6 +1132,7 @@ class BrowserThread(Thread):
                    code = get_output_code(output)
                    if code <= 0:
                        break
+                index = index + 1
        elif int(node["parameters"]["loopType"]) == 3:  # 固定文本列表
            textList = node["parameters"]["textList"].split("\n")
            if len(textList) == 1:  # 如果固定文本列表只有一行，现在就可以替换变量
		`@ -0,0 +1 @@`
							{"id":126,"name":"【软科排名】-中国最好学科排名\|最权威的大学学科\|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表，多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wa
it":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}