Split Line

2025-04-19 18:59:52 +08:00 · 2023-12-19 22:02:45 +08:00 · 2023-12-19 22:02:45 +08:00 · ae3ae40640
commit ae3ae40640
parent 0ec831dbf2
15 changed files with 53 additions and 12 deletions
--- a/.temp_to_pub/.gitignore
+++ b/.temp_to_pub/.gitignore
@@ -11,3 +11,5 @@ config.json
 mysql_config.json
 **/Code
 **/user_data
+**/tasks
+**/execution_instances
--- a/ElectronJS/src/taskGrid/FlowChart.html
+++ b/ElectronJS/src/taskGrid/FlowChart.html
@@ -320,6 +320,11 @@
 <!--                            <option :value = 0>普通提取</option>-->
 <!--                            <option :value = 1>OCR提取</option>-->
 <!--                        </select>-->
+                       <label style="margin-top: 15px">Wrap content to new line (set when collecting long articles and wanting to wrap):</label>
+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
+                            <option :value="0">No</option>
+                            <option :value="1">Yes</option>
+                        </select>
                        <label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
                        <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
                            <option :value = 1>Yes</option>
--- a/ElectronJS/src/taskGrid/FlowChart_CN.html
+++ b/ElectronJS/src/taskGrid/FlowChart_CN.html
@@ -320,6 +320,11 @@
 <!--                            <option :value = 0>普通提取</option>-->
 <!--                            <option :value = 1>OCR提取</option>-->
 <!--                        </select>-->
+                        <label style="margin-top: 15px">是否将内容换行（长文章采集想要换行时设置）：</label>
+                        <select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
+                            <option :value = 0>否</option>
+                            <option :value = 1>是</option>
+                        </select>
                        <label style="margin-top: 15px">是否保存该字段（只想把此字段当变量而不想保存时可选否）：</label>
                        <select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
                            <option :value = 1>是</option>
--- a/ElectronJS/src/taskGrid/logic.js
+++ b/ElectronJS/src/taskGrid/logic.js
@@ -81,6 +81,7 @@ function changeGetDataParameters(msg, i) {
    msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
    msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
    msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
+    msg["parameters"][i]["splitLine"] = 0; //是否分割行
 }


--- a/ElectronJS/tasks/294.json
+++ b/ElectronJS/tasks/294.json
--- a/ElectronJS/tasks/295.json
+++ b/ElectronJS/tasks/295.json
--- a/ElectronJS/tasks/296.json
+++ b/ElectronJS/tasks/296.json
--- a/ElectronJS/tasks/297.json
+++ b/ElectronJS/tasks/297.json
--- a/ElectronJS/tasks/298.json
+++ b/ElectronJS/tasks/298.json
--- a/ElectronJS/tasks/299.json
+++ b/ElectronJS/tasks/299.json
--- a/ElectronJS/tasks/300.json
+++ b/ElectronJS/tasks/300.json
@@ -0,0 +1 @@
+{"id":300,"name":"你永远难以忘记的pool party：用windows线程池的新进程注入技术 - 先知社区","url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","create_time":"12/19/2023, 8:18:48 PM","update_time":"12/19/2023, 8:18:52 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://xz.aliyun.com/t/13184","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://xz.aliyun.com/t/13184","desc":"要采集的网址列表，多行以\\n分开","type":"text","exampleValue":"https://xz.aliyun.com/t/13184"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"原文链接：Process Injection Using Windows Thread Pools | Safebreach"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p","iframe":fals
e,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p[1]","//p[contains(., '作者：Alon Le')]","/html/body/div[last()-3]/div/div[last()-1]/div[last()-3]/div/div/div[last()-2]/p[last()-98]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"原文链接：Process Injection Using Windows Thread Pools | Safebreach"}],"unique_index":"omov61ihv2rlqcba4hj","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
--- a/ExecuteStage/.vscode/launch.json
+++ b/ExecuteStage/.vscode/launch.json
@@ -12,7 +12,7 @@
            "justMyCode": false,
            //  "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
            // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
-            "args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
+            "args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
        "--read_type", "remote"]
            // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
        }
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -6,8 +6,8 @@ import platform
 import shutil
 import string
 import undetected_chromedriver as uc
-from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
-    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
+from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
+    on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
 from myChrome import MyChrome
 from threading import Thread, Event
 from PIL import Image
@@ -295,9 +295,13 @@ class BrowserThread(Thread):
                    except:
                        pass
                    try:
-                        node["parameters"]["recordASField"] += param["recordASField"]
+                        node["parameters"]["recordASField"] = param["recordASField"]
                    except:
-                        node["parameters"]["recordASField"] += 1
+                        node["parameters"]["recordASField"] = 1
+                    try:
+                        splitLine = int(param["splitLine"])
+                    except:
+                        param["splitLine"] = 0
                    if param["contentType"] == 8:
                        self.print_and_log(
                            "默认的ddddocr识别功能如果觉得不好用，可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行；或者可以先设置采集内容类型为“元素截图”把图片保存下来，然后用自定义操作调用自己写的程序，程序的功能是读取这个最新生成的图片，然后用好用的模型，如PaddleOCR把图片识别出来，然后把返回值返回给程序作为参数输出。")
@@ -1754,6 +1758,10 @@ class BrowserThread(Thread):
                    download_image(self, content, "Data/Task_" +
                                   str(self.id) + "/" + self.saveName + "/", element)
            else:  # 普通节点
+                if p["splitLine"] == 1:
+                    text = extract_text_from_html(element.get_attribute('outerHTML'))
+                    content = split_text_by_lines(text)
+                else:
                    content = element.text
        elif p["contentType"] == 1:  # 只采集当期元素下的文本，不包括子元素
            if p["nodeType"] == 2:
--- a/ExecuteStage/requirements.txt
+++ b/ExecuteStage/requirements.txt
@@ -9,4 +9,5 @@ pymysql==1.1.0
 lxml==4.9.2
 ddddocr==1.4.10
 pynput==1.7.6
+beautifulsoup4==4.12.2
 undetected-chromedriver==3.4.7
--- a/ExecuteStage/utils.py
+++ b/ExecuteStage/utils.py
@@ -7,6 +7,7 @@ import sys
 import re
 import time
 import uuid
+from bs4 import BeautifulSoup
 # import keyboard
 from openpyxl import Workbook, load_workbook
 # import pandas as pd
@@ -71,6 +72,22 @@ def is_valid_url(url):
 def lowercase_tags_in_xpath(xpath):
    return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)

+# 提取HTML中的文本内容
+def extract_text_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'lxml') # 使用lxml作为解析器
+    for script in soup(["script", "style"]): # 去除脚本和样式内容
+        script.extract()
+    for p_tag in soup.find_all("p"):
+        p_tag.append(soup.new_tag("br")) # 在每个p标签后添加br标签
+        p_tag.append("\n") # 在每个p标签后添加换行符
+    text = soup.get_text()
+    return text
+
+# 将文本按照行分割并去除额外空白
+def split_text_by_lines(text):
+    lines = text.splitlines()
+    lines = [line.strip() for line in lines if line.strip()]  # 去除空行和首尾空格
+    return "\n".join(lines)

 def on_press_creator(press_time, event):
    def on_press(key):
@@ -139,7 +156,7 @@ def on_release_creator(event, press_time):
 #         time.sleep(1)  # 每秒检查一次

 def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
-    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
+    if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
        if param["nodeType"] <= 2:
            if ignoreWaitElement or waitElement == "":
                return True
				`@ -0,0 +1 @@`
				{"id":300,"name":"你永远难以忘记的pool party：用windows线程池的新进程注入技术 - 先知社区","url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","create_time":"12/19/2023, 8:18:48 PM","update_time":"12/19/2023, 8:18:52 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://xz.aliyun.com/t/13184","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://xz.aliyun.com/t/13184","desc":"要采集的网址列表，多行以\\n分开","type":"text","exampleValue":"https://xz.aliyun.com/t/13184"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"原文链接：Process Injection Using Windows Thread Pools \| Safebreach"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p","iframe":
false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p[1]","//p[contains(., '作者：Alon Le')]","/html/body/div[last()-3]/div/div[last()-1]/div[last()-3]/div/div/div[last()-2]/p[last()-98]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"原文链接：Process Injection Using Windows Thread Pools \| Safebreach"}],"unique_index":"omov61ihv2rlqcba4hj","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}