Split Line

This commit is contained in:
naibo 2023-12-19 22:02:45 +08:00
parent 0ec831dbf2
commit ae3ae40640
15 changed files with 53 additions and 12 deletions

View File

@ -11,3 +11,5 @@ config.json
mysql_config.json
**/Code
**/user_data
**/tasks
**/execution_instances

View File

@ -320,6 +320,11 @@
<!-- <option :value = 0>普通提取</option>-->
<!-- <option :value = 1>OCR提取</option>-->
<!-- </select>-->
<label style="margin-top: 15px">Wrap content to new line (set when collecting long articles and wanting to wrap):</label>
<select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
<option :value="0">No</option>
<option :value="1">Yes</option>
</select>
<label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
<option :value = 1>Yes</option>

View File

@ -320,6 +320,11 @@
<!-- <option :value = 0>普通提取</option>-->
<!-- <option :value = 1>OCR提取</option>-->
<!-- </select>-->
<label style="margin-top: 15px">是否将内容换行(长文章采集想要换行时设置):</label>
<select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
<option :value = 0></option>
<option :value = 1></option>
</select>
<label style="margin-top: 15px">是否保存该字段(只想把此字段当变量而不想保存时可选否):</label>
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
<option :value = 1></option>

View File

@ -81,6 +81,7 @@ function changeGetDataParameters(msg, i) {
msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
msg["parameters"][i]["splitLine"] = 0; //是否分割行
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":300,"name":"你永远难以忘记的pool party用windows线程池的新进程注入技术 - 先知社区","url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","create_time":"12/19/2023, 8:18:48 PM","update_time":"12/19/2023, 8:18:52 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://xz.aliyun.com/t/13184","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://xz.aliyun.com/t/13184","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://xz.aliyun.com/t/13184"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"原文链接Process Injection Using Windows Thread Pools | Safebreach"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p","iframe":false,"
wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p[1]","//p[contains(., '作者Alon Le')]","/html/body/div[last()-3]/div/div[last()-1]/div[last()-3]/div/div/div[last()-2]/p[last()-98]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"原文链接Process Injection Using Windows Thread Pools | Safebreach"}],"unique_index":"omov61ihv2rlqcba4hj","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

View File

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote"]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}

View File

@ -6,8 +6,8 @@ import platform
import shutil
import string
import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
@ -295,9 +295,13 @@ class BrowserThread(Thread):
except:
pass
try:
node["parameters"]["recordASField"] += param["recordASField"]
node["parameters"]["recordASField"] = param["recordASField"]
except:
node["parameters"]["recordASField"] += 1
node["parameters"]["recordASField"] = 1
try:
splitLine = int(param["splitLine"])
except:
param["splitLine"] = 0
if param["contentType"] == 8:
self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
@ -1754,6 +1758,10 @@ class BrowserThread(Thread):
download_image(self, content, "Data/Task_" +
str(self.id) + "/" + self.saveName + "/", element)
else: # 普通节点
if p["splitLine"] == 1:
text = extract_text_from_html(element.get_attribute('outerHTML'))
content = split_text_by_lines(text)
else:
content = element.text
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
if p["nodeType"] == 2:

View File

@ -9,4 +9,5 @@ pymysql==1.1.0
lxml==4.9.2
ddddocr==1.4.10
pynput==1.7.6
beautifulsoup4==4.12.2
undetected-chromedriver==3.4.7

View File

@ -7,6 +7,7 @@ import sys
import re
import time
import uuid
from bs4 import BeautifulSoup
# import keyboard
from openpyxl import Workbook, load_workbook
# import pandas as pd
@ -71,6 +72,22 @@ def is_valid_url(url):
def lowercase_tags_in_xpath(xpath):
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
def extract_text_from_html(html_content):
    """Extract the text content of an HTML fragment.

    ``<script>`` and ``<style>`` elements are removed before the text is
    collected, and a newline is appended inside every ``<p>`` element so that
    consecutive paragraphs end up on separate lines of the result.

    Args:
        html_content: HTML markup to parse. ``None`` or an empty value is
            accepted and yields an empty string.

    Returns:
        The text produced by ``BeautifulSoup.get_text()`` for the cleaned
        document, or ``""`` when there is nothing to parse.
    """
    # Guard against a missing/empty markup value instead of handing it to the
    # parser (None would make the parser raise).
    if not html_content:
        return ""
    soup = BeautifulSoup(html_content, 'lxml')  # use lxml as the parser
    # Drop script and style elements so their contents do not leak into the text.
    for unwanted in soup(["script", "style"]):
        unwanted.extract()
    for p_tag in soup.find_all("p"):
        # Both nodes are appended as the last children of the <p> element
        # (i.e. inside it, at its end); the "\n" string is what separates
        # paragraphs in the extracted text.
        p_tag.append(soup.new_tag("br"))
        p_tag.append("\n")
    return soup.get_text()
def split_text_by_lines(text):
    """Split text into lines, trim each one, and drop the blank ones.

    Args:
        text: The text to normalise.

    Returns:
        The non-empty lines of ``text``, each stripped of leading and trailing
        whitespace, joined back together with ``"\\n"``.
    """
    trimmed = (raw_line.strip() for raw_line in text.splitlines())
    # filter(None, ...) discards lines that are empty after stripping.
    return "\n".join(filter(None, trimmed))
def on_press_creator(press_time, event):
def on_press(key):
@ -139,7 +156,7 @@ def on_release_creator(event, press_time):
# time.sleep(1) # 每秒检查一次
def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
if param["nodeType"] <= 2:
if ignoreWaitElement or waitElement == "":
return True