mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-19 18:59:52 +08:00
Split Line
This commit is contained in:
parent
0ec831dbf2
commit
ae3ae40640
2
.temp_to_pub/.gitignore
vendored
2
.temp_to_pub/.gitignore
vendored
@ -11,3 +11,5 @@ config.json
|
|||||||
mysql_config.json
|
mysql_config.json
|
||||||
**/Code
|
**/Code
|
||||||
**/user_data
|
**/user_data
|
||||||
|
**/tasks
|
||||||
|
**/execution_instances
|
||||||
|
@ -320,6 +320,11 @@
|
|||||||
<!-- <option :value = 0>普通提取</option>-->
|
<!-- <option :value = 0>普通提取</option>-->
|
||||||
<!-- <option :value = 1>OCR提取</option>-->
|
<!-- <option :value = 1>OCR提取</option>-->
|
||||||
<!-- </select>-->
|
<!-- </select>-->
|
||||||
|
<label style="margin-top: 15px">Wrap content to new line (set when collecting long articles and wanting to wrap):</label>
|
||||||
|
<select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
|
||||||
|
<option :value="0">No</option>
|
||||||
|
<option :value="1">Yes</option>
|
||||||
|
</select>
|
||||||
<label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
|
<label style="margin-top: 15px">Whether to save this field: (Choose 'No' if you only want to treat this field as a variable and not save it):</label>
|
||||||
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
|
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
|
||||||
<option :value = 1>Yes</option>
|
<option :value = 1>Yes</option>
|
||||||
|
@ -320,6 +320,11 @@
|
|||||||
<!-- <option :value = 0>普通提取</option>-->
|
<!-- <option :value = 0>普通提取</option>-->
|
||||||
<!-- <option :value = 1>OCR提取</option>-->
|
<!-- <option :value = 1>OCR提取</option>-->
|
||||||
<!-- </select>-->
|
<!-- </select>-->
|
||||||
|
<label style="margin-top: 15px">是否将内容换行(长文章采集想要换行时设置):</label>
|
||||||
|
<select v-model='params.parameters[paraIndex]["splitLine"]' class="form-control">
|
||||||
|
<option :value = 0>否</option>
|
||||||
|
<option :value = 1>是</option>
|
||||||
|
</select>
|
||||||
<label style="margin-top: 15px">是否保存该字段(只想把此字段当变量而不想保存时可选否):</label>
|
<label style="margin-top: 15px">是否保存该字段(只想把此字段当变量而不想保存时可选否):</label>
|
||||||
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
|
<select v-model='params.parameters[paraIndex]["recordASField"]' class="form-control">
|
||||||
<option :value = 1>是</option>
|
<option :value = 1>是</option>
|
||||||
|
@ -81,6 +81,7 @@ function changeGetDataParameters(msg, i) {
|
|||||||
msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
|
msg["parameters"][i]["afterJS"] = ""; //执行后执行的js
|
||||||
msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
|
msg["parameters"][i]["afterJSWaitTime"] = 0; //执行后js等待时间
|
||||||
msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
|
msg["parameters"][i]["downloadPic"] = 0; //是否下载图片
|
||||||
|
msg["parameters"][i]["splitLine"] = 0; //是否分割行
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/299.json
Normal file
1
ElectronJS/tasks/299.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/300.json
Normal file
1
ElectronJS/tasks/300.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"id":300,"name":"你永远难以忘记的pool party:用windows线程池的新进程注入技术 - 先知社区","url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","create_time":"12/19/2023, 8:18:48 PM","update_time":"12/19/2023, 8:18:52 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://xz.aliyun.com/t/13184","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://xz.aliyun.com/t/13184","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://xz.aliyun.com/t/13184"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"原文链接:Process Injection Using Windows Thread Pools | Safebreach"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://xz.aliyun.com/t/13184","links":"https://xz.aliyun.com/t/13184","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p","iframe":false
,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/p[1]","//p[contains(., '作者:Alon Le')]","/html/body/div[last()-3]/div/div[last()-1]/div[last()-3]/div/div/div[last()-2]/p[last()-98]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"原文链接:Process Injection Using Windows Thread Pools | Safebreach"}],"unique_index":"omov61ihv2rlqcba4hj","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
|||||||
"justMyCode": false,
|
"justMyCode": false,
|
||||||
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||||
"args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
"args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
||||||
"--read_type", "remote"]
|
"--read_type", "remote"]
|
||||||
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,8 @@ import platform
|
|||||||
import shutil
|
import shutil
|
||||||
import string
|
import string
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||||
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
|
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||||
from myChrome import MyChrome
|
from myChrome import MyChrome
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -295,9 +295,13 @@ class BrowserThread(Thread):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
node["parameters"]["recordASField"] += param["recordASField"]
|
node["parameters"]["recordASField"] = param["recordASField"]
|
||||||
except:
|
except:
|
||||||
node["parameters"]["recordASField"] += 1
|
node["parameters"]["recordASField"] = 1
|
||||||
|
try:
|
||||||
|
splitLine = int(param["splitLine"])
|
||||||
|
except:
|
||||||
|
param["splitLine"] = 0
|
||||||
if param["contentType"] == 8:
|
if param["contentType"] == 8:
|
||||||
self.print_and_log(
|
self.print_and_log(
|
||||||
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||||
@ -1754,7 +1758,11 @@ class BrowserThread(Thread):
|
|||||||
download_image(self, content, "Data/Task_" +
|
download_image(self, content, "Data/Task_" +
|
||||||
str(self.id) + "/" + self.saveName + "/", element)
|
str(self.id) + "/" + self.saveName + "/", element)
|
||||||
else: # 普通节点
|
else: # 普通节点
|
||||||
content = element.text
|
if p["splitLine"] == 1:
|
||||||
|
text = extract_text_from_html(element.get_attribute('outerHTML'))
|
||||||
|
content = split_text_by_lines(text)
|
||||||
|
else:
|
||||||
|
content = element.text
|
||||||
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
|
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
|
||||||
if p["nodeType"] == 2:
|
if p["nodeType"] == 2:
|
||||||
if element.get_attribute("href") != None:
|
if element.get_attribute("href") != None:
|
||||||
|
@ -9,4 +9,5 @@ pymysql==1.1.0
|
|||||||
lxml==4.9.2
|
lxml==4.9.2
|
||||||
ddddocr==1.4.10
|
ddddocr==1.4.10
|
||||||
pynput==1.7.6
|
pynput==1.7.6
|
||||||
|
beautifulsoup4==4.12.2
|
||||||
undetected-chromedriver==3.4.7
|
undetected-chromedriver==3.4.7
|
||||||
|
@ -7,6 +7,7 @@ import sys
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
# import keyboard
|
# import keyboard
|
||||||
from openpyxl import Workbook, load_workbook
|
from openpyxl import Workbook, load_workbook
|
||||||
# import pandas as pd
|
# import pandas as pd
|
||||||
@ -71,6 +72,22 @@ def is_valid_url(url):
|
|||||||
def lowercase_tags_in_xpath(xpath):
|
def lowercase_tags_in_xpath(xpath):
|
||||||
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
|
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
|
||||||
|
|
||||||
|
# 提取HTML中的文本内容
|
||||||
|
def extract_text_from_html(html_content):
    """Return the text content of an HTML fragment with paragraph breaks kept.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    ``<script>`` and ``<style>`` elements are removed so their source does
    not leak into the result. Every ``<br>`` is turned into a newline and a
    newline is appended inside every ``<p>``, so that paragraph and line
    boundaries survive ``get_text()`` and can later be split on.

    Args:
        html_content: HTML markup as a string (e.g. an element's outerHTML).
            ``None`` or an empty string is accepted.

    Returns:
        The extracted text; an empty string when there is no markup.
    """
    # Nothing to parse: avoid handing None/"" to the parser.
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, 'lxml')

    # Drop script and style content entirely.
    for hidden in soup(["script", "style"]):
        hidden.extract()

    # A <br> tag carries no text of its own, so it must be replaced by an
    # actual newline character for the break to appear in get_text().
    for br_tag in soup.find_all("br"):
        br_tag.replace_with("\n")

    # Terminate each paragraph with a newline so consecutive paragraphs do
    # not run together in the extracted text.
    for p_tag in soup.find_all("p"):
        p_tag.append("\n")

    return soup.get_text()
|
||||||
|
|
||||||
|
# 将文本按照行分割并去除额外空白
|
||||||
|
def split_text_by_lines(text):
    """Normalize text to one non-empty, trimmed line per row.

    The input is split on line boundaries, each line has its leading and
    trailing whitespace removed, and lines that are empty after trimming
    are dropped.

    Args:
        text: The text to normalize. ``None`` or an empty string is accepted.

    Returns:
        The remaining lines joined with ``"\\n"``; an empty string when no
        non-blank line exists.
    """
    if not text:
        return ""
    # Strip each line once, then filter on the stripped value.
    trimmed = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(line for line in trimmed if line)
|
||||||
|
|
||||||
def on_press_creator(press_time, event):
|
def on_press_creator(press_time, event):
|
||||||
def on_press(key):
|
def on_press(key):
|
||||||
@ -139,7 +156,7 @@ def on_release_creator(event, press_time):
|
|||||||
# time.sleep(1) # 每秒检查一次
|
# time.sleep(1) # 每秒检查一次
|
||||||
|
|
||||||
def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
|
def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
|
||||||
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
|
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
|
||||||
if param["nodeType"] <= 2:
|
if param["nodeType"] <= 2:
|
||||||
if ignoreWaitElement or waitElement == "":
|
if ignoreWaitElement or waitElement == "":
|
||||||
return True
|
return True
|
||||||
|
Loading…
x
Reference in New Issue
Block a user