mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-20 02:24:56 +08:00
Compare commits
154 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
fc5aa8368b | ||
![]() |
793f028a00 | ||
![]() |
ae22977143 | ||
![]() |
541b3c13d2 | ||
![]() |
a6192b730c | ||
![]() |
d39218f5fd | ||
![]() |
a94c45b36d | ||
![]() |
0e8aba6b51 | ||
![]() |
e42ad07d80 | ||
![]() |
2f6344d00b | ||
![]() |
bfa6c0de76 | ||
![]() |
b590cc22c5 | ||
![]() |
d69adacbd1 | ||
![]() |
15654da7eb | ||
![]() |
967f5b8033 | ||
![]() |
aa419ee845 | ||
![]() |
f005e48700 | ||
![]() |
4e96ed7d50 | ||
![]() |
e3fecc8926 | ||
![]() |
119cb99711 | ||
![]() |
f43bdd236d | ||
![]() |
56f0847500 | ||
![]() |
0df6cebd18 | ||
![]() |
4b42f6300c | ||
![]() |
2cf33794f1 | ||
![]() |
9efd3b6efe | ||
![]() |
ad956be10d | ||
![]() |
01de17d471 | ||
![]() |
333dcd3ff4 | ||
![]() |
555f02815c | ||
![]() |
34ed41110a | ||
![]() |
32459b622d | ||
![]() |
02cd8599b0 | ||
![]() |
2feede55db | ||
![]() |
33dda444d7 | ||
![]() |
d7ccb22d01 | ||
![]() |
f7a842eed6 | ||
![]() |
ea6fb049f5 | ||
![]() |
5216ffba82 | ||
![]() |
4f0851e361 | ||
![]() |
7bb9d5a374 | ||
![]() |
c56e87120d | ||
![]() |
5180f47b70 | ||
![]() |
b4d7ddf5cb | ||
![]() |
2031b09297 | ||
![]() |
cc9a8082da | ||
![]() |
3daf5e8c21 | ||
![]() |
8f5d7a3a52 | ||
![]() |
ee4a077630 | ||
![]() |
3fe6f42366 | ||
![]() |
eb3b578745 | ||
![]() |
4ca5333f8b | ||
![]() |
b50d4eae3f | ||
![]() |
998a1ddb19 | ||
![]() |
07563bc750 | ||
![]() |
7b5ccf4a78 | ||
![]() |
209235de8d | ||
![]() |
72529c0675 | ||
![]() |
081c49357e | ||
![]() |
b611ddb6cd | ||
![]() |
abfac8c342 | ||
![]() |
951a39fff6 | ||
![]() |
6d3d10f7a7 | ||
![]() |
46b1959564 | ||
![]() |
e14896d7cd | ||
![]() |
450dfa1a77 | ||
![]() |
3b907ba382 | ||
![]() |
70dd90470f | ||
![]() |
cc8bb70715 | ||
![]() |
c5f1696f11 | ||
![]() |
b987408fc2 | ||
![]() |
391f0ea99d | ||
![]() |
a94b67a1f6 | ||
![]() |
54ef89aef7 | ||
![]() |
22a3b45f13 | ||
![]() |
44bfb69a36 | ||
![]() |
5c1207649d | ||
![]() |
c967db3dac | ||
![]() |
baec9c4298 | ||
![]() |
3e7abd6273 | ||
![]() |
32df9d5060 | ||
![]() |
05c52f9dc8 | ||
![]() |
7c4dafc002 | ||
![]() |
2afaf43162 | ||
![]() |
b79d92df1d | ||
![]() |
e4e1a1b095 | ||
![]() |
048dfb1f4b | ||
![]() |
1750481744 | ||
![]() |
3ead5e7312 | ||
![]() |
81957adb52 | ||
![]() |
dbad074565 | ||
![]() |
8342135b36 | ||
![]() |
e74915d94c | ||
![]() |
df62f710e3 | ||
![]() |
118241ba6d | ||
![]() |
de47e8516a | ||
![]() |
d438e4b19d | ||
![]() |
0003041dab | ||
![]() |
ec3d9094bf | ||
![]() |
629509a588 | ||
![]() |
5e17563d11 | ||
![]() |
5acafe7948 | ||
![]() |
c25f80c175 | ||
![]() |
ab88b33c74 | ||
![]() |
7442e43be3 | ||
![]() |
a0518412b0 | ||
![]() |
9ccb56aeae | ||
![]() |
3601ddb14d | ||
![]() |
728a5cb3ea | ||
![]() |
46909e4866 | ||
![]() |
072b6ad21e | ||
![]() |
bf320abf1a | ||
![]() |
2d7c3c1323 | ||
![]() |
c185e914e7 | ||
![]() |
7c0ab0e519 | ||
![]() |
f50b08e9c4 | ||
![]() |
ff7d82f4d0 | ||
![]() |
944d968679 | ||
![]() |
9f1f152680 | ||
![]() |
18321e4fee | ||
![]() |
b79bda9001 | ||
![]() |
80bc210ff1 | ||
![]() |
dbf7681518 | ||
![]() |
f18616e3ff | ||
![]() |
911ea02f3f | ||
![]() |
22f86cf0f2 | ||
![]() |
0285246337 | ||
![]() |
4fdce9a915 | ||
![]() |
15aab7c0c5 | ||
![]() |
3ec64d2623 | ||
![]() |
5582205204 | ||
![]() |
c272e5da86 | ||
![]() |
52702d4eb3 | ||
![]() |
a8e77b5e15 | ||
![]() |
606de75577 | ||
![]() |
76fd4bad55 | ||
![]() |
2860bc7b8c | ||
![]() |
ebe8a56a6f | ||
![]() |
e086de2852 | ||
![]() |
c2d16e13c2 | ||
![]() |
e43318f57a | ||
![]() |
7849707486 | ||
![]() |
b1632459ef | ||
![]() |
a2bd496e8e | ||
![]() |
9ed61c4f50 | ||
![]() |
c8b71835de | ||
![]() |
0afa159c98 | ||
![]() |
3ba748b101 | ||
![]() |
818d3e0ddc | ||
![]() |
ad568af5f3 | ||
![]() |
b2a6fd6b6b | ||
![]() |
960cf74de1 | ||
![]() |
fce97dec61 | ||
![]() |
3ffd34d0fd |
25
.github/ISSUE_TEMPLATE.md
vendored
Normal file
25
.github/ISSUE_TEMPLATE.md
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
## 版本信息 | Version Information
|
||||
**EasySpider版本 | EasySpider Version**:
|
||||
**系统版本(架构) | System Version (Architecture)**:
|
||||
**浏览器版本 | Browser Version**:
|
||||
**安装方式 | Installation method**:
|
||||
|
||||
## 问题描述 | Issue Description
|
||||
|
||||
|
||||
## 如何复现 | Steps to Reproduce
|
||||
|
||||
## 示例任务文件 | Example Task File
|
||||
|
||||
Windows和Linux版本的软件设计的任务文件在软件目录下的`tasks`文件夹中,文件名为任务列表中`任务的ID号.json`;MacOS系统的任务文件目录请运行下面的命令打开tasks文件夹:
|
||||
|
||||
Task files created with the Windows and Linux versions of the software are stored in the `tasks` folder in the software directory, and each file is named `the ID number of the task.json`, using the task ID shown in the task list; on macOS, run the following commands to open the tasks folder:
|
||||
|
||||
```bash
|
||||
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
|
||||
open .
|
||||
```
|
||||
|
||||
请将任务文件直接以文件的方式粘贴到这里,不要截图和打开复制里面的内容。
|
||||
|
||||
Please attach the task file here directly as a file; do not post a screenshot of it, and do not open it and copy-paste its contents.
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -14,3 +14,5 @@ old_code/
|
||||
*.tar.xz
|
||||
*.zip
|
||||
Data/
|
||||
**/__pycache__/
|
||||
**/.venv/
|
4
.temp_to_pub/.gitignore
vendored
4
.temp_to_pub/.gitignore
vendored
@ -1,10 +1,10 @@
|
||||
EasySpider_MacOS/easyspider_executestage
|
||||
EasySpider_MacOS/easyspider_executestage_full
|
||||
EasySpider_Linux64_x64/user_data
|
||||
EasySpider_windows_x32/user_data
|
||||
EasySpider_Windows_x32/user_data
|
||||
EasySpider
|
||||
EasySpider.app/
|
||||
EasySpider_windows_x64/user_data
|
||||
EasySpider_Windows_x64/user_data
|
||||
*.tmp
|
||||
*.tar.gz
|
||||
*.7z*
|
||||
|
@ -5,9 +5,11 @@ import copy
|
||||
import platform
|
||||
import shutil
|
||||
import string
|
||||
import threading
|
||||
# import undetected_chromedriver as uc
|
||||
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||
from constants import WriteMode, DataWriteMode, GraphOption
|
||||
from myChrome import MyChrome
|
||||
from threading import Thread, Event
|
||||
from PIL import Image
|
||||
@ -30,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from pynput.keyboard import Key, Listener
|
||||
from datetime import datetime
|
||||
import io # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
@ -75,10 +76,7 @@ class BrowserThread(Thread):
|
||||
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
|
||||
Thread.__init__(self)
|
||||
self.logs = io.StringIO()
|
||||
try:
|
||||
self.log = bool(service["recordLog"])
|
||||
except:
|
||||
self.log = True
|
||||
self.log = bool(service.get("recordLog", True))
|
||||
self.browser = browser_t
|
||||
self.option = option
|
||||
self.config = config
|
||||
@ -86,22 +84,13 @@ class BrowserThread(Thread):
|
||||
self.totalSteps = 0
|
||||
self.id = id
|
||||
self.event = event
|
||||
try:
|
||||
self.saveName = service["saveName"] # 保存文件的名字
|
||||
except:
|
||||
now = datetime.now()
|
||||
# 将时间格式化为精确到秒的字符串
|
||||
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
|
||||
now = datetime.now()
|
||||
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
|
||||
self.OUTPUT = ""
|
||||
self.SAVED = False
|
||||
self.BREAK = False
|
||||
self.CONTINUE = False
|
||||
try:
|
||||
maximizeWindow = service["maximizeWindow"]
|
||||
except:
|
||||
maximizeWindow = 0
|
||||
if maximizeWindow == 1:
|
||||
self.browser.maximize_window()
|
||||
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
|
||||
# 名称设定
|
||||
if saveName != "": # 命令行覆盖保存名称
|
||||
self.saveName = saveName # 保存文件的名字
|
||||
@ -112,19 +101,23 @@ class BrowserThread(Thread):
|
||||
self.print_and_log("Save Name for task ID", id, "is:", self.saveName)
|
||||
if not os.path.exists("Data/Task_" + str(id)):
|
||||
os.mkdir("Data/Task_" + str(id))
|
||||
if not os.path.exists("Data/Task_" + str(id) + "/" + self.saveName):
|
||||
os.mkdir("Data/Task_" + str(id) + "/" +
|
||||
self.saveName) # 创建保存文件夹用来保存截图
|
||||
self.downloadFolder = "Data/Task_" + str(id) + "/" + self.saveName
|
||||
if not os.path.exists(self.downloadFolder):
|
||||
os.mkdir(self.downloadFolder) # 创建保存文件夹用来保存截图和文件
|
||||
if not os.path.exists(self.downloadFolder + "/files"):
|
||||
os.mkdir(self.downloadFolder + "/files")
|
||||
if not os.path.exists(self.downloadFolder + "/images"):
|
||||
os.mkdir(self.downloadFolder + "/images")
|
||||
self.getDataStep = 0
|
||||
self.startSteps = 0
|
||||
try:
|
||||
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
|
||||
if startFromExit == 1:
|
||||
if service.get("startFromExit", 0) == 1:
|
||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
|
||||
encoding='utf-8-sig') as file_obj:
|
||||
self.startSteps = int(file_obj.read()) # 读取已执行步数
|
||||
except:
|
||||
pass
|
||||
except Exception as e:
|
||||
self.print_and_log(f"读取steps.txt失败,原因:{str(e)}")
|
||||
|
||||
if self.startSteps != 0:
|
||||
self.print_and_log("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
|
||||
self.startSteps, "条。")
|
||||
@ -132,7 +125,7 @@ class BrowserThread(Thread):
|
||||
"will start from the last step, before we already collected", self.startSteps, " items.")
|
||||
else:
|
||||
self.print_and_log("此模式下,任务ID", self.id,
|
||||
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
self.print_and_log("In this mode, task ID", self.id,
|
||||
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||
stealth_path = driver_path[:driver_path.find(
|
||||
@ -140,78 +133,83 @@ class BrowserThread(Thread):
|
||||
with open(stealth_path, 'r') as f:
|
||||
js = f.read()
|
||||
self.print_and_log("Loading stealth.min.js")
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"""
|
||||
})
|
||||
WebDriverWait(self.browser, 10)
|
||||
self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
|
||||
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id))
|
||||
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id), self.saveName, "files")
|
||||
self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
|
||||
|
||||
self.browser.execute("send_command", self.paramss) # 下载地址改变
|
||||
self.browser.execute("send_command", self.paramss) # 下载目录改变
|
||||
self.monitor_event = threading.Event()
|
||||
self.monitor_thread = threading.Thread(target=rename_downloaded_file, args=(path, self.monitor_event)) #path后面的逗号不能省略,是元组固定写法
|
||||
self.monitor_thread.start()
|
||||
# self.browser.get('about:blank')
|
||||
self.procedure = service["graph"] # 程序执行流程
|
||||
try:
|
||||
self.maxViewLength = service["maxViewLength"] # 最大显示长度
|
||||
except:
|
||||
self.maxViewLength = 15
|
||||
try:
|
||||
self.outputFormat = service["outputFormat"] # 输出格式
|
||||
except:
|
||||
self.outputFormat = "csv"
|
||||
try:
|
||||
self.task_version = service["version"] # 任务版本
|
||||
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
if service["version"] != version:
|
||||
self.print_and_log("版本不一致,请使用" +
|
||||
service["version"] + "版本的EasySpider运行该任务!")
|
||||
self.print_and_log("Version not match, please use EasySpider " +
|
||||
service["version"] + " to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
except: # 0.2.0版本没有version字段,所以直接退出
|
||||
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
|
||||
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
|
||||
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
|
||||
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
|
||||
self.task_version = service.get("version", "") # 任务版本
|
||||
|
||||
if not self.task_version:
|
||||
self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
|
||||
self.print_and_log(
|
||||
"Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
try:
|
||||
self.save_threshold = service["saveThreshold"] # 保存最低阈值
|
||||
except:
|
||||
self.save_threshold = 10
|
||||
try:
|
||||
self.links = list(
|
||||
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
|
||||
except:
|
||||
|
||||
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务!")
|
||||
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
|
||||
service_links = service.get("links")
|
||||
if service_links:
|
||||
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
|
||||
else:
|
||||
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
|
||||
|
||||
self.OUTPUT = [] # 采集的数据
|
||||
try:
|
||||
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖
|
||||
except:
|
||||
self.dataWriteMode = 1
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
|
||||
if self.dataWriteMode == 2 and os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
|
||||
self.writeMode = 1 # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
|
||||
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
if self.dataWriteMode == DataWriteMode.Cover.value:
|
||||
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
|
||||
elif self.dataWriteMode == DataWriteMode.Rename.value:
|
||||
i = 2
|
||||
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
|
||||
i = i + 1
|
||||
self.saveName = self.saveName + '_' + str(i)
|
||||
self.print_and_log("文件已存在,已重命名为", self.saveName)
|
||||
self.writeMode = WriteMode.Create.value # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat in ['csv', 'txt', 'xlsx']:
|
||||
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
|
||||
self.OUTPUT.append([]) # 添加表头
|
||||
self.writeMode = 0
|
||||
self.writeMode = WriteMode.Create.value
|
||||
elif self.outputFormat == "json":
|
||||
self.writeMode = 3 # JSON模式无需判断是否存在文件
|
||||
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
|
||||
elif self.outputFormat == "mysql":
|
||||
self.mysql = myMySQL(config["mysql_config_path"])
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
|
||||
self.writeMode = 2
|
||||
if self.writeMode == 0:
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"],
|
||||
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
|
||||
self.writeMode = WriteMode.MySQL.value # MySQL模式
|
||||
|
||||
if self.writeMode == WriteMode.Create.value:
|
||||
self.print_and_log("新建模式|Create Mode")
|
||||
elif self.writeMode == 1:
|
||||
elif self.writeMode == WriteMode.Append.value:
|
||||
self.print_and_log("追加模式|Append Mode")
|
||||
elif self.writeMode == 2:
|
||||
elif self.writeMode == WriteMode.MySQL.value:
|
||||
self.print_and_log("MySQL模式|MySQL Mode")
|
||||
elif self.writeMode == 3:
|
||||
elif self.writeMode == WriteMode.Json.value:
|
||||
self.print_and_log("JSON模式|JSON Mode")
|
||||
|
||||
self.containJudge = service["containJudge"] # 是否含有判断语句
|
||||
self.outputParameters = {}
|
||||
self.service = service
|
||||
@ -224,191 +222,140 @@ class BrowserThread(Thread):
|
||||
if param["name"] not in self.outputParameters.keys():
|
||||
self.outputParameters[param["name"]] = ""
|
||||
self.dataNotFoundKeys[param["name"]] = False
|
||||
try:
|
||||
self.outputParametersTypes.append(param["type"])
|
||||
except:
|
||||
self.outputParametersTypes.append("text")
|
||||
try:
|
||||
self.outputParametersRecord.append(
|
||||
bool(param["recordASField"]))
|
||||
except:
|
||||
self.outputParametersRecord.append(True)
|
||||
self.outputParametersTypes.append(param.get("type", "text"))
|
||||
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
|
||||
# 文件叠加的时候不添加表头
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if self.writeMode == 0:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
self.urlId = 0 # 全局记录变量
|
||||
self.preprocess() # 预处理,优化提取数据流程
|
||||
try:
|
||||
self.inputExcel = service["inputExcel"] # 输入Excel
|
||||
except:
|
||||
self.inputExcel = ""
|
||||
self.inputExcel = service.get("inputExcel", "") # 输入Excel
|
||||
self.readFromExcel() # 读取Excel获得参数值
|
||||
|
||||
# 检测如果没有复杂的操作,优化提取数据流程
|
||||
def preprocess(self):
|
||||
for node in self.procedure:
|
||||
try:
|
||||
iframe = node["parameters"]["iframe"]
|
||||
except:
|
||||
node["parameters"]["iframe"] = False
|
||||
for index_node, node in enumerate(self.procedure):
|
||||
parameters: dict = node["parameters"]
|
||||
iframe = parameters.get('iframe')
|
||||
option = node["option"]
|
||||
|
||||
try:
|
||||
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
|
||||
node["parameters"]["xpath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["waitElementIframeIndex"] = int(
|
||||
node["parameters"]["waitElementIframeIndex"])
|
||||
except:
|
||||
node["parameters"]["waitElement"] = ""
|
||||
node["parameters"]["waitElementTime"] = 10
|
||||
node["parameters"]["waitElementIframeIndex"] = 0
|
||||
if node["option"] == 1: # 打开网页操作
|
||||
try:
|
||||
cookies = node["parameters"]["cookies"]
|
||||
except:
|
||||
node["parameters"]["cookies"] = ""
|
||||
elif node["option"] == 2: # 点击操作
|
||||
try:
|
||||
alertHandleType = node["parameters"]["alertHandleType"]
|
||||
except:
|
||||
node["parameters"]["alertHandleType"] = 0
|
||||
if node["parameters"]["useLoop"]:
|
||||
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
|
||||
if parameters.get("xpath"):
|
||||
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
|
||||
|
||||
if parameters.get("waitElementIframeIndex"):
|
||||
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
|
||||
else:
|
||||
parameters["waitElement"] = ""
|
||||
parameters["waitElementTime"] = 10
|
||||
parameters["waitElementIframeIndex"] = 0
|
||||
|
||||
if option == GraphOption.Get.value: # 打开网页操作
|
||||
parameters["cookies"] = parameters.get("cookies", "")
|
||||
elif option == GraphOption.Click.value: # 点击操作
|
||||
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
|
||||
if parameters.get("useLoop"):
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 3: # 提取数据操作
|
||||
node["parameters"]["recordASField"] = 0
|
||||
try:
|
||||
params = node["parameters"]["params"]
|
||||
except:
|
||||
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
params = node["parameters"]["params"]
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Extract.value: # 提取数据操作
|
||||
parameters["recordASField"] = 0
|
||||
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
|
||||
parameters["clear"] = parameters.get("clear", 0)
|
||||
parameters["newLine"] = parameters.get("newLine", 1)
|
||||
|
||||
params = parameters["params"]
|
||||
for param in params:
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
try:
|
||||
param["iframe"] = param.get("iframe", False)
|
||||
|
||||
if param.get("relativeXPath"):
|
||||
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["recordASField"] = param["recordASField"]
|
||||
except:
|
||||
node["parameters"]["recordASField"] = 1
|
||||
try:
|
||||
splitLine = int(param["splitLine"])
|
||||
except:
|
||||
param["splitLine"] = 0
|
||||
if param["contentType"] == 8:
|
||||
self.print_and_log(
|
||||
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log(
|
||||
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
|
||||
|
||||
parameters["recordASField"] = param.get("recordASField", 1)
|
||||
|
||||
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
|
||||
|
||||
if param.get("contentType") == 8:
|
||||
self.print_and_log("默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType =="
|
||||
"8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片"
|
||||
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
|
||||
"的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
|
||||
"modify the source code get_content function -> contentType == 8 position "
|
||||
"to your own OCR model and then compile and run it; or you can first set "
|
||||
"the content type of the crawler to \"Element Screenshot\" to save the "
|
||||
"picture, and then call your own program with custom operations. The "
|
||||
"function of the program is to read the latest generated picture, then use "
|
||||
"a good model, such as PaddleOCR to recognize the picture, and then return "
|
||||
"the return value as a parameter output to the program.")
|
||||
param["optimizable"] = detect_optimizable(param)
|
||||
elif node["option"] == 4: # 输入文字
|
||||
try:
|
||||
index = node["parameters"]["index"] # 索引值
|
||||
except:
|
||||
node["parameters"]["index"] = 0
|
||||
elif node["option"] == 5: # 自定义操作
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
elif node["option"] == 7: # 移动到元素
|
||||
if node["parameters"]["useLoop"]:
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 8: # 循环操作
|
||||
try:
|
||||
exitElement = node["parameters"]["exitElement"]
|
||||
if exitElement == "":
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
except:
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
|
||||
try:
|
||||
skipCount = node["parameters"]["skipCount"]
|
||||
except:
|
||||
node["parameters"]["skipCount"] = 0
|
||||
elif option == GraphOption.Input.value: # 输入文字
|
||||
parameters['index'] = parameters.get('index', 0)
|
||||
elif option == GraphOption.Custom.value: # 自定义操作
|
||||
parameters['clear'] = parameters.get('clear', 0)
|
||||
parameters['newLine'] = parameters.get('newLine', 1)
|
||||
elif option == GraphOption.Move.value: # 移动到元素
|
||||
if parameters.get('useLoop'):
|
||||
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Loop.value: # 循环操作
|
||||
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
|
||||
parameters["quickExtractable"] = False # 是否可以快速提取
|
||||
parameters['skipCount'] = parameters.get('skipCount', 0)
|
||||
|
||||
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
try:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
|
||||
except:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
try:
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
|
||||
except:
|
||||
waitElement = ""
|
||||
if node["parameters"]["iframe"]:
|
||||
node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
|
||||
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
|
||||
if not params:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
|
||||
|
||||
if parameters["iframe"]:
|
||||
parameters["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
else:
|
||||
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
|
||||
if node["parameters"]["skipCount"] > 0:
|
||||
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
parameters["quickExtractable"] = True # 先假设可以快速提取
|
||||
|
||||
if parameters["skipCount"] > 0:
|
||||
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
|
||||
for param in params:
|
||||
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
param['iframe'] = param.get('iframe', False)
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
optimizable = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
parameters["quickExtractable"] = False
|
||||
break
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
|
||||
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
|
||||
try:
|
||||
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
|
||||
if parameters["quickExtractable"]:
|
||||
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
|
||||
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
|
||||
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
|
||||
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
|
||||
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
|
||||
node["parameters"]["quickParams"] = []
|
||||
for param in params:
|
||||
content_type = ""
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
|
||||
"::text()") >= 0:
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
|
||||
or param["relativeXPath"].find("::text()") >= 0:
|
||||
content_type = ""
|
||||
elif param["nodeType"] == 2:
|
||||
content_type = "//@href"
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
content_type = "//@src"
|
||||
elif param["contentType"] == 1:
|
||||
content_type = "/text()"
|
||||
elif param["contentType"] == 0:
|
||||
content_type = "//text()"
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
xpath = "." + param["relativeXPath"] + content_type
|
||||
else:
|
||||
xpath = param["relativeXPath"] + content_type
|
||||
@ -422,6 +369,7 @@ class BrowserThread(Thread):
|
||||
"nodeType": param["nodeType"],
|
||||
"default": param["default"],
|
||||
})
|
||||
self.procedure[index_node]["parameters"] = parameters
|
||||
self.print_and_log("预处理完成|Preprocess completed")
|
||||
|
||||
def readFromExcel(self):
|
||||
@ -521,7 +469,7 @@ class BrowserThread(Thread):
|
||||
"/", len(self.links))
|
||||
self.executeNode(0)
|
||||
self.urlId = self.urlId + 1
|
||||
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
||||
# files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
||||
# 如果目录为空,则删除该目录
|
||||
# if not files:
|
||||
# os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
|
||||
@ -538,12 +486,16 @@ class BrowserThread(Thread):
|
||||
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
|
||||
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
|
||||
time.sleep(quitWaitTime)
|
||||
self.browser.quit()
|
||||
try:
|
||||
self.browser.quit()
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
|
||||
try:
|
||||
shutil.rmtree(self.option["tmp_user_data_folder"])
|
||||
except:
|
||||
pass
|
||||
self.monitor_event.set()
|
||||
self.print_and_log("清理完成!|Clean up completed!")
|
||||
self.print_and_log("您现在可以安全的关闭此窗口了。|You can safely close this window now.")
|
||||
|
||||
@ -753,28 +705,32 @@ class BrowserThread(Thread):
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 2:
|
||||
self.recordLog("Execute JavaScript for element:" + code)
|
||||
self.recordLog("对元素执行JavaScript:" + code)
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code, element)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 5:
|
||||
try:
|
||||
code = readCode(code)
|
||||
# global_namespace = globals().copy()
|
||||
# global_namespace["self"] = self
|
||||
output = exec(code)
|
||||
self.recordLog("执行下面的代码:" + code)
|
||||
self.recordLog("Execute the following code:" + code)
|
||||
except Exception as e:
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" +
|
||||
code, ", error is:", e)
|
||||
code, ", error is:", str(e))
|
||||
elif int(codeMode) == 6:
|
||||
try:
|
||||
code = readCode(code)
|
||||
@ -847,6 +803,23 @@ class BrowserThread(Thread):
|
||||
self.print_and_log("根据设置的自定义操作,任务已刷新页面|Task refreshed page according to custom operation")
|
||||
elif codeMode == 9: # 发送邮件
|
||||
send_email(node["parameters"]["emailConfig"])
|
||||
elif codeMode == 10: # 清空所有字段值
|
||||
self.clearOutputParameters()
|
||||
elif codeMode == 11: # 生成新的数据行
|
||||
line = new_line(self.outputParameters,
|
||||
self.maxViewLength, self.outputParametersRecord)
|
||||
self.OUTPUT.append(line)
|
||||
elif codeMode == 12: # 退出程序
|
||||
self.print_and_log("根据设置的自定义操作,任务已退出|Task exited according to custom operation")
|
||||
self.saveData(exit=True)
|
||||
self.browser.quit()
|
||||
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
|
||||
try:
|
||||
shutil.rmtree(self.option["tmp_user_data_folder"])
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("清理完成!|Clean up completed!")
|
||||
os._exit(0)
|
||||
else: # 0 1 5 6
|
||||
output = self.execute_code(
|
||||
codeMode, code, max_wait_time, iframe=params["iframe"])
|
||||
@ -1106,7 +1079,25 @@ class BrowserThread(Thread):
|
||||
self.recordLog(
|
||||
"判断条件内所有条件分支的条件都不满足|None of the conditions in the judgment condition are met")
|
||||
|
||||
def handleHistory(self, node, xpath, thisHistoryURL, thisHistoryLength, index, element=None, elements=None):
|
||||
def handleHistory(self, node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, element=None, elements=None):
|
||||
try:
|
||||
changed_handle = self.browser.current_window_handle != thisHandle
|
||||
except: # 如果网页被意外关闭了的情况下
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
changed_handle = self.browser.window_handles[-1] != thisHandle
|
||||
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
|
||||
try:
|
||||
while True: # 一直关闭窗口直到当前标签页
|
||||
self.browser.close() # 关闭使用完的标签页
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
if self.browser.current_window_handle == thisHandle:
|
||||
break
|
||||
except Exception as e:
|
||||
self.print_and_log("关闭标签页发生错误:", e)
|
||||
self.print_and_log(
|
||||
"Error occurred while closing tab: ", e)
|
||||
if self.history["index"] != thisHistoryLength and self.history["handle"] == self.browser.current_window_handle: # 如果执行完一次循环之后历史记录发生了变化,注意当前页面的判断
|
||||
difference = thisHistoryLength - self.history["index"] # 计算历史记录变化差值
|
||||
self.browser.execute_script('history.go(' + str(difference) + ')') # 回退历史记录
|
||||
@ -1132,12 +1123,13 @@ class BrowserThread(Thread):
|
||||
if self.browser.current_url == thisHistoryURL or ti > thisHistoryLength: # 如果执行完一次循环之后网址发生了变化
|
||||
break
|
||||
time.sleep(2)
|
||||
if element == None: # 不固定元素列表
|
||||
element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
|
||||
else: # 固定元素列表
|
||||
element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
|
||||
# if index > 0:
|
||||
# index -= 1 # 如果是data:开头的网址,就要重试一次
|
||||
if xpath != "":
|
||||
if element == None: # 不固定元素列表
|
||||
element = self.browser.find_elements(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
|
||||
else: # 固定元素列表
|
||||
element = self.browser.find_element(By.XPATH, xpath, iframe=node["parameters"]["iframe"])
|
||||
# if index > 0:
|
||||
# index -= 1 # 如果是data:开头的网址,就要重试一次
|
||||
else:
|
||||
if element == None:
|
||||
element = elements
|
||||
@ -1156,6 +1148,14 @@ class BrowserThread(Thread):
|
||||
self.history["handle"] = thisHandle
|
||||
thisHistoryURL = self.browser.current_url
|
||||
# 快速提取处理
|
||||
# start = time.time()
|
||||
try:
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
except Exception as e:
|
||||
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
# end = time.time()
|
||||
# print("解析页面秒数:", end - start)
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.browser.switch_to.default_content() # 切换到主页面
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
@ -1321,25 +1321,7 @@ class BrowserThread(Thread):
|
||||
if self.BREAK:
|
||||
self.BREAK = False
|
||||
break
|
||||
try:
|
||||
changed_handle = self.browser.current_window_handle != thisHandle
|
||||
except: # 如果网页被意外关闭了的情况下
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
changed_handle = self.browser.window_handles[-1] != thisHandle
|
||||
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
|
||||
try:
|
||||
while True: # 一直关闭窗口直到当前标签页
|
||||
self.browser.close() # 关闭使用完的标签页
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
if self.browser.current_window_handle == thisHandle:
|
||||
break
|
||||
except Exception as e:
|
||||
self.print_and_log("关闭标签页发生错误:", e)
|
||||
self.print_and_log(
|
||||
"Error occurred while closing tab: ", e)
|
||||
index, elements = self.handleHistory(node, xpath, thisHistoryURL, thisHistoryLength, index, elements=elements)
|
||||
index, elements = self.handleHistory(node, xpath, thisHandle, thisHistoryURL, thisHistoryLength, index, elements=elements)
|
||||
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
|
||||
output = self.execute_code(int(
|
||||
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"],
|
||||
@ -1381,25 +1363,7 @@ class BrowserThread(Thread):
|
||||
if self.BREAK:
|
||||
self.BREAK = False
|
||||
break
|
||||
try:
|
||||
changed_handle = self.browser.current_window_handle != thisHandle
|
||||
except: # 如果网页被意外关闭了的情况下
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
changed_handle = self.browser.window_handles[-1] != thisHandle
|
||||
if changed_handle: # 如果执行完一次循环之后标签页的位置发生了变化
|
||||
try:
|
||||
while True: # 一直关闭窗口直到当前标签页
|
||||
self.browser.close() # 关闭使用完的标签页
|
||||
self.browser.switch_to.window(
|
||||
self.browser.window_handles[-1])
|
||||
if self.browser.current_window_handle == thisHandle:
|
||||
break
|
||||
except Exception as e:
|
||||
self.print_and_log("关闭标签页发生错误:", e)
|
||||
self.print_and_log(
|
||||
"Error occurred while closing tab: ", e)
|
||||
index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
|
||||
index, element = self.handleHistory(node, path, thisHandle, thisHistoryURL, thisHistoryLength, index, element=element)
|
||||
except NoSuchElementException:
|
||||
self.print_and_log("Loop element not found: ", path)
|
||||
self.print_and_log("找不到循环元素:", path)
|
||||
@ -1447,6 +1411,7 @@ class BrowserThread(Thread):
|
||||
code = get_output_code(output)
|
||||
if code <= 0:
|
||||
break
|
||||
index, _ = self.handleHistory(node, "", thisHandle, thisHistoryURL, thisHistoryLength, index)
|
||||
elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
|
||||
# tempList = node["parameters"]["textList"].split("\r\n")
|
||||
urlList = list(
|
||||
@ -1696,8 +1661,11 @@ class BrowserThread(Thread):
|
||||
try:
|
||||
actions = ActionChains(self.browser) # 实例化一个action对象
|
||||
if newTab == 1: # 在新标签页打开
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
if sys.platform == "darwin": # Mac
|
||||
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
|
||||
else:
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
else:
|
||||
actions.click(element).perform()
|
||||
except Exception as e:
|
||||
@ -1715,6 +1683,21 @@ class BrowserThread(Thread):
|
||||
script = 'var result = document.evaluate(`' + path + \
|
||||
'`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
|
||||
self.browser.execute_script(script, str(index)) # 用js的点击方法
|
||||
elif click_way == 2: # 双击
|
||||
try:
|
||||
actions = ActionChains(self.browser) # 实例化一个action对象
|
||||
actions.double_click(element).perform()
|
||||
except Exception as e:
|
||||
self.browser.execute_script("arguments[0].scrollIntoView();", element)
|
||||
try:
|
||||
actions = ActionChains(self.browser) # 实例化一个action对象
|
||||
actions.double_click(element).perform()
|
||||
except Exception as e:
|
||||
self.print_and_log(f"Selenium双击元素{path}失败,将尝试使用JavaScript双击")
|
||||
self.print_and_log(f"Failed to double click element {path} with Selenium, will try to double click with JavaScript")
|
||||
script = 'var result = document.evaluate(`' + path + \
|
||||
'`, document, null, XPathResult.ANY_TYPE, null);for(let i=0;i<arguments[0];i++){result.iterateNext();} result.iterateNext().click();'
|
||||
self.browser.execute_script(script, str(index)) # 用js的点击方法
|
||||
self.recordLog("点击元素|Click element: " + path)
|
||||
except TimeoutException:
|
||||
self.print_and_log(
|
||||
@ -1797,7 +1780,6 @@ class BrowserThread(Thread):
|
||||
self.print_and_log("History Length Error")
|
||||
self.history["index"] = 0
|
||||
self.scrollDown(param) # 根据参数配置向下滚动
|
||||
# rt.end()
|
||||
|
||||
def get_content(self, p, element):
|
||||
content = ""
|
||||
@ -1824,7 +1806,7 @@ class BrowserThread(Thread):
|
||||
downloadPic = 0
|
||||
if downloadPic == 1:
|
||||
download_image(self, content, "Data/Task_" +
|
||||
str(self.id) + "/" + self.saveName + "/", element)
|
||||
str(self.id) + "/" + self.saveName + "/images", element)
|
||||
else: # 普通节点
|
||||
if p["splitLine"] == 1:
|
||||
text = extract_text_from_html(element.get_attribute('outerHTML'))
|
||||
@ -1853,7 +1835,7 @@ class BrowserThread(Thread):
|
||||
downloadPic = 0
|
||||
if downloadPic == 1:
|
||||
download_image(self, content, "Data/Task_" +
|
||||
str(self.id) + "/" + self.saveName + "/", element)
|
||||
str(self.id) + "/" + self.saveName + "/images", element)
|
||||
else:
|
||||
command = 'var arr = [];\
|
||||
var content = arguments[0];\
|
||||
@ -1965,6 +1947,8 @@ class BrowserThread(Thread):
|
||||
content = element.get_attribute(attribute_name)
|
||||
except:
|
||||
content = ""
|
||||
elif p["contentType"] == 15: # 常量值
|
||||
content = p["JS"]
|
||||
if content == None:
|
||||
content = ""
|
||||
return content
|
||||
@ -2208,7 +2192,9 @@ if __name__ == '__main__':
|
||||
"server_address": "http://localhost:8074",
|
||||
"keyboard": True, # 是否监听键盘输入
|
||||
"pause_key": "p", # 暂停键
|
||||
"version": "0.6.0",
|
||||
"version": "0.6.3",
|
||||
"docker_driver": "",
|
||||
"user_folder": "",
|
||||
}
|
||||
c = Config(config)
|
||||
print(c)
|
||||
@ -2283,7 +2269,9 @@ if __name__ == '__main__':
|
||||
|
||||
options.add_argument(
|
||||
"--disable-blink-features=AutomationControlled") # TMALL 反扒
|
||||
|
||||
# 阻止http -> https的重定向
|
||||
options.add_argument("--disable-features=CrossSiteDocumentBlockingIfIsolating,CrossSiteDocumentBlockingAlways,IsolateOrigins,site-per-process")
|
||||
options.add_argument("--disable-web-security") # 禁用同源策略
|
||||
options.add_argument('-ignore-certificate-errors')
|
||||
options.add_argument('-ignore -ssl-errors')
|
||||
|
||||
@ -2302,35 +2290,43 @@ if __name__ == '__main__':
|
||||
os.mkdir(tmp_user_folder_parent)
|
||||
characters = string.ascii_letters + string.digits
|
||||
for i in range(len(c.ids)):
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options = tmp_options[i]["options"]
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
options.add_argument("--profile-directory=Default")
|
||||
if c.user_folder == "":
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
print(f"Use local user data folder: {tmp_user_data_folder}")
|
||||
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
|
||||
else:
|
||||
options.add_argument(
|
||||
f'--user-data-dir={c.user_folder}')
|
||||
print(f"Use specifed user data folder: {c.user_folder}", ", please note if you are using docker, this user folder path should be the path inside the docker container.")
|
||||
print(f"使用指定的用户信息目录: {c.user_folder}", ",请注意如果您正在使用docker,此用户文件夹路径应是容器内的路径。")
|
||||
print(
|
||||
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally,说明有之前运行的Chrome实例没有正常关闭,请关闭之前打开的所有Chrome实例后再运行程序即可。")
|
||||
print(
|
||||
@ -2343,9 +2339,13 @@ if __name__ == '__main__':
|
||||
print("id: ", id)
|
||||
if c.read_type == "remote":
|
||||
print("remote")
|
||||
content = requests.get(
|
||||
try:
|
||||
content = requests.get(
|
||||
c.server_address + "/queryExecutionInstance?id=" + str(id))
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
except:
|
||||
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
|
||||
print("无法连接到服务器,请确保EasySpider主程序正在运行,或者您可以将--read_type参数更改为'local',以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
|
||||
else:
|
||||
print("local")
|
||||
local_folder = os.path.join(os.getcwd(), "execution_instances")
|
||||
@ -2370,8 +2370,8 @@ if __name__ == '__main__':
|
||||
cloudflare = 0
|
||||
if cloudflare == 0:
|
||||
options.add_argument('log-level=3') # 隐藏日志
|
||||
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
|
||||
print("Data path:", path)
|
||||
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id), "files")
|
||||
print("文件下载路径|File Download path:", path)
|
||||
options.add_experimental_option("prefs", {
|
||||
# 设置文件下载路径
|
||||
"download.default_directory": path,
|
||||
@ -2396,8 +2396,17 @@ if __name__ == '__main__':
|
||||
except:
|
||||
browser = "chrome"
|
||||
if browser == "chrome":
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options)
|
||||
if c.docker_driver == "":
|
||||
print("Using local driver")
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
|
||||
else:
|
||||
print("Using remote driver")
|
||||
# Use docker driver, default address is http://localhost:4444/wd/hub
|
||||
# Headless mode
|
||||
# options.add_argument("--headless")
|
||||
# print("Headless mode")
|
||||
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
|
||||
elif browser == "edge":
|
||||
from selenium.webdriver.edge.service import Service as EdgeService
|
||||
from selenium.webdriver.edge.options import Options as EdgeOptions
|
||||
@ -2458,6 +2467,7 @@ if __name__ == '__main__':
|
||||
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
|
||||
# 使用监听器监听键盘输入
|
||||
try:
|
||||
from pynput.keyboard import Key, Listener
|
||||
if c.keyboard:
|
||||
with Listener(on_press=on_press_creator(press_time, event),
|
||||
on_release=on_release_creator(event, press_time)) as listener:
|
||||
|
@ -1 +1,50 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 使用 lsb_release 获取系统信息
|
||||
os_name=$(lsb_release -si)
|
||||
os_version=$(lsb_release -sr)
|
||||
|
||||
# 提取主版本号副版本号
|
||||
major_version=$(echo $os_version | cut -d'.' -f1)
|
||||
minor_version=$(echo $os_version | cut -d'.' -f2)
|
||||
|
||||
# 检查是否为Ubuntu且版本大于等于24.04
|
||||
if [ "$os_name" == "Ubuntu" ] && [ "$major_version" -gt 24 ] || { [ "$major_version" -eq 24 ]; }; then
|
||||
# 要检查的文件路径
|
||||
file_path="./EasySpider/chrome-sandbox"
|
||||
|
||||
# 检查文件是否存在
|
||||
if [ ! -e "$file_path" ]; then
|
||||
echo "File Not Exist!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 获取文件的拥有者
|
||||
owner=$(stat -c %U "$file_path")
|
||||
|
||||
# 获取文件的权限
|
||||
permissions=$(stat -c %a "$file_path")
|
||||
|
||||
# 检查拥有者是否为root且权限是否为4755
|
||||
if [ "$owner" != "root" ] || [ "$permissions" != "4755" ]; then
|
||||
echo "这是你第一次在该Ubuntu系统上使用EasySpider,请在下方输入密码来调整文件权限以使用EasySpider:"
|
||||
echo "This is the first time you use EasySpider in this Ubuntu system, please change your permission of the software by input your password below (should have root/sudo permission):"
|
||||
sudo chown root:root "$file_path"
|
||||
sudo chmod 4755 "$file_path"
|
||||
sudo chown root:root "./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
|
||||
sudo chmod 4755 "./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
|
||||
fi
|
||||
else
|
||||
echo "如果报错“The SUID sandbox helper binary was found, but is not configured correctly”,请尝试执行以下命令后再次运行EasySpider:"
|
||||
echo "If you encounter the error message “The SUID sandbox helper binary was found, but is not configured correctly”, please try run the following commands and run EasySpider again:"
|
||||
echo ""
|
||||
echo "sudo chown root:root ./EasySpider/chrome-sandbox"
|
||||
echo "sudo chmod 4755 ./EasySpider/chrome-sandbox"
|
||||
echo "sudo chown root:root ./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
|
||||
echo "sudo chmod 4755 ./EasySpider/resources/app/chrome_linux64/chrome-sandbox"
|
||||
echo ""
|
||||
echo ""
|
||||
fi
|
||||
|
||||
|
||||
./EasySpider/EasySpider
|
||||
|
@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
|
||||
"""
|
||||
|
||||
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
|
||||
|
||||
print(globals())
|
||||
# 导包 | Import packages
|
||||
from selenium.common.exceptions import ElementClickInterceptedException
|
||||
|
||||
@ -56,3 +56,20 @@ finally:
|
||||
print("All parameters:", self.outputParameters)
|
||||
print(test(3))
|
||||
print("执行完毕|Execution completed")
|
||||
|
||||
import time
|
||||
time.sleep(3)
|
||||
|
||||
def new_line(outputParameters, maxViewLength, record):
|
||||
line = []
|
||||
print("Use this function to print a new line in the console")
|
||||
i = 0
|
||||
for value in outputParameters.values():
|
||||
line.append(value)
|
||||
if record[i]:
|
||||
print(value[:maxViewLength], " ", end="")
|
||||
i += 1
|
||||
print("")
|
||||
return line
|
||||
|
||||
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
@ -1 +1 @@
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]
/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
@ -1 +1 @@
|
||||
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
||||
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
File diff suppressed because one or more lines are too long
@ -1,8 +1,29 @@
|
||||
Due to the complex security settings of MacOS, the issue of being unable to open software due to the "unverified developer" message may occur upon the first attempt to open the software. Please refer to the following GitHub document to see how to open software and perform tasks on your MacOS version:
|
||||
Due to MacOS's complex security settings, software downloaded for the first time will warn that the developer is unverified and will not allow the application to run. Please follow these steps to unlock:
|
||||
|
||||
https://github.com/NaiboWang/EasySpider/wiki/MacOS-Guide
|
||||
1. Open the system Terminal.
|
||||
|
||||
The main steps are as follows:
|
||||
2. Navigate to the EasySpider software directory, such as:
|
||||
|
||||
cd ~/Downloads/EasySpider_MacOS
|
||||
|
||||
3. In the EasySpider directory, run the `first_time_run.sh` script to modify the package properties by using the following command:
|
||||
|
||||
bash first_time_run.sh
|
||||
|
||||
|
||||
|
||||
This will unlock EasySpider for both design and execution stages.
|
||||
|
||||
If you encounter errors such as the one below during the command execution, they can be ignored, and you may proceed to open the software after the command completes:
|
||||
|
||||
xattr: [Errno 13] Permission denied: 'EasySpider.app/Contents/Resources/app/node_modules/node-window-manager/build/node_gyp_bins/python3'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
For another solution, refer to this video on how to open software and execute tasks in MacOS version: https://www.bilibili.com/video/BV1E34y137fT/
|
||||
|
||||
- Design phase - Apple Arm chip version of MacOS
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
@ -1 +1 @@
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]
/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/309.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/309.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":309,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-24 00:34:50","update_time":"2023-12-24 00:36:58","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":2,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"text","exampleValue":"JS(\"return new Date().getYear()\")1","value":"JS(\"return new Date().getYear()\")1"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":1,"
parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"value":"JS(\"return new Date().getYear()\")1","index":0,"allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}},{"id":3,"index":3,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[4],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., 
'手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":4,"index":4,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"ughtq41gxwnlqia7awp","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"ughtq41gxwnlqia7awp","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/310.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/310.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/311.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/311.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":311,"name":"重命名测试","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-28 14:05:20","update_time":"2023-12-28 14:05:43","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afte
rJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/313.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/313.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/314.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/314.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/315.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/315.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":315,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-29 22:34:23","update_time":"2023-12-29 22:38:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"Text","desc":"自定义操作返回的数据","type":"text","recordASField":1,"exampleValue":""},{"id":1,"name":"Link","desc":"自定义操作返回的数据","type":"text","recordASField":1,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[4,5,3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWa
itTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":5,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":""}},{"id":3,"index":4,"parentId":2,"type":0,"option":5,"title":"Text","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":0,"codeMode":2,"code":"return arguments[0].innerText","waitTime":0,"recordASField":1,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}},{"id":4,"index":5,"parentId":2,"type":0,"option":5,"title":"Link","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":2,"code":"return arguments[0].href","waitTime":0,"recordASField":1,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/316.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/316.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":316,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-30 22:35:04","update_time":"2023-12-30 22:35:12","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"自定义操作","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":5,"title":"自定义操作","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":12,"code":"","waitTime":0,"recordASField
":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/317.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/317.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":317,"name":"图片下载","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-01-05 22:14:43","update_time":"2024-01-05 22:15:19","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数2_图片地址","desc":"","type":"text","recordASField":1,"exampleValue":"//m.360buyimg.com/babel/jfs/t1/232616/15/5744/219106/656d810aF16705ea9/41c4997dc1b81f17.png"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"
clear":0,"newLine":1,"params":[{"nodeType":4,"contentType":0,"relative":false,"name":"参数1_图片地址","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-3]/div/div/a/img"],"exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/s1420x740_jfs/t1/194401/20/32669/76553/64142a96F7733e6ad/cf2727848c86cf45.jpg!q70.dpg"}],"unique_index":"i9in42ta6klr0pwp4k","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}},{"id":2,"index":3,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[4],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div[1]/a[1]/img[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/a[1]/img[1]","//img[contains(., 
'')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-4]/div/div/a/img"]}},{"id":3,"index":4,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":4,"contentType":0,"relative":true,"name":"参数2_图片地址","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/jfs/t1/232616/15/5744/219106/656d810aF16705ea9/41c4997dc1b81f17.png"}],"unique_index":"i81avec75qflr0pwym8","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":1,"splitLine":0}]}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/318.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/318.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":318,"name":"京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 05:08:03","update_time":"2024-04-22 05:19:48","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"电脑数码"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://prodev.jd.com/mall/active/31XPWPTonxJ9e5YoQ85HS7z8XNYQ/index.html?babelChannel=ttt40"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[4]/div[1]/div[4]/ul[1]/li/a[1]
","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[4]/div[1]/div[4]/ul[1]/li[1]/a[1]","//a[contains(., '电脑数码')]","//A[@class='navitems-lk']","/html/body/div[last()-5]/div[last()-2]/div/div[last()-1]/ul/li[last()-8]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":15,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"电脑数码"}],"unique_index":"auwkv5g1krqlva0tsc4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"123","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://prodev.jd.com/mall/active/31XPWPTonxJ9e5YoQ85HS7z8XNYQ/index.html?babelChannel=ttt40"}],"unique_index":"auwkv5g1krqlva0tsc4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/319.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/319.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":-2,"name":"百度一下,你就知道","url":"https://www.baidu.com?id=1","links":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","create_time":"2024-04-22 05:45:12","update_time":"2024-04-22 05:45:20","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com?id=1","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com?id=1","links":"https://www.baidu.com?id=11\nhttps://www.baidu.com?id=12","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/320.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/320.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":320,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 05:53:18","update_time":"2024-04-22 05:53:28","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/321.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/321.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":321,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 07:02:02","update_time":"2024-04-22 07:02:16","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/322.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/322.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":322,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 08:13:15","update_time":"2024-04-22 08:13:33","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code
":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/323.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/323.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":323,"name":"新web采集任务","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"","update_time":"2024-08-10 17:29:04","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}
|
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/324.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/324.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/325.json
Normal file
1
.temp_to_pub/EasySpider_MacOS/Sample Tasks/325.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":325,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-12-30 22:37:29","update_time":"2024-12-30 22:37:43","version":"0.6.3","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"0暖心2024 总书记的贴心话"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":fa
lse,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li[1]/a[1]","//a[contains(., '0暖心2024 总')]","//a[@class='title-content c-link c-font-medium c-line-clamp1']","/html/body/div[last()-4]/div[last()-3]/div[last()-3]/div/div/div/ul/li[last()-9]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"0暖心2024 
总书记的贴心话"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
@ -1 +1 @@
|
||||
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
||||
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
File diff suppressed because one or more lines are too long
5
.temp_to_pub/EasySpider_MacOS/first_time_run.sh
Normal file
5
.temp_to_pub/EasySpider_MacOS/first_time_run.sh
Normal file
@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
xattr -cr EasySpider.app
|
||||
xattr -cr easyspider_executestage
|
||||
xattr -cr easyspider_executestage_full
|
@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
|
||||
"""
|
||||
|
||||
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
|
||||
|
||||
print(globals())
|
||||
# 导包 | Import packages
|
||||
from selenium.common.exceptions import ElementClickInterceptedException
|
||||
|
||||
@ -56,3 +56,20 @@ finally:
|
||||
print("All parameters:", self.outputParameters)
|
||||
print(test(3))
|
||||
print("执行完毕|Execution completed")
|
||||
|
||||
import time
|
||||
time.sleep(3)
|
||||
|
||||
def new_line(outputParameters, maxViewLength, record):
|
||||
line = []
|
||||
print("Use this function to print a new line in the console")
|
||||
i = 0
|
||||
for value in outputParameters.values():
|
||||
line.append(value)
|
||||
if record[i]:
|
||||
print(value[:maxViewLength], " ", end="")
|
||||
i += 1
|
||||
print("")
|
||||
return line
|
||||
|
||||
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
|
@ -12,6 +12,11 @@ Official documentation can be found at: https://github.com/NaiboWang/EasySpider/
|
||||
|
||||
Video Tutorial: https://youtube.com/playlist?list=PL0kEFEkWrT7mt9MUlEBV2DTo1QsaanUTp
|
||||
|
||||
You can import tasks from other machines by simply opening the EasySpider software in this directory, right-clicking "Show Package Contents", and then placing the .json files from the tasks folder in the /Users/your user name/Library/Application Support/EasySpider/tasks folder of the other machine. Similarly, execution ID files can be imported by copying the .json files from the execution_instances folder. Please note that the .json files in both folders only support names greater than 0.
|
||||
You can import tasks from other machines by simply opening the EasySpider software in this directory, right-clicking "Show Package Contents", and then placing the .json files from the tasks folder in the /Users/Your User Name/Library/Application Support/EasySpider/tasks folder of the other machine. Similarly, execution ID files can be imported by copying the .json files from the execution_instances folder. Please note that the .json files in both folders must be named with a number greater than 0.
|
||||
|
||||
You can quickly navigate to the tasks folder using the following commands:
|
||||
|
||||
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
|
||||
open .
|
||||
|
||||
If you need to press p on the keyboard to pause and continue the execution of the task, you need to grant the program keyboard monitoring permission.
|
||||
|
@ -1,6 +1,26 @@
|
||||
由于MacOS复杂的安全性设置,初次打开软件会显示未验证开发者从而不允许打开的问题,请参考以下视频来查看MacOS版本如何打开软件和执行任务:https://www.bilibili.com/video/BV1E34y137fT/
|
||||
由于MacOS复杂的安全性设置,初次打开软件会显示未验证开发者从而不允许打开的问题,请通过以下方式来解锁:
|
||||
|
||||
主要步骤如下:
|
||||
1. 打开系统terminal命令行窗口。
|
||||
|
||||
2. 切换到EasySpider软件目录,如:
|
||||
|
||||
cd ~/Downloads/EasySpider_MacOS
|
||||
|
||||
3. 在EasySpider目录下,使用以下命令运行目录下的`first_time_run.sh`脚本修改软件包属性:
|
||||
|
||||
bash first_time_run.sh
|
||||
|
||||
即可一键解锁并正常使用EasySpider,包括设计阶段程序和执行阶段程序。
|
||||
|
||||
|
||||
执行命令时如果出现类似下面的错误可以忽略,执行完成之后即可打开软件:
|
||||
|
||||
xattr: [Errno 13] Permission denied: 'EasySpider.app/Contents/Resources/app/node_modules/node-window-manager/build/node_gyp_bins/python3'
|
||||
|
||||
|
||||
|
||||
|
||||
以下是另一种方案,请参考以下视频来查看MacOS版本如何打开软件和执行任务:https://www.bilibili.com/video/BV1E34y137fT/
|
||||
|
||||
- 设计阶段 - Apple Arm芯片版MacOS
|
||||
|
||||
|
@ -14,4 +14,9 @@
|
||||
|
||||
可以从其他机器导入任务,只需要把其他机器的tasks文件夹里的.json文件放入/Users/你的用户名/Library/Application Support/EasySpider/tasks文件夹里即可。同理执行号文件可以通过复制execution_instances文件夹中的.json文件来导入。注意,两个文件夹里的.json文件只支持命名为大于0的数字。
|
||||
|
||||
可通过以下命令快速进入tasks文件夹:
|
||||
|
||||
cd /Users/$(whoami)/Library/Application\ Support/EasySpider/tasks
|
||||
open .
|
||||
|
||||
如果需要按p键暂停和继续任务的执行,需要赋予程序键盘监控权限。
|
||||
|
@ -23,7 +23,7 @@ For more complex operations, please download the source code and compile it for
|
||||
"""
|
||||
|
||||
# 请在下面编写你的代码,不要有代码缩进!!! | Please write your code below, do not indent the code!!!
|
||||
|
||||
print(globals())
|
||||
# 导包 | Import packages
|
||||
from selenium.common.exceptions import ElementClickInterceptedException
|
||||
|
||||
@ -56,3 +56,20 @@ finally:
|
||||
print("All parameters:", self.outputParameters)
|
||||
print(test(3))
|
||||
print("执行完毕|Execution completed")
|
||||
|
||||
import time
|
||||
time.sleep(3)
|
||||
|
||||
def new_line(outputParameters, maxViewLength, record):
|
||||
line = []
|
||||
print("Use this function to print a new line in the console")
|
||||
i = 0
|
||||
for value in outputParameters.values():
|
||||
line.append(value)
|
||||
if record[i]:
|
||||
print(value[:maxViewLength], " ", end="")
|
||||
i += 1
|
||||
print("")
|
||||
return line
|
||||
|
||||
new_line(self.outputParameters, 10, [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
|
||||
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 22:08:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":3,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button 
download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
@ -1 +1 @@
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"07/12/2023, 03:43:34","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"desc":"https://www.zhihu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","wai
tElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
||||
{"id":229,"name":"知乎 - 有问题,就会有答案","url":"https://www.zhihu.com","links":"https://www.zhihu.com","create_time":"07/12/2023, 03:26:24","update_time":"2023-12-27 20:05:50","version":"0.6.0","saveThreshold":10,"quitWaitTime":6,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"t","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"知了个乎","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zhihu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zhihu.com"},{"id":1,"name":"loopTimes_1","nodeId":4,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":0,"value":0}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,4,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.zhihu.com","links":"https://www.zhihu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":2,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]
/div[1]/div/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/main[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/h2[1]/div[1]","//div[contains(., '死刑执行前可以谎称肚')]","/html/body/div[last()-7]/div/main/div/div/div[last()-1]/div/div/div/div/div/div[last()-12]/div/div/div/div/h2/div"]}},{"id":4,"index":3,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":5,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"死刑执行前可以谎称肚子痛,想排泄粪便,籍此拖延时间吗?"}],"unique_index":"onlvi030w9jlpu5tjzb","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":2,"index":4,"parentId":0,"type":1,"option":8,"title":"循环 - 
单个元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]}
|
@ -1 +1 @@
|
||||
{"id":70,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
||||
{"id":-2,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/24/2023, 8:21:45 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":7,"title":"移动到元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"allXPaths":"","loopType":1}}]}
|
File diff suppressed because one or more lines are too long
@ -9,6 +9,7 @@ import threading
|
||||
# import undetected_chromedriver as uc
|
||||
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||
from constants import WriteMode, DataWriteMode, GraphOption
|
||||
from myChrome import MyChrome
|
||||
from threading import Thread, Event
|
||||
from PIL import Image
|
||||
@ -31,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from pynput.keyboard import Key, Listener
|
||||
from datetime import datetime
|
||||
import io # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
@ -76,10 +76,7 @@ class BrowserThread(Thread):
|
||||
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
|
||||
Thread.__init__(self)
|
||||
self.logs = io.StringIO()
|
||||
try:
|
||||
self.log = bool(service["recordLog"])
|
||||
except:
|
||||
self.log = True
|
||||
self.log = bool(service.get("recordLog", True))
|
||||
self.browser = browser_t
|
||||
self.option = option
|
||||
self.config = config
|
||||
@ -87,22 +84,13 @@ class BrowserThread(Thread):
|
||||
self.totalSteps = 0
|
||||
self.id = id
|
||||
self.event = event
|
||||
try:
|
||||
self.saveName = service["saveName"] # 保存文件的名字
|
||||
except:
|
||||
now = datetime.now()
|
||||
# 将时间格式化为精确到秒的字符串
|
||||
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
|
||||
now = datetime.now()
|
||||
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
|
||||
self.OUTPUT = ""
|
||||
self.SAVED = False
|
||||
self.BREAK = False
|
||||
self.CONTINUE = False
|
||||
try:
|
||||
maximizeWindow = service["maximizeWindow"]
|
||||
except:
|
||||
maximizeWindow = 0
|
||||
if maximizeWindow == 1:
|
||||
self.browser.maximize_window()
|
||||
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
|
||||
# 名称设定
|
||||
if saveName != "": # 命令行覆盖保存名称
|
||||
self.saveName = saveName # 保存文件的名字
|
||||
@ -123,13 +111,13 @@ class BrowserThread(Thread):
|
||||
self.getDataStep = 0
|
||||
self.startSteps = 0
|
||||
try:
|
||||
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
|
||||
if startFromExit == 1:
|
||||
if service.get("startFromExit", 0) == 1:
|
||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
|
||||
encoding='utf-8-sig') as file_obj:
|
||||
self.startSteps = int(file_obj.read()) # 读取已执行步数
|
||||
except:
|
||||
pass
|
||||
except Exception as e:
|
||||
self.print_and_log(f"读取steps.txt失败,原因:{str(e)}")
|
||||
|
||||
if self.startSteps != 0:
|
||||
self.print_and_log("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
|
||||
self.startSteps, "条。")
|
||||
@ -137,7 +125,7 @@ class BrowserThread(Thread):
|
||||
"will start from the last step, before we already collected", self.startSteps, " items.")
|
||||
else:
|
||||
self.print_and_log("此模式下,任务ID", self.id,
|
||||
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
self.print_and_log("In this mode, task ID", self.id,
|
||||
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||
stealth_path = driver_path[:driver_path.find(
|
||||
@ -145,13 +133,12 @@ class BrowserThread(Thread):
|
||||
with open(stealth_path, 'r') as f:
|
||||
js = f.read()
|
||||
self.print_and_log("Loading stealth.min.js")
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"""
|
||||
})
|
||||
WebDriverWait(self.browser, 10)
|
||||
@ -164,75 +151,65 @@ class BrowserThread(Thread):
|
||||
self.monitor_thread.start()
|
||||
# self.browser.get('about:blank')
|
||||
self.procedure = service["graph"] # 程序执行流程
|
||||
try:
|
||||
self.maxViewLength = service["maxViewLength"] # 最大显示长度
|
||||
except:
|
||||
self.maxViewLength = 15
|
||||
try:
|
||||
self.outputFormat = service["outputFormat"] # 输出格式
|
||||
except:
|
||||
self.outputFormat = "csv"
|
||||
try:
|
||||
self.task_version = service["version"] # 任务版本
|
||||
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
if service["version"] != version:
|
||||
self.print_and_log("版本不一致,请使用" +
|
||||
service["version"] + "版本的EasySpider运行该任务!")
|
||||
self.print_and_log("Version not match, please use EasySpider " +
|
||||
service["version"] + " to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
except: # 0.2.0版本没有version字段,所以直接退出
|
||||
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
|
||||
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
|
||||
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
|
||||
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
|
||||
self.task_version = service.get("version", "") # 任务版本
|
||||
|
||||
if not self.task_version:
|
||||
self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
|
||||
self.print_and_log(
|
||||
"Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
try:
|
||||
self.save_threshold = service["saveThreshold"] # 保存最低阈值
|
||||
except:
|
||||
self.save_threshold = 10
|
||||
try:
|
||||
self.links = list(
|
||||
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
|
||||
except:
|
||||
|
||||
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务!")
|
||||
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
|
||||
service_links = service.get("links")
|
||||
if service_links:
|
||||
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
|
||||
else:
|
||||
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
|
||||
|
||||
self.OUTPUT = [] # 采集的数据
|
||||
try:
|
||||
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖,3为重命名文件
|
||||
except:
|
||||
self.dataWriteMode = 1
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
|
||||
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
|
||||
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
if self.dataWriteMode == 2:
|
||||
if self.dataWriteMode == DataWriteMode.Cover.value:
|
||||
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
|
||||
elif self.dataWriteMode == 3:
|
||||
elif self.dataWriteMode == DataWriteMode.Rename.value:
|
||||
i = 2
|
||||
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
|
||||
i = i + 1
|
||||
self.saveName = self.saveName + '_' + str(i)
|
||||
self.print_and_log("文件已存在,已重命名为", self.saveName)
|
||||
self.writeMode = 1 # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
self.writeMode = WriteMode.Create.value # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat in ['csv', 'txt', 'xlsx']:
|
||||
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
|
||||
self.OUTPUT.append([]) # 添加表头
|
||||
self.writeMode = 0
|
||||
self.writeMode = WriteMode.Create.value
|
||||
elif self.outputFormat == "json":
|
||||
self.writeMode = 3 # JSON模式无需判断是否存在文件
|
||||
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
|
||||
elif self.outputFormat == "mysql":
|
||||
self.mysql = myMySQL(config["mysql_config_path"])
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
|
||||
self.writeMode = 2
|
||||
if self.writeMode == 0:
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"],
|
||||
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
|
||||
self.writeMode = WriteMode.MySQL.value # MySQL模式
|
||||
|
||||
if self.writeMode == WriteMode.Create.value:
|
||||
self.print_and_log("新建模式|Create Mode")
|
||||
elif self.writeMode == 1:
|
||||
elif self.writeMode == WriteMode.Append.value:
|
||||
self.print_and_log("追加模式|Append Mode")
|
||||
elif self.writeMode == 2:
|
||||
elif self.writeMode == WriteMode.MySQL.value:
|
||||
self.print_and_log("MySQL模式|MySQL Mode")
|
||||
elif self.writeMode == 3:
|
||||
elif self.writeMode == WriteMode.Json.value:
|
||||
self.print_and_log("JSON模式|JSON Mode")
|
||||
|
||||
self.containJudge = service["containJudge"] # 是否含有判断语句
|
||||
self.outputParameters = {}
|
||||
self.service = service
|
||||
@ -245,191 +222,140 @@ class BrowserThread(Thread):
|
||||
if param["name"] not in self.outputParameters.keys():
|
||||
self.outputParameters[param["name"]] = ""
|
||||
self.dataNotFoundKeys[param["name"]] = False
|
||||
try:
|
||||
self.outputParametersTypes.append(param["type"])
|
||||
except:
|
||||
self.outputParametersTypes.append("text")
|
||||
try:
|
||||
self.outputParametersRecord.append(
|
||||
bool(param["recordASField"]))
|
||||
except:
|
||||
self.outputParametersRecord.append(True)
|
||||
self.outputParametersTypes.append(param.get("type", "text"))
|
||||
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
|
||||
# 文件叠加的时候不添加表头
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if self.writeMode == 0:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
self.urlId = 0 # 全局记录变量
|
||||
self.preprocess() # 预处理,优化提取数据流程
|
||||
try:
|
||||
self.inputExcel = service["inputExcel"] # 输入Excel
|
||||
except:
|
||||
self.inputExcel = ""
|
||||
self.inputExcel = service.get("inputExcel", "") # 输入Excel
|
||||
self.readFromExcel() # 读取Excel获得参数值
|
||||
|
||||
# 检测如果没有复杂的操作,优化提取数据流程
|
||||
def preprocess(self):
|
||||
for node in self.procedure:
|
||||
try:
|
||||
iframe = node["parameters"]["iframe"]
|
||||
except:
|
||||
node["parameters"]["iframe"] = False
|
||||
for index_node, node in enumerate(self.procedure):
|
||||
parameters: dict = node["parameters"]
|
||||
iframe = parameters.get('iframe')
|
||||
option = node["option"]
|
||||
|
||||
try:
|
||||
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
|
||||
node["parameters"]["xpath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["waitElementIframeIndex"] = int(
|
||||
node["parameters"]["waitElementIframeIndex"])
|
||||
except:
|
||||
node["parameters"]["waitElement"] = ""
|
||||
node["parameters"]["waitElementTime"] = 10
|
||||
node["parameters"]["waitElementIframeIndex"] = 0
|
||||
if node["option"] == 1: # 打开网页操作
|
||||
try:
|
||||
cookies = node["parameters"]["cookies"]
|
||||
except:
|
||||
node["parameters"]["cookies"] = ""
|
||||
elif node["option"] == 2: # 点击操作
|
||||
try:
|
||||
alertHandleType = node["parameters"]["alertHandleType"]
|
||||
except:
|
||||
node["parameters"]["alertHandleType"] = 0
|
||||
if node["parameters"]["useLoop"]:
|
||||
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
|
||||
if parameters.get("xpath"):
|
||||
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
|
||||
|
||||
if parameters.get("waitElementIframeIndex"):
|
||||
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
|
||||
else:
|
||||
parameters["waitElement"] = ""
|
||||
parameters["waitElementTime"] = 10
|
||||
parameters["waitElementIframeIndex"] = 0
|
||||
|
||||
if option == GraphOption.Get.value: # 打开网页操作
|
||||
parameters["cookies"] = parameters.get("cookies", "")
|
||||
elif option == GraphOption.Click.value: # 点击操作
|
||||
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
|
||||
if parameters.get("useLoop"):
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 3: # 提取数据操作
|
||||
node["parameters"]["recordASField"] = 0
|
||||
try:
|
||||
params = node["parameters"]["params"]
|
||||
except:
|
||||
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
params = node["parameters"]["params"]
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Extract.value: # 提取数据操作
|
||||
parameters["recordASField"] = 0
|
||||
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
|
||||
parameters["clear"] = parameters.get("clear", 0)
|
||||
parameters["newLine"] = parameters.get("newLine", 1)
|
||||
|
||||
params = parameters["params"]
|
||||
for param in params:
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
try:
|
||||
param["iframe"] = param.get("iframe", False)
|
||||
|
||||
if param.get("relativeXPath"):
|
||||
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["recordASField"] = param["recordASField"]
|
||||
except:
|
||||
node["parameters"]["recordASField"] = 1
|
||||
try:
|
||||
splitLine = int(param["splitLine"])
|
||||
except:
|
||||
param["splitLine"] = 0
|
||||
if param["contentType"] == 8:
|
||||
self.print_and_log(
|
||||
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log(
|
||||
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
|
||||
|
||||
parameters["recordASField"] = param.get("recordASField", 1)
|
||||
|
||||
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
|
||||
|
||||
if param.get("contentType") == 8:
|
||||
self.print_and_log("默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType =="
|
||||
"8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片"
|
||||
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
|
||||
"的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
|
||||
"modify the source code get_content function -> contentType == 8 position "
|
||||
"to your own OCR model and then compile and run it; or you can first set "
|
||||
"the content type of the crawler to \"Element Screenshot\" to save the "
|
||||
"picture, and then call your own program with custom operations. The "
|
||||
"function of the program is to read the latest generated picture, then use "
|
||||
"a good model, such as PaddleOCR to recognize the picture, and then return "
|
||||
"the return value as a parameter output to the program.")
|
||||
param["optimizable"] = detect_optimizable(param)
|
||||
elif node["option"] == 4: # 输入文字
|
||||
try:
|
||||
index = node["parameters"]["index"] # 索引值
|
||||
except:
|
||||
node["parameters"]["index"] = 0
|
||||
elif node["option"] == 5: # 自定义操作
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
elif node["option"] == 7: # 移动到元素
|
||||
if node["parameters"]["useLoop"]:
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 8: # 循环操作
|
||||
try:
|
||||
exitElement = node["parameters"]["exitElement"]
|
||||
if exitElement == "":
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
except:
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
|
||||
try:
|
||||
skipCount = node["parameters"]["skipCount"]
|
||||
except:
|
||||
node["parameters"]["skipCount"] = 0
|
||||
elif option == GraphOption.Input.value: # 输入文字
|
||||
parameters['index'] = parameters.get('index', 0)
|
||||
elif option == GraphOption.Custom.value: # 自定义操作
|
||||
parameters['clear'] = parameters.get('clear', 0)
|
||||
parameters['newLine'] = parameters.get('newLine', 1)
|
||||
elif option == GraphOption.Move.value: # 移动到元素
|
||||
if parameters.get('useLoop'):
|
||||
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Loop.value: # 循环操作
|
||||
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
|
||||
parameters["quickExtractable"] = False # 是否可以快速提取
|
||||
parameters['skipCount'] = parameters.get('skipCount', 0)
|
||||
|
||||
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
try:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
|
||||
except:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
try:
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
|
||||
except:
|
||||
waitElement = ""
|
||||
if node["parameters"]["iframe"]:
|
||||
node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
|
||||
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
|
||||
if not params:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
|
||||
|
||||
if parameters["iframe"]:
|
||||
parameters["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
else:
|
||||
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
|
||||
if node["parameters"]["skipCount"] > 0:
|
||||
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
parameters["quickExtractable"] = True # 先假设可以快速提取
|
||||
|
||||
if parameters["skipCount"] > 0:
|
||||
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
|
||||
for param in params:
|
||||
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
param['iframe'] = param.get('iframe', False)
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
optimizable = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
parameters["quickExtractable"] = False
|
||||
break
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
|
||||
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
|
||||
try:
|
||||
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
|
||||
if parameters["quickExtractable"]:
|
||||
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
|
||||
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
|
||||
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
|
||||
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
|
||||
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
|
||||
node["parameters"]["quickParams"] = []
|
||||
for param in params:
|
||||
content_type = ""
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
|
||||
"::text()") >= 0:
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
|
||||
or param["relativeXPath"].find("::text()") >= 0:
|
||||
content_type = ""
|
||||
elif param["nodeType"] == 2:
|
||||
content_type = "//@href"
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
content_type = "//@src"
|
||||
elif param["contentType"] == 1:
|
||||
content_type = "/text()"
|
||||
elif param["contentType"] == 0:
|
||||
content_type = "//text()"
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
xpath = "." + param["relativeXPath"] + content_type
|
||||
else:
|
||||
xpath = param["relativeXPath"] + content_type
|
||||
@ -443,6 +369,7 @@ class BrowserThread(Thread):
|
||||
"nodeType": param["nodeType"],
|
||||
"default": param["default"],
|
||||
})
|
||||
self.procedure[index_node]["parameters"] = parameters
|
||||
self.print_and_log("预处理完成|Preprocess completed")
|
||||
|
||||
def readFromExcel(self):
|
||||
@ -559,7 +486,10 @@ class BrowserThread(Thread):
|
||||
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
|
||||
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
|
||||
time.sleep(quitWaitTime)
|
||||
self.browser.quit()
|
||||
try:
|
||||
self.browser.quit()
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
|
||||
try:
|
||||
shutil.rmtree(self.option["tmp_user_data_folder"])
|
||||
@ -775,18 +705,20 @@ class BrowserThread(Thread):
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 2:
|
||||
self.recordLog("Execute JavaScript for element:" + code)
|
||||
self.recordLog("对元素执行JavaScript:" + code)
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code, element)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 5:
|
||||
try:
|
||||
code = readCode(code)
|
||||
@ -796,9 +728,9 @@ class BrowserThread(Thread):
|
||||
self.recordLog("执行下面的代码:" + code)
|
||||
self.recordLog("Execute the following code:" + code)
|
||||
except Exception as e:
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" +
|
||||
code, ", error is:", e)
|
||||
code, ", error is:", str(e))
|
||||
elif int(codeMode) == 6:
|
||||
try:
|
||||
code = readCode(code)
|
||||
@ -1216,6 +1148,14 @@ class BrowserThread(Thread):
|
||||
self.history["handle"] = thisHandle
|
||||
thisHistoryURL = self.browser.current_url
|
||||
# 快速提取处理
|
||||
# start = time.time()
|
||||
try:
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
except Exception as e:
|
||||
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
# end = time.time()
|
||||
# print("解析页面秒数:", end - start)
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.browser.switch_to.default_content() # 切换到主页面
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
@ -1721,8 +1661,11 @@ class BrowserThread(Thread):
|
||||
try:
|
||||
actions = ActionChains(self.browser) # 实例化一个action对象
|
||||
if newTab == 1: # 在新标签页打开
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
if sys.platform == "darwin": # Mac
|
||||
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
|
||||
else:
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
else:
|
||||
actions.click(element).perform()
|
||||
except Exception as e:
|
||||
@ -2249,7 +2192,9 @@ if __name__ == '__main__':
|
||||
"server_address": "http://localhost:8074",
|
||||
"keyboard": True, # 是否监听键盘输入
|
||||
"pause_key": "p", # 暂停键
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"docker_driver": "",
|
||||
"user_folder": "",
|
||||
}
|
||||
c = Config(config)
|
||||
print(c)
|
||||
@ -2345,35 +2290,43 @@ if __name__ == '__main__':
|
||||
os.mkdir(tmp_user_folder_parent)
|
||||
characters = string.ascii_letters + string.digits
|
||||
for i in range(len(c.ids)):
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options = tmp_options[i]["options"]
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
options.add_argument("--profile-directory=Default")
|
||||
if c.user_folder == "":
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
print(f"Use local user data folder: {tmp_user_data_folder}")
|
||||
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
|
||||
else:
|
||||
options.add_argument(
|
||||
f'--user-data-dir={c.user_folder}')
|
||||
print(f"Use specifed user data folder: {c.user_folder}, please note if you are using docker, this user folder path should be the path inside the docker container.")
|
||||
print(f"使用指定的用户信息目录: {c.user_folder},请注意如果您正在使用docker,此用户文件夹路径应是容器内的路径。")
|
||||
print(
|
||||
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally,说明有之前运行的Chrome实例没有正常关闭,请关闭之前打开的所有Chrome实例后再运行程序即可。")
|
||||
print(
|
||||
@ -2386,9 +2339,13 @@ if __name__ == '__main__':
|
||||
print("id: ", id)
|
||||
if c.read_type == "remote":
|
||||
print("remote")
|
||||
content = requests.get(
|
||||
try:
|
||||
content = requests.get(
|
||||
c.server_address + "/queryExecutionInstance?id=" + str(id))
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
except:
|
||||
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
|
||||
print("无法连接到服务器,请确保EasySpider主程序正在运行,或者您可以将--read_type参数更改为'local',以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
|
||||
else:
|
||||
print("local")
|
||||
local_folder = os.path.join(os.getcwd(), "execution_instances")
|
||||
@ -2439,8 +2396,17 @@ if __name__ == '__main__':
|
||||
except:
|
||||
browser = "chrome"
|
||||
if browser == "chrome":
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options)
|
||||
if c.docker_driver == "":
|
||||
print("Using local driver")
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
|
||||
else:
|
||||
print("Using remote driver")
|
||||
# Use docker driver, default address is http://localhost:4444/wd/hub
|
||||
# Headless mode
|
||||
# options.add_argument("--headless")
|
||||
# print("Headless mode")
|
||||
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
|
||||
elif browser == "edge":
|
||||
from selenium.webdriver.edge.service import Service as EdgeService
|
||||
from selenium.webdriver.edge.options import Options as EdgeOptions
|
||||
@ -2501,6 +2467,7 @@ if __name__ == '__main__':
|
||||
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
|
||||
# 使用监听器监听键盘输入
|
||||
try:
|
||||
from pynput.keyboard import Key, Listener
|
||||
if c.keyboard:
|
||||
with Listener(on_press=on_press_creator(press_time, event),
|
||||
on_release=on_release_creator(event, press_time)) as listener:
|
||||
|
@ -19,11 +19,16 @@ desired_capabilities["pageLoadStrategy"] = "none"
|
||||
|
||||
|
||||
|
||||
class MyChrome(webdriver.Chrome):
|
||||
class MyChrome(webdriver.Chrome, webdriver.Remote):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
def __init__(self, mode='local_driver', *args, **kwargs):
|
||||
self.iframe_env = False # 现在的环境是root还是iframe
|
||||
super().__init__(*args, **kwargs) # 调用父类的 __init__
|
||||
self.mode = mode
|
||||
if mode == "local_driver":
|
||||
webdriver.Chrome.__init__(self, *args, **kwargs)
|
||||
elif mode == "remote_driver":
|
||||
webdriver.Remote.__init__(self, *args, **kwargs)
|
||||
# super().__init__(*args, **kwargs) # 调用父类的 __init__
|
||||
|
||||
# def find_element(self, by=By.ID, value=None, iframe=False):
|
||||
# # 在这里改变查找元素的行为
|
||||
|
@ -64,49 +64,49 @@ def compress_folder_to_7z_split(folder_path, output_file):
|
||||
except:
|
||||
subprocess.call(["7zz", "a", "-v95m", output_file, folder_path])
|
||||
|
||||
easyspider_version = "0.6.2"
|
||||
easyspider_version = "0.6.3"
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if sys.platform == "win32" and platform.architecture()[0] == "64bit":
|
||||
file_name = f"EasySpider_{easyspider_version}_windows_x64.7z"
|
||||
if os.path.exists("./EasySpider_windows_x64/user_data"):
|
||||
shutil.rmtree("./EasySpider_windows_x64/user_data")
|
||||
if os.path.exists("./EasySpider_windows_x64/Data"):
|
||||
shutil.rmtree("./EasySpider_windows_x64/Data")
|
||||
if os.path.exists("./EasySpider_windows_x64/execution_instances"):
|
||||
shutil.rmtree("./EasySpider_windows_x64/execution_instances")
|
||||
if os.path.exists("./EasySpider_windows_x64/config.json"):
|
||||
os.remove("./EasySpider_windows_x64/config.json")
|
||||
if os.path.exists("./EasySpider_windows_x64/mysql_config.json"):
|
||||
os.remove("./EasySpider_windows_x64/mysql_config.json")
|
||||
if os.path.exists("./EasySpider_windows_x64/TempUserDataFolder"):
|
||||
shutil.rmtree("./EasySpider_windows_x64/TempUserDataFolder")
|
||||
os.mkdir("./EasySpider_windows_x64/Data")
|
||||
os.mkdir("./EasySpider_windows_x64/execution_instances")
|
||||
# compress_folder_to_7z_split("./EasySpider_windows_x64", file_name)
|
||||
file_name = f"EasySpider_{easyspider_version}_Windows_x64.7z"
|
||||
if os.path.exists("./EasySpider_Windows_x64/user_data"):
|
||||
shutil.rmtree("./EasySpider_Windows_x64/user_data")
|
||||
if os.path.exists("./EasySpider_Windows_x64/Data"):
|
||||
shutil.rmtree("./EasySpider_Windows_x64/Data")
|
||||
if os.path.exists("./EasySpider_Windows_x64/execution_instances"):
|
||||
shutil.rmtree("./EasySpider_Windows_x64/execution_instances")
|
||||
if os.path.exists("./EasySpider_Windows_x64/config.json"):
|
||||
os.remove("./EasySpider_Windows_x64/config.json")
|
||||
if os.path.exists("./EasySpider_Windows_x64/mysql_config.json"):
|
||||
os.remove("./EasySpider_Windows_x64/mysql_config.json")
|
||||
if os.path.exists("./EasySpider_Windows_x64/TempUserDataFolder"):
|
||||
shutil.rmtree("./EasySpider_Windows_x64/TempUserDataFolder")
|
||||
os.mkdir("./EasySpider_Windows_x64/Data")
|
||||
os.mkdir("./EasySpider_Windows_x64/execution_instances")
|
||||
# compress_folder_to_7z_split("./EasySpider_Windows_x64", file_name)
|
||||
# print(f"Compress {file_name} Split successfully!")
|
||||
compress_folder_to_7z("./EasySpider_windows_x64", file_name)
|
||||
compress_folder_to_7z("./EasySpider_Windows_x64", file_name)
|
||||
print(f"Compress {file_name} successfully!")
|
||||
elif sys.platform == "win32" and platform.architecture()[0] == "32bit":
|
||||
file_name = f"EasySpider_{easyspider_version}_windows_x32.7z"
|
||||
if os.path.exists("./EasySpider_windows_x32/user_data"):
|
||||
shutil.rmtree("./EasySpider_windows_x32/user_data")
|
||||
if os.path.exists("./EasySpider_windows_x32/Data"):
|
||||
shutil.rmtree("./EasySpider_windows_x32/Data")
|
||||
if os.path.exists("./EasySpider_windows_x32/execution_instances"):
|
||||
shutil.rmtree("./EasySpider_windows_x32/execution_instances")
|
||||
if os.path.exists("./EasySpider_windows_x32/config.json"):
|
||||
os.remove("./EasySpider_windows_x32/config.json")
|
||||
if os.path.exists("./EasySpider_windows_x32/mysql_config.json"):
|
||||
os.remove("./EasySpider_windows_x32/mysql_config.json")
|
||||
if os.path.exists("./EasySpider_windows_x32/TempUserDataFolder"):
|
||||
shutil.rmtree("./EasySpider_windows_x32/TempUserDataFolder")
|
||||
os.mkdir("./EasySpider_windows_x32/Data")
|
||||
os.mkdir("./EasySpider_windows_x32/execution_instances")
|
||||
# compress_folder_to_7z_split("./EasySpider_windows_x32", file_name)
|
||||
file_name = f"EasySpider_{easyspider_version}_Windows_x32.7z"
|
||||
if os.path.exists("./EasySpider_Windows_x32/user_data"):
|
||||
shutil.rmtree("./EasySpider_Windows_x32/user_data")
|
||||
if os.path.exists("./EasySpider_Windows_x32/Data"):
|
||||
shutil.rmtree("./EasySpider_Windows_x32/Data")
|
||||
if os.path.exists("./EasySpider_Windows_x32/execution_instances"):
|
||||
shutil.rmtree("./EasySpider_Windows_x32/execution_instances")
|
||||
if os.path.exists("./EasySpider_Windows_x32/config.json"):
|
||||
os.remove("./EasySpider_Windows_x32/config.json")
|
||||
if os.path.exists("./EasySpider_Windows_x32/mysql_config.json"):
|
||||
os.remove("./EasySpider_Windows_x32/mysql_config.json")
|
||||
if os.path.exists("./EasySpider_Windows_x32/TempUserDataFolder"):
|
||||
shutil.rmtree("./EasySpider_Windows_x32/TempUserDataFolder")
|
||||
os.mkdir("./EasySpider_Windows_x32/Data")
|
||||
os.mkdir("./EasySpider_Windows_x32/execution_instances")
|
||||
# compress_folder_to_7z_split("./EasySpider_Windows_x32", file_name)
|
||||
# print(f"Compress {file_name} Split successfully!")
|
||||
compress_folder_to_7z("./EasySpider_windows_x32", file_name)
|
||||
compress_folder_to_7z("./EasySpider_Windows_x32", file_name)
|
||||
print(f"Compress {file_name} successfully!")
|
||||
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
|
||||
file_name = f"EasySpider_{easyspider_version}_Linux_x64.tar.xz"
|
||||
|
Binary file not shown.
Binary file not shown.
@ -1,4 +1,8 @@
|
||||
# 环境编译说明|Environment Compilation Instruction
|
||||
## 视频教程
|
||||
|
||||
[从源代码编译程序并设计运行和调试任务指南(基于Ubuntu24.04)](https://www.bilibili.com/video/BV1VE421P7yj/)
|
||||
|
||||
# 环境编译说明 | Environment Compilation Instruction
|
||||
|
||||
EasySpider分三部分:
|
||||
|
||||
@ -19,35 +23,35 @@ EasySpider is divided into three parts:
|
||||
This section covers the compilation instructions for the `main program`.
|
||||
|
||||
|
||||
## 建议编译顺序|Suggested Compilation Order
|
||||
## 建议编译顺序 | Suggested Compilation Order
|
||||
|
||||
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
|
||||
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
|
||||
3. 编译执行阶段程序,否则无法执行程序,只能设计程序。
|
||||
3. 编译执行阶段程序,否则无法执行任务,只能设计任务。
|
||||
|
||||
-----
|
||||
|
||||
1. Compile the browser extension, otherwise an error will be prompted when the main program is executed that `EasySpider_en.crx` cannot be found.
|
||||
2. Compile the main program. At this point the main program can run normally, but it can only design tasks, not execute them.
|
||||
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
|
||||
3. Compile the execution stage program; otherwise tasks can only be designed, not executed.
|
||||
|
||||
## 注意事项|Note
|
||||
## 注意事项 | Note
|
||||
|
||||
请记住,每当EasySpider扩展程序和执行程序更新时,都要更新`EasySpider.crx`和`easyspider_executestage`文件。
|
||||
|
||||
Remember to update the `EasySpider.crx` and `easyspider_executestage` files whenever the EasySpider extension and execution program are updated.
|
||||
|
||||
## 环境构建|Environment Setup
|
||||
## 环境构建 | Environment Setup
|
||||
|
||||
以下以Windows x64版本为例。
|
||||
|
||||
The following uses the Windows x64 version as an example.
|
||||
|
||||
### 浏览器和驱动|Browser and Driver
|
||||
### 浏览器和驱动 | Browser and Driver
|
||||
|
||||
实在搞不定本节的情况下,下载一个直接能用的EasySpider,并把文件夹内的`EasySpider\resources\app\chrome_win64`文件夹拷贝到此`ElectronJS`文件夹下即可。
|
||||
实在搞不定本节的情况下,下载一个直接能用的EasySpider,并把文件夹内的`EasySpider\resources\app\chrome_win64`文件夹拷贝到此`ElectronJS`文件夹下,并把`chrome_win64`文件夹下的`execute.sh`在原文件夹下复制一份并命名为`execute_win64.sh`即可。
|
||||
|
||||
If you're unable to handle the tasks in this section, you can download a ready-to-use EasySpider. Simply copy the `EasySpider\resources\app\chrome_win64` folder from the downloaded files and paste it into the ElectronJS folder.
|
||||
If you're unable to handle the tasks in this section, you can download a ready-to-use EasySpider, and copy the `EasySpider\resources\app\chrome_win64` folder to this `ElectronJS` folder, then copy the `execute.sh` script found in the `chrome_win64` folder and rename it as `execute_win64.sh` in the same location.
|
||||
|
||||
------
|
||||
|
||||
@ -66,7 +70,7 @@ chrome_linux64/ # for linux x64
|
||||
chrome_mac64/ # for mac x64
|
||||
```
|
||||
|
||||
然后,从下面的页面下载和**自己安装的Chrome版本一致**的Chromedriver:[https://chromedriver.chromium.org/downloads](https://chromedriver.chromium.org/downloads),把chromedriver放入刚刚的`chrome`文件夹内,并更名为下面的格式:
|
||||
然后,从下面的页面下载和**自己安装的Chrome版本一致**的Chromedriver:[https://googlechromelabs.github.io/chrome-for-testing/](https://googlechromelabs.github.io/chrome-for-testing/),把chromedriver放入刚刚的`chrome`文件夹内,并更名为下面的格式:
|
||||
|
||||
```
|
||||
chromedriver_win32.exe # for windows x32
|
||||
@ -77,7 +81,7 @@ chromedriver_mac64 # for mac x64
|
||||
|
||||
例如,如果您想在Windows x64平台上构建此软件,那么您首先需要下载适用于Windows x64的Chrome浏览器,并将整个`chrome`文件夹复制到`ElectronJS`文件夹中,然后将文件夹重命名为`chrome_win64`。假设您下载的Chrome版本是110。接下来,下载一个适用于Windows x64的110版本的ChromeDriver,并将其放入`chrome_win64`文件夹中,然后将其重命名为`chromedriver_win64.exe`。
|
||||
|
||||
最后,把此文件夹内的`stealth.min.js`和`execute.bat`文件拷贝入`chrome`文件夹内。
|
||||
最后,把此`ElectronJS`文件夹内的`stealth.min.js`和`execute_win64.bat`文件拷贝入`chrome_win64`文件夹内,**这一步不要忘**。
|
||||
|
||||
|
||||
Download a Chrome from the Internet: https://www.google.com/chrome/, and then put them into this folder, with name format of the following:
|
||||
@ -100,33 +104,31 @@ chromedriver_mac64 # for mac x64
|
||||
|
||||
For example, if you want to build this software on Windows x64 platform, then you should first download a Chrome for Windows x64, then copy the whole `chrome` folder to this `ElectronJS` folder and rename the folder to `chrome_win64`, assume the Chrome version you downloaded is 110; then, download a `chromedriver.exe` with version 110 for Windows x64, and put it into the `chrome_win64` folder, then rename it to `chromedriver_win64.exe`.
|
||||
|
||||
Finally, copy the `stealth.min.js` and `execute.bat` (for Windows x64) file in this folder to these `chrome` folders.
|
||||
Finally, copy the `stealth.min.js` and `execute_win64.bat` file in this `ElectronJS` folder to the `chrome_win64` folder **(do not forget this step)**.
|
||||
|
||||
### NodeJS环境|NodeJS Environment
|
||||
### NodeJS环境 | NodeJS Environment
|
||||
|
||||
1. Windows环境下需要先安装`VS Build Tools 2017` ([https://aka.ms/vs/15/release/vs_buildtools.exe](https://aka.ms/vs/15/release/vs_buildtools.exe))的`Visual C++ Build Tools`组件,不然下面的命令无法执行,其他系统不需要。
|
||||
1. Windows环境下需要先下载`VS Build Tools 2017` ([https://aka.ms/vs/15/release/vs_buildtools.exe](https://aka.ms/vs/15/release/vs_buildtools.exe))并勾选安装其中的`Visual C++ Build Tools(Visual C++生成工具)`组件以便`node-gyp`模块来安装`node-windows-manager`,不然下面的命令无法执行,其他系统不需要。同时,`Python3`也需要安装在系统中并配置好环境变量。
|
||||
2. 安装`NodeJS`:[https://nodejs.org/zh-cn/download/](https://nodejs.org/zh-cn/download/)。
|
||||
3. 运行下面的命令来安装依赖:
|
||||
|
||||
```
|
||||
npm install
|
||||
npm install @electron-forge/cli -g
|
||||
```
|
||||
|
||||
如果上面的命令运行速度很慢可以参考NodeJS换源说明:[https://blog.csdn.net/qq_23211463/article/details/123769061](https://blog.csdn.net/qq_23211463/article/details/123769061)。
|
||||
如果上面的命令运行速度很慢可以参考使用NodeJS和Electron包的换源说明来加速安装:[https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803)。
|
||||
|
||||
-----
|
||||
|
||||
1. On Windows, you need to install `VS Build Tools 2017` (https://aka.ms/vs/15/release/vs_buildtools.exe, select and install the `Visual C++ Build Tools` component) first for node-gyp to install `node-windows-manager` (No need for other OS).
|
||||
1. On Windows, you need to download `VS Build Tools 2017` (https://aka.ms/vs/15/release/vs_buildtools.exe, select and install the `Visual C++ Build Tools` component) first for the module `node-gyp` to install `node-windows-manager` (No need for other OS). Meanwhile, `Python3` needs to be installed and the environment variables need to be configured.
|
||||
2. Install `NodeJS`: [https://nodejs.org/en/download/](https://nodejs.org/en/download/).
|
||||
3. Run the following commands to install NodeJS packages:
|
||||
|
||||
```
|
||||
npm install
|
||||
npm install @electron-forge/cli -g
|
||||
```
|
||||
|
||||
## 运行说明|Run Instruction
|
||||
## 运行说明 | Run Instruction
|
||||
|
||||
在当前文件夹执行以下命令即可在开发模式下运行程序:
|
||||
|
||||
@ -146,14 +148,13 @@ npm run start_direct
|
||||
|
||||
At this point, however, you can only design tasks, not execute them. To execute tasks, you also need to compile the task execution program by following the compilation instructions in the 'ExecuteStage' folder.
|
||||
|
||||
## 打包发布说明|Package Instruction
|
||||
## 打包发布说明 | Package Instruction
|
||||
|
||||
打包发布前,确保执行阶段程序`easyspider_executestage(.exe)`已放入`chrome(_win64)`文件夹内,且浏览器插件`EasySpider_zh.crx`已经是最新版本。
|
||||
|
||||
执行下面的命令即可打包:
|
||||
执行下面的命令即可打包(需要安装`Git`):
|
||||
|
||||
```
|
||||
npx electron-forge import
|
||||
npm run package
|
||||
```
|
||||
|
||||
@ -161,10 +162,9 @@ npm run package
|
||||
|
||||
Before packaging and releasing, make sure that the task execution program `easyspider_executestage(.exe)` is placed inside the `chrome(_win64)` folder and that the browser extension `EasySpider_en.crx` is the latest version.
|
||||
|
||||
After finishing developing, package software by the following command:
|
||||
After finishing developing, package software by the following command (`Git` is required):
|
||||
|
||||
```
|
||||
npx electron-forge import
|
||||
npm run package
|
||||
```
|
||||
|
||||
@ -186,8 +186,43 @@ package_win64.cmd
|
||||
clean_and_release_win64.cmd
|
||||
```
|
||||
|
||||
### (可选)编译成安装包|(Optional) Compile to an installation package
|
||||
## 可能出现的问题 | Troubleshooting
|
||||
|
||||
以下命令一般不需要执行,但打包时可能会用到:
|
||||
|
||||
```sh
|
||||
npm install @electron-forge/cli -g
|
||||
npx electron-forge import
|
||||
```
|
||||
npm run make
|
||||
|
||||
如果任务执行到`npm install electron-squirrel-startup`的步骤时卡死,请参考下面的换源教程:[https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803)。
|
||||
|
||||
Windows端如果在运行`npm run package`的时候提示`node-gyp`相关的错误,可以安装`electron-rebuild`并重新编译相关模块:
|
||||
|
||||
```sh
|
||||
npm install --save-dev electron-rebuild
|
||||
npx electron-rebuild
|
||||
```
|
||||
|
||||
然后再次运行`npm run package`。
|
||||
|
||||
-----
|
||||
|
||||
The following commands are generally not required, but may be used during packaging:
|
||||
|
||||
```sh
|
||||
npm install @electron-forge/cli -g
|
||||
npx electron-forge import
|
||||
```
|
||||
|
||||
If the task is stuck at the `npm install electron-squirrel-startup` step, please refer to the following tutorial on changing the source: [https://blog.csdn.net/qq_38463737/article/details/140277803](https://blog.csdn.net/qq_38463737/article/details/140277803).
|
||||
|
||||
If you encounter `node-gyp` related errors when running `npm run package` on Windows, you can install `electron-rebuild` and recompile the relevant modules:
|
||||
|
||||
```sh
|
||||
npm install --save-dev electron-rebuild
|
||||
npx electron-rebuild
|
||||
```
|
||||
|
||||
Then run `npm run package` again.
|
||||
|
||||
|
@ -30,7 +30,7 @@ def update_file_version(file_path, new_version, key="当前版本/Current Versio
|
||||
file.write(line)
|
||||
|
||||
|
||||
version = "0.6.2"
|
||||
version = "0.6.3"
|
||||
|
||||
# py html js
|
||||
|
||||
@ -47,7 +47,8 @@ if __name__ == "__main__":
|
||||
|
||||
# index.html
|
||||
file_path = "./src/index.html"
|
||||
update_file_version(file_path, version, key="当前版本/Current Version: <b>v")
|
||||
update_file_version(file_path, version, key="软件当前版本:<b>v")
|
||||
update_file_version(file_path, version, key="Current Version: <b>v")
|
||||
|
||||
# package.json
|
||||
file_path = "./package.json"
|
||||
|
@ -11,9 +11,10 @@ del out\EasySpider\resources\app\vs_BuildTools.exe
|
||||
move out\EasySpider ..\.temp_to_pub\EasySpider_windows_x32\EasySpider
|
||||
rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
mkdir ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
@REM copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
@REM copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
@REM copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\*.py ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\requirements.txt ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\Readme.md ..\.temp_to_pub\EasySpider_windows_x32\Code
|
||||
copy ..\ExecuteStage\myCode.py ..\.temp_to_pub\EasySpider_windows_x32
|
||||
|
@ -11,9 +11,10 @@ del out\EasySpider\resources\app\vs_BuildTools.exe
|
||||
move out\EasySpider ..\.temp_to_pub\EasySpider_windows_x64\EasySpider
|
||||
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
mkdir ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
@REM copy ..\ExecuteStage\easyspider_executestage.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
@REM copy ..\ExecuteStage\myChrome.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
@REM copy ..\ExecuteStage\utils.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\*.py ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\requirements.txt ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\Readme.md ..\.temp_to_pub\EasySpider_windows_x64\Code
|
||||
copy ..\ExecuteStage\myCode.py ..\.temp_to_pub\EasySpider_windows_x64
|
||||
|
@ -1 +1 @@
|
||||
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
|
||||
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data","lang":"zh"}
|
@ -50,7 +50,9 @@ if (config.debug) {
|
||||
}
|
||||
let allWindowSockets = [];
|
||||
let allWindowScoketNames = [];
|
||||
task_server.start(config.webserver_port); //start local server
|
||||
if(config.webserver_address.includes("localhost") || config.webserver_address.includes("127.0.0.1")) {
|
||||
task_server.start(config.webserver_port); //start local server
|
||||
}
|
||||
let server_address = `${config.webserver_address}:${config.webserver_port}`;
|
||||
const websocket_port = 8084; //目前只支持8084端口,写死,因为扩展里面写死了
|
||||
console.log("server_address: " + server_address);
|
||||
@ -84,11 +86,11 @@ console.log(process.arch);
|
||||
if (process.platform === "win32" && process.arch === "ia32") {
|
||||
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");
|
||||
chromeBinaryPath = path.join(__dirname, "chrome_win32/chrome.exe");
|
||||
execute_path = path.join(__dirname, "chrome_win32/execute.bat");
|
||||
execute_path = path.join(__dirname, "chrome_win32/execute_win32.bat");
|
||||
} else if (process.platform === "win32" && process.arch === "x64") {
|
||||
driverPath = path.join(__dirname, "chrome_win64/chromedriver_win64.exe");
|
||||
chromeBinaryPath = path.join(__dirname, "chrome_win64/chrome.exe");
|
||||
execute_path = path.join(__dirname, "chrome_win64/execute.bat");
|
||||
execute_path = path.join(__dirname, "chrome_win64/execute_win64.bat");
|
||||
} else if (process.platform === "darwin") {
|
||||
driverPath = path.join(__dirname, "chromedriver_mac64");
|
||||
chromeBinaryPath = path.join(
|
||||
@ -99,7 +101,7 @@ if (process.platform === "win32" && process.arch === "ia32") {
|
||||
} else if (process.platform === "linux") {
|
||||
driverPath = path.join(__dirname, "chrome_linux64/chromedriver_linux64");
|
||||
chromeBinaryPath = path.join(__dirname, "chrome_linux64/chrome");
|
||||
execute_path = path.join(__dirname, "chrome_linux64/execute.sh");
|
||||
execute_path = path.join(__dirname, "chrome_linux64/execute_linux64.sh");
|
||||
}
|
||||
console.log(driverPath, chromeBinaryPath, execute_path);
|
||||
let language = "en";
|
||||
@ -112,6 +114,7 @@ let handle_pairs = {};
|
||||
let socket_window = null;
|
||||
let socket_start = null;
|
||||
let socket_flowchart = null;
|
||||
let socket_popup = null;
|
||||
let invoke_window = null;
|
||||
|
||||
// var ffi = require('ffi-napi');
|
||||
@ -148,8 +151,8 @@ function createWindow() {
|
||||
server_address +
|
||||
"/index.html?user_data_folder=" +
|
||||
config.user_data_folder +
|
||||
"&copyright=" +
|
||||
config.copyright,
|
||||
"&copyright=" + config.copyright +
|
||||
"&lang=" + config.lang,
|
||||
{extraHeaders: "pragma: no-cache\n"}
|
||||
);
|
||||
// 隐藏菜单栏
|
||||
@ -160,9 +163,8 @@ function createWindow() {
|
||||
app.quit();
|
||||
}
|
||||
});
|
||||
//调试模式
|
||||
// mainWindow.webContents.openDevTools();
|
||||
// Open the DevTools.
|
||||
// mainWindow.webContents.openDevTools()
|
||||
}
|
||||
|
||||
async function findElementRecursive(driver, by, value, frames) {
|
||||
@ -243,6 +245,7 @@ async function findElementAcrossAllWindows(
|
||||
let handles = await driver.getAllWindowHandles();
|
||||
// console.log("handles", handles);
|
||||
let content_handle = current_handle;
|
||||
let old_handle = current_handle;
|
||||
let id = -1;
|
||||
try {
|
||||
id = msg.message.id;
|
||||
@ -289,12 +292,12 @@ async function findElementAcrossAllWindows(
|
||||
xpath = msg.xpath;
|
||||
}
|
||||
}
|
||||
if (xpath.indexOf("Field(") >= 0 || xpath.indexOf("eval(") >= 0) {
|
||||
if (xpath.indexOf("Field[") >= 0 || xpath.indexOf("eval(") >= 0) {
|
||||
//两秒后通知浏览器
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||
notify_browser(
|
||||
'检测到XPath中包含Field("")或eval(""),试运行时无法正常定位到包含此两项表达式的元素,请在任务正式运行阶段测试是否有效。',
|
||||
'Field("") or eval("") is detected in xpath, and the element containing these two expressions cannot be located normally during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
'检测到XPath中包含Field[""]或eval(""),试运行时无法正常定位到包含此两项表达式的元素,请在任务正式运行阶段测试是否有效。',
|
||||
'Field[""] or eval("") is detected in xpath, and the element containing these two expressions cannot be located normally during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
"warning"
|
||||
);
|
||||
return null;
|
||||
@ -308,7 +311,7 @@ async function findElementAcrossAllWindows(
|
||||
if (h != null && handles.includes(h)) {
|
||||
await driver.switchTo().window(h);
|
||||
current_handle = h;
|
||||
console.log("switch to handle: ", h);
|
||||
console.log("Switch to handle: ", h);
|
||||
}
|
||||
element = await findElement(driver, By.xpath, xpath, iframe);
|
||||
break;
|
||||
@ -325,6 +328,12 @@ async function findElementAcrossAllWindows(
|
||||
}
|
||||
}
|
||||
if (element == null && notifyBrowser) {
|
||||
// 如果找不到元素,切换回原来的窗口
|
||||
if (old_handle != null && handles.includes(old_handle)) {
|
||||
await driver.switchTo().window(old_handle);
|
||||
current_handle = old_handle;
|
||||
console.log("Switch to handle: ", old_handle);
|
||||
}
|
||||
notify_browser(
|
||||
"无法找到元素,请检查XPath是否正确:" + xpath,
|
||||
"Cannot find the element, please check if the XPath is correct: " + xpath,
|
||||
@ -654,7 +663,11 @@ async function beginInvoke(msg, ws) {
|
||||
if (parameters.clickWay == 2){ //双击
|
||||
await click_element(element, "double");
|
||||
} else {
|
||||
await click_element(element); //单击
|
||||
if (parameters.newTab == 1){
|
||||
await click_element(element, "loopClickEvery"); //新标签页打开
|
||||
} else {
|
||||
await click_element(element); //单击
|
||||
}
|
||||
}
|
||||
}
|
||||
let alertHandleType = parameters.alertHandleType;
|
||||
@ -761,12 +774,12 @@ async function beginInvoke(msg, ws) {
|
||||
keyInfo = keyInfo.replace(match[0], jsReplacedText.toString());
|
||||
}
|
||||
}
|
||||
if (keyInfo.indexOf("Field(") >= 0 || keyInfo.indexOf("eval(") >= 0) {
|
||||
if (keyInfo.indexOf("Field[") >= 0 || keyInfo.indexOf("eval(") >= 0) {
|
||||
//两秒后通知浏览器
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||
notify_browser(
|
||||
'检测到文字中包含Field("")或eval(""),试运行时无法输入两项表达式的替换值,请在任务正式运行阶段测试是否有效。',
|
||||
'Field("") or eval("") is detected in the text, and the replacement value of the two expressions cannot be entered during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
'检测到文字中包含Field[""]或eval(""),试运行时无法输入两项表达式的替换值,请在任务正式运行阶段测试是否有效。',
|
||||
'Field[""] or eval("") is detected in the text, and the replacement value of the two expressions cannot be entered during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
"warning"
|
||||
);
|
||||
}
|
||||
@ -1119,18 +1132,41 @@ async function beginInvoke(msg, ws) {
|
||||
} catch {
|
||||
console.log("Cannot get Cookies");
|
||||
}
|
||||
} else if (msg.type == 30) {
|
||||
send_message_to_browser(
|
||||
JSON.stringify({
|
||||
type: "showAllToolboxes"
|
||||
})
|
||||
);
|
||||
console.log("Show all toolboxes");
|
||||
} else if (msg.type == 31) {
|
||||
send_message_to_browser(
|
||||
JSON.stringify({
|
||||
type: "hideAllToolboxes"
|
||||
})
|
||||
);
|
||||
console.log("Hide all toolboxes");
|
||||
}
|
||||
}
|
||||
|
||||
async function click_element(element, type = "click") {
|
||||
try {
|
||||
if (type == "loopClickEvery") {
|
||||
await driver
|
||||
if (process.platform === "darwin") {
|
||||
await driver
|
||||
.actions()
|
||||
.keyDown(Key.COMMAND)
|
||||
.click(element)
|
||||
.keyUp(Key.COMMAND)
|
||||
.perform();
|
||||
} else {
|
||||
await driver
|
||||
.actions()
|
||||
.keyDown(Key.CONTROL)
|
||||
.click(element)
|
||||
.keyUp(Key.CONTROL)
|
||||
.perform();
|
||||
}
|
||||
} else if (type.includes("point(")) {
|
||||
//point(10, 20)表示点击坐标为(10, 20)的位置
|
||||
let point = type.substring(6, type.length - 1).split(",");
|
||||
@ -1177,12 +1213,12 @@ async function execute_js(js, element, wait_time = 3) {
|
||||
);
|
||||
outcome = -1;
|
||||
}
|
||||
if (js.indexOf("Field(") >= 0 || js.indexOf("eval(") >= 0) {
|
||||
if (js.indexOf("Field[") >= 0 || js.indexOf("eval(") >= 0) {
|
||||
//两秒后通知浏览器
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||
notify_browser(
|
||||
'检测到JavaScript中包含Field("")或eval(""),试运行时无法执行两项表达式,请在任务正式运行阶段测试是否有效。',
|
||||
'Field("") or eval("") is detected in JavaScript, and the two expressions cannot be executed during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
'检测到JavaScript中包含Field[""]或eval(""),试运行时无法执行两项表达式,请在任务正式运行阶段测试是否有效。',
|
||||
'Field[""] or eval("") is detected in JavaScript, and the two expressions cannot be executed during trial operation. Please test whether it is valid in the formal call stage.',
|
||||
"warning"
|
||||
);
|
||||
}
|
||||
@ -1253,6 +1289,9 @@ wss.on("connection", function (ws) {
|
||||
// console.log("socket_flowchart closed");
|
||||
// });
|
||||
console.log("set socket_flowchart at time: ", new Date());
|
||||
} else if (msg.message.id == 3) {
|
||||
socket_popup = ws;
|
||||
console.log("set socket_popup at time: ", new Date());
|
||||
} else {
|
||||
//其他的ID是用来标识不同的浏览器标签页的
|
||||
// await new Promise(resolve => setTimeout(resolve, 200));
|
||||
@ -1543,6 +1582,17 @@ app.whenReady().then(() => {
|
||||
path.join(task_server.getDir(), "config.json"),
|
||||
JSON.stringify(config)
|
||||
);
|
||||
//重新读取配置文件
|
||||
config = JSON.parse(fs.readFileSync(path.join(task_server.getDir(), "config.json")));
|
||||
});
|
||||
ipcMain.on("change-lang", function (event, arg) {
|
||||
config.lang = arg;
|
||||
fs.writeFileSync(
|
||||
path.join(task_server.getDir(), "config.json"),
|
||||
JSON.stringify(config)
|
||||
);
|
||||
//重新读取配置文件
|
||||
config = JSON.parse(fs.readFileSync(path.join(task_server.getDir(), "config.json")));
|
||||
});
|
||||
createWindow();
|
||||
|
||||
|
193
ElectronJS/package-lock.json
generated
193
ElectronJS/package-lock.json
generated
@ -1,24 +1,24 @@
|
||||
{
|
||||
"name": "easy-spider",
|
||||
"version": "0.6.0",
|
||||
"version": "0.6.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "easy-spider",
|
||||
"version": "0.6.0",
|
||||
"version": "0.6.3",
|
||||
"license": "AGPL-3.0",
|
||||
"dependencies": {
|
||||
"cors": "^2.8.5",
|
||||
"electron-squirrel-startup": "^1.0.0",
|
||||
"express": "^4.19.2",
|
||||
"express": "^4.21.2",
|
||||
"formidable": "^3.5.0",
|
||||
"http": "^0.0.1-security",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"node-abi": "^3.52.0",
|
||||
"node-window-manager": "^2.2.4",
|
||||
"selenium-webdriver": "^4.16.0",
|
||||
"ws": "^8.12.0",
|
||||
"selenium-webdriver": "^4.27.0",
|
||||
"ws": "^8.18.0",
|
||||
"xlsx": "^0.18.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
@ -30,6 +30,11 @@
|
||||
"electron": "^27.1.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@bazel/runfiles": {
|
||||
"version": "6.3.1",
|
||||
"resolved": "https://registry.npmjs.org/@bazel/runfiles/-/runfiles-6.3.1.tgz",
|
||||
"integrity": "sha512-1uLNT5NZsUVIGS4syuHwTzZ8HycMPyr6POA3FCE4GbMtc4rhoJk8aZKtNIRthJYfL+iioppi+rTfH3olMPr9nA=="
|
||||
},
|
||||
"node_modules/@electron-forge/cli": {
|
||||
"version": "6.2.1",
|
||||
"dev": true,
|
||||
@ -1203,6 +1208,7 @@
|
||||
},
|
||||
"node_modules/balanced-match": {
|
||||
"version": "1.0.2",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/base64-js": {
|
||||
@ -1253,9 +1259,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/body-parser": {
|
||||
"version": "1.20.2",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz",
|
||||
"integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==",
|
||||
"version": "1.20.3",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz",
|
||||
"integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==",
|
||||
"dependencies": {
|
||||
"bytes": "3.1.2",
|
||||
"content-type": "~1.0.5",
|
||||
@ -1265,7 +1271,7 @@
|
||||
"http-errors": "2.0.0",
|
||||
"iconv-lite": "0.4.24",
|
||||
"on-finished": "2.4.1",
|
||||
"qs": "6.11.0",
|
||||
"qs": "6.13.0",
|
||||
"raw-body": "2.5.2",
|
||||
"type-is": "~1.6.18",
|
||||
"unpipe": "1.0.0"
|
||||
@ -1307,6 +1313,7 @@
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"balanced-match": "^1.0.0",
|
||||
@ -1314,11 +1321,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/braces": {
|
||||
"version": "3.0.2",
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
|
||||
"integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"fill-range": "^7.0.1"
|
||||
"fill-range": "^7.1.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
@ -1667,6 +1675,7 @@
|
||||
},
|
||||
"node_modules/concat-map": {
|
||||
"version": "0.0.1",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/concat-stream": {
|
||||
@ -1727,9 +1736,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/cookie": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz",
|
||||
"integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==",
|
||||
"version": "0.7.1",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.1.tgz",
|
||||
"integrity": "sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==",
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@ -2188,9 +2197,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/encodeurl": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
|
||||
"integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
|
||||
"integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
|
||||
"engines": {
|
||||
"node": ">= 0.8"
|
||||
}
|
||||
@ -2397,36 +2406,36 @@
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/express": {
|
||||
"version": "4.19.2",
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz",
|
||||
"integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==",
|
||||
"version": "4.21.2",
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz",
|
||||
"integrity": "sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==",
|
||||
"dependencies": {
|
||||
"accepts": "~1.3.8",
|
||||
"array-flatten": "1.1.1",
|
||||
"body-parser": "1.20.2",
|
||||
"body-parser": "1.20.3",
|
||||
"content-disposition": "0.5.4",
|
||||
"content-type": "~1.0.4",
|
||||
"cookie": "0.6.0",
|
||||
"cookie": "0.7.1",
|
||||
"cookie-signature": "1.0.6",
|
||||
"debug": "2.6.9",
|
||||
"depd": "2.0.0",
|
||||
"encodeurl": "~1.0.2",
|
||||
"encodeurl": "~2.0.0",
|
||||
"escape-html": "~1.0.3",
|
||||
"etag": "~1.8.1",
|
||||
"finalhandler": "1.2.0",
|
||||
"finalhandler": "1.3.1",
|
||||
"fresh": "0.5.2",
|
||||
"http-errors": "2.0.0",
|
||||
"merge-descriptors": "1.0.1",
|
||||
"merge-descriptors": "1.0.3",
|
||||
"methods": "~1.1.2",
|
||||
"on-finished": "2.4.1",
|
||||
"parseurl": "~1.3.3",
|
||||
"path-to-regexp": "0.1.7",
|
||||
"path-to-regexp": "0.1.12",
|
||||
"proxy-addr": "~2.0.7",
|
||||
"qs": "6.11.0",
|
||||
"qs": "6.13.0",
|
||||
"range-parser": "~1.2.1",
|
||||
"safe-buffer": "5.2.1",
|
||||
"send": "0.18.0",
|
||||
"serve-static": "1.15.0",
|
||||
"send": "0.19.0",
|
||||
"serve-static": "1.16.2",
|
||||
"setprototypeof": "1.2.0",
|
||||
"statuses": "2.0.1",
|
||||
"type-is": "~1.6.18",
|
||||
@ -2435,6 +2444,10 @@
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.10.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/express"
|
||||
}
|
||||
},
|
||||
"node_modules/express/node_modules/debug": {
|
||||
@ -2556,9 +2569,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/fill-range": {
|
||||
"version": "7.0.1",
|
||||
"version": "7.1.1",
|
||||
"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
|
||||
"integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"to-regex-range": "^5.0.1"
|
||||
},
|
||||
@ -2567,12 +2581,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/finalhandler": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz",
|
||||
"integrity": "sha512-5uXcUVftlQMFnWC9qu/svkWv3GTd2PfUhK/3PLkYNAe7FbqJMt3515HaxE6eRL74GdsriiwujiawdaB1BpEISg==",
|
||||
"version": "1.3.1",
|
||||
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz",
|
||||
"integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==",
|
||||
"dependencies": {
|
||||
"debug": "2.6.9",
|
||||
"encodeurl": "~1.0.2",
|
||||
"encodeurl": "~2.0.0",
|
||||
"escape-html": "~1.0.3",
|
||||
"on-finished": "2.4.1",
|
||||
"parseurl": "~1.3.3",
|
||||
@ -2736,6 +2750,7 @@
|
||||
},
|
||||
"node_modules/fs.realpath": {
|
||||
"version": "1.0.0",
|
||||
"dev": true,
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/function-bind": {
|
||||
@ -2885,6 +2900,7 @@
|
||||
},
|
||||
"node_modules/glob": {
|
||||
"version": "7.2.3",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"fs.realpath": "^1.0.0",
|
||||
@ -3234,6 +3250,7 @@
|
||||
},
|
||||
"node_modules/inflight": {
|
||||
"version": "1.0.6",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"once": "^1.3.0",
|
||||
@ -3343,8 +3360,9 @@
|
||||
},
|
||||
"node_modules/is-number": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
|
||||
"integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.12.0"
|
||||
}
|
||||
@ -3713,9 +3731,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/merge-descriptors": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
|
||||
"integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w=="
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz",
|
||||
"integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==",
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/merge2": {
|
||||
"version": "1.4.1",
|
||||
@ -3793,6 +3814,7 @@
|
||||
},
|
||||
"node_modules/minimatch": {
|
||||
"version": "3.1.2",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"brace-expansion": "^1.1.7"
|
||||
@ -4159,9 +4181,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/object-inspect": {
|
||||
"version": "1.13.1",
|
||||
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz",
|
||||
"integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==",
|
||||
"version": "1.13.2",
|
||||
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz",
|
||||
"integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
@ -4357,6 +4382,7 @@
|
||||
},
|
||||
"node_modules/path-is-absolute": {
|
||||
"version": "1.0.1",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
@ -4399,9 +4425,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/path-to-regexp": {
|
||||
"version": "0.1.7",
|
||||
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
|
||||
"integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ=="
|
||||
"version": "0.1.12",
|
||||
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
|
||||
"integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ=="
|
||||
},
|
||||
"node_modules/path-type": {
|
||||
"version": "2.0.0",
|
||||
@ -4571,11 +4597,11 @@
|
||||
}
|
||||
},
|
||||
"node_modules/qs": {
|
||||
"version": "6.11.0",
|
||||
"resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz",
|
||||
"integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==",
|
||||
"version": "6.13.0",
|
||||
"resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
|
||||
"integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==",
|
||||
"dependencies": {
|
||||
"side-channel": "^1.0.4"
|
||||
"side-channel": "^1.0.6"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.6"
|
||||
@ -4807,6 +4833,7 @@
|
||||
},
|
||||
"node_modules/rimraf": {
|
||||
"version": "3.0.2",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"glob": "^7.1.3"
|
||||
@ -4874,16 +4901,27 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/selenium-webdriver": {
|
||||
"version": "4.16.0",
|
||||
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.16.0.tgz",
|
||||
"integrity": "sha512-IbqpRpfGE7JDGgXHJeWuCqT/tUqnLvZ14csSwt+S8o4nJo3RtQoE9VR4jB47tP/A8ArkYsh/THuMY6kyRP6kuA==",
|
||||
"version": "4.27.0",
|
||||
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.27.0.tgz",
|
||||
"integrity": "sha512-LkTJrNz5socxpPnWPODQ2bQ65eYx9JK+DQMYNihpTjMCqHwgWGYQnQTCAAche2W3ZP87alA+1zYPvgS8tHNzMQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/SeleniumHQ"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/selenium"
|
||||
}
|
||||
],
|
||||
"dependencies": {
|
||||
"@bazel/runfiles": "^6.3.1",
|
||||
"jszip": "^3.10.1",
|
||||
"tmp": "^0.2.1",
|
||||
"ws": ">=8.14.2"
|
||||
"tmp": "^0.2.3",
|
||||
"ws": "^8.18.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14.20.0"
|
||||
"node": ">= 14.21.0"
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
@ -4916,9 +4954,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/send": {
|
||||
"version": "0.18.0",
|
||||
"resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz",
|
||||
"integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==",
|
||||
"version": "0.19.0",
|
||||
"resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz",
|
||||
"integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==",
|
||||
"dependencies": {
|
||||
"debug": "2.6.9",
|
||||
"depd": "2.0.0",
|
||||
@ -4951,6 +4989,14 @@
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
|
||||
"integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
|
||||
},
|
||||
"node_modules/send/node_modules/encodeurl": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
|
||||
"integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
|
||||
"engines": {
|
||||
"node": ">= 0.8"
|
||||
}
|
||||
},
|
||||
"node_modules/send/node_modules/ms": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||
@ -4984,14 +5030,14 @@
|
||||
}
|
||||
},
|
||||
"node_modules/serve-static": {
|
||||
"version": "1.15.0",
|
||||
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz",
|
||||
"integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==",
|
||||
"version": "1.16.2",
|
||||
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz",
|
||||
"integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==",
|
||||
"dependencies": {
|
||||
"encodeurl": "~1.0.2",
|
||||
"encodeurl": "~2.0.0",
|
||||
"escape-html": "~1.0.3",
|
||||
"parseurl": "~1.3.3",
|
||||
"send": "0.18.0"
|
||||
"send": "0.19.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 0.8.0"
|
||||
@ -5414,13 +5460,11 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tmp": {
|
||||
"version": "0.2.1",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"rimraf": "^3.0.0"
|
||||
},
|
||||
"version": "0.2.3",
|
||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz",
|
||||
"integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==",
|
||||
"engines": {
|
||||
"node": ">=8.17.0"
|
||||
"node": ">=14.14"
|
||||
}
|
||||
},
|
||||
"node_modules/tmp-promise": {
|
||||
@ -5434,8 +5478,9 @@
|
||||
},
|
||||
"node_modules/to-regex-range": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
||||
"integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"is-number": "^7.0.0"
|
||||
},
|
||||
@ -5693,9 +5738,9 @@
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.14.2",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.14.2.tgz",
|
||||
"integrity": "sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==",
|
||||
"version": "8.18.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
|
||||
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "easy-spider",
|
||||
"productName": "EasySpider",
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"icon": "./favicon",
|
||||
"description": "NoCode Visual Web Crawler",
|
||||
"main": "main.js",
|
||||
@ -33,14 +33,14 @@
|
||||
"dependencies": {
|
||||
"cors": "^2.8.5",
|
||||
"electron-squirrel-startup": "^1.0.0",
|
||||
"express": "^4.19.2",
|
||||
"express": "^4.21.2",
|
||||
"formidable": "^3.5.0",
|
||||
"http": "^0.0.1-security",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"node-abi": "^3.52.0",
|
||||
"node-window-manager": "^2.2.4",
|
||||
"selenium-webdriver": "^4.16.0",
|
||||
"ws": "^8.12.0",
|
||||
"selenium-webdriver": "^4.27.0",
|
||||
"ws": "^8.18.0",
|
||||
"xlsx": "^0.18.5"
|
||||
},
|
||||
"config": {
|
||||
@ -67,7 +67,7 @@
|
||||
],
|
||||
"packagerConfig": {
|
||||
"icon": "./favicon",
|
||||
"appVersion": "0.6.2",
|
||||
"appVersion": "0.6.3",
|
||||
"name": "EasySpider",
|
||||
"executableName": "EasySpider",
|
||||
"appCopyright": "Naibo Wang (naibowang@foxmail.com)",
|
||||
|
@ -20,9 +20,10 @@ rm out/EasySpider/resources/app/vs_BuildTools.exe
|
||||
mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
|
||||
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
mkdir ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
# cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
# cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
# cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/*.py ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/Readme.md ../.temp_to_pub/EasySpider_Linux_x64/Code
|
||||
cp ../ExecuteStage/myCode.py ../.temp_to_pub/EasySpider_Linux_x64
|
||||
|
@ -20,9 +20,10 @@ rm -r ../.temp_to_pub/EasySpider_MacOS/EasySpider.app/Contents/Resources/app/use
|
||||
rm -r ../.temp_to_pub/EasySpider_MacOS/EasySpider.app/Contents/Resources/app/TempUserDataFolder
|
||||
rm -rf ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
mkdir ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
# cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
# cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
# cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/*.py ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/Readme.md ../.temp_to_pub/EasySpider_MacOS/Code
|
||||
cp ../ExecuteStage/myCode.py ../.temp_to_pub/EasySpider_MacOS
|
||||
|
@ -66,6 +66,7 @@ if (!fs.existsSync(path.join(getDir(), "config.json"))) {
|
||||
webserver_port: 8074,
|
||||
user_data_folder: "./user_data",
|
||||
debug: false,
|
||||
lang: "-",
|
||||
copyright: 0,
|
||||
sys_arch: require("os").arch(),
|
||||
mysql_config_path: "./mysql_config.json",
|
||||
@ -121,6 +122,12 @@ exports.start = function (port = 8074) {
|
||||
res.setHeader("Access-Control-Allow-Origin", "*"); // 设置可访问的源
|
||||
// 解析参数
|
||||
const pathName = url.parse(req.url).pathname;
|
||||
const safeBase = path.join(__dirname, "src");
|
||||
|
||||
const safeJoin = (base, target) => {
|
||||
const targetPath = "." + path.posix.normalize("/" + target);
|
||||
return path.join(base, targetPath);
|
||||
};
|
||||
if (pathName == "/excelUpload" && req.method.toLowerCase() === "post") {
|
||||
// // parse a file upload
|
||||
// let form = new formidable.IncomingForm();
|
||||
@ -160,8 +167,16 @@ exports.start = function (port = 8074) {
|
||||
else {
|
||||
//如果有后缀名, 则为前端请求
|
||||
// console.log(path.join(__dirname,"src/taskGrid", pathName));
|
||||
const filePath = safeJoin(safeBase, pathName);
|
||||
|
||||
if (!filePath.startsWith(safeBase)) {
|
||||
res.writeHead(400, { "Content-Type": 'text/html;charset="utf-8"' });
|
||||
res.end("Invalid path");
|
||||
return;
|
||||
}
|
||||
|
||||
fs.readFile(
|
||||
path.join(__dirname, "src", pathName),
|
||||
filePath,
|
||||
async (err, data) => {
|
||||
if (err) {
|
||||
res.writeHead(404, {
|
||||
@ -200,7 +215,7 @@ exports.start = function (port = 8074) {
|
||||
let item = {
|
||||
id: task.id,
|
||||
name: task.name,
|
||||
url: task.url,
|
||||
url: task.links.split("\n")[0],
|
||||
mtime: stat.mtime,
|
||||
links: task.links,
|
||||
desc: task.desc,
|
||||
@ -445,6 +460,10 @@ exports.start = function (port = 8074) {
|
||||
"utf8"
|
||||
);
|
||||
config_file = JSON.parse(config_file);
|
||||
let lang = config_file["lang"];
|
||||
if(lang == undefined){
|
||||
lang = "-";
|
||||
}
|
||||
res.write(JSON.stringify(config_file));
|
||||
res.end();
|
||||
} else if (pathName == "/setUserDataFolder") {
|
||||
|
@ -32,7 +32,7 @@
|
||||
<body>
|
||||
<div id="app">
|
||||
|
||||
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-if="init">
|
||||
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-if="lang=='-'">
|
||||
<h5 style="margin-top: 20px">选择语言/Select Language</h5>
|
||||
|
||||
<p><a @click="changeLang('zh')" class="btn btn-outline-primary btn-lg"
|
||||
@ -40,9 +40,6 @@
|
||||
|
||||
<p><a @click="changeLang('en')" class="btn btn-outline-primary btn-lg"
|
||||
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;">English</a></p>
|
||||
<p style="font-size: 17px">当前版本/Current Version: <b>v0.6.2</b></p>
|
||||
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
|
||||
target="_blank">Github</a>最新版本/Newest Version:<b>{{newest_version}}</b></p>
|
||||
<!-- <p>如发现新版本更新,可从以下Github仓库下载最新版本使用/If a new version is found, you can download the latest version from the following Github repository:</p>-->
|
||||
<!-- <p></p>-->
|
||||
<div class="img-container">
|
||||
@ -64,8 +61,8 @@
|
||||
<textarea class="form-control"
|
||||
style="margin:0 auto;width:90%; color:black; height: 450px; min-height: 200px; background: white"
|
||||
readonly>
|
||||
This software is intended for educational and communication purposes only. It is strictly prohibited to use the software for any illegal activities or operations, such as crawling government/military websites that are not allowed to be crawled. The user bears all consequences resulting from the use of this software and the author shall not be held responsible or liable in any way. Furthermore, the software is protected by patent rights. If you intend to use it for commercial purposes or profit-making activities, such as using the software for client orders, selling the collected data, please contact author: naibowang@foxmail.com for patent authorization and payment operations: https://www.patentguru.com/cn/search?q=一种自定义提取流程的服务封装系统
|
||||
For individual users, EasySpider is a completely free and ad-free open-source software. The development and maintenance of the software rely solely on the author's voluntary efforts. Therefore, you can choose to support the author, allowing them to have more enthusiasm and energy to maintain this software. Alternatively, if you have profited from using this software, you are welcome to support the author through the following methods:
|
||||
This software is intended for educational and communication purposes only. It is strictly prohibited to use the software for any illegal activities or operations, such as crawling government/military websites that are not allowed to be crawled. The user bears all consequences resulting from the use of this software and the author shall not be held responsible or liable in any way.
|
||||
EasySpider is a completely free and ad-free open-source software. The development and maintenance of the software rely solely on the author's voluntary efforts. Therefore, you can choose to support the author, allowing them to have more enthusiasm and energy to maintain this software. Alternatively, if you have profited from using this software, you are welcome to support the author through the following methods:
|
||||
|
||||
1. PayPal account: naibowang, or scan the QR code provided in the software package.
|
||||
2. Alipay account: naibowang@foxmail.com, or scan the QR code provided in the software package.
|
||||
@ -92,6 +89,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
|
||||
<a href="https://www.easyspider.cn/index_english.html" target="_blank"
|
||||
style="text-align: center; font-size: 18px">Browse official website to watch tutorials</a>
|
||||
</p>
|
||||
<p style="font-size: 17px">Current Version: <b>v0.6.3</b></p>
|
||||
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
|
||||
target="_blank">Newest</a> Version: <b>{{newest_version}}</b></p>
|
||||
<div class="img-container">
|
||||
<!-- <h5>Producer</h5>-->
|
||||
<a href="https://www.zju.edu.cn" alt="Zhejiang University" target="_blank"><img
|
||||
@ -164,9 +164,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
|
||||
<textarea class="form-control"
|
||||
style="margin:0 auto;width:90%; color:black; height: 480px; min-height: 200px; background: white"
|
||||
readonly>
|
||||
本软件仅供学习交流使用,严禁使用软件进行任何违法违规的操作,如爬取不允许爬取的政府/军事机关网站等。使用本软件所造成的一切后果由使用者自负,与作者本人无关,作者不会承担任何责任。同时,软件受到专利权保护,如要用于商业用途,如使用软件进行盈利接单,用于公司业务,或出售采集到的数据等,请邮件联系作者:naibowang@foxmail.com进行专利授权等付费操作:https://www.patentguru.com/cn/search?q=一种自定义提取流程的服务封装系统
|
||||
本软件仅供学习交流使用,严禁使用软件进行任何违法违规的操作,如爬取不允许爬取的政府/军事机关网站等。使用本软件所造成的一切后果由使用者自负,与作者本人无关,作者不会承担任何责任。
|
||||
|
||||
对于个人使用者来说,易采集EasySpider是一款完全免费无广告的开源软件,软件开发和维护全靠作者用爱发电,因此您可以选择支持作者让作者有更多的热情和精力维护此软件,或者您使用了此软件进行了盈利,欢迎您通过下面的方式支持作者:
|
||||
易采集EasySpider是一款完全免费无广告的开源软件,软件开发和维护全靠作者用爱发电,因此您可以选择支持作者让作者有更多的热情和精力维护此软件,或者您使用了此软件进行了盈利,欢迎您通过下面的方式支持作者:
|
||||
|
||||
1、支付宝账号:naibowang@foxmail.com,也可以扫描软件包中带的二维码。
|
||||
2、微信收款:扫描软件包中带的二维码。
|
||||
@ -191,6 +191,9 @@ For individual users, EasySpider is a completely free and ad-free open-source so
|
||||
<a href="https://www.easyspider.cn?lang=zh" target="_blank"
|
||||
style="text-align: center; font-size: 18px">点此访问官网查看文档/视频教程</a>
|
||||
</p>
|
||||
<p style="font-size: 17px">软件当前版本:<b>v0.6.3</b></p>
|
||||
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases"
|
||||
target="_blank">官网</a>最新版本:<b>{{newest_version}}</b></p>
|
||||
<div class="img-container">
|
||||
<!-- <h5>出品方</h5>-->
|
||||
<a href="https://www.zju.edu.cn" alt="浙江大学" target="_blank"><img src="img/zju.png"></a>
|
||||
|
@ -22,7 +22,7 @@ let app = Vue.createApp({
|
||||
data() {
|
||||
return {
|
||||
init: true,
|
||||
lang: 'zh',
|
||||
lang: '-',
|
||||
user_data_folder: getUrlParam("user_data_folder"),
|
||||
copyright: 0,
|
||||
step: 0,
|
||||
@ -34,6 +34,10 @@ let app = Vue.createApp({
|
||||
if(this.copyright == 0){
|
||||
this.step = -1;
|
||||
}
|
||||
this.lang = getUrlParam("lang");
|
||||
if (this.lang == 'undefined' || this.lang == '') {
|
||||
this.lang = '-';
|
||||
}
|
||||
// 发送GET请求获取GitHub的Release API响应
|
||||
const request = new XMLHttpRequest();
|
||||
request.open('GET', `https://api.github.com/repos/NaiboWang/EasySpider/releases/latest`);
|
||||
@ -52,8 +56,9 @@ let app = Vue.createApp({
|
||||
},
|
||||
methods: {
|
||||
changeLang(lang = 'zh') {
|
||||
this.init = false;
|
||||
// this.init = false;
|
||||
this.lang = lang;
|
||||
window.electronAPI.changeLang(lang);
|
||||
},
|
||||
acceptAgreement() {
|
||||
this.step = 0;
|
||||
|
@ -11,4 +11,5 @@ contextBridge.exposeInMainWorld('electronAPI', {
|
||||
startDesign: (lang="en", user_data_folder = '', mobile=false) => ipcRenderer.send('start-design', lang, user_data_folder, mobile),
|
||||
startInvoke: (lang="en") => ipcRenderer.send('start-invoke', lang),
|
||||
acceptAgreement: () => ipcRenderer.send('accept-agreement'),
|
||||
changeLang: (lang="en") => ipcRenderer.send('change-lang', lang)
|
||||
})
|
@ -580,7 +580,7 @@ Please note that this feature does not support assigning values to variables. In
|
||||
Loop based on the expression value of Python code. Here are some examples:
|
||||
1. Return relevant values of the current browser object. Use `self.browser` to refer to the current browser being operated. You can directly use Selenium's API to perform operations, such as `self.browser.find_element(By.CSS_SELECTOR, "body").text=="123"`, which checks whether the current page contains the text "123".
|
||||
2. Return the value of a custom global variable: `self.myVar`
|
||||
3. Return the result of a conditional statement: `self.myVar == 1`
|
||||
3. Return the result of a conditional statement: `self.myVar > 1`
|
||||
4. Determining whether the value extracted from a certain field is equal to the value of a certain variable: self.outputParameters["field name"] == self.myVar
|
||||
If the expression returns a value greater than 0 or evaluates to True, the loop continues; otherwise, it stops.
|
||||
</pre>
|
||||
|
@ -579,8 +579,8 @@ print(emotlib.emoji()) # 使用其中的函数。
|
||||
<pre class="form-control" style="background: white; margin-top: 20px; min-height: 220px; font-size: 15px!important; word-wrap: break-word; white-space: pre-wrap; border-radius: 0; border: 1px solid" disabled v-if='parseInt(loopType) == 7'>请先阅读此说明,再在上方输入框(不是本框)写具体代码,如果要执行大量代码,可以直接写outside:myCode.py,这样程序就会读取并执行EasySpider目录下的myCode.py中的代码。
|
||||
根据Python代码的表达式值来决定是否循环,示例:
|
||||
1. 返回当前浏览器对象的相关值,用self.browser表示当前操作的浏览器,可直接用selenium的API进行操作,如self.browser.find_element(By.CSS_SELECTOR, "body").text=="123",表示判断当前页面是否为123这个文本。
|
||||
2. 返回自定义全局变量的值:self.myVar,如果
|
||||
3. 返回条件判断的值:self.myVar == 1
|
||||
2. 返回自定义全局变量的值:self.myVar
|
||||
3. 返回条件判断的值:self.myVar > 1
|
||||
4. 判断某个字段提取的值是否等于某个变量的值:self.outputParameters["字段名"] == self.myVar
|
||||
以上表达式返回值大于0或为真则继续循环,否则停止循环。
|
||||
</pre>
|
||||
|
@ -91,7 +91,7 @@
|
||||
value="about:blank"></input>
|
||||
<label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction"
|
||||
target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
|
||||
<label v-if="OS=='darwin'">{{`对于MacOS系统,EasySpider提供了两个不同的执行程序,分别为easyspider_executestage和easyspider_executestage_full,前者执行时加载速度较快,并提供了除OCR识别和数据去重以外的全部功能;后者则提供了包括OCR识别和数据去重在内的全部功能,但运行时加载速度较慢,需要等待2-10分钟才能执行程序,请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
|
||||
<label v-if="OS=='MacOS'">{{`对于MacOS系统,EasySpider提供了两个不同的执行程序,分别为easyspider_executestage和easyspider_executestage_full,前者执行时加载速度较快,并提供了除OCR识别和数据去重以外的全部功能;后者则提供了包括OCR识别和数据去重在内的全部功能,但运行时加载速度较慢,需要等待2-10分钟才能执行程序,请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
|
||||
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider can quit when executing command for ease of timed execution, and you can set --read_type to "remote" for remote execution):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时可以退出EasySpider以方便定时执行,如需要远程调用则需要将--read_type设置为remote并设置远程地址):` | lang }}</label>
|
||||
<textarea class="form-control" style="height:150px">cd {{easyspider_location}}
|
||||
{{command}} --config_folder "{{config_folder}}" --headless 0 --read_type local --config_file_name config.json --saved_file_name </textarea>
|
||||
@ -348,7 +348,7 @@
|
||||
config_folder: "",
|
||||
easyspider_location: "",
|
||||
mysql_config_path: "",
|
||||
OS: "win32",
|
||||
OS: "Windows",
|
||||
}, mounted() {
|
||||
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
|
||||
app.$data.user_data_folder = result.user_data_folder;
|
||||
@ -412,7 +412,7 @@
|
||||
form_data.append('file', $('#excelFile').prop('files')[0]);
|
||||
// console.log(app.$data.backEndAddressServiceWrapper + "/excelUpload",)
|
||||
$.ajax({
|
||||
url: app.$data.backEndAddressServiceWrapper.replace("8074", "8075") + "/excelUpload",
|
||||
url: "http://localhost:8075/excelUpload",
|
||||
type: 'POST',
|
||||
data: form_data,
|
||||
processData: false,
|
||||
@ -559,12 +559,14 @@
|
||||
};
|
||||
app.$data.ID = result;
|
||||
ws.send(JSON.stringify(message));
|
||||
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
|
||||
if (OSInfo.version == 'darwin') {
|
||||
changeCommand();
|
||||
$('#myModal').modal('show');
|
||||
}
|
||||
});
|
||||
// 使用函数并打印结果
|
||||
const systemInfo = detectOperatingSystemAndArch();
|
||||
// $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
|
||||
if (systemInfo.OS == 'MacOS') {
|
||||
changeCommand();
|
||||
$('#myModal').modal('show');
|
||||
}
|
||||
// });
|
||||
});
|
||||
// }
|
||||
},
|
||||
@ -574,15 +576,17 @@
|
||||
});
|
||||
|
||||
function changeCommand() {
|
||||
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
|
||||
app.$data.OS = OSInfo.version;
|
||||
if (OSInfo.version == 'win32' && OSInfo.bit == 'x64') {
|
||||
// $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
|
||||
// app.$data.OS = systemInfo.OS;
|
||||
const systemInfo = detectOperatingSystemAndArch();
|
||||
app.$data.OS = systemInfo.OS;
|
||||
if (systemInfo.OS == 'Windows' && systemInfo.architecture == 'x64') {
|
||||
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
|
||||
} else if (OSInfo.version == 'win32' && OSInfo.bit == 'ia32') {
|
||||
} else if (systemInfo.OS == 'Windows' && systemInfo.architecture == 'ia32') {
|
||||
app.$data.command = "./EasySpider/resources/app/chrome_win32/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
|
||||
} else if (OSInfo.version == 'linux') {
|
||||
} else if (systemInfo.OS == 'Linux') {
|
||||
app.$data.command = "./EasySpider/resources/app/chrome_linux64/easyspider_executestage --ids '[" + app.$data.ID.toString() + "]' --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
|
||||
} else if (OSInfo.version == 'darwin') {
|
||||
} else if (systemInfo.OS == 'MacOS') {
|
||||
if (getUrlParam("lang") == "zh") {
|
||||
app.$data.easyspider_location = "你的EasySpider文件夹,如:cd /Users/" + app.$data.config_folder.split("/")[2] + "/Downloads/EasySpider_MacOS";
|
||||
} else {
|
||||
@ -590,7 +594,7 @@
|
||||
}
|
||||
app.$data.command = "./easyspider_executestage --ids '[" + app.$data.ID.toString() + "]' --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
|
||||
}
|
||||
});
|
||||
// });
|
||||
}
|
||||
|
||||
$.get(app.$data.backEndAddressServiceWrapper + "/queryTask?id=" + sId, function (result) {
|
||||
|
@ -127,3 +127,27 @@ document.onkeydown = function (e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
 * Detect the host operating system and CPU architecture from the global
 * `navigator` object.
 *
 * The operating system is derived from `navigator.platform`. The architecture
 * is reported as 64-bit when `navigator.userAgent` carries a "wow64"/"win64"
 * marker or `navigator.platform` carries an "x86_64"/"amd64" marker.
 *
 * @returns {{OS: string, architecture: string}} `OS` is 'Windows', 'MacOS',
 *   'Linux' or 'Unknown'; `architecture` is 'x64' when a 64-bit marker is
 *   found and 'ia32' otherwise.
 */
function detectOperatingSystemAndArch() {
    // Tolerate a missing navigator or missing/empty properties so the checks
    // below never throw on `.toLowerCase()`.
    const nav = typeof navigator !== 'undefined' && navigator ? navigator : {};
    const platform = String(nav.platform || '').toLowerCase();
    const userAgent = String(nav.userAgent || '').toLowerCase();

    // Operating system. macOS is tested first because the identifier "darwin"
    // contains the substring "win" and would otherwise be reported as Windows.
    let OS = 'Unknown';
    if (platform.includes('mac') || platform.includes('darwin')) {
        OS = 'MacOS';
    } else if (platform.includes('win')) {
        OS = 'Windows';
    } else if (platform.includes('linux')) {
        OS = 'Linux';
    }

    // Architecture: anything without an explicit 64-bit marker is treated as
    // 32-bit, matching the values the callers compare against ('x64'/'ia32').
    const is64Bit = userAgent.includes('wow64')
        || userAgent.includes('win64')
        || platform.includes('x86_64')
        || platform.includes('amd64');
    const architecture = is64Bit ? 'x64' : 'ia32';

    return { OS, architecture };
}
|
||||
|
@ -491,7 +491,7 @@ if (mobile == "true") {
|
||||
}
|
||||
|
||||
let serviceInfo = {
|
||||
"version": "0.6.2"
|
||||
"version": "0.6.3"
|
||||
};
|
||||
|
||||
function saveService(type) {
|
||||
@ -625,7 +625,7 @@ function saveService(type) {
|
||||
"links": links,
|
||||
"create_time": $("#create_time").val(),
|
||||
"update_time": formatDateTime(new Date()),
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"saveThreshold": saveThreshold,
|
||||
// "cloudflare": cloudflare,
|
||||
"quitWaitTime": parseInt($("#quitWaitTime").val()),
|
||||
|
3
ElectronJS/start_server.js
Normal file
3
ElectronJS/start_server.js
Normal file
@ -0,0 +1,3 @@
|
||||
// Stand-alone launcher: loads server.js from this script's directory and
// starts it on port 8074.
const { join } = require("path");

const serverModulePath = join(__dirname, "server.js");
const taskServer = require(serverModulePath);

taskServer.start(8074); // start local server
|
2
ElectronJS/stealth.min.js
vendored
2
ElectronJS/stealth.min.js
vendored
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/321.json
Normal file
1
ElectronJS/tasks/321.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":321,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-04-22 07:02:02","update_time":"2024-04-22 07:02:16","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[2]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]/span[2]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":
"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}
|
1
ElectronJS/tasks/322.json
Normal file
1
ElectronJS/tasks/322.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":322,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2024-04-22 08:13:15","update_time":"2024-04-22 08:13:33","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环点击每个元素","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code
":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":""}},{"id":3,"index":3,"parentId":2,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":true,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":1,"maxWaitTime":10,"params":[],"alertHandleType":0,"downloadWaitTime":3600,"allXPaths":""}}]}
|
1
ElectronJS/tasks/323.json
Normal file
1
ElectronJS/tasks/323.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":323,"name":"新web采集任务","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"","update_time":"2024-08-10 17:29:04","version":"0.6.2","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}
|
1
ElectronJS/tasks/324.json
Normal file
1
ElectronJS/tasks/324.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/325.json
Normal file
1
ElectronJS/tasks/325.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":325,"name":"百度一下,你就知道","url":"https://www.baidu.com","links":"https://www.baidu.com","create_time":"2024-12-30 22:37:29","update_time":"2024-12-30 22:37:43","version":"0.6.3","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"csv","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.baidu.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.baidu.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.baidu.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"0暖心2024 总书记的贴心话"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.baidu.com","links":"https://www.baidu.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":fa
lse,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li/a[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[1]/div[1]/div[5]/div[1]/div[1]/div[3]/ul[1]/li[1]/a[1]","//a[contains(., '0暖心2024 总')]","//a[@class='title-content c-link c-font-medium c-line-clamp1']","/html/body/div[last()-4]/div[last()-3]/div[last()-3]/div/div/div/ul/li[last()-9]/a"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"0暖心2024 
总书记的贴心话"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://www.baidu.com/s?wd=%E6%9A%96%E5%BF%832024+%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E8%B4%B4%E5%BF%83%E8%AF%9D&sa=fyb_n_homepage&rsv_dl=fyb_n_homepage&from=super&cl=3&tn=baidutop10&fr=top1000&rsv_idx=2&hisfilter=1"}],"unique_index":"8rtq2is658sm5b58osr","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
1
ElectronJS/tasks/326.json
Normal file
1
ElectronJS/tasks/326.json
Normal file
File diff suppressed because one or more lines are too long
@ -48,7 +48,7 @@ def copy_folder(source_folder, destination_folder):
|
||||
|
||||
|
||||
def get_chrome_version():
|
||||
version = "120"
|
||||
version = "131"
|
||||
if sys.platform == "win32":
|
||||
version_re = re.compile(r"^[1-9]\d*\.\d*.\d*")
|
||||
try:
|
||||
@ -90,6 +90,8 @@ old_driver_version = {
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.system("npm install -g extract-stealth-evasions") # 安装stealth.min.js
|
||||
os.system("npx extract-stealth-evasions") # 提取stealth.min.js
|
||||
driver_downloads = []
|
||||
response = requests.get(chrome_driver_url)
|
||||
if response.status_code == 200:
|
||||
@ -150,7 +152,7 @@ if __name__ == "__main__":
|
||||
for folder in os.listdir("./chrome_win64"):
|
||||
if folder[0].isdigit() and os.path.isdir("./chrome_win64/"+folder):
|
||||
shutil.rmtree("./chrome_win64/"+folder+"/Installer") # 删除Installer文件夹
|
||||
copy_file("./execute_win64.bat", "./chrome_win64/execute.bat")
|
||||
copy_file("./execute_win64.bat", "./chrome_win64/execute_win64.bat")
|
||||
copy_file("./stealth.min.js", "./chrome_win64/stealth.min.js")
|
||||
try:
|
||||
copy_file(
|
||||
@ -177,7 +179,7 @@ if __name__ == "__main__":
|
||||
for folder in os.listdir("./chrome_win32"):
|
||||
if folder[0].isdigit() and os.path.isdir("./chrome_win32/"+folder):
|
||||
shutil.rmtree("./chrome_win32/"+folder+"/Installer") # 删除Installer文件夹
|
||||
copy_file("./execute_win32.bat", "./chrome_win32/execute.bat")
|
||||
copy_file("./execute_win32.bat", "./chrome_win32/execute_win32.bat")
|
||||
copy_file("./stealth.min.js", "./chrome_win32/stealth.min.js")
|
||||
try:
|
||||
copy_file(
|
||||
@ -201,7 +203,7 @@ if __name__ == "__main__":
|
||||
if os.path.exists("./chrome_linux64"):
|
||||
shutil.rmtree("./chrome_linux64")
|
||||
copy_folder(linux_chrome_path, "./chrome_linux64")
|
||||
copy_file("./execute_linux64.sh", "./chrome_linux64/execute.sh")
|
||||
copy_file("./execute_linux64.sh", "./chrome_linux64/execute_linux64.sh")
|
||||
copy_file("./stealth.min.js", "./chrome_linux64/stealth.min.js")
|
||||
try:
|
||||
copy_file(
|
||||
@ -216,7 +218,7 @@ if __name__ == "__main__":
|
||||
finally:
|
||||
# Change Linux file permissions
|
||||
os.chmod("./chrome_linux64/chromedriver_linux64", 0o755)
|
||||
os.chmod("./chrome_linux64/execute.sh", 0o755)
|
||||
os.chmod("./chrome_linux64/execute_linux64.sh", 0o755)
|
||||
shutil.rmtree("./chromedrivers")
|
||||
elif sys.platform == "darwin" and platform.architecture()[0] == "64bit":
|
||||
processor = get_processor_info()
|
||||
|
5
ExecuteStage/.vscode/launch.json
vendored
5
ExecuteStage/.vscode/launch.json
vendored
@ -12,8 +12,9 @@
|
||||
"justMyCode": false,
|
||||
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||
"args": ["--ids", "[83]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
||||
"--read_type", "remote"]
|
||||
"args": ["--ids", "[0]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
||||
"--read_type", "remote",
|
||||
]
|
||||
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
||||
}
|
||||
]
|
||||
|
@ -1,4 +1,8 @@
|
||||
# 环境编译说明|Environment Compilation Instruction
|
||||
## 视频教程
|
||||
|
||||
[从源代码编译程序并设计运行和调试任务指南(基于Ubuntu24.04)](https://www.bilibili.com/video/BV1VE421P7yj/)
|
||||
|
||||
# 环境编译说明 | Environment Compilation Instruction
|
||||
|
||||
EasySpider分三部分:
|
||||
|
||||
@ -18,20 +22,20 @@ EasySpider is divided into three parts:
|
||||
|
||||
This section covers the compilation instructions for the `Execution stage program`.
|
||||
|
||||
## 建议编译顺序|Suggested Compilation Order
|
||||
## 建议编译顺序 | Suggested Compilation Order
|
||||
|
||||
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
|
||||
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
|
||||
3. 编译执行阶段程序,否则无法执行程序,只能设计程序。
|
||||
3. 编译执行阶段程序,否则无法执行任务,只能设计任务。
|
||||
|
||||
-----
|
||||
|
||||
1. Compile the browser extension; otherwise, the main program will report an error when it runs, saying that `EasySpider_en.crx` cannot be found.
|
||||
2. Compile the main program. At this point the main program runs normally, but it can only design tasks, not execute them.
|
||||
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
|
||||
3. Compile the execution stage program; otherwise, tasks can only be designed, not executed.
|
||||
|
||||
|
||||
## 环境构建|Environment Setup
|
||||
## 环境构建 | Environment Setup
|
||||
|
||||
1. 安装Python 3.7及以上版本并添加至系统环境变量:[https://www.python.org/downloads/](https://www.python.org/downloads/)。
|
||||
2. 安装`pip3`并添加至系统环境变量(Windows安装python后会自带pip,Linux和MacOS安装方式请自行搜索)。
|
||||
@ -51,7 +55,7 @@ This section covers the compilation instructions for the `Execution stage progra
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
## 运行说明|Run Instruction
|
||||
## 运行说明 | Run Instruction
|
||||
|
||||
运行程序前,确保已经完成了`ElectronJS`文件夹下`主程序`的编译,保证`chrome`文件夹和`chromedriver`环境已经就绪,同时**EasySpider主程序已在运行中**。
|
||||
|
||||
@ -75,13 +79,13 @@ python3 easyspider_executestage.py --ids [1]
|
||||
|
||||
The above is an example command to run a task with the ID of `1`. For more information on command-line parameters, please refer to: [Argument Instruction](https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction) on the project's GitHub Wiki.
|
||||
|
||||
### VS Code调试|VS Code Debug
|
||||
### VS Code调试 | VS Code Debug
|
||||
|
||||
可以用VS Code打开此文件夹即可调试程序,可修改`.vscode`下的`launch.json`文件中的调试参数,调试说明参考:[https://zhuanlan.zhihu.com/p/41189402](https://zhuanlan.zhihu.com/p/41189402)。
|
||||
|
||||
You can use VS Code to open this folder and debug the program. You can modify the debugging parameters in the launch.json file located under the .vscode folder. For instructions on debugging with VSCode, you can refer to this guide: [Debugging Python with Visual Studio Code](https://code.visualstudio.com/docs/python/debugging).
|
||||
|
||||
## 打包说明|Package Instruction
|
||||
## 打包说明 | Package Instruction
|
||||
|
||||
如果想要在主程序直接点击`本地直接运行`按钮即可执行程序,则需要打包程序为可执行程序。
|
||||
|
||||
|
27
ExecuteStage/constants.py
Normal file
27
ExecuteStage/constants.py
Normal file
@ -0,0 +1,27 @@
|
||||
from enum import unique, IntEnum
|
||||
|
||||
|
||||
@unique
class WriteMode(IntEnum):
    """Integer codes naming the available write modes.

    ``@unique`` makes the class definition fail if two members share a value.
    """
    # NOTE(review): the setting these codes are compared against is not visible
    # in this file — confirm against the code that uses WriteMode.

    Create = 0  # Create mode
    Append = 1  # Append mode
    MySQL = 2  # MySQL mode
    Json = 3  # JSON mode
|
||||
|
||||
|
||||
@unique
class DataWriteMode(IntEnum):
    """Integer codes naming the data write modes (append, cover, rename).

    ``@unique`` makes the class definition fail if two members share a value.
    """
    # NOTE(review): presumably matches the task's "dataWriteMode" setting —
    # confirm against the code that reads it.

    Append = 1  # Append mode
    Cover = 2  # Cover (overwrite) mode
    Rename = 3  # Rename mode
|
||||
|
||||
|
||||
@unique
class GraphOption(IntEnum):
    """Integer codes naming the operation types of a task graph.

    ``@unique`` makes the class definition fail if two members share a value.
    The value 6 is not assigned to any member here.
    """
    # NOTE(review): presumably matches the "option" field of task graph nodes —
    # confirm against the code that reads it.

    Get = 1  # Open web page
    Click = 2  # Click
    Extract = 3  # Extract data
    Input = 4  # Input
    Custom = 5  # Custom operation
    Move = 7  # Move
    Loop = 8  # Loop
|
@ -9,6 +9,7 @@ import threading
|
||||
# import undetected_chromedriver as uc
|
||||
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||
from constants import WriteMode, DataWriteMode, GraphOption
|
||||
from myChrome import MyChrome
|
||||
from threading import Thread, Event
|
||||
from PIL import Image
|
||||
@ -31,7 +32,6 @@ from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from pynput.keyboard import Key, Listener
|
||||
from datetime import datetime
|
||||
import io # 遇到错误退出时应执行的代码
|
||||
import json
|
||||
@ -73,36 +73,24 @@ desired_capabilities["pageLoadStrategy"] = "none"
|
||||
|
||||
|
||||
class BrowserThread(Thread):
|
||||
def __init__(self, browser_t, id, service, version, event, saveName, config, option):
|
||||
def __init__(self, browser_t, id, service, version, event, saveName, config, option, commandline_config=""):
|
||||
Thread.__init__(self)
|
||||
self.logs = io.StringIO()
|
||||
try:
|
||||
self.log = bool(service["recordLog"])
|
||||
except:
|
||||
self.log = True
|
||||
self.log = bool(service.get("recordLog", True))
|
||||
self.browser = browser_t
|
||||
self.option = option
|
||||
self.config = config
|
||||
self.commandline_config = commandline_config
|
||||
self.version = version
|
||||
self.totalSteps = 0
|
||||
self.id = id
|
||||
self.event = event
|
||||
try:
|
||||
self.saveName = service["saveName"] # 保存文件的名字
|
||||
except:
|
||||
now = datetime.now()
|
||||
# 将时间格式化为精确到秒的字符串
|
||||
self.saveName = now.strftime("%Y_%m_%d_%H_%M_%S")
|
||||
now = datetime.now()
|
||||
self.saveName = service.get("saveName", now.strftime("%Y_%m_%d_%H_%M_%S")) # 保存文件的名字
|
||||
self.OUTPUT = ""
|
||||
self.SAVED = False
|
||||
self.BREAK = False
|
||||
self.CONTINUE = False
|
||||
try:
|
||||
maximizeWindow = service["maximizeWindow"]
|
||||
except:
|
||||
maximizeWindow = 0
|
||||
if maximizeWindow == 1:
|
||||
self.browser.maximize_window()
|
||||
self.browser.maximize_window() if service.get("maximizeWindow") == 1 else ...
|
||||
# 名称设定
|
||||
if saveName != "": # 命令行覆盖保存名称
|
||||
self.saveName = saveName # 保存文件的名字
|
||||
@ -120,16 +108,18 @@ class BrowserThread(Thread):
|
||||
os.mkdir(self.downloadFolder + "/files")
|
||||
if not os.path.exists(self.downloadFolder + "/images"):
|
||||
os.mkdir(self.downloadFolder + "/images")
|
||||
if not os.path.exists(self.downloadFolder + "/screenshots"):
|
||||
os.mkdir(self.downloadFolder + "/screenshots")
|
||||
self.getDataStep = 0
|
||||
self.startSteps = 0
|
||||
try:
|
||||
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
|
||||
if startFromExit == 1:
|
||||
if service.get("startFromExit", 0) == 1:
|
||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r',
|
||||
encoding='utf-8-sig') as file_obj:
|
||||
self.startSteps = int(file_obj.read()) # 读取已执行步数
|
||||
except:
|
||||
pass
|
||||
except Exception as e:
|
||||
self.print_and_log(f"读取steps.txt失败,原因:{str(e)}")
|
||||
|
||||
if self.startSteps != 0:
|
||||
self.print_and_log("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为",
|
||||
self.startSteps, "条。")
|
||||
@ -137,7 +127,7 @@ class BrowserThread(Thread):
|
||||
"will start from the last step, before we already collected", self.startSteps, " items.")
|
||||
else:
|
||||
self.print_and_log("此模式下,任务ID", self.id,
|
||||
"将从头F开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
"将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||
self.print_and_log("In this mode, task ID", self.id,
|
||||
"will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||
stealth_path = driver_path[:driver_path.find(
|
||||
@ -145,13 +135,12 @@ class BrowserThread(Thread):
|
||||
with open(stealth_path, 'r') as f:
|
||||
js = f.read()
|
||||
self.print_and_log("Loading stealth.min.js")
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js}) # TMALL 反扒
|
||||
self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"""
|
||||
})
|
||||
WebDriverWait(self.browser, 10)
|
||||
@ -164,75 +153,65 @@ class BrowserThread(Thread):
|
||||
self.monitor_thread.start()
|
||||
# self.browser.get('about:blank')
|
||||
self.procedure = service["graph"] # 程序执行流程
|
||||
try:
|
||||
self.maxViewLength = service["maxViewLength"] # 最大显示长度
|
||||
except:
|
||||
self.maxViewLength = 15
|
||||
try:
|
||||
self.outputFormat = service["outputFormat"] # 输出格式
|
||||
except:
|
||||
self.outputFormat = "csv"
|
||||
try:
|
||||
self.task_version = service["version"] # 任务版本
|
||||
if service["version"] >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
else: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
if service["version"] != version:
|
||||
self.print_and_log("版本不一致,请使用" +
|
||||
service["version"] + "版本的EasySpider运行该任务!")
|
||||
self.print_and_log("Version not match, please use EasySpider " +
|
||||
service["version"] + " to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
except: # 0.2.0版本没有version字段,所以直接退出
|
||||
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度
|
||||
self.outputFormat = service.get("outputFormat", "csv") # 输出格式
|
||||
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值
|
||||
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式,1为追加,2为覆盖,3为重命名文件
|
||||
self.task_version = service.get("version", "") # 任务版本
|
||||
|
||||
if not self.task_version:
|
||||
self.print_and_log("版本不一致,请使用v0.2.0版本的EasySpider运行该任务!")
|
||||
self.print_and_log(
|
||||
"Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.print_and_log("Version not match, please use EasySpider v0.2.0 to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
try:
|
||||
self.save_threshold = service["saveThreshold"] # 保存最低阈值
|
||||
except:
|
||||
self.save_threshold = 10
|
||||
try:
|
||||
self.links = list(
|
||||
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
|
||||
except:
|
||||
|
||||
if self.task_version >= "0.3.1": # 0.3.1及以上版本以上的EasySpider兼容从0.3.1版本开始的所有版本
|
||||
pass
|
||||
elif self.task_version != version: # 0.3.1以下版本的EasySpider不兼容0.3.1及以上版本的EasySpider
|
||||
self.print_and_log(f"版本不一致,请使用{self.task_version}版本的EasySpider运行该任务!")
|
||||
self.print_and_log(f"Version not match, please use EasySpider {self.task_version} to run this task!")
|
||||
self.browser.quit()
|
||||
sys.exit()
|
||||
|
||||
service_links = service.get("links")
|
||||
if service_links:
|
||||
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表
|
||||
else:
|
||||
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
|
||||
|
||||
self.OUTPUT = [] # 采集的数据
|
||||
try:
|
||||
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖,3为重命名文件
|
||||
except:
|
||||
self.dataWriteMode = 1
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
|
||||
if self.outputFormat in ["csv", "txt", "xlsx", "json"]:
|
||||
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
if self.dataWriteMode == 2:
|
||||
if self.dataWriteMode == DataWriteMode.Cover.value:
|
||||
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
|
||||
elif self.dataWriteMode == 3:
|
||||
elif self.dataWriteMode == DataWriteMode.Rename.value:
|
||||
i = 2
|
||||
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
|
||||
i = i + 1
|
||||
self.saveName = self.saveName + '_' + str(i)
|
||||
self.print_and_log("文件已存在,已重命名为", self.saveName)
|
||||
self.writeMode = 1 # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
|
||||
self.writeMode = WriteMode.Create.value # 写入模式,0为新建,1为追加
|
||||
if self.outputFormat in ['csv', 'txt', 'xlsx']:
|
||||
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"):
|
||||
self.OUTPUT.append([]) # 添加表头
|
||||
self.writeMode = 0
|
||||
self.writeMode = WriteMode.Create.value
|
||||
elif self.outputFormat == "json":
|
||||
self.writeMode = 3 # JSON模式无需判断是否存在文件
|
||||
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件
|
||||
elif self.outputFormat == "mysql":
|
||||
self.mysql = myMySQL(config["mysql_config_path"])
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2)
|
||||
self.writeMode = 2
|
||||
if self.writeMode == 0:
|
||||
self.mysql.create_table(self.saveName, service["outputParameters"],
|
||||
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value)
|
||||
self.writeMode = WriteMode.MySQL.value # MySQL模式
|
||||
|
||||
if self.writeMode == WriteMode.Create.value:
|
||||
self.print_and_log("新建模式|Create Mode")
|
||||
elif self.writeMode == 1:
|
||||
elif self.writeMode == WriteMode.Append.value:
|
||||
self.print_and_log("追加模式|Append Mode")
|
||||
elif self.writeMode == 2:
|
||||
elif self.writeMode == WriteMode.MySQL.value:
|
||||
self.print_and_log("MySQL模式|MySQL Mode")
|
||||
elif self.writeMode == 3:
|
||||
elif self.writeMode == WriteMode.Json.value:
|
||||
self.print_and_log("JSON模式|JSON Mode")
|
||||
|
||||
self.containJudge = service["containJudge"] # 是否含有判断语句
|
||||
self.outputParameters = {}
|
||||
self.service = service
|
||||
@ -245,191 +224,140 @@ class BrowserThread(Thread):
|
||||
if param["name"] not in self.outputParameters.keys():
|
||||
self.outputParameters[param["name"]] = ""
|
||||
self.dataNotFoundKeys[param["name"]] = False
|
||||
try:
|
||||
self.outputParametersTypes.append(param["type"])
|
||||
except:
|
||||
self.outputParametersTypes.append("text")
|
||||
try:
|
||||
self.outputParametersRecord.append(
|
||||
bool(param["recordASField"]))
|
||||
except:
|
||||
self.outputParametersRecord.append(True)
|
||||
self.outputParametersTypes.append(param.get("type", "text"))
|
||||
self.outputParametersRecord.append(bool(param.get("recordASField", True)))
|
||||
# 文件叠加的时候不添加表头
|
||||
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
|
||||
if self.writeMode == 0:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value:
|
||||
self.OUTPUT[0].append(param["name"])
|
||||
self.urlId = 0 # 全局记录变量
|
||||
self.preprocess() # 预处理,优化提取数据流程
|
||||
try:
|
||||
self.inputExcel = service["inputExcel"] # 输入Excel
|
||||
except:
|
||||
self.inputExcel = ""
|
||||
self.inputExcel = service.get("inputExcel", "") # 输入Excel
|
||||
self.readFromExcel() # 读取Excel获得参数值
|
||||
|
||||
# 检测如果没有复杂的操作,优化提取数据流程
|
||||
def preprocess(self):
|
||||
for node in self.procedure:
|
||||
try:
|
||||
iframe = node["parameters"]["iframe"]
|
||||
except:
|
||||
node["parameters"]["iframe"] = False
|
||||
for index_node, node in enumerate(self.procedure):
|
||||
parameters: dict = node["parameters"]
|
||||
iframe = parameters.get('iframe')
|
||||
option = node["option"]
|
||||
|
||||
try:
|
||||
node["parameters"]["xpath"] = lowercase_tags_in_xpath(
|
||||
node["parameters"]["xpath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["waitElementIframeIndex"] = int(
|
||||
node["parameters"]["waitElementIframeIndex"])
|
||||
except:
|
||||
node["parameters"]["waitElement"] = ""
|
||||
node["parameters"]["waitElementTime"] = 10
|
||||
node["parameters"]["waitElementIframeIndex"] = 0
|
||||
if node["option"] == 1: # 打开网页操作
|
||||
try:
|
||||
cookies = node["parameters"]["cookies"]
|
||||
except:
|
||||
node["parameters"]["cookies"] = ""
|
||||
elif node["option"] == 2: # 点击操作
|
||||
try:
|
||||
alertHandleType = node["parameters"]["alertHandleType"]
|
||||
except:
|
||||
node["parameters"]["alertHandleType"] = 0
|
||||
if node["parameters"]["useLoop"]:
|
||||
parameters["iframe"] = False if not iframe else parameters.get('iframe', False)
|
||||
if parameters.get("xpath"):
|
||||
parameters["xpath"] = lowercase_tags_in_xpath(parameters["xpath"])
|
||||
|
||||
if parameters.get("waitElementIframeIndex"):
|
||||
parameters["waitElementIframeIndex"] = int(parameters["waitElementIframeIndex"])
|
||||
else:
|
||||
parameters["waitElement"] = ""
|
||||
parameters["waitElementTime"] = 10
|
||||
parameters["waitElementIframeIndex"] = 0
|
||||
|
||||
if option == GraphOption.Get.value: # 打开网页操作
|
||||
parameters["cookies"] = parameters.get("cookies", "")
|
||||
elif option == GraphOption.Click.value: # 点击操作
|
||||
parameters["alertHandleType"] = parameters.get("alertHandleType", 0)
|
||||
if parameters.get("useLoop"):
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 3: # 提取数据操作
|
||||
node["parameters"]["recordASField"] = 0
|
||||
try:
|
||||
params = node["parameters"]["params"]
|
||||
except:
|
||||
node["parameters"]["params"] = node["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
params = node["parameters"]["params"]
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Extract.value: # 提取数据操作
|
||||
parameters["recordASField"] = 0
|
||||
parameters["params"] = parameters.get("params", parameters.get("paras")) # 兼容0.5.0及以下版本的EasySpider
|
||||
parameters["clear"] = parameters.get("clear", 0)
|
||||
parameters["newLine"] = parameters.get("newLine", 1)
|
||||
|
||||
params = parameters["params"]
|
||||
for param in params:
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
try:
|
||||
param["iframe"] = param.get("iframe", False)
|
||||
|
||||
if param.get("relativeXPath"):
|
||||
param["relativeXPath"] = lowercase_tags_in_xpath(param["relativeXPath"])
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
node["parameters"]["recordASField"] = param["recordASField"]
|
||||
except:
|
||||
node["parameters"]["recordASField"] = 1
|
||||
try:
|
||||
splitLine = int(param["splitLine"])
|
||||
except:
|
||||
param["splitLine"] = 0
|
||||
if param["contentType"] == 8:
|
||||
self.print_and_log(
|
||||
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log(
|
||||
"If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
|
||||
|
||||
parameters["recordASField"] = param.get("recordASField", 1)
|
||||
|
||||
param["splitLine"] = 0 if not param.get("splitLine") else param.get("splitLine")
|
||||
|
||||
if param.get("contentType") == 8:
|
||||
self.print_and_log("默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType =="
|
||||
"8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片"
|
||||
"保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用"
|
||||
"的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
|
||||
self.print_and_log("If you think the default ddddocr function is not good enough, you can "
|
||||
"modify the source code get_content function -> contentType == 8 position "
|
||||
"to your own OCR model and then compile and run it; or you can first set "
|
||||
"the content type of the crawler to \"Element Screenshot\" to save the "
|
||||
"picture, and then call your own program with custom operations. The "
|
||||
"function of the program is to read the latest generated picture, then use "
|
||||
"a good model, such as PaddleOCR to recognize the picture, and then return "
|
||||
"the return value as a parameter output to the program.")
|
||||
param["optimizable"] = detect_optimizable(param)
|
||||
elif node["option"] == 4: # 输入文字
|
||||
try:
|
||||
index = node["parameters"]["index"] # 索引值
|
||||
except:
|
||||
node["parameters"]["index"] = 0
|
||||
elif node["option"] == 5: # 自定义操作
|
||||
try:
|
||||
clear = node["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
newLine = node["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
elif node["option"] == 7: # 移动到元素
|
||||
if node["parameters"]["useLoop"]:
|
||||
if self.task_version <= "0.3.5":
|
||||
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
node["parameters"]["xpath"] = ""
|
||||
self.print_and_log("您的任务版本号为" + self.task_version +
|
||||
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif node["option"] == 8: # 循环操作
|
||||
try:
|
||||
exitElement = node["parameters"]["exitElement"]
|
||||
if exitElement == "":
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
except:
|
||||
node["parameters"]["exitElement"] = "//body"
|
||||
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
|
||||
try:
|
||||
skipCount = node["parameters"]["skipCount"]
|
||||
except:
|
||||
node["parameters"]["skipCount"] = 0
|
||||
elif option == GraphOption.Input.value: # 输入文字
|
||||
parameters['index'] = parameters.get('index', 0)
|
||||
elif option == GraphOption.Custom.value: # 自定义操作
|
||||
parameters['clear'] = parameters.get('clear', 0)
|
||||
parameters['newLine'] = parameters.get('newLine', 1)
|
||||
elif option == GraphOption.Move.value: # 移动到元素
|
||||
if parameters.get('useLoop'):
|
||||
if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath
|
||||
parameters["xpath"] = ""
|
||||
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath")
|
||||
elif option == GraphOption.Loop.value: # 循环操作
|
||||
parameters['exitElement'] = "//body" if not parameters.get('exitElement') or parameters.get('exitElement') == "" else parameters.get('exitElement')
|
||||
parameters["quickExtractable"] = False # 是否可以快速提取
|
||||
parameters['skipCount'] = parameters.get('skipCount', 0)
|
||||
|
||||
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
try:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["params"]
|
||||
except:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
try:
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"]
|
||||
except:
|
||||
waitElement = ""
|
||||
if node["parameters"]["iframe"]:
|
||||
node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \
|
||||
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
|
||||
params = self.procedure[node["sequence"][0]].get("parameters").get("params")
|
||||
if not params:
|
||||
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider
|
||||
|
||||
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "")
|
||||
|
||||
if parameters["iframe"]:
|
||||
parameters["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
|
||||
else:
|
||||
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
|
||||
if node["parameters"]["skipCount"] > 0:
|
||||
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
parameters["quickExtractable"] = True # 先假设可以快速提取
|
||||
|
||||
if parameters["skipCount"] > 0:
|
||||
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
|
||||
|
||||
for param in params:
|
||||
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
|
||||
try:
|
||||
iframe = param["iframe"]
|
||||
except:
|
||||
param["iframe"] = False
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
param['iframe'] = param.get('iframe', False)
|
||||
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取
|
||||
optimizable = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取
|
||||
parameters["quickExtractable"] = False
|
||||
break
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据")
|
||||
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly")
|
||||
try:
|
||||
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"]
|
||||
except:
|
||||
node["parameters"]["clear"] = 0
|
||||
try:
|
||||
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"]
|
||||
except:
|
||||
node["parameters"]["newLine"] = 1
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
|
||||
if parameters["quickExtractable"]:
|
||||
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据")
|
||||
self.print_and_log(f"Loop operation <{node['title']}> can extract data quickly")
|
||||
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0)
|
||||
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1)
|
||||
|
||||
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["xpath"]
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
|
||||
node["parameters"]["baseXPath"] = node["parameters"]["pathList"]
|
||||
node["parameters"]["quickParams"] = []
|
||||
for param in params:
|
||||
content_type = ""
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find(
|
||||
"::text()") >= 0:
|
||||
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \
|
||||
or param["relativeXPath"].find("::text()") >= 0:
|
||||
content_type = ""
|
||||
elif param["nodeType"] == 2:
|
||||
content_type = "//@href"
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
elif param["nodeType"] == 4: # 图片链接
|
||||
content_type = "//@src"
|
||||
elif param["contentType"] == 1:
|
||||
content_type = "/text()"
|
||||
elif param["contentType"] == 0:
|
||||
content_type = "//text()"
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
if param["relative"]: # 如果是相对XPath
|
||||
xpath = "." + param["relativeXPath"] + content_type
|
||||
else:
|
||||
xpath = param["relativeXPath"] + content_type
|
||||
@ -443,6 +371,7 @@ class BrowserThread(Thread):
|
||||
"nodeType": param["nodeType"],
|
||||
"default": param["default"],
|
||||
})
|
||||
self.procedure[index_node]["parameters"] = parameters
|
||||
self.print_and_log("预处理完成|Preprocess completed")
|
||||
|
||||
def readFromExcel(self):
|
||||
@ -559,7 +488,10 @@ class BrowserThread(Thread):
|
||||
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
|
||||
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
|
||||
time.sleep(quitWaitTime)
|
||||
self.browser.quit()
|
||||
try:
|
||||
self.browser.quit()
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("正在清理临时用户目录……|Cleaning up temporary user directory...")
|
||||
try:
|
||||
shutil.rmtree(self.option["tmp_user_data_folder"])
|
||||
@ -775,18 +707,20 @@ class BrowserThread(Thread):
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 2:
|
||||
self.recordLog("Execute JavaScript for element:" + code)
|
||||
self.recordLog("对元素执行JavaScript:" + code)
|
||||
self.browser.set_script_timeout(max_wait_time)
|
||||
try:
|
||||
output = self.browser.execute_script(code, element)
|
||||
except:
|
||||
except Exception as e:
|
||||
output = ""
|
||||
self.recordLog("JavaScript execution failed")
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" + code, ", error is:", str(e))
|
||||
elif int(codeMode) == 5:
|
||||
try:
|
||||
code = readCode(code)
|
||||
@ -796,9 +730,9 @@ class BrowserThread(Thread):
|
||||
self.recordLog("执行下面的代码:" + code)
|
||||
self.recordLog("Execute the following code:" + code)
|
||||
except Exception as e:
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", e)
|
||||
self.print_and_log("执行下面的代码时出错:" + code, ",错误为:", str(e))
|
||||
self.print_and_log("Error executing the following code:" +
|
||||
code, ", error is:", e)
|
||||
code, ", error is:", str(e))
|
||||
elif int(codeMode) == 6:
|
||||
try:
|
||||
code = readCode(code)
|
||||
@ -1204,7 +1138,7 @@ class BrowserThread(Thread):
|
||||
return index, element
|
||||
|
||||
# 对循环的处理
|
||||
def loopExecute(self, node, loopValue, clickPath="", index=0):
|
||||
def loopExecute(self, node, loopValue, loopPath="", index=0):
|
||||
time.sleep(0.1) # 第一次执行循环的时候强制等待1秒
|
||||
thisHandle = self.browser.current_window_handle # 记录本次循环内的标签页的ID
|
||||
try:
|
||||
@ -1216,6 +1150,14 @@ class BrowserThread(Thread):
|
||||
self.history["handle"] = thisHandle
|
||||
thisHistoryURL = self.browser.current_url
|
||||
# 快速提取处理
|
||||
# start = time.time()
|
||||
try:
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
except Exception as e:
|
||||
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
|
||||
node["parameters"]["quickExtractable"] = False
|
||||
# end = time.time()
|
||||
# print("解析页面秒数:", end - start)
|
||||
if node["parameters"]["quickExtractable"]:
|
||||
self.browser.switch_to.default_content() # 切换到主页面
|
||||
tree = html.fromstring(self.browser.page_source)
|
||||
@ -1721,8 +1663,11 @@ class BrowserThread(Thread):
|
||||
try:
|
||||
actions = ActionChains(self.browser) # 实例化一个action对象
|
||||
if newTab == 1: # 在新标签页打开
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
if sys.platform == "darwin": # Mac
|
||||
actions.key_down(Keys.COMMAND).click(element).key_up(Keys.COMMAND).perform()
|
||||
else:
|
||||
# Ctrl + Click
|
||||
actions.key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
|
||||
else:
|
||||
actions.click(element).perform()
|
||||
except Exception as e:
|
||||
@ -1925,9 +1870,17 @@ class BrowserThread(Thread):
|
||||
width = size["width"]
|
||||
height = size["height"]
|
||||
# 调整浏览器窗口的大小
|
||||
self.browser.set_window_size(width, height)
|
||||
if self.commandline_config["headless"] == 1: # 无头模式下,截取整个网页的高度
|
||||
page_width = self.browser.execute_script(
|
||||
"return document.body.scrollWidth")
|
||||
page_height = self.browser.execute_script(
|
||||
"return document.body.scrollHeight")
|
||||
self.browser.set_window_size(page_width, page_height)
|
||||
time.sleep(1)
|
||||
else:
|
||||
self.browser.set_window_size(width, height)
|
||||
element.screenshot("Data/Task_" + str(self.id) + "/" + self.saveName +
|
||||
"/" + str(time.time()) + ".png")
|
||||
"/screenshots/" + str(time.time()) + ".png")
|
||||
# 截图完成后,将浏览器的窗口大小设置为原来的大小
|
||||
self.browser.set_window_size(width, height)
|
||||
elif p["contentType"] == 8:
|
||||
@ -2238,7 +2191,7 @@ class BrowserThread(Thread):
|
||||
if __name__ == '__main__':
|
||||
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
|
||||
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
|
||||
config = {
|
||||
commandline_config = {
|
||||
"ids": [0],
|
||||
"saved_file_name": "",
|
||||
"user_data": False,
|
||||
@ -2249,9 +2202,11 @@ if __name__ == '__main__':
|
||||
"server_address": "http://localhost:8074",
|
||||
"keyboard": True, # 是否监听键盘输入
|
||||
"pause_key": "p", # 暂停键
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"docker_driver": "",
|
||||
"user_folder": "",
|
||||
}
|
||||
c = Config(config)
|
||||
c = Config(commandline_config)
|
||||
print(c)
|
||||
options = webdriver.ChromeOptions()
|
||||
driver_path = "chromedriver.exe"
|
||||
@ -2345,35 +2300,43 @@ if __name__ == '__main__':
|
||||
os.mkdir(tmp_user_folder_parent)
|
||||
characters = string.ascii_letters + string.digits
|
||||
for i in range(len(c.ids)):
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options = tmp_options[i]["options"]
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
options.add_argument("--profile-directory=Default")
|
||||
if c.user_folder == "":
|
||||
id = c.ids[i]
|
||||
# 从字符集中随机选择字符构成字符串
|
||||
random_string = ''.join(random.choice(characters) for i in range(10))
|
||||
tmp_user_data_folder = os.path.join(tmp_user_folder_parent, "user_data_" + str(id) + "_" + str(time.time()).replace(".","") + "_" + random_string)
|
||||
tmp_options[i]["tmp_user_data_folder"] = tmp_user_data_folder
|
||||
if os.path.exists(tmp_user_data_folder):
|
||||
try:
|
||||
shutil.rmtree(tmp_user_data_folder)
|
||||
except:
|
||||
pass
|
||||
print(f"Copying user data folder to: {tmp_user_data_folder}, please wait...")
|
||||
print(f"正在复制用户信息目录到: {tmp_user_data_folder},请稍等...")
|
||||
if os.path.exists(absolute_user_data_folder):
|
||||
try:
|
||||
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
|
||||
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
|
||||
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
|
||||
except:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Copy user data folder failed, use the original folder.")
|
||||
print("复制用户信息目录失败,使用原始目录。")
|
||||
else:
|
||||
tmp_user_data_folder = absolute_user_data_folder
|
||||
print("Cannot find user data folder, create a new folder.")
|
||||
print("未找到用户信息目录,创建新目录。")
|
||||
options.add_argument(
|
||||
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
|
||||
print(f"Use local user data folder: {tmp_user_data_folder}")
|
||||
print(f"使用本地用户信息目录: {tmp_user_data_folder}")
|
||||
else:
|
||||
options.add_argument(
|
||||
f'--user-data-dir={c.user_folder}')
|
||||
print(f"Use specifed user data folder: {c.user_folder}, please note if you are using docker, this user folder path should be the path inside the docker container.")
|
||||
print(f"使用指定的用户信息目录: {c.user_folder},请注意如果您正在使用docker,此用户文件夹路径应是容器内的路径。")
|
||||
print(
|
||||
"如果报错Selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally,说明有之前运行的Chrome实例没有正常关闭,请关闭之前打开的所有Chrome实例后再运行程序即可。")
|
||||
print(
|
||||
@ -2386,9 +2349,13 @@ if __name__ == '__main__':
|
||||
print("id: ", id)
|
||||
if c.read_type == "remote":
|
||||
print("remote")
|
||||
content = requests.get(
|
||||
try:
|
||||
content = requests.get(
|
||||
c.server_address + "/queryExecutionInstance?id=" + str(id))
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
except:
|
||||
print("Cannot connect to the server, please make sure that the EasySpider Main Program is running, or you can change the --read_type parameter to 'local' to read the task information from the local task file without keeping the EasySpider Main Program running.")
|
||||
print("无法连接到服务器,请确保EasySpider主程序正在运行,或者您可以将--read_type参数更改为'local',以实现从本地任务文件中读取任务信息而无需保持EasySpider主程序运行。")
|
||||
else:
|
||||
print("local")
|
||||
local_folder = os.path.join(os.getcwd(), "execution_instances")
|
||||
@ -2439,8 +2406,17 @@ if __name__ == '__main__':
|
||||
except:
|
||||
browser = "chrome"
|
||||
if browser == "chrome":
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options)
|
||||
if c.docker_driver == "":
|
||||
print("Using local driver")
|
||||
selenium_service = Service(executable_path=driver_path)
|
||||
browser_t = MyChrome(service=selenium_service, options=options, mode='local_driver')
|
||||
else:
|
||||
print("Using remote driver")
|
||||
# Use docker driver, default address is http://localhost:4444/wd/hub
|
||||
# Headless mode
|
||||
# options.add_argument("--headless")
|
||||
# print("Headless mode")
|
||||
browser_t = MyChrome(command_executor=c.docker_driver, options=options, mode='remote_driver')
|
||||
elif browser == "edge":
|
||||
from selenium.webdriver.edge.service import Service as EdgeService
|
||||
from selenium.webdriver.edge.options import Options as EdgeOptions
|
||||
@ -2472,7 +2448,7 @@ if __name__ == '__main__':
|
||||
event = Event()
|
||||
event.set()
|
||||
thread = BrowserThread(browser_t, id, service,
|
||||
c.version, event, c.saved_file_name, config=config, option=tmp_options[i])
|
||||
c.version, event, c.saved_file_name, config=config, option=tmp_options[i], commandline_config=c)
|
||||
print("Thread with task id: ", id, " is created")
|
||||
threads.append(thread)
|
||||
thread.start()
|
||||
@ -2501,6 +2477,7 @@ if __name__ == '__main__':
|
||||
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
|
||||
# 使用监听器监听键盘输入
|
||||
try:
|
||||
from pynput.keyboard import Key, Listener
|
||||
if c.keyboard:
|
||||
with Listener(on_press=on_press_creator(press_time, event),
|
||||
on_release=on_release_creator(event, press_time)) as listener:
|
||||
|
108
ExecuteStage/fl_beta.py
Normal file
108
ExecuteStage/fl_beta.py
Normal file
@ -0,0 +1,108 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torchvision import models, transforms
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import os
|
||||
|
||||
# 定义 ResNet 模型(以 ResNet18 为例)
|
||||
class ResNetModel(nn.Module):
    """ResNet-18 classifier whose final fully connected layer emits ``num_classes`` scores.

    Args:
        num_classes: Width of the replacement classification head.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.resnet18(pretrained=True)
        # Replace the stock head so the output width matches the target task.
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, num_classes)
        self.resnet = backbone

    def forward(self, x):
        """Feed ``x`` through the wrapped ResNet and return its output."""
        return self.resnet(x)
|
||||
|
||||
# 自定义数据集类
|
||||
class WebpageDataset(Dataset):
    """Dataset over the ``.png`` files stored directly inside one directory.

    The label of each sample is the integer that precedes the first
    underscore in its file name (e.g. ``3_homepage.png`` -> ``3``).

    Args:
        image_dir: Directory that holds the image files.
        transform: Optional callable applied to each loaded RGB image.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and platforms.
        self.image_files = sorted(
            f for f in os.listdir(image_dir) if f.endswith('.png')
        )

    def __len__(self):
        """Return the number of ``.png`` files found at construction time."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the file at position ``idx``."""
        file_name = self.image_files[idx]
        img_name = os.path.join(self.image_dir, file_name)
        # Image.open() is lazy and keeps the file open; convert() yields an
        # independent image, so the handle can be released right away.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')
        label = self.get_label_from_filename(file_name)
        if self.transform:
            image = self.transform(image)
        return image, label

    def get_label_from_filename(self, filename):
        """Parse the integer label that precedes the first ``_`` in ``filename``.

        Raises:
            ValueError: If the text before the first underscore is not an integer.
        """
        # File names are assumed to look like '<label>_<anything>.png'.
        return int(filename.split('_')[0])
|
||||
|
||||
# 图像预处理
|
||||
# Preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalise each channel with the listed mean / std values.
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_preprocessing_steps)
|
||||
|
||||
# 定义客户端训练函数
|
||||
def train_local_model(model, dataloader, criterion, optimizer, epochs=5):
    """Train ``model`` on one client's data and return its state dict.

    Args:
        model: Module to optimise; it is switched to training mode.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of full passes over ``dataloader``.

    Returns:
        ``model.state_dict()`` after the final optimisation step.
    """
    model.train()
    for _ in range(epochs):
        for batch_images, batch_labels in dataloader:
            # Clear gradients left over from the previous step first.
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_images), batch_labels)
            batch_loss.backward()
            optimizer.step()
    return model.state_dict()
|
||||
|
||||
# 联邦平均算法
|
||||
def federated_average(models_state_dicts):
    """Average several model state dicts key by key (FedAvg with equal weights).

    Args:
        models_state_dicts: Non-empty sequence of state dicts that share the
            same keys and tensor shapes.

    Returns:
        A new mapping holding, for every key, the element-wise mean of that
        entry over all inputs. The input dicts and their tensors are left
        untouched.

    Raises:
        ValueError: If ``models_state_dicts`` is empty.
    """
    import copy

    if not models_state_dicts:
        raise ValueError("federated_average() requires at least one state dict")

    num_models = len(models_state_dicts)
    first = models_state_dicts[0]
    # Shallow copy: a distinct mapping (so the first client's dict is not
    # rebound) that keeps any extra attributes carried by the mapping object.
    avg_state_dict = copy.copy(first)
    for key in first.keys():
        # Accumulate into a clone so the first client's tensors are not
        # modified in place.
        total = first[key].clone()
        for other in models_state_dicts[1:]:
            total += other[key]
        avg_state_dict[key] = torch.div(total, num_models)
    return avg_state_dict
|
||||
|
||||
# 模拟多个客户端的数据
|
||||
client_data_dirs = ['client1_data', 'client2_data', 'client3_data']  # one data directory per client
num_classes = 10  # set according to the actual task

# Initialise the global model.
global_model = ResNetModel(num_classes=num_classes)

# Loss function shared by every client.
criterion = nn.CrossEntropyLoss()

# The client datasets do not change between rounds, so each directory is
# scanned and wrapped in a DataLoader once instead of once per round.
client_dataloaders = [
    DataLoader(
        WebpageDataset(image_dir=client_dir, transform=transform),
        batch_size=32,
        shuffle=True,
    )
    for client_dir in client_data_dirs
]

# Federated learning loop.
num_rounds = 10
for round_idx in range(num_rounds):  # named round_idx so the builtin round() is not shadowed
    local_models = []
    for dataloader in client_dataloaders:
        # Every client starts the round from the current global weights.
        local_model = ResNetModel(num_classes=num_classes)
        local_model.load_state_dict(global_model.state_dict())

        # Fresh optimizer per client so no momentum state is carried over.
        optimizer = optim.SGD(local_model.parameters(), lr=0.01, momentum=0.9)

        # Train locally and collect the resulting weights.
        local_state_dict = train_local_model(local_model, dataloader, criterion, optimizer)
        local_models.append(local_state_dict)

    # Aggregate the client weights into the global model.
    global_state_dict = federated_average(local_models)
    global_model.load_state_dict(global_state_dict)

    print(f'Round {round_idx + 1}/{num_rounds} completed.')

# Save the global model.
torch.save(global_model.state_dict(), 'federated_resnet_model.pth')
|
@ -1,5 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
rm -r build
|
||||
rm -r dist
|
||||
# 一定要先source一下,不然会出现找不到conda命令的错误!!!
|
||||
source ~/miniconda3/etc/profile.d/conda.sh
|
||||
conda activate easyspider
|
||||
# Python一定要是3.11版本,不然会出现浏览器弹出崩溃的错误!!!原来使用的3.8,崩溃原因未知。
|
||||
pyinstaller -F --add-data "/home/naibo/miniconda3/envs/easyspider/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.cpython-311-x86_64-linux-gnu.so:onnxruntime/capi" --add-data "/home/naibo/miniconda3/envs/easyspider/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
|
||||
rm ../ElectronJS/chrome_linux64/easyspider_executestage
|
||||
|
36
ExecuteStage/llm_beta.py
Normal file
36
ExecuteStage/llm_beta.py
Normal file
@ -0,0 +1,36 @@
|
||||
from transformers import AutoProcessor, AutoModelForVision2Seq
|
||||
from PIL import Image
|
||||
import torch
|
||||
|
||||
# 加载 Llama 3.2 视觉模型和处理器
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision" # 请根据实际模型路径替换
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
model = AutoModelForVision2Seq.from_pretrained(model_name)
|
||||
|
||||
# 处理网页截图并提取结构
|
||||
def predict_structure_from_image(image_path):
    """Generate a textual description for the webpage screenshot at ``image_path``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Image.open() keeps the underlying file open; convert() returns an
    # independent image, so the handle is released as soon as it is copied.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Preprocess the image into model inputs.
    inputs = processor(images=image, return_tensors="pt")

    # NOTE(review): only ``pixel_values`` is forwarded to generate(); some
    # vision-language checkpoints also need a text prompt and the other
    # processor outputs -- confirm against the chosen model's documentation.
    outputs = model.generate(
        inputs["pixel_values"],
        max_length=512,
        num_beams=5,
        early_stopping=True
    )
    description = processor.decode(outputs[0], skip_special_tokens=True)
    return description
|
||||
|
||||
# 示例使用
|
||||
if __name__ == "__main__":
    # Example usage: replace this with the path of a real screenshot file.
    image_path = "webpage_screenshot.png"

    # Run the prediction and print the resulting description.
    print("预测的结构:", predict_structure_from_image(image_path))
|
@ -19,11 +19,16 @@ desired_capabilities["pageLoadStrategy"] = "none"
|
||||
|
||||
|
||||
|
||||
class MyChrome(webdriver.Chrome):
|
||||
class MyChrome(webdriver.Chrome, webdriver.Remote):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
def __init__(self, mode='local_driver', *args, **kwargs):
|
||||
self.iframe_env = False # 现在的环境是root还是iframe
|
||||
super().__init__(*args, **kwargs) # 调用父类的 __init__
|
||||
self.mode = mode
|
||||
if mode == "local_driver":
|
||||
webdriver.Chrome.__init__(self, *args, **kwargs)
|
||||
elif mode == "remote_driver":
|
||||
webdriver.Remote.__init__(self, *args, **kwargs)
|
||||
# super().__init__(*args, **kwargs) # 调用父类的 __init__
|
||||
|
||||
# def find_element(self, by=By.ID, value=None, iframe=False):
|
||||
# # 在这里改变查找元素的行为
|
||||
|
@ -1,14 +1,14 @@
|
||||
commandline_config==2.2.3
|
||||
requests==2.31.0
|
||||
selenium==4.16.0
|
||||
requests==2.32.3
|
||||
selenium==4.27.1
|
||||
pyinstaller==5.13.2
|
||||
Pillow==10.2.0
|
||||
xlsxwriter==3.1.9
|
||||
xlsxwriter==3.2.0
|
||||
openpyxl==3.1.2
|
||||
pymysql==1.1.0
|
||||
lxml==4.9.2
|
||||
ddddocr==1.4.10
|
||||
pymysql==1.1.1
|
||||
lxml==5.3.0
|
||||
ddddocr==1.5.6
|
||||
pynput==1.7.6
|
||||
beautifulsoup4==4.12.2
|
||||
undetected-chromedriver==3.4.7
|
||||
pandas==2.1.4
|
||||
pandas==2.2.3
|
||||
|
@ -1,11 +1,11 @@
|
||||
commandline_config==2.2.3
|
||||
requests==2.31.0
|
||||
requests==2.32.0
|
||||
selenium==4.16.0
|
||||
pyinstaller==5.13.2
|
||||
Pillow==9.5.0
|
||||
xlsxwriter==3.1.9
|
||||
openpyxl==3.1.2
|
||||
pymysql==1.1.0
|
||||
pymysql==1.1.1
|
||||
lxml==4.9.2
|
||||
ddddocr==1.4.10
|
||||
pynput==1.7.6
|
||||
|
@ -1,4 +1,8 @@
|
||||
# 环境编译说明|Environment Compilation Instruction
|
||||
## 视频教程
|
||||
|
||||
[从源代码编译程序并设计运行和调试任务指南(基于Ubuntu24.04)](https://www.bilibili.com/video/BV1VE421P7yj/)
|
||||
|
||||
# 环境编译说明 | Environment Compilation Instruction
|
||||
|
||||
EasySpider分三部分:
|
||||
|
||||
@ -18,20 +22,20 @@ EasySpider is divided into three parts:
|
||||
|
||||
This section covers the compilation instructions for the `Browser extension`, **all commands in this section are executed in the `manifest_v3` folder**, i.e., you need to `cd manifest_v3` first.
|
||||
|
||||
## 建议编译顺序|Suggested Compilation Order
|
||||
## 建议编译顺序 | Suggested Compilation Order
|
||||
|
||||
1. 编译浏览器扩展,否则在主程序执行时会提示找不到`EasySpider_zh.crx`的错误。
|
||||
2. 编译主程序,此时主程序可以正常运行,但无法执行任务,只能设计任务。
|
||||
3. 编译执行阶段程序,否则无法执行程序,只能设计程序。
|
||||
3. 编译执行阶段程序,否则无法执行任务,只能设计任务。
|
||||
|
||||
-----
|
||||
|
||||
1. Compile the browser extension first; otherwise, running the main program will report that `EasySpider_en.crx` cannot be found.
|
||||
2. Compile the main program. At this point the main program runs normally, but it can only design tasks, not execute them.
|
||||
3. Compile the execution stage program, otherwise the program cannot be executed, can only design the program.
|
||||
3. Compile the execution stage program; otherwise, tasks can only be designed, not executed.
|
||||
|
||||
|
||||
## 环境构建|Environment Setup
|
||||
## 环境构建 | Environment Setup
|
||||
|
||||
1. 安装`NodeJS`:[https://nodejs.org/zh-cn/download/](https://nodejs.org/zh-cn/download/)。
|
||||
2. 运行下面的命令来安装依赖:
|
||||
@ -49,7 +53,7 @@ npm install
|
||||
npm install
|
||||
```
|
||||
|
||||
## 热加载扩展|Hot reload the extension
|
||||
## 热加载扩展 | Hot reload the extension
|
||||
|
||||
执行下面的命令来热加载扩展:
|
||||
|
||||
@ -69,7 +73,7 @@ npm run dev
|
||||
|
||||
Open a Chrome browser window, then enter `chrome://extensions/` in the browser address bar. On the opened page, open the `Developer mode` in the upper right corner, click `Load unpacked` and select the `manifest_v3/dist` folder to load the extension.
|
||||
|
||||
## 打包扩展|Package the extension
|
||||
## 打包扩展 | Package the extension
|
||||
|
||||
执行下面的命令来打包扩展:
|
||||
|
||||
|
847
Extension/manifest_v3/package-lock.json
generated
847
Extension/manifest_v3/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "EasySpider",
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "rollup -c",
|
||||
@ -34,7 +34,7 @@
|
||||
"@types/node": "^16.11.10",
|
||||
"@vitejs/plugin-vue": "^1.9.3",
|
||||
"esno": "^0.12.1",
|
||||
"firebase": "^9.18.0",
|
||||
"firebase": "^10.12.2",
|
||||
"fs-extra": "^10.0.0",
|
||||
"npm-run-all": "^4.1.5",
|
||||
"rimraf": "^3.0.2",
|
||||
|
@ -1,5 +1,6 @@
|
||||
import config from './config.json';
|
||||
|
||||
|
||||
export var global = {
|
||||
nodeList: [], //已被选中的节点列表
|
||||
readyList: [], //预备选中的list
|
||||
@ -57,13 +58,13 @@ export function getElementXPaths(element, parentElement = document.body) {
|
||||
paths.push(pre_xpath + `id("${element.id}")`);
|
||||
}
|
||||
if (element.className) {
|
||||
paths.push(pre_xpath + "//" + element.tagName + "[@class='" + element.className + "']");
|
||||
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@class='" + element.className + "']");
|
||||
}
|
||||
if (element.name) {
|
||||
paths.push(pre_xpath + "//" + element.tagName + "[@name='" + element.name + "']");
|
||||
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@name='" + element.name + "']");
|
||||
}
|
||||
if (element.alt) {
|
||||
paths.push(pre_xpath + "//" + element.tagName + "[@alt='" + element.alt + "']");
|
||||
paths.push(pre_xpath + "//" + element.tagName.toLowerCase() + "[@alt='" + element.alt + "']");
|
||||
}
|
||||
paths.push(getAbsoluteXPathWithReverseIndex(element));
|
||||
console.log("ALL PATHS: " + paths);
|
||||
|
@ -1,9 +1,28 @@
|
||||
import $ from "jquery";
|
||||
import Vue from "vue";
|
||||
import {global, getOS, readXPath, addEl, clearEl, clearReady, handleElement, clearParameters, generateParameters, generateMultiParameters, handleDescendents, generateValTable, findRelated, pushToReadyList, readyToList, combineXpath, relatedTest} from "./global.js";
|
||||
import {
|
||||
global,
|
||||
getOS,
|
||||
readXPath,
|
||||
addEl,
|
||||
clearEl,
|
||||
clearReady,
|
||||
handleElement,
|
||||
clearParameters,
|
||||
generateParameters,
|
||||
generateMultiParameters,
|
||||
handleDescendents,
|
||||
generateValTable,
|
||||
findRelated,
|
||||
pushToReadyList,
|
||||
readyToList,
|
||||
combineXpath,
|
||||
relatedTest,
|
||||
LANG
|
||||
} from "./global.js";
|
||||
import ToolKit from "./toolkit.vue";
|
||||
import iframe from "./iframe.vue";
|
||||
|
||||
import {createNotification} from './trail.js';
|
||||
|
||||
//表现逻辑层的处理
|
||||
|
||||
@ -170,7 +189,7 @@ window.addEventListener('DOMContentLoaded', () => {
|
||||
document.onkeydown = function(event) {
|
||||
// console.log("keydown");
|
||||
var e = event || window.event || arguments.callee.caller.arguments[0];
|
||||
if (e && e.keyCode == 118) { // 按 F7
|
||||
if (e && e.keyCode == 113) { // 按 F2
|
||||
addEl();
|
||||
} else if (e && e.keyCode == 119) { //按F8
|
||||
clearEl();
|
||||
@ -316,11 +335,16 @@ function generateToolkit() {
|
||||
//Vue元素
|
||||
generateToolkit();
|
||||
|
||||
function closeToolkit() {
|
||||
toolkit.style.display = "none"; // 隐藏元素
|
||||
createNotification(LANG("EasySpider操作控制台已隐藏,可点击浏览器右上角扩展程序区域的EasySpider图标重新显示。", "EasySpider Toolkit is hidden. Click the EasySpider icon in the extension list (upper right corner) of the browser to reopen."));
|
||||
}
|
||||
|
||||
let closeButton = document.getElementById("closeButton");
|
||||
closeButton.addEventListener("click", function() {
|
||||
toolkit.style.display = "none"; // 隐藏元素
|
||||
closeToolkit();
|
||||
});
|
||||
let closeButtonLeft = document.getElementById("closeButtonLeft");
|
||||
closeButtonLeft.addEventListener("click", function() {
|
||||
toolkit.style.display = "none"; // 隐藏元素
|
||||
closeToolkit();
|
||||
});
|
||||
|
@ -27,6 +27,10 @@ global.ws.onmessage = function (evt) {
|
||||
clearEl();
|
||||
} else if (evt["type"] == "trial") {
|
||||
trial(evt);
|
||||
} else if (evt["type"] == "showAllToolboxes") {
|
||||
document.getElementById("wrapperToolkit").style.display = "block";
|
||||
} else if (evt["type"] == "hideAllToolboxes") {
|
||||
document.getElementById("wrapperToolkit").style.display = "none";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
</div>
|
||||
<p style="color:black; margin-top: 10px">● 鼠标移动到笑脸<span style="font-size: 20px">☺</span>查看操作提示。</p>
|
||||
<p style="color:black; margin-top: 10px">●
|
||||
鼠标移动到元素上后,请<strong>右键</strong>点击或者按<strong>F7</strong>键选中页面元素。
|
||||
鼠标移动到元素上后,请<strong>右键</strong>点击或者按<strong>F2</strong>键选中页面元素。
|
||||
</p>
|
||||
<p style="color:black; margin-top: 10px">●
|
||||
通过鼠标左键进行点击时,页面也会有反应,但左键点击发生的操作不会被记录在任务流程中;同理,如果想输入文本框但并不想将动作记录,可以鼠标移动到文本框,并按键盘的<strong>F9</strong>进行输入。
|
||||
@ -231,7 +231,7 @@
|
||||
<p style="color:black; margin-top: 10px">● Mouse move to smiling face <span style="font-size: 20px">☺</span> to see operation help.</p>
|
||||
<p style="color:black; margin-top: 10px">● When your mouse moves to the element, please
|
||||
<strong>right-click</strong> your
|
||||
mouse button or press <strong>F7</strong> on the keyboard to select it.</p>
|
||||
mouse button or press <strong>F2</strong> on the keyboard to select it.</p>
|
||||
<p style="color:black; margin-top: 10px">● When clicked with the left mouse button, the page will also
|
||||
respond, but this click operation will not be recorded in the task flow. Similarly, if you want to input
|
||||
in a text box but do not want the action to be recorded , you can move the mouse to the text box and
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "EasySpider",
|
||||
"version": "0.6.2",
|
||||
"version": "0.6.3",
|
||||
"description": "EasySpider's chrome extension",
|
||||
"author": "Naibo Wang",
|
||||
"manifest_version": 3,
|
||||
@ -11,6 +11,7 @@
|
||||
"38": "assets/icon-38.png",
|
||||
"128": "assets/icon-128.png"
|
||||
},
|
||||
"default_popup": "popup.html",
|
||||
"default_title": "EasySpider"
|
||||
},
|
||||
"icons": {
|
||||
@ -53,6 +54,7 @@
|
||||
"storage",
|
||||
"tabs",
|
||||
"scripting",
|
||||
"activeTab",
|
||||
"notifications"
|
||||
]
|
||||
}
|
@ -1,11 +1,19 @@
|
||||
<!doctype html>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Popup 示例</title>
|
||||
<link rel="stylesheet" type="text/css" href="popup.css">
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>EasySpider Control Panel</title>
|
||||
<link rel="stylesheet" href="style/bootstrap.min.css">
|
||||
</head>
|
||||
<body>
|
||||
<!-- <h2>EasySpider Extension</h2>-->
|
||||
EasySpider Extension, please do not disable me.
|
||||
<body class="p-4">
|
||||
<div class="text-center">
|
||||
<!-- <h3>操作</h3>-->
|
||||
<p id="title">可执行操作</p>
|
||||
<button id="show-toolkit" class="btn btn-primary" style="width: 200px">显示EasySpider操作台</button>
|
||||
<p></p>
|
||||
<button id="close-toolkit" class="btn btn-danger" style="width: 200px">隐藏EasySpider操作台</button>
|
||||
</div>
|
||||
<script src="popup.js"></script>
|
||||
</body>
|
||||
</html>
|
@ -1,3 +1,141 @@
|
||||
document.getElementById('clickme').addEventListener('click', () => {
|
||||
alert('Hello, World!');
|
||||
import config from './content-scripts/config.json';
|
||||
import {global} from "./content-scripts/global.js";
|
||||
|
||||
// Localise the popup's static labels: Chinese when the configured language
// is 'zh', English otherwise. Keys are the ids of the elements to update.
const popupLabels = config.language === 'zh'
    ? {
        'title': '可执行操作',
        'show-toolkit': '显示EasySpider操作台',
        'close-toolkit': '隐藏EasySpider操作台',
    }
    : {
        'title': 'Executable Operations',
        'show-toolkit': 'Show EasySpider Toolkit',
        'close-toolkit': 'Hide EasySpider Toolkit',
    };

for (const [elementId, label] of Object.entries(popupLabels)) {
    document.getElementById(elementId).innerText = label;
}
|
||||
|
||||
// WebSocket link from the popup to the local EasySpider server.
const ws = new WebSocket("ws://localhost:8084");
ws.onopen = () => {
    // Connected: register this socket by sending the connect message.
    console.log("已连接");
    const registration = {
        type: 0, // message type 0 = connect
        message: {
            id: 3, // socket id
            title: document.title, // page title
        },
    };
    ws.send(JSON.stringify(registration));
};
|
||||
|
||||
// "Show toolkit" button: inject showToolkit() into the active tab, then send
// message type 30 over the WebSocket.
document.getElementById('show-toolkit').addEventListener('click', async () => {
    try {
        const [tab] = await chrome.tabs.query({active: true, currentWindow: true});
        // Not awaited so the popup can close immediately; the rejection handler
        // keeps an injection failure from becoming an unhandled promise
        // rejection that the surrounding try/catch would never see.
        chrome.scripting.executeScript({
            target: {tabId: tab.id},
            func: showToolkit
        }).catch((error) => {
            console.error('Error showing toolkit:', error);
        });
        let message_action = {
            type: 30, // message type 30 = show all toolkits
            from: 3, // 3 = popup
            message: {}
        };
        window.close();
        try {
            ws.send(JSON.stringify(message_action));
        } catch (e) {
            // Best effort: send() throws while the socket is still connecting.
            // Report it instead of silently dropping the error.
            console.warn('Could not send the show-toolkit message:', e);
        }
    } catch (error) {
        console.error('Error showing toolkit:', error);
    }
});
|
||||
|
||||
// "Hide toolkit" button: inject closeToolkit() into the active tab, then send
// message type 31 over the WebSocket.
document.getElementById('close-toolkit').addEventListener('click', async () => {
    try {
        const [tab] = await chrome.tabs.query({active: true, currentWindow: true});
        // Not awaited so the popup can close immediately; the rejection handler
        // keeps an injection failure from becoming an unhandled promise
        // rejection that the surrounding try/catch would never see.
        chrome.scripting.executeScript({
            target: {tabId: tab.id},
            func: closeToolkit
        }).catch((error) => {
            console.error('Error closing toolkit:', error);
        });
        let message_action = {
            type: 31, // message type 31 = hide all toolkits
            from: 3, // 3 = popup
            message: {}
        };
        window.close();
        try {
            ws.send(JSON.stringify(message_action));
        } catch (e) {
            // Best effort: send() throws while the socket is still connecting.
            // Report it instead of silently dropping the error.
            console.warn('Could not send the hide-toolkit message:', e);
        }
    } catch (error) {
        console.error('Error closing toolkit:', error);
    }
});
|
||||
|
||||
// 显示操作台函数
|
||||
/**
 * Makes every hidden `#wrapperToolkit` element visible again, in the current
 * document and in every iframe document that can be reached from it.
 *
 * This function is handed to `chrome.scripting.executeScript` as `func`, so
 * it must stay self-contained and must not reference anything declared
 * outside its own body.
 */
function showToolkit() {
    const revealIn = (root) => {
        for (const wrapper of root.querySelectorAll('#wrapperToolkit')) {
            if (getComputedStyle(wrapper).display !== 'none') {
                continue; // already visible
            }
            wrapper.style.display = 'block';
            console.log('显示EasySpider操作台');
        }

        // Descend into nested frames; accessing a frame's document can throw,
        // in which case that frame is skipped with a warning.
        for (const frame of root.querySelectorAll('iframe')) {
            try {
                const frameDoc = frame.contentDocument || frame.contentWindow.document;
                if (frameDoc) {
                    revealIn(frameDoc);
                }
            } catch (err) {
                console.warn('无法访问 iframe:', err);
            }
        }
    };

    // Start from the main document.
    revealIn(document);
}
|
||||
|
||||
// 关闭操作台函数
|
||||
/**
 * Hides every visible `#wrapperToolkit` element, in the current document and
 * in every iframe document that can be reached from it.
 *
 * A wrapper is hidden whenever its computed display is anything other than
 * `none` (not only when it is exactly `block`), so a wrapper rendered with
 * another display value is hidden as well.
 *
 * This function is handed to `chrome.scripting.executeScript` as `func`, so
 * it must stay self-contained and must not reference anything declared
 * outside its own body.
 */
function closeToolkit() {
    const hideContainers = (documentRoot) => {
        const containers = documentRoot.querySelectorAll('#wrapperToolkit');
        containers.forEach(container => {
            if (getComputedStyle(container).display !== 'none') {
                container.style.display = 'none';
                console.log('关闭EasySpider操作台');
            }
        });
    };

    const processIframes = (documentRoot) => {
        const iframes = documentRoot.querySelectorAll('iframe');
        iframes.forEach(iframe => {
            try {
                const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
                if (iframeDoc) {
                    // Hide the wrappers inside this iframe, then go deeper.
                    hideContainers(iframeDoc);
                    processIframes(iframeDoc);
                }
            } catch (err) {
                // Accessing a frame's document can throw; skip that frame.
                console.warn('无法访问 iframe:', err);
            }
        });
    };

    // Handle the main document and all nested iframes.
    hideContainers(document);
    processIframes(document);
}
|
||||
|
6
Extension/manifest_v3/src/style/bootstrap.min.css
vendored
Normal file
6
Extension/manifest_v3/src/style/bootstrap.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user