diff --git a/.temp_to_pub/.gitignore b/.temp_to_pub/.gitignore
index 9a8f15e..0167622 100644
--- a/.temp_to_pub/.gitignore
+++ b/.temp_to_pub/.gitignore
@@ -1,4 +1,5 @@
EasySpider_MacOS/easyspider_executestage
+EasySpider_MacOS/easyspider_executestage_full
EasySpider_Linux64_x64/user_data
EasySpider_windows_x32/user_data
EasySpider
diff --git a/ElectronJS/config.json b/ElectronJS/config.json
index 252c802..0d7be07 100644
--- a/ElectronJS/config.json
+++ b/ElectronJS/config.json
@@ -1 +1 @@
-{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
\ No newline at end of file
+{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}
\ No newline at end of file
diff --git a/ElectronJS/main.js b/ElectronJS/main.js
index a3aaceb..2f8bd41 100644
--- a/ElectronJS/main.js
+++ b/ElectronJS/main.js
@@ -59,27 +59,32 @@ let chromeBinaryPath = "";
let execute_path = "";
console.log(process.arch);
-exec(`wmic os get Caption`, function (error, stdout, stderr) {
- if (error) {
- console.error(`执行的错误: ${error}`);
- return;
- }
+// `wmic` is a Windows-only tool, so the Windows 7 check runs on win32 only and
+// is skipped elsewhere instead of being disabled for every platform.
+if (process.platform === "win32") {
+  exec(`wmic os get Caption`, function (error, stdout, stderr) {
+    if (error) {
+      console.error(`执行的错误: ${error}`);
+      return;
+    }
- if (stdout.includes("Windows 7")) {
- console.log("Windows 7");
- let sys_arch = config.sys_arch;
- if (sys_arch === "x64") {
- dialog.showMessageBoxSync({
- type: "error",
- title: "Error",
- message:
- "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
- });
- }
- } else {
- console.log("Not Windows 7");
- }
-});
+    if (stdout.includes("Windows 7")) {
+      console.log("Windows 7");
+      // NOTE(review): the committed config.json has `sys_version` but no `sys_arch`; if `config` is loaded from that file this is always undefined — confirm the key.
+      const sys_arch = config.sys_arch;
+      if (sys_arch === "x64") {
+        dialog.showMessageBoxSync({
+          type: "error",
+          title: "Error",
+          message:
+            "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
+        });
+      }
+    } else {
+      console.log("Not Windows 7");
+    }
+  });
+}
if (process.platform === "win32" && process.arch === "ia32") {
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");
diff --git a/ElectronJS/src/taskGrid/executeTask.html b/ElectronJS/src/taskGrid/executeTask.html
index 5b1dfd3..8ece920 100644
--- a/ElectronJS/src/taskGrid/executeTask.html
+++ b/ElectronJS/src/taskGrid/executeTask.html
@@ -79,8 +79,9 @@
-
+
+
@@ -314,6 +315,7 @@
config_folder: "",
easyspider_location: "",
mysql_config_path: "",
+ OS: "win32",
}, mounted() {
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
app.$data.user_data_folder = result.user_data_folder;
@@ -540,6 +542,7 @@
function changeCommand() {
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
+ app.$data.OS = OSInfo.version;
if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){
diff --git a/ElectronJS/tasks/308.json b/ElectronJS/tasks/308.json
new file mode 100644
index 0000000..d972f14
--- /dev/null
+++ b/ElectronJS/tasks/308.json
@@ -0,0 +1 @@
+{"id":308,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-23 14:21:24","update_time":"2023-12-23 14:23:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","bef
oreJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
\ No newline at end of file
diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py
index 1db3930..26c3b58 100644
--- a/ExecuteStage/easyspider_executestage.py
+++ b/ExecuteStage/easyspider_executestage.py
@@ -44,14 +44,25 @@ import sys
# import hashlib
import time
import requests
-from ddddocr import DdddOcr
+from multiprocessing import freeze_support
+freeze_support()  # Prevent the frozen executable from endlessly re-launching itself
+try:  # The OCR stack is optional: when it cannot be imported, print a hint and carry on
+    from ddddocr import DdddOcr
+    import onnxruntime
+    onnxruntime.set_default_logger_severity(3)  # Hide onnxruntime's log output
+except Exception:  # Not a bare except, so Ctrl+C and SystemExit still propagate
+    print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
+    print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
+    time.sleep(2)  # Leave the hint on screen briefly before start-up continues
from urllib.parse import urljoin
from lxml import etree, html
+try:  # pandas is optional as well; the printed hint says deduplication needs it
+    import pandas as pd
+except Exception:  # Not a bare except, so Ctrl+C and SystemExit still propagate
+    print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
+    print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
+    time.sleep(2)  # Leave the hint on screen briefly before start-up continues
-import onnxruntime
-
-onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
-import pandas as pd
# import numpy
# import pytesseract
# import uuid
@@ -2185,8 +2196,6 @@ class BrowserThread(Thread):
self.OUTPUT.append(line)
if __name__ == '__main__':
- from multiprocessing import freeze_support
- freeze_support() # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {
diff --git a/ExecuteStage/generateExecutable_Macos.sh b/ExecuteStage/generateExecutable_Macos.sh
index dbdd180..ce0c754 100755
--- a/ExecuteStage/generateExecutable_Macos.sh
+++ b/ExecuteStage/generateExecutable_Macos.sh
@@ -1,7 +1,17 @@
+# Build a variant without ddddocr and pandas first, then a variant that bundles them; the variant without ddddocr and pandas runs much faster
+rm -r build
+rm -r dist
+pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn # NOTE(review): --exclude-module takes import names; `pillow` (import name PIL) and `onnxruntime_pybind11_state.so` (a file name) probably match nothing — confirm
+
+rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
+cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
+# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage
+
+echo "With ddddocr and pandas"
+
+# Build the variant that bundles ddddocr and pandas
rm -r build
rm -r dist
pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
-rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
-rm ../ElectronJS/easyspider_executestage
-cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
-# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage
+rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
+cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
\ No newline at end of file
diff --git a/ExecuteStage/package_size.py b/ExecuteStage/package_size.py
new file mode 100644
index 0000000..2bb4bac
--- /dev/null
+++ b/ExecuteStage/package_size.py
@@ -0,0 +1,67 @@
+"""Report the on-disk size of every distribution installed in this Python environment.
+
+Sizes come from each distribution's own file manifest (RECORD or an
+equivalent), so packages whose import name differs from the distribution
+name (for example protobuf, which installs under google/protobuf) and
+single-file modules are measured as well.  Distributions without a manifest
+fall back to a directory named after the distribution inside site-packages.
+
+Output: one "<name>: <size> MB" line per distribution, largest first.
+"""
+import sysconfig
+from importlib import metadata
+from pathlib import Path
+
+
+def _site_package_dirs():
+    """Return the interpreter's site-packages directories (pure and platform-specific)."""
+    paths = sysconfig.get_paths()
+    return {Path(paths["purelib"]), Path(paths["platlib"])}
+
+
+def _file_size(path):
+    """Return the size of ``path`` in bytes, or 0 when it is not a readable regular file."""
+    try:
+        return path.stat().st_size if path.is_file() else 0
+    except OSError:
+        return 0
+
+
+def _tree_size(directory):
+    """Return the total size in bytes of every regular file below ``directory``."""
+    return sum(_file_size(entry) for entry in directory.rglob("*"))
+
+
+def _distribution_size(dist, name, site_dirs):
+    """Return the installed size in bytes of the distribution ``dist`` called ``name``."""
+    files = dist.files
+    if files is not None:
+        # A set guards against a manifest that lists the same file twice.
+        return sum(_file_size(path) for path in {Path(dist.locate_file(f)) for f in files})
+    # No manifest available: guess the top-level directory from the distribution name.
+    guesses = {site_dir / stem for site_dir in site_dirs for stem in (name, name.replace("-", "_"))}
+    return sum(_tree_size(guess) for guess in guesses if guess.is_dir())
+
+
+def collect_package_sizes():
+    """Return a ``{distribution name: size in bytes}`` mapping for the current environment."""
+    site_dirs = _site_package_dirs()
+    sizes = {}
+    for dist in metadata.distributions():
+        name = dist.metadata.get("Name")
+        if not name:
+            continue  # Skip metadata directories that do not even carry a name.
+        # The same distribution may be discovered more than once; keep a single value.
+        sizes[name] = max(sizes.get(name, 0), _distribution_size(dist, name, site_dirs))
+    return sizes
+
+
+def main():
+    """Print every distribution and its size in MB, sorted from largest to smallest."""
+    ranked = sorted(collect_package_sizes().items(), key=lambda item: item[1], reverse=True)
+    for name, size in ranked:
+        print(f"{name}: {size/1024/1024:.2f} MB")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ExecuteStage/test.py b/ExecuteStage/test.py
deleted file mode 100644
index 4f4e1c8..0000000
--- a/ExecuteStage/test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# from lxml import etree
-
-# # 解析HTML
-# html = """
-#
-# 123
-#
-# 456
-#
-# 789
-#
-# """
-# html = etree.HTML(html)
-# element = html.xpath("*")
-# direct_text = "/html/body/" + html[0][0].tag + "/text()"
-# all_text = "/html/body/" + html[0][0].tag + "//text()"
-# # 使用XPath选择元素
-# results = html.xpath(direct_text)
-# # print(results)
-# # 拼接所有文本内容并去掉两边的空白
-# text = ' '.join(result.strip() for result in results if result.strip())
-
-# # 输出结果
-# print(text)
-
-# results = html.xpath(all_text)
-# # print(results)
-# # 拼接所有文本内容并去掉两边的空白
-# text = ' '.join(result.strip() for result in results if result.strip())
-
-# # 输出结果
-# print(text)
-
-import re
-
-def lowercase_xpath_tags(xpath):
- return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
-
-print(lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL'))
-print("//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]")
-print("")
\ No newline at end of file