diff --git a/.temp_to_pub/.gitignore b/.temp_to_pub/.gitignore index 9a8f15e..0167622 100644 --- a/.temp_to_pub/.gitignore +++ b/.temp_to_pub/.gitignore @@ -1,4 +1,5 @@ EasySpider_MacOS/easyspider_executestage +EasySpider_MacOS/easyspider_executestage_full EasySpider_Linux64_x64/user_data EasySpider_windows_x32/user_data EasySpider diff --git a/ElectronJS/config.json b/ElectronJS/config.json index 252c802..0d7be07 100644 --- a/ElectronJS/config.json +++ b/ElectronJS/config.json @@ -1 +1 @@ -{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"} \ No newline at end of file +{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"} \ No newline at end of file diff --git a/ElectronJS/main.js b/ElectronJS/main.js index a3aaceb..2f8bd41 100644 --- a/ElectronJS/main.js +++ b/ElectronJS/main.js @@ -59,27 +59,27 @@ let chromeBinaryPath = ""; let execute_path = ""; console.log(process.arch); -exec(`wmic os get Caption`, function (error, stdout, stderr) { - if (error) { - console.error(`执行的错误: ${error}`); - return; - } +// exec(`wmic os get Caption`, function (error, stdout, stderr) { +// if (error) { +// console.error(`执行的错误: ${error}`); +// return; +// } - if (stdout.includes("Windows 7")) { - console.log("Windows 7"); - let sys_arch = config.sys_arch; - if (sys_arch === "x64") { - dialog.showMessageBoxSync({ - type: "error", - title: "Error", - message: - "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.", - }); - } - } else { - console.log("Not Windows 7"); - } -}); +// if (stdout.includes("Windows 7")) { +// console.log("Windows 7"); +// let sys_arch = config.sys_arch; +// if (sys_arch === "x64") { +// dialog.showMessageBoxSync({ +// type: "error", +// title: "Error", +// message: +// "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.", +// }); +// } +// } else { +// console.log("Not Windows 7"); +// } +// }); if (process.platform === "win32" && process.arch === "ia32") { driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe"); diff --git a/ElectronJS/src/taskGrid/executeTask.html b/ElectronJS/src/taskGrid/executeTask.html index 5b1dfd3..8ece920 100644 --- a/ElectronJS/src/taskGrid/executeTask.html +++ b/ElectronJS/src/taskGrid/executeTask.html @@ -79,8 +79,9 @@ @@ -314,6 +315,7 @@ config_folder: "", easyspider_location: "", mysql_config_path: "", + OS: "win32", }, mounted() { $.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) { app.$data.user_data_folder = result.user_data_folder; @@ -540,6 +542,7 @@ function changeCommand() { $.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) { + app.$data.OS = OSInfo.version; if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){ app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper; } else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){ diff --git a/ElectronJS/tasks/308.json b/ElectronJS/tasks/308.json new file mode 100644 index 0000000..d972f14 --- /dev/null +++ b/ElectronJS/tasks/308.json @@ -0,0 +1 @@ +{"id":308,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-23 14:21:24","update_time":"2023-12-23 14:23:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]} \ No newline at end of file diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index 1db3930..26c3b58 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -44,14 +44,25 @@ import sys # import hashlib import time import requests -from ddddocr import DdddOcr +from multiprocessing import freeze_support +freeze_support() # 防止无限死循环多开 +try: + from ddddocr import DdddOcr + import onnxruntime + onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志 +except: + print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。") + print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.") + time.sleep(2) from urllib.parse import urljoin from lxml import etree, html +try: + import pandas as pd +except: + print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。") + print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.") + time.sleep(2) -import onnxruntime - -onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志 -import pandas as pd # import numpy # import pytesseract # import uuid @@ -2185,8 +2196,6 @@ class BrowserThread(Thread): self.OUTPUT.append(line) if __name__ == '__main__': - from multiprocessing import freeze_support - freeze_support() # 防止无限死循环多开 # 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度 # If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed config = { diff --git a/ExecuteStage/generateExecutable_Macos.sh b/ExecuteStage/generateExecutable_Macos.sh index dbdd180..ce0c754 100755 --- a/ExecuteStage/generateExecutable_Macos.sh +++ b/ExecuteStage/generateExecutable_Macos.sh @@ -1,7 +1,17 @@ +# 先打包一个不带ddddocr和pandas的版本,然后再打包一个带的版本,不带ddddocr和pandas的版本运行速度会快很多 +rm -r build +rm -r dist +pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn + +rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage +cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage +# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage + +echo "With ddddocr and pandas" + +# # 打包带ddddocr和pandas的版本 rm -r build rm -r dist pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py -rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage -rm ../ElectronJS/easyspider_executestage -cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage -# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage +rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full +cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full \ No newline at end of file diff --git a/ExecuteStage/package_size.py b/ExecuteStage/package_size.py new file mode 100644 index 0000000..2bb4bac --- /dev/null +++ b/ExecuteStage/package_size.py @@ -0,0 +1,34 @@ +import os +import subprocess +import sys +from pathlib import Path + +# 获取当前Python环境的lib路径 +lib_path = Path(sys.prefix) / "lib" + +# 使用pip列出所有已安装的包及其版本 +installed_packages = subprocess.check_output([sys.executable, '-m', 'pip', 'list']).decode().strip().split('\n')[2:] + +# 初始化一个字典来保存数据 +package_sizes = {} + +# 对于每个已安装的包,找到对应的路径并计算大小 +for package in installed_packages: + name, version = package.split()[:2] + package_size = 0 + + # 寻找与包名相关的顶层目录 + # 注意:这里简单地把包名直接转换为目录名,这在某些情况下可能不适用。 + # 例如,Google 的 protobuf 包在文件系统中称为 'google' 和 'protobuf' + # 这需要特别处理或者使用包的元数据来找到正确的顶层目录。 + package_dir = lib_path / "python{0}.{1}".format(*sys.version_info) / "site-packages" / name + + # 计算文件夹大小 + if package_dir.exists(): + package_size = sum(f.stat().st_size for f in package_dir.glob('**/*') if f.is_file()) + + package_sizes[name] = package_size + +# 将包按大小排序并输出 +for name, size in sorted(package_sizes.items(), key=lambda item: item[1], reverse=True): + print(f"{name}: {size/1024/1024:.2f} MB") \ No newline at end of file diff --git a/ExecuteStage/test.py b/ExecuteStage/test.py deleted file mode 100644 index 4f4e1c8..0000000 --- a/ExecuteStage/test.py +++ /dev/null @@ -1,44 +0,0 @@ -# from lxml import etree - -# # 解析HTML -# html = """ -#
-# 123 -# -# 456 -#
-# 789 -#
-# """ -# html = etree.HTML(html) -# element = html.xpath("*") -# direct_text = "/html/body/" + html[0][0].tag + "/text()" -# all_text = "/html/body/" + html[0][0].tag + "//text()" -# # 使用XPath选择元素 -# results = html.xpath(direct_text) -# # print(results) -# # 拼接所有文本内容并去掉两边的空白 -# text = ' '.join(result.strip() for result in results if result.strip()) - -# # 输出结果 -# print(text) - -# results = html.xpath(all_text) -# # print(results) -# # 拼接所有文本内容并去掉两边的空白 -# text = ' '.join(result.strip() for result in results if result.strip()) - -# # 输出结果 -# print(text) - -import re - -def lowercase_xpath_tags(xpath): - return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath) - -print(lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL')) -print("//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]") -print("") \ No newline at end of file