MacOS version with two execute-stage executables

This commit is contained in:
Naibo_Mac_M2 2023-12-23 14:55:21 +08:00
parent e79eecc669
commit 476cec0537
9 changed files with 91 additions and 77 deletions

View File

@ -1,4 +1,5 @@
EasySpider_MacOS/easyspider_executestage
EasySpider_MacOS/easyspider_executestage_full
EasySpider_Linux64_x64/user_data
EasySpider_windows_x32/user_data
EasySpider

View File

@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}

View File

@ -59,27 +59,27 @@ let chromeBinaryPath = "";
let execute_path = "";
console.log(process.arch);
// Windows 7 compatibility check: when the OS caption reports Windows 7 and the
// configured architecture is x64, show a blocking error dialog telling the user
// to switch to the x32 build.
// `wmic` is a Windows-only tool, so the probe is gated on the platform instead
// of being commented out by hand for macOS/Linux builds.
if (process.platform === "win32") {
  exec(`wmic os get Caption`, function (error, stdout, stderr) {
    if (error) {
      // Best effort: a failed probe only means the warning is not shown.
      console.error(`执行的错误: ${error}`);
      return;
    }
    if (!stdout.includes("Windows 7")) {
      console.log("Not Windows 7");
      return;
    }
    console.log("Windows 7");
    // NOTE(review): confirm `config.sys_arch` is populated by the loader; the
    // config JSON touched in the same change stores the value as `sys_version`.
    let sys_arch = config.sys_arch;
    if (sys_arch === "x64") {
      dialog.showMessageBoxSync({
        type: "error",
        title: "Error",
        message:
          "Windows 7系统请下载使用x32版本的软件不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
      });
    }
  });
}
if (process.platform === "win32" && process.arch === "ia32") {
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");

View File

@ -79,8 +79,9 @@
<div class="modal-body">
    <!-- Hidden fields of the dialog form. <input> is a void element, so it takes no end tag. -->
    <input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1">
    <input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank">
    <!-- Link to the command-line argument reference. -->
    <label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction" target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
    <!-- macOS only: explains the difference between the two bundled executors. -->
    <label v-if="OS === 'darwin'">{{`对于MacOS系统EasySpider提供了两个不同的执行程序分别为easyspider_executestage和easyspider_executestage_full前者执行时加载速度较快并提供了除OCR识别和数据去重以外的全部功能后者则提供了包括OCR识别和数据去重在内的全部功能但运行时加载速度较慢需要等待2-10分钟才能执行程序请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
    <!-- Usage instructions, shown once, directly above the command they refer to. -->
    <label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal Windows请使用PowerShell而不是CMD然后复制Command/Ctrl + c和运行以下命令以执行任务执行命令时不能退出EasySpider除非将--read_type设置为local` | lang }}</label>
    <!-- Textarea content is whitespace-sensitive: keep the second line at column 0. -->
    <textarea class="form-control" style="height:150px">cd {{easyspider_location}}
{{command}} --config_folder "{{config_folder}}" --headless 0 --read_type remote --config_file_name config.json --saved_file_name </textarea>
</div>
@ -314,6 +315,7 @@
config_folder: "",
easyspider_location: "",
mysql_config_path: "",
OS: "win32",
}, mounted() {
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
app.$data.user_data_folder = result.user_data_folder;
@ -540,6 +542,7 @@
function changeCommand() {
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
app.$data.OS = OSInfo.version;
if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){

View File

@ -0,0 +1 @@
{"id":308,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-23 14:21:24","update_time":"2023-12-23 14:23:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","befo
reJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

View File

@ -44,14 +44,25 @@ import sys
# import hashlib
import time
import requests
from ddddocr import DdddOcr
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# OCR support is optional: if ddddocr / onnxruntime cannot be imported, print a
# notice pointing to the full executor and continue start-up without OCR.
try:
    from ddddocr import DdddOcr
    import onnxruntime
    onnxruntime.set_default_logger_severity(3)  # Hide onnxruntime's log output
except Exception:
    # Deliberately not a bare `except:` so that Ctrl+C (KeyboardInterrupt) and
    # SystemExit still propagate while any import failure stays non-fatal.
    print("OCR识别无法在当前环境下使用ddddocr库缺失请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
    print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
    time.sleep(2)  # brief pause, presumably so the notice can be read
from urllib.parse import urljoin
from lxml import etree, html
# Data deduplication is optional: if pandas cannot be imported, print a notice
# pointing to the full executor and continue start-up without it.
try:
    import pandas as pd
except Exception:
    # Deliberately not a bare `except:` so that Ctrl+C (KeyboardInterrupt) and
    # SystemExit still propagate while any import failure stays non-fatal.
    print("数据去重无法在当前环境下使用pandas库缺失请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
    print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
    time.sleep(2)  # brief pause, presumably so the notice can be read
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
import pandas as pd
# import numpy
# import pytesseract
# import uuid
@ -2185,8 +2196,6 @@ class BrowserThread(Thread):
self.OUTPUT.append(line)
if __name__ == '__main__':
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {

View File

@ -1,7 +1,17 @@
# Build two executors with PyInstaller: first one without ddddocr and pandas,
# then one with them. The version without ddddocr and pandas runs much faster.

# --- Build 1: without ddddocr / pandas -------------------------------------
# Start from a clean PyInstaller state; -f keeps a first-time build (no
# build/dist directories yet) from printing errors.
rm -rf build dist
pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn
# Publish it under the plain executor name.
rm -f ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage

echo "With ddddocr and pandas"
# --- Build 2: with ddddocr / pandas ----------------------------------------
rm -rf build dist
pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
# Publish it only under the *_full name so the executable copied by build 1 is
# not deleted or overwritten.
rm -f ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full

View File

@ -0,0 +1,34 @@
import os
import subprocess
import sys
from pathlib import Path
# Report how much disk space each installed package occupies, largest first.
import json  # used only by this script body, to parse pip's JSON listing

# "lib" directory of the current Python environment.
lib_path = Path(sys.prefix) / "lib"
# site-packages of the running interpreter: <prefix>/lib/python<major>.<minor>/site-packages.
site_packages = lib_path / "python{0}.{1}".format(*sys.version_info) / "site-packages"

# Ask pip for a machine-readable listing instead of slicing the two header
# lines off its human-readable table; the table layout depends on the
# configured list format and made the old name/version unpacking fragile.
pip_listing = subprocess.check_output(
    [sys.executable, '-m', 'pip', 'list', '--format=json']
).decode()
installed_packages = [entry["name"] for entry in json.loads(pip_listing)]

# Maps package name -> total size in bytes (0 when no matching directory exists).
package_sizes = {}
for name in installed_packages:
    # Look for a top-level directory named exactly like the package.
    # NOTE: this simple name-to-directory mapping does not fit every package.
    # For example, Google's protobuf package lives under 'google' and
    # 'protobuf' on disk; such cases need special handling or the package
    # metadata to find the right top-level directories.
    package_dir = site_packages / name
    package_size = 0
    if package_dir.exists():
        # Sum the size of every regular file below the package directory.
        package_size = sum(f.stat().st_size for f in package_dir.glob('**/*') if f.is_file())
    package_sizes[name] = package_size

# Print the packages sorted by size, largest first.
for name, size in sorted(package_sizes.items(), key=lambda item: item[1], reverse=True):
    print(f"{name}: {size/1024/1024:.2f} MB")

View File

@ -1,44 +0,0 @@
# from lxml import etree
# # 解析HTML
# html = """
# <div>
# 123
# <ul class="list">
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# </ul>
# 456
# <div></div>
# 789
# </div>
# """
# html = etree.HTML(html)
# element = html.xpath("*")
# direct_text = "/html/body/" + html[0][0].tag + "/text()"
# all_text = "/html/body/" + html[0][0].tag + "//text()"
# # 使用XPath选择元素
# results = html.xpath(direct_text)
# # print(results)
# # 拼接所有文本内容并去掉两边的空白
# text = ' '.join(result.strip() for result in results if result.strip())
# # 输出结果
# print(text)
# results = html.xpath(all_text)
# # print(results)
# # 拼接所有文本内容并去掉两边的空白
# text = ' '.join(result.strip() for result in results if result.strip())
# # 输出结果
# print(text)
import re
def lowercase_xpath_tags(xpath):
    """Lower-case upper-case element names in an XPath expression.

    A run is converted when it starts with ``A-Z``, continues with ``A-Z`` or
    digits, and is immediately followed by ``[``, ``]``, ``/`` or the end of
    the string. Allowing digits means names such as ``H1`` are converted as a
    whole instead of being skipped.

    The match is purely textual: upper-case text inside a quoted value is
    converted too if it happens to sit directly before one of those characters.

    Args:
        xpath: The XPath expression as a string.

    Returns:
        The expression with every matching run lower-cased; all other
        characters are returned unchanged.
    """
    return re.sub(r"[A-Z][A-Z0-9]*(?=[\[\]/]|$)", lambda match: match.group(0).lower(), xpath)
# Demo: print the result of lower-casing the tag names of a sample XPath.
print(lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL'))
# Print a second sample XPath as-is (it is not passed through the function).
print("//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]")
# Emit a final empty line.
print("")