mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-12 11:37:11 +08:00
MacOS with two execute stage version
This commit is contained in:
parent
e79eecc669
commit
476cec0537
1
.temp_to_pub/.gitignore
vendored
1
.temp_to_pub/.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
EasySpider_MacOS/easyspider_executestage
|
||||
EasySpider_MacOS/easyspider_executestage_full
|
||||
EasySpider_Linux64_x64/user_data
|
||||
EasySpider_windows_x32/user_data
|
||||
EasySpider
|
||||
|
@ -1 +1 @@
|
||||
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
|
||||
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}
|
@ -59,27 +59,27 @@ let chromeBinaryPath = "";
|
||||
let execute_path = "";
|
||||
console.log(process.arch);
|
||||
|
||||
exec(`wmic os get Caption`, function (error, stdout, stderr) {
|
||||
if (error) {
|
||||
console.error(`执行的错误: ${error}`);
|
||||
return;
|
||||
}
|
||||
// exec(`wmic os get Caption`, function (error, stdout, stderr) {
|
||||
// if (error) {
|
||||
// console.error(`执行的错误: ${error}`);
|
||||
// return;
|
||||
// }
|
||||
|
||||
if (stdout.includes("Windows 7")) {
|
||||
console.log("Windows 7");
|
||||
let sys_arch = config.sys_arch;
|
||||
if (sys_arch === "x64") {
|
||||
dialog.showMessageBoxSync({
|
||||
type: "error",
|
||||
title: "Error",
|
||||
message:
|
||||
"Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
|
||||
});
|
||||
}
|
||||
} else {
|
||||
console.log("Not Windows 7");
|
||||
}
|
||||
});
|
||||
// if (stdout.includes("Windows 7")) {
|
||||
// console.log("Windows 7");
|
||||
// let sys_arch = config.sys_arch;
|
||||
// if (sys_arch === "x64") {
|
||||
// dialog.showMessageBoxSync({
|
||||
// type: "error",
|
||||
// title: "Error",
|
||||
// message:
|
||||
// "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
|
||||
// });
|
||||
// }
|
||||
// } else {
|
||||
// console.log("Not Windows 7");
|
||||
// }
|
||||
// });
|
||||
|
||||
if (process.platform === "win32" && process.arch === "ia32") {
|
||||
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");
|
||||
|
@ -79,8 +79,9 @@
|
||||
<div class="modal-body">
|
||||
<input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
|
||||
<input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
|
||||
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
|
||||
<label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction" target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
|
||||
<label v-if="OS=='darwin'">{{`对于MacOS系统,EasySpider提供了两个不同的执行程序,分别为easyspider_executestage和easyspider_executestage_full,前者执行时加载速度较快,并提供了除OCR识别和数据去重以外的全部功能;后者则提供了包括OCR识别和数据去重在内的全部功能,但运行时加载速度较慢,需要等待2-10分钟才能执行程序,请根据自己的需求选择执行哪个程序。~For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.` | lang}}</label>
|
||||
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
|
||||
<textarea class="form-control" style="height:150px">cd {{easyspider_location}}
|
||||
{{command}} --config_folder "{{config_folder}}" --headless 0 --read_type remote --config_file_name config.json --saved_file_name </textarea>
|
||||
</div>
|
||||
@ -314,6 +315,7 @@
|
||||
config_folder: "",
|
||||
easyspider_location: "",
|
||||
mysql_config_path: "",
|
||||
OS: "win32",
|
||||
}, mounted() {
|
||||
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
|
||||
app.$data.user_data_folder = result.user_data_folder;
|
||||
@ -540,6 +542,7 @@
|
||||
|
||||
function changeCommand() {
|
||||
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
|
||||
app.$data.OS = OSInfo.version;
|
||||
if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){
|
||||
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
|
||||
} else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){
|
||||
|
1
ElectronJS/tasks/308.json
Normal file
1
ElectronJS/tasks/308.json
Normal file
@ -0,0 +1 @@
|
||||
{"id":308,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-23 14:21:24","update_time":"2023-12-23 14:23:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","befo
reJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
|
@ -44,14 +44,25 @@ import sys
|
||||
# import hashlib
|
||||
import time
|
||||
import requests
|
||||
from ddddocr import DdddOcr
|
||||
from multiprocessing import freeze_support
|
||||
freeze_support() # 防止无限死循环多开
|
||||
try:
|
||||
from ddddocr import DdddOcr
|
||||
import onnxruntime
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
except:
|
||||
print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
|
||||
print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
|
||||
time.sleep(2)
|
||||
from urllib.parse import urljoin
|
||||
from lxml import etree, html
|
||||
try:
|
||||
import pandas as pd
|
||||
except:
|
||||
print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
|
||||
print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
|
||||
time.sleep(2)
|
||||
|
||||
import onnxruntime
|
||||
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
import pandas as pd
|
||||
# import numpy
|
||||
# import pytesseract
|
||||
# import uuid
|
||||
@ -2185,8 +2196,6 @@ class BrowserThread(Thread):
|
||||
self.OUTPUT.append(line)
|
||||
|
||||
if __name__ == '__main__':
|
||||
from multiprocessing import freeze_support
|
||||
freeze_support() # 防止无限死循环多开
|
||||
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
|
||||
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
|
||||
config = {
|
||||
|
@ -1,7 +1,17 @@
|
||||
# 先打包一个不带ddddocr和pandas的版本,然后再打包一个带的版本,不带ddddocr和pandas的版本运行速度会快很多
|
||||
rm -r build
|
||||
rm -r dist
|
||||
pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn
|
||||
|
||||
rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
|
||||
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
|
||||
# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage
|
||||
|
||||
echo "With ddddocr and pandas"
|
||||
|
||||
# # 打包带ddddocr和pandas的版本
|
||||
rm -r build
|
||||
rm -r dist
|
||||
pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
|
||||
rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
|
||||
rm ../ElectronJS/easyspider_executestage
|
||||
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
|
||||
# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage
|
||||
rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
|
||||
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
|
34
ExecuteStage/package_size.py
Normal file
34
ExecuteStage/package_size.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""Print the on-disk size of every installed Python distribution, largest first."""
import os
import subprocess
import sys
from importlib import metadata
from pathlib import Path


def _tree_size(root):
    """Return the total size in bytes of all regular files below ``root``."""
    total = 0
    for path in root.rglob("*"):
        try:
            if path.is_file():
                total += path.stat().st_size
        except OSError:
            # The file vanished or is unreadable; leave it out of the total.
            continue
    return total


def _distribution_size(dist):
    """Return the size in bytes of the files that belong to ``dist``.

    The file list recorded in the distribution's metadata is used when it is
    available, so packages whose import directory differs from their
    distribution name are measured correctly. When no file list exists, fall
    back to a directory named after the distribution (with ``-`` replaced by
    ``_``) next to the metadata directory.
    """
    recorded = dist.files
    if recorded is None:
        name = dist.metadata.get("Name") or ""
        candidate = Path(dist.locate_file(name.replace("-", "_")))
        if name and candidate.is_dir():
            return _tree_size(candidate)
        return 0

    total = 0
    for entry in recorded:
        path = Path(dist.locate_file(entry))
        try:
            if path.is_file():
                total += path.stat().st_size
        except OSError:
            # Recorded but unreadable or missing on disk; skip it.
            continue
    return total


def collect_package_sizes(search_path=None):
    """Map each installed distribution name to its size in bytes.

    Args:
        search_path: Optional iterable of directories to scan instead of the
            interpreter's default search path.

    Returns:
        dict: ``{distribution name: size in bytes}``. When the same name is
        found more than once, the first occurrence is kept.
    """
    if search_path is None:
        found = metadata.distributions()
    else:
        found = metadata.distributions(path=list(search_path))

    sizes = {}
    for dist in found:
        name = dist.metadata.get("Name")
        if not name or name in sizes:
            continue
        sizes[name] = _distribution_size(dist)
    return sizes


def main():
    """Print one ``name: size MB`` line per distribution, largest first."""
    ranked = sorted(collect_package_sizes().items(), key=lambda item: item[1], reverse=True)
    for name, size in ranked:
        print(f"{name}: {size/1024/1024:.2f} MB")


if __name__ == "__main__":
    main()
|
@ -1,44 +0,0 @@
|
||||
# from lxml import etree
|
||||
|
||||
# # 解析HTML
|
||||
# html = """
|
||||
# <div>
|
||||
# 123
|
||||
# <ul class="list">
|
||||
# <li class="item-0">first item</li>
|
||||
# <li class="item-1"><a href="link2.html">second item</a></li>
|
||||
# </ul>
|
||||
# 456
|
||||
# <div></div>
|
||||
# 789
|
||||
# </div>
|
||||
# """
|
||||
# html = etree.HTML(html)
|
||||
# element = html.xpath("*")
|
||||
# direct_text = "/html/body/" + html[0][0].tag + "/text()"
|
||||
# all_text = "/html/body/" + html[0][0].tag + "//text()"
|
||||
# # 使用XPath选择元素
|
||||
# results = html.xpath(direct_text)
|
||||
# # print(results)
|
||||
# # 拼接所有文本内容并去掉两边的空白
|
||||
# text = ' '.join(result.strip() for result in results if result.strip())
|
||||
|
||||
# # 输出结果
|
||||
# print(text)
|
||||
|
||||
# results = html.xpath(all_text)
|
||||
# # print(results)
|
||||
# # 拼接所有文本内容并去掉两边的空白
|
||||
# text = ' '.join(result.strip() for result in results if result.strip())
|
||||
|
||||
# # 输出结果
|
||||
# print(text)
|
||||
|
||||
import re
|
||||
|
||||
def lowercase_xpath_tags(xpath):
    """Lower-case upper-case element names in an XPath expression.

    A token is rewritten only when it starts with a capital letter, consists
    solely of capitals and digits, and is immediately followed by ``[``,
    ``]``, ``/`` or the end of the string. Digits are allowed after the first
    letter so that names such as ``H1`` are lowered as a whole.

    Args:
        xpath: The XPath string to normalise.

    Returns:
        The XPath with matching tokens converted to lower case; everything
        else is left untouched.
    """
    return re.sub(
        r"[A-Z][A-Z0-9]*(?=[\[\]/]|$)",
        lambda match: match.group(0).lower(),
        xpath,
    )
|
||||
|
||||
# Demo output: the normalised sample XPath, a literal XPath, then an empty line.
print(
    lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL'),
    "//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]",
    "",
    sep="\n",
)
|
Loading…
x
Reference in New Issue
Block a user