mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-20 10:05:00 +08:00
MacOS version v0.3.5
This commit is contained in:
parent
35b9494d42
commit
8c5267d66c
1
.temp_to_pub/.gitignore
vendored
1
.temp_to_pub/.gitignore
vendored
@ -5,6 +5,7 @@ EasySpider
|
|||||||
EasySpider.app/
|
EasySpider.app/
|
||||||
EasySpider_windows_x64/user_data
|
EasySpider_windows_x64/user_data
|
||||||
*.tmp
|
*.tmp
|
||||||
|
*.tar.gz
|
||||||
*.7z*
|
*.7z*
|
||||||
config.json
|
config.json
|
||||||
mysql_config.json
|
mysql_config.json
|
||||||
|
@ -15,6 +15,8 @@ import time
|
|||||||
import requests
|
import requests
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
# import undetected_chromedriver as uc
|
||||||
|
from pynput.keyboard import Key, Listener
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
@ -29,7 +31,6 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
from selenium.webdriver.support.ui import Select
|
from selenium.webdriver.support.ui import Select
|
||||||
from selenium.webdriver import ActionChains
|
from selenium.webdriver import ActionChains
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
import undetected_chromedriver as uc
|
|
||||||
import random
|
import random
|
||||||
# import pandas as pd
|
# import pandas as pd
|
||||||
from openpyxl import load_workbook, Workbook
|
from openpyxl import load_workbook, Workbook
|
||||||
@ -41,8 +42,10 @@ import pytesseract
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
# import uuid
|
# import uuid
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from myChrome import MyChrome, MyUCChrome
|
from myChrome import MyChrome
|
||||||
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press, on_release_creator, write_to_csv, write_to_excel
|
if sys.platform != "darwin":
|
||||||
|
from myChrome import MyUCChrome
|
||||||
|
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
@ -1326,6 +1329,8 @@ class BrowserThread(Thread):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
# from multiprocessing import freeze_support
|
||||||
|
# freeze_support() # 防止无限死循环多开
|
||||||
config = {
|
config = {
|
||||||
"id": [0],
|
"id": [0],
|
||||||
"saved_file_name": "",
|
"saved_file_name": "",
|
||||||
@ -1358,6 +1363,9 @@ if __name__ == '__main__':
|
|||||||
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||||
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
||||||
print(driver_path)
|
print(driver_path)
|
||||||
|
if c.config_folder == "":
|
||||||
|
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
|
||||||
|
# print("Config folder for MacOS:", c.config_folder)
|
||||||
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
||||||
print("Finding chromedriver in EasySpider",
|
print("Finding chromedriver in EasySpider",
|
||||||
os.getcwd()+"/EasySpider")
|
os.getcwd()+"/EasySpider")
|
||||||
@ -1367,16 +1375,19 @@ if __name__ == '__main__':
|
|||||||
driver_path = os.path.join(
|
driver_path = os.path.join(
|
||||||
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
|
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
|
||||||
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
|
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
|
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
|
||||||
options.binary_location = os.path.join(
|
options.binary_location = os.path.join(
|
||||||
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
|
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
|
||||||
driver_path = os.path.join(
|
driver_path = os.path.join(
|
||||||
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
|
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
|
||||||
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
|
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
|
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
|
||||||
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
|
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
|
||||||
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
|
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
|
||||||
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
|
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
|
||||||
else:
|
else:
|
||||||
print("Unsupported platform")
|
print("Unsupported platform")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
@ -1419,6 +1430,7 @@ if __name__ == '__main__':
|
|||||||
try:
|
try:
|
||||||
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
|
print("Config file path: " + c.config_folder + c.config_file_name)
|
||||||
absolute_user_data_folder = config["absolute_user_data_folder"]
|
absolute_user_data_folder = config["absolute_user_data_folder"]
|
||||||
print("\nAbsolute_user_data_folder:",
|
print("\nAbsolute_user_data_folder:",
|
||||||
absolute_user_data_folder, "\n")
|
absolute_user_data_folder, "\n")
|
||||||
@ -1428,6 +1440,9 @@ if __name__ == '__main__':
|
|||||||
option.add_argument(
|
option.add_argument(
|
||||||
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
||||||
option.add_argument("--profile-directory=Default")
|
option.add_argument("--profile-directory=Default")
|
||||||
|
options.add_argument(
|
||||||
|
f'--user-data-dir={absolute_user_data_folder}') # TMALL 反扒
|
||||||
|
options.add_argument("--profile-directory=Default")
|
||||||
|
|
||||||
if c.headless:
|
if c.headless:
|
||||||
print("Headless mode")
|
print("Headless mode")
|
||||||
@ -1444,7 +1459,7 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
threads = []
|
threads = []
|
||||||
for i in c.id:
|
for i in c.id:
|
||||||
print(options)
|
# print(options)
|
||||||
print("id: ", i)
|
print("id: ", i)
|
||||||
if c.read_type == "remote":
|
if c.read_type == "remote":
|
||||||
print("remote")
|
print("remote")
|
||||||
@ -1492,10 +1507,15 @@ if __name__ == '__main__':
|
|||||||
browser_t = MyChrome(
|
browser_t = MyChrome(
|
||||||
options=options, chrome_options=option, executable_path=driver_path)
|
options=options, chrome_options=option, executable_path=driver_path)
|
||||||
elif cloudflare == 1:
|
elif cloudflare == 1:
|
||||||
|
if sys.platform != "darwin":
|
||||||
browser_t = MyUCChrome(
|
browser_t = MyUCChrome(
|
||||||
options=options, chrome_options=option, executable_path=driver_path)
|
options=options, chrome_options=option, driver_executable_path=driver_path)
|
||||||
print("Pass Cloudflare Mode")
|
print("Pass Cloudflare Mode")
|
||||||
print("过Cloudflare验证模式")
|
print("过Cloudflare验证模式")
|
||||||
|
else:
|
||||||
|
print("Not support Cloudflare Mode on MacOS")
|
||||||
|
print("MacOS不支持Cloudflare验证模式")
|
||||||
|
sys.exit()
|
||||||
event = Event()
|
event = Event()
|
||||||
event.set()
|
event.set()
|
||||||
thread = BrowserThread(browser_t, i, service,
|
thread = BrowserThread(browser_t, i, service,
|
||||||
@ -1505,26 +1525,33 @@ if __name__ == '__main__':
|
|||||||
thread.start()
|
thread.start()
|
||||||
# Set the pause operation
|
# Set the pause operation
|
||||||
# if sys.platform != "linux":
|
# if sys.platform != "linux":
|
||||||
|
# time.sleep(3)
|
||||||
|
# print("\n\n----------------------------------")
|
||||||
|
# print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
|
||||||
|
# print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
|
||||||
|
# print("----------------------------------\n\n")
|
||||||
# Thread(target=check_pause, args=("p", event)).start()
|
# Thread(target=check_pause, args=("p", event)).start()
|
||||||
# else:
|
# else:
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
press_time = {"duration": 0, "is_pressed": False}
|
||||||
print("\n\n----------------------------------")
|
print("\n\n----------------------------------")
|
||||||
print("正在运行任务,按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次按p键。")
|
print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
|
||||||
print("Running task, press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please press 'p' again.")
|
print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
|
||||||
print("----------------------------------\n\n")
|
print("----------------------------------\n\n")
|
||||||
# 使用监听器监听键盘输入
|
# 使用监听器监听键盘输入
|
||||||
try:
|
try:
|
||||||
from pynput.keyboard import Key, Listener
|
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:
|
||||||
with Listener(on_press=on_press, on_release=on_release_creator(event)) as listener:
|
|
||||||
listener.join()
|
listener.join()
|
||||||
except:
|
except:
|
||||||
print("您的操作系统不支持暂停功能。")
|
pass
|
||||||
print("Your operating system does not support the pause function.")
|
# print("您的操作系统不支持暂停功能。")
|
||||||
|
# print("Your operating system does not support the pause function.")
|
||||||
|
|
||||||
|
|
||||||
|
# print("线程长度:", len(threads) )
|
||||||
|
|
||||||
for thread in threads:
|
for thread in threads:
|
||||||
|
print()
|
||||||
thread.join()
|
thread.join()
|
||||||
|
|
||||||
for thread in threads:
|
for thread in threads:
|
||||||
|
@ -12,7 +12,6 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
from selenium.webdriver.support.ui import Select
|
from selenium.webdriver.support.ui import Select
|
||||||
from selenium.webdriver import ActionChains
|
from selenium.webdriver import ActionChains
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
import undetected_chromedriver as uc
|
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
@ -90,8 +89,11 @@ class MyChrome(webdriver.Chrome):
|
|||||||
else:
|
else:
|
||||||
return super().find_elements(by=by, value=value)
|
return super().find_elements(by=by, value=value)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
if sys.platform != "darwin": # MacOS不支持Cloudflare
|
||||||
|
import undetected_chromedriver_ES as uc
|
||||||
|
|
||||||
class MyUCChrome(uc.Chrome):
|
class MyUCChrome(uc.Chrome):
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.iframe_env = False # 现在的环境是root还是iframe
|
self.iframe_env = False # 现在的环境是root还是iframe
|
||||||
|
@ -4,6 +4,7 @@ import csv
|
|||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
@ -24,26 +25,56 @@ def is_valid_url(url):
|
|||||||
def lowercase_tags_in_xpath(xpath):
|
def lowercase_tags_in_xpath(xpath):
|
||||||
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
|
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
|
||||||
|
|
||||||
def on_release_creator(event):
|
|
||||||
def on_release(key):
|
def on_press_creator(press_time, event):
|
||||||
|
def on_press(key):
|
||||||
try:
|
try:
|
||||||
if key.char == 'p': # 当按下esc键时,退出监听
|
if key.char == 'p':
|
||||||
|
if press_time["is_pressed"] == False: # 没按下p键时,记录按下p键的时间
|
||||||
|
press_time["duration"] = time.time()
|
||||||
|
press_time["is_pressed"] = True
|
||||||
|
else: # 按下p键时,判断按下p键的时间是否超过2.5秒
|
||||||
|
duration = time.time() - press_time["duration"]
|
||||||
|
if duration > 2:
|
||||||
if event._flag == False:
|
if event._flag == False:
|
||||||
print("任务执行中,按p键暂停执行。")
|
print("任务执行中,长按p键暂停执行。")
|
||||||
print("Task is running, press 'p' to pause.")
|
print("Task is running, long press 'p' to pause.")
|
||||||
# 设置Event的值为True,使得线程b可以继续执行
|
# 设置Event的值为True,使得线程b可以继续执行
|
||||||
event.set()
|
event.set()
|
||||||
else:
|
else:
|
||||||
# 设置Event的值为False,使得线程b暂停执行
|
# 设置Event的值为False,使得线程b暂停执行
|
||||||
print("任务已暂停,按p键继续执行...")
|
print("任务已暂停,长按p键继续执行...")
|
||||||
print("Task paused, press 'p' to continue...")
|
print("Task paused, long press 'p' to continue...")
|
||||||
event.clear()
|
event.clear()
|
||||||
|
press_time["duration"] = time.time()
|
||||||
|
press_time["is_pressed"] = False
|
||||||
|
# print("按下p键时间:", press_time["duration"])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return on_press
|
||||||
|
|
||||||
|
def on_release_creator(event, press_time):
|
||||||
|
def on_release(key):
|
||||||
|
try:
|
||||||
|
# duration = time.time() - press_time["duration"]
|
||||||
|
# # print("松开p键时间:", time.time(), "Duration: ", duration)
|
||||||
|
# if duration > 2.5 and key.char == 'p':
|
||||||
|
# if event._flag == False:
|
||||||
|
# print("任务执行中,按p键暂停执行。")
|
||||||
|
# print("Task is running, press 'p' to pause.")
|
||||||
|
# # 设置Event的值为True,使得线程b可以继续执行
|
||||||
|
# event.set()
|
||||||
|
# else:
|
||||||
|
# # 设置Event的值为False,使得线程b暂停执行
|
||||||
|
# print("任务已暂停,按p键继续执行...")
|
||||||
|
# print("Task paused, press 'p' to continue...")
|
||||||
|
# event.clear()
|
||||||
|
# press_time["duration"] = time.time()
|
||||||
|
press_time["is_pressed"] = False
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return on_release
|
return on_release
|
||||||
|
|
||||||
def on_press(key):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# def check_pause(key, event):
|
# def check_pause(key, event):
|
||||||
# while True:
|
# while True:
|
||||||
@ -189,16 +220,22 @@ class myMySQL:
|
|||||||
def __init__(self, config_file="mysql_config.json"):
|
def __init__(self, config_file="mysql_config.json"):
|
||||||
# 读取配置文件
|
# 读取配置文件
|
||||||
try:
|
try:
|
||||||
|
if sys.platform == "darwin":
|
||||||
|
if config_file.find("./") >= 0:
|
||||||
|
config_file = config_file.replace("./", "")
|
||||||
|
config_file = os.path.expanduser("~/Library/Application Support/EasySpider/" + config_file)
|
||||||
|
print("MySQL config file path: ", config_file)
|
||||||
with open(config_file, 'r') as f:
|
with open(config_file, 'r') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
host = config["host"]
|
host = config["host"]
|
||||||
port = config["port"]
|
port = config["port"]
|
||||||
user = config["user"]
|
user = config["username"]
|
||||||
passwd = config["password"]
|
passwd = config["password"]
|
||||||
db = config["database"]
|
db = config["database"]
|
||||||
except:
|
except Exception as e:
|
||||||
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
|
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
|
||||||
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
|
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
|
||||||
|
print(e)
|
||||||
try:
|
try:
|
||||||
self.conn = pymysql.connect(
|
self.conn = pymysql.connect(
|
||||||
host=host, port=port, user=user, passwd=passwd, db=db)
|
host=host, port=port, user=user, passwd=passwd, db=db)
|
||||||
|
2
.temp_to_pub/compress.cmd
Normal file → Executable file
2
.temp_to_pub/compress.cmd
Normal file → Executable file
@ -1 +1 @@
|
|||||||
python compress.py
|
python3 compress.py
|
||||||
|
@ -45,7 +45,10 @@ def compress_folder_to_7z_split(folder_path, output_file):
|
|||||||
try:
|
try:
|
||||||
subprocess.call(["7z", "a", "-v95m", output_file, folder_path])
|
subprocess.call(["7z", "a", "-v95m", output_file, folder_path])
|
||||||
except:
|
except:
|
||||||
|
try:
|
||||||
subprocess.call(["7za", "a", "-v95m", output_file, folder_path])
|
subprocess.call(["7za", "a", "-v95m", output_file, folder_path])
|
||||||
|
except:
|
||||||
|
subprocess.call(["7zz", "a", "-v95m", output_file, folder_path])
|
||||||
|
|
||||||
easyspider_version = "0.3.5"
|
easyspider_version = "0.3.5"
|
||||||
|
|
||||||
@ -104,5 +107,11 @@ if __name__ == "__main__":
|
|||||||
subprocess.call(["tar", "-Jcvf", file_name, "./EasySpider_Linux_x64"])
|
subprocess.call(["tar", "-Jcvf", file_name, "./EasySpider_Linux_x64"])
|
||||||
print(f"Compress {file_name} successfully!")
|
print(f"Compress {file_name} successfully!")
|
||||||
elif sys.platform == "darwin" and platform.architecture()[0] == "64bit":
|
elif sys.platform == "darwin" and platform.architecture()[0] == "64bit":
|
||||||
pass
|
file_name = f"EasySpider_{easyspider_version}_MacOS_all_arch.tar.gz"
|
||||||
|
if os.path.exists("./EasySpider_MacOS_all_arch/Data"):
|
||||||
|
shutil.rmtree("./EasySpider_MacOS_all_arch/Data")
|
||||||
|
os.mkdir("./EasySpider_MacOS_all_arch/Data")
|
||||||
|
subprocess.call(["tar", "-zcvf", file_name, "./EasySpider_MacOS_all_arch"])
|
||||||
|
subprocess.call(["7zz", "a", "-v95m", file_name.replace(".tar.gz", ".7z"), file_name, "请继续解压EasySpider_MacOS_all_arch.tar.gz使用.txt"])
|
||||||
|
print(f"Compress {file_name} successfully!")
|
||||||
|
|
||||||
|
1
.temp_to_pub/请继续解压EasySpider_MacOS_all_arch.tar.gz使用.txt
Normal file
1
.temp_to_pub/请继续解压EasySpider_MacOS_all_arch.tar.gz使用.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
请继续解压.tar.gz文件以使用易采集。
|
BIN
ElectronJS/EasySpider_en.crx
Normal file
BIN
ElectronJS/EasySpider_en.crx
Normal file
Binary file not shown.
BIN
ElectronJS/EasySpider_zh.crx
Normal file
BIN
ElectronJS/EasySpider_zh.crx
Normal file
Binary file not shown.
@ -1 +1 @@
|
|||||||
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
|
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data1","debug":false,"mysql_config_path":"/Users/naibowang/Documents/EasySpider/ElectronJS/mysql_config.json","absolute_user_data_folder":"/Users/naibowang/Documents/EasySpider/ElectronJS/user_data1"}
|
@ -324,7 +324,9 @@ async function beginInvoke(msg, ws) {
|
|||||||
config.absolute_user_data_folder = user_data_folder_path;
|
config.absolute_user_data_folder = user_data_folder_path;
|
||||||
fs.writeFileSync(path.join(task_server.getDir(), "config.json"), JSON.stringify(config));
|
fs.writeFileSync(path.join(task_server.getDir(), "config.json"), JSON.stringify(config));
|
||||||
}
|
}
|
||||||
|
if(msg.message.mysql_config_path != "-1"){
|
||||||
config.mysql_config_path = msg.message.mysql_config_path;
|
config.mysql_config_path = msg.message.mysql_config_path;
|
||||||
|
}
|
||||||
fs.writeFileSync(path.join(task_server.getDir(), "config.json"), JSON.stringify(config));
|
fs.writeFileSync(path.join(task_server.getDir(), "config.json"), JSON.stringify(config));
|
||||||
// child('Chrome/easyspider_executestage.exe', parameters, function(err,stdout, stderr) {
|
// child('Chrome/easyspider_executestage.exe', parameters, function(err,stdout, stderr) {
|
||||||
// console.log(stdout);
|
// console.log(stdout);
|
||||||
|
@ -23,4 +23,4 @@ cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS_a
|
|||||||
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
||||||
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
||||||
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
cp ../ExecuteStage/requirements.txt ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
||||||
cp -Rf ../undetected_chromedriver_ES ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
cp -Rf ../ExecuteStage/undetected_chromedriver_ES ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
|
||||||
|
@ -563,7 +563,7 @@
|
|||||||
<label>Is it an extreme anti-scraping website like Cloudflare?</label>
|
<label>Is it an extreme anti-scraping website like Cloudflare?</label>
|
||||||
<select id="cloudflare" name="cloudflare" class="form-control">
|
<select id="cloudflare" name="cloudflare" class="form-control">
|
||||||
<option value=0>No</option>
|
<option value=0>No</option>
|
||||||
<option value=1>Yes</option>
|
<option value=1>Yes (Not support on MacOS, unless compile by yourself)</option>
|
||||||
</select>
|
</select>
|
||||||
<label>Browser Emulation Type:</label>
|
<label>Browser Emulation Type:</label>
|
||||||
<select id="environment" name="environment" class="form-control">
|
<select id="environment" name="environment" class="form-control">
|
||||||
|
@ -563,7 +563,7 @@
|
|||||||
<label>是否为Cloudflare等极端反爬网站(<a href="https://www.bilibili.com/video/BV1Ph4y1E7R9/" target="_blank">查看Cloudflare设计和执行教程</a>):</label>
|
<label>是否为Cloudflare等极端反爬网站(<a href="https://www.bilibili.com/video/BV1Ph4y1E7R9/" target="_blank">查看Cloudflare设计和执行教程</a>):</label>
|
||||||
<select id="cloudflare" name="cloudflare" class="form-control">
|
<select id="cloudflare" name="cloudflare" class="form-control">
|
||||||
<option value = 0>否</option>
|
<option value = 0>否</option>
|
||||||
<option value = 1>是</option>
|
<option value = 1>是(MacOS不支持直接运行,但可以自行编译)</option>
|
||||||
</select>
|
</select>
|
||||||
<label>浏览器模拟类型:</label>
|
<label>浏览器模拟类型:</label>
|
||||||
<select id="environment" name="environment" class="form-control">
|
<select id="environment" name="environment" class="form-control">
|
||||||
|
@ -209,7 +209,7 @@
|
|||||||
<input type="text" class="form-control" v-model="user_data_folder"></input>
|
<input type="text" class="form-control" v-model="user_data_folder"></input>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group" style="margin-top: 10px" v-if="task.outputFormat=='mysql'">
|
<div class="form-group" style="margin-top: 10px" v-if="task.outputFormat=='mysql'">
|
||||||
<label>{{"MySQL configuration file Path:~MySQL配置文件路径:" | lang}}</label>
|
<label>{{"MySQL configuration file Path, relative to this folder:~MySQL配置文件路径,路径相对此文件夹:" | lang}} {{config_folder}}</label>
|
||||||
<input type="text" class="form-control" v-model="mysql_config_path"></input>
|
<input type="text" class="form-control" v-model="mysql_config_path"></input>
|
||||||
</div>
|
</div>
|
||||||
</form>
|
</form>
|
||||||
@ -485,13 +485,23 @@
|
|||||||
ws.onopen = function () {
|
ws.onopen = function () {
|
||||||
// Web Socket 已连接上,使用 send() 方法发送数据
|
// Web Socket 已连接上,使用 send() 方法发送数据
|
||||||
console.log("Connected");
|
console.log("Connected");
|
||||||
message = {
|
let message = {
|
||||||
type: 0, //消息类型,0代表链接操作
|
type: 0, //消息类型,0代表链接操作
|
||||||
message: {
|
message: {
|
||||||
id: 1, //socket id
|
id: 1, //socket id
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
this.send(JSON.stringify(message));
|
this.send(JSON.stringify(message));
|
||||||
|
message = { //显示flowchart
|
||||||
|
type: 5, //消息类型,调用执行程序
|
||||||
|
message: {
|
||||||
|
"id": -1,
|
||||||
|
"user_data_folder": "",
|
||||||
|
"mysql_config_path": "-1",
|
||||||
|
"execute_type": 1,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
this.send(JSON.stringify(message));
|
||||||
};
|
};
|
||||||
ws.onmessage = function(message){
|
ws.onmessage = function(message){
|
||||||
message = JSON.parse(message.data);
|
message = JSON.parse(message.data);
|
||||||
|
File diff suppressed because one or more lines are too long
@ -15,7 +15,7 @@ import time
|
|||||||
import requests
|
import requests
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import undetected_chromedriver as uc
|
# import undetected_chromedriver as uc
|
||||||
from pynput.keyboard import Key, Listener
|
from pynput.keyboard import Key, Listener
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
@ -42,7 +42,9 @@ import pytesseract
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
# import uuid
|
# import uuid
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from myChrome import MyChrome, MyUCChrome
|
from myChrome import MyChrome
|
||||||
|
if sys.platform != "darwin":
|
||||||
|
from myChrome import MyUCChrome
|
||||||
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
|
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
@ -1327,8 +1329,8 @@ class BrowserThread(Thread):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from multiprocessing import freeze_support
|
# from multiprocessing import freeze_support
|
||||||
freeze_support() # 防止无限死循环多开
|
# freeze_support() # 防止无限死循环多开
|
||||||
config = {
|
config = {
|
||||||
"id": [0],
|
"id": [0],
|
||||||
"saved_file_name": "",
|
"saved_file_name": "",
|
||||||
@ -1361,6 +1363,9 @@ if __name__ == '__main__':
|
|||||||
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||||
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
||||||
print(driver_path)
|
print(driver_path)
|
||||||
|
if c.config_folder == "":
|
||||||
|
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
|
||||||
|
# print("Config folder for MacOS:", c.config_folder)
|
||||||
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
||||||
print("Finding chromedriver in EasySpider",
|
print("Finding chromedriver in EasySpider",
|
||||||
os.getcwd()+"/EasySpider")
|
os.getcwd()+"/EasySpider")
|
||||||
@ -1425,6 +1430,7 @@ if __name__ == '__main__':
|
|||||||
try:
|
try:
|
||||||
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
|
print("Config file path: " + c.config_folder + c.config_file_name)
|
||||||
absolute_user_data_folder = config["absolute_user_data_folder"]
|
absolute_user_data_folder = config["absolute_user_data_folder"]
|
||||||
print("\nAbsolute_user_data_folder:",
|
print("\nAbsolute_user_data_folder:",
|
||||||
absolute_user_data_folder, "\n")
|
absolute_user_data_folder, "\n")
|
||||||
@ -1501,13 +1507,15 @@ if __name__ == '__main__':
|
|||||||
browser_t = MyChrome(
|
browser_t = MyChrome(
|
||||||
options=options, chrome_options=option, executable_path=driver_path)
|
options=options, chrome_options=option, executable_path=driver_path)
|
||||||
elif cloudflare == 1:
|
elif cloudflare == 1:
|
||||||
if sys.platform == "linux":
|
if sys.platform != "darwin":
|
||||||
import ssl
|
|
||||||
ssl._create_default_https_context = ssl._create_unverified_context # 忽略证书验证
|
|
||||||
browser_t = MyUCChrome(
|
browser_t = MyUCChrome(
|
||||||
options=options, chrome_options=option, driver_executable_path=driver_path)
|
options=options, chrome_options=option, driver_executable_path=driver_path)
|
||||||
print("Pass Cloudflare Mode")
|
print("Pass Cloudflare Mode")
|
||||||
print("过Cloudflare验证模式")
|
print("过Cloudflare验证模式")
|
||||||
|
else:
|
||||||
|
print("Not support Cloudflare Mode on MacOS")
|
||||||
|
print("MacOS不支持Cloudflare验证模式")
|
||||||
|
sys.exit()
|
||||||
event = Event()
|
event = Event()
|
||||||
event.set()
|
event.set()
|
||||||
thread = BrowserThread(browser_t, i, service,
|
thread = BrowserThread(browser_t, i, service,
|
||||||
|
@ -12,7 +12,6 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
from selenium.webdriver.support.ui import Select
|
from selenium.webdriver.support.ui import Select
|
||||||
from selenium.webdriver import ActionChains
|
from selenium.webdriver import ActionChains
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
import undetected_chromedriver_ES as uc
|
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
@ -90,8 +89,11 @@ class MyChrome(webdriver.Chrome):
|
|||||||
else:
|
else:
|
||||||
return super().find_elements(by=by, value=value)
|
return super().find_elements(by=by, value=value)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
if sys.platform != "darwin": # MacOS不支持Cloudflare
|
||||||
|
import undetected_chromedriver_ES as uc
|
||||||
|
|
||||||
class MyUCChrome(uc.Chrome):
|
class MyUCChrome(uc.Chrome):
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.iframe_env = False # 现在的环境是root还是iframe
|
self.iframe_env = False # 现在的环境是root还是iframe
|
||||||
|
@ -4,6 +4,7 @@ import csv
|
|||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
@ -219,6 +220,11 @@ class myMySQL:
|
|||||||
def __init__(self, config_file="mysql_config.json"):
|
def __init__(self, config_file="mysql_config.json"):
|
||||||
# 读取配置文件
|
# 读取配置文件
|
||||||
try:
|
try:
|
||||||
|
if sys.platform == "darwin":
|
||||||
|
if config_file.find("./") >= 0:
|
||||||
|
config_file = config_file.replace("./", "")
|
||||||
|
config_file = os.path.expanduser("~/Library/Application Support/EasySpider/" + config_file)
|
||||||
|
print("MySQL config file path: ", config_file)
|
||||||
with open(config_file, 'r') as f:
|
with open(config_file, 'r') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
host = config["host"]
|
host = config["host"]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user