Compatible for Windows

This commit is contained in:
naibo 2023-07-10 11:31:11 +08:00
parent 23dd4e4dd7
commit c31bc94dd0
6 changed files with 132 additions and 94 deletions

View File

@ -15,7 +15,7 @@ import time
import requests
from urllib.parse import urljoin
from lxml import etree
import undetected_chromedriver as uc
# import undetected_chromedriver as uc
from pynput.keyboard import Key, Listener
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
@ -42,8 +42,10 @@ import pytesseract
from PIL import Image
# import uuid
from threading import Thread, Event
from myChrome import MyChrome, MyUCChrome
from utils import check_pause, download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
from myChrome import MyChrome
if sys.platform != "darwin":
from myChrome import MyUCChrome
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
@ -279,7 +281,10 @@ class BrowserThread(Thread):
except:
self.Log('Time out after set seconds when scrolling. ')
self.recordLog('Time out after set seconds when scrolling')
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
for i in range(para["scrollCount"]):
self.Log("Wait for set second after screen scrolling")
@ -677,7 +682,10 @@ class BrowserThread(Thread):
# 切换历史记录等待:
self.Log("Change history back time or:",
node["parameters"]["historyWait"])
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
@ -722,7 +730,10 @@ class BrowserThread(Thread):
# time.sleep(2)
self.Log("Change history back time or:",
node["parameters"]["historyWait"])
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
except NoSuchElementException:
print("Loop element not found: ", path)
print("找不到循环元素: ", path)
@ -995,7 +1006,10 @@ class BrowserThread(Thread):
self.history["index"] = self.browser.execute_script(
"return history.length")
except TimeoutException:
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
self.history["index"] = self.browser.execute_script(
"return history.length")
else:
@ -1003,7 +1017,10 @@ class BrowserThread(Thread):
self.history["index"] = self.browser.execute_script(
"return history.length")
except TimeoutException:
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
self.history["index"] = self.browser.execute_script(
"return history.length")
# 如果打开了新窗口,切换到新窗口
@ -1275,7 +1292,10 @@ class BrowserThread(Thread):
self.Log('Time out after set seconds when getting data')
self.recordLog(
'Time out after set seconds when getting data')
self.browser.execute_script('window.stop()')
try:
self.browser.execute_script('window.stop()')
except:
pass
if p["relative"]: # 是否相对xpath
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身不需要二次查找
element = loopElement
@ -1327,8 +1347,8 @@ class BrowserThread(Thread):
if __name__ == '__main__':
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# from multiprocessing import freeze_support
# freeze_support() # 防止无限死循环多开
config = {
"id": [0],
"saved_file_name": "",
@ -1361,6 +1381,9 @@ if __name__ == '__main__':
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# driver_path = os.getcwd()+ "/chromedriver_mac64"
print(driver_path)
if c.config_folder == "":
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider",
os.getcwd()+"/EasySpider")
@ -1425,6 +1448,7 @@ if __name__ == '__main__':
try:
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
config = json.load(f)
print("Config file path: " + c.config_folder + c.config_file_name)
absolute_user_data_folder = config["absolute_user_data_folder"]
print("\nAbsolute_user_data_folder:",
absolute_user_data_folder, "\n")
@ -1501,10 +1525,15 @@ if __name__ == '__main__':
browser_t = MyChrome(
options=options, chrome_options=option, executable_path=driver_path)
elif cloudflare == 1:
browser_t = MyUCChrome(
if sys.platform != "darwin":
browser_t = MyUCChrome(
options=options, chrome_options=option, driver_executable_path=driver_path)
print("Pass Cloudflare Mode")
print("过Cloudflare验证模式")
print("Pass Cloudflare Mode")
print("过Cloudflare验证模式")
else:
print("Not support Cloudflare Mode on MacOS")
print("MacOS不支持Cloudflare验证模式")
sys.exit()
event = Event()
event.set()
thread = BrowserThread(browser_t, i, service,

View File

@ -12,7 +12,6 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import undetected_chromedriver_ES as uc
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
@ -89,77 +88,80 @@ class MyChrome(webdriver.Chrome):
raise NoSuchElementException
else:
return super().find_elements(by=by, value=value)
class MyUCChrome(uc.Chrome):
import sys
if sys.platform != "darwin": # MacOS不支持Cloudflare
import undetected_chromedriver_ES as uc
def __init__(self, *args, **kwargs):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
class MyUCChrome(uc.Chrome):
def find_element(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.iframe_env = False
if iframe:
# 获取所有的 iframe
try:
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
except Exception as e:
print(e)
find_element = False
# 遍历所有的 iframe 并点击里面的元素
for iframe in iframes:
# 切换到 iframe
def __init__(self, *args, **kwargs):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
def find_element(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
super().switch_to.frame(iframe)
self.iframe_env = True
try:
# 在 iframe 中查找并点击元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value)
find_element = True
except:
print("No such element found in the iframe")
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
return element
if not find_element:
raise NoSuchElementException
else:
return super().find_element(by=by, value=value)
def find_elements(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.iframe_env = False
if iframe:
# 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False
# 遍历所有的 iframe 并点击里面的元素
for iframe in iframes:
# 切换到 iframe
self.iframe_env = False
if iframe:
# 获取所有的 iframe
try:
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
except Exception as e:
print(e)
find_element = False
# 遍历所有的 iframe 并点击里面的元素
for iframe in iframes:
# 切换到 iframe
super().switch_to.default_content()
super().switch_to.frame(iframe)
self.iframe_env = True
# 在 iframe 中查找并点击元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value)
if len(elements) > 0:
try:
# 在 iframe 中查找并点击元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value)
find_element = True
except:
print("No such element found in the iframe")
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
return elements
except:
print("No such element found in the iframe")
if not find_element:
raise NoSuchElementException
else:
return super().find_elements(by=by, value=value)
return element
if not find_element:
raise NoSuchElementException
else:
return super().find_element(by=by, value=value)
def find_elements(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.iframe_env = False
if iframe:
# 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False
# 遍历所有的 iframe 并点击里面的元素
for iframe in iframes:
# 切换到 iframe
try:
super().switch_to.default_content()
super().switch_to.frame(iframe)
self.iframe_env = True
# 在 iframe 中查找并点击元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value)
if len(elements) > 0:
find_element = True
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
return elements
except:
print("No such element found in the iframe")
if not find_element:
raise NoSuchElementException
else:
return super().find_elements(by=by, value=value)

View File

@ -4,10 +4,11 @@ import csv
import datetime
import json
import os
import sys
import re
import time
import uuid
import keyboard
# import keyboard
from openpyxl import Workbook, load_workbook
import requests
from urllib.parse import urlparse
@ -75,20 +76,20 @@ def on_release_creator(event, press_time):
return on_release
def check_pause(key, event):
    """Poll the keyboard and flip ``event`` whenever ``key`` is held down.

    Loops forever: once per second it asks ``keyboard.is_pressed(key)``.
    When the key is down, a cleared event is set (announcing that the task
    is running) and a set event is cleared (announcing that the task is
    paused).  Per the original comments, the event gates a worker thread:
    set lets it continue, cleared makes it pause.

    Args:
        key: Key identifier handed unchanged to ``keyboard.is_pressed``.
        event: Event-like object providing ``is_set()``, ``set()`` and
            ``clear()`` (presumably a ``threading.Event`` -- confirm at the
            call site).

    This function never returns.
    """
    while True:
        if keyboard.is_pressed(key):
            # Query the state through the public is_set() accessor instead
            # of reaching into the event's private ``_flag`` attribute.
            if not event.is_set():
                print("任务执行中长按p键暂停执行。")
                print("Task is running, long press 'p' to pause.")
                # Set the event so the waiting worker can continue.
                event.set()
            else:
                print("任务已暂停长按p键继续执行...")
                print("Task paused, press 'p' to continue...")
                # Clear the event so the worker pauses.
                event.clear()
        time.sleep(1)  # poll once per second
# def check_pause(key, event):
# while True:
# if keyboard.is_pressed(key): # 按下p键暂停程序
# if event._flag == False:
# print("任务执行中长按p键暂停执行。")
# print("Task is running, long press 'p' to pause.")
# # 设置Event的值为True使得线程b可以继续执行
# event.set()
# else:
# # 设置Event的值为False使得线程b暂停执行
# print("任务已暂停长按p键继续执行...")
# print("Task paused, press 'p' to continue...")
# event.clear()
# time.sleep(1) # 每秒检查一次
def download_image(url, save_directory):
@ -219,16 +220,22 @@ class myMySQL:
def __init__(self, config_file="mysql_config.json"):
# 读取配置文件
try:
if sys.platform == "darwin":
if config_file.find("./") >= 0:
config_file = config_file.replace("./", "")
config_file = os.path.expanduser("~/Library/Application Support/EasySpider/" + config_file)
print("MySQL config file path: ", config_file)
with open(config_file, 'r') as f:
config = json.load(f)
host = config["host"]
port = config["port"]
user = config["user"]
user = config["username"]
passwd = config["password"]
db = config["database"]
except:
except Exception as e:
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
print(e)
try:
self.conn = pymysql.connect(
host=host, port=port, user=user, passwd=passwd, db=db)

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.