mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-19 18:59:52 +08:00
Compatible for Windows
This commit is contained in:
parent
23dd4e4dd7
commit
c31bc94dd0
@ -15,7 +15,7 @@ import time
|
|||||||
import requests
|
import requests
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import undetected_chromedriver as uc
|
# import undetected_chromedriver as uc
|
||||||
from pynput.keyboard import Key, Listener
|
from pynput.keyboard import Key, Listener
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
@ -42,8 +42,10 @@ import pytesseract
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
# import uuid
|
# import uuid
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from myChrome import MyChrome, MyUCChrome
|
from myChrome import MyChrome
|
||||||
from utils import check_pause, download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
|
if sys.platform != "darwin":
|
||||||
|
from myChrome import MyUCChrome
|
||||||
|
from utils import download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, write_to_csv, write_to_excel
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
@ -279,7 +281,10 @@ class BrowserThread(Thread):
|
|||||||
except:
|
except:
|
||||||
self.Log('Time out after set seconds when scrolling. ')
|
self.Log('Time out after set seconds when scrolling. ')
|
||||||
self.recordLog('Time out after set seconds when scrolling')
|
self.recordLog('Time out after set seconds when scrolling')
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
|
if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
|
||||||
for i in range(para["scrollCount"]):
|
for i in range(para["scrollCount"]):
|
||||||
self.Log("Wait for set second after screen scrolling")
|
self.Log("Wait for set second after screen scrolling")
|
||||||
@ -677,7 +682,10 @@ class BrowserThread(Thread):
|
|||||||
# 切换历史记录等待:
|
# 切换历史记录等待:
|
||||||
self.Log("Change history back time or:",
|
self.Log("Change history back time or:",
|
||||||
node["parameters"]["historyWait"])
|
node["parameters"]["historyWait"])
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
|
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
|
||||||
output = self.execute_code(int(
|
output = self.execute_code(int(
|
||||||
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
|
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
|
||||||
@ -722,7 +730,10 @@ class BrowserThread(Thread):
|
|||||||
# time.sleep(2)
|
# time.sleep(2)
|
||||||
self.Log("Change history back time or:",
|
self.Log("Change history back time or:",
|
||||||
node["parameters"]["historyWait"])
|
node["parameters"]["historyWait"])
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
print("Loop element not found: ", path)
|
print("Loop element not found: ", path)
|
||||||
print("找不到循环元素: ", path)
|
print("找不到循环元素: ", path)
|
||||||
@ -995,7 +1006,10 @@ class BrowserThread(Thread):
|
|||||||
self.history["index"] = self.browser.execute_script(
|
self.history["index"] = self.browser.execute_script(
|
||||||
"return history.length")
|
"return history.length")
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
self.history["index"] = self.browser.execute_script(
|
self.history["index"] = self.browser.execute_script(
|
||||||
"return history.length")
|
"return history.length")
|
||||||
else:
|
else:
|
||||||
@ -1003,7 +1017,10 @@ class BrowserThread(Thread):
|
|||||||
self.history["index"] = self.browser.execute_script(
|
self.history["index"] = self.browser.execute_script(
|
||||||
"return history.length")
|
"return history.length")
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
self.history["index"] = self.browser.execute_script(
|
self.history["index"] = self.browser.execute_script(
|
||||||
"return history.length")
|
"return history.length")
|
||||||
# 如果打开了新窗口,切换到新窗口
|
# 如果打开了新窗口,切换到新窗口
|
||||||
@ -1275,7 +1292,10 @@ class BrowserThread(Thread):
|
|||||||
self.Log('Time out after set seconds when getting data')
|
self.Log('Time out after set seconds when getting data')
|
||||||
self.recordLog(
|
self.recordLog(
|
||||||
'Time out after set seconds when getting data')
|
'Time out after set seconds when getting data')
|
||||||
|
try:
|
||||||
self.browser.execute_script('window.stop()')
|
self.browser.execute_script('window.stop()')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if p["relative"]: # 是否相对xpath
|
if p["relative"]: # 是否相对xpath
|
||||||
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
|
if p["relativeXPath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
|
||||||
element = loopElement
|
element = loopElement
|
||||||
@ -1327,8 +1347,8 @@ class BrowserThread(Thread):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from multiprocessing import freeze_support
|
# from multiprocessing import freeze_support
|
||||||
freeze_support() # 防止无限死循环多开
|
# freeze_support() # 防止无限死循环多开
|
||||||
config = {
|
config = {
|
||||||
"id": [0],
|
"id": [0],
|
||||||
"saved_file_name": "",
|
"saved_file_name": "",
|
||||||
@ -1361,6 +1381,9 @@ if __name__ == '__main__':
|
|||||||
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
|
||||||
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
# driver_path = os.getcwd()+ "/chromedriver_mac64"
|
||||||
print(driver_path)
|
print(driver_path)
|
||||||
|
if c.config_folder == "":
|
||||||
|
c.config_folder = os.path.expanduser("~/Library/Application Support/EasySpider/")
|
||||||
|
# print("Config folder for MacOS:", c.config_folder)
|
||||||
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
elif os.path.exists(os.getcwd()+"/EasySpider/resources"): # 打包后的路径
|
||||||
print("Finding chromedriver in EasySpider",
|
print("Finding chromedriver in EasySpider",
|
||||||
os.getcwd()+"/EasySpider")
|
os.getcwd()+"/EasySpider")
|
||||||
@ -1425,6 +1448,7 @@ if __name__ == '__main__':
|
|||||||
try:
|
try:
|
||||||
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
with open(c.config_folder + c.config_file_name, "r", encoding='utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
|
print("Config file path: " + c.config_folder + c.config_file_name)
|
||||||
absolute_user_data_folder = config["absolute_user_data_folder"]
|
absolute_user_data_folder = config["absolute_user_data_folder"]
|
||||||
print("\nAbsolute_user_data_folder:",
|
print("\nAbsolute_user_data_folder:",
|
||||||
absolute_user_data_folder, "\n")
|
absolute_user_data_folder, "\n")
|
||||||
@ -1501,10 +1525,15 @@ if __name__ == '__main__':
|
|||||||
browser_t = MyChrome(
|
browser_t = MyChrome(
|
||||||
options=options, chrome_options=option, executable_path=driver_path)
|
options=options, chrome_options=option, executable_path=driver_path)
|
||||||
elif cloudflare == 1:
|
elif cloudflare == 1:
|
||||||
|
if sys.platform != "darwin":
|
||||||
browser_t = MyUCChrome(
|
browser_t = MyUCChrome(
|
||||||
options=options, chrome_options=option, driver_executable_path=driver_path)
|
options=options, chrome_options=option, driver_executable_path=driver_path)
|
||||||
print("Pass Cloudflare Mode")
|
print("Pass Cloudflare Mode")
|
||||||
print("过Cloudflare验证模式")
|
print("过Cloudflare验证模式")
|
||||||
|
else:
|
||||||
|
print("Not support Cloudflare Mode on MacOS")
|
||||||
|
print("MacOS不支持Cloudflare验证模式")
|
||||||
|
sys.exit()
|
||||||
event = Event()
|
event = Event()
|
||||||
event.set()
|
event.set()
|
||||||
thread = BrowserThread(browser_t, i, service,
|
thread = BrowserThread(browser_t, i, service,
|
||||||
|
@ -12,7 +12,6 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
from selenium.webdriver.support.ui import Select
|
from selenium.webdriver.support.ui import Select
|
||||||
from selenium.webdriver import ActionChains
|
from selenium.webdriver import ActionChains
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
import undetected_chromedriver_ES as uc
|
|
||||||
desired_capabilities = DesiredCapabilities.CHROME
|
desired_capabilities = DesiredCapabilities.CHROME
|
||||||
desired_capabilities["pageLoadStrategy"] = "none"
|
desired_capabilities["pageLoadStrategy"] = "none"
|
||||||
|
|
||||||
@ -90,6 +89,9 @@ class MyChrome(webdriver.Chrome):
|
|||||||
else:
|
else:
|
||||||
return super().find_elements(by=by, value=value)
|
return super().find_elements(by=by, value=value)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
if sys.platform != "darwin": # MacOS不支持Cloudflare
|
||||||
|
import undetected_chromedriver_ES as uc
|
||||||
|
|
||||||
class MyUCChrome(uc.Chrome):
|
class MyUCChrome(uc.Chrome):
|
||||||
|
|
||||||
|
@ -4,10 +4,11 @@ import csv
|
|||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
import keyboard
|
# import keyboard
|
||||||
from openpyxl import Workbook, load_workbook
|
from openpyxl import Workbook, load_workbook
|
||||||
import requests
|
import requests
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@ -75,20 +76,20 @@ def on_release_creator(event, press_time):
|
|||||||
return on_release
|
return on_release
|
||||||
|
|
||||||
|
|
||||||
def check_pause(key, event):
|
# def check_pause(key, event):
|
||||||
while True:
|
# while True:
|
||||||
if keyboard.is_pressed(key): # 按下p键,暂停程序
|
# if keyboard.is_pressed(key): # 按下p键,暂停程序
|
||||||
if event._flag == False:
|
# if event._flag == False:
|
||||||
print("任务执行中,长按p键暂停执行。")
|
# print("任务执行中,长按p键暂停执行。")
|
||||||
print("Task is running, long press 'p' to pause.")
|
# print("Task is running, long press 'p' to pause.")
|
||||||
# 设置Event的值为True,使得线程b可以继续执行
|
# # 设置Event的值为True,使得线程b可以继续执行
|
||||||
event.set()
|
# event.set()
|
||||||
else:
|
# else:
|
||||||
# 设置Event的值为False,使得线程b暂停执行
|
# # 设置Event的值为False,使得线程b暂停执行
|
||||||
print("任务已暂停,长按p键继续执行...")
|
# print("任务已暂停,长按p键继续执行...")
|
||||||
print("Task paused, press 'p' to continue...")
|
# print("Task paused, press 'p' to continue...")
|
||||||
event.clear()
|
# event.clear()
|
||||||
time.sleep(1) # 每秒检查一次
|
# time.sleep(1) # 每秒检查一次
|
||||||
|
|
||||||
|
|
||||||
def download_image(url, save_directory):
|
def download_image(url, save_directory):
|
||||||
@ -219,16 +220,22 @@ class myMySQL:
|
|||||||
def __init__(self, config_file="mysql_config.json"):
|
def __init__(self, config_file="mysql_config.json"):
|
||||||
# 读取配置文件
|
# 读取配置文件
|
||||||
try:
|
try:
|
||||||
|
if sys.platform == "darwin":
|
||||||
|
if config_file.find("./") >= 0:
|
||||||
|
config_file = config_file.replace("./", "")
|
||||||
|
config_file = os.path.expanduser("~/Library/Application Support/EasySpider/" + config_file)
|
||||||
|
print("MySQL config file path: ", config_file)
|
||||||
with open(config_file, 'r') as f:
|
with open(config_file, 'r') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
host = config["host"]
|
host = config["host"]
|
||||||
port = config["port"]
|
port = config["port"]
|
||||||
user = config["user"]
|
user = config["username"]
|
||||||
passwd = config["password"]
|
passwd = config["password"]
|
||||||
db = config["database"]
|
db = config["database"]
|
||||||
except:
|
except Exception as e:
|
||||||
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
|
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
|
||||||
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
|
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
|
||||||
|
print(e)
|
||||||
try:
|
try:
|
||||||
self.conn = pymysql.connect(
|
self.conn = pymysql.connect(
|
||||||
host=host, port=port, user=user, passwd=passwd, db=db)
|
host=host, port=port, user=user, passwd=passwd, db=db)
|
||||||
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user