Linux V0.3.5

This commit is contained in:
Naibo Wang 2023-07-09 08:39:00 +08:00
parent 541db1fa3f
commit 261fb13006
14 changed files with 2070 additions and 23 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,165 @@
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException, InvalidSelectorException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
# Module-level Chrome capabilities: bind the selenium CHROME capabilities
# mapping and set its "pageLoadStrategy" entry to "none".
# NOTE(review): presumably this makes navigation return without waiting for
# the page to finish loading — confirm against the selenium documentation.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
class MyChrome(webdriver.Chrome):
    """Chrome driver whose element lookups can optionally search inside iframes.

    ``iframe_env`` records whether the driver is currently switched into an
    iframe (True) or is on the top-level document (False). Every lookup first
    switches back to the top-level document if a previous lookup left the
    driver inside an iframe.
    """

    def __init__(self, *args, **kwargs):
        # Set before the parent constructor runs so the attribute always exists.
        self.iframe_env = False
        super().__init__(*args, **kwargs)

    def _leave_iframe(self):
        """Switch back to the top-level document if a lookup left us in an iframe."""
        if self.iframe_env:
            super().switch_to.default_content()
            self.iframe_env = False

    def find_element(self, by=By.ID, value=None, iframe=False):
        """Find a single element, optionally searching every top-level iframe.

        Args:
            by: Locator strategy.
            value: Locator value.
            iframe: When truthy, search each ``iframe`` of the top-level
                document in order and return the first match. The driver is
                left switched into the iframe that was searched last, so the
                returned element stays usable.

        Returns:
            The matching element.

        Raises:
            NoSuchElementException: In iframe mode, when no iframe contains a
                match (including when the iframes could not be listed).
        """
        self._leave_iframe()
        if not iframe:
            return super().find_element(by=by, value=value)

        try:
            frames = super().find_elements(By.CSS_SELECTOR, "iframe")
        except Exception as e:
            # Previously this left the frame list unbound and the loop below
            # failed with NameError; treat it as "no iframes to search".
            print(e)
            frames = []

        for frame in frames:
            # Always start from the top-level document before entering a frame.
            super().switch_to.default_content()
            super().switch_to.frame(frame)
            self.iframe_env = True
            try:
                return super().find_element(by=by, value=value)
            except Exception:
                print("No such element found in the iframe")
        raise NoSuchElementException

    def find_elements(self, by=By.ID, value=None, iframe=False):
        """Find all matching elements, optionally searching every top-level iframe.

        Args:
            by: Locator strategy.
            value: Locator value.
            iframe: When truthy, search each ``iframe`` of the top-level
                document in order and return the matches from the first
                iframe that has at least one.

        Returns:
            A list of matching elements.

        Raises:
            NoSuchElementException: In iframe mode, when no iframe yields any
                match.
        """
        self._leave_iframe()
        if not iframe:
            return super().find_elements(by=by, value=value)

        frames = super().find_elements(By.CSS_SELECTOR, "iframe")
        for frame in frames:
            try:
                super().switch_to.default_content()
                super().switch_to.frame(frame)
                self.iframe_env = True
                elements = super().find_elements(by=by, value=value)
                if len(elements) > 0:
                    # Stay inside this iframe so the elements remain usable.
                    return elements
            except Exception:
                print("No such element found in the iframe")
        raise NoSuchElementException
class MyUCChrome(uc.Chrome):
    """undetected_chromedriver Chrome whose lookups can optionally search iframes.

    ``iframe_env`` records whether the driver is currently switched into an
    iframe (True) or is on the top-level document (False). Every lookup first
    switches back to the top-level document if a previous lookup left the
    driver inside an iframe.
    """

    def __init__(self, *args, **kwargs):
        # Set before the parent constructor runs so the attribute always exists.
        self.iframe_env = False
        super().__init__(*args, **kwargs)

    def _leave_iframe(self):
        """Switch back to the top-level document if a lookup left us in an iframe."""
        if self.iframe_env:
            super().switch_to.default_content()
            self.iframe_env = False

    def find_element(self, by=By.ID, value=None, iframe=False):
        """Find a single element, optionally searching every top-level iframe.

        Args:
            by: Locator strategy.
            value: Locator value.
            iframe: When truthy, search each ``iframe`` of the top-level
                document in order and return the first match. The driver is
                left switched into the iframe that was searched last, so the
                returned element stays usable.

        Returns:
            The matching element.

        Raises:
            NoSuchElementException: In iframe mode, when no iframe contains a
                match (including when the iframes could not be listed).
        """
        self._leave_iframe()
        if not iframe:
            return super().find_element(by=by, value=value)

        try:
            frames = super().find_elements(By.CSS_SELECTOR, "iframe")
        except Exception as e:
            # Previously this left the frame list unbound and the loop below
            # failed with NameError; treat it as "no iframes to search".
            print(e)
            frames = []

        for frame in frames:
            # Always start from the top-level document before entering a frame.
            super().switch_to.default_content()
            super().switch_to.frame(frame)
            self.iframe_env = True
            try:
                return super().find_element(by=by, value=value)
            except Exception:
                print("No such element found in the iframe")
        raise NoSuchElementException

    def find_elements(self, by=By.ID, value=None, iframe=False):
        """Find all matching elements, optionally searching every top-level iframe.

        Args:
            by: Locator strategy.
            value: Locator value.
            iframe: When truthy, search each ``iframe`` of the top-level
                document in order and return the matches from the first
                iframe that has at least one.

        Returns:
            A list of matching elements.

        Raises:
            NoSuchElementException: In iframe mode, when no iframe yields any
                match.
        """
        self._leave_iframe()
        if not iframe:
            return super().find_elements(by=by, value=value)

        frames = super().find_elements(By.CSS_SELECTOR, "iframe")
        for frame in frames:
            try:
                super().switch_to.default_content()
                super().switch_to.frame(frame)
                self.iframe_env = True
                elements = super().find_elements(by=by, value=value)
                if len(elements) > 0:
                    # Stay inside this iframe so the elements remain usable.
                    return elements
            except Exception:
                print("No such element found in the iframe")
        raise NoSuchElementException

View File

@ -0,0 +1,318 @@
# 控制流程的暂停和继续
import csv
import datetime
import json
import os
import re
import time
import uuid
import keyboard
from openpyxl import Workbook, load_workbook
import requests
from urllib.parse import urlparse
import pymysql
from lxml import etree
def is_valid_url(url):
    """Report whether *url* parses with both a scheme and a network location.

    Returns False when ``urlparse`` rejects the input with ``ValueError``.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    for component in (parsed.scheme, parsed.netloc):
        if not component:
            return False
    return True
def lowercase_tags_in_xpath(xpath):
    """Lower-case runs of A-Z that sit directly before ``[``, ``]``, ``/`` or the end.

    Upper-case text elsewhere (for example inside quoted attribute values that
    are followed by a quote) is left untouched.
    """
    def _to_lower(match):
        return match.group(0).lower()

    pattern = re.compile(r"([A-Z]+)(?=[\[\]//]|$)")
    return pattern.sub(_to_lower, xpath)
def on_press(key):
    """Key-press callback that deliberately does nothing with *key*."""
    return None
def on_release_creator(event):
    """Build a key-release callback that toggles *event* when 'p' is released.

    Args:
        event: A ``threading.Event``-like object. If it is not set, the
            callback sets it; otherwise the callback clears it.

    Returns:
        A one-argument callback. Keys that have no ``char`` attribute, or
        whose ``char`` is not ``'p'``, are ignored.
    """
    def on_release(key):
        # Keys without a character attribute are skipped instead of relying
        # on a blanket ``except`` to hide the AttributeError.
        if getattr(key, "char", None) != 'p':
            return
        if not event.is_set():
            print("任务执行中长按p键暂停执行。")
            print("Task is running, long press 'p' to pause.")
            # Set the event so that waiting worker threads can continue.
            event.set()
        else:
            # Clear the event so that worker threads pause on their next wait.
            print("任务已暂停长按p键继续执行...")
            print("Task paused, press 'p' to continue...")
            event.clear()
    return on_release
def check_pause(key, event):
    """Poll the keyboard forever and toggle *event* whenever *key* is held down.

    Runs an endless loop that checks once per second, so it is meant to be run
    in its own thread; it never returns.

    Args:
        key: Key name passed to ``keyboard.is_pressed``.
        event: A ``threading.Event``-like object. If it is not set when the
            key is seen, it is set; otherwise it is cleared.
    """
    while True:
        if keyboard.is_pressed(key):
            # Use the public Event API rather than the private ``_flag``.
            if not event.is_set():
                print("任务执行中长按p键暂停执行。")
                print("Task is running, long press 'p' to pause.")
                # Set the event so that waiting worker threads can continue.
                event.set()
            else:
                # Clear the event so that worker threads pause on their next wait.
                print("任务已暂停长按p键继续执行...")
                print("Task paused, press 'p' to continue...")
                event.clear()
        time.sleep(1)  # poll once per second
def download_image(url, save_directory, timeout=30):
    """Download the image at *url* into *save_directory* under a unique name.

    The saved name is ``<name>_<uuid4>_<name>``, where ``<name>`` is the last
    path segment of the URL with any query string removed. Failures (invalid
    URL, network error, non-OK status) are reported on stdout instead of
    raising.

    Args:
        url: Image address.
        save_directory: Directory the file is written to.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the caller forever.
    """
    # Browser-like request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def report_failure():
        print("下载图片失败,请检查此图片链接是否有效:", url)
        print("Failed to download image, please check if this image link is valid:", url)

    if not is_valid_url(url):
        report_failure()
        return

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are treated like an invalid link.
        report_failure()
        return

    if response.status_code != requests.codes.ok:
        report_failure()
        return

    # Last path segment without the query string.
    file_name = url.split('/')[-1].split("?")[0]
    # The UUID keeps names unique; the trailing copy of the name keeps the
    # original extension at the end of the file name.
    new_file_name = file_name + '_' + \
        str(uuid.uuid4()) + '_' + file_name
    save_path = os.path.join(save_directory, new_file_name)
    with open(save_path, 'wb') as file:
        file.write(response.content)
    print("图片已成功下载到:", save_path)
    print("The image has been successfully downloaded to:", save_path)
def get_output_code(output):
    """Turn a textual result into an integer code.

    Returns 1 when *output* contains the substring ``"rue"`` (so both "true"
    and "True" match), otherwise ``int(output)``. Anything that cannot be
    interpreted this way yields 0.
    """
    try:
        if output.find("rue") != -1:
            return 1
        return int(output)
    except Exception:
        # Not a string, or not an integer literal. ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return 0
# Check whether a field is empty.
# NOTE: despite the name, the result is True when the field is NOT empty.
def isnull(s):
    """Return True when *s* has a non-zero length, False when it is empty."""
    size = len(s)
    return size > 0
def new_line(outputParameters, maxViewLength, record):
    """Collect the values of *outputParameters* into a row and preview it.

    Every value is added to the returned list. Values whose positional flag in
    *record* is truthy are also printed, truncated to *maxViewLength*
    characters, on a single console line.
    """
    row = []
    for position, cell in enumerate(outputParameters.values()):
        row.append(cell)
        if not record[position]:
            continue
        print(cell[:maxViewLength], " ", end="")
    print("")
    return row
def write_to_csv(file_name, data, record):
    """Append rows to a CSV file, keeping only the columns flagged in *record*.

    Args:
        file_name: Path of the CSV file; opened in append mode as UTF-8 with BOM.
        data: Iterable of rows.
        record: Per-column flags; column ``i`` of each row is written only when
            ``record[i]`` is truthy.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(file_name, 'a', encoding='utf-8-sig', newline="") as f:
        writer = csv.writer(f)
        for line in data:
            writer.writerow([cell for i, cell in enumerate(line) if record[i]])
def write_to_excel(file_name, data, types, record):
    """Append rows to an Excel workbook, creating the workbook if needed.

    When the workbook is newly created, the first row of *data* is written
    without type conversion. Every other row has its "int"/"bigInt" columns
    converted with ``int`` and its "double" columns with ``float``, falling
    back to 0 / 0.0 when conversion fails. Conversion happens in place on the
    rows of *data*.

    Args:
        file_name: Path of the workbook.
        data: List of rows (mutable sequences).
        types: Per-column type names.
        record: Per-column flags; only columns with a truthy flag are written.
    """
    if os.path.exists(file_name):
        # Append to the existing workbook.
        wb = load_workbook(file_name)
        skip_conversion = False
    else:
        # New workbook: its first appended row is left unconverted.
        wb = Workbook()
        skip_conversion = True
    ws = wb.active

    conversion_errors = (TypeError, ValueError, OverflowError)
    for line in data:
        if skip_conversion:
            skip_conversion = False
        else:
            for i, cell in enumerate(line):
                column_type = types[i]
                if column_type == "int" or column_type == "bigInt":
                    try:
                        line[i] = int(cell)
                    except conversion_errors:
                        line[i] = 0
                elif column_type == "double":
                    try:
                        line[i] = float(cell)
                    except conversion_errors:
                        line[i] = 0.0
        ws.append([cell for i, cell in enumerate(line) if record[i]])

    wb.save(file_name)
class Time:
    """Stopwatch that prints the elapsed wall-clock milliseconds for a label."""

    def __init__(self, type1=""):
        # Label shown in the report printed by end().
        self.type = type1
        # Start timestamp in whole milliseconds.
        self.t = int(round(time.time() * 1000))

    def end(self):
        """Print the milliseconds elapsed since this object was created."""
        now_ms = int(round(time.time() * 1000))
        elapsed_ms = now_ms - self.t
        print("Time used for", self.type, ":", elapsed_ms, "ms")
class myMySQL:
    """Small helper that stores extracted rows in a MySQL table via pymysql.

    Attributes set by the methods:
        conn: The pymysql connection, or None when reading the configuration
            or connecting failed.
        table_name: Name passed to the last ``create_table`` call.
        field_sql: Parenthesised, comma-separated list of the recorded field
            names, used as the column list of INSERT statements.
    """

    # Task parameter type name -> MySQL column type.
    _COLUMN_TYPES = {
        'int': 'INT',
        'double': 'DOUBLE',
        'text': 'TEXT',
        'mediumText': 'MEDIUMTEXT',
        'longText': 'LONGTEXT',
        'datetime': 'DATETIME',
        'date': 'DATE',
        'time': 'TIME',
        'varchar': 'VARCHAR(255)',
        'bigInt': 'BIGINT',
    }

    # Type name -> (strptime format, fallback text used when parsing fails).
    _TEMPORAL_FORMATS = {
        "datetime": ('%Y-%m-%d %H:%M:%S', "1970-01-01 00:00:00"),
        "date": ('%Y-%m-%d', "1970-01-01"),
        "time": ('%H:%M:%S', "00:00:00"),
    }

    def __init__(self, config_file="mysql_config.json"):
        """Read the JSON configuration and try to connect.

        Failures are reported on stdout rather than raised; ``self.conn``
        stays None in that case.
        """
        self.conn = None
        settings = None
        # Read the configuration file.
        try:
            with open(config_file, 'r') as f:
                config = json.load(f)
            settings = {
                "host": config["host"],
                "port": config["port"],
                "user": config["user"],
                "passwd": config["password"],
                "db": config["database"],
            }
        except Exception:
            print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
            print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")

        connected = False
        if settings is not None:
            try:
                self.conn = pymysql.connect(**settings)
                connected = True
            except Exception:
                self.conn = None
        if connected:
            print("成功连接到数据库。")
            print("Successfully connected to the database.")
        else:
            print("连接数据库失败,请检查配置文件是否正确。")
            print("Failed to connect to the database, please check if the configuration file is correct.")

    def create_table(self, table_name, parameters):
        """Create *table_name* if it does not exist and remember its field list.

        Args:
            table_name: Name of the table.
            parameters: Iterable of dicts with ``name``, ``type`` and
                ``recordASField`` keys; only items with a truthy
                ``recordASField`` become columns.
        """
        self.table_name = table_name
        columns = ["_id INT AUTO_INCREMENT PRIMARY KEY"]
        field_names = []
        for item in parameters:
            if not item["recordASField"]:
                continue
            name = item['name']
            column_type = self._COLUMN_TYPES.get(item['type'])
            # NOTE: as before, a field with an unrecognised type gets no
            # column definition but is still listed in field_sql.
            if column_type is not None:
                columns.append(f"{name} {column_type}")
            field_names.append(name)
        # Identifiers cannot be bound as query parameters, so table and column
        # names are inserted as given.
        sql = "CREATE TABLE " + table_name + " (" + ", ".join(columns) + ")"
        self.field_sql = "(" + ", ".join(field_names) + ")"

        cursor = self.conn.cursor()
        try:
            # Check whether the table exists; the name is passed as a bound
            # parameter instead of being formatted into the statement.
            cursor.execute("SHOW TABLES LIKE %s", (table_name,))
            if not cursor.fetchone():
                cursor.execute(sql)
            else:
                print("数据表" + table_name + "已存在。")
                print("The data table " + table_name + " already exists.")
        finally:
            cursor.close()

    @classmethod
    def _convert_value(cls, value, type_name):
        """Coerce *value* for a typed column, using a default when it cannot be parsed."""
        if type_name == "int" or type_name == "bigInt":
            try:
                return int(value)
            except (TypeError, ValueError, OverflowError):
                return 0
        if type_name == "double":
            try:
                return float(value)
            except (TypeError, ValueError, OverflowError):
                return 0.0
        fmt, fallback = cls._TEMPORAL_FORMATS[type_name]
        try:
            return datetime.datetime.strptime(value, fmt)
        except (TypeError, ValueError):
            return datetime.datetime.strptime(fallback, fmt)

    def write_to_mysql(self, OUTPUT, record, types):
        """Insert every row of *OUTPUT* into the current table and commit.

        Typed columns (int, bigInt, double, datetime, date, time) are converted
        in place on the rows of *OUTPUT*. A row whose INSERT fails is reported
        on stdout and skipped.

        Args:
            OUTPUT: List of rows (mutable sequences).
            record: Per-column flags; only columns with a truthy flag are inserted.
            types: Per-column type names.
        """
        converted_types = ("int", "bigInt", "double", "datetime", "date", "time")
        cursor = self.conn.cursor()
        try:
            for line in OUTPUT:
                for i, cell in enumerate(line):
                    if types[i] in converted_types:
                        line[i] = self._convert_value(cell, types[i])
                to_write = [cell for i, cell in enumerate(line) if record[i]]
                # One placeholder per value; the values themselves are bound
                # by the driver.
                placeholders = ", ".join(["%s"] * len(to_write))
                sql = "INSERT INTO " + self.table_name + " " + self.field_sql + " VALUES (" + placeholders + ")"
                try:
                    cursor.execute(sql, to_write)
                except Exception as e:
                    print("Error:", e)
                    print("Error SQL:", sql, to_write)
                    print("插入数据库错误,请查看以上的错误提示,然后检查数据的类型是否正确,是否文本过长(超过一万的文本类型要设置为大文本)。")
                    print("Inserting database error, please check the above error, and then check whether the data type is correct, whether the text is too long (text type over 10,000 should be set to large text).")
                    print("重新执行任务时,请删除数据库中的数据表" + self.table_name + ",然后再次运行程序。")
                    print("When re-executing the task, please delete the data table " + self.table_name + " in the database, and then run the program again.")
            # Commit the inserted rows.
            self.conn.commit()
        finally:
            # Release the cursor even when a row raised outside the INSERT.
            cursor.close()

    def close(self):
        """Close the database connection."""
        self.conn.close()
        print("成功关闭数据库。")
        print("Successfully closed the database.")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"id":6,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/9/2023, 8:26:20 AM","update_time":"7/9/2023, 8:26:20 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":4,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"text","exampleValue":"452","value":"452"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"/手机/数码"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://channel.jd.com/mensshoes.html"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3,4,5],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode"
:0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"alr5ipqth9iljup1kld","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":4,"index":4,"parentId":2,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":true,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"452","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text 
defcolor']","/html/body/div[last()-6]/div/div[last()-2]/div/input"]}},{"id":5,"index":5,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":2,"contentType":0,"relative":false,"name":"参数2_链接地址","desc":"","relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[8]/a[3]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[8]/a[3]","//a[contains(., '男鞋')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-5]/a"],"exampleValues":[{"num":0,"value":"https://channel.jd.com/mensshoes.html"}],"unique_index":"upvimqodqwljup1tlv","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

View File

@ -88,7 +88,7 @@ if __name__ == "__main__":
compress_folder_to_7z("./EasySpider_windows_x86", file_name)
print(f"Compress {file_name} successfully!")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
file_name = f"EasySpider_{easyspider_version}_Linux_x64.7z"
file_name = f"EasySpider_{easyspider_version}_Linux_x64.tar.xz"
if os.path.exists("./EasySpider_Linux_x64/user_data"):
shutil.rmtree("./EasySpider_Linux_x64/user_data")
if os.path.exists("./EasySpider_Linux_x64/Data"):
@ -101,7 +101,7 @@ if __name__ == "__main__":
os.remove("./EasySpider_Linux_x64/mysql_config.json")
os.mkdir("./EasySpider_Linux_x64/Data")
os.mkdir("./EasySpider_Linux_x64/execution_instances")
# compress_folder_to_7z("./EasySpider_Linux_x64", file_name)
subprocess.call(["tar", "-Jcvf", file_name, "./EasySpider_Linux_x64"])
print(f"Compress {file_name} successfully!")
elif sys.platform == "darwin" and platform.architecture()[0] == "64bit":
pass

Binary file not shown.

Binary file not shown.

View File

@ -2,16 +2,19 @@
# This script is used to build the package for Linux 64-bit.
rm -r out
cd ../Extension/manifest_v3
node package.js
cd ../../ElectronJS
npm run package
mv out/EasySpider-linux-x64 out/EasySpider
rm -r out/EasySpider/resources/app/chrome_win64
rm -r out/EasySpider/resources/app/chromedrivers
rm -r out/EasySpider/resources/app/Data
rm -r out/EasySpider/resources/app/.idea
rm -r out/EasySpider/resources/app/tasks
rm -r out/EasySpider/resources/app/execution_instances
rm -r out/EasySpider/resources/app/user_data
rm -r ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
rm -rf out/EasySpider/resources/app/chrome_win64
rm -rf out/EasySpider/resources/app/chromedrivers
rm -rf out/EasySpider/resources/app/Data
rm -rf out/EasySpider/resources/app/.idea
rm -rf out/EasySpider/resources/app/tasks
rm -rf out/EasySpider/resources/app/execution_instances
rm -rf out/EasySpider/resources/app/user_data
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
rm out/EasySpider/resources/app/vs_BuildTools.exe
mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_Linux_x64/Code
@ -19,8 +22,10 @@ cp ../ExecuteStage/myChrome.py ../.temp_to_pub/EasySpider_Linux_x64/Code
cp ../ExecuteStage/utils.py ../.temp_to_pub/EasySpider_Linux_x64/Code
chmod 777 ../.temp_to_pub/EasySpider_Linux_x64/easy-spider.sh
rm -r ..\.temp_to_pub\EasySpider_Linux_x64\user_data
rm -r ..\.temp_to_pub\EasySpider_Linux_x64\execution_instances
mkdir ..\.temp_to_pub\EasySpider_Linux_x64\execution_instances
rm -r ..\.temp_to_pub\EasySpider_Linux_x64\Data
mkdir ..\.temp_to_pub\EasySpider_Linux_x64\Data
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/user_data
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/execution_instances
mkdir ../.temp_to_pub/EasySpider_Linux_x64/execution_instances
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/Data
mkdir ../.temp_to_pub/EasySpider_Linux_x64/Data
rm EasySpider_zh.crx
rm EasySpider_en.crx

View File

@ -12,7 +12,6 @@ import sys
# import base64
# import hashlib
import time
# import keyboard
import requests
from urllib.parse import urljoin
from lxml import etree
@ -43,11 +42,10 @@ from PIL import Image
# import uuid
from threading import Thread, Event
from myChrome import MyChrome, MyUCChrome
from utils import check_pause, download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, write_to_csv, write_to_excel
from utils import check_pause, download_image, get_output_code, isnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press, on_release_creator, write_to_csv, write_to_excel
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
class BrowserThread(Thread):
def __init__(self, browser_t, id, service, version, event, saveName, config):
Thread.__init__(self)
@ -1505,13 +1503,21 @@ if __name__ == '__main__':
print("Thread with task id: ", i, " is created")
threads.append(thread)
thread.start()
Thread(target=check_pause, args=("p", event)).start()
# Set the pause operation
if sys.platform != "linux":
Thread(target=check_pause, args=("p", event)).start()
else:
from pynput.keyboard import Key, Listener
# 使用监听器监听键盘输入
with Listener(on_press=on_press, on_release=on_release_creator(event)) as listener:
listener.join()
time.sleep(5)
print("\n\n----------------------------------")
print("正在运行任务长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码如果想恢复任务的执行请再次长按p键。")
print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
print("----------------------------------\n\n")
for thread in threads:
thread.join()

View File

@ -8,4 +8,5 @@ keyboard
undetected_chromedriver
openpyxl
pymysql
lxml
lxml
pynput

View File

@ -24,6 +24,27 @@ def is_valid_url(url):
def lowercase_tags_in_xpath(xpath):
    """Lower-case runs of A-Z that sit directly before ``[``, ``]``, ``/`` or the end.

    Upper-case text elsewhere (for example inside quoted attribute values that
    are followed by a quote) is left untouched.
    """
    def _to_lower(match):
        return match.group(0).lower()

    pattern = re.compile(r"([A-Z]+)(?=[\[\]//]|$)")
    return pattern.sub(_to_lower, xpath)
def on_press(key):
    """Key-press callback that deliberately does nothing with *key*."""
    return None
def on_release_creator(event):
    """Build a key-release callback that toggles *event* when 'p' is released.

    Args:
        event: A ``threading.Event``-like object. If it is not set, the
            callback sets it; otherwise the callback clears it.

    Returns:
        A one-argument callback. Keys that have no ``char`` attribute, or
        whose ``char`` is not ``'p'``, are ignored.
    """
    def on_release(key):
        # Keys without a character attribute are skipped instead of relying
        # on a blanket ``except`` to hide the AttributeError.
        if getattr(key, "char", None) != 'p':
            return
        if not event.is_set():
            print("任务执行中长按p键暂停执行。")
            print("Task is running, long press 'p' to pause.")
            # Set the event so that waiting worker threads can continue.
            event.set()
        else:
            # Clear the event so that worker threads pause on their next wait.
            print("任务已暂停长按p键继续执行...")
            print("Task paused, press 'p' to continue...")
            event.clear()
    return on_release
def check_pause(key, event):
while True:
if keyboard.is_pressed(key): # 按下p键暂停程序
@ -294,4 +315,4 @@ class myMySQL:
def close(self):
self.conn.close()
print("成功关闭数据库。")
print("Successfully closed the database.")
print("Successfully closed the database.")

View File

@ -7,7 +7,7 @@
"": {
"name": "EasySpider",
"version": "0.3.5",
"license": "GPL-3.0",
"license": "AGPL-3.0",
"dependencies": {
"crx": "^5.0.1",
"crx3": "^1.1.3",