mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-16 16:26:56 +08:00
Bug fix
This commit is contained in:
parent
c8be528116
commit
a3ed52a54d
File diff suppressed because it is too large
Load Diff
@ -15,6 +15,7 @@ from urllib.parse import urlparse
|
||||
import pymysql
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def is_valid_url(url):
|
||||
try:
|
||||
result = urlparse(url)
|
||||
@ -22,6 +23,7 @@ def is_valid_url(url):
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def lowercase_tags_in_xpath(xpath):
    """Lower-case upper-case tag names inside an XPath expression.

    A run that starts with an upper-case ASCII letter and continues with
    upper-case letters or digits (``DIV``, ``H1``) is lower-cased when it is
    immediately followed by ``[``, ``]``, ``/`` or the end of the string.
    Everything else (attribute values in quotes, already lower-case names)
    is left untouched.

    Args:
        xpath: The XPath expression as a string.

    Returns:
        The expression with matching names lower-cased.
    """
    # Digits are allowed after the first letter so heading tags such as
    # H1..H6 are converted too; a plain ``[A-Z]+`` stops at the digit and
    # the lookahead then fails, leaving the tag upper-case.
    return re.sub(
        r"([A-Z][A-Z0-9]*)(?=[\[\]/]|$)",
        lambda match: match.group(0).lower(),
        xpath,
    )
|
||||
|
||||
@ -30,10 +32,10 @@ def on_press_creator(press_time, event):
|
||||
def on_press(key):
|
||||
try:
|
||||
if key.char == 'p':
|
||||
if press_time["is_pressed"] == False: # 没按下p键时,记录按下p键的时间
|
||||
if press_time["is_pressed"] == False: # 没按下p键时,记录按下p键的时间
|
||||
press_time["duration"] = time.time()
|
||||
press_time["is_pressed"] = True
|
||||
else: # 按下p键时,判断按下p键的时间是否超过2.5秒
|
||||
else: # 按下p键时,判断按下p键的时间是否超过2.5秒
|
||||
duration = time.time() - press_time["duration"]
|
||||
if duration > 2:
|
||||
if event._flag == False:
|
||||
@ -53,6 +55,7 @@ def on_press_creator(press_time, event):
|
||||
pass
|
||||
return on_press
|
||||
|
||||
|
||||
def on_release_creator(event, press_time):
|
||||
def on_release(key):
|
||||
try:
|
||||
@ -92,40 +95,45 @@ def on_release_creator(event, press_time):
|
||||
# time.sleep(1) # 每秒检查一次
|
||||
|
||||
|
||||
def download_image(browser, url, save_directory, timeout=30):
    """Download the image at ``url`` into ``save_directory``.

    The file is stored under a unique name built from the last path segment
    of the URL (query string removed) and a random UUID. Every outcome is
    reported through ``browser.print_and_log``; the function never raises
    for download errors and returns nothing.

    Args:
        browser: Object exposing ``print_and_log(*args)`` used for reporting.
        url: Address of the image to download.
        save_directory: Directory the image file is written to.
        timeout: Seconds passed to ``requests.get`` so an unresponsive
            server cannot block the caller forever.
    """
    # Browser-like request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    if not is_valid_url(url):
        browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
        browser.print_and_log(
            "Failed to download image, please check if this image link is valid:", url)
        return

    try:
        # Fetch the image data; a timeout turns a hung connection into an
        # exception that is reported by the handler below.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != requests.codes.ok:
            browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
            browser.print_and_log(
                "Failed to download image, please check if this image link is valid:", url)
            return

        # Last path segment of the URL without its query string.
        file_name = url.split('/')[-1].split("?")[0]

        # Unique name; the original name is repeated at the end so the
        # file keeps its extension.
        new_file_name = file_name + '_' + str(uuid.uuid4()) + '_' + file_name

        save_path = os.path.join(save_directory, new_file_name)

        # Write the raw bytes to disk.
        with open(save_path, 'wb') as image_file:
            image_file.write(response.content)

        browser.print_and_log("图片已成功下载到:", save_path)
        browser.print_and_log(
            "The image has been successfully downloaded to:", save_path)
    except Exception as e:
        # Best effort: a failed download must not abort the running task.
        browser.print_and_log("下载图片失败|Error downloading image: ", e)
|
||||
|
||||
|
||||
def get_output_code(output):
|
||||
@ -141,9 +149,10 @@ def get_output_code(output):
|
||||
# 判断字段是否为空
|
||||
|
||||
|
||||
def isnotnull(s):
    """Return True when ``s`` holds at least one element or character.

    ``None`` is treated as empty (returns False) instead of raising
    ``TypeError`` from ``len(None)``.

    Args:
        s: A sized value (string, list, ...) or ``None``.

    Returns:
        bool: False for ``None`` and for zero-length values, True otherwise.
    """
    return s is not None and len(s) != 0
|
||||
|
||||
|
||||
def new_line(outputParameters, maxViewLength, record):
|
||||
line = []
|
||||
i = 0
|
||||
@ -155,6 +164,7 @@ def new_line(outputParameters, maxViewLength, record):
|
||||
print("")
|
||||
return line
|
||||
|
||||
|
||||
def write_to_csv(file_name, data, record):
|
||||
with open(file_name, 'a', encoding='utf-8-sig', newline="") as f:
|
||||
f_csv = csv.writer(f)
|
||||
@ -167,6 +177,52 @@ def write_to_csv(file_name, data, record):
|
||||
f.close()
|
||||
|
||||
|
||||
def replace_field_values(orginal_text, outputParameters):
    """Substitute ``Field["name"]`` placeholders with values from a mapping.

    Each occurrence of ``Field["<name>"]`` in ``orginal_text`` is replaced by
    ``outputParameters.get("<name>", '')``. Missing names and ``None`` values
    become an empty string; any other non-string value is converted with
    ``str`` so that a single numeric field no longer causes the whole
    substitution to be abandoned.

    Args:
        orginal_text: Text that may contain placeholders.
        outputParameters: Mapping (anything with ``.get``) of field names to
            their current values.

    Returns:
        The text with placeholders replaced, or ``orginal_text`` unchanged
        if the substitution fails (for example when it is not a string).
    """
    pattern = r'Field\["([^"]+)"\]'

    def _field_value(match):
        value = outputParameters.get(match.group(1), '')
        # re.sub treats a None replacement as empty; keep that behaviour
        # explicit and stringify everything else.
        return '' if value is None else str(value)

    try:
        replaced_text = re.sub(pattern, _field_value, orginal_text)
    except Exception:
        # Best effort: fall back to the untouched input rather than failing.
        replaced_text = orginal_text
    return replaced_text
|
||||
|
||||
|
||||
def write_to_json(file_name, data, types, record, keys):
    """Append rows to a JSON file that holds a list of objects.

    Each row in ``data`` is converted column by column according to
    ``types`` ("int"/"bigInt" -> ``int``, "double" -> ``float``; values that
    cannot be converted become ``0`` / ``0.0``). Converted values are written
    back into the row in place. Columns whose ``record`` flag is truthy are
    stored in a dict under the matching entry of ``keys``. The resulting
    dicts are appended to whatever the file already contains and the file is
    rewritten.

    Args:
        file_name: Path of the JSON file to read and rewrite.
        data: Iterable of mutable rows (lists of column values).
        types: Per-column type names, indexed like the rows.
        record: Per-column flags; only truthy columns are written.
        keys: Per-column output names (any iterable; materialised to a list).
    """
    keys = list(keys)

    rows_to_write = []
    for line in data:
        row = {}
        for i in range(len(line)):
            column_type = types[i]
            if column_type in ("int", "bigInt"):
                try:
                    line[i] = int(line[i])
                except (TypeError, ValueError, OverflowError):
                    line[i] = 0
            elif column_type == "double":
                try:
                    line[i] = float(line[i])
                except (TypeError, ValueError, OverflowError):
                    line[i] = 0.0
            if record[i]:
                row[keys[i]] = line[i]
        rows_to_write.append(row)

    # Load what is already stored; a missing, unreadable or malformed file
    # is treated as empty (JSON and Unicode decode errors are ValueErrors).
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except (OSError, ValueError):
        json_data = []

    # A file holding a single non-list value would break ``extend``; keep
    # that value as the first element instead of failing.
    if not isinstance(json_data, list):
        json_data = [json_data]

    json_data.extend(rows_to_write)

    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
|
||||
|
||||
|
||||
def write_to_excel(file_name, data, types, record):
|
||||
first = False
|
||||
if os.path.exists(file_name):
|
||||
@ -180,7 +236,7 @@ def write_to_excel(file_name, data, types, record):
|
||||
first = True
|
||||
# 追加数据到工作表
|
||||
for line in data:
|
||||
if not first: # 如果不是第一行,需要转换数据类型
|
||||
if not first: # 如果不是第一行,需要转换数据类型
|
||||
for i in range(len(line)):
|
||||
if types[i] == "int" or types[i] == "bigInt":
|
||||
try:
|
||||
@ -203,9 +259,6 @@ def write_to_excel(file_name, data, types, record):
|
||||
wb.save(file_name)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class Time:
|
||||
def __init__(self, type1=""):
|
||||
self.t = int(round(time.time() * 1000))
|
||||
@ -223,7 +276,8 @@ class myMySQL:
|
||||
if sys.platform == "darwin":
|
||||
if config_file.find("./") >= 0:
|
||||
config_file = config_file.replace("./", "")
|
||||
config_file = os.path.expanduser("~/Library/Application Support/EasySpider/" + config_file)
|
||||
config_file = os.path.expanduser(
|
||||
"~/Library/Application Support/EasySpider/" + config_file)
|
||||
print("MySQL config file path: ", config_file)
|
||||
with open(config_file, 'r') as f:
|
||||
config = json.load(f)
|
||||
@ -233,18 +287,20 @@ class myMySQL:
|
||||
passwd = config["password"]
|
||||
db = config["database"]
|
||||
except Exception as e:
|
||||
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在。")
|
||||
print("Failed to read configuration file, please check if the configuration file: "+config_file+" exists.")
|
||||
print("读取配置文件失败,请检查配置文件:"+config_file+"是否存在,或配置信息是否有误。")
|
||||
print("Failed to read configuration file, please check if the configuration file: " +
|
||||
config_file+" exists, or if the configuration information is incorrect.")
|
||||
print(e)
|
||||
try:
|
||||
self.conn = pymysql.connect(
|
||||
host=host, port=port, user=user, passwd=passwd, db=db)
|
||||
host=host, port=port, user=user, passwd=passwd, db=db)
|
||||
print("成功连接到数据库。")
|
||||
print("Successfully connected to the database.")
|
||||
except:
|
||||
print("连接数据库失败,请检查配置文件是否正确。")
|
||||
print("Failed to connect to the database, please check if the configuration file is correct.")
|
||||
|
||||
print(
|
||||
"Failed to connect to the database, please check if the configuration file is correct.")
|
||||
|
||||
def create_table(self, table_name, parameters):
|
||||
self.table_name = table_name
|
||||
self.field_sql = "("
|
||||
@ -253,7 +309,8 @@ class myMySQL:
|
||||
cursor.execute("SHOW TABLES LIKE '%s'" % table_name)
|
||||
result = cursor.fetchone()
|
||||
|
||||
sql = "CREATE TABLE " + table_name + " (_id INT AUTO_INCREMENT PRIMARY KEY, "
|
||||
sql = "CREATE TABLE " + table_name + \
|
||||
" (_id INT AUTO_INCREMENT PRIMARY KEY, "
|
||||
for item in parameters:
|
||||
if item["recordASField"]:
|
||||
name = item['name']
|
||||
@ -309,25 +366,32 @@ class myMySQL:
|
||||
line[i] = 0.0
|
||||
elif types[i] == "datetime":
|
||||
try:
|
||||
line[i] = datetime.datetime.strptime(line[i], '%Y-%m-%d %H:%M:%S')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
line[i], '%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
line[i] = datetime.datetime.strptime("1970-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
"1970-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')
|
||||
elif types[i] == "date":
|
||||
try:
|
||||
line[i] = datetime.datetime.strptime(line[i], '%Y-%m-%d')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
line[i], '%Y-%m-%d')
|
||||
except:
|
||||
line[i] = datetime.datetime.strptime("1970-01-01", '%Y-%m-%d')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
"1970-01-01", '%Y-%m-%d')
|
||||
elif types[i] == "time":
|
||||
try:
|
||||
line[i] = datetime.datetime.strptime(line[i], '%H:%M:%S')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
line[i], '%H:%M:%S')
|
||||
except:
|
||||
line[i] = datetime.datetime.strptime("00:00:00", '%H:%M:%S')
|
||||
line[i] = datetime.datetime.strptime(
|
||||
"00:00:00", '%H:%M:%S')
|
||||
to_write = []
|
||||
for i in range(len(line)):
|
||||
if record[i]:
|
||||
to_write.append(line[i])
|
||||
# 构造插入数据的 SQL 语句
|
||||
sql = f"INSERT INTO "+ self.table_name +" "+self.field_sql+" VALUES ("
|
||||
sql = f"INSERT INTO " + self.table_name + \
|
||||
" "+self.field_sql+" VALUES ("
|
||||
for item in to_write:
|
||||
sql += "%s, "
|
||||
# 移除最后的逗号并添加闭合的括号
|
||||
@ -341,14 +405,15 @@ class myMySQL:
|
||||
print("插入数据库错误,请查看以上的错误提示,然后检查数据的类型是否正确,是否文本过长(超过一万的文本类型要设置为大文本)。")
|
||||
print("Inserting database error, please check the above error, and then check whether the data type is correct, whether the text is too long (text type over 10,000 should be set to large text).")
|
||||
print("重新执行任务时,请删除数据库中的数据表" + self.table_name + ",然后再次运行程序。")
|
||||
print("When re-executing the task, please delete the data table " + self.table_name + " in the database, and then run the program again.")
|
||||
print("When re-executing the task, please delete the data table " +
|
||||
self.table_name + " in the database, and then run the program again.")
|
||||
|
||||
# 提交到数据库执行
|
||||
self.conn.commit()
|
||||
|
||||
# 关闭游标和连接
|
||||
cursor.close()
|
||||
|
||||
|
||||
def close(self):
|
||||
self.conn.close()
|
||||
print("成功关闭数据库。")
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +0,0 @@
|
||||
{"id":3,"name":"Just a moment...","url":"https://portal.ustraveldocs.com/scheduleappointment","links":"https://portal.ustraveldocs.com/scheduleappointment","create_time":"7/12/2023, 11:21:54 AM","update_time":"7/12/2023, 11:23:01 AM","version":"0.3.5","saveThreshold":10,"cloudflare":1,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://portal.ustraveldocs.com/scheduleappointment","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://portal.ustraveldocs.com/scheduleappointment","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://portal.ustraveldocs.com/scheduleappointment"}],"outputParameters":[{"id":0,"name":"参数2_文本","desc":"","type":"text","recordASField":1,"exampleValue":"\n 使用条款及细则 (Terms & Conditions)\n \n 在本网站所支付的所有费用均不予退还。请确保您已付款,并获得了收据号码。\n 签证不能保证进入美国。\n 签证允许外国公民进入美国口岸并提出入境申请。\n 只有美国国土安全部和美国海关与边境保护局(CBP)官员可以决定签证持有人能否入境。\n 您不能使用过期签证进入美国。当您进入美国时签证必须是有效的。\n \n "}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":15,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://portal.ustraveldocs.com/scheduleappointment","links":"https://portal.ustraveldocs.com/scheduleappointment","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":3,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数2_文本",
"desc":"","extractType":0,"relativeXPath":"/html/body/div[1]/div[1]/div[1]/form[1]/span[1]/div[1]","allXPaths":["/html/body/div[1]/div[1]/div[1]/form[1]/span[1]/div[1]","//div[contains(., '使')]","//DIV[@class='span-6 last']","/html/body/div[last()-5]/div/div/form/span/div"],"exampleValues":[{"num":0,"value":"\n 使用条款及细则 (Terms & Conditions)\n \n 在本网站所支付的所有费用均不予退还。请确保您已付款,并获得了收据号码。\n 签证不能保证进入美国。\n 签证允许外国公民进入美国口岸并提出入境申请。\n 只有美国国土安全部和美国海关与边境保护局(CBP)官员可以决定签证持有人能否入境。\n 您不能使用过期签证进入美国。当您进入美国时签证必须是有效的。\n \n "}],"unique_index":"p3h5p8qfeyljz5n60b","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_windows_x64/tasks/205.json
Normal file
1
.temp_to_pub/EasySpider_windows_x64/tasks/205.json
Normal file
File diff suppressed because one or more lines are too long
1
.temp_to_pub/EasySpider_windows_x64/tasks/206.json
Normal file
1
.temp_to_pub/EasySpider_windows_x64/tasks/206.json
Normal file
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
1
ElectronJS/tasks/205.json
Normal file
1
ElectronJS/tasks/205.json
Normal file
File diff suppressed because one or more lines are too long
1
ElectronJS/tasks/206.json
Normal file
1
ElectronJS/tasks/206.json
Normal file
File diff suppressed because one or more lines are too long
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
||||
"justMyCode": false,
|
||||
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||
"args": ["--id", "[34]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
|
||||
"args": ["--id", "[28]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
|
||||
}
|
||||
]
|
||||
}
|
@ -577,7 +577,12 @@ class BrowserThread(Thread):
|
||||
self.print_and_log(e) # 打印异常信息
|
||||
self.recordLog("Command execution failed")
|
||||
self.recordLog("命令执行失败")
|
||||
return str(output)
|
||||
try:
|
||||
output = str(output)
|
||||
except:
|
||||
output = "无法转换为字符串|Unable to convert to string"
|
||||
self.print_and_log("无法转换为字符串|Unable to convert to string", output)
|
||||
return output
|
||||
|
||||
def customOperation(self, node, loopValue, loopPath, index):
|
||||
paras = node["parameters"]
|
||||
|
Loading…
x
Reference in New Issue
Block a user