This commit is contained in:
naibo 2023-12-24 10:53:14 +08:00
parent 2eeeab89f6
commit 30dc790398
4 changed files with 497 additions and 688 deletions

View File

@ -5,7 +5,7 @@ import copy
import platform
import shutil
import string
import undetected_chromedriver as uc
# import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from myChrome import MyChrome
@ -44,14 +44,24 @@ import sys
# import hashlib
import time
import requests
from ddddocr import DdddOcr
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
try:
from ddddocr import DdddOcr
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
except:
print("OCR识别无法在当前环境下使用ddddocr库缺失请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
from urllib.parse import urljoin
from lxml import etree, html
try:
import pandas as pd
except:
print("数据去重无法在当前环境下使用pandas库缺失请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
time.sleep(1)
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
import pandas as pd
# import numpy
# import pytesseract
# import uuid
@ -651,26 +661,62 @@ class BrowserThread(Thread):
time.sleep(param["scrollWaitTime"]) # 下拉完等待
except:
pass
except:
self.print_and_log('Time out after set seconds when scrolling. ')
except Exception as e:
self.print_and_log("滚动屏幕时出错|Error scrolling screen:", e)
try:
self.browser.execute_script('window.stop()')
except:
pass
if scrollType != 0 and param["scrollCount"] > 0: # 控制屏幕向下滚动
for i in range(param["scrollCount"]):
self.print_and_log(
"Wait for set second after screen scrolling")
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=param["iframe"])
if scrollType == 1:
body.send_keys(Keys.PGDN)
elif scrollType == 2:
if scrollType == 1 or scrollType == 2:
for i in range(param["scrollCount"]):
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=param["iframe"])
if scrollType == 1:
body.send_keys(Keys.PAGE_DOWN)
elif scrollType == 2:
body.send_keys(Keys.END)
try:
time.sleep(param["scrollWaitTime"]) # 下拉完等待
except:
pass
self.print_and_log("向下滚动,第", i + 1, "次。")
self.print_and_log(
"Scroll down, the", i + 1, "time.")
elif scrollType == 3:
bodyText = ""
i = 0
while True:
newBodyText = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=False).text
if param["iframe"]: # 如果标记了iframe
iframes = self.browser.find_elements(
By.CSS_SELECTOR, "iframe", iframe=False)
for iframe in iframes:
self.browser.switch_to.default_content()
self.browser.switch_to.frame(iframe)
iframe_text = super(self.browser.__class__, self.browser).find_element(
By.CSS_SELECTOR, "body").text # 用super调用父类的方法
newBodyText += iframe_text
self.browser.switch_to.default_content()
if newBodyText == bodyText:
self.print_and_log("页面已检测不到新内容,停止滚动。")
self.print_and_log(
"No new content detected on the page, stop scrolling.")
break
else:
bodyText = newBodyText
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=param["iframe"])
body.send_keys(Keys.END)
try:
time.sleep(param["scrollWaitTime"]) # 下拉完等待
except:
pass
self.print_and_log("滚动到底部,第", i + 1, "次。")
self.print_and_log(
"Scroll to the bottom, the", i + 1, "time.")
i = i + 1
try:
time.sleep(param["scrollWaitTime"]) # 下拉完等待
except:
pass
if rt != "":
rt.end()
@ -1552,10 +1598,11 @@ class BrowserThread(Thread):
else:
value = param["value"]
# 将value中的Field[""]替换为outputParameters中的键值
pattern = r'Field\["([^"]+)"\]'
# pattern = r'Field\["([^"]+)"\]'
try:
replaced_text = re.sub(
pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
# replaced_text = re.sub(
# pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
replaced_text = replace_field_values(value, self.outputParameters, self)
replaced_text = re.sub(
'<enter>', '', replaced_text, flags=re.IGNORECASE)
except:
@ -2148,8 +2195,6 @@ class BrowserThread(Thread):
self.OUTPUT.append(line)
if __name__ == '__main__':
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {
@ -2303,7 +2348,13 @@ if __name__ == '__main__':
service = json.loads(content.text) # 加载服务信息
else:
print("local")
with open("execution_instances/" + str(id) + ".json", 'r', encoding='utf-8') as f:
local_folder = os.path.join(os.getcwd(), "execution_instances")
if sys.platform == "darwin":
user_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/")
local_folder = os.path.join(user_folder, "execution_instances")
file_path = os.path.join(local_folder, str(id) + ".json")
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
service = json.loads(content) # 加载服务信息
try:

View File

@ -308,6 +308,18 @@ def replace_field_values(orginal_text, outputParameters, browser=None):
eval_replaced_text = str(eval(match.group(1)))
# 替换eval语句
replaced_text = replaced_text.replace(match.group(0), eval_replaced_text)
if re.search(r'JS\(', replaced_text, re.IGNORECASE): # 如果返回值中包含JS
replaced_text = replaced_text.replace("self.", "browser.")
pattern = re.compile(r'(?i)JS\("(.+?)"\)')
# 循环替换所有匹配到的JS语句
while True:
match = pattern.search(replaced_text)
if not match:
break
# 执行JS并将其结果转换为字符串形式
JS_replaced_text = str(browser.browser.execute_script(match.group(1)))
# 替换JS语句
replaced_text = replaced_text.replace(match.group(0), JS_replaced_text)
except Exception as e:
print("eval替换失败请检查eval语句是否正确。| Failed to replace eval, please check if the eval statement is correct.")
print(e)

File diff suppressed because one or more lines are too long

1068
LICENSE

File diff suppressed because it is too large Load Diff