mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-12 03:27:08 +08:00
CL
This commit is contained in:
parent
2eeeab89f6
commit
30dc790398
@ -5,7 +5,7 @@ import copy
|
||||
import platform
|
||||
import shutil
|
||||
import string
|
||||
import undetected_chromedriver as uc
|
||||
# import undetected_chromedriver as uc
|
||||
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
|
||||
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
|
||||
from myChrome import MyChrome
|
||||
@ -44,14 +44,24 @@ import sys
|
||||
# import hashlib
|
||||
import time
|
||||
import requests
|
||||
from ddddocr import DdddOcr
|
||||
from multiprocessing import freeze_support
|
||||
freeze_support() # 防止无限死循环多开
|
||||
try:
|
||||
from ddddocr import DdddOcr
|
||||
import onnxruntime
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
except:
|
||||
print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
|
||||
print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
|
||||
from urllib.parse import urljoin
|
||||
from lxml import etree, html
|
||||
try:
|
||||
import pandas as pd
|
||||
except:
|
||||
print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
|
||||
print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
|
||||
time.sleep(1)
|
||||
|
||||
import onnxruntime
|
||||
|
||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||
import pandas as pd
|
||||
# import numpy
|
||||
# import pytesseract
|
||||
# import uuid
|
||||
@ -651,26 +661,62 @@ class BrowserThread(Thread):
|
||||
time.sleep(param["scrollWaitTime"]) # 下拉完等待
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
self.print_and_log('Time out after set seconds when scrolling. ')
|
||||
except Exception as e:
|
||||
self.print_and_log("滚动屏幕时出错|Error scrolling screen:", e)
|
||||
try:
|
||||
self.browser.execute_script('window.stop()')
|
||||
except:
|
||||
pass
|
||||
if scrollType != 0 and param["scrollCount"] > 0: # 控制屏幕向下滚动
|
||||
for i in range(param["scrollCount"]):
|
||||
self.print_and_log(
|
||||
"Wait for set second after screen scrolling")
|
||||
body = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=param["iframe"])
|
||||
if scrollType == 1:
|
||||
body.send_keys(Keys.PGDN)
|
||||
elif scrollType == 2:
|
||||
if scrollType == 1 or scrollType == 2:
|
||||
for i in range(param["scrollCount"]):
|
||||
body = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=param["iframe"])
|
||||
if scrollType == 1:
|
||||
body.send_keys(Keys.PAGE_DOWN)
|
||||
elif scrollType == 2:
|
||||
body.send_keys(Keys.END)
|
||||
try:
|
||||
time.sleep(param["scrollWaitTime"]) # 下拉完等待
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("向下滚动,第", i + 1, "次。")
|
||||
self.print_and_log(
|
||||
"Scroll down, the", i + 1, "time.")
|
||||
elif scrollType == 3:
|
||||
bodyText = ""
|
||||
i = 0
|
||||
while True:
|
||||
newBodyText = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=False).text
|
||||
if param["iframe"]: # 如果标记了iframe
|
||||
iframes = self.browser.find_elements(
|
||||
By.CSS_SELECTOR, "iframe", iframe=False)
|
||||
for iframe in iframes:
|
||||
self.browser.switch_to.default_content()
|
||||
self.browser.switch_to.frame(iframe)
|
||||
iframe_text = super(self.browser.__class__, self.browser).find_element(
|
||||
By.CSS_SELECTOR, "body").text # 用super调用父类的方法
|
||||
newBodyText += iframe_text
|
||||
self.browser.switch_to.default_content()
|
||||
if newBodyText == bodyText:
|
||||
self.print_and_log("页面已检测不到新内容,停止滚动。")
|
||||
self.print_and_log(
|
||||
"No new content detected on the page, stop scrolling.")
|
||||
break
|
||||
else:
|
||||
bodyText = newBodyText
|
||||
body = self.browser.find_element(
|
||||
By.CSS_SELECTOR, "body", iframe=param["iframe"])
|
||||
body.send_keys(Keys.END)
|
||||
try:
|
||||
time.sleep(param["scrollWaitTime"]) # 下拉完等待
|
||||
except:
|
||||
pass
|
||||
self.print_and_log("滚动到底部,第", i + 1, "次。")
|
||||
self.print_and_log(
|
||||
"Scroll to the bottom, the", i + 1, "time.")
|
||||
i = i + 1
|
||||
try:
|
||||
time.sleep(param["scrollWaitTime"]) # 下拉完等待
|
||||
except:
|
||||
pass
|
||||
if rt != "":
|
||||
rt.end()
|
||||
|
||||
@ -1552,10 +1598,11 @@ class BrowserThread(Thread):
|
||||
else:
|
||||
value = param["value"]
|
||||
# 将value中的Field[""]替换为outputParameters中的键值
|
||||
pattern = r'Field\["([^"]+)"\]'
|
||||
# pattern = r'Field\["([^"]+)"\]'
|
||||
try:
|
||||
replaced_text = re.sub(
|
||||
pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
|
||||
# replaced_text = re.sub(
|
||||
# pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
|
||||
replaced_text = replace_field_values(value, self.outputParameters, self)
|
||||
replaced_text = re.sub(
|
||||
'<enter>', '', replaced_text, flags=re.IGNORECASE)
|
||||
except:
|
||||
@ -2148,8 +2195,6 @@ class BrowserThread(Thread):
|
||||
self.OUTPUT.append(line)
|
||||
|
||||
if __name__ == '__main__':
|
||||
from multiprocessing import freeze_support
|
||||
freeze_support() # 防止无限死循环多开
|
||||
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
|
||||
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
|
||||
config = {
|
||||
@ -2303,7 +2348,13 @@ if __name__ == '__main__':
|
||||
service = json.loads(content.text) # 加载服务信息
|
||||
else:
|
||||
print("local")
|
||||
with open("execution_instances/" + str(id) + ".json", 'r', encoding='utf-8') as f:
|
||||
local_folder = os.path.join(os.getcwd(), "execution_instances")
|
||||
if sys.platform == "darwin":
|
||||
user_folder = os.path.expanduser(
|
||||
"~/Library/Application Support/EasySpider/")
|
||||
local_folder = os.path.join(user_folder, "execution_instances")
|
||||
file_path = os.path.join(local_folder, str(id) + ".json")
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
service = json.loads(content) # 加载服务信息
|
||||
try:
|
||||
|
@ -308,6 +308,18 @@ def replace_field_values(orginal_text, outputParameters, browser=None):
|
||||
eval_replaced_text = str(eval(match.group(1)))
|
||||
# 替换eval语句
|
||||
replaced_text = replaced_text.replace(match.group(0), eval_replaced_text)
|
||||
if re.search(r'JS\(', replaced_text, re.IGNORECASE): # 如果返回值中包含JS
|
||||
replaced_text = replaced_text.replace("self.", "browser.")
|
||||
pattern = re.compile(r'(?i)JS\("(.+?)"\)')
|
||||
# 循环替换所有匹配到的JS语句
|
||||
while True:
|
||||
match = pattern.search(replaced_text)
|
||||
if not match:
|
||||
break
|
||||
# 执行JS并将其结果转换为字符串形式
|
||||
JS_replaced_text = str(browser.browser.execute_script(match.group(1)))
|
||||
# 替换JS语句
|
||||
replaced_text = replaced_text.replace(match.group(0), JS_replaced_text)
|
||||
except Exception as e:
|
||||
print("eval替换失败,请检查eval语句是否正确。| Failed to replace eval, please check if the eval statement is correct.")
|
||||
print(e)
|
||||
|
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user