Iframe Nested

This commit is contained in:
naibo 2023-12-08 06:08:45 +08:00
parent ab0fad5b5a
commit c3773848c3
16 changed files with 156 additions and 138 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -12,8 +12,8 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
// "args": ["--ids", "[1]", "--headless", "0", "--user_data", "1", "--keyboard", "1"]
"args": "--ids '[3]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
"args": ["--ids", "[149]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}
]
}

View File

@ -2,6 +2,7 @@
# import atexit
import atexit
import copy
import platform
import shutil
import string
import undetected_chromedriver as uc
@ -1711,6 +1712,7 @@ class BrowserThread(Thread):
p["relativeXPath"], self.outputParameters, self)
# Fast (batched) data extraction is only possible when the iframe environment does not change between fields
if self.browser.iframe_env != p["iframe"]:
# if p["iframe"] or self.browser.iframe_env != p["iframe"]: # 如果是iframe则不能快速提取数据主要是各个上下文的iframe切换但一般不会有人这么做
p["optimizable"] = False
continue
# relativeXPath = relativeXPath.lower()
@ -1820,7 +1822,7 @@ class BrowserThread(Thread):
element = self.browser.find_element(
By.XPATH, relativeXPath, iframe=p["iframe"])
except (
NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
NoSuchElementException, InvalidSelectorException, StaleElementReferenceException) as e: # 找不到元素的时候,使用默认值
# self.print_and_log(p)
try:
content = p["default"]
@ -1835,6 +1837,7 @@ class BrowserThread(Thread):
self.print_and_log(
"提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
p["name"], relativeXPath))
self.dataNotFoundKeys[p["name"]] = True
except:
pass
continue
@ -1916,92 +1919,57 @@ if __name__ == '__main__':
print(c)
options = webdriver.ChromeOptions()
driver_path = "chromedriver.exe"
import platform
print(sys.platform, platform.architecture())
# option = webdriver.ChromeOptions()
if not os.path.exists(os.getcwd() + "/Data"):
os.mkdir(os.getcwd() + "/Data")
if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# MacOS需要用option而不是options
# option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# option.add_extension(
# "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# # MacOS需要用option而不是options
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# driver_path = os.getcwd()+ "/chromedriver_mac64"
print(driver_path)
if c.config_folder == "":
c.config_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd() + "/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider",
os.getcwd() + "/EasySpider")
if sys.platform == "win32" and platform.architecture()[0] == "32bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
# option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
else:
print("Unsupported platform")
sys.exit()
print("Chrome location:", options.binary_location)
print("Chromedriver location:", driver_path)
# elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
# print("Finding chromedriver in ./Chrome",
# os.getcwd()+"/Chrome")
# options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
# # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
# driver_path = "./Chrome/chromedriver.exe"
elif os.path.exists(os.getcwd() + "/../ElectronJS"):
# 软件dev用
print("Finding chromedriver in EasySpider",
os.getcwd() + "/ElectronJS")
# option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
# option.add_extension("../ElectronJS/XPathHelper.crx")
options.add_extension("../ElectronJS/XPathHelper.crx")
else:
options.binary_location = "./chrome.exe" # 指定chrome位置
# option.binary_location = "./chrome.exe" # 指定chrome位置
driver_path = "./chromedriver.exe"
# option.add_extension("XPathHelper.crx")
options.add_extension("XPathHelper.crx")
# option.add_experimental_option(
# 'excludeSwitches', ['enable-automation']) # 以开发者模式
options.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式
# user_data_dir = r'' # 注意没有Default
# options.add_argument('--user-data-dir='+p)
# 总结:
# 0. 带Cookie需要用userdatadir
@ -2018,22 +1986,15 @@ if __name__ == '__main__':
except:
pass
# options.add_argument(
# '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
# option.add_argument(
# "--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors')
# option.add_argument('-ignore-certificate-errors')
# option.add_argument('-ignore -ssl-errors')
if c.headless:
print("Headless mode")
print("无头模式")
# option.add_argument("--headless")
options.add_argument("--headless")
tmp_options = []
@ -2058,11 +2019,7 @@ if __name__ == '__main__':
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
# option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
# option.add_argument(
# f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
# option.add_argument("--profile-directory=Default")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
@ -2074,7 +2031,6 @@ if __name__ == '__main__':
threads = []
for i in range(len(c.ids)):
id = c.ids[i]
# option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
print("id: ", id)
if c.read_type == "remote":
@ -2100,7 +2056,6 @@ if __name__ == '__main__':
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
# option.add_argument('log-level=3') # 隐藏日志
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
print("Data path:", path)
options.add_experimental_option("prefs", {
@ -2116,37 +2071,17 @@ if __name__ == '__main__':
'safebrowsing.disable_download_protection': True,
'profile.default_content_settings.popups': 0,
})
# option.add_experimental_option("prefs", {
# # 设置文件下载路径
# "download.default_directory": path,
# "download.prompt_for_download": False, # 禁止下载提示框
# "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
# "download.directory_upgrade": True,
# "download.extensions_to_open": "applications/pdf",
# "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
# "safebrowsing_for_trusted_sources_enabled": False,
# "safebrowsing.enabled": False,
# 'safebrowsing.enabled': False,
# 'safebrowsing.disable_download_protection': True,
# 'profile.default_content_settings.popups': 0,
# })
try:
if service["environment"] == 1:
# option.add_experimental_option(
# 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
options.add_experimental_option(
'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
except:
pass
# browser_t = MyChrome(
# options=options, chrome_options=option, executable_path=driver_path)
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
elif cloudflare == 1:
if sys.platform == "win32":
options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
# options.add_argument("--auto-open-devtools-for-tabs")
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
browser_t = MyUCChrome(
options=options, driver_executable_path=driver_path)
links = list(filter(isnotnull, service["links"].split("\n")))
@ -2200,8 +2135,6 @@ if __name__ == '__main__':
# print("您的操作系统不支持暂停功能。")
# print("Your operating system does not support the pause function.")
# print("线程长度:", len(threads) )
for thread in threads:
print()
thread.join()

View File

@ -1,6 +1,6 @@
rmdir /s /q build
rmdir /s /q dist
@REM pyinstaller -F --icon=favicon.ico easyspider_executestage.py
pyinstaller -F --icon=favicon.ico --add-data "C:\Python311\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Python311\Lib\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
pyinstaller -F --icon=favicon.ico --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
del ..\ElectronJS\chrome_win64\easyspider_executestage.exe
copy dist\easyspider_executestage.exe ..\ElectronJS\chrome_win64\easyspider_executestage.exe

View File

@ -25,75 +25,150 @@ class MyChrome(webdriver.Chrome):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
# def find_element(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为
# if self.iframe_env:
# super().switch_to.default_content()
# self.iframe_env = False
# if iframe:
# # 获取所有的 iframe
# try:
# iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
# except Exception as e:
# print(e)
# find_element = False
# # 遍历所有的 iframe 并查找里面的元素
# for iframe in iframes:
# # 切换到 iframe
# super().switch_to.default_content()
# super().switch_to.frame(iframe)
# self.iframe_env = True
# try:
# # 在 iframe 中查找元素
# # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
# element = super().find_element(by=by, value=value)
# find_element = True
# except NoSuchElementException as e:
# print(f"No such element found in the iframe: {str(e)}")
# except Exception as e:
# print(f"Exception: {str(e)}")
# # 完成操作后切回主文档
# # super().switch_to.default_content()
# if find_element:
# return element
# if not find_element:
# raise NoSuchElementException
# else:
# return super().find_element(by=by, value=value)
def find_element_recursive(self, by, value, frames):
    """Search the given frames, and the iframes nested in them, for one element.

    The search is depth-first: each frame is entered, searched directly, and
    then its own nested iframes are searched before the next sibling is tried.

    Args:
        by: Locator strategy (a ``By`` constant).
        value: Locator value, e.g. an XPath expression.
        frames: Frame elements that were located in the browsing context
            that is current when this method is called.

    Returns:
        The first matching element. The driver stays switched into the frame
        that contains it, so the caller can keep working with the element.

    Raises:
        NoSuchElementException: If no frame in ``frames`` (or nested inside
            them) contains a matching element.
    """
    for frame in frames:
        try:
            try:
                self.switch_to.frame(frame)
            except StaleElementReferenceException:
                # The frame handle was not usable from the current context
                # (e.g. the frame has been refreshed): step up one level and
                # retry once.
                self.switch_to.parent_frame()
                self.switch_to.frame(frame)
        except Exception as e:
            # This frame could not be entered; report it and try the next one.
            print(f"Exception while processing frame: {e}")
            continue

        try:
            try:
                # Use the parent-class lookup on purpose: self.find_element
                # would switch back to the default content first, whereas the
                # search must happen inside the frame we just entered.
                return super(MyChrome, self).find_element(by=by, value=value)
            except NoSuchElementException:
                # Not directly in this frame: recurse into nested iframes.
                nested_frames = super(MyChrome, self).find_elements(
                    By.CSS_SELECTOR, "iframe")
                if nested_frames:
                    return self.find_element_recursive(by, value, nested_frames)
        except NoSuchElementException:
            # The nested search found nothing; this is the normal "keep
            # looking" outcome, not an error worth reporting.
            pass
        except Exception as e:
            print(f"Exception while processing frame: {e}")

        # Leave the frame that was just searched so that the remaining sibling
        # handles (located in the parent context) are valid again. Without
        # this, siblings were only reachable through the one-level stale
        # fallback above, which breaks after a search two or more levels deep.
        try:
            self.switch_to.parent_frame()
        except Exception as e:
            print(f"Exception while processing frame: {e}")

    raise NoSuchElementException(f"Element {value} not found in any frame or iframe")
def find_element(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.switch_to.default_content() # Switch back to the main document
self.iframe_env = False
if iframe:
# 获取所有的 iframe
try:
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
except Exception as e:
print(e)
find_element = False
# 遍历所有的 iframe 并查找里面的元素
for iframe in iframes:
# 切换到 iframe
super().switch_to.default_content()
super().switch_to.frame(iframe)
frames = self.find_elements(By.CSS_SELECTOR, "iframe")
if not frames:
raise NoSuchElementException(f"No iframes found in the current page while searching for {value}")
self.iframe_env = True
try:
# 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value)
find_element = True
except NoSuchElementException as e:
print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
return element
if not find_element:
raise NoSuchElementException
return self.find_element_recursive(by, value, frames)
else:
return super().find_element(by=by, value=value)
# Find element in the main document as normal
return super(MyChrome, self).find_element(by=by, value=value)
# def find_elements(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为
# if self.iframe_env:
# super().switch_to.default_content()
# self.iframe_env = False
# if iframe:
# # 获取所有的 iframe
# iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
# find_element = False
# # 遍历所有的 iframe 并找到里面的元素
# for iframe in iframes:
# # 切换到 iframe
# try:
# super().switch_to.default_content()
# super().switch_to.frame(iframe)
# self.iframe_env = True
# # 在 iframe 中查找元素
# # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
# elements = super().find_elements(by=by, value=value)
# if len(elements) > 0:
# find_element = True
# # 完成操作后切回主文档
# # super().switch_to.default_content()
# if find_element:
# return elements
# except NoSuchElementException as e:
# print(f"No such element found in the iframe: {str(e)}")
# except Exception as e:
# print(f"Exception: {str(e)}")
# if not find_element:
# raise NoSuchElementException
# else:
# return super().find_elements(by=by, value=value)
def find_elements_recursive(self, by, value, frames):
    """Search the given frames, and the iframes nested in them, for elements.

    The search is depth-first and stops at the first frame that yields a
    non-empty result; matches from different frames are never merged.

    Args:
        by: Locator strategy (a ``By`` constant).
        value: Locator value, e.g. an XPath expression.
        frames: Frame elements that were located in the browsing context
            that is current when this method is called.

    Returns:
        The non-empty list of matching elements from the first frame that
        contains any. The driver stays switched into that frame.

    Raises:
        NoSuchElementException: If no frame in ``frames`` (or nested inside
            them) contains a matching element.
    """
    for frame in frames:
        try:
            try:
                self.switch_to.frame(frame)
            except StaleElementReferenceException:
                # The frame handle was not usable from the current context
                # (e.g. the frame has been refreshed): step up one level and
                # retry once.
                self.switch_to.parent_frame()
                self.switch_to.frame(frame)
        except Exception as e:
            # This frame could not be entered; report it and try the next one.
            print(f"Exception while processing frame: {e}")
            continue

        try:
            # Use the parent-class lookup on purpose: self.find_elements would
            # switch back to the default content before searching.
            elements = super(MyChrome, self).find_elements(by=by, value=value)
            if elements:
                return elements
            # Nothing directly in this frame: recurse into nested iframes.
            nested_frames = super(MyChrome, self).find_elements(
                By.CSS_SELECTOR, "iframe")
            if nested_frames:
                return self.find_elements_recursive(by, value, nested_frames)
        except NoSuchElementException:
            # The nested search found nothing; this is the normal "keep
            # looking" outcome, not an error worth reporting.
            pass
        except Exception as e:
            print(f"Exception while processing frame: {e}")

        # Leave the frame that was just searched so that the remaining sibling
        # handles (located in the parent context) are valid again. Without
        # this, siblings were only reachable through the one-level stale
        # fallback above, which breaks after a search two or more levels deep.
        try:
            self.switch_to.parent_frame()
        except Exception as e:
            print(f"Exception while processing frame: {e}")

    raise NoSuchElementException(f"Elements with {value} not found in any frame or iframe")
def find_elements(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.switch_to.default_content() # Switch back to the main document
self.iframe_env = False
if iframe:
# 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False
# 遍历所有的 iframe 并找到里面的元素
for iframe in iframes:
# 切换到 iframe
try:
super().switch_to.default_content()
super().switch_to.frame(iframe)
frames = self.find_elements(By.CSS_SELECTOR, "iframe")
if not frames:
return [] # Return an empty list if no iframes are found
self.iframe_env = True
# 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value)
if len(elements) > 0:
find_element = True
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
return elements
except NoSuchElementException as e:
print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
if not find_element:
raise NoSuchElementException
return self.find_elements_recursive(by, value, frames)
else:
return super().find_elements(by=by, value=value)
# Find elements in the main document as normal
return super(MyChrome, self).find_elements(by=by, value=value)
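# macOS does not support packaging the Cloudflare feature directly. If you build and run it yourself, you can remove this `if` and then configure the browser and driver paths.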
if sys.platform != "darwin":

View File

@ -1,12 +1,12 @@
{
"name": "EasySpider",
"version": "0.5.0",
"version": "0.6.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "EasySpider",
"version": "0.5.0",
"version": "0.6.0",
"license": "AGPL-3.0",
"dependencies": {
"crx": "^5.0.1",

View File

@ -1 +1 @@
{"language":"zh"}
{"language":"en"}