From a365783e417fbc9f94c76b923a9527dc930a3af9 Mon Sep 17 00:00:00 2001 From: naibo Date: Fri, 5 Jan 2024 17:52:50 +0800 Subject: [PATCH] Auto Rename Download Files --- ElectronJS/tasks/228.json | 2 +- ExecuteStage/.vscode/launch.json | 2 +- ExecuteStage/easyspider_executestage.py | 8 ++-- ExecuteStage/utils.py | 55 ++++++++++--------------- 4 files changed, 29 insertions(+), 38 deletions(-) diff --git a/ElectronJS/tasks/228.json b/ElectronJS/tasks/228.json index 2abf833..57b3718 100644 --- a/ElectronJS/tasks/228.json +++ b/ElectronJS/tasks/228.json @@ -1 +1 @@ -{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2023-12-28 14:20:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":3,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]} \ No newline at end of file +{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2024-01-05 17:40:31","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"},{"id":1,"name":"loopTimes_1","nodeId":5,"nodeName":"循环 - 单个元素","desc":"循环循环 - 单个元素执行的次数(0代表无限循环)","type":"int","exampleValue":10,"value":10}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,5],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":3,"index":2,"parentId":2,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":-1,"index":4,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":2,"index":5,"parentId":0,"type":1,"option":8,"title":"循环 - 单个元素","sequence":[2],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"//body","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":0,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":10,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0}}]} \ No newline at end of file diff --git a/ExecuteStage/.vscode/launch.json b/ExecuteStage/.vscode/launch.json index f0546e9..0363494 100644 --- a/ExecuteStage/.vscode/launch.json +++ b/ExecuteStage/.vscode/launch.json @@ -12,7 +12,7 @@ "justMyCode": false, // "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] - "args": ["--ids", "[75]", "--headless", "0", "--user_data", "0", "--keyboard", "0", + "args": ["--ids", "[77, 78]", "--headless", "0", "--user_data", "0", "--keyboard", "0", "--read_type", "remote"] // "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name" } diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index 4394cd3..3acce6f 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -5,9 +5,10 @@ import copy import platform import shutil import string +import threading # import undetected_chromedriver as uc from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \ - on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json + on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json from myChrome import MyChrome from threading import Thread, Event from PIL import Image @@ -147,8 +148,9 @@ class BrowserThread(Thread): self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command') path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id), self.saveName) self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}} - - self.browser.execute("send_command", self.paramss) # 下载地址改变 + self.browser.execute("send_command", self.paramss) # 下载目录改变 + monitor_thread = threading.Thread(target=rename_downloaded_file, args=(path, )) #path后面的逗号不能省略,是元组固定写法 + monitor_thread.start() # self.browser.get('about:blank') self.procedure = service["graph"] # 程序执行流程 try: diff --git a/ExecuteStage/utils.py b/ExecuteStage/utils.py index 690e5a7..f25fd02 100644 --- a/ExecuteStage/utils.py +++ b/ExecuteStage/utils.py @@ -59,41 +59,30 @@ def send_email(config): smtp_server.quit() except: pass - - -def wait_for_download_complete(download_dir, timeout=3600): - """等待下载完成,直到没有.crdownload文件为止,或者超时""" - while True: - time.sleep(1) # 每一秒检查一次 - timeout -= 1 - is_downloading = False - for fname in os.listdir(download_dir): - if fname.endswith('.crdownload'): - is_downloading = True - break - # 如果没有下载或超时,则退出 - if not is_downloading or timeout <= 0: - break - elif timeout % 10 == 0: - print(f"下载文件中,请等待...|Downloading in progress, please wait... {timeout} seconds left") - print("可以在点击元素选项中设置下载超时时间。|You can set the download timeout in the 'Click Element' option.") - if is_downloading: - print("下载可能未完成,但已经超时。|Download may not be completed, but it has timed out.") - else: - print("下载完成。|Download completed.") - + def rename_downloaded_file(download_dir): - """重命名下载文件,假设是最新下载的文件""" - files = os.listdir(download_dir) - paths = [os.path.join(download_dir, basename) for basename in files] - latest_file = max(paths, key=os.path.getmtime, default=None) + original_files = set(os.listdir(download_dir)) - if latest_file is not None and not latest_file.endswith('.crdownload'): - new_name = latest_file.split('/')[-1] + '_' + str(uuid.uuid4()) + '_' + latest_file.split('/')[-1] - new_path = os.path.join(download_dir, new_name) - os.rename(latest_file, new_path) - print(f"文件已重命名为: {new_path}") - print(f"File has been renamed to: {new_path}") + while True: + files = os.listdir(download_dir) + for file in files: + if file in original_files: + continue # 跳过原始文件和已重命名的文件 + + full_path = os.path.join(download_dir, file) + + if not full_path.endswith('.crdownload') and not full_path.endswith('.htm') and not full_path.endswith('.html'): + new_name = file.split('/')[-1] + '_' + str(uuid.uuid4()) + '_' + file.split('/')[-1] + new_path = os.path.join(download_dir, new_name) + try: + os.rename(full_path, new_path) + original_files.add(new_name) # 记录新文件名以避免再次重命名 + print(f"文件已重命名为|File has been renamed to: {new_path}") + except: + print("文件重命名失败|File rename failed") + + time.sleep(1) # 每一秒检查一次 + # print("下载文件重命名监控中,请等待...|Download file rename monitoring, please wait...") def is_valid_url(url): try: