mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-22 08:27:27 +08:00
Remove Duplicate Data!!!
This commit is contained in:
parent
fbdffb1eec
commit
0b0fca5fcf
@ -721,7 +721,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
|
|||||||
<label>Remove duplicates after execution (note that this function will be executed at the end of the task, and leaving the task midway will not perform deduplication):</label>
|
<label>Remove duplicates after execution (note that this function will be executed at the end of the task, and leaving the task midway will not perform deduplication):</label>
|
||||||
<select id="removeDuplicate" name="removeDuplicate" class="form-control">
|
<select id="removeDuplicate" name="removeDuplicate" class="form-control">
|
||||||
<option value="0">No</option>
|
<option value="0">No</option>
|
||||||
<option value="1">Yes</option>
|
<option value="1">Yes (Note that the file name above should be a fixed name rather than 'current_time', and the execution ID for each task execution should be the same)</option>
|
||||||
</select>
|
</select>
|
||||||
<label>To modify the input parameters of each operation during execution, read the following Excel (.xlsx) file. Please click the "Read Input Parameters from Excel File" button when calling the task to view the file format:</label>
|
<label>To modify the input parameters of each operation during execution, read the following Excel (.xlsx) file. Please click the "Read Input Parameters from Excel File" button when calling the task to view the file format:</label>
|
||||||
<input spellcheck=false onkeydown="inputDelete(event)" id="inputExcel" name="inputExcel" class="form-control" placeholder="If left empty, input parameters will not be read from Excel. The file path is relative to the EasySpider folder, e.g., inputs/task1.xlsx"></input>
|
<input spellcheck=false onkeydown="inputDelete(event)" id="inputExcel" name="inputExcel" class="form-control" placeholder="If left empty, input parameters will not be read from Excel. The file path is relative to the EasySpider folder, e.g., inputs/task1.xlsx"></input>
|
||||||
|
@ -721,7 +721,7 @@ print(emotlib.emoji()) # 使用其中的函数。
|
|||||||
<label>执行完成后是否去除重复数据(注意此功能需要等到任务结束时执行,因此执行任务中途退出将无法进行去重):</label>
|
<label>执行完成后是否去除重复数据(注意此功能需要等到任务结束时执行,因此执行任务中途退出将无法进行去重):</label>
|
||||||
<select id="removeDuplicate" name="removeDuplicate" class="form-control">
|
<select id="removeDuplicate" name="removeDuplicate" class="form-control">
|
||||||
<option value = 0>否</option>
|
<option value = 0>否</option>
|
||||||
<option value = 1>是</option>
|
<option value = 1>是(注意上方文件名应该为固定名称而不是current_time,同时每次执行任务时的执行ID要为同一个)</option>
|
||||||
</select>
|
</select>
|
||||||
<label>执行时通过读取以下Excel(.xlsx)文件来修改各个操作的输入参数,文件格式请在调用任务时点击“从Excel文件读取输入参数”按钮查看:</label>
|
<label>执行时通过读取以下Excel(.xlsx)文件来修改各个操作的输入参数,文件格式请在调用任务时点击“从Excel文件读取输入参数”按钮查看:</label>
|
||||||
<input spellcheck=false onkeydown="inputDelete(event)" id="inputExcel" name="inputExcel" class="form-control" placeholder="为空则不从Excel读取输入参数,文件路径相对于EasySpider文件夹,如inputs/task1.xlsx"></input>
|
<input spellcheck=false onkeydown="inputDelete(event)" id="inputExcel" name="inputExcel" class="form-control" placeholder="为空则不从Excel读取输入参数,文件路径相对于EasySpider文件夹,如inputs/task1.xlsx"></input>
|
||||||
|
@ -264,7 +264,7 @@
|
|||||||
<label style="margin-top: 15px;display: block">{{"如果想进行更复杂的操作,如设置无头模式,设置定时执行等,请使用下方的命令行执行任务选项并配置好命令行参数。~ If you want to perform more complex operations, such as setting headless mode, setting scheduled execution, etc., please use the command line to execute the task and configure the command line parameters below." | lang}}</label>
|
<label style="margin-top: 15px;display: block">{{"如果想进行更复杂的操作,如设置无头模式,设置定时执行等,请使用下方的命令行执行任务选项并配置好命令行参数。~ If you want to perform more complex operations, such as setting headless mode, setting scheduled execution, etc., please use the command line to execute the task and configure the command line parameters below." | lang}}</label>
|
||||||
</label>
|
</label>
|
||||||
<div style="margin-bottom: 10px;">
|
<div style="margin-bottom: 10px;">
|
||||||
<label style="margin-top: 10px;">{{"Execution ID (EID), execution files are stored in 'execution_instances' folder:~执行ID(执行文件存放在execution_instances文件夹内):" | lang}}</label>
|
<label style="margin-top: 10px;">{{"Execution ID (EID), execution files are stored in the 'execution_instances' folder; you can write the EID yourself and set the filename to something other than 'current_time' to append content to the existing file for that EID and achieve incremental collection:~执行ID(执行文件存放在execution_instances文件夹内,提前在下方写好执行ID且文件名不为current_time时可以追加文件内容以实现增量采集):" | lang}}</label>
|
||||||
<input class="form-control" v-model="ID" :placeholder="LANG('如果已在此处写/生成了ID号,则点击执行或获得ID按钮后,任务ID将保持不变且原任务文件将会被新配置覆盖','If already have ID here, the task ID will remain unchanged and the original task file will be overwritten by the new configuration after click buttons')"></input>
|
<input class="form-control" v-model="ID" :placeholder="LANG('如果已在此处写/生成了ID号,则点击执行或获得ID按钮后,任务ID将保持不变且原任务文件将会被新配置覆盖','If already have ID here, the task ID will remain unchanged and the original task file will be overwritten by the new configuration after click buttons')"></input>
|
||||||
<p></p>
|
<p></p>
|
||||||
<!-- <p>提示:点击下方按钮获得任务ID,然后根据此ID进行服务执行;也可自己POST调用接口得到ID,具体参照POST调用文档。</p> -->
|
<!-- <p>提示:点击下方按钮获得任务ID,然后根据此ID进行服务执行;也可自己POST调用接口得到ID,具体参照POST调用文档。</p> -->
|
||||||
|
@ -1 +1 @@
|
|||||||
{"id":149,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/7/2023, 6:36:49 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"mysql","saveName":"京东","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., 
'手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"arguments[0].innerText = \"'\" + arguments[0].innerText + '\"'","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
{"id":149,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/7/2023, 6:36:49 AM","update_time":"12/20/2023, 4:03:13 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"mysql","saveName":"京东","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"List of URLs to be collected, separated by \\n for multiple lines","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","
breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"arguments[0].innerText = \"'\" + arguments[0].innerText + '\"'","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
|
File diff suppressed because one or more lines are too long
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
|||||||
"justMyCode": false,
|
"justMyCode": false,
|
||||||
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||||
"args": ["--ids", "[30]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
"args": ["--ids", "[40]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
|
||||||
"--read_type", "remote"]
|
"--read_type", "remote"]
|
||||||
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
|
||||||
}
|
}
|
||||||
|
@ -47,10 +47,11 @@ import requests
|
|||||||
from ddddocr import DdddOcr
|
from ddddocr import DdddOcr
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
|
||||||
import onnxruntime
|
import onnxruntime
|
||||||
|
|
||||||
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
|
||||||
# import pandas as pd
|
import pandas as pd
|
||||||
# import numpy
|
# import numpy
|
||||||
# import pytesseract
|
# import pytesseract
|
||||||
# import uuid
|
# import uuid
|
||||||
@ -481,6 +482,26 @@ class BrowserThread(Thread):
|
|||||||
if removeDuplicateData == 1:
|
if removeDuplicateData == 1:
|
||||||
self.print_and_log("正在去除重复数据,请稍后……")
|
self.print_and_log("正在去除重复数据,请稍后……")
|
||||||
self.print_and_log("Removing duplicate data, please wait...")
|
self.print_and_log("Removing duplicate data, please wait...")
|
||||||
|
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "json" or self.outputFormat == "xlsx":
|
||||||
|
file_name = "Data/Task_" + \
|
||||||
|
str(self.id) + "/" + self.saveName + \
|
||||||
|
'.' + self.outputFormat
|
||||||
|
if self.outputFormat == "csv" or self.outputFormat == "txt":
|
||||||
|
df = pd.read_csv(file_name)
|
||||||
|
df.drop_duplicates(inplace=True)
|
||||||
|
df.to_csv(file_name, index=False)
|
||||||
|
elif self.outputFormat == "xlsx":
|
||||||
|
df = pd.read_excel(file_name)
|
||||||
|
df.drop_duplicates(inplace=True)
|
||||||
|
df.to_excel(file_name, index=False)
|
||||||
|
elif self.outputFormat == "json":
|
||||||
|
df = pd.read_json(file_name)
|
||||||
|
df.drop_duplicates(inplace=True)
|
||||||
|
df.to_json(file_name, orient="records", force_ascii=False)
|
||||||
|
elif self.outputFormat == "mysql":
|
||||||
|
self.mysql.remove_duplicate_data()
|
||||||
|
self.print_and_log("去重完成。")
|
||||||
|
self.print_and_log("Duplicate data removed.")
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
# 挨个执行程序
|
# 挨个执行程序
|
||||||
@ -497,13 +518,13 @@ class BrowserThread(Thread):
|
|||||||
self.print_and_log("Done!")
|
self.print_and_log("Done!")
|
||||||
self.print_and_log("执行完成!")
|
self.print_and_log("执行完成!")
|
||||||
self.saveData(exit=True)
|
self.saveData(exit=True)
|
||||||
|
self.removeDuplicateData()
|
||||||
if self.outputFormat == "mysql":
|
if self.outputFormat == "mysql":
|
||||||
self.mysql.close()
|
self.mysql.close()
|
||||||
try:
|
try:
|
||||||
quitWaitTime = self.service["quitWaitTime"]
|
quitWaitTime = self.service["quitWaitTime"]
|
||||||
except:
|
except:
|
||||||
quitWaitTime = 60
|
quitWaitTime = 60
|
||||||
self.removeDuplicateData()
|
|
||||||
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
|
self.print_and_log(f"任务执行完毕,将在{quitWaitTime}秒后自动退出浏览器并清理临时用户目录,等待时间可在保存任务对话框中设置。")
|
||||||
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
|
self.print_and_log(f"The task is completed, the browser will exit automatically and the temporary user directory will be cleaned up after {quitWaitTime} seconds, the waiting time can be set in the save task dialog.")
|
||||||
time.sleep(quitWaitTime)
|
time.sleep(quitWaitTime)
|
||||||
|
@ -556,6 +556,10 @@ class myMySQL:
|
|||||||
sql = "CREATE TABLE " + table_name + \
|
sql = "CREATE TABLE " + table_name + \
|
||||||
" (_id INT AUTO_INCREMENT PRIMARY KEY, "
|
" (_id INT AUTO_INCREMENT PRIMARY KEY, "
|
||||||
for item in parameters:
|
for item in parameters:
|
||||||
|
try:
|
||||||
|
recordASField = item["recordASField"]
|
||||||
|
except:
|
||||||
|
item["recordASField"] = True
|
||||||
if item["recordASField"]:
|
if item["recordASField"]:
|
||||||
name = item['name']
|
name = item['name']
|
||||||
if item['type'] == 'int':
|
if item['type'] == 'int':
|
||||||
@ -669,6 +673,25 @@ class myMySQL:
|
|||||||
# 关闭游标和连接
|
# 关闭游标和连接
|
||||||
self.cursor.close()
|
self.cursor.close()
|
||||||
|
|
||||||
|
def remove_duplicate_data(self):
    """Remove duplicate rows from ``self.table_name``, keeping the lowest ``_id``.

    Rows are considered duplicates when every column listed in
    ``self.field_sql`` is equal. The surviving rows are copied into a
    scratch table ``<table_name>_temp``, the original table is emptied and
    refilled from the scratch table, and the scratch table is dropped.

    Any database error is re-raised after a rollback attempt; the cursor is
    always closed.
    """
    # NOTE(review): field_sql is assumed to be a parenthesised, comma-separated
    # column list such as "(a, b)" - confirm against where it is built.
    fields = self.field_sql.replace("(", "").replace(")", "")
    if not fields.strip():
        # Without any data column the SELECT/GROUP BY below would be
        # malformed SQL, and there is nothing to compare rows on anyway.
        return

    temp_table = f"{self.table_name}_temp"
    self.cursor = self.conn.cursor()
    try:
        # A scratch table left behind by an earlier aborted run would make
        # CREATE TABLE fail forever, so clear it first.
        self.cursor.execute(f"DROP TABLE IF EXISTS {temp_table};")
        # Keep one representative (smallest _id) per distinct row.
        self.cursor.execute(
            f"CREATE TABLE {temp_table} AS "
            f"SELECT MIN(_id) AS _id, {fields}"
            f" FROM {self.table_name} GROUP BY {fields};"
        )
        # Replace the table contents with the de-duplicated rows.
        self.cursor.execute(f"DELETE FROM {self.table_name};")
        self.cursor.execute(
            f"INSERT INTO {self.table_name} SELECT * FROM {temp_table};"
        )
        self.cursor.execute(f"DROP TABLE {temp_table};")
        # Commit the changes to the database.
        self.conn.commit()
    except Exception:
        # Undo an uncommitted DELETE/INSERT so the table is not left empty.
        # (DDL statements are not covered by the rollback.)
        self.conn.rollback()
        raise
    finally:
        # Close the cursor whether or not de-duplication succeeded.
        self.cursor.close()
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
try:
|
try:
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user