mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-21 09:35:14 +08:00
继续上次的任务
This commit is contained in:
parent
f7496626a8
commit
f29d35dd95
@ -613,8 +613,13 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
|
|||||||
<option value=0>Desktop</option>
|
<option value=0>Desktop</option>
|
||||||
<option value=1>Mobile (Not supported under Cloudflare mode)</option>
|
<option value=1>Mobile (Not supported under Cloudflare mode)</option>
|
||||||
</select>
|
</select>
|
||||||
<label>Save Data Every N Rows (The larger the value, the faster the scraping speed, but there is a risk of data loss if unexpectedly exited):</label>
|
<label>Save Data Every N Rows (Specify N below, the larger the value, the faster the scraping speed, but there is a risk of data loss if unexpectedly exited):</label>
|
||||||
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
||||||
|
<label>Do you want to resume execution from the last saved position when unexpectedly exiting and restarting the task (The record interval of the number of collected items is the value set above)?</label>
|
||||||
|
<select id="startFromExit" name="startFromExit" class="form-control">
|
||||||
|
<option value="0">No</option>
|
||||||
|
<option value="1">Yes (Requires running the same task ID and the same file name, please execute from the command line and specify the ID)</option>
|
||||||
|
</select>
|
||||||
<label>Maximum Display Length of Data in Console Preview:</label>
|
<label>Maximum Display Length of Data in Console Preview:</label>
|
||||||
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>
|
||||||
|
|
||||||
|
@ -623,6 +623,11 @@
|
|||||||
</select>
|
</select>
|
||||||
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
|
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
|
||||||
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
|
||||||
|
<label>是否要意外退出并重新执行任务时从上次保存的位置继续执行(已采集条数记录间隔为上面设置的值):</label>
|
||||||
|
<select id="startFromExit" name="startFromExit" class="form-control">
|
||||||
|
<option value = 0>否</option>
|
||||||
|
<option value = 1>是(需要运行同一个任务ID和固定的文件名,请用命令行执行并指定ID)</option>
|
||||||
|
</select>
|
||||||
<label>控制台预览时数据最大显示长度:</label>
|
<label>控制台预览时数据最大显示长度:</label>
|
||||||
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>
|
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>
|
||||||
|
|
||||||
|
@ -437,6 +437,7 @@ function saveService(type) {
|
|||||||
"outputFormat": $("#outputFormat").val(),
|
"outputFormat": $("#outputFormat").val(),
|
||||||
"saveName": $("#saveName").val(),
|
"saveName": $("#saveName").val(),
|
||||||
"inputExcel": $("#inputExcel").val(),
|
"inputExcel": $("#inputExcel").val(),
|
||||||
|
"startFromExit": parseInt($("#startFromExit").val()),
|
||||||
"containJudge": containJudge,
|
"containJudge": containJudge,
|
||||||
"desc": serviceDescription,
|
"desc": serviceDescription,
|
||||||
"inputParameters": inputParameters,
|
"inputParameters": inputParameters,
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
2
ExecuteStage/.vscode/launch.json
vendored
2
ExecuteStage/.vscode/launch.json
vendored
@ -12,7 +12,7 @@
|
|||||||
"justMyCode": false,
|
"justMyCode": false,
|
||||||
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
|
||||||
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
|
||||||
"args": ["--id", "[49]", "--headless", "0", "--user_data", "1"]
|
"args": ["--id", "[58]", "--headless", "0", "--user_data", "1"]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
@ -54,6 +54,7 @@ class BrowserThread(Thread):
|
|||||||
Thread.__init__(self)
|
Thread.__init__(self)
|
||||||
self.browser = browser_t
|
self.browser = browser_t
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.totalSteps = 0
|
||||||
self.id = id
|
self.id = id
|
||||||
self.event = event
|
self.event = event
|
||||||
try:
|
try:
|
||||||
@ -79,7 +80,21 @@ class BrowserThread(Thread):
|
|||||||
os.mkdir("Data/Task_" + str(i))
|
os.mkdir("Data/Task_" + str(i))
|
||||||
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName):
|
if not os.path.exists("Data/Task_" + str(i) + "/" + self.saveName):
|
||||||
os.mkdir("Data/Task_" + str(i) + "/" + self.saveName) # 创建保存文件夹用来保存截图
|
os.mkdir("Data/Task_" + str(i) + "/" + self.saveName) # 创建保存文件夹用来保存截图
|
||||||
|
self.getDataStep = 0
|
||||||
|
self.startSteps = 0
|
||||||
|
try:
|
||||||
|
startFromExit = service["startFromExit"] # 从上次退出的步骤开始
|
||||||
|
if startFromExit == 1:
|
||||||
|
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'r', encoding='utf-8-sig') as file_obj:
|
||||||
|
self.startSteps = int(file_obj.read()) # 读取已执行步数
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
if self.startSteps != 0:
|
||||||
|
print("此模式下,任务ID", self.id, "将从上次退出的步骤开始执行,之前已采集条数为", self.startSteps, "条。")
|
||||||
|
print("In this mode, task ID", self.id, "will start from the last step, before we already collected", self.startSteps, " items.")
|
||||||
|
else:
|
||||||
|
print("此模式下,任务ID", self.id, "将从头开始执行,如果需要从上次退出的步骤开始执行,请在保存任务时设置是否从上次保存位置开始执行为“是”。")
|
||||||
|
print("In this mode, task ID", self.id, "will start from the beginning, if you want to start from the last step, please set the option 'start from the last step' to 'yes' when saving the task.")
|
||||||
stealth_path = driver_path[:driver_path.find(
|
stealth_path = driver_path[:driver_path.find(
|
||||||
"chromedriver")] + "stealth.min.js"
|
"chromedriver")] + "stealth.min.js"
|
||||||
with open(stealth_path, 'r') as f:
|
with open(stealth_path, 'r') as f:
|
||||||
@ -267,6 +282,10 @@ class BrowserThread(Thread):
|
|||||||
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
|
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_log.txt', 'a', encoding='utf-8-sig') as file_obj:
|
||||||
file_obj.write(self.log)
|
file_obj.write(self.log)
|
||||||
file_obj.close()
|
file_obj.close()
|
||||||
|
# 写入已执行步数
|
||||||
|
with open("Data/Task_" + str(self.id) + "/" + self.saveName + '_steps.txt', 'w', encoding='utf-8-sig') as file_obj:
|
||||||
|
file_obj.write(str(self.totalSteps + 1))
|
||||||
|
file_obj.close()
|
||||||
# 写入数据
|
# 写入数据
|
||||||
if self.outputFormat == "csv" or self.outputFormat == "txt":
|
if self.outputFormat == "csv" or self.outputFormat == "txt":
|
||||||
file_name = "Data/Task_" + \
|
file_name = "Data/Task_" + \
|
||||||
@ -536,10 +555,17 @@ class BrowserThread(Thread):
|
|||||||
self.recordLog("Click")
|
self.recordLog("Click")
|
||||||
self.clickElement(node["parameters"], loopValue, loopPath, index)
|
self.clickElement(node["parameters"], loopValue, loopPath, index)
|
||||||
elif node["option"] == 3: # 提取数据
|
elif node["option"] == 3: # 提取数据
|
||||||
|
# 针对提取数据操作,设置操作开始的步骤,用于不小心关闭后的恢复的增量采集
|
||||||
|
if self.totalSteps >= self.startSteps:
|
||||||
self.recordLog("getData")
|
self.recordLog("getData")
|
||||||
self.getData(node["parameters"], loopValue, node["isInLoop"],
|
self.getData(node["parameters"], loopValue, node["isInLoop"],
|
||||||
parentPath=loopPath, index=index)
|
parentPath=loopPath, index=index)
|
||||||
self.saveData()
|
self.saveData()
|
||||||
|
else:
|
||||||
|
# self.getDataStep += 1
|
||||||
|
print("跳过第" + str(self.totalSteps) + "次提取数据。")
|
||||||
|
print("Skip the " + str(self.totalSteps) + "th data extraction.")
|
||||||
|
self.totalSteps += 1 # 总步数加一
|
||||||
elif node["option"] == 4: # 输入文字
|
elif node["option"] == 4: # 输入文字
|
||||||
self.inputInfo(node["parameters"], loopValue)
|
self.inputInfo(node["parameters"], loopValue)
|
||||||
elif node["option"] == 5: # 自定义操作
|
elif node["option"] == 5: # 自定义操作
|
||||||
|
Loading…
x
Reference in New Issue
Block a user