Change OCR engine from Tesseract (pytesseract) to ddddocr

This commit is contained in:
naibo 2023-07-15 07:42:22 +08:00
parent f847ac6efd
commit 1283206518
5 changed files with 53 additions and 36 deletions

View File

@ -0,0 +1 @@
{"id":181,"name":"OCR","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/15/2023, 6:00:09 AM","update_time":"7/15/2023, 6:01:09 AM","version":"0.3.6","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"/手机/数码"},{"id":1,"name":"参数3_图片地址","desc":"","recordASField":1,"exampleValue":"//m.360buyimg.com/babel/jfs/t1/160456/7/37206/196421/649c09faFeab01f59/cc5ea5b81653b3a5.png"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,4],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/di
v[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":4,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"clear":0,"paras":[{"nodeType":0,"contentType":8,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"nih1pc92uudlk34f3og","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}},{"id":3,"index":4,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"clear":0,"paras":[{"nodeType":4,"contentType":8,"relative":false,"name":"参数3_图片地址","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","//img[contains(., 
'')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-7]/div/div/a/img"],"exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/jfs/t1/160456/7/37206/196421/649c09faFeab01f59/cc5ea5b81653b3a5.png"}],"unique_index":"hr2mndh1ty6lk34fgeu","iframe":false,"default":"","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

View File

@ -12,7 +12,7 @@
"justMyCode": false, "justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"] // "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"] // "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[79]", "--headless", "0", "--user_data", "1"] "args": ["--id", "[81]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
} }
] ]
} }

View File

@ -13,8 +13,11 @@ import sys
# import hashlib # import hashlib
import time import time
import requests import requests
from ddddocr import DdddOcr
from urllib.parse import urljoin from urllib.parse import urljoin
from lxml import etree from lxml import etree
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
# import undetected_chromedriver as uc # import undetected_chromedriver as uc
from pynput.keyboard import Key, Listener from pynput.keyboard import Key, Listener
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
@ -38,7 +41,7 @@ from openpyxl import load_workbook, Workbook
import csv import csv
import os import os
from commandline_config import Config from commandline_config import Config
import pytesseract # import pytesseract
from PIL import Image from PIL import Image
# import uuid # import uuid
from threading import Thread, Event from threading import Thread, Event
@ -236,8 +239,8 @@ class BrowserThread(Thread):
except: except:
node["parameters"]["recordASField"] += 1 node["parameters"]["recordASField"] += 1
if para["contentType"] == 8: if para["contentType"] == 8:
print("默认的OCR识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。") print("默认的ddddocr识别功能如果觉得不好用可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行或者可以先设置采集内容类型为“元素截图”把图片保存下来然后用自定义操作调用自己写的程序程序的功能是读取这个最新生成的图片然后用好用的模型如PaddleOCR把图片识别出来然后把返回值返回给程序作为参数输出。")
print("If you think the default OCR function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.") print("If you think the default ddddocr function is not good enough, you can modify the source code get_content function -> contentType == 8 position to your own OCR model and then compile and run it; or you can first set the content type of the crawler to \"Element Screenshot\" to save the picture, and then call your own program with custom operations. The function of the program is to read the latest generated picture, then use a good model, such as PaddleOCR to recognize the picture, and then return the return value as a parameter output to the program.")
if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para["nodeType"] <= 2: if para["beforeJS"] == "" and para["afterJS"] == "" and para["contentType"] <= 1 and para["nodeType"] <= 2:
para["optimizable"] = True para["optimizable"] = True
else: else:
@ -1367,34 +1370,43 @@ class BrowserThread(Thread):
screenshot_stream = io.BytesIO(screenshot) screenshot_stream = io.BytesIO(screenshot)
# 使用Pillow库打开截图并转换为灰度图像 # 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L') image = Image.open(screenshot_stream).convert('L')
temp_name = "OCR_" + str(time.time()) + ".png"
location = "Data/Task_" + str(self.id) + "/" + self.saveName + "/" + temp_name
image.save(location)
ocr = DdddOcr()
with open(location, 'rb') as f:
image_bytes = f.read()
content = ocr.classification(image_bytes)
os.remove(location)
# 使用Tesseract OCR引擎识别图像中的文本 # 使用Tesseract OCR引擎识别图像中的文本
content = pytesseract.image_to_string(image, lang='chi_sim+eng') # content = pytesseract.image_to_string(image, lang='chi_sim+eng')
except Exception as e:
try:
print("识别中文失败,尝试只识别英文")
print("Failed to recognize Chinese, try to recognize English only")
screenshot = element.screenshot_as_png
screenshot_stream = io.BytesIO(screenshot)
# 使用Pillow库打开截图并转换为灰度图像
image = Image.open(screenshot_stream).convert('L')
# 使用Tesseract OCR引擎识别图像中的文本
content = pytesseract.image_to_string(image, lang='eng')
except Exception as e: except Exception as e:
# try:
# print(e)
# print("识别中文失败,尝试只识别英文")
# print("Failed to recognize Chinese, try to recognize English only")
# screenshot = element.screenshot_as_png
# screenshot_stream = io.BytesIO(screenshot)
# # 使用Pillow库打开截图并转换为灰度图像
# image = Image.open(screenshot_stream).convert('L')
# # 使用Tesseract OCR引擎识别图像中的文本
# # content = pytesseract.image_to_string(image, lang='eng')
# except Exception as e:
content = "OCR Error" content = "OCR Error"
print(e) print(e)
if sys.platform == "win32": # if sys.platform == "win32":
print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501") # print("要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501")
print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/") # print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
elif sys.platform == "darwin": # elif sys.platform == "darwin":
print( # print(
"注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/146044810") # "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/146044810")
elif sys.platform == "linux": # elif sys.platform == "linux":
print( # print(
"注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/420259031") # "注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://zhuanlan.zhihu.com/p/420259031")
else: # else:
print("注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501") # print("注意以上错误要使用OCR识别功能你需要安装Tesseract-OCR并将其添加到环境变量PATH中添加后需重启EasySpiderhttps://blog.csdn.net/u010454030/article/details/80515501")
print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/") # print("\nhttps://www.bilibili.com/video/BV1GP411y7u4/")
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html") # print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
elif p["contentType"] == 9: elif p["contentType"] == 9:
content = self.execute_code( content = self.execute_code(
2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"]) 2, p["JS"], p["JSWaitTime"], element, iframe=p["iframe"])
@ -1642,6 +1654,7 @@ if __name__ == '__main__':
"read_type": "remote", "read_type": "remote",
"headless": False, "headless": False,
"server_address": "http://localhost:8074", "server_address": "http://localhost:8074",
"keyboard": True, # 是否监听键盘输入
"version": "0.3.6", "version": "0.3.6",
} }
c = Config(config) c = Config(config)
@ -1850,6 +1863,7 @@ if __name__ == '__main__':
# print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.") # print("Passing the Cloudflare verification mode is sometimes unstable. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入 # 使用监听器监听键盘输入
try: try:
if c.keyboard:
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener: with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:
listener.join() listener.join()
except: except:

View File

@ -1,5 +1,6 @@
rmdir /s /q build rmdir /s /q build
rmdir /s /q dist rmdir /s /q dist
pyinstaller -F --icon=favicon.ico easyspider_executestage.py @REM pyinstaller -F --icon=favicon.ico easyspider_executestage.py
pyinstaller -F --icon=favicon.ico --add-data "C:\Python311\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Python311\Lib\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
del ..\ElectronJS\chrome_win64\easyspider_executestage.exe del ..\ElectronJS\chrome_win64\easyspider_executestage.exe
copy dist\easyspider_executestage.exe ..\ElectronJS\chrome_win64\easyspider_executestage.exe copy dist\easyspider_executestage.exe ..\ElectronJS\chrome_win64\easyspider_executestage.exe

View File

@ -7,5 +7,6 @@ pytesseract==0.3.10
openpyxl==3.1.2 openpyxl==3.1.2
pymysql==1.1.0 pymysql==1.1.0
lxml==4.9.2 lxml==4.9.2
ddddocr==1.0.6
pynput==1.7.6 pynput==1.7.6
undetected-chromedriver==3.4.7 undetected-chromedriver==3.4.7