18验证码图片验证码-计算
54
猿人学练习/18验证码图片验证码-计算/README.md
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# 知识点: headers反爬,图片验证码,机器学习
|
||||||
|
|
||||||
|
## 解题思路
|
||||||
|
|
||||||
|
学习了3天的机器学习,发现机器学习不是一朝一夕能解决的,本来打算依葫芦画瓢直接搞定,事与愿违,不理解基础知识就不能很好调试模型,既然这样那就先用百度ocr走一波。
|
||||||
|
|
||||||
|
找到图片验证码地址,这里要注意的是此地址也存在headers反爬
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
下面就注册百度ocr平台,
|
||||||
|
|
||||||
|
https://ai.baidu.com/ai-doc/OCR/hk3h7y2qq
|
||||||
|
|
||||||
|
技术文档
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
网页调试工具
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
最主要的是还有免费额度,大企业果然大气;多注册几个小号,岂不是可以一直白嫖。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
ocr平台找好了,开始测试识别成功率,我这边测试了6种识别接口
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
发现先对图片做灰度化和二值化处理,再调用手写文字识别接口,成功率最高:
|
||||||
|
|
||||||
|
def processing_image(img_file, standard=200):
    """Convert an image to a pure black-and-white version for OCR.

    The image is converted to 8-bit grayscale and then binarised: every
    pixel brighter than ``standard`` becomes 255 (white) and every other
    pixel becomes 0 (black). The intent is to drop the background colour
    and increase the contrast of the text.

    :param img_file: path (or file object) accepted by ``Image.open``
    :param standard: brightness threshold used for the binarisation
    :return: the binarised image in mode ``L``
    """
    # convert() produces a fully loaded copy, so the source can be closed.
    with Image.open(img_file) as img:
        gray = img.convert('L')

    # point() builds a 256-entry lookup table once instead of visiting
    # every pixel from Python.
    return gray.point(lambda level: 255 if level > standard else 0)
|
||||||
|
|
||||||
|
成功通过`18验证码图片验证码-计算`题的测试结果如下:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
100个数据地址,总共调用了311次百度ocr接口(包含测试期间用掉的78次),图片识别成功率在42%,百度技术已经很不错了。感觉如果自己实现一个模型并且针对此验证码单独优化,效果应该会更好。
|
BIN
猿人学练习/18验证码图片验证码-计算/img/1-test.png
Normal file
After Width: | Height: | Size: 779 B |
BIN
猿人学练习/18验证码图片验证码-计算/img/1.png
Normal file
After Width: | Height: | Size: 4.3 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/2.png
Normal file
After Width: | Height: | Size: 48 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/3.png
Normal file
After Width: | Height: | Size: 108 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/4.png
Normal file
After Width: | Height: | Size: 106 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/5.png
Normal file
After Width: | Height: | Size: 90 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/6.png
Normal file
After Width: | Height: | Size: 45 KiB |
BIN
猿人学练习/18验证码图片验证码-计算/img/7.png
Normal file
After Width: | Height: | Size: 57 KiB |
Before Width: | Height: | Size: 2.9 KiB |
@ -1,45 +1,88 @@
|
|||||||
|
import 百度手写文字识别
|
||||||
import requests
|
import requests
|
||||||
import ddddocr
|
|
||||||
|
|
||||||
|
|
||||||
def _evaluate_expression(words):
    """Evaluate a recognised two-operand expression such as ``3加5`` or ``7-2``.

    Operators are probed in a fixed priority order; the text is split on the
    first operator found and the first two pieces are converted with ``int``.

    :param words: text recognised by the OCR service
    :return: the integer result, or ``''`` when no known operator is present
    :raises ValueError: if one of the operands is not a valid integer
    """
    operations = (
        ('减', lambda left, right: left - right),
        ('-', lambda left, right: left - right),
        ('+', lambda left, right: left + right),
        ('加', lambda left, right: left + right),
        ('*', lambda left, right: left * right),
        ('乘', lambda left, right: left * right),
    )
    for symbol, apply in operations:
        if symbol in words:
            # split() yields at least two pieces because symbol is present.
            left, right = words.split(symbol)[:2]
            return apply(int(left), int(right))
    return ''


def code_value():
    """Download the captcha image, OCR it and return the computed answer.

    The image is fetched with browser-like headers, written to
    ``img/1.png`` and recognised through ``百度手写文字识别.run()``. The
    recognised text is expected to be a simple arithmetic expression.

    :return: the integer result of the expression, or ``''`` when the
        download, the OCR call or the parsing fails
    """
    url = "https://www.python-spider.com/api/challenge18/verify"
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
                  '*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'cookie': 'sessionid=你的sessionid;',
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/112.0.0.0 Safari/537.36'
    }
    # Deliberately broad: any failure means "no answer", and the caller
    # simply retries with a fresh captcha.
    try:
        response = requests.request("GET", url, headers=headers)
        # Do not spend an OCR call on an error page.
        response.raise_for_status()
        with open('img/1.png', 'wb') as f:
            f.write(response.content)

        response_json = 百度手写文字识别.run()
        # A response without 'words_result' is treated as "nothing recognised".
        words_result = response_json.get('words_result') or []
        words = ''.join(item.get('words', '') for item in words_result)

        code = _evaluate_expression(words)
        print(f"识别的文字是{words}-计算结果是{code}")
        return code
    except Exception as e:
        print(e)
        return ''
|
||||||
|
|
||||||
|
|
||||||
def challenge18(page):
    """Request one page of challenge-18 data.

    Page 1 is requested with an empty ``code``; every other page first
    obtains a captcha answer from ``code_value``.

    :param page: page number to request
    :return: the decoded JSON body of the response
    """
    captcha_answer = '' if page == 1 else code_value()

    target = "https://www.python-spider.com/api/challenge18"
    form_body = f"page={page}&code={captcha_answer}"
    request_headers = {
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'cookie': 'sessionid=你的sessionid;'
    }
    reply = requests.request("POST", target, headers=request_headers, data=form_body)
    return reply.json()
|
||||||
|
|
||||||
|
|
||||||
def run():
    """Sum the ``value`` field of every record on pages 1 to 100.

    A page answered with ``verify_failed`` is requested again until it is
    accepted. The running total is printed after each accepted page.
    """
    total = 0
    page = 1
    while page < 101:
        result = challenge18(page)

        # Same page again: the captcha answer was rejected.
        if result.get('message') == 'verify_failed':
            print(f"验证码没有通过{result}-{page}")
            continue
        print(f"验证码通过{result}-{page}")

        total += sum(int(record.get('value')) for record in result.get('data'))
        print(total)
        page += 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
71
猿人学练习/18验证码图片验证码-计算/百度手写文字识别.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
from PIL import Image
|
||||||
|
from urllib.parse import quote
|
||||||
|
import base64
|
||||||
|
import urllib
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# Credentials sent as client_id / client_secret when requesting an access
# token. The values are placeholders ("your API_KEY" / "your SECRET_KEY").
API_KEY = "你的API_KEY"
SECRET_KEY = "你的SECRET_KEY"
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_content_as_base64(path, urlencoded=False):
    """Return the content of a file as base64 text.

    :param path: path of the file to read
    :param urlencoded: when true, additionally URL-encode the base64 text
    :return: the base64 string, URL-encoded if requested
    """
    with open(path, "rb") as source:
        raw_bytes = source.read()

    encoded = base64.b64encode(raw_bytes).decode("utf8")
    return urllib.parse.quote_plus(encoded) if urlencoded else encoded
|
||||||
|
|
||||||
|
|
||||||
|
def get_access_token():
    """Exchange the API key and secret key for an access token.

    :return: the ``access_token`` field of the response converted with
        ``str``; when the field is absent this is the string ``"None"``
    """
    token_endpoint = "https://aip.baidubce.com/oauth/2.0/token"
    credentials = {
        "grant_type": "client_credentials",
        "client_id": API_KEY,
        "client_secret": SECRET_KEY,
    }
    reply = requests.post(token_endpoint, params=credentials)
    token = reply.json().get("access_token")
    return str(token)
|
||||||
|
|
||||||
|
|
||||||
|
def processing_image(img_file, standard=200):
    """Convert an image to a pure black-and-white version for OCR.

    The image is converted to 8-bit grayscale and then binarised: every
    pixel brighter than ``standard`` becomes 255 (white) and every other
    pixel becomes 0 (black). The intent is to drop the background colour
    and increase the contrast of the text.

    :param img_file: path (or file object) accepted by ``Image.open``
    :param standard: brightness threshold used for the binarisation
    :return: the binarised image in mode ``L``
    """
    # convert() produces a fully loaded copy, so the source can be closed.
    with Image.open(img_file) as img:
        gray = img.convert('L')

    # point() builds a 256-entry lookup table once instead of visiting
    # every pixel from Python.
    return gray.point(lambda level: 255 if level > standard else 0)
|
||||||
|
|
||||||
|
|
||||||
|
def run():
    """Binarise ``./img/1.png`` and send it to the handwriting OCR endpoint.

    The processed image is saved as ``./img/1-test.png``, base64-encoded,
    URL-encoded and posted as the ``image`` form field.

    :return: the decoded JSON body of the OCR response
    """
    processed = processing_image('./img/1.png')
    processed.save('./img/1-test.png')

    # The form body is assembled by hand, so the base64 text has to be
    # URL-encoded here ('+' would otherwise be decoded as a space).
    image_field = get_file_content_as_base64('./img/1-test.png', urlencoded=True)

    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting?access_token=" + get_access_token()
    payload = f'image={image_field}'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)

    # Parse the body once and reuse it for both the log line and the result.
    result = response.json()
    print(result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# Running this module directly performs one OCR request via run().
if __name__ == '__main__':
    run()
|