猿人学第8题-验证码图文点选

This commit is contained in:
luzhisheng 2023-06-28 01:58:23 +08:00
parent 63be0d7d30
commit 141cc1c7c8
17 changed files with 69 additions and 17 deletions

View File

@ -1,4 +1,4 @@
# 知识点:图像识别、去除干扰线、降噪、机器学习
# 知识点:图像识别、去除干扰线、降噪、机器学习、headers请求顺序
## 一、解决点选坐标问题
@ -22,7 +22,7 @@
![debugger](./img/3.png)
由此推断
由此推断,坐标和对应的值
第1个图片坐标值可以是0-9
第2个图片坐标值可以是10-19
@ -34,6 +34,22 @@
第8个图片坐标值可以是610-619
第9个图片坐标值可以是620-629
这里就可以写一个python方法作为调用
def coordinate(num):
    """Return a random coordinate value for the picture at grid position ``num``.

    Each of the nine pictures (numbered 1-9) accepts any value inside its own
    ten-wide inclusive range, so one value is drawn uniformly from that range.

    Args:
        num: Picture position, 1 through 9.

    Returns:
        An int inside the range belonging to ``num``, or ``None`` when ``num``
        is not one of the nine known positions.
    """
    # Inclusive (low, high) bounds per picture. Only the bounds are stored so
    # that a call draws exactly one random number instead of nine.
    coordinate_ranges = {
        1: (0, 9),
        2: (10, 19),
        3: (20, 29),
        4: (300, 309),
        5: (310, 319),
        6: (320, 329),
        7: (600, 609),
        8: (610, 619),
        9: (620, 629),
    }
    bounds = coordinate_ranges.get(num)
    if bounds is None:
        # Unknown position: keep the original "missing key" result.
        return None
    low, high = bounds
    return random.randint(low, high)
## 二、图像去噪点处理
### 选出rgb颜色最多的2个进行降噪处理
@ -145,3 +161,26 @@
![debugger](./img/f-2.jpg)
## 三、图像识别/机器学习
测试了百度ocr、ddddocr,还尝试了自己训练模型,效果都不是很好(菜是原罪啊),最后发现科大讯飞ocr对这些生僻字的识别效果还是不错的。
![debugger](./img/6.png)
效果
![debugger](./img/7.png)
## 四、headers请求顺序
HEADERS = {
'Proxy-Connection': 'keep-alive',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'User-Agent': 'yuanrenxue.project',
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'http://match.yuanrenxue.com/match/8',
'Accept-Language': 'zh-CN,zh;q=0.9'
}
SESSION = requests.session()
SESSION.headers = HEADERS
req = SESSION.get(self.url)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.4 KiB

After

Width:  |  Height:  |  Size: 8.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.0 KiB

After

Width:  |  Height:  |  Size: 5.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.6 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.4 KiB

After

Width:  |  Height:  |  Size: 4.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.7 KiB

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.1 KiB

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.2 KiB

After

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.8 KiB

After

Width:  |  Height:  |  Size: 4.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.5 KiB

After

Width:  |  Height:  |  Size: 4.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.9 KiB

After

Width:  |  Height:  |  Size: 4.7 KiB

View File

@ -3,6 +3,17 @@ import requests
import re
import base64
HEADERS = {
'Proxy-Connection': 'keep-alive',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'User-Agent': 'yuanrenxue.project',
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'http://match.yuanrenxue.com/match/8',
'Accept-Language': 'zh-CN,zh;q=0.9'
}
SESSION = requests.session()
SESSION.headers = HEADERS
class YuanrenXuan(object):
@ -11,7 +22,7 @@ class YuanrenXuan(object):
self.sum_value = 0
def get_task(self):
req = requests.get(self.url)
req = SESSION.get(self.url)
text = re.findall(r'请依次点击:---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
text_list = text.split('</p>---<p>')
img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
@ -22,16 +33,18 @@ class YuanrenXuan(object):
return text_list
def get_match(self, page, answer):
url = f"https://match.yuanrenxue.cn/api/match/8?page={page}&answer={answer}"
payload = {}
headers = {
'cookie': 'sessionid=iikaj9bo7vzqv4mz1xvryl13o7z98l13;'
}
response = requests.request("GET", url, headers=headers, data=payload)
print(response.json())
url = f"https://match.yuanrenxue.cn/api/match/8"
print(answer)
params = (
('page', str(page)),
('answer', answer)
)
response = SESSION.get(url, params=params)
return response.json()
def run(self):
num = 1
data_num_list = []
while True:
text_list = self.get_task()
print(text_list)
@ -43,14 +56,14 @@ class YuanrenXuan(object):
answer = '|'.join(answer_list)
if 'None' in answer:
continue
print(num, answer)
self.get_match(num, answer)
data_list = self.get_match(num, answer).get('data')
for data in data_list:
data_num_list.append(int(data.get('value')))
num += 1
if num == 5:
break
exit()
print(data_num_list)
if __name__ == '__main__':

View File

@ -106,9 +106,9 @@ def get_result(universal_ocr, file_path):
def run_ocr(file_path):
appid = "xxxx"
appid = "zzzz"
apisecret = "xxxx"
apikey = "xxxxx"
apikey = "xxxx"
universal_ocr = UniversalOcr(appid, apikey, apisecret)
res = get_result(universal_ocr, file_path)
return res.get('whole_text')