猿人学第8题-验证码图文点选
@ -1,4 +1,4 @@
|
||||
# 知识点:图像识别、去除干扰线、降噪、机器学习
|
||||
# 知识点:图像识别、去除干扰线、降噪、机器学习、headers请求顺序
|
||||
|
||||
## 一、解决点选坐标问题
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
|
||||

|
||||
|
||||
由此推断
|
||||
由此推断,坐标和对应的值
|
||||
|
||||
第1个图片坐标值可以是(0~9)
|
||||
第2个图片坐标值可以是(10~19)
|
||||
@ -34,6 +34,22 @@
|
||||
第8个图片坐标值可以是(610~619)
|
||||
第9个图片坐标值可以是(620~629)
|
||||
|
||||
这里就可以写一个Python方法,根据图片序号返回对应的坐标值,供后续调用
|
||||
|
||||
def coordinate(num):
    """Return a random coordinate value for image number *num*.

    Each of the nine images owns ten consecutive values: images 1-3 start
    at 0, 10 and 20, images 4-6 at 300, 310 and 320, and images 7-9 at
    600, 610 and 620.

    Args:
        num: Image number, 1 through 9.

    Returns:
        A random int within the image's ten-value range, or None when
        *num* is not one of the nine known image numbers.
    """
    range_starts = {
        1: 0,
        2: 10,
        3: 20,
        4: 300,
        5: 310,
        6: 320,
        7: 600,
        8: 610,
        9: 620,
    }
    start = range_starts.get(num)
    if start is None:
        return None
    # Draw a single value for the requested image only, instead of
    # generating one for every image and discarding eight of them.
    return random.randint(start, start + 9)
|
||||
|
||||
## 二、图像去噪点处理
|
||||
|
||||
### 选出出现最多的2种RGB颜色,进行降噪处理
|
||||
@ -144,4 +160,27 @@
|
||||
|
||||

|
||||
|
||||
## 三、图像识别/机器学习
|
||||
## 三、图像识别/机器学习
|
||||
|
||||
测试了百度ocr、ddddocr,还尝试了自己训练模型,效果都不是很好,菜是原罪啊!最后发现科大讯飞ocr对这些生僻字的识别效果还是不错的。
|
||||
|
||||

|
||||
|
||||
效果
|
||||
|
||||

|
||||
|
||||
## headers请求顺序
|
||||
|
||||
HEADERS = {
|
||||
'Proxy-Connection': 'keep-alive',
|
||||
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||
'User-Agent': 'yuanrenxue.project',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Referer': 'http://match.yuanrenxue.com/match/8',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9'
|
||||
}
|
||||
SESSION = requests.session()
|
||||
SESSION.headers = HEADERS
|
||||
|
||||
req = SESSION.get(self.url)
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 28 KiB |
Before Width: | Height: | Size: 46 KiB After Width: | Height: | Size: 50 KiB |
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 22 KiB |
Before Width: | Height: | Size: 8.4 KiB After Width: | Height: | Size: 8.6 KiB |
Before Width: | Height: | Size: 5.0 KiB After Width: | Height: | Size: 5.9 KiB |
Before Width: | Height: | Size: 4.6 KiB After Width: | Height: | Size: 3.8 KiB |
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.9 KiB |
Before Width: | Height: | Size: 4.7 KiB After Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 5.1 KiB After Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 4.8 KiB After Width: | Height: | Size: 4.5 KiB |
Before Width: | Height: | Size: 3.8 KiB After Width: | Height: | Size: 5.2 KiB |
Before Width: | Height: | Size: 4.5 KiB After Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 4.7 KiB |
@ -3,6 +3,17 @@ import requests
|
||||
import re
|
||||
import base64
|
||||
|
||||
# Request headers applied to every call made through SESSION.
# NOTE(review): the README lists "headers请求顺序" (header order) as a key
# point of this challenge, so the key order below is presumably
# significant; confirm before reordering or adding entries.
HEADERS = {
|
||||
'Proxy-Connection': 'keep-alive',
|
||||
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||
'User-Agent': 'yuanrenxue.project',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Referer': 'http://match.yuanrenxue.com/match/8',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9'
|
||||
}
|
||||
# Single session object created once at import time.
SESSION = requests.session()
|
||||
# Plain assignment: the session's header mapping is replaced by HEADERS
# rather than updated with it.
SESSION.headers = HEADERS
|
||||
|
||||
|
||||
class YuanrenXuan(object):
|
||||
|
||||
@ -11,7 +22,7 @@ class YuanrenXuan(object):
|
||||
self.sum_value = 0
|
||||
|
||||
def get_task(self):
|
||||
req = requests.get(self.url)
|
||||
req = SESSION.get(self.url)
|
||||
text = re.findall(r'请依次点击:---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
|
||||
text_list = text.split('</p>---<p>')
|
||||
img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
|
||||
@ -22,16 +33,18 @@ class YuanrenXuan(object):
|
||||
return text_list
|
||||
|
||||
def get_match(self, page, answer):
    """Submit the answer for one page and return the decoded JSON reply.

    The request goes through the module-level SESSION so it carries the
    shared headers and any cookies the session already holds. No session
    cookie is hard-coded here, and the request is sent exactly once.

    Args:
        page: Page number; sent as the ``page`` query parameter after
            conversion with ``str``.
        answer: Answer string; sent unchanged as the ``answer`` query
            parameter.

    Returns:
        The response body parsed by ``response.json()``.
    """
    url = "https://match.yuanrenxue.cn/api/match/8"
    print(answer)
    # Pass the query as params so the HTTP library encodes it, rather
    # than interpolating raw values into the URL.
    params = (
        ('page', str(page)),
        ('answer', answer),
    )
    response = SESSION.get(url, params=params)
    return response.json()
|
||||
|
||||
def run(self):
|
||||
num = 1
|
||||
data_num_list = []
|
||||
while True:
|
||||
text_list = self.get_task()
|
||||
print(text_list)
|
||||
@ -43,14 +56,14 @@ class YuanrenXuan(object):
|
||||
answer = '|'.join(answer_list)
|
||||
if 'None' in answer:
|
||||
continue
|
||||
print(num, answer)
|
||||
self.get_match(num, answer)
|
||||
data_list = self.get_match(num, answer).get('data')
|
||||
for data in data_list:
|
||||
data_num_list.append(int(data.get('value')))
|
||||
num += 1
|
||||
|
||||
if num == 5:
|
||||
break
|
||||
|
||||
exit()
|
||||
print(data_num_list)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -106,9 +106,9 @@ def get_result(universal_ocr, file_path):
|
||||
|
||||
|
||||
def run_ocr(file_path):
    """Run OCR on one file and return the recognised text.

    Builds a UniversalOcr client from the credentials below, passes it
    with *file_path* to get_result, and reads the 'whole_text' entry of
    what comes back.

    Args:
        file_path: Path of the file to recognise; forwarded unchanged to
            get_result.

    Returns:
        The value stored under 'whole_text' in the result returned by
        get_result.
    """
    # NOTE(review): these look like placeholder credentials; presumably
    # they must be replaced with real values before use.
    appid = "zzzz"
    apisecret = "xxxx"
    apikey = "xxxx"
    universal_ocr = UniversalOcr(appid, apikey, apisecret)
    res = get_result(universal_ocr, file_path)
    return res.get('whole_text')
|
||||
|