猿人学第8题-验证码图文点选
@ -1,4 +1,4 @@
|
|||||||
# 知识点:图像识别、去除干扰线、降噪、机器学习
|
# 知识点:图像识别、去除干扰线、降噪、机器学习、headers请求顺序
|
||||||
|
|
||||||
## 一、解决点选坐标问题
|
## 一、解决点选坐标问题
|
||||||
|
|
||||||
@ -22,7 +22,7 @@
|
|||||||
|
|
||||||

|

|
||||||
|
|
||||||
由此推断
|
由此推断,各图片的坐标与对应的取值范围如下:
|
||||||
|
|
||||||
第1个图片坐标值可以是(0~9)
|
第1个图片坐标值可以是(0~9)
|
||||||
第2个图片坐标值可以是(10~19)
|
第2个图片坐标值可以是(10~19)
|
||||||
@ -34,6 +34,22 @@
|
|||||||
第8个图片坐标值可以是(610~619)
|
第8个图片坐标值可以是(610~619)
|
||||||
第9个图片坐标值可以是(620~629)
|
第9个图片坐标值可以是(620~629)
|
||||||
|
|
||||||
|
这里可以封装成一个 Python 函数,方便调用:
|
||||||
|
|
||||||
|
def coordinate(num):
    """Return a random coordinate value for captcha image number ``num``.

    Each of the nine images accepts any value from its own inclusive range
    of ten integers: images 1-3 use 0-9, 10-19, 20-29; images 4-6 use
    300-309, 310-319, 320-329; images 7-9 use 600-609, 610-619, 620-629.

    Args:
        num: Image number, 1 through 9.

    Returns:
        A random int inside the range for ``num``, or ``None`` when ``num``
        is not one of the nine known image numbers.
    """
    # Function-scope import so the snippet runs when copied on its own.
    import random

    # Inclusive (low, high) bounds for every image number.
    bounds = {
        1: (0, 9),
        2: (10, 19),
        3: (20, 29),
        4: (300, 309),
        5: (310, 319),
        6: (320, 329),
        7: (600, 609),
        8: (610, 619),
        9: (620, 629),
    }

    picked = bounds.get(num)
    if picked is None:
        return None

    # Draw only for the requested image instead of rolling all nine ranges
    # on every call and discarding eight of the results.
    return random.randint(*picked)
|
||||||
|
|
||||||
## 二、图像去噪点处理
|
## 二、图像去噪点处理
|
||||||
|
|
||||||
### 选出RGB颜色最多2个,进行降噪处理
|
### 选出RGB颜色最多2个,进行降噪处理
|
||||||
@ -145,3 +161,26 @@
|
|||||||

|

|
||||||
|
|
||||||
## 三、图像识别/机器学习
|
## 三、图像识别/机器学习
|
||||||
|
|
||||||
|
测试了百度OCR、ddddocr,还尝试了自己训练模型,效果都不是很好,菜是原罪啊!最后发现科大讯飞OCR对这些生僻字的识别效果还是不错的。
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
效果
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## 四、headers请求顺序
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
'Proxy-Connection': 'keep-alive',
|
||||||
|
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||||
|
'User-Agent': 'yuanrenxue.project',
|
||||||
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
|
'Referer': 'http://match.yuanrenxue.com/match/8',
|
||||||
|
'Accept-Language': 'zh-CN,zh;q=0.9'
|
||||||
|
}
|
||||||
|
SESSION = requests.session()
|
||||||
|
SESSION.headers = HEADERS
|
||||||
|
|
||||||
|
req = SESSION.get(self.url)
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 28 KiB |
Before Width: | Height: | Size: 46 KiB After Width: | Height: | Size: 50 KiB |
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 22 KiB |
Before Width: | Height: | Size: 8.4 KiB After Width: | Height: | Size: 8.6 KiB |
Before Width: | Height: | Size: 5.0 KiB After Width: | Height: | Size: 5.9 KiB |
Before Width: | Height: | Size: 4.6 KiB After Width: | Height: | Size: 3.8 KiB |
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.9 KiB |
Before Width: | Height: | Size: 4.7 KiB After Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 5.1 KiB After Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 4.8 KiB After Width: | Height: | Size: 4.5 KiB |
Before Width: | Height: | Size: 3.8 KiB After Width: | Height: | Size: 5.2 KiB |
Before Width: | Height: | Size: 4.5 KiB After Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 4.7 KiB |
@ -3,6 +3,17 @@ import requests
|
|||||||
import re
|
import re
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
'Proxy-Connection': 'keep-alive',
|
||||||
|
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||||
|
'User-Agent': 'yuanrenxue.project',
|
||||||
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
|
'Referer': 'http://match.yuanrenxue.com/match/8',
|
||||||
|
'Accept-Language': 'zh-CN,zh;q=0.9'
|
||||||
|
}
|
||||||
|
SESSION = requests.session()
|
||||||
|
SESSION.headers = HEADERS
|
||||||
|
|
||||||
|
|
||||||
class YuanrenXuan(object):
|
class YuanrenXuan(object):
|
||||||
|
|
||||||
@ -11,7 +22,7 @@ class YuanrenXuan(object):
|
|||||||
self.sum_value = 0
|
self.sum_value = 0
|
||||||
|
|
||||||
def get_task(self):
|
def get_task(self):
|
||||||
req = requests.get(self.url)
|
req = SESSION.get(self.url)
|
||||||
text = re.findall(r'请依次点击:---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
|
text = re.findall(r'请依次点击:---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
|
||||||
text_list = text.split('</p>---<p>')
|
text_list = text.split('</p>---<p>')
|
||||||
img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
|
img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
|
||||||
@ -22,16 +33,18 @@ class YuanrenXuan(object):
|
|||||||
return text_list
|
return text_list
|
||||||
|
|
||||||
def get_match(self, page, answer):
|
def get_match(self, page, answer):
|
||||||
url = f"https://match.yuanrenxue.cn/api/match/8?page={page}&answer={answer}"
|
url = f"https://match.yuanrenxue.cn/api/match/8"
|
||||||
payload = {}
|
print(answer)
|
||||||
headers = {
|
params = (
|
||||||
'cookie': 'sessionid=iikaj9bo7vzqv4mz1xvryl13o7z98l13;'
|
('page', str(page)),
|
||||||
}
|
('answer', answer)
|
||||||
response = requests.request("GET", url, headers=headers, data=payload)
|
)
|
||||||
print(response.json())
|
response = SESSION.get(url, params=params)
|
||||||
|
return response.json()
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
num = 1
|
num = 1
|
||||||
|
data_num_list = []
|
||||||
while True:
|
while True:
|
||||||
text_list = self.get_task()
|
text_list = self.get_task()
|
||||||
print(text_list)
|
print(text_list)
|
||||||
@ -43,14 +56,14 @@ class YuanrenXuan(object):
|
|||||||
answer = '|'.join(answer_list)
|
answer = '|'.join(answer_list)
|
||||||
if 'None' in answer:
|
if 'None' in answer:
|
||||||
continue
|
continue
|
||||||
print(num, answer)
|
data_list = self.get_match(num, answer).get('data')
|
||||||
self.get_match(num, answer)
|
for data in data_list:
|
||||||
|
data_num_list.append(int(data.get('value')))
|
||||||
num += 1
|
num += 1
|
||||||
|
|
||||||
if num == 5:
|
if num == 5:
|
||||||
break
|
break
|
||||||
|
print(data_num_list)
|
||||||
exit()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -106,9 +106,9 @@ def get_result(universal_ocr, file_path):
|
|||||||
|
|
||||||
|
|
||||||
def run_ocr(file_path):
|
def run_ocr(file_path):
|
||||||
appid = "xxxx"
|
appid = "zzzz"
|
||||||
apisecret = "xxxx"
|
apisecret = "xxxx"
|
||||||
apikey = "xxxxx"
|
apikey = "xxxx"
|
||||||
universal_ocr = UniversalOcr(appid, apikey, apisecret)
|
universal_ocr = UniversalOcr(appid, apikey, apisecret)
|
||||||
res = get_result(universal_ocr, file_path)
|
res = get_result(universal_ocr, file_path)
|
||||||
return res.get('whole_text')
|
return res.get('whole_text')
|
||||||
|