diff --git a/猿人学练习/18验证码图片验证码-计算/README.md b/猿人学练习/18验证码图片验证码-计算/README.md new file mode 100644 index 0000000..2a967be --- /dev/null +++ b/猿人学练习/18验证码图片验证码-计算/README.md @@ -0,0 +1,54 @@ +# 知识点: headers反爬,图片验证码,机器学习 + +## 解题思路 + +学习了3天的机器学习,发现机器学习不是一朝一夕能掌握的。本来打算依葫芦画瓢直接搞定,结果事与愿违:不理解基础知识就不能很好地调试模型。既然这样,那就先用百度ocr走一波。 + +找到图片验证码地址,这里要注意的是此地址也存在headers反爬。 + +![请求](./img/2.png) + +下面就注册百度ocr平台: + + https://ai.baidu.com/ai-doc/OCR/hk3h7y2qq + +技术文档 + +![请求](./img/3.png) + +网页调试工具 + +![请求](./img/4.png) + +最主要的是还有免费额度,大企业果然大气,多搞几个小号岂不是可以白嫖。 + +![请求](./img/5.png) + +ocr平台找好了,开始测试识别成功率,我这边测试了6种识别接口。 + +![请求](./img/7.png) + +发现先对图片进行灰度和二值化处理,再使用手写文字识别接口,成功率最高: + + def processing_image(img_file, standard=200): + """ 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """ + img = Image.open(img_file) + + # 灰度转换 + _image = img.convert('L') + + # 二值化: 根据阈值 standard, 将所有像素都置为 0(黑色) 或 255(白色), 便于接下来的分割 + pixels = _image.load() + for x in range(_image.width): + for y in range(_image.height): + if pixels[x, y] > standard: + pixels[x, y] = 255 + else: + pixels[x, y] = 0 + return _image + +成功通过`18验证码图片验证码-计算`题的测试结果如下: + +![请求](./img/6.png) + +100个数据地址,总共调用了311次百度ocr接口(包含测试期间用掉的78次),图片识别成功率约为42%,百度技术已经很不错了。感觉如果自己实现一个模型并且针对此验证码单独优化,效果应该会更好。 diff --git a/猿人学练习/18验证码图片验证码-计算/img/1-test.png b/猿人学练习/18验证码图片验证码-计算/img/1-test.png new file mode 100644 index 0000000..d066d68 Binary files /dev/null and b/猿人学练习/18验证码图片验证码-计算/img/1-test.png differ diff --git a/猿人学练习/18验证码图片验证码-计算/img/1.png b/猿人学练习/18验证码图片验证码-计算/img/1.png new file mode 100644 index 0000000..4b40c9e Binary files /dev/null and b/猿人学练习/18验证码图片验证码-计算/img/1.png differ diff --git a/猿人学练习/18验证码图片验证码-计算/img/2.png b/猿人学练习/18验证码图片验证码-计算/img/2.png new file mode 100644 index 0000000..c72737f Binary files /dev/null and b/猿人学练习/18验证码图片验证码-计算/img/2.png differ diff --git a/猿人学练习/18验证码图片验证码-计算/img/3.png b/猿人学练习/18验证码图片验证码-计算/img/3.png new file mode 100644 index 0000000..dc5c161 Binary files /dev/null and b/猿人学练习/18验证码图片验证码-计算/img/3.png differ diff --git a/猿人学练习/18验证码图片验证码-计算/img/4.png 
def _evaluate_expression(words):
    """Evaluate the first ``<int> <operator> <int>`` expression found in *words*.

    Recognised operators are ``+``/``＋``/``加`` (addition), ``-``/``－``/``减``
    (subtraction) and ``*``/``×``/``x``/``X``/``乘`` (multiplication).
    Whitespace around the operator and any text before or after the
    expression (for example a trailing ``=?``) is ignored.

    :param words: text recognised by the OCR service
    :return: the integer result, or ``''`` when no expression is found
    """
    # Function-scope imports keep this block self-contained without touching
    # the file's import section.
    import operator
    import re

    operations = {
        '+': operator.add, '＋': operator.add, '加': operator.add,
        '-': operator.sub, '－': operator.sub, '减': operator.sub,
        '*': operator.mul, '×': operator.mul, 'x': operator.mul,
        'X': operator.mul, '乘': operator.mul,
    }
    # Escape every symbol so '-' and '*' are literals inside the character class.
    symbols = ''.join(re.escape(symbol) for symbol in operations)
    match = re.search(rf'(\d+)\s*([{symbols}])\s*(\d+)', words)
    if match is None:
        return ''
    left, symbol, right = match.groups()
    return operations[symbol](int(left), int(right))


def code_value():
    """Download the challenge-18 captcha, OCR it and compute its arithmetic answer.

    The captcha image is fetched with browser-like headers and written to
    ``img/1.png``; ``百度手写文字识别.run()`` is then called and the ``words``
    entries of its ``words_result`` list are concatenated and evaluated.

    :return: the integer answer, or ``''`` when the download, the OCR call or
        the parsing of the recognised text fails
    """
    url = "https://www.python-spider.com/api/challenge18/verify"
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
                  '*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'cookie': 'sessionid=你的sessionid;',
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/112.0.0.0 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.request("GET", url, headers=headers, timeout=10)
        # Do not spend an OCR call on an HTTP error page saved as an "image".
        response.raise_for_status()
        with open('img/1.png', 'wb') as f:
            f.write(response.content)

        response_json = 百度手写文字识别.run()
        # 'words_result' may be missing from the OCR response; treat that as
        # "nothing recognised" instead of iterating over None.
        words_result = response_json.get('words_result') or []
        words = ''.join(item.get('words', '') for item in words_result)

        code = _evaluate_expression(words)
        print(f"识别的文字是{words}-计算结果是{code}")
        return code
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported and turned
        # into an empty code so the caller can simply request a new captcha.
        print(e)
        return ''
def get_file_content_as_base64(path, urlencoded=False):
    """
    Return the content of a file encoded as base64 text.

    :param path: path of the file to read (opened in binary mode)
    :param urlencoded: when true, additionally percent-encode the base64 text
        with ``urllib.parse.quote_plus`` so it can be placed in a form body
    :return: the base64 text, percent-encoded if requested
    """
    # Import the function explicitly: a bare ``import urllib`` does not load
    # the ``urllib.parse`` submodule, so ``urllib.parse.quote_plus`` only works
    # when some other import happens to have loaded it already.
    from urllib.parse import quote_plus

    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("ascii")
    return quote_plus(content) if urlencoded else content
def run():
    """Pre-process the saved captcha and send it to Baidu's handwriting OCR endpoint.

    Reads ``./img/1.png``, passes it through ``processing_image``, saves the
    result as ``./img/1-test.png`` and posts that file, base64-encoded, as the
    ``image`` form field. The decoded response is printed and returned.

    :return: the decoded JSON body of the OCR response
    """
    image_b = processing_image('./img/1.png')
    image_b.save('./img/1-test.png')
    image_b64 = get_file_content_as_base64('./img/1-test.png')

    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting?access_token=" + get_access_token()
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # Let requests form-encode the field. The previous hand-built payload used
    # quote(value, 'utf-8'), where 'utf-8' is the *safe* argument rather than
    # an encoding; passing a dict yields the same bytes without that trap.
    response = requests.request(
        "POST", url, headers=headers, data={'image': image_b64}, timeout=10
    )
    # Decode the body once and reuse it for both the log line and the result.
    result = response.json()
    print(result)
    return result