diff --git a/猿人学练习/17验证码图片验证码-识别/README.md b/猿人学练习/17验证码图片验证码-识别/README.md new file mode 100644 index 0000000..907a375 --- /dev/null +++ b/猿人学练习/17验证码图片验证码-识别/README.md @@ -0,0 +1,34 @@ +# 知识点: headers反爬,图片验证码,机器学习,图片验证码去噪点 + +## 解题思路 + +先来看一下图片验证码,为了提高识别的准确度就需要去掉噪点,用 Windows 的画图工具打开 + +![请求](./img/1.png) + +你会发现大部分噪点的r, g, b值都是0,0,0,也就是黑色 + +![请求](./img/3.png) + +这就好处理了,将黑色部分全部变成白色 + + # 将部分像素值变为纯白色 + if r == 0 and g == 0 and b == 0: + img.putpixel((i, j), (255, 255, 255)) + +再将图片灰度转换,二值化 + + img = img.convert('L') + pixels = img.load() + for x in range(img.width): + for y in range(img.height): + if pixels[x, y] > standard: + pixels[x, y] = 255 + else: + pixels[x, y] = 0 + +看一下效果图 + +![请求](./img/1-test.png) + +接下来就和18题一样,利用百度 OCR 进行文字识别 \ No newline at end of file diff --git a/猿人学练习/17验证码图片验证码-识别/main.py b/猿人学练习/17验证码图片验证码-识别/main.py new file mode 100644 index 0000000..45504ec --- /dev/null +++ b/猿人学练习/17验证码图片验证码-识别/main.py @@ -0,0 +1,74 @@ +import 百度手写文字识别 +import 去除干扰线 +import requests +from urllib.parse import quote + + +def code_value(): + url = "https://www.python-spider.com/api/challenge17/verify" + payload = {} + headers = { + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,' + '*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'cookie': 'sessionid=你的sessionid;', + 'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/112.0.0.0 Safari/537.36' + } + try: + response = requests.request("GET", url, headers=headers, data=payload) + with open('img/1.png', 'wb') as f: + f.write(response.content) + 去除干扰线.run() + response_json = 百度手写文字识别.run() + words_result = response_json.get('words_result') + words = '' + for word in words_result: + words += word.get('words') + print(f"识别的文字是{words}") + return words + except Exception as e: + print(e) + return '' + + +def challenge18(page): + if
page != 1: + code = code_value() + code = quote(code, 'utf-8') + else: + code = '' + url = "https://www.python-spider.com/api/challenge17" + payload = f"page={page}&code={code}" + headers = { + 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'cookie': 'sessionid=你的sessionid;' + } + response = requests.request("POST", url, headers=headers, data=payload) + return response.json() + + +def run(): + data_num = 0 + page = 1 + while True: + res_dict = challenge18(page) + + if res_dict.get('message') == 'verify_failed': + print(f"验证码没有通过{res_dict}-{page}") + continue + else: + print(f"验证码通过{res_dict}-{page}") + + data_list = res_dict.get('data') + for data in data_list: + data_num += int(data.get('value')) + print(data_num) + page += 1 + + if page == 101: + break + + +if __name__ == '__main__': + run() diff --git a/猿人学练习/17验证码图片验证码-识别/去除干扰线.py b/猿人学练习/17验证码图片验证码-识别/去除干扰线.py new file mode 100644 index 0000000..8f2fe8f --- /dev/null +++ b/猿人学练习/17验证码图片验证码-识别/去除干扰线.py @@ -0,0 +1,32 @@ +from PIL import Image + + +def processing_image(img_file, standard=205): + """ 1. Denoise the image; binarization removes the background color and strengthens the text contrast """ + img = Image.open(img_file) + for i in range(img.size[0]): + for j in range(img.size[1]): + r, g, b = img.getpixel((i, j)) + # Turn pure-black pixels (r, g and b all 0) pure white + if r == 0 and g == 0 and b == 0: + img.putpixel((i, j), (255, 255, 255)) + + # Grayscale conversion and binarization: using the threshold standard, set every pixel to 0 (black) or 255 (white) to ease the later segmentation + img = img.convert('L') + pixels = img.load() + for x in range(img.width): + for y in range(img.height): + if pixels[x, y] > standard: + pixels[x, y] = 255 + else: + pixels[x, y] = 0 + return img + + +def run(): + image_b = processing_image('./img/1.png') + image_b.save('./img/1-test.png') + + +if __name__ == '__main__': + run() diff --git a/猿人学练习/17验证码图片验证码-识别/百度手写文字识别.py b/猿人学练习/17验证码图片验证码-识别/百度手写文字识别.py new file mode 100644 index 0000000..8c54501 --- /dev/null +++ b/猿人学练习/17验证码图片验证码-识别/百度手写文字识别.py @@ -0,0 +1,49 @@ +from urllib.parse import quote +import base64 +import urllib
import requests

# Credentials sent as client_id / client_secret to the Baidu token endpoint.
API_KEY = ""
SECRET_KEY = ""

# Seconds to wait on each HTTP call so a stalled connection cannot hang the caller.
_REQUEST_TIMEOUT = 10


def get_file_content_as_base64(path, urlencoded=False):
    """
    Read a file and return its content base64-encoded.

    :param path: path of the file to read
    :param urlencoded: whether to additionally URL-encode the base64 text
    :return: base64 text (URL-encoded with ``quote_plus`` when ``urlencoded`` is true)
    """
    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf8")
    if urlencoded:
        content = urllib.parse.quote_plus(content)
    return content


def get_access_token():
    """
    Exchange API_KEY / SECRET_KEY for an access token.

    :return: the access token as a string
    :raises RuntimeError: if the token endpoint's reply carries no ``access_token``
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
    reply = requests.post(url, params=params, timeout=_REQUEST_TIMEOUT).json()
    token = reply.get("access_token")
    if not token:
        # Previously str(None) was returned and spliced into the OCR URL as
        # "access_token=None"; fail here with the endpoint's reply instead.
        raise RuntimeError(f"failed to obtain access token: {reply}")
    return str(token)


def run(image_path='./img/1-test.png'):
    """
    Send an image to the Baidu handwriting OCR endpoint and return its reply.

    :param image_path: path of the image to recognize
    :return: the decoded JSON reply of the OCR endpoint (also printed)
    """
    # quote_plus escapes '+', '/' and '=' so the base64 text survives form decoding.
    # (The earlier quote(text, 'utf-8') passed 'utf-8' as the *safe* argument and
    # produced the same bytes only by accident.)
    image_param = get_file_content_as_base64(image_path, urlencoded=True)
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting?access_token=" + get_access_token()
    payload = f'image={image_param}'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    response = requests.post(url, headers=headers, data=payload, timeout=_REQUEST_TIMEOUT)
    # Decode the body once and reuse it for both the log line and the return value.
    result = response.json()
    print(result)
    return result


if __name__ == '__main__':
    run()