17验证码图片验证码-识别

2025-04-20 03:59:57 +08:00 · 2023-04-29 00:13:09 +08:00 · 2023-04-29 00:13:09 +08:00 · 2876e0582e
commit 2876e0582e
parent 58cc3e96d6
4 changed files with 189 additions and 0 deletions
--- a/猿人学练习/17验证码图片验证码-识别/README.md
+++ b/猿人学练习/17验证码图片验证码-识别/README.md
@ -0,0 +1,34 @@
+# 知识点： headers反爬，图片验证码，机器学习，图片验证码去噪点
+
+## 解题思路
+
+先来看一下图片验证码。为了提高识别的准确度，需要先去掉噪点；用 Windows 的画图工具打开图片：
+
+![请求](./img/1.png)
+
+你会发现大部分噪点的r, g, b值都是0，0，0，也就是黑色
+
+![请求](./img/3.png)
+
+这就好处理了，将黑色部分全部变成白色
+
+    # 将部分像素值变为纯白色
+    if r == 0 and g == 0 and b == 0:
+        img.putpixel((i, j), (255, 255, 255))
+
+再将图片转换为灰度图并二值化
+
+    img = img.convert('L')
+    pixels = img.load()
+    for x in range(img.width):
+        for y in range(img.height):
+            if pixels[x, y] > standard:
+                pixels[x, y] = 255
+            else:
+                pixels[x, y] = 0
+
+看一下效果图
+
+![请求](./img/1-test.png)
+
+接下来就和 18 题一样，利用百度 OCR 进行文字识别
--- a/猿人学练习/17验证码图片验证码-识别/main.py
+++ b/猿人学练习/17验证码图片验证码-识别/main.py
@ -0,0 +1,74 @@
+import 百度手写文字识别
+import 去除干扰线
+import requests
+from urllib.parse import quote
+
+
+def code_value():
+    url = "https://www.python-spider.com/api/challenge17/verify"
+    payload = {}
+    headers = {
+        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
+                  '*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'cookie': 'sessionid=你的sessionid;',
+        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+                      ' Chrome/112.0.0.0 Safari/537.36'
+    }
+    try:
+        response = requests.request("GET", url, headers=headers, data=payload)
+        with open('img/1.png', 'wb') as f:
+            f.write(response.content)
+        去除干扰线.run()
+        response_json = 百度手写文字识别.run()
+        words_result = response_json.get('words_result')
+        words = ''
+        for word in words_result:
+            words += word.get('words')
+        print(f"识别的文字是{words}")
+        return words
+    except Exception as e:
+        print(e)
+        return ''
+
+
+def challenge18(page):
+    if page != 1:
+        code = code_value()
+        code = quote(code, 'utf-8')
+    else:
+        code = ''
+    url = "https://www.python-spider.com/api/challenge17"
+    payload = f"page={page}&code={code}"
+    headers = {
+        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'cookie': 'sessionid=你的sessionid;'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+    return response.json()
+
+
+def run():
+    data_num = 0
+    page = 1
+    while True:
+        res_dict = challenge18(page)
+
+        if res_dict.get('message') == 'verify_failed':
+            print(f"验证码没有通过{res_dict}-{page}")
+            continue
+        else:
+            print(f"验证码通过{res_dict}-{page}")
+
+        data_list = res_dict.get('data')
+        for data in data_list:
+            data_num += int(data.get('value'))
+        print(data_num)
+        page += 1
+
+        if page == 101:
+            break
+
+
+if __name__ == '__main__':
+    run()
--- a/猿人学练习/17验证码图片验证码-识别/去除干扰线.py
+++ b/猿人学练习/17验证码图片验证码-识别/去除干扰线.py
@ -0,0 +1,32 @@
+from PIL import Image
+
+
+def processing_image(img_file, standard=205):
+    """ 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """
+    img = Image.open(img_file)
+    for i in range(img.size[0]):
+        for j in range(img.size[1]):
+            r, g, b = img.getpixel((i, j))
+            # 将部分像素值变为纯白色
+            if r == 0 and g == 0 and b == 0:
+                img.putpixel((i, j), (255, 255, 255))
+
+    # 灰度转换，二值化: 根据阈值 standard, 将所有像素都置为 0(黑色) 或 255(白色), 便于接下来的分割
+    img = img.convert('L')
+    pixels = img.load()
+    for x in range(img.width):
+        for y in range(img.height):
+            if pixels[x, y] > standard:
+                pixels[x, y] = 255
+            else:
+                pixels[x, y] = 0
+    return img
+
+
+def run():
+    image_b = processing_image('./img/1.png')
+    image_b.save('./img/1-test.png')
+
+
+if __name__ == '__main__':
+    run()
--- a/猿人学练习/17验证码图片验证码-识别/百度手写文字识别.py
+++ b/猿人学练习/17验证码图片验证码-识别/百度手写文字识别.py
@ -0,0 +1,49 @@
+from urllib.parse import quote
+import base64
+import urllib
+import requests
+
+# Credentials sent as client_id / client_secret when requesting the Baidu
+# OAuth access token. Left blank here — presumably to be filled in by the user.
+API_KEY = ""
+SECRET_KEY = ""
+
+
+def get_file_content_as_base64(path, urlencoded=False):
+    """
+    获取文件base64编码
+    :param path: 文件路径
+    :param urlencoded: 是否对结果进行urlencoded
+    :return: base64编码信息
+    """
+    with open(path, "rb") as f:
+        content = base64.b64encode(f.read()).decode("utf8")
+        if urlencoded:
+            content = urllib.parse.quote_plus(content)
+    return content
+
+
+def get_access_token():
+    """
+    使用 AK，SK 生成鉴权签名（Access Token）
+    :return: access_token，或是None(如果错误)
+    """
+    url = "https://aip.baidubce.com/oauth/2.0/token"
+    params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
+    return str(requests.post(url, params=params).json().get("access_token"))
+
+
+def run():
+    image_to_base64_res = get_file_content_as_base64('./img/1-test.png')
+    image_to_base64_res = quote(image_to_base64_res, 'utf-8')
+    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting?access_token=" + get_access_token()
+    payload = f'image={image_to_base64_res}'
+    headers = {
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'Accept': 'application/json'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+    print(response.json())
+    return response.json()
+
+
+if __name__ == '__main__':
+    run()