18验证码图片验证码-计算

2025-04-20 21:55:07 +08:00 · 2023-04-24 03:07:30 +08:00 · 2023-04-24 03:07:30 +08:00 · 1432e1c7c8
commit 1432e1c7c8
parent c53395f97a
12 changed files with 189 additions and 21 deletions
--- a/猿人学练习/18验证码图片验证码-计算/README.md
+++ b/猿人学练习/18验证码图片验证码-计算/README.md
@ -0,0 +1,54 @@
 # 知识点： headers反爬，图片验证码，机器学习
 ## 解题思路
 学习了3天的机器学习，发现机器学习不是一朝一夕能掌握的：本来打算依葫芦画瓢直接搞定，结果事与愿违，不理解基础知识就没法很好地调试模型。既然这样，那就先用百度OCR走一波。
 找到图片验证码地址，这里要注意的是此地址也存在headers反爬
 ![请求](./img/2.png)
 下面就注册百度ocr平台，
    https://ai.baidu.com/ai-doc/OCR/hk3h7y2qq
 技术文档
 ![请求](./img/3.png)
 网页调试工具
 ![请求](./img/4.png)
 最主要的是还有免费额度，大企业果然大气，多搞几个小号岂不是可以白嫖。
 ![请求](./img/5.png)
 ocr平台找好了，开始测试识别成功率，我这边测试了6种识别接口
 ![请求](./img/7.png)
 发现先对图片进行灰度和二值化处理，再调用手写文字识别接口，成功率最高：
    def processing_image(img_file, standard=200):
        """Binarize a captcha image so the text stands out from the background.

        The image is converted to 8-bit grayscale; every pixel brighter than
        ``standard`` becomes 255 (white) and all others become 0 (black).

        :param img_file: path or file object accepted by ``Image.open``
        :param standard: grayscale threshold; pixels strictly above it turn white
        :return: a new mode ``'L'`` image containing only the values 0 and 255
        """
        # Image.open is lazy and keeps the file handle open; the context manager
        # closes it once convert() has copied the pixel data.
        with Image.open(img_file) as img:
            gray = img.convert('L')
        # point() applies the threshold through a lookup table, replacing the
        # per-pixel Python loop.
        return gray.point(lambda value: 255 if value > standard else 0)
 成功通过`18验证码图片验证码-计算`题的测试结果如下：
 ![请求](./img/6.png)
 100个数据地址，总共调用了311次百度OCR接口（包含测试期间用掉的78次），图片识别成功率约为42%，百度的技术已经很不错了。感觉如果自己实现一个模型，并且针对此验证码单独优化，效果应该会更好。
--- a/猿人学练习/18验证码图片验证码-计算/img/1-test.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/1-test.png
--- a/猿人学练习/18验证码图片验证码-计算/img/1.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/1.png
--- a/猿人学练习/18验证码图片验证码-计算/img/2.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/2.png
--- a/猿人学练习/18验证码图片验证码-计算/img/3.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/3.png
--- a/猿人学练习/18验证码图片验证码-计算/img/4.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/4.png
--- a/猿人学练习/18验证码图片验证码-计算/img/5.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/5.png
--- a/猿人学练习/18验证码图片验证码-计算/img/6.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/6.png
--- a/猿人学练习/18验证码图片验证码-计算/img/7.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/7.png
--- a/猿人学练习/18验证码图片验证码-计算/img/img2.png
+++ b/猿人学练习/18验证码图片验证码-计算/img/img2.png
--- a/猿人学练习/18验证码图片验证码-计算/main.py
+++ b/猿人学练习/18验证码图片验证码-计算/main.py
@ -1,45 +1,88 @@
 import 百度手写文字识别
 import requests
 import ddddocr
 def code_value():
-    url = "https://www.python-spider.com/api/challenge18/verify?"
+    url = "https://www.python-spider.com/api/challenge18/verify"
-    response = requests.request("GET", url)
+    payload = {}
-    ocr = ddddocr.DdddOcr(beta=True)
+    headers = {
-    with open('./img/img2.png', 'wb') as f:
+        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
                  '*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'cookie': 'sessionid=你的sessionid;',
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/112.0.0.0 Safari/537.36'
    }
    try:
        response = requests.request("GET", url, headers=headers, data=payload)
        with open('img/1.png', 'wb') as f:
            f.write(response.content)
-
+        response_json = 百度手写文字识别.run()
-    with open("./img/img2.png", 'rb') as f:
+        words_result = response_json.get('words_result')
-        image = f.read()
+        words = ''
-    res = ocr.classification(image)
+        for word in words_result:
-    print(res)
+            words += word.get('words')
-    return response.text
+        code = ''
        if '减' in words:
            word_list = words.split("减")
            code = int(word_list[0]) - int(word_list[1])
        elif '-' in words:
            word_list = words.split("-")
            code = int(word_list[0]) - int(word_list[1])
        elif '+' in words:
            word_list = words.split("+")
            code = int(word_list[0]) + int(word_list[1])
        elif '加' in words:
            word_list = words.split("加")
            code = int(word_list[0]) + int(word_list[1])
        elif '*' in words:
            word_list = words.split("*")
            code = int(word_list[0]) * int(word_list[1])
        elif '乘' in words:
            word_list = words.split("乘")
            code = int(word_list[0]) * int(word_list[1])
        print(f"识别的文字是{words}-计算结果是{code}")
        return code
    except Exception as e:
        print(e)
        return ''
 def challenge18(page):
    if page != 1:
        code = code_value()
-    exit()
+    else:
        code = ''
    url = "https://www.python-spider.com/api/challenge18"
    payload = f"page={page}&code={code}"
    session = requests.session()
    headers = {
-        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
+        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'cookie': 'sessionid=你的sessionid;'
    }
-    session.headers = headers
+    response = requests.request("POST", url, headers=headers, data=payload)
    response = session.request("POST", url, data=payload)
    return response.json()
 def run():
    data_num = 0
-    for page in range(1, 101):
+    page = 1
    while True:
        res_dict = challenge18(page)
        if res_dict.get('message') == 'verify_failed':
            print(f"验证码没有通过{res_dict}-{page}")
            continue
        else:
            print(f"验证码通过{res_dict}-{page}")
        data_list = res_dict.get('data')
        print(data_list)
        for data in data_list:
            data_num += int(data.get('value'))
        print(data_num)
-    print(data_num)
+        page += 1
        if page == 101:
            break
 if __name__ == '__main__':
--- a/猿人学练习/18验证码图片验证码-计算/百度手写文字识别.py
+++ b/猿人学练习/18验证码图片验证码-计算/百度手写文字识别.py
@ -0,0 +1,71 @@
 from PIL import Image
 from urllib.parse import quote
 import base64
 import urllib
 import requests
 # Baidu credentials, sent as client_id / client_secret when requesting an
 # access token. The values below are placeholders to be replaced.
 API_KEY = "你的API_KEY"
 SECRET_KEY = "你的SECRET_KEY"
 def get_file_content_as_base64(path, urlencoded=False):
    """Read a file and return its contents as Base64 text.

    :param path: path of the file to encode
    :param urlencoded: when true, additionally percent-encode the Base64 text
    :return: the Base64 text, optionally URL-encoded
    """
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode("utf8")
    return urllib.parse.quote_plus(encoded) if urlencoded else encoded
 def get_access_token():
    """Request an OAuth access token from Baidu using API_KEY and SECRET_KEY.

    :return: the ``access_token`` field of the JSON reply converted with
        ``str``; when the reply has no such field this is the literal
        string ``"None"``, not the ``None`` object
    :raises requests.RequestException: on connection failure or timeout
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.post(url, params=params, timeout=10)
    return str(response.json().get("access_token"))
 def processing_image(img_file, standard=200):
    """Binarize a captcha image so the text stands out from the background.

    The image is converted to 8-bit grayscale; every pixel brighter than
    ``standard`` becomes 255 (white) and all others become 0 (black).

    :param img_file: path or file object accepted by ``Image.open``
    :param standard: grayscale threshold; pixels strictly above it turn white
    :return: a new mode ``'L'`` image containing only the values 0 and 255
    """
    # Image.open is lazy and keeps the file handle open; the context manager
    # closes it once convert() has copied the pixel data.
    with Image.open(img_file) as img:
        gray = img.convert('L')
    # point() applies the threshold through a lookup table, replacing the
    # per-pixel Python loop.
    return gray.point(lambda value: 255 if value > standard else 0)
 def run():
    """Binarize ./img/1.png and send it to Baidu's handwriting OCR endpoint.

    The processed image is written to ./img/1-test.png, Base64-encoded and
    posted as the form field ``image``.

    :return: the decoded JSON body of the OCR response
    """
    image_b = processing_image('./img/1.png')
    image_b.save('./img/1-test.png')
    # Let the helper percent-encode the Base64 text. Calling
    # quote(data, 'utf-8') would pass 'utf-8' as the *safe* character set,
    # not as an encoding, and only yields a valid form value by accident.
    image_to_base64_res = get_file_content_as_base64('./img/1-test.png', urlencoded=True)
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting?access_token=" + get_access_token()
    payload = f'image={image_to_base64_res}'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=10)
    # Decode the body once and reuse it for both the log line and the result.
    result = response.json()
    print(result)
    return result
 # Perform a single OCR request when this file is executed as a script.
 if __name__ == '__main__':
    run()