猿人学第8题-验证码图文点选

2025-04-21 07:35:13 +08:00 · 2023-06-28 01:58:23 +08:00 · 2023-06-28 01:58:23 +08:00 · 141cc1c7c8
commit 141cc1c7c8
parent 63be0d7d30
17 changed files with 69 additions and 17 deletions
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/README.md
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/README.md
@ -1,4 +1,4 @@
-# 知识点:图像识别、去除干扰线、降噪、机器学习
+# 知识点:图像识别、去除干扰线、降噪、机器学习、headers请求顺序
 ## 一、解决点选坐标问题
@ -22,7 +22,7 @@
 ![debugger](./img/3.png)
-由此推断
+由此推断，坐标和对应的值
    第1个图片坐标值可以是（0～9）
    第2个图片坐标值可以是（10～19）
@ -34,6 +34,22 @@
    第8个图片坐标值可以是（610～619）
    第9个图片坐标值可以是（620～629）
 这里可以写一个 Python 方法，根据图片序号（1～9）随机生成对应范围内的坐标值
    def coordinate(num):
        """Return a random click-coordinate value for captcha cell ``num``.

        Each of the nine cells (numbered 1-9) accepts any value from a
        10-wide inclusive range; one value from that range is drawn at
        random on every call.

        Args:
            num: Cell number, expected to be an int from 1 to 9.

        Returns:
            A random int inside the cell's range, or ``None`` when ``num``
            is not a known cell number.
        """
        # First value of each cell's inclusive 10-wide range.
        range_starts = {
            1: 0,
            2: 10,
            3: 20,
            4: 300,
            5: 310,
            6: 320,
            7: 600,
            8: 610,
            9: 620,
        }
        start = range_starts.get(num)
        if start is None:
            return None
        # Draw only for the requested cell instead of evaluating all nine
        # random.randint calls and discarding eight of them.
        return random.randint(start, start + 9)
 ## 二、图像去噪点处理
 ### 选出出现次数最多的2种RGB颜色，进行降噪处理
@ -145,3 +161,26 @@
 ![debugger](./img/f-2.jpg)
 ## 三、图像识别/机器学习
 测试了百度OCR、ddddocr，还尝试了自己训练模型，效果都不是很好，菜是原罪啊！最后发现科大讯飞OCR对这些生僻字的识别效果还是不错的。
 ![debugger](./img/6.png)
 效果
 ![debugger](./img/7.png)
 ## 四、headers请求顺序
    HEADERS = {
        'Proxy-Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'User-Agent': 'yuanrenxue.project',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://match.yuanrenxue.com/match/8',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    SESSION = requests.session()
    SESSION.headers = HEADERS
    req = SESSION.get(self.url)
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/b-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/b-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/c-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/c-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/d-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/d-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-1.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-1.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-2.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-2.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-3.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-3.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-4.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-4.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-5.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-5.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-6.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-6.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-7.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-7.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-8.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-8.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-9.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-9.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/main.py
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/main.py
@ -3,6 +3,17 @@ import requests
 import re
 import base64
 HEADERS = {
    'Proxy-Connection': 'keep-alive',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': 'yuanrenxue.project',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'http://match.yuanrenxue.com/match/8',
    'Accept-Language': 'zh-CN,zh;q=0.9'
 }
 SESSION = requests.session()
 SESSION.headers = HEADERS
 class YuanrenXuan(object):
@ -11,7 +22,7 @@ class YuanrenXuan(object):
        self.sum_value = 0
    def get_task(self):
-        req = requests.get(self.url)
+        req = SESSION.get(self.url)
        text = re.findall(r'请依次点击：---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
        text_list = text.split('</p>---<p>')
        img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
@ -22,16 +33,18 @@ class YuanrenXuan(object):
        return text_list
    def get_match(self, page, answer):
-        url = f"https://match.yuanrenxue.cn/api/match/8?page={page}&answer={answer}"
+        url = f"https://match.yuanrenxue.cn/api/match/8"
-        payload = {}
+        print(answer)
-        headers = {
+        params = (
-            'cookie': 'sessionid=iikaj9bo7vzqv4mz1xvryl13o7z98l13;'
+            ('page', str(page)),
-        }
+            ('answer', answer)
-        response = requests.request("GET", url, headers=headers, data=payload)
+        )
-        print(response.json())
+        response = SESSION.get(url, params=params)
        return response.json()
    def run(self):
        num = 1
        data_num_list = []
        while True:
            text_list = self.get_task()
            print(text_list)
@ -43,14 +56,14 @@ class YuanrenXuan(object):
            answer = '|'.join(answer_list)
            if 'None' in answer:
                continue
-            print(num, answer)
+            data_list = self.get_match(num, answer).get('data')
-            self.get_match(num, answer)
+            for data in data_list:
                data_num_list.append(int(data.get('value')))
            num += 1
            if num == 5:
                break
-
+        print(data_num_list)
            exit()
 if __name__ == '__main__':
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
@ -106,9 +106,9 @@ def get_result(universal_ocr, file_path):
 def run_ocr(file_path):
-    appid = "xxxx"
+    appid = "zzzz"
    apisecret = "xxxx"
-    apikey = "xxxxx"
+    apikey = "xxxx"
    universal_ocr = UniversalOcr(appid, apikey, apisecret)
    res = get_result(universal_ocr, file_path)
    return res.get('whole_text')