猿人学第8题-验证码图文点选

2025-04-20 21:55:07 +08:00 · 2023-06-28 01:58:23 +08:00 · 2023-06-28 01:58:23 +08:00 · 141cc1c7c8
commit 141cc1c7c8
parent 63be0d7d30
17 changed files with 69 additions and 17 deletions
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/README.md
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/README.md
@ -1,4 +1,4 @@
-# 知识点:图像识别、去除干扰线、降噪、机器学习
+# 知识点:图像识别、去除干扰线、降噪、机器学习、headers请求顺序

 ## 一、解决点选坐标问题

@ -22,7 +22,7 @@

 ![debugger](./img/3.png)

-由此推断
+由此推断，各图片位置与坐标值的对应关系如下：

    第1个图片坐标值可以是（0～9）
    第2个图片坐标值可以是（10～19）
@ -34,6 +34,22 @@
    第8个图片坐标值可以是（610～619）
    第9个图片坐标值可以是（620～629）

+据此可以写一个 Python 函数，按图片序号（1～9）返回对应范围内的随机坐标值：
+
+    def coordinate(num):
+        coordinate_dict = {
+            1: random.randint(0, 9),
+            2: random.randint(10, 19),
+            3: random.randint(20, 29),
+            4: random.randint(300, 309),
+            5: random.randint(310, 319),
+            6: random.randint(320, 329),
+            7: random.randint(600, 609),
+            8: random.randint(610, 619),
+            9: random.randint(620, 629)
+        }
+        return coordinate_dict.get(num)
+
 ## 二、图像去噪点处理

 ### 选出出现次数最多的2种rgb颜色，进行降噪处理
@ -144,4 +160,27 @@

 ![debugger](./img/f-2.jpg)

-## 三、图像识别/机器学习
+## 三、图像识别/机器学习
+
+测试了百度 OCR、ddddocr，还尝试了自己训练模型，效果都不是很好，菜是原罪啊！最后发现科大讯飞 OCR 对这些生僻字的识别效果还是不错的。
+
+![debugger](./img/6.png)
+
+效果
+
+![debugger](./img/7.png)
+
+## 四、headers请求顺序
+
+    HEADERS = {
+        'Proxy-Connection': 'keep-alive',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'User-Agent': 'yuanrenxue.project',
+        'X-Requested-With': 'XMLHttpRequest',
+        'Referer': 'http://match.yuanrenxue.com/match/8',
+        'Accept-Language': 'zh-CN,zh;q=0.9'
+    }
+    SESSION = requests.session()
+    SESSION.headers = HEADERS
+
+    req = SESSION.get(self.url)
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/a.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/b-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/b-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/c-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/c-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/d-test.png
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/d-test.png
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-1.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-1.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-2.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-2.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-3.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-3.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-4.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-4.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-5.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-5.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-6.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-6.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-7.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-7.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-8.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-8.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-9.jpg
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/img_a/f-9.jpg
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/main.py
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/main.py
@ -3,6 +3,17 @@ import requests
 import re
 import base64

+HEADERS = {
+    'Proxy-Connection': 'keep-alive',
+    'Accept': 'application/json, text/javascript, */*; q=0.01',
+    'User-Agent': 'yuanrenxue.project',
+    'X-Requested-With': 'XMLHttpRequest',
+    'Referer': 'http://match.yuanrenxue.com/match/8',
+    'Accept-Language': 'zh-CN,zh;q=0.9'
+}
+SESSION = requests.session()
+SESSION.headers = HEADERS
+

 class YuanrenXuan(object):

@ -11,7 +22,7 @@ class YuanrenXuan(object):
        self.sum_value = 0

    def get_task(self):
-        req = requests.get(self.url)
+        req = SESSION.get(self.url)
        text = re.findall(r'请依次点击：---<p>(.*)</p>--- <br>提示', req.json().get('html'))[0]
        text_list = text.split('</p>---<p>')
        img = re.findall(r'<img src="(.*)" alt="">', req.json().get('html'))[0]
@ -22,16 +33,18 @@ class YuanrenXuan(object):
        return text_list

    def get_match(self, page, answer):
-        url = f"https://match.yuanrenxue.cn/api/match/8?page={page}&answer={answer}"
-        payload = {}
-        headers = {
-            'cookie': 'sessionid=iikaj9bo7vzqv4mz1xvryl13o7z98l13;'
-        }
-        response = requests.request("GET", url, headers=headers, data=payload)
-        print(response.json())
+        url = f"https://match.yuanrenxue.cn/api/match/8"
+        print(answer)
+        params = (
+            ('page', str(page)),
+            ('answer', answer)
+        )
+        response = SESSION.get(url, params=params)
+        return response.json()

    def run(self):
        num = 1
+        data_num_list = []
        while True:
            text_list = self.get_task()
            print(text_list)
@ -43,14 +56,14 @@ class YuanrenXuan(object):
            answer = '|'.join(answer_list)
            if 'None' in answer:
                continue
-            print(num, answer)
-            self.get_match(num, answer)
+            data_list = self.get_match(num, answer).get('data')
+            for data in data_list:
+                data_num_list.append(int(data.get('value')))
            num += 1

            if num == 5:
                break
-
-            exit()
+        print(data_num_list)


 if __name__ == '__main__':
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
@ -106,9 +106,9 @@ def get_result(universal_ocr, file_path):


 def run_ocr(file_path):
-    appid = "xxxx"
+    appid = "zzzz"
    apisecret = "xxxx"
-    apikey = "xxxxx"
+    apikey = "xxxx"
    universal_ocr = UniversalOcr(appid, apikey, apisecret)
    res = get_result(universal_ocr, file_path)
    return res.get('whole_text')