猿人学第8题-验证码图文点选

2025-04-21 12:15:16 +08:00 · 2023-06-27 19:52:05 +08:00 · 2023-06-27 19:52:05 +08:00 · 63be0d7d30
commit 63be0d7d30
parent ab69bcc0ed
1 changed files with 121 additions and 0 deletions
--- a/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
+++ b/猿人学Web端爬虫攻防刷题平台/猿人学第8题-验证码图文点选/xin_fei_ocr.py
@ -0,0 +1,121 @@
 from wsgiref.handlers import format_date_time
 from datetime import datetime
 from time import mktime
 import hashlib
 import base64
 import hmac
 from urllib.parse import urlencode
 import json
 import requests
 class AssembleHeaderException(Exception):
    def __init__(self, msg):
        self.message = msg
 class Url:
    def __init__(this, host, path, schema):
        this.host = host
        this.path = path
        this.schema = schema
 class UniversalOcr(object):
    def __init__(self, appid, apikey, apisecret):
        self.appid = appid
        self.apikey = apikey
        self.apisecret = apisecret
        self.url = 'http://api.xf-yun.com/v1/private/hh_ocr_recognize_doc'
    def parse_url(self, requset_url):
        stidx = requset_url.index("://")
        host = requset_url[stidx + 3:]
        schema = requset_url[:stidx + 3]
        edidx = host.index("/")
        if edidx <= 0:
            raise AssembleHeaderException("invalid request url:" + requset_url)
        path = host[edidx:]
        host = host[:edidx]
        u = Url(host, path, schema)
        return u
    def get_body(self, file_path):
        # 将payload中数据替换成实际能力内容，参考不同能力接口文档请求数据中payload
        file = open(file_path, 'rb')
        buf = file.read()
        body = {
            "header": {
                "app_id": self.appid,
                "status": 3
            },
            "parameter": {
                "hh_ocr_recognize_doc": {
                    "recognizeDocumentRes": {
                        "encoding": "utf8",
                        "compress": "raw",
                        "format": "json"
                    }
                }
            },
            "payload": {
                "image": {
                    "encoding": "jpg",
                    "image": str(base64.b64encode(buf), 'utf-8'),
                    "status": 3
                }
            }
        }
        return body
 def assemble_ws_auth_url(universal_ocr):
    requset_url = universal_ocr.url
    u = universal_ocr.parse_url(requset_url)
    api_key = universal_ocr.apikey
    api_secret = universal_ocr.apisecret
    host = u.host
    path = u.path
    now = datetime.now()
    date = format_date_time(mktime(now.timetuple()))
    signature_origin = "host: {}\ndate: {}\n{} {} HTTP/1.1".format(host, date, "POST", path)
    signature_sha = hmac.new(api_secret.encode('utf-8'), signature_origin.encode('utf-8'),
                             digestmod=hashlib.sha256).digest()
    signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')
    authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
        api_key, "hmac-sha256", "host date request-line", signature_sha)
    authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
    values = {
        "host": host,
        "date": date,
        "authorization": authorization
    }
    return requset_url + "?" + urlencode(values)
 def get_result(universal_ocr, file_path):
    request_url = assemble_ws_auth_url(universal_ocr)
    headers = {'content-type': "application/json", 'host': 'api.xf-yun.com', 'appid': 'APPID'}
    body = universal_ocr.get_body(file_path=file_path)
    response = requests.post(request_url, data=json.dumps(body), headers=headers)
    re = response.content.decode('utf8')
    str_result = json.loads(re)
    if str_result.__contains__('header') and str_result['header']['code'] == 0:
        renew_text = str_result['payload']['recognizeDocumentRes']['text']
        return json.loads(base64.b64decode(renew_text))
 def run_ocr(file_path):
    appid = "xxxx"
    apisecret = "xxxx"
    apikey = "xxxxx"
    universal_ocr = UniversalOcr(appid, apikey, apisecret)
    res = get_result(universal_ocr, file_path)
    return res.get('whole_text')
 if __name__ == "__main__":
    file_path = "./img_a/f-6.jpg"
    res = run_ocr(file_path)
    print(res)