增加1688数据

This commit is contained in:
luzhisheng 2021-09-26 17:29:26 +08:00
parent 8ebb4e6461
commit eecfc80a1c
8 changed files with 145 additions and 16 deletions

0
1688/clean/__init__.py Normal file
View File

80
1688/clean/extractor.py Normal file
View File

@ -0,0 +1,80 @@
from dao.mongo_dao import MongoDao
from scrapy.selector import Selector
from spider.baes import Baes
from datetime import datetime
from tool.download_img import download_img
import time
import json
import re
class extractor(Baes):
    """Parse raw 1688 product-detail HTML stored in MongoDB into structured items.

    Reads documents from the RAW_CONTENT collection, pulls the embedded
    ``window.__INIT_DATA`` JSON blob out of each page, extracts the product
    fields (SKUs, range prices, seller info, props) and downloads every
    product image to the local docs folder.
    """

    def __init__(self):
        # Shared MongoDB access object used for all reads.
        self.col = MongoDao()
        super(extractor, self).__init__()

    def run(self):
        """Iterate every stored page, extract its product data and print it as JSON."""
        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
        for doc in res:
            content = doc.get('content')
            sel = Selector(text=content, type='html')
            title = sel.xpath('//title/text()').extract_first()

            # The product payload is embedded as a JS assignment:
            # window.__INIT_DATA={...}. Dots escaped to match literally;
            # the original unescaped pattern matched the same pages but
            # only by accident of '.' matching '.'.
            matches = re.findall(r'window\.__INIT_DATA=(\{.*\})', content)
            if not matches:
                # Page without the expected payload (blocked or empty
                # response) — skip instead of crashing with IndexError.
                continue
            json_dict = json.loads(matches[0])

            global_data = json_dict.get('globalData')
            temp_model = global_data.get('tempModel')
            offer_id = temp_model.get('offerId')
            data = json_dict.get('data')

            # Flatten the per-SKU map into a list of spec/price/stock dicts.
            sku_info_map = global_data.get('skuModel').get('skuInfoMap')
            sub_categorys = []
            for spec_attrs, sku in sku_info_map.items():
                sub_categorys.append({
                    'specId': sku.get('specId'),
                    'specAttrs': spec_attrs,
                    'discountPrice': sku.get('discountPrice'),
                    'canBookCount': sku.get('canBookCount'),
                })

            order_param = global_data.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices')
            company_name = temp_model.get('companyName')
            seller_login_id = temp_model.get('sellerLoginId')
            offer_unit = temp_model.get('offerUnit')

            # Download every product image into the offerId folder;
            # throttle to one request per second.
            images = global_data.get('images')
            for image in images:
                full_path_image_uri = image.get('fullPathImageURI')
                download_img(full_path_image_uri, offer_id)
                print(f"{datetime.now()}】图片下载{full_path_image_uri}")
                time.sleep(1)

            # The props block lives under one of two known module ids,
            # depending on which page template version was served.
            props_module = data.get('590893002003') or data.get('605462009362')
            props_list = props_module.get('data').get('propsList')
            detail_url = global_data.get('detailModel').get('detailUrl')

            item = {
                "company_name": company_name,
                "company_name_url": "https://detail.1688.com/offer/{}.html".format(offer_id),
                "title": title,
                "sub_categorys": sub_categorys,
                "order_param_model": order_param,
                "sellerLoginId": seller_login_id,
                "offerUnit": offer_unit,
                "images": images,
                "propsList": props_list,
                "detailUrl": detail_url
            }
            print(json.dumps(item))
            # The original debug `exit()` here stopped the whole run after
            # the first document and made the log line below unreachable;
            # removed so the entire collection is processed.
            print(f"{datetime.now()}】解析{offer_id}")
if __name__ == '__main__':
    # Script entry point: run the extractor over the stored raw pages.
    extractor().run()

View File

@ -40,13 +40,14 @@ class MongoDao(object):
print(f"{datetime.now()}】入库{item.get('url')}")
return collection.insert_one(item)
def update_item(self, collection, item):
def update_item(self, collection, sign):
collection = self.client[collection]
if collection.find_one({"sign": item['sign']}):
return collection.update_one({"sign": item['sign']}, {"$set": {"stauts": '1'}})
if collection.find_one({"sign": sign}):
return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
else:
print(f"{datetime.now()}】过滤")
def find_item(self, collection, query, projection):
collection = self.client[collection]
return collection.find(query, projection)
return collection.find(query, projection).batch_size(1)

View File

@ -1,5 +1,8 @@
import os
MONGODB_CONF = {
'host': '127.0.0.1',
'host': '192.168.5.151',
'port': 27017,
'username': '',
'pwd': "",
@ -8,3 +11,6 @@ MONGODB_CONF = {
'status': '',
'producer': ''
}
DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
excel_path = os.path.join(DOCS_PATH, '1688/docs/')

View File

@ -14,10 +14,12 @@ class Film(Baes):
super(Film, self).__init__()
def run(self):
for i in range(1, 24):
cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
url = self.url.format(i)
for i in range(15, 24):
cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1"
x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \
"35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \
"4d5463344e4473784d4f57426e355542227d"
url = self.url.format(i).replace('detail', 'm')
headers = {
'cookie': f"cookie2={cookie2};x5sec={x5sec}"
}
@ -27,13 +29,18 @@ class Film(Baes):
print(f"{datetime.now()}】报错{i}")
exit()
if '全球领先的采购批发平台,批发网' in response.text:
print(f"{datetime.now()}】报错{i}")
exit()
sel = Selector(text=response.text, type='html')
urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
for url in urls:
item = {
"sign": self.generate_sign(url),
"url": url
"url": url,
"stauts": "0"
}
self.col.insert_item('RAW_URLS', item)
time.sleep(10)

View File

@ -12,13 +12,15 @@ class 企业产品详情页面(Baes):
super(企业产品详情页面, self).__init__()
def run(self):
res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1})
res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1})
for s in res:
url = s.get('url')
cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
url = s.get('url').replace('detail', 'm')
sign = s.get('sign')
x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \
"323835663332623033396233366663613833323639396433326236364350372b76346" \
"f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d"
headers = {
'cookie': f"cookie2={cookie2};x5sec={x5sec}"
'cookie': f"x5sec={x5sec}"
}
response = requests.request("GET", url, headers=headers)
@ -26,13 +28,17 @@ class 企业产品详情页面(Baes):
print(f"{datetime.now()}】报错{url}")
exit()
if '全球领先的采购批发平台,批发网' in response.text:
print(f"{datetime.now()}】报错{url}")
exit()
item = {
"sign": self.generate_sign(url),
"url": url,
"content": response.text
}
self.col.insert_item('RAW_CONTENT', item)
self.col.update_item('RAW_URLS', item)
self.col.update_item('RAW_URLS', sign)
time.sleep(10)

0
1688/tool/__init__.py Normal file
View File

29
1688/tool/download_img.py Normal file
View File

@ -0,0 +1,29 @@
from urllib.parse import urlparse
import settings
import requests
import os
def request_download(image_url, path):
    """Fetch *image_url* and save it under settings.excel_path/<path>/.

    The file name is the last path segment of the URL.
    """
    image_name = urlparse(image_url).path.rsplit("/", 1)[-1]
    response = requests.get(image_url)
    target = f'{settings.excel_path}{path}/{image_name}'
    with open(target, 'wb') as fh:
        fh.write(response.content)
def mkdir(path):
    """Create the image folder settings.excel_path/<path> if it does not exist."""
    # makedirs(exist_ok=True) replaces the original check-then-create
    # (os.path.exists + os.makedirs), which was racy and needed two
    # filesystem calls.
    os.makedirs(f"{settings.excel_path}{path}", exist_ok=True)
def download_img(image_url, path):
    # Ensure the per-product folder exists, then fetch the image into it.
    # `path` is the sub-folder name under settings.excel_path (callers
    # pass the 1688 offerId).
    mkdir(path)
    request_download(image_url, path)
if __name__ == '__main__':
    # Manual smoke test: download one known product image into a
    # "test" sub-folder.
    download_img(
        "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg",
        "test",
    )