代码更新

2025-04-22 02:05:24 +08:00 · 2021-10-20 09:28:21 +08:00 · 2021-10-20 09:28:21 +08:00 · b58ade53d9
commit b58ade53d9
parent 21536e5c24
4 changed files with 23 additions and 24 deletions
--- a/1688/clean/extractor.py
+++ b/1688/clean/extractor.py
@@ -46,11 +46,11 @@ class extractor(Baes):
            offerUnit = globalData.get('tempModel').get('offerUnit')
            images = globalData.get('images')
-            for image in images:
+            # for image in images:
-                fullPathImageURI = image.get('fullPathImageURI')
+            #     fullPathImageURI = image.get('fullPathImageURI')
-                download_img(fullPathImageURI, offerId)
+            #     download_img(fullPathImageURI, offerId)
-                print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+            #     print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
-                time.sleep(1)
+            #     time.sleep(1)
            a_590893001984 = data.get('590893001984')
            if not a_590893001984:
@@ -72,8 +72,9 @@ class extractor(Baes):
            detailUrl = globalData.get('detailModel').get('detailUrl')
            item = {
                "sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)),
                "company_name": companyName,
-                "company_name_url": "https://detail.1688.com/offer/{}.html".format(offerId),
+                "url": "https://detail.1688.com/offer/{}.html".format(offerId),
                "title": title,
                "sub_categorys": sub_categorys,
                "sub_colour_categorys": sub_colour_categorys,
@@ -86,8 +87,7 @@ class extractor(Baes):
                "unit_weight": unitWeight
            }
            print(json.dumps(item))
-            exit()
+            self.col.insert_item('CLEAN_CONTENT', item)
            print(f"【{datetime.now()}】解析{offerId}")
--- a/1688/settings.py
+++ b/1688/settings.py
@@ -2,7 +2,7 @@ import os
 MONGODB_CONF = {
-    'host': '192.168.5.151',
+    'host': '127.0.0.1',
    'port': 27017,
    'username': '',
    'pwd': "",
--- a/1688/spider/1688企业产品列表页面.py
+++ b/1688/spider/1688企业产品列表页面.py
@@ -10,15 +10,14 @@ class Film(Baes):
    def __init__(self):
        self.col = MongoDao()
-        self.url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum={}"
+        self.url = "https://shop1456245592469.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator." \
+                   "4.79f525026COu37&pageNum={}"
        super(Film, self).__init__()
    def run(self):
-        for i in range(15, 24):
+        for i in range(1, 33):
-            cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1"
+            cookie2 = "1bd7858d65500ba53956ab164308ad2a"
-            x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \
+            x5sec = "7b226b796c696e3b32223a223932613866656331376631373065326331363635306638303764646635666438434f766276497347454c36483635715739704f7968674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
-                    "35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \
-                    "4d5463344e4473784d4f57426e355542227d"
            url = self.url.format(i).replace('detail', 'm')
            headers = {
                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
@@ -26,21 +25,23 @@ class Film(Baes):
            response = requests.request("GET", url, headers=headers)
            if '系统自动生成，请勿修改 100%' in response.text:
-                print(f"【{datetime.now()}】报错{i}")
+                print(f"系统自动生成，请勿修改 100%【{datetime.now()}】报错{i}")
                exit()
            if '全球领先的采购批发平台,批发网' in response.text:
-                print(f"【{datetime.now()}】报错{i}")
+                print(f"全球领先的采购批发平台,批发网【{datetime.now()}】报错{i}")
                exit()
            sel = Selector(text=response.text, type='html')
            urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
+            shop_name = sel.xpath('//div[@class="name-wrap"]//a/text()').extract_first()
            for url in urls:
                item = {
                    "sign": self.generate_sign(url),
                    "url": url,
-                    "stauts": "0"
+                    "stauts": "0",
+                    "shop_name": shop_name
                }
                self.col.insert_item('RAW_URLS', item)
            time.sleep(10)
--- a/1688/spider/1688企业产品详情页面.py
+++ b/1688/spider/1688企业产品详情页面.py
@@ -16,20 +16,18 @@ class 企业产品详情页面(Baes):
        for s in res:
            url = s.get('url').replace('detail', 'm')
            sign = s.get('sign')
-            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223532323634333863383734346532666230393835646164396366336533376664434d756776597347454a5056377266593972325539414561437a59324f4449794d5463344e4473784d4f584d6a3462342f2f2f2f2f77453d227d"
-                    "323835663332623033396233366663613833323639396433326236364350372b76346" \
-                    "f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d"
            headers = {
-                'cookie': f"x5sec={x5sec}"
+                'Cookie': f"x5sec={x5sec}"
            }
            response = requests.request("GET", url, headers=headers)
            if '系统自动生成，请勿修改 100%' in response.text:
-                print(f"【{datetime.now()}】报错{url}")
+                print(f"系统自动生成，请勿修改 100%【{datetime.now()}】报错{url}")
                exit()
            if '全球领先的采购批发平台,批发网' in response.text:
-                print(f"【{datetime.now()}】报错{url}")
+                print(f"全球领先的采购批发平台,批发网【{datetime.now()}】报错{url}")
                exit()
            item = {