From b58ade53d90e1823e5bbb4c2f96751771fa9406a Mon Sep 17 00:00:00 2001 From: aiguigu Date: Wed, 20 Oct 2021 09:28:21 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 1688/clean/extractor.py | 16 ++++++++-------- 1688/settings.py | 2 +- 1688/spider/1688企业产品列表页面.py | 19 ++++++++++--------- 1688/spider/1688企业产品详情页面.py | 10 ++++------ 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py index e00a738..cfec631 100644 --- a/1688/clean/extractor.py +++ b/1688/clean/extractor.py @@ -46,11 +46,11 @@ class extractor(Baes): offerUnit = globalData.get('tempModel').get('offerUnit') images = globalData.get('images') - for image in images: - fullPathImageURI = image.get('fullPathImageURI') - download_img(fullPathImageURI, offerId) - print(f"【{datetime.now()}】图片下载{fullPathImageURI}") - time.sleep(1) + # for image in images: + # fullPathImageURI = image.get('fullPathImageURI') + # download_img(fullPathImageURI, offerId) + # print(f"【{datetime.now()}】图片下载{fullPathImageURI}") + # time.sleep(1) a_590893001984 = data.get('590893001984') if not a_590893001984: @@ -72,8 +72,9 @@ class extractor(Baes): detailUrl = globalData.get('detailModel').get('detailUrl') item = { + "sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)), "company_name": companyName, - "company_name_url": "https://detail.1688.com/offer/{}.html".format(offerId), + "url": "https://detail.1688.com/offer/{}.html".format(offerId), "title": title, "sub_categorys": sub_categorys, "sub_colour_categorys": sub_colour_categorys, @@ -86,8 +87,7 @@ class extractor(Baes): "unit_weight": unitWeight } print(json.dumps(item)) - exit() - + self.col.insert_item('CLEAN_CONTENT', item) print(f"【{datetime.now()}】解析{offerId}") diff --git a/1688/settings.py b/1688/settings.py index e123fc9..54b5073 100644 --- a/1688/settings.py +++ b/1688/settings.py @@ -2,7 +2,7 @@ import os MONGODB_CONF = { - 'host': '192.168.5.151', + 'host': '127.0.0.1', 'port': 27017, 'username': '', 'pwd': "", diff --git a/1688/spider/1688企业产品列表页面.py b/1688/spider/1688企业产品列表页面.py index 8cb1874..b0f22f0 100644 --- a/1688/spider/1688企业产品列表页面.py +++ b/1688/spider/1688企业产品列表页面.py @@ -10,15 +10,14 @@ class Film(Baes): def __init__(self): self.col = MongoDao() - self.url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum={}" + self.url = "https://shop1456245592469.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator." \ + "4.79f525026COu37&pageNum={}" super(Film, self).__init__() def run(self): - for i in range(15, 24): - cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1" - x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \ - "35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \ - "4d5463344e4473784d4f57426e355542227d" + for i in range(1, 33): + cookie2 = "1bd7858d65500ba53956ab164308ad2a" + x5sec = "7b226b796c696e3b32223a223932613866656331376631373065326331363635306638303764646635666438434f766276497347454c36483635715739704f7968674561437a59324f4449794d5463344e4473784d4f57426e355542227d" url = self.url.format(i).replace('detail', 'm') headers = { 'cookie': f"cookie2={cookie2};x5sec={x5sec}" @@ -26,21 +25,23 @@ class Film(Baes): response = requests.request("GET", url, headers=headers) if '系统自动生成,请勿修改 100%' in response.text: - print(f"【{datetime.now()}】报错{i}") + print(f"系统自动生成,请勿修改 100%【{datetime.now()}】报错{i}") exit() if '全球领先的采购批发平台,批发网' in response.text: - print(f"【{datetime.now()}】报错{i}") + print(f"全球领先的采购批发平台,批发网【{datetime.now()}】报错{i}") exit() sel = Selector(text=response.text, type='html') urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract() + shop_name = sel.xpath('//div[@class="name-wrap"]//a/text()').extract_first() for url in urls: item = { "sign": self.generate_sign(url), "url": url, - "stauts": "0" + "stauts": "0", + "shop_name": shop_name } self.col.insert_item('RAW_URLS', item) time.sleep(10) diff --git a/1688/spider/1688企业产品详情页面.py b/1688/spider/1688企业产品详情页面.py index 3b9e1a0..ab4c970 100644 --- a/1688/spider/1688企业产品详情页面.py +++ b/1688/spider/1688企业产品详情页面.py @@ -16,20 +16,18 @@ class 企业产品详情页面(Baes): for s in res: url = s.get('url').replace('detail', 'm') sign = s.get('sign') - x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \ - "323835663332623033396233366663613833323639396433326236364350372b76346" \ - "f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d" + x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223532323634333863383734346532666230393835646164396366336533376664434d756776597347454a5056377266593972325539414561437a59324f4449794d5463344e4473784d4f584d6a3462342f2f2f2f2f77453d227d" headers = { - 'cookie': f"x5sec={x5sec}" + 'Cookie': f"x5sec={x5sec}" } response = requests.request("GET", url, headers=headers) if '系统自动生成,请勿修改 100%' in response.text: - print(f"【{datetime.now()}】报错{url}") + print(f"系统自动生成,请勿修改 100%【{datetime.now()}】报错{url}") exit() if '全球领先的采购批发平台,批发网' in response.text: - print(f"【{datetime.now()}】报错{url}") + print(f"全球领先的采购批发平台,批发网【{datetime.now()}】报错{url}") exit() item = {