diff --git a/1688/clean/__init__.py b/1688/clean/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py
new file mode 100644
index 0000000..3cc0864
--- /dev/null
+++ b/1688/clean/extractor.py
@@ -0,0 +1,80 @@
+from dao.mongo_dao import MongoDao
+from scrapy.selector import Selector
+from spider.baes import Baes
+from datetime import datetime
+from tool.download_img import download_img
+import time
+import json
+import re
+
+
+class extractor(Baes):
+
+    def __init__(self):
+        self.col = MongoDao()
+        super(extractor, self).__init__()
+
+    def run(self):
+        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        for s in res:
+            content = s.get('content')
+            sel = Selector(text=content, type='html')
+            title = sel.xpath('//title/text()').extract_first()
+            json_item = re.findall(r'window.__INIT_DATA=(\{.*\})', content)[0]
+            json_dict = json.loads(json_item)
+            globalData = json_dict.get('globalData')
+            offerId = globalData.get('tempModel').get('offerId')
+
+            data = json_dict.get('data')
+            skuInfoMap = globalData.get('skuModel').get('skuInfoMap')
+
+            sub_categorys = []
+            for key, value in skuInfoMap.items():
+                sub_categorys_dict = {
+                    'specId': value.get('specId'),
+                    'specAttrs': key,
+                    'discountPrice': value.get('discountPrice'),
+                    'canBookCount': value.get('canBookCount')
+                }
+                sub_categorys.append(sub_categorys_dict)
+
+            orderParam = globalData.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices')
+            companyName = globalData.get('tempModel').get('companyName')
+            sellerLoginId = globalData.get('tempModel').get('sellerLoginId')
+            offerUnit = globalData.get('tempModel').get('offerUnit')
+            images = globalData.get('images')
+
+            for image in images:
+                fullPathImageURI = image.get('fullPathImageURI')
+                download_img(fullPathImageURI, offerId)
+                print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+                time.sleep(1)
+
+            a_590893002003 = data.get('590893002003')
+
+            if not a_590893002003:
+                a_590893002003 = data.get('605462009362')
+            propsList = a_590893002003.get('data').get('propsList')
+            detailUrl = globalData.get('detailModel').get('detailUrl')
+
+            item = {
+                "company_name": companyName,
+                "company_name_url": "https://detail.1688.com/offer/{}.html".format(offerId),
+                "title": title,
+                "sub_categorys": sub_categorys,
+                "order_param_model": orderParam,
+                "sellerLoginId": sellerLoginId,
+                "offerUnit": offerUnit,
+                "images": images,
+                "propsList": propsList,
+                "detailUrl": detailUrl
+            }
+            print(json.dumps(item))
+            exit()
+
+            print(f"【{datetime.now()}】解析{offerId}")
+
+
+if __name__ == '__main__':
+    f = extractor()
+    f.run()
diff --git a/1688/dao/mongo_dao.py b/1688/dao/mongo_dao.py
index 77c7a0a..a5261d3 100644
--- a/1688/dao/mongo_dao.py
+++ b/1688/dao/mongo_dao.py
@@ -40,13 +40,14 @@ class MongoDao(object):
             print(f"【{datetime.now()}】入库{item.get('url')}")
             return collection.insert_one(item)
 
-    def update_item(self, collection, item):
+    def update_item(self, collection, sign):
         collection = self.client[collection]
-        if collection.find_one({"sign": item['sign']}):
-            return collection.update_one({"sign": item['sign']}, {"$set": {"stauts": '1'}})
+        if collection.find_one({"sign": sign}):
+            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
         else:
             print(f"【{datetime.now()}】过滤")
 
     def find_item(self, collection, query, projection):
         collection = self.client[collection]
-        return collection.find(query, projection)
+        return collection.find(query, projection).batch_size(1)
+
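A note on the extraction step in clean/extractor.py above: the offer data is embedded in the page as a `window.__INIT_DATA` JSON blob, and the new code grabs it with `re.findall(...)[0]`, which raises an IndexError whenever the marker is missing (for example when a block page was stored instead of a real offer page). Below is a minimal sketch of a safer variant of that step; `extract_init_data` is a hypothetical helper name, not part of this diff:

```python
import json
import re


def extract_init_data(html: str) -> dict:
    """Pull the JSON blob assigned to window.__INIT_DATA out of the raw page HTML."""
    match = re.search(r'window\.__INIT_DATA\s*=\s*(\{.*\})', html)
    if match is None:
        # No marker usually means an anti-bot placeholder was stored, not the offer page.
        raise ValueError("window.__INIT_DATA not found in page")
    return json.loads(match.group(1))
```

Under the same layout this diff walks, `extract_init_data(content)['globalData']['skuModel']['skuInfoMap']` yields the spec-keyed SKU map that `run()` flattens into `sub_categorys`. The `.batch_size(1)` added to `find_item` makes the cursor fetch one document per round trip, which keeps memory low while each large `content` document is parsed and its images are downloaded.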
diff --git a/1688/settings.py b/1688/settings.py
index 3a11833..e123fc9 100644
--- a/1688/settings.py
+++ b/1688/settings.py
@@ -1,5 +1,8 @@
+import os
+
+
 MONGODB_CONF = {
-    'host': '127.0.0.1',
+    'host': '192.168.5.151',
     'port': 27017,
     'username': '',
     'pwd': "",
@@ -8,3 +11,6 @@ MONGODB_CONF = {
     'status': '',
     'producer': ''
 }
+
+DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+excel_path = os.path.join(DOCS_PATH, '1688/docs/')
diff --git a/1688/spider/1688企业产品列表页面.py b/1688/spider/1688企业产品列表页面.py
index 5f5d437..8cb1874 100644
--- a/1688/spider/1688企业产品列表页面.py
+++ b/1688/spider/1688企业产品列表页面.py
@@ -14,10 +14,12 @@ class Film(Baes):
         super(Film, self).__init__()
 
     def run(self):
-        for i in range(1, 24):
-            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
-            x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
-            url = self.url.format(i)
+        for i in range(15, 24):
+            cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1"
+            x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \
+                    "35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \
+                    "4d5463344e4473784d4f57426e355542227d"
+            url = self.url.format(i).replace('detail', 'm')
             headers = {
                 'cookie': f"cookie2={cookie2};x5sec={x5sec}"
             }
@@ -27,13 +29,18 @@
                 print(f"【{datetime.now()}】报错{i}")
                 exit()
 
+            if '全球领先的采购批发平台,批发网' in response.text:
+                print(f"【{datetime.now()}】报错{i}")
+                exit()
+
             sel = Selector(text=response.text, type='html')
             urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
 
             for url in urls:
                 item = {
                     "sign": self.generate_sign(url),
-                    "url": url
+                    "url": url,
+                    "stauts": "0"
                 }
                 self.col.insert_item('RAW_URLS', item)
             time.sleep(10)
diff --git a/1688/spider/1688企业产品详情页面.py b/1688/spider/1688企业产品详情页面.py
index 7896132..3b9e1a0 100644
--- a/1688/spider/1688企业产品详情页面.py
+++ b/1688/spider/1688企业产品详情页面.py
@@ -12,13 +12,15 @@ class 企业产品详情页面(Baes):
         super(企业产品详情页面, self).__init__()
 
     def run(self):
-        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1})
+        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1})
         for s in res:
-            url = s.get('url')
-            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
-            x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
+            url = s.get('url').replace('detail', 'm')
+            sign = s.get('sign')
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \
+                    "323835663332623033396233366663613833323639396433326236364350372b76346" \
+                    "f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d"
             headers = {
-                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
+                'cookie': f"x5sec={x5sec}"
             }
 
             response = requests.request("GET", url, headers=headers)
@@ -26,13 +28,17 @@
                 print(f"【{datetime.now()}】报错{url}")
                 exit()
 
+            if '全球领先的采购批发平台,批发网' in response.text:
+                print(f"【{datetime.now()}】报错{url}")
+                exit()
+
             item = {
                 "sign": self.generate_sign(url),
                 "url": url,
                 "content": response.text
             }
             self.col.insert_item('RAW_CONTENT', item)
-            self.col.update_item('RAW_URLS', item)
+            self.col.update_item('RAW_URLS', sign)
             time.sleep(10)
 
 
diff --git a/1688/tool/__init__.py b/1688/tool/__init__.py
new file mode 100644
index 0000000..e69de29
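Both spiders now repeat the same block-page check: a 200 response whose body contains the banner '全球领先的采购批发平台,批发网' means the anti-bot wall served a marketing placeholder instead of the real page. A small sketch of how that check could be factored out, assuming plain requests as in the spiders; `fetch_or_stop` and `BLOCK_MARKER` are hypothetical names, not part of this diff:

```python
import requests

# Marker text the 1688 anti-bot wall serves in place of the real page (taken from this diff).
BLOCK_MARKER = '全球领先的采购批发平台,批发网'


def fetch_or_stop(url: str, x5sec: str) -> str:
    """GET a page with the x5sec cookie and fail loudly when the request was blocked."""
    headers = {'cookie': f"x5sec={x5sec}"}
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200 or BLOCK_MARKER in response.text:
        # Stopping here mirrors the spiders' exit(): a blocked session needs fresh cookies.
        raise RuntimeError(f"blocked while fetching {url}")
    return response.text
```

The `.replace('detail', 'm')` applied to the URLs swaps the desktop `detail.1688.com` host for the mobile `m.1688.com` variant, presumably the page these x5sec cookies were issued for.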
diff --git a/1688/tool/download_img.py b/1688/tool/download_img.py
new file mode 100644
index 0000000..d043418
--- /dev/null
+++ b/1688/tool/download_img.py
@@ -0,0 +1,29 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+
+
+def request_download(image_url, path):
+    url_path = urlparse(image_url).path
+    image_name = url_path.split("/")[-1]
+    r = requests.get(image_url)
+    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+        f.write(r.content)
+
+
+def mkdir(path):
+    folder = os.path.exists(f"{settings.excel_path}{path}")
+    if not folder:
+        os.makedirs(f"{settings.excel_path}{path}")
+
+
+def download_img(image_url, path):
+    mkdir(path)
+    request_download(image_url, path)
+
+
+if __name__ == '__main__':
+    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
+    name = "test"
+    download_img(image_url, name)
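tool/download_img.py writes `r.content` to disk unconditionally, so a 404 body or a block page would be saved as a corrupt .jpg, and a request with no timeout can stall the extractor indefinitely. A more defensive variant, shown only as a sketch under the same `settings.excel_path` layout; `request_download_safe` is a hypothetical name, not part of this diff:

```python
import os
from urllib.parse import urlparse

import requests

import settings


def request_download_safe(image_url, path):
    """Download one image into settings.excel_path/<path>/, skipping bad responses."""
    image_name = urlparse(image_url).path.split("/")[-1]
    target = os.path.join(settings.excel_path, str(path), image_name)
    os.makedirs(os.path.dirname(target), exist_ok=True)  # replaces the mkdir() helper
    r = requests.get(image_url, timeout=30, stream=True)
    r.raise_for_status()  # do not write error or block pages to disk as images
    with open(target, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # stream to keep memory flat
            f.write(chunk)
    return target
```

Streaming with `iter_content` avoids holding each full image in memory before writing, which matters little for thumbnails but keeps behavior predictable for large originals.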