From 56b82b0a801ed2417ab0176c3f501d5b78ce308d Mon Sep 17 00:00:00 2001 From: luzhisheng Date: Wed, 20 Oct 2021 11:39:41 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=BC=E5=87=BA=E7=A8=8B?= =?UTF-8?q?=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 1688/clean/extractor.py | 21 ++++++++++++--------- 1688/settings.py | 2 +- 1688/spider/导出到本地json数据.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 10 deletions(-) create mode 100644 1688/spider/导出到本地json数据.py diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py index cfec631..74214d0 100644 --- a/1688/clean/extractor.py +++ b/1688/clean/extractor.py @@ -25,6 +25,8 @@ class extractor(Baes): globalData = json_dict.get('globalData') offerId = globalData.get('tempModel').get('offerId') + print(f"【{datetime.now()}】解析 {offerId}") + data = json_dict.get('data') skuInfoMap = globalData.get('skuModel').get('skuInfoMap') @@ -37,8 +39,11 @@ class extractor(Baes): } sub_categorys.append(sub_categorys_dict) - value = globalData.get('skuModel').get('skuProps')[0].get('value') - sub_colour_categorys = value + if globalData.get('skuModel').get('skuProps'): + value = globalData.get('skuModel').get('skuProps')[0].get('value') + sub_colour_categorys = value + else: + sub_colour_categorys = [] orderParam = globalData.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices') companyName = globalData.get('tempModel').get('companyName') @@ -46,11 +51,11 @@ class extractor(Baes): offerUnit = globalData.get('tempModel').get('offerUnit') images = globalData.get('images') - # for image in images: - # fullPathImageURI = image.get('fullPathImageURI') - # download_img(fullPathImageURI, offerId) - # print(f"【{datetime.now()}】图片下载{fullPathImageURI}") - # time.sleep(1) + for image in images: + fullPathImageURI = image.get('fullPathImageURI') + download_img(fullPathImageURI, offerId) + print(f"【{datetime.now()}】图片下载{fullPathImageURI}") + time.sleep(1) a_590893001984 = data.get('590893001984') if not a_590893001984: @@ -86,9 +91,7 @@ class extractor(Baes): "detailUrl": detailUrl, "unit_weight": unitWeight } - print(json.dumps(item)) self.col.insert_item('CLEAN_CONTENT', item) - print(f"【{datetime.now()}】解析{offerId}") if __name__ == '__main__': diff --git a/1688/settings.py b/1688/settings.py index 54b5073..9735374 100644 --- a/1688/settings.py +++ b/1688/settings.py @@ -2,7 +2,7 @@ import os MONGODB_CONF = { - 'host': '127.0.0.1', + 'host': '192.168.1.107', 'port': 27017, 'username': '', 'pwd': "", diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py new file mode 100644 index 0000000..24a3c87 --- /dev/null +++ b/1688/spider/导出到本地json数据.py @@ -0,0 +1,30 @@ +from dao.mongo_dao import MongoDao +from spider.baes import Baes +from datetime import datetime +import time +import json + + +class 导出到本地json数据(Baes): + + def __init__(self): + self.col = MongoDao() + super(导出到本地json数据, self).__init__() + + def run(self): + res = self.col.find_item('CLEAN_CONTENT', {}, {"company_name": 1, "url": 1, "title": 1, "sub_categorys": 1, + "sub_colour_categorys": 1, "order_param_model": 1, + "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1, + "detailUrl": 1, "unit_weight": 1}) + + for s in res: + s.pop('_id') + with open(f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(s) + '\n') + + print(f"【{datetime.now()}】完成") + + +if __name__ == '__main__': + f = 导出到本地json数据() + f.run()