diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py index cfec631..74214d0 100644 --- a/1688/clean/extractor.py +++ b/1688/clean/extractor.py @@ -25,6 +25,8 @@ class extractor(Baes): globalData = json_dict.get('globalData') offerId = globalData.get('tempModel').get('offerId') + print(f"【{datetime.now()}】解析 {offerId}") + data = json_dict.get('data') skuInfoMap = globalData.get('skuModel').get('skuInfoMap') @@ -37,8 +39,11 @@ class extractor(Baes): } sub_categorys.append(sub_categorys_dict) - value = globalData.get('skuModel').get('skuProps')[0].get('value') - sub_colour_categorys = value + if globalData.get('skuModel').get('skuProps'): + value = globalData.get('skuModel').get('skuProps')[0].get('value') + sub_colour_categorys = value + else: + sub_colour_categorys = [] orderParam = globalData.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices') companyName = globalData.get('tempModel').get('companyName') @@ -46,11 +51,11 @@ class extractor(Baes): offerUnit = globalData.get('tempModel').get('offerUnit') images = globalData.get('images') - # for image in images: - # fullPathImageURI = image.get('fullPathImageURI') - # download_img(fullPathImageURI, offerId) - # print(f"【{datetime.now()}】图片下载{fullPathImageURI}") - # time.sleep(1) + for image in images: + fullPathImageURI = image.get('fullPathImageURI') + download_img(fullPathImageURI, offerId) + print(f"【{datetime.now()}】图片下载{fullPathImageURI}") + time.sleep(1) a_590893001984 = data.get('590893001984') if not a_590893001984: @@ -86,9 +91,7 @@ class extractor(Baes): "detailUrl": detailUrl, "unit_weight": unitWeight } - print(json.dumps(item)) self.col.insert_item('CLEAN_CONTENT', item) - print(f"【{datetime.now()}】解析{offerId}") if __name__ == '__main__': diff --git a/1688/settings.py b/1688/settings.py index 54b5073..9735374 100644 --- a/1688/settings.py +++ b/1688/settings.py @@ -2,7 +2,7 @@ import os MONGODB_CONF = { - 'host': '127.0.0.1', + 'host': '192.168.1.107', 'port': 27017, 
'username': '', 'pwd': "",
diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py
new file mode 100644
index 0000000..24a3c87
--- /dev/null
+++ b/1688/spider/导出到本地json数据.py
@@ -0,0 +1,48 @@
+from dao.mongo_dao import MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到本地json数据(Baes):
+    """Export the CLEAN_CONTENT collection to a date-stamped JSON Lines file."""
+
+    # Fields requested from each CLEAN_CONTENT document, in projection order.
+    EXPORT_FIELDS = (
+        "company_name", "url", "title", "sub_categorys",
+        "sub_colour_categorys", "order_param_model", "sellerLoginId",
+        "offerUnit", "images", "propsList", "detailUrl", "unit_weight",
+    )
+
+    def __init__(self):
+        # self.col is assigned before the base initializer runs; keep this order.
+        self.col = MongoDao()
+        super(导出到本地json数据, self).__init__()
+
+    def run(self):
+        """Append every CLEAN_CONTENT document to the export file as JSON Lines.
+
+        The output path is relative to the current working directory and carries
+        the local date taken once, when the export starts. The file is opened in
+        append mode, so it is created even when the query yields no documents.
+        """
+        projection = {field: 1 for field in self.EXPORT_FIELDS}
+        res = self.col.find_item('CLEAN_CONTENT', {}, projection)
+
+        # Build the path and open the file once instead of once per document, so
+        # a run that crosses midnight does not split its output across two files.
+        path = f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json"
+        # json.dumps escapes non-ASCII by default, so UTF-8 output is plain ASCII.
+        with open(path, "a", encoding="utf-8") as out:
+            for doc in res:
+                # Drop the database id before serializing; tolerate its absence.
+                doc.pop('_id', None)
+                out.write(json.dumps(doc) + '\n')
+
+        print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    exporter = 导出到本地json数据()
+    exporter.run()