diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py index 8fbc427..205b792 100644 --- a/1688/clean/extractor.py +++ b/1688/clean/extractor.py @@ -78,14 +78,14 @@ class extractor(Baes): a_590893001997 = data.get('590893001997') if not a_590893001997: - unitWeight = data.get('605462009364').get('data').get('test').get('unitWeight') + # unitWeight = data.get('605462009364').get('data').get('test').get('unitWeight') location = data.get('605462009364').get('data').get('location') cost = data.get('605462009364').get('data').get('logistics') else: - unitWeight = a_590893001997.get('data').get('test').get('unitWeight') + # unitWeight = a_590893001997.get('data').get('test').get('unitWeight') location = a_590893001997.get('data').get('location') cost = a_590893001997.get('data').get('logistics') - logistics = [{"from": location}, {"cost": cost}] + logistics = [{"from": location}, {"cost": cost.replace('快递', '').strip()}] a_590893002003 = data.get('590893002003') if not a_590893002003: @@ -109,7 +109,7 @@ class extractor(Baes): "images": images, "propsList": propsList, "detailUrl": detailUrl, - "unit_weight": unitWeight, + "unit_weight": "", "logistics": logistics } self.col.insert_item('CLEAN_CONTENT', item) diff --git a/1688/dao/mongo_dao.py b/1688/dao/mongo_dao.py index a5261d3..78657eb 100644 --- a/1688/dao/mongo_dao.py +++ b/1688/dao/mongo_dao.py @@ -47,7 +47,7 @@ class MongoDao(object): else: print(f"【{datetime.now()}】过滤") - def find_item(self, collection, query, projection): + def find_item(self, collection, *args, **kwargs): collection = self.client[collection] - return collection.find(query, projection).batch_size(1) + return collection.find(*args, **kwargs).batch_size(1) diff --git a/1688/spider/1688企业产品详情页面.py b/1688/spider/1688企业产品详情页面.py index 6a64888..8e08c11 100644 --- a/1688/spider/1688企业产品详情页面.py +++ b/1688/spider/1688企业产品详情页面.py @@ -16,7 +16,7 @@ class 企业产品详情页面(Baes): for s in res: url = s.get('url').replace('detail', 'm') sign = s.get('sign') - x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a22313336323861633166303531646664306233326164313139386263343465313343505867386f7347454f7963717172677a49437643686f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d" + x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d" headers = { 'Cookie': f"x5sec={x5sec}" } diff --git a/1688/spider/导出到解析后json数据.py b/1688/spider/导出到解析后json数据.py new file mode 100644 index 0000000..e8e984c --- /dev/null +++ b/1688/spider/导出到解析后json数据.py @@ -0,0 +1,29 @@ +from scrapy.selector import Selector +from dao.mongo_dao import MongoDao +from spider.baes import Baes +from datetime import datetime +import time +import json + + +class 导出到解析后json数据(Baes): + + def __init__(self): + self.col = MongoDao() + super(导出到解析后json数据, self).__init__() + + def run(self): + res = self.col.find_item('CLEAN_CONTENT', {}) + + for s in res: + s.pop('_id') + s.pop('sign') + with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(s) + '\n') + + print(f"【{datetime.now()}】完成") + + +if __name__ == '__main__': + f = 导出到解析后json数据() + f.run()