diff --git a/1688/dao/mongo_dao.py b/1688/dao/mongo_dao.py
index d669c11..cebf85f 100644
--- a/1688/dao/mongo_dao.py
+++ b/1688/dao/mongo_dao.py
@@ -39,3 +39,10 @@ class MongoDao(object):
         else:
             print(f"【{datetime.now()}】入库{item.get('sign')}")
             return collection.insert_one(item)
+
+    def update_item(self, collection, sign):
+        collection = self.client[collection]
+        if collection.find_one({"sign": sign}):
+            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
+        else:
+            print(f"【{datetime.now()}】过滤")
diff --git a/1688/spider/1688企业产品详情页面.py b/1688/spider/1688企业产品详情页面.py
index 8e08c11..53c88f1 100644
--- a/1688/spider/1688企业产品详情页面.py
+++ b/1688/spider/1688企业产品详情页面.py
@@ -1,4 +1,4 @@
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb, MongoDao
 from spider.baes import Baes
 from datetime import datetime
 import time
@@ -8,15 +8,16 @@ import requests

 class 企业产品详情页面(Baes):
     def __init__(self):
+        self.mongodb = MyMongodb().db
         self.col = MongoDao()
         super(企业产品详情页面, self).__init__()

     def run(self):
-        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1})
+        res = self.mongodb['RAW_URLS'].find({"stauts": "0"}, {"url": 1, "sign": 1})
         for s in res:
             url = s.get('url').replace('detail', 'm')
             sign = s.get('sign')
-            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d"
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a226363663036373930386530363435333061646434616437316231373339646264434e7149744930474550476b36716541796f6655777745773563795068766a2f2f2f2f2f41513d3d227d"
             headers = {
                 'Cookie': f"x5sec={x5sec}"
             }
@@ -37,7 +38,7 @@ class 企业产品详情页面(Baes):
             }
             self.col.insert_item('RAW_CONTENT', item)
             self.col.update_item('RAW_URLS', sign)
-            time.sleep(10)
+            time.sleep(3)


 if __name__ == '__main__':
diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出元数据本地json数据.py
similarity index 100%
rename from 1688/spider/导出到本地json数据.py
rename to 1688/spider/导出元数据本地json数据.py
diff --git a/1688/spider/清洗数据json格式.py b/1688/spider/清洗数据json格式.py
index a51f436..0598ecd 100644
--- a/1688/spider/清洗数据json格式.py
+++ b/1688/spider/清洗数据json格式.py
@@ -1,4 +1,4 @@
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MongoDao, MyMongodb
 from scrapy.selector import Selector
 from spider.baes import Baes
 from datetime import datetime
@@ -9,11 +9,12 @@ import re

 class extractor(Baes):
     def __init__(self):
+        self.mongodb = MyMongodb().db
         self.col = MongoDao()
         super(extractor, self).__init__()

     def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.mongodb['RAW_CONTENT'].find({}, {"content": 1})
         for s in res:
             content = s.get('content')
             sel = Selector(text=content, type='html')
@@ -33,7 +34,7 @@ class extractor(Baes):
                 sub_categorys_dict = {
                     'specId': value.get('specId'),
                     'specAttrs': key.replace('>', '|'),
-                    'Price': globalData.get('tempModel').get('price'),
+                    'Price': value.get('price') if value.get('price') else globalData.get('tempModel').get('price'),
                     'canBookCount': value.get('canBookCount')
                 }
                 sub_categorys_canBookCount.append(sub_categorys_dict)
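Both spiders now import MyMongodb from dao.mongo_dao, but this patch does not include its definition. A minimal sketch of what such a wrapper could look like, assuming a local MongoDB instance and a hypothetical database name '1688' (adjust both to the project's actual configuration):

import pymongo

class MyMongodb(object):
    # Exposes a raw pymongo Database as .db so callers can query collections
    # directly, e.g. MyMongodb().db['RAW_URLS'].find({"stauts": "0"}).
    def __init__(self, host='localhost', port=27017):
        # host/port/database name are assumptions, not taken from this patch
        self.client = pymongo.MongoClient(host, port)
        self.db = self.client['1688']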
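The new MongoDao.update_item pairs with the {"stauts": "0"} query in the detail-page spider ("stauts" is the spelling this codebase uses for the status field throughout, so the method keeps it). A URL document enters RAW_URLS with stauts "0"; once its detail page is stored in RAW_CONTENT, update_item flips the flag to '1' so the next run's find() skips it. Roughly, under the same assumptions as the sketch above:

from dao.mongo_dao import MongoDao, MyMongodb

col = MongoDao()
db = MyMongodb().db
for s in db['RAW_URLS'].find({"stauts": "0"}, {"url": 1, "sign": 1}):
    # placeholder item; the real spider builds it from the fetched page
    item = {"sign": s.get("sign"), "content": "<html>...</html>"}
    col.insert_item('RAW_CONTENT', item)        # store the raw page once
    col.update_item('RAW_URLS', s.get('sign'))  # flip stauts "0" -> '1' so the URL is skipped next run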
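In the cleaner, the 'Price' change prefers the per-SKU price and falls back to the item-level price only when the SKU record has no (truthy) price of its own. The conditional expression is equivalent to Python's or-fallback idiom:

# Same behavior as the patched line: take the SKU's own price when present,
# otherwise the item-level price from globalData's tempModel.
price = value.get('price') or globalData.get('tempModel').get('price')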