导出解析后的 JSON 数据

This commit is contained in:
aiguigu 2021-12-08 00:38:38 +08:00
parent 45986e186e
commit b8fc9d9517
4 changed files with 16 additions and 7 deletions

View File

@ -39,3 +39,10 @@ class MongoDao(object):
else: else:
print(f"{datetime.now()}】入库{item.get('sign')}") print(f"{datetime.now()}】入库{item.get('sign')}")
return collection.insert_one(item) return collection.insert_one(item)
def update_item(self, collection, sign):
    """Mark the document identified by ``sign`` as processed.

    Sets ``stauts`` (sic — this misspelled key is used consistently
    throughout the codebase, e.g. the ``{"stauts": "0"}`` queries, so it
    must not be "fixed" here alone) to ``'1'`` on the matching document.

    :param collection: name of the MongoDB collection to update.
    :param sign: unique fingerprint identifying the document.
    :return: the ``UpdateResult`` when a document matched, otherwise
             ``None`` (a skip notice is printed).
    """
    coll = self.client[collection]  # avoid shadowing the name parameter
    # Single atomic update_one instead of find_one + update_one:
    # one round trip, and no TOCTOU window between check and write.
    result = coll.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
    if result.matched_count:
        return result
    print(f"{datetime.now()}】过滤")

View File

@ -1,4 +1,4 @@
from dao.mongo_dao import MongoDao from dao.mongo_dao import MyMongodb, MongoDao
from spider.baes import Baes from spider.baes import Baes
from datetime import datetime from datetime import datetime
import time import time
@ -8,15 +8,16 @@ import requests
class 企业产品详情页面(Baes): class 企业产品详情页面(Baes):
def __init__(self): def __init__(self):
self.mongodb = MyMongodb().db
self.col = MongoDao() self.col = MongoDao()
super(企业产品详情页面, self).__init__() super(企业产品详情页面, self).__init__()
def run(self): def run(self):
res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1}) res = self.mongodb['RAW_URLS'].find({"stauts": "0"}, {"url": 1, "sign": 1})
for s in res: for s in res:
url = s.get('url').replace('detail', 'm') url = s.get('url').replace('detail', 'm')
sign = s.get('sign') sign = s.get('sign')
x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d" x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a226363663036373930386530363435333061646434616437316231373339646264434e7149744930474550476b36716541796f6655777745773563795068766a2f2f2f2f2f41513d3d227d"
headers = { headers = {
'Cookie': f"x5sec={x5sec}" 'Cookie': f"x5sec={x5sec}"
} }
@ -37,7 +38,7 @@ class 企业产品详情页面(Baes):
} }
self.col.insert_item('RAW_CONTENT', item) self.col.insert_item('RAW_CONTENT', item)
self.col.update_item('RAW_URLS', sign) self.col.update_item('RAW_URLS', sign)
time.sleep(10) time.sleep(3)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,4 +1,4 @@
from dao.mongo_dao import MongoDao from dao.mongo_dao import MongoDao, MyMongodb
from scrapy.selector import Selector from scrapy.selector import Selector
from spider.baes import Baes from spider.baes import Baes
from datetime import datetime from datetime import datetime
@ -9,11 +9,12 @@ import re
class extractor(Baes): class extractor(Baes):
def __init__(self): def __init__(self):
self.mongodb = MyMongodb().db
self.col = MongoDao() self.col = MongoDao()
super(extractor, self).__init__() super(extractor, self).__init__()
def run(self): def run(self):
res = self.col.find_item('RAW_CONTENT', {}, {"content": 1}) res = self.mongodb['RAW_CONTENT'].find({}, {"content": 1})
for s in res: for s in res:
content = s.get('content') content = s.get('content')
sel = Selector(text=content, type='html') sel = Selector(text=content, type='html')
@ -33,7 +34,7 @@ class extractor(Baes):
sub_categorys_dict = { sub_categorys_dict = {
'specId': value.get('specId'), 'specId': value.get('specId'),
'specAttrs': key.replace('>', '|'), 'specAttrs': key.replace('>', '|'),
'Price': globalData.get('tempModel').get('price'), 'Price': value.get('price') if value.get('price') else globalData.get('tempModel').get('price'),
'canBookCount': value.get('canBookCount') 'canBookCount': value.get('canBookCount')
} }
sub_categorys_canBookCount.append(sub_categorys_dict) sub_categorys_canBookCount.append(sub_categorys_dict)