Export parsed data to JSON

aiguigu 2021-12-02 02:21:05 +08:00
parent 168f011c24
commit 45986e186e
9 changed files with 154 additions and 75 deletions

View File

@@ -37,17 +37,5 @@ class MongoDao(object):
         if collection.find_one({"sign": item['sign']}):
             print(f"【{datetime.now()}】过滤")
         else:
-            print(f"【{datetime.now()}】入库{item.get('url')}")
+            print(f"【{datetime.now()}】入库{item.get('sign')}")
             return collection.insert_one(item)
-
-    def update_item(self, collection, sign):
-        collection = self.client[collection]
-        if collection.find_one({"sign": sign}):
-            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
-        else:
-            print(f"【{datetime.now()}】过滤")
-
-    def find_item(self, collection, *args, **kwargs):
-        collection = self.client[collection]
-        return collection.find(*args, **kwargs).batch_size(1)
-
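
The surviving insert_item still dedupes with a find_one check before insert_one, which is racy if two writers race on the same sign. A minimal sketch of the same dedup enforced atomically by a unique index; the connection string and database name are assumptions, not taken from this repo:

from pymongo import MongoClient, ASCENDING
from pymongo.errors import DuplicateKeyError

# Assumed connection details; MyMongodb's real settings are not shown here.
client = MongoClient("mongodb://localhost:27017")
collection = client["spider"]["RAW_DETAIL"]

# A unique index makes MongoDB reject duplicate signs atomically.
collection.create_index([("sign", ASCENDING)], unique=True)

def insert_unique(item):
    try:
        return collection.insert_one(item)
    except DuplicateKeyError:
        print(f"duplicate sign, skipped: {item['sign']}")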

View File

@@ -0,0 +1,45 @@
+import requests
+from dao.mongo_dao import MyMongodb, MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import json
+import re
+
+
+class 企业产品详情内容(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        self.col = MongoDao()
+        super(企业产品详情内容, self).__init__()
+
+    def get_detail(self, url):
+        res = requests.get(url)
+        return res
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"detail_url_status": 0}).batch_size(1)
+        for s in res:
+            sign = s.get('sign')
+            id = s.get('id')
+            detailUrl = s.get('detailUrl')
+
+            if detailUrl:
+                detailUrl = re.findall(r'url=(.*)', detailUrl)[0]
+
+                res = self.get_detail(detailUrl)
+                offer_details = re.findall(r'offer_details=(.*);', res.text)[0]
+                offer_details_dict = json.loads(offer_details).get('content')
+
+                item = {
+                    "sign": sign,
+                    "id": id,
+                    "offer_details": offer_details_dict,
+                    "stauts": "0"
+                }
+
+                self.col.insert_item('RAW_DETAIL', item)
+                self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"detail_url_status": 2}})
+                print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    img = 企业产品详情内容()
+    img.run()
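
The parsing step in run() assumes the product page embeds its data as an inline JavaScript assignment (offer_details=...;). A self-contained sketch of that extraction on a toy snippet, so the regex plus json.loads step can be checked without a live request; the sample string is illustrative, not a real response:

import json
import re

# Toy stand-in for res.text; the real page assigns offer_details inline.
page_text = 'var offer_details={"content": {"title": "demo product"}};'

raw = re.findall(r'offer_details=(.*);', page_text)[0]
data = json.loads(raw).get('content')
print(data)  # {'title': 'demo product'}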

View File

@@ -0,0 +1,63 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+
+
+class 图片下载(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path):
+        try:
+            url_path = urlparse(image_url).path
+            image_name = url_path.split("/")[-1]
+            r = requests.get(image_url)
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            return 1
+        except Exception as e:
+            return -1
+
+    def mkdir(self, path):
+        folder = os.path.exists(f"{settings.excel_path}{path}")
+        if not folder:
+            os.makedirs(f"{settings.excel_path}{path}")
+
+    def download_img(self, image_url, path):
+        self.mkdir(path)
+        return self.request_download(image_url, path)
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for s in res:
+            id = s.get('id')
+            sign = s.get('sign')
+
+            for img_url in s.get('images'):
+                if img_url.get('imageURI'):
+                    fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI')
+                    res = self.download_img(fullPathImageURI, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+
+            for sub_category in s.get('sub_categorys_option'):
+                if sub_category.get('OptionImageUrl'):
+                    OptionImageUrl = sub_category.get('OptionImageUrl')
+                    res = self.download_img(OptionImageUrl, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{OptionImageUrl}")
+
+            res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+            print(f"【{datetime.now()}】完成 {res}")
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
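
request_download above swallows every exception and sends the GET with no timeout, so a hung connection can stall the whole run and a 404 body would be written to disk as an image. A hardened variant as a standalone sketch; the settings.excel_path prefix is dropped for brevity:

import os
import requests
from urllib.parse import urlparse

def request_download(image_url, path, timeout=10):
    try:
        r = requests.get(image_url, timeout=timeout)
        r.raise_for_status()  # treat 4xx/5xx as failures instead of saving the error page
    except requests.RequestException as e:
        print(f"download failed: {image_url} ({e})")
        return -1
    image_name = urlparse(image_url).path.split("/")[-1]
    os.makedirs(path, exist_ok=True)  # folds the separate mkdir helper into one call
    with open(os.path.join(path, image_name), "wb") as f:
        f.write(r.content)
    return 1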

View File

@@ -0,0 +1,42 @@
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到json数据(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(导出到json数据, self).__init__()
+
+    def export_CLEAN_CONTENT(self):
+        res = self.client['CLEAN_CONTENT'].find({}).batch_size(100)
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+
+            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+
+            print(f"【{datetime.now()}】完成")
+
+    def export_RAW_DETAIL(self):
+        res = self.client['RAW_DETAIL'].find({}).batch_size(100)
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+            s.pop('stauts')
+
+            with open(f"../docs/导出到详情内容json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+
+            print(f"【{datetime.now()}】完成")
+
+    def run(self):
+        self.export_CLEAN_CONTENT()
+        self.export_RAW_DETAIL()
+
+
+if __name__ == '__main__':
+    f = 导出到json数据()
+    f.run()
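
Both exporters write one document per line, i.e. JSON Lines. Note that json.dumps escapes non-ASCII by default, so Chinese field values come out as \uXXXX sequences. A sketch of the same write with ensure_ascii=False and an explicit encoding; the file name and sample document are illustrative:

import json

doc = {"id": "123", "offer_details": {"title": "示例标题"}}

# ensure_ascii=False keeps Chinese text human-readable in the exported file.
with open("export_sample.json", "a+", encoding="utf-8") as f:
    f.write(json.dumps(doc, ensure_ascii=False) + "\n")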

View File

@@ -1,5 +1,4 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb
 from spider.baes import Baes
 from datetime import datetime
 import time
@@ -10,11 +9,11 @@ import re
 class 导出到本地元数据(Baes):
     def __init__(self):
-        self.col = MongoDao()
+        self.client = MyMongodb().db
         super(导出到本地元数据, self).__init__()

     def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.client['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100)
         for s in res:
             s.pop('_id')
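
This refactor drops the MongoDao wrapper and reads the collection directly; batch_size(100) fetches 100 documents per server round trip instead of the one-at-a-time cursor that find_item used. A sketch of the new access pattern; the connection string and database name are assumptions:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
cursor = client["spider"]["RAW_CONTENT"].find({}, {"content": 1}).batch_size(100)
for doc in cursor:
    doc.pop("_id")  # drop the ObjectId before further processing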

View File

@@ -1,29 +0,0 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
-from spider.baes import Baes
-from datetime import datetime
-import time
-import json
-
-
-class 导出到解析后json数据(Baes):
-    def __init__(self):
-        self.col = MongoDao()
-        super(导出到解析后json数据, self).__init__()
-
-    def run(self):
-        res = self.col.find_item('CLEAN_CONTENT', {})
-        for s in res:
-            s.pop('_id')
-            s.pop('sign')
-
-            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
-                f.write(json.dumps(s) + '\n')
-
-            print(f"【{datetime.now()}】完成")
-
-
-if __name__ == '__main__':
-    f = 导出到解析后json数据()
-    f.run()

View File

View File

@@ -1,29 +0,0 @@
-from urllib.parse import urlparse
-import settings
-import requests
-import os
-
-
-def request_download(image_url, path):
-    url_path = urlparse(image_url).path
-    image_name = url_path.split("/")[-1]
-    r = requests.get(image_url)
-    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
-        f.write(r.content)
-
-
-def mkdir(path):
-    folder = os.path.exists(f"{settings.excel_path}{path}")
-    if not folder:
-        os.makedirs(f"{settings.excel_path}{path}")
-
-
-def download_img(image_url, path):
-    mkdir(path)
-    request_download(image_url, path)
-
-
-if __name__ == '__main__':
-    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
-    name = "test"
-    download_img(image_url, name)