Mirror of https://github.com/luzhisheng/js_reverse.git
Synced 2025-04-20 21:55:07 +08:00
导出到解析后json数据 (export parsed data to JSON)
parent 168f011c24
commit 45986e186e
@@ -37,17 +37,5 @@ class MongoDao(object):
         if collection.find_one({"sign": item['sign']}):
             print(f"【{datetime.now()}】过滤")
         else:
-            print(f"【{datetime.now()}】入库{item.get('url')}")
+            print(f"【{datetime.now()}】入库{item.get('sign')}")
             return collection.insert_one(item)
-
-    def update_item(self, collection, sign):
-        collection = self.client[collection]
-        if collection.find_one({"sign": sign}):
-            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
-        else:
-            print(f"【{datetime.now()}】过滤")
-
-    def find_item(self, collection, *args, **kwargs):
-        collection = self.client[collection]
-        return collection.find(*args, **kwargs).batch_size(1)
-
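For context, MongoDao.insert_item (the method this hunk adjusts) uses the "sign" content hash as a logical primary key and skips duplicates before inserting. A minimal standalone sketch of that pattern with pymongo; the connection URI and database name below are placeholders, not values from this repo:

from datetime import datetime
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["spider"]  # placeholder URI and db name


def insert_item(collection_name, item):
    collection = db[collection_name]
    # "sign" acts as a logical primary key: skip the write if a
    # document with the same sign already exists.
    if collection.find_one({"sign": item["sign"]}):
        print(f"【{datetime.now()}】过滤")  # filtered as a duplicate
        return None
    print(f"【{datetime.now()}】入库{item.get('sign')}")  # stored
    return collection.insert_one(item)

Note that find_one-then-insert_one is racy under concurrent writers; a unique index on sign (collection.create_index("sign", unique=True)) would enforce the same guarantee atomically.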
@@ -0,0 +1,45 @@
+import requests
+from dao.mongo_dao import MyMongodb, MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import json
+import re
+
+
+class 企业产品详情内容(Baes):
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        self.col = MongoDao()
+        super(企业产品详情内容, self).__init__()
+
+    def get_detail(self, url):
+        res = requests.get(url)
+        return res
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"detail_url_status": 0}).batch_size(1)
+        for s in res:
+            sign = s.get('sign')
+            id = s.get('id')
+            detailUrl = s.get('detailUrl')
+            if detailUrl:
+                detailUrl = re.findall(r'url=(.*)', detailUrl)[0]
+                res = self.get_detail(detailUrl)
+                offer_details = re.findall(r'offer_details=(.*);', res.text)[0]
+                offer_details_dict = json.loads(offer_details).get('content')
+
+                item = {
+                    "sign": sign,
+                    "id": id,
+                    "offer_details": offer_details_dict,
+                    "stauts": "0"
+                }
+                self.col.insert_item('RAW_DETAIL', item)
+                self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"detail_url_status": 2}})
+                print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    img = 企业产品详情内容()
+    img.run()
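The new spider above pulls its payload out of an inline offer_details=...; assignment in the detail page's HTML and parses it as JSON. A standalone sketch of that extraction; the sample_html string is fabricated for illustration:

import json
import re

# Fabricated stand-in for a 1688 detail page that embeds the data
# in an inline script variable.
sample_html = 'var offer_details={"content": {"subject": "example", "price": "9.9"}};'

matches = re.findall(r'offer_details=(.*);', sample_html)
if matches:  # guard: the class above indexes [0] directly and would raise IndexError on a miss
    content = json.loads(matches[0]).get('content')
    print(content)  # {'subject': 'example', 'price': '9.9'}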
1688/spider/1688图片下载.py (new file, 63 lines)
@@ -0,0 +1,63 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+
+
+class 图片下载(Baes):
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path):
+        try:
+            url_path = urlparse(image_url).path
+            image_name = url_path.split("/")[-1]
+            r = requests.get(image_url)
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            return 1
+        except Exception as e:
+            return -1
+
+    def mkdir(self, path):
+        folder = os.path.exists(f"{settings.excel_path}{path}")
+        if not folder:
+            os.makedirs(f"{settings.excel_path}{path}")
+
+    def download_img(self, image_url, path):
+        self.mkdir(path)
+        return self.request_download(image_url, path)
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for s in res:
+            id = s.get('id')
+            sign = s.get('sign')
+            for img_url in s.get('images'):
+                if img_url.get('imageURI'):
+                    fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI')
+                    res = self.download_img(fullPathImageURI, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+
+            for sub_category in s.get('sub_categorys_option'):
+                if sub_category.get('OptionImageUrl'):
+                    OptionImageUrl = sub_category.get('OptionImageUrl')
+                    res = self.download_img(OptionImageUrl, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{OptionImageUrl}")
+
+            res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+            print(f"【{datetime.now()}】完成 {res}")
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
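request_download above signals failure with a -1 return code, but it swallows the exception detail and treats non-200 responses as successful writes. One way to harden that step, sketched standalone; save_dir stands in for the settings.excel_path + path prefix used by the class:

import os
import requests
from urllib.parse import urlparse


def request_download(image_url, save_dir):
    try:
        # Derive the file name from the URL path, as the class does.
        image_name = urlparse(image_url).path.split("/")[-1]
        r = requests.get(image_url, timeout=10)
        r.raise_for_status()  # treat 4xx/5xx as failures, not empty files
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, image_name), "wb") as f:
            f.write(r.content)
        return 1
    except (requests.RequestException, OSError):
        return -1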
1688/spider/导出到json数据.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到json数据(Baes):
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(导出到json数据, self).__init__()
+
+    def export_CLEAN_CONTENT(self):
+        res = self.client['CLEAN_CONTENT'].find({}).batch_size(100)
+
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+        print(f"【{datetime.now()}】完成")
+
+    def export_RAW_DETAIL(self):
+        res = self.client['RAW_DETAIL'].find({}).batch_size(100)
+
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+            s.pop('stauts')
+            with open(f"../docs/导出到详情内容json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+        print(f"【{datetime.now()}】完成")
+
+    def run(self):
+        self.export_CLEAN_CONTENT()
+        self.export_RAW_DETAIL()
+
+
+if __name__ == '__main__':
+    f = 导出到json数据()
+    f.run()
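Both export methods write one JSON document per line (JSON Lines), appending to a date-stamped file under ../docs/. Note that json.dumps escapes non-ASCII characters by default; passing ensure_ascii=False would keep the Chinese field values readable in the output. A usage sketch for reading an exported file back; the date in the file name is an example:

import json

# Example path: the exporter stamps the file with today's date.
with open("../docs/导出到解析后json数据2023-01-01.json", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
print(len(records))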
@@ -1,5 +1,4 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb
 from spider.baes import Baes
 from datetime import datetime
 import time
@@ -10,11 +9,11 @@ import re
 class 导出到本地元数据(Baes):
 
     def __init__(self):
-        self.col = MongoDao()
+        self.client = MyMongodb().db
         super(导出到本地元数据, self).__init__()
 
     def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.client['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100)
 
         for s in res:
             s.pop('_id')
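This change drops the removed MongoDao.find_item helper (which hard-coded batch_size(1)) in favour of querying the client directly with batch_size(100), so each round trip to MongoDB fetches up to 100 documents instead of one. A sketch of the direct-client pattern with pymongo; the connection details are placeholders:

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["spider"]  # placeholder URI and db name

# Same query shape as the new run(): project only "content",
# and let the driver pull 100 documents per batch.
cursor = db['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100)
for doc in cursor:
    doc.pop('_id')  # drop the ObjectId before serializing, as run() does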
@@ -1,29 +0,0 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
-from spider.baes import Baes
-from datetime import datetime
-import time
-import json
-
-
-class 导出到解析后json数据(Baes):
-
-    def __init__(self):
-        self.col = MongoDao()
-        super(导出到解析后json数据, self).__init__()
-
-    def run(self):
-        res = self.col.find_item('CLEAN_CONTENT', {})
-
-        for s in res:
-            s.pop('_id')
-            s.pop('sign')
-            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
-                f.write(json.dumps(s) + '\n')
-
-        print(f"【{datetime.now()}】完成")
-
-
-if __name__ == '__main__':
-    f = 导出到解析后json数据()
-    f.run()
@@ -1,29 +0,0 @@
-from urllib.parse import urlparse
-import settings
-import requests
-import os
-
-
-def request_download(image_url, path):
-    url_path = urlparse(image_url).path
-    image_name = url_path.split("/")[-1]
-    r = requests.get(image_url)
-    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
-        f.write(r.content)
-
-
-def mkdir(path):
-    folder = os.path.exists(f"{settings.excel_path}{path}")
-    if not folder:
-        os.makedirs(f"{settings.excel_path}{path}")
-
-
-def download_img(image_url, path):
-    mkdir(path)
-    request_download(image_url, path)
-
-
-if __name__ == '__main__':
-    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
-    name = "test"
-    download_img(image_url, name)