From 45986e186e66245e1e7db6ddcd7b22a00854b3ba Mon Sep 17 00:00:00 2001 From: aiguigu Date: Thu, 2 Dec 2021 02:21:05 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=BC=E5=87=BA=E5=88=B0=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E5=90=8Ejson=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 1688/dao/mongo_dao.py | 14 +------ 1688/spider/1688企业产品详情内容.py | 45 +++++++++++++++++++++ 1688/spider/1688图片下载.py | 63 +++++++++++++++++++++++++++++ 1688/spider/图片下载.py | 0 1688/spider/导出到json数据.py | 42 +++++++++++++++++++ 1688/spider/导出到本地json数据.py | 7 ++-- 1688/spider/导出到解析后json数据.py | 29 ------------- 1688/tool/__init__.py | 0 1688/tool/download_img.py | 29 ------------- 9 files changed, 154 insertions(+), 75 deletions(-) create mode 100644 1688/spider/1688图片下载.py delete mode 100644 1688/spider/图片下载.py create mode 100644 1688/spider/导出到json数据.py delete mode 100644 1688/spider/导出到解析后json数据.py delete mode 100644 1688/tool/__init__.py delete mode 100644 1688/tool/download_img.py diff --git a/1688/dao/mongo_dao.py b/1688/dao/mongo_dao.py index 78657eb..d669c11 100644 --- a/1688/dao/mongo_dao.py +++ b/1688/dao/mongo_dao.py @@ -37,17 +37,5 @@ class MongoDao(object): if collection.find_one({"sign": item['sign']}): print(f"【{datetime.now()}】过滤") else: - print(f"【{datetime.now()}】入库{item.get('url')}") + print(f"【{datetime.now()}】入库{item.get('sign')}") return collection.insert_one(item) - - def update_item(self, collection, sign): - collection = self.client[collection] - if collection.find_one({"sign": sign}): - return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}}) - else: - print(f"【{datetime.now()}】过滤") - - def find_item(self, collection, *args, **kwargs): - collection = self.client[collection] - return collection.find(*args, **kwargs).batch_size(1) - diff --git a/1688/spider/1688企业产品详情内容.py b/1688/spider/1688企业产品详情内容.py index e69de29..8846a06 100644 --- a/1688/spider/1688企业产品详情内容.py +++ b/1688/spider/1688企业产品详情内容.py @@ -0,0 +1,45 @@ +import requests +from dao.mongo_dao import MyMongodb, MongoDao +from spider.baes import Baes +from datetime import datetime +import json +import re + + +class 企业产品详情内容(Baes): + + def __init__(self): + self.client = MyMongodb().db + self.col = MongoDao() + super(企业产品详情内容, self).__init__() + + def get_detail(self, url): + res = requests.get(url) + return res + + def run(self): + res = self.client['CLEAN_CONTENT'].find({"detail_url_status": 0}).batch_size(1) + for s in res: + sign = s.get('sign') + id = s.get('id') + detailUrl = s.get('detailUrl') + if detailUrl: + detailUrl = re.findall(r'url=(.*)', detailUrl)[0] + res = self.get_detail(detailUrl) + offer_details = re.findall(r'offer_details=(.*);', res.text)[0] + offer_details_dict = json.loads(offer_details).get('content') + + item = { + "sign": sign, + "id": id, + "offer_details": offer_details_dict, + "stauts": "0" + } + self.col.insert_item('RAW_DETAIL', item) + self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"detail_url_status": 2}}) + print(f"【{datetime.now()}】完成") + + +if __name__ == '__main__': + img = 企业产品详情内容() + img.run() diff --git a/1688/spider/1688图片下载.py b/1688/spider/1688图片下载.py new file mode 100644 index 0000000..2f81951 --- /dev/null +++ b/1688/spider/1688图片下载.py @@ -0,0 +1,63 @@ +from urllib.parse import urlparse +import settings +import requests +import os +from dao.mongo_dao import MyMongodb +from spider.baes import Baes +from datetime import datetime + + +class 图片下载(Baes): + + def __init__(self): + self.client = MyMongodb().db + super(图片下载, self).__init__() + + def request_download(self, image_url, path): + try: + url_path = urlparse(image_url).path + image_name = url_path.split("/")[-1] + r = requests.get(image_url) + with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f: + f.write(r.content) + return 1 + except Exception as e: + return -1 + + def mkdir(self, path): + folder = os.path.exists(f"{settings.excel_path}{path}") + if not folder: + os.makedirs(f"{settings.excel_path}{path}") + + def download_img(self, image_url, path): + self.mkdir(path) + return self.request_download(image_url, path) + + def run(self): + res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1) + for s in res: + id = s.get('id') + sign = s.get('sign') + for img_url in s.get('images'): + if img_url.get('imageURI'): + fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI') + res = self.download_img(fullPathImageURI, id) + if res == -1: + break + print(f"【{datetime.now()}】图片下载{fullPathImageURI}") + + for sub_category in s.get('sub_categorys_option'): + if sub_category.get('OptionImageUrl'): + OptionImageUrl = sub_category.get('OptionImageUrl') + res = self.download_img(OptionImageUrl, id) + if res == -1: + break + print(f"【{datetime.now()}】图片下载{OptionImageUrl}") + + res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}}) + print(f"【{datetime.now()}】完成 {res}") + + +if __name__ == '__main__': + img = 图片下载() + img.run() diff --git a/1688/spider/图片下载.py b/1688/spider/图片下载.py deleted file mode 100644 index e69de29..0000000 diff --git a/1688/spider/导出到json数据.py b/1688/spider/导出到json数据.py new file mode 100644 index 0000000..344f073 --- /dev/null +++ b/1688/spider/导出到json数据.py @@ -0,0 +1,42 @@ +from dao.mongo_dao import MyMongodb +from spider.baes import Baes +from datetime import datetime +import time +import json + + +class 导出到json数据(Baes): + + def __init__(self): + self.client = MyMongodb().db + super(导出到json数据, self).__init__() + + def export_CLEAN_CONTENT(self): + res = self.client['CLEAN_CONTENT'].find({}).batch_size(100) + + for s in res: + s.pop('_id') + s.pop('sign') + with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(s) + '\n') + print(f"【{datetime.now()}】完成") + + def export_RAW_DETAIL(self): + res = self.client['RAW_DETAIL'].find({}).batch_size(100) + + for s in res: + s.pop('_id') + s.pop('sign') + s.pop('stauts') + with open(f"../docs/导出到详情内容json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(s) + '\n') + print(f"【{datetime.now()}】完成") + + def run(self): + self.export_CLEAN_CONTENT() + self.export_RAW_DETAIL() + + +if __name__ == '__main__': + f = 导出到json数据() + f.run() diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py index e7e6523..31228e6 100644 --- a/1688/spider/导出到本地json数据.py +++ b/1688/spider/导出到本地json数据.py @@ -1,5 +1,4 @@ -from scrapy.selector import Selector -from dao.mongo_dao import MongoDao +from dao.mongo_dao import MyMongodb from spider.baes import Baes from datetime import datetime import time @@ -10,11 +9,11 @@ import re class 导出到本地元数据(Baes): def __init__(self): - self.col = MongoDao() + self.client = MyMongodb().db super(导出到本地元数据, self).__init__() def run(self): - res = self.col.find_item('RAW_CONTENT', {}, {"content": 1}) + res = self.client['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100) for s in res: s.pop('_id') diff --git a/1688/spider/导出到解析后json数据.py b/1688/spider/导出到解析后json数据.py deleted file mode 100644 index e8e984c..0000000 --- a/1688/spider/导出到解析后json数据.py +++ /dev/null @@ -1,29 +0,0 @@ -from scrapy.selector import Selector -from dao.mongo_dao import MongoDao -from spider.baes import Baes -from datetime import datetime -import time -import json - - -class 导出到解析后json数据(Baes): - - def __init__(self): - self.col = MongoDao() - super(导出到解析后json数据, self).__init__() - - def run(self): - res = self.col.find_item('CLEAN_CONTENT', {}) - - for s in res: - s.pop('_id') - s.pop('sign') - with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: - f.write(json.dumps(s) + '\n') - - print(f"【{datetime.now()}】完成") - - -if __name__ == '__main__': - f = 导出到解析后json数据() - f.run() diff --git a/1688/tool/__init__.py b/1688/tool/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/1688/tool/download_img.py b/1688/tool/download_img.py deleted file mode 100644 index d043418..0000000 --- a/1688/tool/download_img.py +++ /dev/null @@ -1,29 +0,0 @@ -from urllib.parse import urlparse -import settings -import requests -import os - - -def request_download(image_url, path): - url_path = urlparse(image_url).path - image_name = url_path.split("/")[-1] - r = requests.get(image_url) - with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f: - f.write(r.content) - - -def mkdir(path): - folder = os.path.exists(f"{settings.excel_path}{path}") - if not folder: - os.makedirs(f"{settings.excel_path}{path}") - - -def download_img(image_url, path): - mkdir(path) - request_download(image_url, path) - - -if __name__ == '__main__': - image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg" - name = "test" - download_img(image_url, name)