Export parsed data to JSON

aiguigu 2021-12-02 02:21:05 +08:00
parent 168f011c24
commit 45986e186e
9 changed files with 154 additions and 75 deletions

View File

@@ -37,17 +37,5 @@ class MongoDao(object):
         if collection.find_one({"sign": item['sign']}):
             print(f"【{datetime.now()}】过滤")
         else:
-            print(f"【{datetime.now()}】入库{item.get('url')}")
+            print(f"【{datetime.now()}】入库{item.get('sign')}")
             return collection.insert_one(item)
-
-    def update_item(self, collection, sign):
-        collection = self.client[collection]
-        if collection.find_one({"sign": sign}):
-            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
-        else:
-            print(f"【{datetime.now()}】过滤")
-
-    def find_item(self, collection, *args, **kwargs):
-        collection = self.client[collection]
-        return collection.find(*args, **kwargs).batch_size(1)
-
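
The surviving insert_item still dedupes with a find_one check before insert_one, which is racy if two writers race on the same sign. A minimal sketch of the same dedup enforced atomically by a unique index; the connection string and database name are assumptions, not taken from this repo:

from pymongo import MongoClient, ASCENDING
from pymongo.errors import DuplicateKeyError

# Assumed connection details; MyMongodb's real settings are not shown here.
client = MongoClient("mongodb://localhost:27017")
collection = client["spider"]["RAW_DETAIL"]

# A unique index makes MongoDB reject duplicate signs atomically.
collection.create_index([("sign", ASCENDING)], unique=True)

def insert_unique(item):
    try:
        return collection.insert_one(item)
    except DuplicateKeyError:
        print(f"duplicate sign, skipped: {item['sign']}")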

View File

@@ -0,0 +1,45 @@
+import requests
+from dao.mongo_dao import MyMongodb, MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import json
+import re
+
+
+class 企业产品详情内容(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        self.col = MongoDao()
+        super(企业产品详情内容, self).__init__()
+
+    def get_detail(self, url):
+        res = requests.get(url)
+        return res
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"detail_url_status": 0}).batch_size(1)
+        for s in res:
+            sign = s.get('sign')
+            id = s.get('id')
+            detailUrl = s.get('detailUrl')
+
+            if detailUrl:
+                detailUrl = re.findall(r'url=(.*)', detailUrl)[0]
+
+                res = self.get_detail(detailUrl)
+                offer_details = re.findall(r'offer_details=(.*);', res.text)[0]
+                offer_details_dict = json.loads(offer_details).get('content')
+
+                item = {
+                    "sign": sign,
+                    "id": id,
+                    "offer_details": offer_details_dict,
+                    "stauts": "0"
+                }
+
+                self.col.insert_item('RAW_DETAIL', item)
+                self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"detail_url_status": 2}})
+                print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    img = 企业产品详情内容()
+    img.run()
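
The parsing step in run() assumes the product page embeds its data as an inline JavaScript assignment (offer_details=...;). A self-contained sketch of that extraction on a toy snippet, so the regex plus json.loads step can be checked without a live request; the sample string is illustrative, not a real response:

import json
import re

# Toy stand-in for res.text; the real page assigns offer_details inline.
page_text = 'var offer_details={"content": {"title": "demo product"}};'

raw = re.findall(r'offer_details=(.*);', page_text)[0]
data = json.loads(raw).get('content')
print(data)  # {'title': 'demo product'}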

View File

@@ -0,0 +1,63 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+
+
+class 图片下载(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path):
+        try:
+            url_path = urlparse(image_url).path
+            image_name = url_path.split("/")[-1]
+            r = requests.get(image_url)
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            return 1
+        except Exception as e:
+            return -1
+
+    def mkdir(self, path):
+        folder = os.path.exists(f"{settings.excel_path}{path}")
+        if not folder:
+            os.makedirs(f"{settings.excel_path}{path}")
+
+    def download_img(self, image_url, path):
+        self.mkdir(path)
+        return self.request_download(image_url, path)
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for s in res:
+            id = s.get('id')
+            sign = s.get('sign')
+
+            for img_url in s.get('images'):
+                if img_url.get('imageURI'):
+                    fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI')
+                    res = self.download_img(fullPathImageURI, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+
+            for sub_category in s.get('sub_categorys_option'):
+                if sub_category.get('OptionImageUrl'):
+                    OptionImageUrl = sub_category.get('OptionImageUrl')
+                    res = self.download_img(OptionImageUrl, id)
+                    if res == -1:
+                        break
+                    print(f"【{datetime.now()}】图片下载{OptionImageUrl}")
+
+            res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+            print(f"【{datetime.now()}】完成 {res}")
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
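
request_download above swallows every exception and sends the GET with no timeout, so a hung connection can stall the whole run and a 404 body would be written to disk as an image. A hardened variant as a standalone sketch; the settings.excel_path prefix is dropped for brevity:

import os
import requests
from urllib.parse import urlparse

def request_download(image_url, path, timeout=10):
    try:
        r = requests.get(image_url, timeout=timeout)
        r.raise_for_status()  # treat 4xx/5xx as failures instead of saving the error page
    except requests.RequestException as e:
        print(f"download failed: {image_url} ({e})")
        return -1
    image_name = urlparse(image_url).path.split("/")[-1]
    os.makedirs(path, exist_ok=True)  # folds the separate mkdir helper into one call
    with open(os.path.join(path, image_name), "wb") as f:
        f.write(r.content)
    return 1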

View File

@@ -0,0 +1,42 @@
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到json数据(Baes):
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(导出到json数据, self).__init__()
+
+    def export_CLEAN_CONTENT(self):
+        res = self.client['CLEAN_CONTENT'].find({}).batch_size(100)
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+
+            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+
+            print(f"【{datetime.now()}】完成")
+
+    def export_RAW_DETAIL(self):
+        res = self.client['RAW_DETAIL'].find({}).batch_size(100)
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+            s.pop('stauts')
+
+            with open(f"../docs/导出到详情内容json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+
+            print(f"【{datetime.now()}】完成")
+
+    def run(self):
+        self.export_CLEAN_CONTENT()
+        self.export_RAW_DETAIL()
+
+
+if __name__ == '__main__':
+    f = 导出到json数据()
+    f.run()
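
Both exporters write one document per line, i.e. JSON Lines. Note that json.dumps escapes non-ASCII by default, so Chinese field values come out as \uXXXX sequences. A sketch of the same write with ensure_ascii=False and an explicit encoding; the file name and sample document are illustrative:

import json

doc = {"id": "123", "offer_details": {"title": "示例标题"}}

# ensure_ascii=False keeps Chinese text human-readable in the exported file.
with open("export_sample.json", "a+", encoding="utf-8") as f:
    f.write(json.dumps(doc, ensure_ascii=False) + "\n")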

View File

@@ -1,5 +1,4 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb
 from spider.baes import Baes
 from datetime import datetime
 import time
@@ -10,11 +9,11 @@ import re
 class 导出到本地元数据(Baes):
     def __init__(self):
-        self.col = MongoDao()
+        self.client = MyMongodb().db
         super(导出到本地元数据, self).__init__()

     def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.client['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100)
         for s in res:
             s.pop('_id')
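
This refactor drops the MongoDao wrapper and reads the collection directly; batch_size(100) fetches 100 documents per server round trip instead of the one-at-a-time cursor that find_item used. A sketch of the new access pattern; the connection string and database name are assumptions:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
cursor = client["spider"]["RAW_CONTENT"].find({}, {"content": 1}).batch_size(100)
for doc in cursor:
    doc.pop("_id")  # drop the ObjectId before further processing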

View File

@@ -1,29 +0,0 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
-from spider.baes import Baes
-from datetime import datetime
-import time
-import json
-
-
-class 导出到解析后json数据(Baes):
-    def __init__(self):
-        self.col = MongoDao()
-        super(导出到解析后json数据, self).__init__()
-
-    def run(self):
-        res = self.col.find_item('CLEAN_CONTENT', {})
-        for s in res:
-            s.pop('_id')
-            s.pop('sign')
-
-            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
-                f.write(json.dumps(s) + '\n')
-
-            print(f"【{datetime.now()}】完成")
-
-
-if __name__ == '__main__':
-    f = 导出到解析后json数据()
-    f.run()

View File

View File

@@ -1,29 +0,0 @@
-from urllib.parse import urlparse
-import settings
-import requests
-import os
-
-
-def request_download(image_url, path):
-    url_path = urlparse(image_url).path
-    image_name = url_path.split("/")[-1]
-    r = requests.get(image_url)
-    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
-        f.write(r.content)
-
-
-def mkdir(path):
-    folder = os.path.exists(f"{settings.excel_path}{path}")
-    if not folder:
-        os.makedirs(f"{settings.excel_path}{path}")
-
-
-def download_img(image_url, path):
-    mkdir(path)
-    request_download(image_url, path)
-
-
-if __name__ == '__main__':
-    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
-    name = "test"
-    download_img(image_url, name)