From 45986e186e66245e1e7db6ddcd7b22a00854b3ba Mon Sep 17 00:00:00 2001
From: aiguigu <aiyingfeng110@qq.com>
Date: Thu, 2 Dec 2021 02:21:05 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AF=BC=E5=87=BA=E5=88=B0=E8=A7=A3=E6=9E=90?=
 =?UTF-8?q?=E5=90=8Ejson=E6=95=B0=E6=8D=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 1688/dao/mongo_dao.py               | 14 +------
 1688/spider/1688企业产品详情内容.py | 45 +++++++++++++++++++++
 1688/spider/1688图片下载.py         | 63 +++++++++++++++++++++++++++++
 1688/spider/图片下载.py             |  0
 1688/spider/导出到json数据.py       | 42 +++++++++++++++++++
 1688/spider/导出到本地json数据.py   |  7 ++--
 1688/spider/导出到解析后json数据.py | 29 -------------
 1688/tool/__init__.py               |  0
 1688/tool/download_img.py           | 29 -------------
 9 files changed, 154 insertions(+), 75 deletions(-)
 create mode 100644 1688/spider/1688图片下载.py
 delete mode 100644 1688/spider/图片下载.py
 create mode 100644 1688/spider/导出到json数据.py
 delete mode 100644 1688/spider/导出到解析后json数据.py
 delete mode 100644 1688/tool/__init__.py
 delete mode 100644 1688/tool/download_img.py

diff --git a/1688/dao/mongo_dao.py b/1688/dao/mongo_dao.py
index 78657eb..d669c11 100644
--- a/1688/dao/mongo_dao.py
+++ b/1688/dao/mongo_dao.py
@@ -37,17 +37,5 @@ class MongoDao(object):
         if collection.find_one({"sign": item['sign']}):
             print(f"【{datetime.now()}】过滤")
         else:
-            print(f"【{datetime.now()}】入库{item.get('url')}")
+            print(f"【{datetime.now()}】入库{item.get('sign')}")
             return collection.insert_one(item)
-
-    def update_item(self, collection, sign):
-        collection = self.client[collection]
-        if collection.find_one({"sign": sign}):
-            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
-        else:
-            print(f"【{datetime.now()}】过滤")
-
-    def find_item(self, collection, *args, **kwargs):
-        collection = self.client[collection]
-        return collection.find(*args, **kwargs).batch_size(1)
-
diff --git a/1688/spider/1688企业产品详情内容.py b/1688/spider/1688企业产品详情内容.py
index e69de29..8846a06 100644
--- a/1688/spider/1688企业产品详情内容.py
+++ b/1688/spider/1688企业产品详情内容.py
@@ -0,0 +1,80 @@
+import requests
+from dao.mongo_dao import MyMongodb, MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import json
+import re
+
+
+class 企业产品详情内容(Baes):
+    """Copy the offer details of pending CLEAN_CONTENT records into RAW_DETAIL.
+
+    A record is pending while its ``detail_url_status`` is 0; it is set to 2
+    once its details have been handed to ``MongoDao.insert_item``.
+    """
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        self.col = MongoDao()
+        super(企业产品详情内容, self).__init__()
+
+    def get_detail(self, url, timeout=30):
+        """GET ``url`` and return the ``requests`` response.
+
+        :param url: address of the detail page.
+        :param timeout: seconds to wait for the server; without a timeout
+            ``requests`` waits indefinitely and one stalled host blocks the run.
+        """
+        return requests.get(url, timeout=timeout)
+
+    def _fetch_offer_details(self, detail_url):
+        """Return the ``content`` of the ``offer_details`` JSON behind ``detail_url``.
+
+        :raises ValueError: the ``url=`` parameter or the ``offer_details=``
+            assignment is missing, or the payload is not valid JSON.
+        :raises requests.RequestException: the page could not be fetched.
+        """
+        target = re.search(r'url=(.*)', detail_url)
+        if target is None:
+            raise ValueError(f"no url= parameter in {detail_url!r}")
+
+        page = self.get_detail(target.group(1))
+        payload = re.search(r'offer_details=(.*);', page.text)
+        if payload is None:
+            raise ValueError(f"no offer_details in {target.group(1)!r}")
+
+        # json.JSONDecodeError is a ValueError, so callers need one except clause.
+        return json.loads(payload.group(1)).get('content')
+
+    def run(self):
+        """Process every pending record; a record that fails is logged and skipped."""
+        pending = self.client['CLEAN_CONTENT'].find({"detail_url_status": 0}).batch_size(1)
+        for doc in pending:
+            sign = doc.get('sign')
+            detail_url = doc.get('detailUrl')
+            if not detail_url:
+                continue
+
+            try:
+                offer_details = self._fetch_offer_details(detail_url)
+            except (requests.RequestException, ValueError) as exc:
+                # The status stays 0, so the record is picked up again next run
+                # instead of one bad page aborting the whole batch.
+                print(f"【{datetime.now()}】跳过{sign}: {exc}")
+                continue
+
+            item = {
+                "sign": sign,
+                "id": doc.get('id'),
+                "offer_details": offer_details,
+                # Spelling kept byte-for-byte: this is the stored field name.
+                "stauts": "0"
+            }
+            self.col.insert_item('RAW_DETAIL', item)
+            self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"detail_url_status": 2}})
+            print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    img = 企业产品详情内容()
+    img.run()
diff --git a/1688/spider/1688图片下载.py b/1688/spider/1688图片下载.py
new file mode 100644
index 0000000..2f81951
--- /dev/null
+++ b/1688/spider/1688图片下载.py
@@ -0,0 +1,91 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+
+
+class 图片下载(Baes):
+    """Download the images of pending CLEAN_CONTENT records to local folders.
+
+    A record is pending while its ``download_img_status`` is 0; it is set to 2
+    only after every one of its images was saved.
+    """
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path, timeout=30):
+        """Save ``image_url`` into the folder ``settings.excel_path + path``.
+
+        The file is named after the last segment of the URL path.
+
+        :param timeout: seconds to wait for the server before giving up.
+        :return: 1 on success, -1 when the download or the write failed.
+        """
+        try:
+            image_name = urlparse(image_url).path.split("/")[-1]
+            r = requests.get(image_url, timeout=timeout)
+            # Reject 4xx/5xx answers so an error page is not stored as an image.
+            r.raise_for_status()
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            return 1
+        except (requests.RequestException, OSError, ValueError) as e:
+            print(f"【{datetime.now()}】图片下载失败{image_url}: {e}")
+            return -1
+
+    def mkdir(self, path):
+        """Create the folder ``settings.excel_path + path`` unless it exists."""
+        # exist_ok replaces the exists()-then-makedirs() pair, which could race.
+        os.makedirs(f"{settings.excel_path}{path}", exist_ok=True)
+
+    def download_img(self, image_url, path):
+        """Create the target folder, then download; returns 1 or -1."""
+        try:
+            self.mkdir(path)
+        except OSError as e:
+            print(f"【{datetime.now()}】图片下载失败{image_url}: {e}")
+            return -1
+        return self.request_download(image_url, path)
+
+    def _image_urls(self, record):
+        """Yield the ``images`` URLs of ``record``, then its ``sub_categorys_option`` URLs."""
+        # ``or []`` covers records where the field is absent or null.
+        for image in record.get('images') or []:
+            if image.get('imageURI'):
+                yield "https://cbu01.alicdn.com/img/ibank/" + image.get('imageURI')
+        for option in record.get('sub_categorys_option') or []:
+            if option.get('OptionImageUrl'):
+                yield option.get('OptionImageUrl')
+
+    def run(self):
+        """Download the images of every pending record.
+
+        The first failed download abandons the record and leaves its status at
+        0, so a later run retries it instead of treating it as finished.
+        """
+        pending = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for record in pending:
+            folder = record.get('id')
+            sign = record.get('sign')
+
+            complete = True
+            for url in self._image_urls(record):
+                if self.download_img(url, folder) == -1:
+                    complete = False
+                    break
+                print(f"【{datetime.now()}】图片下载{url}")
+            if not complete:
+                continue
+
+            res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+            print(f"【{datetime.now()}】完成 {res}")
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
diff --git a/1688/spider/图片下载.py b/1688/spider/图片下载.py
deleted file mode 100644
index e69de29..0000000
diff --git a/1688/spider/导出到json数据.py b/1688/spider/导出到json数据.py
new file mode 100644
index 0000000..344f073
--- /dev/null
+++ b/1688/spider/导出到json数据.py
@@ -0,0 +1,51 @@
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到json数据(Baes):
+    """Dump the CLEAN_CONTENT and RAW_DETAIL collections, one JSON document per line."""
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(导出到json数据, self).__init__()
+
+    def _export(self, collection, file_prefix, drop_keys):
+        """Append every document of ``collection`` to today's export file.
+
+        :param collection: name of the collection to read.
+        :param file_prefix: file name under ``../docs/``, without date and extension.
+        :param drop_keys: fields removed from each document before it is written.
+        """
+        # Build the name once so a run that crosses midnight stays in one file.
+        target = f"../docs/{file_prefix}{time.strftime('%Y-%m-%d', time.localtime())}.json"
+        cursor = self.client[collection].find({}).batch_size(100)
+
+        # One handle for the whole export instead of reopening it per document.
+        with open(target, "a+", encoding="utf-8") as f:
+            for doc in cursor:
+                for key in drop_keys:
+                    # A document lacking the field must not abort the export.
+                    doc.pop(key, None)
+                f.write(json.dumps(doc) + '\n')
+        print(f"【{datetime.now()}】完成")
+
+    def export_CLEAN_CONTENT(self):
+        """Export CLEAN_CONTENT without the ``_id`` and ``sign`` fields."""
+        self._export('CLEAN_CONTENT', '导出到解析后json数据', ('_id', 'sign'))
+
+    def export_RAW_DETAIL(self):
+        """Export RAW_DETAIL without the ``_id``, ``sign`` and ``stauts`` fields."""
+        self._export('RAW_DETAIL', '导出到详情内容json数据', ('_id', 'sign', 'stauts'))
+
+    def run(self):
+        """Run both exports, CLEAN_CONTENT first."""
+        self.export_CLEAN_CONTENT()
+        self.export_RAW_DETAIL()
+
+
+if __name__ == '__main__':
+    f = 导出到json数据()
+    f.run()
diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py
index e7e6523..31228e6 100644
--- a/1688/spider/导出到本地json数据.py
+++ b/1688/spider/导出到本地json数据.py
@@ -1,5 +1,4 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb
 from spider.baes import Baes
 from datetime import datetime
 import time
@@ -10,11 +9,11 @@ import re
 class 导出到本地元数据(Baes):
 
     def __init__(self):
-        self.col = MongoDao()
+        self.client = MyMongodb().db
         super(导出到本地元数据, self).__init__()
 
     def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.client['RAW_CONTENT'].find({}, {"content": 1}).batch_size(100)
 
         for s in res:
             s.pop('_id')
diff --git a/1688/spider/导出到解析后json数据.py b/1688/spider/导出到解析后json数据.py
deleted file mode 100644
index e8e984c..0000000
--- a/1688/spider/导出到解析后json数据.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from scrapy.selector import Selector
-from dao.mongo_dao import MongoDao
-from spider.baes import Baes
-from datetime import datetime
-import time
-import json
-
-
-class 导出到解析后json数据(Baes):
-
-    def __init__(self):
-        self.col = MongoDao()
-        super(导出到解析后json数据, self).__init__()
-
-    def run(self):
-        res = self.col.find_item('CLEAN_CONTENT', {})
-
-        for s in res:
-            s.pop('_id')
-            s.pop('sign')
-            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
-                f.write(json.dumps(s) + '\n')
-
-        print(f"【{datetime.now()}】完成")
-
-
-if __name__ == '__main__':
-    f = 导出到解析后json数据()
-    f.run()
diff --git a/1688/tool/__init__.py b/1688/tool/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/1688/tool/download_img.py b/1688/tool/download_img.py
deleted file mode 100644
index d043418..0000000
--- a/1688/tool/download_img.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from urllib.parse import urlparse
-import settings
-import requests
-import os
-
-
-def request_download(image_url, path):
-    url_path = urlparse(image_url).path
-    image_name = url_path.split("/")[-1]
-    r = requests.get(image_url)
-    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
-        f.write(r.content)
-
-
-def mkdir(path):
-    folder = os.path.exists(f"{settings.excel_path}{path}")
-    if not folder:
-        os.makedirs(f"{settings.excel_path}{path}")
-
-
-def download_img(image_url, path):
-    mkdir(path)
-    request_download(image_url, path)
-
-
-if __name__ == '__main__':
-    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
-    name = "test"
-    download_img(image_url, name)