导出到解析后json数据

2025-04-21 21:10:21 +08:00 · 2021-12-08 00:38:38 +08:00 · 2021-12-08 00:38:38 +08:00 · b8fc9d9517
commit b8fc9d9517
parent 45986e186e
4 changed files with 16 additions and 7 deletions
--- a/1688/dao/mongo_dao.py
+++ b/1688/dao/mongo_dao.py
@ -39,3 +39,10 @@ class MongoDao(object):
        else:
            print(f"【{datetime.now()}】入库{item.get('sign')}")
            return collection.insert_one(item)
+
+    def update_item(self, collection, sign):
+        """Set ``stauts`` to ``'1'`` on the document whose ``sign`` matches.
+
+        Args:
+            collection: Name used to look up the collection on ``self.client``.
+            sign: Value of the ``sign`` field identifying the document.
+
+        Returns:
+            The result object of ``update_one`` when a document matched the
+            filter; otherwise ``None``, after printing a timestamped "过滤"
+            message.
+        """
+        # A single update_one is used instead of find_one followed by
+        # update_one: matched_count already tells whether the document exists,
+        # which saves a round trip and removes the check-then-act gap.
+        # NOTE(review): the key is spelled "stauts" (sic). It is kept as-is
+        # because the RAW_URLS query in this commit filters on the same
+        # spelling; renaming it needs a coordinated data migration.
+        result = self.client[collection].update_one(
+            {"sign": sign}, {"$set": {"stauts": '1'}}
+        )
+        if result.matched_count:
+            return result
+        print(f"【{datetime.now()}】过滤")
+        return None
--- a/1688/spider/1688企业产品详情页面.py
+++ b/1688/spider/1688企业产品详情页面.py
@ -1,4 +1,4 @@
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MyMongodb, MongoDao
 from spider.baes import Baes
 from datetime import datetime
 import time
@ -8,15 +8,16 @@ import requests
 class 企业产品详情页面(Baes):

    def __init__(self):
+        """Create the MongoDB handles, then run the base-class initializer."""
+        # ``db`` attribute of a fresh MyMongodb (presumably a database handle;
+        # confirm in dao.mongo_dao). ``run`` indexes it by collection name and
+        # calls ``find`` on the result.
+        self.mongodb = MyMongodb().db
+        # DAO wrapper; ``run`` calls its ``insert_item`` and ``update_item``.
        self.col = MongoDao()
        super(企业产品详情页面, self).__init__()

    def run(self):
-        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1})
+        res = self.mongodb['RAW_URLS'].find({"stauts": "0"}, {"url": 1, "sign": 1})
        for s in res:
            url = s.get('url').replace('detail', 'm')
            sign = s.get('sign')
-            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d"
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a226363663036373930386530363435333061646434616437316231373339646264434e7149744930474550476b36716541796f6655777745773563795068766a2f2f2f2f2f41513d3d227d"
            headers = {
                'Cookie': f"x5sec={x5sec}"
            }
@ -37,7 +38,7 @@ class 企业产品详情页面(Baes):
            }
            self.col.insert_item('RAW_CONTENT', item)
            self.col.update_item('RAW_URLS', sign)
-            time.sleep(10)
+            time.sleep(3)


 if __name__ == '__main__':
--- a/1688/spider/导出元数据本地json数据.py
+++ b/1688/spider/导出元数据本地json数据.py
--- a/1688/spider/清洗数据json格式.py
+++ b/1688/spider/清洗数据json格式.py
@ -1,4 +1,4 @@
-from dao.mongo_dao import MongoDao
+from dao.mongo_dao import MongoDao, MyMongodb
 from scrapy.selector import Selector
 from spider.baes import Baes
 from datetime import datetime
@ -9,11 +9,12 @@ import re
 class extractor(Baes):

    def __init__(self):
+        """Create the MongoDB handles, then run the base-class initializer."""
+        # ``db`` attribute of a fresh MyMongodb (presumably a database handle;
+        # confirm in dao.mongo_dao). ``run`` indexes it with 'RAW_CONTENT' and
+        # calls ``find`` on the result.
+        self.mongodb = MyMongodb().db
+        # DAO wrapper instance.
+        # NOTE(review): no use of ``self.col`` is visible in the shown part of
+        # ``run`` once ``find_item`` is replaced; check whether it is still needed.
        self.col = MongoDao()
        super(extractor, self).__init__()

    def run(self):
-        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
+        res = self.mongodb['RAW_CONTENT'].find({}, {"content": 1})
        for s in res:
            content = s.get('content')
            sel = Selector(text=content, type='html')
@ -33,7 +34,7 @@ class extractor(Baes):
                sub_categorys_dict = {
                    'specId': value.get('specId'),
                    'specAttrs': key.replace('&gt;', '|'),
-                    'Price': globalData.get('tempModel').get('price'),
+                    'Price': value.get('price') if value.get('price') else globalData.get('tempModel').get('price'),
                    'canBookCount': value.get('canBookCount')
                }
                sub_categorys_canBookCount.append(sub_categorys_dict)