Mirror of https://github.com/luzhisheng/js_reverse.git
Synced 2025-04-20 01:34:55 +08:00
导出到解析后json数据 (export parsed data to JSON)

parent: 1ae653fec0
commit: 48c2c0d92f
@@ -78,14 +78,14 @@ class extractor(Baes):
         a_590893001997 = data.get('590893001997')
         if not a_590893001997:
-            unitWeight = data.get('605462009364').get('data').get('test').get('unitWeight')
+            # unitWeight = data.get('605462009364').get('data').get('test').get('unitWeight')
             location = data.get('605462009364').get('data').get('location')
             cost = data.get('605462009364').get('data').get('logistics')
         else:
-            unitWeight = a_590893001997.get('data').get('test').get('unitWeight')
+            # unitWeight = a_590893001997.get('data').get('test').get('unitWeight')
             location = a_590893001997.get('data').get('location')
             cost = a_590893001997.get('data').get('logistics')
-        logistics = [{"from": location}, {"cost": cost}]
+        logistics = [{"from": location}, {"cost": cost.replace('快递', '').strip()}]

         a_590893002003 = data.get('590893002003')
         if not a_590893002003:
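Note: besides commenting out unitWeight, the behavioural change in this hunk is that the logistics cost string is now normalised before it is stored. A minimal sketch of that normalisation, with a hypothetical sample value standing in for the scraped '605462009364' logistics field:

    # Hypothetical input; the real string comes from the scraped payload.
    cost = " 快递 6.00 "
    clean = cost.replace('快递', '').strip()  # drop the "快递" (courier) label and surrounding whitespace
    print(clean)  # -> 6.00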
@@ -109,7 +109,7 @@ class extractor(Baes):
             "images": images,
             "propsList": propsList,
             "detailUrl": detailUrl,
-            "unit_weight": unitWeight,
+            "unit_weight": "",
             "logistics": logistics
         }
         self.col.insert_item('CLEAN_CONTENT', item)
@@ -47,7 +47,7 @@ class MongoDao(object):
         else:
             print(f"【{datetime.now()}】过滤")

-    def find_item(self, collection, query, projection):
+    def find_item(self, collection, *args, **kwargs):
         collection = self.client[collection]
-        return collection.find(query, projection).batch_size(1)
+        return collection.find(*args, **kwargs).batch_size(1)
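With find_item now forwarding *args and **kwargs, callers can pass anything pymongo's Collection.find accepts. A minimal sketch of the two call styles, assuming the 'CLEAN_CONTENT' collection name used elsewhere in this commit:

    dao = MongoDao()
    # old two-argument style still works: a filter plus a projection
    cursor = dao.find_item('CLEAN_CONTENT', {}, {'sign': 1})
    # keyword arguments are now forwarded too, e.g. capping the result set
    cursor = dao.find_item('CLEAN_CONTENT', {}, limit=10)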
@@ -16,7 +16,7 @@ class 企业产品详情页面(Baes):
         for s in res:
             url = s.get('url').replace('detail', 'm')
             sign = s.get('sign')
-            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a22313336323861633166303531646664306233326164313139386263343465313343505867386f7347454f7963717172677a49437643686f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d"
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d"
             headers = {
                 'Cookie': f"x5sec={x5sec}"
             }
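Side note: both x5sec values are hex-encoded JSON blobs. A small standard-library snippet to inspect the new one (purely illustrative; it only decodes the string already shown above):

    x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223433333035343562623433343530616361636164636131373764396164613965434a754f7534774745507959795a577173616e3641526f4c4e6a59344d6a49784e7a67304f7a45773563795068766a2f2f2f2f2f41513d3d227d"
    print(bytes.fromhex(x5sec).decode('utf-8'))
    # expected shape: {"wireless-server-render;2":"<opaque token>"}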
1688/spider/导出到解析后json数据.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+from scrapy.selector import Selector
+from dao.mongo_dao import MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import time
+import json
+
+
+class 导出到解析后json数据(Baes):
+
+    def __init__(self):
+        self.col = MongoDao()
+        super(导出到解析后json数据, self).__init__()
+
+    def run(self):
+        res = self.col.find_item('CLEAN_CONTENT', {})
+
+        for s in res:
+            s.pop('_id')
+            s.pop('sign')
+            with open(f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
+                f.write(json.dumps(s) + '\n')
+
+        print(f"【{datetime.now()}】完成")
+
+
+if __name__ == '__main__':
+    f = 导出到解析后json数据()
+    f.run()
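Since the exporter appends one JSON object per line (JSON Lines), a consumer can read the file back line by line. A minimal sketch, assuming the same date-stamped path the script writes:

    import json
    import time

    path = f"../docs/导出到解析后json数据{time.strftime('%Y-%m-%d', time.localtime())}.json"
    with open(path) as f:
        items = [json.loads(line) for line in f if line.strip()]
    print(f"loaded {len(items)} items")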