增加導出csv格式

2025-04-22 11:12:48 +08:00 · 2021-10-26 02:27:36 +08:00 · 2021-10-26 02:27:36 +08:00 · e8bc9b5472
commit e8bc9b5472
parent 56b82b0a80
5 changed files with 232 additions and 36 deletions
--- a/1688/clean/extractor.py
+++ b/1688/clean/extractor.py
@ -49,13 +49,14 @@ class extractor(Baes):
            companyName = globalData.get('tempModel').get('companyName')
            sellerLoginId = globalData.get('tempModel').get('sellerLoginId')
            offerUnit = globalData.get('tempModel').get('offerUnit')
            saledCount = globalData.get('tempModel').get('saledCount')
            images = globalData.get('images')
-            for image in images:
+            # for image in images:
-                fullPathImageURI = image.get('fullPathImageURI')
+            #     fullPathImageURI = image.get('fullPathImageURI')
-                download_img(fullPathImageURI, offerId)
+            #     download_img(fullPathImageURI, offerId)
-                print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+            #     print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
-                time.sleep(1)
+            #     time.sleep(1)
            a_590893001984 = data.get('590893001984')
            if not a_590893001984:
@ -78,6 +79,7 @@ class extractor(Baes):
            item = {
                "sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)),
                "id": offerId,
                "company_name": companyName,
                "url": "https://detail.1688.com/offer/{}.html".format(offerId),
                "title": title,
@ -86,6 +88,7 @@ class extractor(Baes):
                "order_param_model": priceModel,
                "sellerLoginId": sellerLoginId,
                "offerUnit": offerUnit,
                "saledCount": saledCount,
                "images": images,
                "propsList": propsList,
                "detailUrl": detailUrl,
--- a/1688/settings.py
+++ b/1688/settings.py
@ -2,7 +2,7 @@ import os
 MONGODB_CONF = {
-    'host': '192.168.1.107',
+    'host': '127.0.0.1',
    'port': 27017,
    'username': '',
    'pwd': "",
--- a/1688/spider/baes.py
+++ b/1688/spider/baes.py
@ -1,10 +1,23 @@
 from urllib.parse import urlparse
 from datetime import datetime
 import hashlib
 import base64
 class Baes(object):
    def getYMDHMSstrList(self):
        """Return the current local time as a list of six strings.

        The order is [year, month, day, hour, minute, second], for example
        ['2019', '03', '01', '09', '05', '07'].  Every field except the year
        is zero-padded to two digits.
        """
        now = datetime.now()
        padded = ["{:02d}".format(part) for part in
                  (now.month, now.day, now.hour, now.minute, now.second)]
        return [str(now.year)] + padded
    def generate_sign(self, url):
        """通过md5生成项目符号表单URL"""
        md5 = hashlib.md5()
--- a/1688/spider/导出到本地csv数据.py
+++ b/1688/spider/导出到本地csv数据.py
@ -0,0 +1,210 @@
 from dao.mongo_dao import MongoDao
 from spider.baes import Baes
 import settings
 import pandas as pd
 from urllib.parse import urlparse
 import os
 class 导出到本地csv数据(Baes):
    """Export documents from the CLEAN_CONTENT collection to an Excel workbook.

    Despite the "csv" in the class name, the output is an .xlsx file with two
    sheets: "1-商品详情" (one row per product) and "2-规格详情" (one row per
    product specification).
    """

    # Product attributes that are copied from ``propsList`` entries into the
    # column of the same name.
    _PROP_COLUMNS = [
        "品牌", "货号", "包装", "材质", "尺寸", "颜色", "是否专利货源", "是否进口",
        "造型", "主要下游平台", "主要销售地区", "有可授权的自有品牌",
        "是否跨境出口专供货源",
    ]
    # Column order of the product sheet.
    _PRODUCT_COLUMNS = [
        "店铺id", "店铺名称", "店铺地址", "銷量", "商品名称", "轮播图",
        "起订量1", "价格1", "起订量2", "价格2", "起订量3", "价格3", "单位",
    ] + _PROP_COLUMNS + ["单位重量", "详情页html"]
    # Column order of the specification sheet.
    _SPEC_COLUMNS = ["店铺id", "规格名称", "可售数量", "图片id"]

    def __init__(self):
        """Open the MongoDB accessor and create the workbook writer.

        The workbook is written to ``settings.excel_path`` and its name
        carries a year/month/day/hour stamp.
        """
        self.col = MongoDao()
        super(导出到本地csv数据, self).__init__()
        stamp = "".join(self.getYMDHMSstrList()[0:4])
        file_name = "数据分析_1688_{}_v1.xlsx".format(stamp)
        pd_path = os.path.join(settings.excel_path, file_name)
        # strings_to_urls=False is an XlsxWriter workbook option that keeps
        # URL-looking strings as plain text instead of hyperlinks.
        # NOTE(review): the ``options=`` keyword was removed in pandas 2.0;
        # when upgrading, pass it through ``engine_kwargs`` instead.
        self.writer = pd.ExcelWriter(pd_path, options={'strings_to_urls': False})

    @staticmethod
    def _image_id(url):
        """Return the last path segment of ``url`` ('' when ``url`` is empty)."""
        if not url:
            return ''
        return urlparse(url).path.split("/")[-1]

    @staticmethod
    def _price_cells(doc, tiers=3):
        """Return the 起订量N / 价格N cells for the first ``tiers`` price tiers.

        Tiers that the document does not have are left as empty strings.
        """
        prices = (doc.get('order_param_model') or {}).get('originalPrices') or []
        cells = {}
        for index in range(tiers):
            amount = ""
            price = ""
            if index < len(prices):
                amount = prices[index].get('beginAmount')
                price = prices[index].get('price')
            cells["起订量{}".format(index + 1)] = amount
            cells["价格{}".format(index + 1)] = price
        return cells

    def _product_row(self, doc):
        """Build the product-sheet row for one document."""
        carousel_id = [self._image_id(image.get('fullPathImageURI'))
                       for image in doc.get('images') or []]

        # Start every product from blank attributes so that a missing
        # property neither raises nor inherits the previous product's value.
        props = dict.fromkeys(self._PROP_COLUMNS, "")
        for prop in doc.get('propsList') or []:
            name = prop.get('name')
            if name in props:
                props[name] = prop.get('value')

        row = {
            "店铺id": doc.get('id'),
            "店铺名称": doc.get('company_name'),
            "店铺地址": doc.get('url'),
            "銷量": doc.get('saledCount'),
            "商品名称": doc.get('title'),
            "轮播图": carousel_id,
            "单位": doc.get('offerUnit'),
            "单位重量": doc.get('unit_weight'),
            "详情页html": doc.get('detailUrl'),
        }
        row.update(self._price_cells(doc))
        row.update(props)
        return row

    def _spec_rows(self, doc):
        """Build the specification-sheet rows for one document.

        A specification gets the image id of the colour category whose
        ``name`` equals its ``specAttrs``; without such a match (or without
        an image URL) the image id stays empty.
        """
        colour_categories = doc.get('sub_colour_categorys') or []
        rows = []
        for sub_category in doc.get('sub_categorys') or []:
            spec_attrs = sub_category.get('specAttrs')
            image_id = ''
            for colour in colour_categories:
                if spec_attrs == colour.get('name') and colour.get('imageUrl'):
                    image_id = self._image_id(colour.get('imageUrl'))
            rows.append({
                "店铺id": doc.get('id'),
                "规格名称": spec_attrs,
                "可售数量": sub_category.get('canBookCount'),
                "图片id": image_id,
            })
        return rows

    def export(self):
        """Read every CLEAN_CONTENT document and write both sheets."""
        # "saledCount" is requested as well because the 銷量 column reads it
        # (assuming the third argument acts as a field projection).
        res = self.col.find_item('CLEAN_CONTENT', {}, {
            "id": 1, "company_name": 1, "url": 1, "title": 1,
            "sub_categorys": 1, "sub_colour_categorys": 1,
            "order_param_model": 1, "sellerLoginId": 1, "offerUnit": 1,
            "saledCount": 1, "images": 1, "propsList": 1,
            "detailUrl": 1, "unit_weight": 1,
        })
        product_rows = []
        spec_rows = []
        for s in res:
            print(s.get('id'))
            product_rows.append(self._product_row(s))
            spec_rows.extend(self._spec_rows(s))

        # Build each frame in one step; passing ``columns`` fixes the column
        # order and keeps the header row even when there are no documents.
        df = pd.DataFrame(product_rows, columns=self._PRODUCT_COLUMNS)
        df.to_excel(sheet_name="1-商品详情", index=False, excel_writer=self.writer)
        df_cat = pd.DataFrame(spec_rows, columns=self._SPEC_COLUMNS)
        df_cat.to_excel(sheet_name="2-规格详情", index=False, excel_writer=self.writer)
        # close() writes the workbook to disk.
        self.writer.close()

    def run(self):
        """Entry point: run the export."""
        self.export()
 if __name__ == '__main__':
    # Script entry point: build the exporter and run it immediately.
    导出到本地csv数据().run()
--- a/1688/spider/导出到本地json数据.py
+++ b/1688/spider/导出到本地json数据.py
@ -1,30 +0,0 @@
 from dao.mongo_dao import MongoDao
 from spider.baes import Baes
 from datetime import datetime
 import time
 import json
 class 导出到本地json数据(Baes):
    """Dump documents from the CLEAN_CONTENT collection as JSON lines."""

    def __init__(self):
        """Open the MongoDB accessor used by ``run``."""
        self.col = MongoDao()
        super(导出到本地json数据, self).__init__()

    def run(self):
        """Append every document, one JSON object per line, to a dated file.

        The file lives under ``../docs/`` and its name carries today's date.
        It is opened once for the whole export, so all documents of a run
        land in the same file and the file exists even when there are no
        documents.
        """
        res = self.col.find_item('CLEAN_CONTENT', {}, {"company_name": 1, "url": 1, "title": 1, "sub_categorys": 1,
                                                       "sub_colour_categorys": 1, "order_param_model": 1,
                                                       "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1,
                                                       "detailUrl": 1, "unit_weight": 1})
        # Compute the name and open the file once instead of once per row.
        out_path = f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json"
        with open(out_path, "a+", encoding="utf-8") as f:
            for s in res:
                # Drop the Mongo id before dumping; tolerate documents
                # that do not carry one.
                s.pop('_id', None)
                f.write(json.dumps(s) + '\n')
        print(f"【{datetime.now()}】完成")
 if __name__ == '__main__':
    # Script entry point: build the exporter and run it immediately.
    导出到本地json数据().run()