From d60a1f8efa5f9f3923f46c340be3c4bc7e513b86 Mon Sep 17 00:00:00 2001 From: aiguigu Date: Thu, 4 Nov 2021 02:18:24 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=AF=BC=E5=87=BA=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 1688/clean/extractor.py | 2 +- 1688/spider/导出到本地csv数据.py | 218 ------------------------------ 1688/spider/导出到本地json数据.py | 31 +++++ 1688/spider/导出到本地xlsx数据.py | 137 +++++++++++++++++++ 4 files changed, 169 insertions(+), 219 deletions(-) delete mode 100644 1688/spider/导出到本地csv数据.py create mode 100644 1688/spider/导出到本地json数据.py create mode 100644 1688/spider/导出到本地xlsx数据.py diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py index 07959a2..370f5fb 100644 --- a/1688/clean/extractor.py +++ b/1688/clean/extractor.py @@ -40,7 +40,7 @@ class extractor(Baes): sub_categorys.append(sub_categorys_dict) if globalData.get('skuModel').get('skuProps'): - value = globalData.get('skuModel').get('skuProps')[0].get('value') + value = globalData.get('skuModel').get('skuProps') sub_colour_categorys = value else: sub_colour_categorys = [] diff --git a/1688/spider/导出到本地csv数据.py b/1688/spider/导出到本地csv数据.py deleted file mode 100644 index 61267af..0000000 --- a/1688/spider/导出到本地csv数据.py +++ /dev/null @@ -1,218 +0,0 @@ -from dao.mongo_dao import MongoDao -from spider.baes import Baes -import settings -import pandas as pd -from urllib.parse import urlparse -import os - - -class 导出到本地csv数据(Baes): - - def __init__(self): - self.col = MongoDao() - super(导出到本地csv数据, self).__init__() - path_1 = "数据分析_1688_{}_v1.xlsx".format("".join(self.getYMDHMSstrList()[0:4])) - pd_path = os.path.join(settings.excel_path, path_1) - self.writer = pd.ExcelWriter(pd_path, options={'strings_to_urls': False}) - - def export(self): - res = self.col.find_item('CLEAN_CONTENT', {"company_name" : "坂戈实力旗舰店"}, None) - - # 初始化df - df = pd.DataFrame(columns={ - "店铺id": str, - "店铺名称": str, - "店铺地址": str, - "30天銷量": pd.to_numeric, - "商品名称": str, - "轮播图": str, - "起订量1": str, - "价格1": str, - "起订量2": str, - "价格2": str, - "起订量3": str, - "价格3": str, - "单位": str, - "品牌": str, - "货号": str, - "包装": str, - "材质": str, - "尺寸": str, - "颜色": str, - "是否专利货源": str, - "是否进口": str, - "造型": str, - "主要下游平台": str, - "主要销售地区": str, - "有可授权的自有品牌": str, - "是否跨境出口专供货源": str, - "单位重量": pd.to_numeric, - "详情页html": str - }) - - # 初始化 df_cat - df_cat = pd.DataFrame(columns={ - "店铺id": str, - "规格名称": str, - "可售数量": str, - "图片id": str - }) - - dict_list = [] - dict_list_cat = [] - - for s in res: - carousel_id = [] - images = s.get('images') - for image in images: - fullPathImageURI = image.get('fullPathImageURI') - url_path = urlparse(fullPathImageURI).path - carousel_id.append(url_path.split("/")[-1]) - - pp = '' - hh = '' - bz = '' - cz = '' - cc = '' - ys = '' - yszlzy = '' - sfjk = '' - zx = '' - zyxy = '' - zyxsdq = '' - yksqdzypp = '' - sfkjckzgzy = '' - - propsList = s.get('propsList') - for props in propsList: - if props.get('name') == "品牌": - pp = props.get('value') - continue - if props.get('name') == "货号": - hh = props.get('value') - continue - if props.get('name') == "包装": - bz = props.get('value') - continue - if props.get('name') == "材质": - cz = props.get('value') - continue - if props.get('name') == "尺寸": - cc = props.get('value') - continue - if props.get('name') == "颜色": - ys = props.get('value') - continue - if props.get('name') == "是否专利货源": - yszlzy = props.get('value') - continue - if props.get('name') == "是否进口": - sfjk = props.get('value') - continue - if props.get('name') == "造型": - zx = props.get('value') - continue - if props.get('name') == "主要下游平台": - zyxy = props.get('value') - continue - if props.get('name') == "主要销售地区": - zyxsdq = props.get('value') - continue - if props.get('name') == "有可授权的自有品牌": - yksqdzypp = props.get('value') - continue - if props.get('name') == "是否跨境出口专供货源": - sfkjckzgzy = props.get('value') - continue - - originalPrices = s.get('order_param_model').get('originalPrices') - qdl_1 = "" - jg_1 = "" - qdl_2 = "" - jg_2 = "" - qdl_3 = "" - jg_3 = "" - if len(originalPrices) >= 3: - qdl_1 = s.get('order_param_model').get('originalPrices')[0].get('beginAmount') - jg_1 = s.get('order_param_model').get('originalPrices')[0].get('price') - qdl_2 = s.get('order_param_model').get('originalPrices')[1].get('beginAmount') - jg_2 = s.get('order_param_model').get('originalPrices')[1].get('price') - qdl_3 = s.get('order_param_model').get('originalPrices')[2].get('beginAmount') - jg_3 = s.get('order_param_model').get('originalPrices')[2].get('price') - if len(originalPrices) >= 2: - qdl_1 = s.get('order_param_model').get('originalPrices')[0].get('beginAmount') - jg_1 = s.get('order_param_model').get('originalPrices')[0].get('price') - qdl_2 = s.get('order_param_model').get('originalPrices')[1].get('beginAmount') - jg_2 = s.get('order_param_model').get('originalPrices')[1].get('price') - if len(originalPrices) >= 1: - qdl_1 = s.get('order_param_model').get('originalPrices')[0].get('beginAmount') - jg_1 = s.get('order_param_model').get('originalPrices')[0].get('price') - - item = { - "店铺id": s.get('id'), - "店铺名称": s.get('company_name'), - "店铺地址": s.get('url'), - "30天銷量": s.get('saledCount'), - "商品名称": s.get('title'), - "轮播图": carousel_id, - "起订量1": qdl_1, - "价格1": jg_1, - "起订量2": qdl_2, - "价格2": jg_2, - "起订量3": qdl_3, - "价格3": jg_3, - "单位": s.get('offerUnit'), - "品牌": pp, - "货号": hh, - "包装": bz, - "材质": cz, - "尺寸": cc, - "颜色": ys, - "是否专利货源": yszlzy, - "是否进口": sfjk, - "造型": zx, - "主要下游平台": zyxy, - "主要销售地区": zyxsdq, - "有可授权的自有品牌": yksqdzypp, - "是否跨境出口专供货源": sfkjckzgzy, - "单位重量": s.get('unit_weight'), - "详情页html": s.get('detailUrl') - } - dict_list.append(item) - - # 规格详情開始 - sub_categorys = s.get('sub_categorys') - sub_colour_categorys = s.get('sub_colour_categorys') - - for sub_category in sub_categorys: - imageUrl_id = '' - specAttrs = sub_category.get('specAttrs') - for sub_colour_category in sub_colour_categorys: - if sub_colour_category.get('name') in specAttrs: - imageUrl = sub_colour_category.get('imageUrl') or '' - if imageUrl: - url_path = urlparse(imageUrl).path - imageUrl_id = url_path.split("/")[-1] - - cat_item = { - "店铺id": s.get('id'), - "规格名称": specAttrs, - "可售数量": sub_category.get('canBookCount'), - "图片id": imageUrl_id - } - dict_list_cat.append(cat_item) - - df = df.append(dict_list, ignore_index=True, sort=False) - df.to_excel(sheet_name="1-商品详情", index=False, excel_writer=self.writer) - - df_cat = df_cat.append(dict_list_cat, ignore_index=True, sort=False) - df_cat.to_excel(sheet_name="2-规格详情", index=False, excel_writer=self.writer) - self.writer.save() - - def run(self): - self.export() - - -if __name__ == '__main__': - f = 导出到本地csv数据() - f.run() diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py new file mode 100644 index 0000000..0c7a862 --- /dev/null +++ b/1688/spider/导出到本地json数据.py @@ -0,0 +1,31 @@ +from dao.mongo_dao import MongoDao +from spider.baes import Baes +from datetime import datetime +import pandas as pd +import time +import json + + +class 导出到本地json数据(Baes): + + def __init__(self): + self.col = MongoDao() + super(导出到本地json数据, self).__init__() + + def run(self): + res = self.col.find_item('CLEAN_CONTENT', {}, {"company_name": 1, "url": 1, "title": 1, "sub_categorys": 1, + "sub_colour_categorys": 1, "order_param_model": 1, + "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1, + "detailUrl": 1, "unit_weight": 1}) + + for s in res: + s.pop('_id') + with open(f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(s) + '\n') + + print(f"【{datetime.now()}】完成") + + +if __name__ == '__main__': + f = 导出到本地json数据() + f.run() diff --git a/1688/spider/导出到本地xlsx数据.py b/1688/spider/导出到本地xlsx数据.py new file mode 100644 index 0000000..42c7d5c --- /dev/null +++ b/1688/spider/导出到本地xlsx数据.py @@ -0,0 +1,137 @@ +from dao.mongo_dao import MongoDao +from spider.baes import Baes +import settings +import pandas as pd +from urllib.parse import urlparse +import os +import json + + +class 导出到本地xlsx数据(Baes): + + def __init__(self): + self.col = MongoDao() + super(导出到本地xlsx数据, self).__init__() + + def export(self, company_name, writer): + res = self.col.find_item('CLEAN_CONTENT', {"company_name": company_name}, None) + + # 初始化df + df = pd.DataFrame(columns={ + "product_id": str, + "product_attributes": str + }) + + # 初始化 df_cat + df_cat = pd.DataFrame(columns={ + "product_id": str, + "sku_description": str, + "sku_image": str + }) + + # 价格区间 + df_price = pd.DataFrame(columns={ + "product_id": str, + "priceRanges": str + }) + + # 选项列 + df_row = pd.DataFrame(columns={ + "product_id": str, + "option_name": str, + "option_value": str + }) + + dict_list = [] + dict_list_cat = [] + dict_list_price = [] + dict_list_row = [] + + for s in res: + # 产品属性 + item = { + "product_id": s.get('id'), + "product_attributes": s.get('propsList') + } + dict_list.append(item) + + # 产品图片 + sub_categorys = s.get('sub_categorys') + + if s.get('sub_colour_categorys'): + sub_colour_categorys = s.get('sub_colour_categorys')[0].get('value') + + for sub_category in sub_categorys: + imageUrl_id = '' + specAttrs = sub_category.get('specAttrs').replace('>', '') + for sub_colour_category in sub_colour_categorys: + if sub_colour_category.get('name') in specAttrs: + imageUrl = sub_colour_category.get('imageUrl') or '' + if imageUrl: + url_path = urlparse(imageUrl).path + imageUrl_id = url_path.split("/")[-1] + + cat_item = { + "product_id": s.get('id'), + "sku_description": specAttrs, + "sku_image": imageUrl_id + } + dict_list_cat.append(cat_item) + + # 价格区间 + originalPrices = s.get('order_param_model').get('originalPrices') + price_str = "" + for originalPrice in originalPrices: + beginAmount = originalPrice.get('beginAmount') + price = originalPrice.get('price') + price_item = str(beginAmount) + ':' + str(price) + price_item_str = json.dumps(price_item) + if price_str: + price_str = price_str + '`' + price_item_str + else: + price_str = price_item_str + + price_item = { + "product_id": s.get('id'), + "priceRanges": price_str, + } + dict_list_price.append(price_item) + + # 选项列 + sub_colour_categorys = s.get('sub_colour_categorys') + for sub_colour_category in sub_colour_categorys: + values = sub_colour_category.get('value') + prop = sub_colour_category.get('prop') + for value in values: + row_dict = { + "product_id": s.get('id'), + "option_name": prop, + "option_value": value.get('name') + } + dict_list_row.append(row_dict) + + df = df.append(dict_list, ignore_index=True, sort=False) + df.to_excel(sheet_name="1-产品属性", index=False, excel_writer=writer) + + df_cat = df_cat.append(dict_list_cat, ignore_index=True, sort=False) + df_cat.to_excel(sheet_name="2-产品图片", index=False, excel_writer=writer) + + df_price = df_price.append(dict_list_price, ignore_index=True, sort=False) + df_price.to_excel(sheet_name="3-价格区间", index=False, excel_writer=writer) + + df_row = df_row.append(dict_list_row, ignore_index=True, sort=False) + df_row.to_excel(sheet_name="4-选项列", index=False, excel_writer=writer) + + writer.save() + + def run(self, company_name): + path_1 = f"{company_name}_1688_{''.join(self.getYMDHMSstrList()[0:4])}_v1.xlsx" + pd_path = os.path.join(settings.excel_path, path_1) + writer = pd.ExcelWriter(pd_path, options={'strings_to_urls': False}) + self.export(company_name, writer) + + +if __name__ == '__main__': + f = 导出到本地xlsx数据() + company_name = '东莞市茶山品轩玩具厂' + f.run(company_name)