Revised patch. Line breaks and indentation were reconstructed from a
whitespace-collapsed copy: verify context-line whitespace before applying.
The blob ids on the "index" lines of baes.py and 导出到本地csv数据.py predate
this revision and should be regenerated.

diff --git a/1688/clean/extractor.py b/1688/clean/extractor.py
index 74214d0..07959a2 100644
--- a/1688/clean/extractor.py
+++ b/1688/clean/extractor.py
@@ -49,13 +49,14 @@ class extractor(Baes):
         companyName = globalData.get('tempModel').get('companyName')
         sellerLoginId = globalData.get('tempModel').get('sellerLoginId')
         offerUnit = globalData.get('tempModel').get('offerUnit')
+        saledCount = globalData.get('tempModel').get('saledCount')
 
         images = globalData.get('images')
-        for image in images:
-            fullPathImageURI = image.get('fullPathImageURI')
-            download_img(fullPathImageURI, offerId)
-            print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
-            time.sleep(1)
+        # for image in images:
+        #     fullPathImageURI = image.get('fullPathImageURI')
+        #     download_img(fullPathImageURI, offerId)
+        #     print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
+        #     time.sleep(1)
 
         a_590893001984 = data.get('590893001984')
         if not a_590893001984:
@@ -78,6 +79,7 @@ class extractor(Baes):
 
         item = {
             "sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)),
+            "id": offerId,
             "company_name": companyName,
             "url": "https://detail.1688.com/offer/{}.html".format(offerId),
             "title": title,
@@ -86,6 +88,7 @@ class extractor(Baes):
             "order_param_model": priceModel,
             "sellerLoginId": sellerLoginId,
             "offerUnit": offerUnit,
+            "saledCount": saledCount,
             "images": images,
             "propsList": propsList,
             "detailUrl": detailUrl,
diff --git a/1688/settings.py b/1688/settings.py
index 9735374..54b5073 100644
--- a/1688/settings.py
+++ b/1688/settings.py
@@ -2,7 +2,7 @@ import os
 
 
 MONGODB_CONF = {
-    'host': '192.168.1.107',
+    'host': '127.0.0.1',
     'port': 27017,
     'username': '',
     'pwd': "",
diff --git a/1688/spider/baes.py b/1688/spider/baes.py
index e906366..2d61bfd 100644
--- a/1688/spider/baes.py
+++ b/1688/spider/baes.py
@@ -1,10 +1,20 @@
 from urllib.parse import urlparse
+from datetime import datetime
 import hashlib
 import base64
 
 
 class Baes(object):
 
+    def getYMDHMSstrList(self):
+        """Return the current local time as zero-padded string fields.
+
+        Returns:
+            list[str]: ``[year, month, day, hour, minute, second]``, e.g.
+            ``['2019', '03', '01', '09', '05', '07']``.
+        """
+        return datetime.now().strftime("%Y %m %d %H %M %S").split()
+
     def generate_sign(self, url):
         """通过md5生成项目符号表单URL"""
         md5 = hashlib.md5()
diff --git a/1688/spider/导出到本地csv数据.py b/1688/spider/导出到本地csv数据.py
new file mode 100644
index 0000000..f5de318
--- /dev/null
+++ b/1688/spider/导出到本地csv数据.py
@@ -0,0 +1,141 @@
+"""Export cleaned 1688 offers from MongoDB to an Excel workbook."""
+from dao.mongo_dao import MongoDao
+from spider.baes import Baes
+import settings
+import pandas as pd
+from urllib.parse import urlparse
+import os
+
+# propsList entries copied into the same-named columns of the detail sheet.
+PROP_COLUMNS = (
+    "品牌", "货号", "包装", "材质", "尺寸", "颜色", "是否专利货源", "是否进口",
+    "造型", "主要下游平台", "主要销售地区", "有可授权的自有品牌", "是否跨境出口专供货源",
+)
+
+# Column order of the "1-商品详情" sheet.
+DETAIL_COLUMNS = [
+    "店铺id", "店铺名称", "店铺地址", "銷量", "商品名称", "轮播图",
+    "起订量1", "价格1", "起订量2", "价格2", "起订量3", "价格3", "单位",
+    *PROP_COLUMNS,
+    "单位重量", "详情页html",
+]
+
+# Column order of the "2-规格详情" sheet.
+SPEC_COLUMNS = ["店铺id", "规格名称", "可售数量", "图片id"]
+
+# Fields fetched from CLEAN_CONTENT; saledCount feeds the "銷量" column.
+PROJECTION = {
+    "id": 1, "company_name": 1, "url": 1, "title": 1, "saledCount": 1,
+    "sub_categorys": 1, "sub_colour_categorys": 1, "order_param_model": 1,
+    "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1,
+    "detailUrl": 1, "unit_weight": 1,
+}
+
+
+def _image_id(url):
+    """Return the last path segment of ``url`` ('' when ``url`` is empty)."""
+    if not url:
+        return ''
+    return urlparse(url).path.split("/")[-1]
+
+
+def _price_cells(doc):
+    """Return the first three price tiers as six cells, padded with ''."""
+    prices = (doc.get('order_param_model') or {}).get('originalPrices') or []
+    cells = []
+    for tier in prices[:3]:
+        cells += [tier.get('beginAmount'), tier.get('price')]
+    return cells + [""] * (6 - len(cells))
+
+
+def _detail_row(doc):
+    """Build the "1-商品详情" row for one CLEAN_CONTENT document."""
+    # Start every product from blank properties so that a property missing
+    # here neither raises NameError nor inherits the previous product's value.
+    props = dict.fromkeys(PROP_COLUMNS, "")
+    for prop in doc.get('propsList') or []:
+        if prop.get('name') in props:
+            props[prop.get('name')] = prop.get('value')
+
+    qdl_1, jg_1, qdl_2, jg_2, qdl_3, jg_3 = _price_cells(doc)
+    row = {
+        "店铺id": doc.get('id'),
+        "店铺名称": doc.get('company_name'),
+        "店铺地址": doc.get('url'),
+        "銷量": doc.get('saledCount'),
+        "商品名称": doc.get('title'),
+        "轮播图": [_image_id(image.get('fullPathImageURI'))
+                for image in doc.get('images') or []],
+        "起订量1": qdl_1,
+        "价格1": jg_1,
+        "起订量2": qdl_2,
+        "价格2": jg_2,
+        "起订量3": qdl_3,
+        "价格3": jg_3,
+        "单位": doc.get('offerUnit'),
+        "单位重量": doc.get('unit_weight'),
+        "详情页html": doc.get('detailUrl'),
+    }
+    row.update(props)
+    return row
+
+
+def _spec_rows(doc):
+    """Build the "2-规格详情" rows (one per sub-category) for one document."""
+    # Image id per colour name; entries without an imageUrl are skipped and a
+    # later duplicate name overrides an earlier one.
+    image_ids = {}
+    for colour in doc.get('sub_colour_categorys') or []:
+        if colour.get('imageUrl'):
+            image_ids[colour.get('name')] = _image_id(colour.get('imageUrl'))
+
+    return [
+        {
+            "店铺id": doc.get('id'),
+            "规格名称": spec.get('specAttrs'),
+            "可售数量": spec.get('canBookCount'),
+            "图片id": image_ids.get(spec.get('specAttrs'), ''),
+        }
+        for spec in doc.get('sub_categorys') or []
+    ]
+
+
+class 导出到本地csv数据(Baes):
+    """Export the CLEAN_CONTENT collection to a two-sheet Excel workbook."""
+
+    def __init__(self):
+        self.col = MongoDao()
+        super(导出到本地csv数据, self).__init__()
+        # The file name carries the first four timestamp fields (Y, m, d, H).
+        path_1 = "数据分析_1688_{}_v1.xlsx".format("".join(self.getYMDHMSstrList()[0:4]))
+        pd_path = os.path.join(settings.excel_path, path_1)
+        # NOTE(review): `options=` is accepted only by older pandas releases;
+        # newer ones expect `engine_kwargs` - confirm the pinned version.
+        self.writer = pd.ExcelWriter(pd_path, options={'strings_to_urls': False})
+
+    def export(self):
+        """Write one detail row per offer and one spec row per sub-category."""
+        res = self.col.find_item('CLEAN_CONTENT', {}, PROJECTION)
+
+        dict_list = []
+        dict_list_cat = []
+        for s in res:
+            print(s.get('id'))
+            dict_list.append(_detail_row(s))
+            dict_list_cat.extend(_spec_rows(s))
+
+        df = pd.DataFrame(dict_list, columns=DETAIL_COLUMNS)
+        df.to_excel(sheet_name="1-商品详情", index=False, excel_writer=self.writer)
+
+        df_cat = pd.DataFrame(dict_list_cat, columns=SPEC_COLUMNS)
+        df_cat.to_excel(sheet_name="2-规格详情", index=False, excel_writer=self.writer)
+        # close() flushes the workbook to disk.
+        self.writer.close()
+
+    def run(self):
+        self.export()
+
+
+if __name__ == '__main__':
+    f = 导出到本地csv数据()
+    f.run()
diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py
deleted file mode 100644
index 24a3c87..0000000
--- a/1688/spider/导出到本地json数据.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from dao.mongo_dao import MongoDao
-from spider.baes import Baes
-from datetime import datetime
-import time
-import json
-
-
-class 导出到本地json数据(Baes):
-
-    def __init__(self):
-        self.col = MongoDao()
-        super(导出到本地json数据, self).__init__()
-
-    def run(self):
-        res = self.col.find_item('CLEAN_CONTENT', {}, {"company_name": 1, "url": 1, "title": 1, "sub_categorys": 1,
-                                                       "sub_colour_categorys": 1, "order_param_model": 1,
-                                                       "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1,
-                                                       "detailUrl": 1, "unit_weight": 1})
-
-        for s in res:
-            s.pop('_id')
-            with open(f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f:
-                f.write(json.dumps(s) + '\n')
-
-        print(f"【{datetime.now()}】完成")
-
-
-if __name__ == '__main__':
-    f = 导出到本地json数据()
-    f.run()