diff --git a/1688/spider/导出到本地json数据.py b/1688/spider/导出到本地json数据.py index 0c7a862..e7e6523 100644 --- a/1688/spider/导出到本地json数据.py +++ b/1688/spider/导出到本地json数据.py @@ -1,31 +1,31 @@ +from scrapy.selector import Selector from dao.mongo_dao import MongoDao from spider.baes import Baes from datetime import datetime -import pandas as pd import time import json +import re -class 导出到本地json数据(Baes): +class 导出到本地元数据(Baes): def __init__(self): self.col = MongoDao() - super(导出到本地json数据, self).__init__() + super(导出到本地元数据, self).__init__() def run(self): - res = self.col.find_item('CLEAN_CONTENT', {}, {"company_name": 1, "url": 1, "title": 1, "sub_categorys": 1, - "sub_colour_categorys": 1, "order_param_model": 1, - "sellerLoginId": 1, "offerUnit": 1, "images": 1, "propsList": 1, - "detailUrl": 1, "unit_weight": 1}) + res = self.col.find_item('RAW_CONTENT', {}, {"content": 1}) for s in res: s.pop('_id') - with open(f"../docs/导出到本地json数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: - f.write(json.dumps(s) + '\n') + content = s.get('content') + json_itme = re.findall(r'window.__INIT_DATA=(\{.*\})', content)[0] + with open(f"../docs/导出到本地元数据{time.strftime('%Y-%m-%d', time.localtime())}.json", "a+") as f: + f.write(json.dumps(json_itme) + '\n') print(f"【{datetime.now()}】完成") if __name__ == '__main__': - f = 导出到本地json数据() + f = 导出到本地元数据() f.run() diff --git a/1688/spider/导出到本地xlsx数据.py b/1688/spider/导出到本地xlsx数据.py index e754b2b..198abcd 100644 --- a/1688/spider/导出到本地xlsx数据.py +++ b/1688/spider/导出到本地xlsx数据.py @@ -13,6 +13,18 @@ class 导出到本地xlsx数据(Baes): self.col = MongoDao() super(导出到本地xlsx数据, self).__init__() + path_1 = "包含产品的所有信息的表_1688_{}_v1.xlsx".format("".join(self.getYMDHMSstrList()[0:4])) + pd_path_1 = os.path.join(settings.excel_path, path_1) + self.writer = pd.ExcelWriter(pd_path_1, options={'strings_to_urls': False}) + + path_2 = "产品图片_1688_{}_v1.xlsx".format("".join(self.getYMDHMSstrList()[0:4])) + pd_path_2 = os.path.join(settings.excel_path, path_2) + self.writer_img = pd.ExcelWriter(pd_path_2, options={'strings_to_urls': False}) + + path_3 = "选项列_1688_{}_v1.xlsx".format("".join(self.getYMDHMSstrList()[0:4])) + pd_path_3 = os.path.join(settings.excel_path, path_3) + self.writer_option = pd.ExcelWriter(pd_path_3, options={'strings_to_urls': False}) + def export(self, company_name): res = self.col.find_item('CLEAN_CONTENT', {"company_name": company_name}, None) @@ -111,16 +123,20 @@ class 导出到本地xlsx数据(Baes): dict_list_row.append(row_dict) df = df.append(dict_list, ignore_index=True, sort=False) - df.to_csv('../docs/1-产品属性.csv', index=False, header=True) - - df_cat = df_cat.append(dict_list_cat, ignore_index=True, sort=False) - df_cat.to_csv('../docs/2-产品图片.csv', index=False, header=True) + df.to_excel(sheet_name="1-产品属性", index=False, excel_writer=self.writer) df_price = df_price.append(dict_list_price, ignore_index=True, sort=False) - df_price.to_csv('../docs/3-价格区间.csv', index=False, header=True) + df_price.to_excel(sheet_name="2-价格区间", index=False, excel_writer=self.writer) + + df_cat = df_cat.append(dict_list_cat, ignore_index=True, sort=False) + df_cat.to_excel(sheet_name="1-产品图片", index=False, excel_writer=self.writer_img) df_row = df_row.append(dict_list_row, ignore_index=True, sort=False) - df_row.to_csv('../docs/4-选项列.csv', index=False, header=True) + df_row.to_excel(sheet_name="1-选项列", index=False, excel_writer=self.writer_option) + + self.writer.save() + self.writer_img.save() + self.writer_option.save() def run(self, company_name): self.export(company_name)