js_reverse/1688/clean/extractor.py
2021-11-15 02:20:41 +08:00

121 lines
5.0 KiB
Python

from dao.mongo_dao import MongoDao
from scrapy.selector import Selector
from spider.baes import Baes
from datetime import datetime
from tool.download_img import download_img
import time
import json
import re
class extractor(Baes):
def __init__(self):
self.col = MongoDao()
super(extractor, self).__init__()
def run(self):
res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
for s in res:
content = s.get('content')
sel = Selector(text=content, type='html')
title = sel.xpath('//title/text()').extract_first()
json_itme = re.findall(r'window.__INIT_DATA=(\{.*\})', content)[0]
json_dict = json.loads(json_itme)
globalData = json_dict.get('globalData')
offerId = globalData.get('tempModel').get('offerId')
print(f"{datetime.now()}】解析 {offerId}")
data = json_dict.get('data')
skuInfoMap = globalData.get('skuModel').get('skuInfoMap')
sub_categorys_canBookCount = []
for key, value in skuInfoMap.items():
sub_categorys_dict = {
'specId': value.get('specId'),
'specAttrs': key.replace('>', '|'),
'Price': globalData.get('tempModel').get('price'),
'canBookCount': value.get('canBookCount')
}
sub_categorys_canBookCount.append(sub_categorys_dict)
if globalData.get('skuModel').get('skuProps'):
skuProps = globalData.get('skuModel').get('skuProps')
list_dict = []
for skuProp in skuProps:
prop = skuProp.get('prop')
value = skuProp.get('value')
for val in value:
item_dict = {
'OptionImageUrl': val.get('imageUrl') or '',
'name': prop,
'optionValue': val.get('name')
}
list_dict.append(item_dict)
else:
list_dict = []
orderParam = globalData.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices')
companyName = globalData.get('tempModel').get('companyName')
sellerLoginId = globalData.get('tempModel').get('sellerLoginId')
offerUnit = globalData.get('tempModel').get('offerUnit')
saledCount = globalData.get('tempModel').get('saledCount')
images = globalData.get('images')
# for image in images:
# fullPathImageURI = image.get('fullPathImageURI')
# download_img(fullPathImageURI, offerId)
# print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
# time.sleep(1)
a_590893001984 = data.get('590893001984')
if not a_590893001984:
priceModel = globalData.get('processingModel').get('offerPriceModel').get('currentPrices')
else:
priceModel = a_590893001984.get('data').get('priceModel')
a_590893001997 = data.get('590893001997')
if not a_590893001997:
# unitWeight = data.get('605462009364').get('data').get('test').get('unitWeight')
location = data.get('605462009364').get('data').get('location')
cost = data.get('605462009364').get('data').get('logistics')
else:
# unitWeight = a_590893001997.get('data').get('test').get('unitWeight')
location = a_590893001997.get('data').get('location')
cost = a_590893001997.get('data').get('logistics')
logistics = [{"from": location}, {"cost": cost.replace('快递', '').strip()}]
a_590893002003 = data.get('590893002003')
if not a_590893002003:
a_590893002003 = data.get('605462009362')
propsList = a_590893002003.get('data').get('propsList')
detailUrl = globalData.get('detailModel').get('detailUrl')
item = {
"sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)),
"id": offerId,
"company_name": companyName,
"url": "https://detail.1688.com/offer/{}.html".format(offerId),
"title": title,
"sub_categorys_canBookCount": sub_categorys_canBookCount,
"sub_categorys_option": list_dict,
"order_param_model": priceModel,
"sellerLoginId": sellerLoginId,
"offerUnit": offerUnit,
"30daySold": saledCount,
"images": images,
"propsList": propsList,
"detailUrl": detailUrl,
"unit_weight": "",
"logistics": logistics
}
self.col.insert_item('CLEAN_CONTENT', item)
if __name__ == '__main__':
f = extractor()
f.run()