mirror of https://github.com/luzhisheng/js_reverse.git
synced 2025-04-19 18:24:51 +08:00

增加1688数据 (Add 1688 data)

This commit is contained in:
parent 8ebb4e6461
commit eecfc80a1c
1688/clean/__init__.py   (new file, 0 lines)
1688/clean/extractor.py  (new file, 80 lines)
1688/clean/extractor.py

@@ -0,0 +1,80 @@
from dao.mongo_dao import MongoDao
from scrapy.selector import Selector
from spider.baes import Baes
from datetime import datetime
from tool.download_img import download_img
import time
import json
import re


class extractor(Baes):

    def __init__(self):
        self.col = MongoDao()
        super(extractor, self).__init__()

    def run(self):
        res = self.col.find_item('RAW_CONTENT', {}, {"content": 1})
        for s in res:
            content = s.get('content')
            sel = Selector(text=content, type='html')
            title = sel.xpath('//title/text()').extract_first()

            # The product model is embedded in the page as a JSON blob
            # assigned to window.__INIT_DATA; pull it out with a regex.
            json_item = re.findall(r'window.__INIT_DATA=(\{.*\})', content)[0]
            json_dict = json.loads(json_item)
            globalData = json_dict.get('globalData')
            offerId = globalData.get('tempModel').get('offerId')

            data = json_dict.get('data')
            skuInfoMap = globalData.get('skuModel').get('skuInfoMap')

            # Flatten the SKU map into a list of spec/price/stock dicts.
            sub_categorys = []
            for key, value in skuInfoMap.items():
                sub_categorys_dict = {
                    'specId': value.get('specId'),
                    'specAttrs': key,
                    'discountPrice': value.get('discountPrice'),
                    'canBookCount': value.get('canBookCount')
                }
                sub_categorys.append(sub_categorys_dict)

            orderParam = globalData.get('orderParamModel').get('orderParam').get('skuParam').get('skuRangePrices')
            companyName = globalData.get('tempModel').get('companyName')
            sellerLoginId = globalData.get('tempModel').get('sellerLoginId')
            offerUnit = globalData.get('tempModel').get('offerUnit')
            images = globalData.get('images')

            for image in images:
                fullPathImageURI = image.get('fullPathImageURI')
                download_img(fullPathImageURI, offerId)
                print(f"【{datetime.now()}】图片下载{fullPathImageURI}")  # log: image downloaded
                time.sleep(1)

            # The module id that holds propsList differs between pages; fall
            # back to the second known id when the first is absent.
            a_590893002003 = data.get('590893002003')
            if not a_590893002003:
                a_590893002003 = data.get('605462009362')
            propsList = a_590893002003.get('data').get('propsList')
            detailUrl = globalData.get('detailModel').get('detailUrl')

            item = {
                "company_name": companyName,
                "company_name_url": "https://detail.1688.com/offer/{}.html".format(offerId),
                "title": title,
                "sub_categorys": sub_categorys,
                "order_param_model": orderParam,
                "sellerLoginId": sellerLoginId,
                "offerUnit": offerUnit,
                "images": images,
                "propsList": propsList,
                "detailUrl": detailUrl
            }
            print(json.dumps(item))
            exit()  # debug: stop after the first parsed document

            print(f"【{datetime.now()}】解析{offerId}")  # log: parsed offerId (unreachable past exit())


if __name__ == '__main__':
    f = extractor()
    f.run()
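The extractor leans on one fact about 1688 detail pages: the full product model is serialized into the page as `window.__INIT_DATA={...}`. A minimal, self-contained sketch of that extraction step, using a made-up stand-in page rather than real 1688 markup:

import json
import re

# Hypothetical page source; real 1688 pages embed a much larger blob.
html = ('<html><head><title>demo</title></head><body>'
        '<script>window.__INIT_DATA={"globalData": {"tempModel": {"offerId": 123}}}</script>'
        '</body></html>')

match = re.findall(r'window\.__INIT_DATA=(\{.*\})', html)
if match:
    init_data = json.loads(match[0])
    # Same access path the extractor uses for the offer id.
    print(init_data.get('globalData', {}).get('tempModel', {}).get('offerId'))  # -> 123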
dao/mongo_dao.py

@@ -40,13 +40,14 @@ class MongoDao(object):
         print(f"【{datetime.now()}】入库{item.get('url')}")
         return collection.insert_one(item)

-    def update_item(self, collection, item):
+    def update_item(self, collection, sign):
         collection = self.client[collection]
-        if collection.find_one({"sign": item['sign']}):
-            return collection.update_one({"sign": item['sign']}, {"$set": {"stauts": '1'}})
+        if collection.find_one({"sign": sign}):
+            return collection.update_one({"sign": sign}, {"$set": {"stauts": '1'}})
         else:
             print(f"【{datetime.now()}】过滤")

     def find_item(self, collection, query, projection):
         collection = self.client[collection]
-        return collection.find(query, projection)
+        return collection.find(query, projection).batch_size(1)
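Two details here are worth spelling out. `update_item` now takes the `sign` directly instead of a whole item dict, matching how the detail spider calls it below. And `find_item` gains `.batch_size(1)`, presumably because every caller sleeps between documents, so a cursor idling on a large pre-fetched batch risks being reaped server-side before the loop comes back for more. (The misspelled `stauts` field is left as-is; it is the name used consistently across the collections.) A sketch of the batch-size effect, with placeholder connection details:

import time
from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')  # placeholder host
coll = client['test']['RAW_CONTENT']               # placeholder db/collection

# With the default batch size the driver buffers many documents and iterates
# them locally; if per-document work is slow, the idle server-side cursor can
# time out (CursorNotFound). batch_size(1) issues a getMore per document,
# keeping the cursor active between iterations.
for doc in coll.find({}, {"content": 1}).batch_size(1):
    time.sleep(1)  # stand-in for slow per-document work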
settings.py

@@ -1,5 +1,8 @@
+import os
+
+
 MONGODB_CONF = {
-    'host': '127.0.0.1',
+    'host': '192.168.5.151',
     'port': 27017,
     'username': '',
     'pwd': "",
@@ -8,3 +11,6 @@ MONGODB_CONF = {
     'status': '',
     'producer': ''
 }
+
+DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+excel_path = os.path.join(DOCS_PATH, '1688/docs/')
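If `settings.py` sits directly under the `1688/` project directory (the layout that the bare `import settings` in `download_img.py` suggests, though the diff does not confirm it), the double `dirname` climbs to the repository root, so `excel_path` lands inside `1688/docs/`. A sketch with a hypothetical checkout path:

import os

settings_file = '/srv/js_reverse/1688/settings.py'  # hypothetical location
DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(settings_file)))
excel_path = os.path.join(DOCS_PATH, '1688/docs/')
print(DOCS_PATH)   # /srv/js_reverse
print(excel_path)  # /srv/js_reverse/1688/docs/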
@@ -14,10 +14,12 @@ class Film(Baes):
         super(Film, self).__init__()

     def run(self):
-        for i in range(1, 24):
-            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
-            x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
-            url = self.url.format(i)
+        for i in range(15, 24):
+            cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1"
+            x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \
+                    "35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \
+                    "4d5463344e4473784d4f57426e355542227d"
+            url = self.url.format(i).replace('detail', 'm')
             headers = {
                 'cookie': f"cookie2={cookie2};x5sec={x5sec}"
             }
@@ -27,13 +29,18 @@ class Film(Baes):
                 print(f"【{datetime.now()}】报错{i}")
                 exit()

+            if '全球领先的采购批发平台,批发网' in response.text:
+                print(f"【{datetime.now()}】报错{i}")
+                exit()
+
             sel = Selector(text=response.text, type='html')
             urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()

             for url in urls:
                 item = {
                     "sign": self.generate_sign(url),
-                    "url": url
+                    "url": url,
+                    "stauts": "0"
                 }
                 self.col.insert_item('RAW_URLS', item)
                 time.sleep(10)
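The `x5sec` values being swapped in these hunks are hex-encoded JSON anti-bot tokens; the three scenes visible in this commit decode to "kylin", "laputa", and "wireless-server-render". A quick decode of the value this hunk replaces (the new one, split across string fragments, appears to have lost a character in transit, so the old one is used here) shows the scene name while the inner token stays opaque:

# Old x5sec value from this hunk; hex-encoded ASCII JSON.
x5sec_old = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
print(bytes.fromhex(x5sec_old).decode('ascii'))
# -> {"kylin;2":"ffcbfd83b2fffb6c10add50b80b46a6bCMOpvYoGEN3s6YjWgObBzg..."}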
@@ -12,13 +12,15 @@ class 企业产品详情页面(Baes):
         super(企业产品详情页面, self).__init__()

     def run(self):
-        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1})
+        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1, "sign": 1})
         for s in res:
-            url = s.get('url')
-            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
-            x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
+            url = s.get('url').replace('detail', 'm')
+            sign = s.get('sign')
+            x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \
+                    "323835663332623033396233366663613833323639396433326236364350372b76346" \
+                    "f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d"
             headers = {
-                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
+                'cookie': f"x5sec={x5sec}"
             }
             response = requests.request("GET", url, headers=headers)

@@ -26,13 +28,17 @@ class 企业产品详情页面(Baes):
                 print(f"【{datetime.now()}】报错{url}")
                 exit()

+            if '全球领先的采购批发平台,批发网' in response.text:
+                print(f"【{datetime.now()}】报错{url}")
+                exit()
+
             item = {
                 "sign": self.generate_sign(url),
                 "url": url,
                 "content": response.text
             }
             self.col.insert_item('RAW_CONTENT', item)
-            self.col.update_item('RAW_URLS', item)
+            self.col.update_item('RAW_URLS', sign)
             time.sleep(10)
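Both spiders now route requests through the mobile host and treat any response carrying the 1688 marketing slogan as a block/captcha page, bailing out early. The host switch is a plain substring replace, so it would also match 'detail' anywhere else in a URL; a sketch of the rewrite, with the offer id borrowed from the extractor's fallback key purely as an example:

# Example offer URL; the id comes from extractor.py's fallback data key.
url = "https://detail.1688.com/offer/605462009362.html"
mobile_url = url.replace('detail', 'm')
print(mobile_url)  # https://m.1688.com/offer/605462009362.html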
1688/tool/__init__.py      (new file, 0 lines)
1688/tool/download_img.py  (new file, 29 lines)
1688/tool/download_img.py

@@ -0,0 +1,29 @@
from urllib.parse import urlparse
import settings
import requests
import os


def request_download(image_url, path):
    # Derive the file name from the URL path and write the body to disk.
    url_path = urlparse(image_url).path
    image_name = url_path.split("/")[-1]
    r = requests.get(image_url)
    with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
        f.write(r.content)


def mkdir(path):
    # Create the per-offer folder under excel_path if it does not exist yet.
    folder = os.path.exists(f"{settings.excel_path}{path}")
    if not folder:
        os.makedirs(f"{settings.excel_path}{path}")


def download_img(image_url, path):
    mkdir(path)
    request_download(image_url, path)


if __name__ == '__main__':
    image_url = "https://cbu01.alicdn.com/img/ibank/O1CN01daaXsL1dVskYx7T92_!!3193983742-0-cib.jpg"
    name = "test"
    download_img(image_url, name)
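As committed, `request_download` buffers each image fully in memory and has no timeout or status check. A hedged variant that streams to disk instead; the timeout, chunk size, and `raise_for_status` guard are illustrative additions, not part of this commit:

import os
from urllib.parse import urlparse
import requests

def request_download_streamed(image_url, dest_dir):
    # Same name derivation as the committed helper.
    image_name = urlparse(image_url).path.split("/")[-1]
    r = requests.get(image_url, stream=True, timeout=30)  # assumed timeout
    r.raise_for_status()  # fail loudly instead of writing an error page to disk
    with open(os.path.join(dest_dir, image_name), 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # assumed chunk size
            f.write(chunk)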