代码更新

This commit is contained in:
aiguigu 2021-10-20 09:28:21 +08:00
parent 21536e5c24
commit b58ade53d9
4 changed files with 23 additions and 24 deletions

View File

@ -46,11 +46,11 @@ class extractor(Baes):
offerUnit = globalData.get('tempModel').get('offerUnit')
images = globalData.get('images')
for image in images:
fullPathImageURI = image.get('fullPathImageURI')
download_img(fullPathImageURI, offerId)
print(f"{datetime.now()}】图片下载{fullPathImageURI}")
time.sleep(1)
# for image in images:
# fullPathImageURI = image.get('fullPathImageURI')
# download_img(fullPathImageURI, offerId)
# print(f"【{datetime.now()}】图片下载{fullPathImageURI}")
# time.sleep(1)
a_590893001984 = data.get('590893001984')
if not a_590893001984:
@ -72,8 +72,9 @@ class extractor(Baes):
detailUrl = globalData.get('detailModel').get('detailUrl')
item = {
"sign": self.generate_sign("https://detail.1688.com/offer/{}.html".format(offerId)),
"company_name": companyName,
"company_name_url": "https://detail.1688.com/offer/{}.html".format(offerId),
"url": "https://detail.1688.com/offer/{}.html".format(offerId),
"title": title,
"sub_categorys": sub_categorys,
"sub_colour_categorys": sub_colour_categorys,
@ -86,8 +87,7 @@ class extractor(Baes):
"unit_weight": unitWeight
}
print(json.dumps(item))
exit()
self.col.insert_item('CLEAN_CONTENT', item)
print(f"{datetime.now()}】解析{offerId}")

View File

@ -2,7 +2,7 @@ import os
MONGODB_CONF = {
'host': '192.168.5.151',
'host': '127.0.0.1',
'port': 27017,
'username': '',
'pwd': "",

View File

@ -10,15 +10,14 @@ class Film(Baes):
def __init__(self):
self.col = MongoDao()
self.url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum={}"
self.url = "https://shop1456245592469.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator." \
"4.79f525026COu37&pageNum={}"
super(Film, self).__init__()
def run(self):
for i in range(15, 24):
cookie2 = "1bdee7e6f5206d15ccfabb2cc828a2d1"
x5sec = "7b226b796c696e3b32223a2232386636646266333930343734353861333765356163386535" \
"35636232343339434a757676346f47454c434b357258693249655973674561437a59324f44497" \
"4d5463344e4473784d4f57426e355542227d"
for i in range(1, 33):
cookie2 = "1bd7858d65500ba53956ab164308ad2a"
x5sec = "7b226b796c696e3b32223a223932613866656331376631373065326331363635306638303764646635666438434f766276497347454c36483635715739704f7968674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
url = self.url.format(i).replace('detail', 'm')
headers = {
'cookie': f"cookie2={cookie2};x5sec={x5sec}"
@ -26,21 +25,23 @@ class Film(Baes):
response = requests.request("GET", url, headers=headers)
if '系统自动生成,请勿修改 100%' in response.text:
print(f"{datetime.now()}】报错{i}")
print(f"系统自动生成,请勿修改 100%{datetime.now()}】报错{i}")
exit()
if '全球领先的采购批发平台,批发网' in response.text:
print(f"{datetime.now()}】报错{i}")
print(f"全球领先的采购批发平台,批发网{datetime.now()}】报错{i}")
exit()
sel = Selector(text=response.text, type='html')
urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
shop_name = sel.xpath('//div[@class="name-wrap"]//a/text()').extract_first()
for url in urls:
item = {
"sign": self.generate_sign(url),
"url": url,
"stauts": "0"
"stauts": "0",
"shop_name": shop_name
}
self.col.insert_item('RAW_URLS', item)
time.sleep(10)

View File

@ -16,20 +16,18 @@ class 企业产品详情页面(Baes):
for s in res:
url = s.get('url').replace('detail', 'm')
sign = s.get('sign')
x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a2236653736" \
"323835663332623033396233366663613833323639396433326236364350372b76346" \
"f47454b7a58673776446d357578685145773563795068766a2f2f2f2f2f41513d3d227d"
x5sec = "7b22776972656c6573732d7365727665722d72656e6465723b32223a223532323634333863383734346532666230393835646164396366336533376664434d756776597347454a5056377266593972325539414561437a59324f4449794d5463344e4473784d4f584d6a3462342f2f2f2f2f77453d227d"
headers = {
'cookie': f"x5sec={x5sec}"
'Cookie': f"x5sec={x5sec}"
}
response = requests.request("GET", url, headers=headers)
if '系统自动生成,请勿修改 100%' in response.text:
print(f"{datetime.now()}】报错{url}")
print(f"系统自动生成,请勿修改 100%{datetime.now()}】报错{url}")
exit()
if '全球领先的采购批发平台,批发网' in response.text:
print(f"{datetime.now()}】报错{url}")
print(f"全球领先的采购批发平台,批发网{datetime.now()}】报错{url}")
exit()
item = {