Code update

aiguigu 2021-09-26 02:22:43 +08:00
parent 6352281fd5
commit dc5bfd9fbc
5 changed files with 109 additions and 0 deletions

0    1688/__init__.py    Normal file


@@ -0,0 +1,44 @@
from scrapy.selector import Selector
from dao.mongo_dao import MongoDao
from spider.baes import Baes
from datetime import datetime
import time
import requests


class Film(Baes):

    def __init__(self):
        self.col = MongoDao()
        self.url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum={}"
        super(Film, self).__init__()

    def run(self):
        # Walk the shop's offer-list pages and collect product detail URLs.
        for i in range(1, 24):
            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
            x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
            url = self.url.format(i)
            headers = {
                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
            }
            response = requests.request("GET", url, headers=headers)
            # This marker in the response is treated as a failed/blocked request; stop the crawl.
            if '系统自动生成,请勿修改 100%' in response.text:
                print(f"[{datetime.now()}] error on page {i}")
                exit()
            sel = Selector(text=response.text, type='html')
            urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
            for url in urls:
                item = {
                    "sign": self.generate_sign(url),
                    "url": url
                }
                self.col.insert_item('RAW_URLS', item)
            time.sleep(10)


if __name__ == '__main__':
    f = Film()
    f.run()


@@ -0,0 +1,41 @@
from dao.mongo_dao import MongoDao
from spider.baes import Baes
from datetime import datetime
import time
import requests


class 企业产品详情页面(Baes):
    # "Enterprise product detail page": fetches the raw HTML of every collected URL.

    def __init__(self):
        self.col = MongoDao()
        super(企业产品详情页面, self).__init__()

    def run(self):
        # Pick up all URLs that have not been crawled yet.
        res = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1})
        for s in res:
            url = s.get('url')
            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
            x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
            headers = {
                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
            }
            response = requests.request("GET", url, headers=headers)
            # Same block-page marker as in the listing spider; stop when it appears.
            if '系统自动生成,请勿修改 100%' in response.text:
                print(f"[{datetime.now()}] error on {url}")
                exit()
            item = {
                "sign": self.generate_sign(url),
                "url": url,
                "content": response.text
            }
            self.col.insert_item('RAW_CONTENT', item)
            self.col.update_item('RAW_URLS', item)
            time.sleep(10)


if __name__ == '__main__':
    f = 企业产品详情页面()
    f.run()
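
Both spiders above depend on dao/mongo_dao.py, which is not part of this commit. Below is a minimal sketch, assuming pymongo, of what a compatible MongoDao could look like; the insert_item/find_item/update_item signatures are inferred only from how the spiders call them, and the connection settings, database name, and handling of the "stauts"/"sign" fields are assumptions, not code from this repository.

from pymongo import MongoClient


class MongoDao(object):
    """Hypothetical helper matching the calls made by the spiders above."""

    def __init__(self, uri="mongodb://localhost:27017", db_name="spider_1688"):
        # Connection details are placeholders, not taken from this commit.
        self.db = MongoClient(uri)[db_name]

    def insert_item(self, collection, item):
        # Store the item; a "stauts" flag of "0" marks it as not yet crawled,
        # mirroring the filter used by the detail-page spider.
        doc = dict(item)
        doc.setdefault("stauts", "0")
        self.db[collection].insert_one(doc)

    def find_item(self, collection, query, projection):
        # Return the matching documents, e.g. all pending RAW_URLS rows.
        return list(self.db[collection].find(query, projection))

    def update_item(self, collection, item):
        # Mark the URL identified by its sign as crawled.
        self.db[collection].update_one({"sign": item["sign"]},
                                       {"$set": {"stauts": "1"}})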

0    1688/spider/__init__.py    Normal file

24    1688/spider/baes.py    Normal file

@@ -0,0 +1,24 @@
from urllib.parse import urlparse
import hashlib
import base64


class Baes(object):

    def generate_sign(self, url):
        """Generate an item sign from the URL via md5."""
        md5 = hashlib.md5()
        md5.update(self.check_domain_area(str(url)).encode('utf-8'))
        sign = base64.urlsafe_b64encode(md5.digest())
        sign = str(sign, encoding="utf-8").replace('==', '')
        return sign

    def check_domain_area(self, url):
        """Normalize the URL so that the domain area (top-level domain segment)
        and the protocol name are excluded from the md5 hash."""
        try:
            parsed_uri = urlparse(url)
            uri_netloc = parsed_uri.netloc
            uri_netloc_new = '.'.join(parsed_uri.netloc.split('.')[:-1])
            url = url.replace(uri_netloc, uri_netloc_new).replace('https', '').replace('http', '')
        finally:
            # Returning from finally also swallows parse errors and falls back to the raw url.
            return url
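
For reference, a short usage sketch of Baes.generate_sign; the listing URL is the one hard-coded in the Film spider, and the intermediate string in the comment simply follows from check_domain_area's replace calls:

from spider.baes import Baes

b = Baes()
url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum=1"
# check_domain_area drops the last netloc segment (".com") and the protocol name,
# so the string that actually gets hashed is
# "://dearbei.1688/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum=1"
print(b.check_domain_area(url))
print(b.generate_sign(url))  # url-safe base64 of the md5 digest, "==" padding stripped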