diff --git a/1688/__init__.py b/1688/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/1688/spider/1688企业产品列表页面.py b/1688/spider/1688企业产品列表页面.py
new file mode 100644
index 0000000..5f5d437
--- /dev/null
+++ b/1688/spider/1688企业产品列表页面.py
@@ -0,0 +1,44 @@
+from scrapy.selector import Selector
+from dao.mongo_dao import MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import time
+import requests
+
+
+class Film(Baes):
+
+    def __init__(self):
+        self.col = MongoDao()
+        self.url = "https://dearbei.1688.com/page/offerlist.htm?spm=a2615.7691456.autotrace-paginator.2&pageNum={}"
+        super(Film, self).__init__()
+
+    def run(self):
+        for i in range(1, 24):
+            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
+            x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
+            url = self.url.format(i)
+            headers = {
+                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
+            }
+            response = requests.request("GET", url, headers=headers)
+
+            if '系统自动生成,请勿修改 100%' in response.text:  # anti-bot page marker
+                print(f"[{datetime.now()}] error on page {i}")
+                exit()
+
+            sel = Selector(text=response.text, type='html')
+            urls = sel.xpath('//ul[@class="offer-list-row"]//div[@class="image"]/a/@href').extract()
+
+            for url in urls:
+                item = {
+                    "sign": self.generate_sign(url),
+                    "url": url
+                }
+                self.col.insert_item('RAW_URLS', item)
+            time.sleep(10)
+
+
+if __name__ == '__main__':
+    f = Film()
+    f.run()
diff --git a/1688/spider/1688企业产品详情页面.py b/1688/spider/1688企业产品详情页面.py
new file mode 100644
index 0000000..7896132
--- /dev/null
+++ b/1688/spider/1688企业产品详情页面.py
@@ -0,0 +1,41 @@
+from dao.mongo_dao import MongoDao
+from spider.baes import Baes
+from datetime import datetime
+import time
+import requests
+
+
+class 企业产品详情页面(Baes):
+
+    def __init__(self):
+        self.col = MongoDao()
+        super(企业产品详情页面, self).__init__()
+
+    def run(self):
+        res = self.col.find_item('RAW_URLS', {"status": "0"}, {"url": 1})
+        for s in res:
+            url = s.get('url')
+            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
+            x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
+            headers = {
+                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
+            }
+            response = requests.request("GET", url, headers=headers)
+
+            if '系统自动生成,请勿修改 100%' in response.text:  # anti-bot page marker
+                print(f"[{datetime.now()}] error at {url}")
+                exit()
+
+            item = {
+                "sign": self.generate_sign(url),
+                "url": url,
+                "content": response.text
+            }
+            self.col.insert_item('RAW_CONTENT', item)
+            self.col.update_item('RAW_URLS', item)
+            time.sleep(10)
+
+
+if __name__ == '__main__':
+    f = 企业产品详情页面()
+    f.run()
diff --git a/1688/spider/__init__.py b/1688/spider/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/1688/spider/baes.py b/1688/spider/baes.py
new file mode 100644
index 0000000..e906366
--- /dev/null
+++ b/1688/spider/baes.py
@@ -0,0 +1,25 @@
+from urllib.parse import urlparse
+import hashlib
+import base64
+
+
+class Baes(object):
+
+    def generate_sign(self, url):
+        """Generate a url-safe sign for the URL via MD5."""
+        md5 = hashlib.md5()
+        md5.update(self.check_domain_area(str(url)).encode('utf-8'))
+        sign = base64.urlsafe_b64encode(md5.digest())
+        sign = str(sign, encoding="utf-8").replace('==', '')
+        return sign
+
+    def check_domain_area(self, url):
+        """Exclude the domain's regional TLD and the protocol scheme from the MD5 hash."""
+        try:
+            parsed_uri = urlparse(url)
+            uri_netloc = parsed_uri.netloc
+            uri_netloc_new = '.'.join(parsed_uri.netloc.split('.')[:-1])
+            url = url.replace(uri_netloc, uri_netloc_new).replace('https', '').replace('http', '')
+        except Exception:
+            pass  # fall back to the raw url if parsing fails
+        return url
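As a quick illustration of how the two Baes helpers compose on the URLs this diff crawls (the expected outputs follow directly from the code above): the scheme and the last domain label are stripped before hashing, so the http and https variants of the same offer page map to the same sign and deduplicate to one row.

    from spider.baes import Baes

    b = Baes()
    # The scheme and the trailing domain label are removed before hashing:
    print(b.check_domain_area("https://dearbei.1688.com/page/offerlist.htm"))
    # -> ://dearbei.1688/page/offerlist.htm
    print(b.generate_sign("https://dearbei.1688.com/page/offerlist.htm") ==
          b.generate_sign("http://dearbei.1688.com/page/offerlist.htm"))
    # -> True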
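Both spiders import dao.mongo_dao.MongoDao, but dao/mongo_dao.py is not part of this diff. A minimal pymongo-based stand-in that satisfies the three call sites (insert_item, find_item, update_item) might look like the sketch below; the connection settings, database name, and the status workflow ("0" = queued, "1" = crawled, inferred from the {"status": "0"} query in 1688企业产品详情页面.py) are assumptions, not the project's actual implementation.

    # dao/mongo_dao.py -- hypothetical stand-in; the real module is not in this diff.
    from pymongo import MongoClient


    class MongoDao(object):

        def __init__(self, uri="mongodb://localhost:27017", db_name="spider_1688"):
            # Assumed connection details; adjust to the real deployment.
            self.db = MongoClient(uri)[db_name]

        def insert_item(self, collection, item):
            # Upsert on "sign" so re-running a spider does not create duplicates;
            # new rows start in status "0" (not yet crawled) -- an assumption.
            self.db[collection].update_one(
                {"sign": item["sign"]},
                {"$set": item, "$setOnInsert": {"status": "0"}},
                upsert=True,
            )

        def find_item(self, collection, query, projection):
            # Matches the call find_item('RAW_URLS', {"status": "0"}, {"url": 1}).
            return self.db[collection].find(query, projection)

        def update_item(self, collection, item):
            # Mark the source row as crawled once its detail page is stored.
            self.db[collection].update_one(
                {"sign": item["sign"]}, {"$set": {"status": "1"}}
            )

With a DAO along these lines, the list spider fills RAW_URLS, and the detail spider drains the status-"0" backlog into RAW_CONTENT while flipping each row to "1".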