mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 03:59:57 +08:00
代码更新
This commit is contained in:
parent
6352281fd5
commit
dc5bfd9fbc
0
1688/__init__.py
Normal file
0
1688/__init__.py
Normal file
44
1688/spider/1688企业产品列表页面.py
Normal file
44
1688/spider/1688企业产品列表页面.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
from scrapy.selector import Selector
|
||||||
|
from dao.mongo_dao import MongoDao
|
||||||
|
from spider.baes import Baes
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class Film(Baes):
    """Spider for the 1688 shop product-list pages.

    Walks the paginated offer list and stores every product detail URL
    (together with a dedup sign) into the RAW_URLS Mongo collection.
    """

    def __init__(self):
        # Mongo access helper used to persist the scraped URLs.
        self.col = MongoDao()
        # Offer-list page template; the placeholder receives the page number.
        self.url = (
            "https://dearbei.1688.com/page/offerlist.htm"
            "?spm=a2615.7691456.autotrace-paginator.2&pageNum={}"
        )
        super(Film, self).__init__()

    def run(self):
        """Crawl list pages 1-23, aborting as soon as the anti-bot page appears."""
        for page_num in range(1, 24):
            # Hard-coded session cookie; x5sec is the 1688 anti-bot token
            # (short-lived — presumably refreshed by hand; TODO confirm).
            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
            x5sec = "7b226b796c696e3b32223a226666636266643833623266666662366331306164643530623830623436613662434d4f7076596f47454e337336596a57674f62427a674561437a59324f4449794d5463344e4473784d4f57426e355542227d"
            headers = {
                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
            }
            page_url = self.url.format(page_num)
            response = requests.request("GET", page_url, headers=headers)

            # The block/captcha page carries this marker; stop the whole run
            # instead of inserting garbage into the collection.
            if '系统自动生成,请勿修改 100%' in response.text:
                print(f"【{datetime.now()}】报错{page_num}")
                exit()

            selector = Selector(text=response.text, type='html')
            detail_urls = selector.xpath(
                '//ul[@class="offer-list-row"]//div[@class="image"]/a/@href'
            ).extract()

            for detail_url in detail_urls:
                self.col.insert_item('RAW_URLS', {
                    "sign": self.generate_sign(detail_url),
                    "url": detail_url
                })

            # Throttle between list pages to stay under the radar.
            time.sleep(10)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: crawl all list pages once.
    Film().run()
|
41
1688/spider/1688企业产品详情页面.py
Normal file
41
1688/spider/1688企业产品详情页面.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from dao.mongo_dao import MongoDao
|
||||||
|
from spider.baes import Baes
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class 企业产品详情页面(Baes):
    """Spider for the 1688 product detail pages.

    Reads pending URLs from RAW_URLS, downloads each page and stores the
    raw HTML into RAW_CONTENT, then marks the source row via update_item.
    """

    def __init__(self):
        # Mongo access helper shared by all persistence calls below.
        self.col = MongoDao()
        super(企业产品详情页面, self).__init__()

    def run(self):
        """Fetch every not-yet-processed URL; abort on the anti-bot page."""
        # NOTE(review): "stauts" looks like a typo for "status" — kept
        # byte-for-byte because it must match what MongoDao stores; verify.
        pending = self.col.find_item('RAW_URLS', {"stauts": "0"}, {"url": 1})
        for row in pending:
            url = row.get('url')
            # Hard-coded session cookie; x5sec is the 1688 anti-bot token.
            cookie2 = "1e3cee17580ffb0eea62cdaec87c7771"
            x5sec = "7b226c61707574613b32223a223936636266303531633230613132626262646165393438306666303931336364434d625076596f4745506a44375a4f6f706f58416f514561437a59324f4449794d5463344e4473314d50617371536f3d227d"
            headers = {
                'cookie': f"cookie2={cookie2};x5sec={x5sec}"
            }
            response = requests.request("GET", url, headers=headers)

            # Block/captcha marker: stop rather than store a bogus page.
            if '系统自动生成,请勿修改 100%' in response.text:
                print(f"【{datetime.now()}】报错{url}")
                exit()

            item = {
                "sign": self.generate_sign(url),
                "url": url,
                "content": response.text
            }
            self.col.insert_item('RAW_CONTENT', item)
            self.col.update_item('RAW_URLS', item)

            # Throttle between detail pages.
            time.sleep(10)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: process all pending detail URLs once.
    企业产品详情页面().run()
|
0
1688/spider/__init__.py
Normal file
0
1688/spider/__init__.py
Normal file
24
1688/spider/baes.py
Normal file
24
1688/spider/baes.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
from urllib.parse import urlparse
|
||||||
|
import hashlib
|
||||||
|
import base64
|
||||||
|
|
||||||
|
|
||||||
|
class Baes(object):
    """Shared spider base: URL normalisation and dedup-sign generation.

    NOTE(review): the class name looks like a typo for "Base"; kept as-is
    because both spider modules subclass it under this exact name.
    """

    def generate_sign(self, url):
        """Return a short URL-safe dedup signature for *url*.

        The URL is first normalised by check_domain_area() so the scheme
        and the last host label do not influence the signature, then
        MD5-hashed and base64url-encoded with the '==' padding stripped.
        """
        normalised = self.check_domain_area(str(url)).encode('utf-8')
        sign = base64.urlsafe_b64encode(hashlib.md5(normalised).digest())
        return str(sign, encoding="utf-8").replace('==', '')

    def check_domain_area(self, url):
        """Strip the scheme and the last dot-separated host label from *url*.

        E.g. "https://shop.1688.com/x" -> "://shop.1688/x".  On any parse
        error the value is returned unchanged (best effort: a sign is still
        produced for malformed input).
        """
        try:
            parsed_uri = urlparse(url)
            uri_netloc = parsed_uri.netloc
            # Drop the last label (usually the TLD).
            uri_netloc_new = '.'.join(uri_netloc.split('.')[:-1])
            # NOTE(review): str.replace removes 'https'/'http' anywhere in
            # the string, not just the scheme prefix — kept for backward
            # compatibility with signs already stored in MongoDB.
            url = url.replace(uri_netloc, uri_netloc_new).replace('https', '').replace('http', '')
        except Exception:
            # Bug fix: the original used `try/finally: return url`, which
            # silently swallowed *every* exception — including
            # KeyboardInterrupt and SystemExit (flake8 B012).  Catching
            # Exception keeps the best-effort contract without masking
            # interpreter-exit signals.
            pass
        return url
|
Loading…
x
Reference in New Issue
Block a user