mirror of https://github.com/luzhisheng/js_reverse.git (synced 2025-04-21 00:25:09 +08:00)
Fix code (修复代码)
This commit is contained in:
parent 1f6ddc35ba
commit cdac194193
@@ -54,8 +54,9 @@ class 图片下载(Baes):
                 break
         print(f"【{datetime.now()}】图片下载{OptionImageUrl}")

-        res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+        # res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
         print(f"【{datetime.now()}】完成 {res}")
+        exit()


 if __name__ == '__main__':
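Note on the hunk above: with the update_one call commented out, res is never assigned, so the print(f"...完成 {res}") that follows would raise a NameError before exit() runs. If the intent was to disable only the MongoDB status write while debugging, a placeholder assignment keeps the code runnable (a minimal sketch, not part of the commit; the res = None stand-in is hypothetical):

    # hypothetical debugging variant, not in the commit:
    res = None  # stand-in so the print below still has a value to show
    # res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
    print(f"【{datetime.now()}】完成 {res}")
    exit()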
1688/spider/图片下载协程.py (new file, 56 lines)
@@ -0,0 +1,56 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import gevent
+import gevent.monkey
+from gevent import Greenlet
+gevent.monkey.patch_all(thread=False, select=False)
+
+
+class 图片下载(Baes):
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path):
+        try:
+            url_path = urlparse(image_url).path
+            image_name = url_path.split("/")[-1]
+            r = requests.get(image_url)
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            print(f"【{datetime.now()}】图片下载{image_url}")
+            return 1
+        except Exception as e:
+            return -1
+
+    def mkdir(self, path):
+        folder = os.path.exists(f"{settings.excel_path}{path}")
+        if not folder:
+            os.makedirs(f"{settings.excel_path}{path}")
+
+    def download_img(self, image_url, path):
+        self.mkdir(path)
+        return self.request_download(image_url, path)
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for s in res:
+            img_list = []
+            id = s.get('id')
+            for img_url in s.get('images'):
+                if img_url.get('imageURI'):
+                    fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI')
+                    img_list.append(fullPathImageURI)
+            dowload_jobs = [Greenlet.spawn(self.download_img, i, id) for i in img_list]
+            gevent.joinall(dowload_jobs)
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
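The new script's concurrency comes from gevent's cooperative greenlets: monkey.patch_all() replaces the blocking socket primitives so each requests.get() yields to the event loop while waiting on the network, and Greenlet.spawn plus gevent.joinall fan one download job out per image. A self-contained sketch of that pattern (the URLs are placeholders, not from the repo):

    import gevent.monkey
    gevent.monkey.patch_all(thread=False, select=False)  # patch blocking socket I/O

    import gevent
    import requests
    from gevent import Greenlet

    def fetch(url):
        # requests.get() blocks, but the patched sockets yield to the gevent
        # hub, so the other greenlets keep running while this one waits
        return url, requests.get(url).status_code

    urls = ['https://example.com/a.jpg', 'https://example.com/b.jpg']  # placeholders
    jobs = [Greenlet.spawn(fetch, u) for u in urls]
    gevent.joinall(jobs)
    for job in jobs:
        print(job.value)  # the (url, status_code) tuple returned by fetch

Passing thread=False and select=False, as the new file does, leaves real OS threads unpatched, presumably so the MongoDB driver's background threads keep behaving normally.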
(deleted file, 56 lines)
@@ -1,56 +0,0 @@
-from lxml import etree
-from time import time
-import asyncio
-import aiohttp
-
-url = 'https://movie.douban.com/top250'
-
-
-async def fetch_content(url):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            return await response.text()
-
-
-async def parse(url):
-    page = await fetch_content(url)
-    html = etree.HTML(page)
-
-    xpath_movie = '//*[@id="content"]/div/div[1]/ol/li'
-    xpath_title = './/span[@class="title"]'
-    xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'
-
-    pages = html.xpath(xpath_pages)
-    fetch_list = []
-    result = []
-
-    for element_movie in html.xpath(xpath_movie):
-        result.append(element_movie)
-
-    for p in pages:
-        fetch_list.append(url + p.get('href'))
-
-    tasks = [fetch_content(url) for url in fetch_list]
-    pages = await asyncio.gather(*tasks)
-
-    for page in pages:
-        html = etree.HTML(page)
-        for element_movie in html.xpath(xpath_movie):
-            result.append(element_movie)
-
-    for i, movie in enumerate(result, 1):
-        title = movie.find(xpath_title).text
-        print(i, title)
-
-
-def main():
-    loop = asyncio.get_event_loop()
-    start = time()
-    for i in range(5):
-        loop.run_until_complete(parse(url))
-    end = time()
-    print('Cost {} seconds'.format((end - start) / 5))
-    loop.close()
-
-
-main()
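For reference, the deleted benchmark drove its coroutine through asyncio.get_event_loop().run_until_complete(...); on current Python versions calling get_event_loop() outside a running loop is deprecated, and asyncio.run() is the usual entrypoint. A sketch of the same timing loop with the modern API (the parse coroutine here is a placeholder for the removed one):

    import asyncio
    from time import time

    async def parse(url):
        await asyncio.sleep(0)  # placeholder for the removed fetch-and-parse work

    def main():
        start = time()
        for _ in range(5):
            # asyncio.run creates and closes a fresh event loop on each call
            asyncio.run(parse('https://movie.douban.com/top250'))
        print('Cost {} seconds'.format((time() - start) / 5))

    main()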