diff --git a/1688/spider/图片下载.py b/1688/spider/图片下载.py
index 2f81951..d1a5212 100644
--- a/1688/spider/图片下载.py
+++ b/1688/spider/图片下载.py
@@ -54,8 +54,9 @@ class 图片下载(Baes):
                     break
 
             print(f"【{datetime.now()}】图片下载{OptionImageUrl}")
-            res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
-            print(f"【{datetime.now()}】完成 {res}")
+            # res = self.client['CLEAN_CONTENT'].update_one({"sign": sign}, {"$set": {"download_img_status": 2}})
+            print(f"【{datetime.now()}】完成")  # {res} dropped: the name is undefined once update_one is commented out
+            exit()  # debug stop: handle a single document per run
 
 
 if __name__ == '__main__':
diff --git a/1688/spider/图片下载协程.py b/1688/spider/图片下载协程.py
new file mode 100644
index 0000000..366d507
--- /dev/null
+++ b/1688/spider/图片下载协程.py
@@ -0,0 +1,58 @@
+from urllib.parse import urlparse
+import settings
+import requests
+import os
+from dao.mongo_dao import MyMongodb
+from spider.baes import Baes
+from datetime import datetime
+import gevent
+import gevent.monkey
+from gevent import Greenlet
+gevent.monkey.patch_all(thread=False, select=False)  # make socket I/O cooperative so requests yields between greenlets
+
+
+class 图片下载(Baes):
+
+    def __init__(self):
+        self.client = MyMongodb().db
+        super(图片下载, self).__init__()
+
+    def request_download(self, image_url, path):
+        try:
+            url_path = urlparse(image_url).path
+            image_name = url_path.split("/")[-1]
+            r = requests.get(image_url, timeout=30)  # a stalled request must not block the greenlet forever
+            with open(f'{settings.excel_path}{path}/{image_name}', 'wb') as f:
+                f.write(r.content)
+            print(f"【{datetime.now()}】图片下载{image_url}")
+            return 1
+        except Exception as e:
+            print(f"【{datetime.now()}】图片下载失败 {image_url} {e}")
+            return -1
+
+    def mkdir(self, path):
+        folder = os.path.exists(f"{settings.excel_path}{path}")
+        if not folder:
+            os.makedirs(f"{settings.excel_path}{path}")
+
+    def download_img(self, image_url, path):
+        self.mkdir(path)
+        return self.request_download(image_url, path)
+
+    def run(self):
+        res = self.client['CLEAN_CONTENT'].find({"download_img_status": 0}).batch_size(1)
+        for s in res:
+            img_list = []
+            id = s.get('id')
+            for img_url in s.get('images'):
+                if img_url.get('imageURI'):
+                    fullPathImageURI = "https://cbu01.alicdn.com/img/ibank/" + img_url.get('imageURI')
+                    img_list.append(fullPathImageURI)
+            # one greenlet per image; joinall waits for the whole batch before the next document
+            download_jobs = [Greenlet.spawn(self.download_img, i, id) for i in img_list]
+            gevent.joinall(download_jobs)
+
+
+if __name__ == '__main__':
+    img = 图片下载()
+    img.run()
diff --git a/1688/spider/图片下载多线程.py b/1688/spider/图片下载多线程.py
deleted file mode 100644
index 8a73ec2..0000000
--- a/1688/spider/图片下载多线程.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from lxml import etree
-from time import time
-import asyncio
-import aiohttp
-
-url = 'https://movie.douban.com/top250'
-
-
-async def fetch_content(url):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            return await response.text()
-
-
-async def parse(url):
-    page = await fetch_content(url)
-    html = etree.HTML(page)
-
-    xpath_movie = '//*[@id="content"]/div/div[1]/ol/li'
-    xpath_title = './/span[@class="title"]'
-    xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'
-
-    pages = html.xpath(xpath_pages)
-    fetch_list = []
-    result = []
-
-    for element_movie in html.xpath(xpath_movie):
-        result.append(element_movie)
-
-    for p in pages:
-        fetch_list.append(url + p.get('href'))
-
-    tasks = [fetch_content(url) for url in fetch_list]
-    pages = await asyncio.gather(*tasks)
-
-    for page in pages:
-        html = etree.HTML(page)
-        for element_movie in html.xpath(xpath_movie):
-            result.append(element_movie)
-
-    for i, movie in enumerate(result, 1):
-        title = movie.find(xpath_title).text
-        print(i, title)
-
-
-def main():
-    loop = asyncio.get_event_loop()
-    start = time()
-    for i in range(5):
-        loop.run_until_complete(parse(url))
-    end = time()
-    print('Cost {} seconds'.format((end - start) / 5))
-    loop.close()
-
-
-main()