From 1f6ddc35ba99a0dd4f5dea1a51a7ca6019aafba1 Mon Sep 17 00:00:00 2001
From: aiguigu
Date: Mon, 20 Dec 2021 01:59:13 +0800
Subject: [PATCH] Fix code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 1688/spider/图片下载多线程.py | 56 +++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 1688/spider/图片下载多线程.py

diff --git a/1688/spider/图片下载多线程.py b/1688/spider/图片下载多线程.py
new file mode 100644
index 0000000..8a73ec2
--- /dev/null
+++ b/1688/spider/图片下载多线程.py
@@ -0,0 +1,56 @@
+from lxml import etree
+from time import time
+import asyncio
+import aiohttp
+
+url = 'https://movie.douban.com/top250'
+
+
+async def fetch_content(url):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            return await response.text()
+
+
+async def parse(url):
+    page = await fetch_content(url)
+    html = etree.HTML(page)
+
+    xpath_movie = '//*[@id="content"]/div/div[1]/ol/li'
+    xpath_title = './/span[@class="title"]'
+    xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'
+
+    pages = html.xpath(xpath_pages)
+    fetch_list = []
+    result = []
+
+    for element_movie in html.xpath(xpath_movie):
+        result.append(element_movie)
+
+    for p in pages:
+        fetch_list.append(url + p.get('href'))
+
+    tasks = [fetch_content(url) for url in fetch_list]
+    pages = await asyncio.gather(*tasks)
+
+    for page in pages:
+        html = etree.HTML(page)
+        for element_movie in html.xpath(xpath_movie):
+            result.append(element_movie)
+
+    for i, movie in enumerate(result, 1):
+        title = movie.find(xpath_title).text
+        print(i, title)
+
+
+def main():
+    loop = asyncio.get_event_loop()
+    start = time()
+    for i in range(5):
+        loop.run_until_complete(parse(url))
+    end = time()
+    print('Cost {} seconds'.format((end - start) / 5))
+    loop.close()
+
+
+main()
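
Note on the new module (not part of the patch): main() manages the event loop by
hand (get_event_loop / run_until_complete / close), which still works but predates
asyncio.run() from Python 3.7. A minimal driver sketch, assuming Python 3.7+ and
that parse() and url from the patched module are importable; the import name below
is hypothetical and depends on how the package is laid out:

    import asyncio
    from time import time

    from 图片下载多线程 import parse, url  # hypothetical import of the new module

    def timed_main(runs=5):
        # Average wall-clock time over `runs` full crawls, mirroring main().
        start = time()
        for _ in range(runs):
            asyncio.run(parse(url))  # asyncio.run creates and closes a fresh loop each call
        print('Cost {} seconds'.format((time() - start) / runs))

    if __name__ == '__main__':
        timed_main()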