这里我测试了一下爬取豆瓣电影top250的所有图片,为了便于测试,我没有每一页进入爬取大图,只是在每个list抓取小图,测试结果是成功的,接下来继续测试python的多线程给爬虫提速.
#coding:utf-8
import os
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
# Target directory for downloaded poster images; create it up front.
download_path = './douban'
# exist_ok=True is idempotent: no race between an exists() check and creation.
os.makedirs(download_path, exist_ok=True)
def download_pic(url):
    """Download every movie poster thumbnail found on one Top250 list page.

    Fetches *url*, collects each <img> inside the page's main 'article'
    div, and saves it as '<alt>.jpg' under download_path.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    r = requests.get(url, headers=headers)
    r.raise_for_status()  # fail loudly on 403/404 instead of parsing an error page
    soup = BeautifulSoup(r.text, 'lxml')
    # Grab the whole 'article' div: searching the 'pic' div directly only
    # yields the first image on the page.
    content = soup.find('div', class_='article')
    if content is None:
        # Layout changed, or we were served a captcha/anti-bot page.
        return
    for image in content.find_all('img'):
        link = image.get('src')
        name = image.get('alt')
        if not link or not name:
            continue  # skip decorative images without a src/alt pair
        # Strip characters that are illegal in Windows file names.
        safe_name = ''.join('_' if c in '\\/:*?"<>|' else c for c in name)
        # Download via requests with the same headers: urlretrieve sends
        # urllib's default User-Agent, which douban rejects with 403.
        resp = requests.get(link, headers=headers)
        resp.raise_for_status()
        with open(f'{download_path}/{safe_name}.jpg', 'wb') as f:
            f.write(resp.content)
    print(f'{url}所有电影图片下载完成')
def main():
    """Build the ten Top250 list-page URLs and download each page's posters sequentially."""
    page_urls = ['https://movie.douban.com/top250']
    page_urls += [
        f'https://movie.douban.com/top250?start={page * 25}&filter='
        for page in range(1, 10)
    ]
    print(page_urls)
    for page_url in page_urls:
        download_pic(page_url)


main()
接下来时隔一天,接入多线程进行爬取,发现速度快了很多
看一下速度的对比
一个耗时47秒近48秒的时间,提速之后耗时9秒,开启的是10个线程的加速.
以下为改造后的代码
#coding:utf-8
import os
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import time
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
# Target directory for downloaded poster images; create it up front.
download_path = './douban'
# exist_ok=True is idempotent: no race between an exists() check and creation.
os.makedirs(download_path, exist_ok=True)
# 'title' is a cmd.exe builtin; only set the console title on Windows so
# other platforms don't spawn a shell just to print "title: not found".
if os.name == 'nt':
    os.system('title 豆瓣电影Top250图片下载 @小伍的游乐场 E_Page')  # 设置窗口标题 (set console window title)
def download_pic(url):
    """Download every movie poster thumbnail found on one Top250 list page.

    Fetches *url*, collects each <img> inside the page's main 'article'
    div, and saves it as '<alt>.jpg' under download_path.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    r = requests.get(url, headers=headers)
    r.raise_for_status()  # fail loudly on 403/404 instead of parsing an error page
    soup = BeautifulSoup(r.text, 'lxml')
    # Grab the whole 'article' div: searching the 'pic' div directly only
    # yields the first image on the page.
    content = soup.find('div', class_='article')
    if content is None:
        # Layout changed, or we were served a captcha/anti-bot page.
        return
    for image in content.find_all('img'):
        link = image.get('src')
        name = image.get('alt')
        if not link or not name:
            continue  # skip decorative images without a src/alt pair
        # Strip characters that are illegal in Windows file names.
        safe_name = ''.join('_' if c in '\\/:*?"<>|' else c for c in name)
        # Download via requests with the same headers: urlretrieve sends
        # urllib's default User-Agent, which douban rejects with 403.
        resp = requests.get(link, headers=headers)
        resp.raise_for_status()
        with open(f'{download_path}/{safe_name}.jpg', 'wb') as f:
            f.write(resp.content)
    print(f'{url}所有电影图片下载完成')
def main():
    """Fan the ten Top250 list pages out to a thread pool and report the wall time.

    Downloading is I/O-bound, so 10 worker threads overlap the network
    waits and fetch every page concurrently.
    """
    page_urls = ['https://movie.douban.com/top250']
    page_urls += [
        f'https://movie.douban.com/top250?start={page * 25}&filter='
        for page in range(1, 10)
    ]
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(download_pic, page_url) for page_url in page_urls]
        # Block until every page has finished before the pool shuts down.
        wait(futures, return_when=ALL_COMPLETED)
    end_time = time.time()
    print('=' * 50)
    print(f'运行时间为:{end_time - start_time}')


main()