这里我测试了一下爬取豆瓣电影top250的所有图片,为了便于测试,我没有每一页进入爬取大图,只是在每个list抓取小图,测试结果是成功的,接下来继续测试python的多线程给爬虫提速.

#coding:utf-8
import os
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

# Directory where all downloaded poster images are stored.
download_path = './douban'
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(download_path, exist_ok=True)


def download_pic(url):
    """Download every movie poster thumbnail found on one Top250 list page.

    url: a douban Top250 list page, e.g. https://movie.douban.com/top250?start=25
    Each image is saved as <download_path>/<movie title>.jpg.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # Grab the whole article div: selecting only the 'pic' class yields just
    # the first poster per page, so collect every <img> under the article.
    content = soup.find('div', class_='article')
    if content is None:
        # Page layout changed or the request was blocked -- nothing to save.
        return
    for image in content.find_all('img'):
        link = image.get('src')   # thumbnail URL
        name = image.get('alt')   # movie title, used as the file name
        if not link or not name:
            continue
        # Fetch with the same browser headers instead of urlretrieve, which
        # sends a default Python User-Agent that douban may reject.
        resp = requests.get(link, headers=headers, timeout=10)
        with open(f'{download_path}/{name}.jpg', 'wb') as f:
            f.write(resp.content)
    print(f'{url}所有电影图片下载完成')




def main():
    """Crawl every page of the douban movie Top250 list, page by page."""
    # Page 1 has no query string; pages 2-10 use start=25, 50, ..., 225.
    page_urls = ['https://movie.douban.com/top250']
    page_urls += [
        f'https://movie.douban.com/top250?start={page * 25}&filter='
        for page in range(1, 10)
    ]
    print(page_urls)

    for page_url in page_urls:
        download_pic(page_url)


main()

接下来时隔一天,接入多线程进行爬取,发现速度快了很多
看一下速度的对比

单线程版本耗时约47~48秒,提速之后耗时9秒,开启的是10个线程(ThreadPoolExecutor线程池)的加速.
以下为改造后的代码

#coding:utf-8
import os
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import time
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
# Directory where all downloaded poster images are stored.
download_path = './douban'
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(download_path, exist_ok=True)

# Set the console window title ('title' is a Windows cmd builtin; on other
# platforms the command simply fails without raising).
os.system('title 豆瓣电影Top250图片下载 @小伍的游乐场 E_Page')


def download_pic(url):
    """Download every movie poster thumbnail found on one Top250 list page.

    url: a douban Top250 list page, e.g. https://movie.douban.com/top250?start=25
    Each image is saved as <download_path>/<movie title>.jpg.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # Grab the whole article div: selecting only the 'pic' class yields just
    # the first poster per page, so collect every <img> under the article.
    content = soup.find('div', class_='article')
    if content is None:
        # Page layout changed or the request was blocked -- nothing to save.
        return
    for image in content.find_all('img'):
        link = image.get('src')   # thumbnail URL
        name = image.get('alt')   # movie title, used as the file name
        if not link or not name:
            continue
        # Fetch with the same browser headers instead of urlretrieve, which
        # sends a default Python User-Agent that douban may reject.
        resp = requests.get(link, headers=headers, timeout=10)
        with open(f'{download_path}/{name}.jpg', 'wb') as f:
            f.write(resp.content)
    print(f'{url}所有电影图片下载完成')




def main():
    """Crawl all 10 Top250 list pages concurrently and report elapsed time."""
    # Page 1 has no query string; pages 2-10 use start=25, 50, ..., 225.
    start_urls = ['https://movie.douban.com/top250']
    for i in range(1, 10):
        start_urls.append(f'https://movie.douban.com/top250?start={i * 25}&filter=')

    start_time = time.time()

    # One worker per page. The wait() belongs inside the `with` block: the
    # context exit already joins every worker, so placing it afterwards (as
    # the original did) made it a no-op.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(download_pic, url) for url in start_urls]
        wait(futures, return_when=ALL_COMPLETED)

    end_time = time.time()
    print('=' * 50)
    print(f'运行时间为:{end_time - start_time}')


main()

最后修改:2022 年 12 月 05 日
如果觉得我的文章对你有用,请随意赞赏