文章

Python爬虫实例:爬某草榴网站,并且down图片到本地

#coding:gbk
import requests
from bs4 import BeautifulSoup as bs
import re
import os
import socket
import time
import threading


def url_open(url):
    socket.setdefaulttimeout(20)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    res = requests.get(url, headers=headers)

    return res


def save(url):  # 传入每个子网页链接
    res = url_open(url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    title = soup.find('title').text.split('-')[0]  # 标题
    #os.mkdir(title)
    # os.chdir(title)
    temp = soup.find_all('tr', class_='tr3')
    img = re.findall(r'data-src="(.*?jpg)" type', str(temp))
    
    
    imglist = []

    for each in img:
        imglist.append(each)
    for each in imglist:
        filename = each.split('/')[-1]
        img = url_open(each)
        print('saving...+%s'%filename)

        with open(title+filename, 'wb')as f:
            f.write(img.content)
    #os.chdir('..')




if __name__ == '__main__':
    os.makedirs('1024', exist_ok=True)
    os.chdir('1024')
    url = 'http://cl.7ar.xyz/thread0806.php?fid=16&search=&page=1'  #默认爬取第一个页面,毕竟要注意身体,需要多个页面的话,自己加个for循环也不是什么难事~
    urlhead = 'http://cl.7ar.xyz/'   #页面解析出来的连接前面需要加上这个头才能打开,根据多年经验这个头是会变的,如果哪天不能用了自己看下是不是这个头变了
    res = url_open(url)
    res.encoding = 'gbk'

    '''找到页面中的所有子网页'''
    soup = bs(res.text, 'lxml')
    temp = soup.find_all('td', class_="tal")
    link = []
    for each in temp:
        link.append(urlhead + each.h3.a.get('href'))
    # del link[0:10]

    downloads = []
    for each in link:
        print(each)

        down = threading.Thread(target=save, args=[each])
        downloads.append(down)

        down.start()
    for each in downloads:
        each.join()
    print('Done')
    input("\n欢迎使用你伍大爷为你制作的小工具,微信号:e_page。按任意键退出!")

原文来自:Python爬虫实例:爬某草榴网站,并且down图片到本地,尊重自己,尊重每一个人;转发请注明来源!
0 0

发表评论