文章

爬取上一篇文章的主页及分页所有图片(更新完结)

mark一下,明天继续
没写完,只写到了获取分页的最大page数量
2020年4月5日0:49
其实代码不多,由于不熟悉码了很久...

import re
import requests
import os
from bs4 import BeautifulSoup
import lxml

down_path = './美图11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)
    return first_pic_url

def lastpage(content):
    """Return (and print) the gallery's last page number scraped from the "最后页" link."""
    urls2 = []  # NOTE(review): unused local
    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    print(lastpage)
    return lastpage

def main():
    """First draft of the crawl loop (author notes it is unfinished)."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)
    for i in fenxi(html):
        urls.append('https://www.umtuba.com' + i)
        for url in urls:  # NOTE(review): nested under the append loop — earlier urls are re-fetched every iteration
            aurl = theuseragent(url)
            print(url)
            lastpage(aurl)
            print(lastpage())  # NOTE(review): lastpage() requires an argument — this line raises TypeError

    # print('已经加载完毕,请问您要下载第几页 : ')
    # soup = BeautifulSoup(html.content,'lxml')  # a bare requests response can't feed BeautifulSoup, hence .content
    # print(soup.title.string)  # .title returns the tag with markup, .string its bare text



main()

这些url的拼接真是让我第一次感觉到int和str之间转换的麻烦,耗时耗力了蛮久,2020年4月5日12:04更新,赶紧保存,进度已经只差download了,写完程序基本就完成了,之后就是如果再出现问题就引入time或者恶补一下网络i/o延迟的处理了

import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time

down_path = './美图11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)
    return first_pic_url

def lastpage(content):
    """Return the gallery's last page number scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage

def down_pic(content):
    """Stub: print the names matched in *content*; no downloading yet."""
    picnamezz = re.compile('a href.*?//(.*?).html',re.S)  # NOTE(review): '.' before html is an unescaped metachar
    picname = re.findall(picnamezz,content)
    print(picname)

def main():
    """Crawl loop, second draft (author later notes the nesting below is wrong)."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)
    for i in fenxi(html):
        urls.append('https://www.umtuba.com' + i)
        for url in urls:  # NOTE(review): mis-nested — re-processes all earlier urls each outer iteration
            aurl = theuseragent(url)
            lastpage(aurl)
            for x1 in range(0,int(lastpage(aurl))):
                pingjie1 = re.sub('.html','',url)  # strip trailing .html ('.' is an unescaped metachar)
                urls2.append(pingjie1+'_'+ str(x1) +'.html')
                for item in urls2:
                    down_pic(item)  # NOTE(review): item is a URL string, not page HTML

    # (commented-out BeautifulSoup experiment kept from the first draft)



main()

上面的for循环嵌套写错了,我绕了一会儿,正在修正
2020年4月5日13:25,已经处理完成,代码还有些许散乱,搞了一天了,搞完了,暂时不去优化了,以后优化了再更新本帖.

import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):
    """Return the gallery's last page number scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage

def down_pic(content):
    """Parse one gallery page, extract the image name and URL, download the image to disk."""
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    jpgget = requests.get(jpgdown[0])
    print(jpgdown)
    with open(f'./meitu11/{picname1}.jpg','wb') as f:
        f.write(jpgget.content)
        f.close()  # NOTE(review): redundant inside `with`; author removes it in a later revision
        print(f'已完成{picname}的下载')

def main():
    """Crawl loop, third draft: loops flattened, pages expanded then downloaded serially."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)
    for i in fenxi(html):
        urls.append('https://www.umtuba.com' + i)
        for url in urls:
            aurl = theuseragent(url)
            lastpage(aurl)
            pingjie1 = re.sub('.html','',url)  # strip trailing .html before appending _<page>
        for x1 in range(0,int(lastpage(aurl))):
            urls2.append(pingjie1+'_'+ str(x1) +'.html')
                # print(urls2)
        for item in urls2:
            time.sleep(1)  # crude politeness delay per request
            html2 = theuseragent(item)

            down_pic(html2)

    # (commented-out BeautifulSoup experiment kept from the first draft)



main()

饭后继续战斗,加入了多线程处理,依葫芦画瓢,摸着石头过河,发现加成功了,虽然还有很多不明白的地方,但是爬虫速度起码提速了5-10倍了,这样也没出现之前的延迟卡住的问题,然后就是剩下延迟处理的还没去学习了.

import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):
    """Return the gallery's last page number scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage

def down_pic(content):
    """Parse one gallery page, extract the image name and URL, download the image to disk."""
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    jpgget = requests.get(jpgdown[0])
    print(jpgdown)
    with open(f'./meitu11/{picname1}.jpg','wb') as f:
        f.write(jpgget.content)
        f.close()  # NOTE(review): redundant inside `with`; later identified as the hang culprit
        print(f'已完成{picname}的下载')

def main():
    """Crawl loop, fourth draft: downloads submitted to a 10-worker thread pool."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)

    with ThreadPoolExecutor(max_workers=10) as ex:
        for i in fenxi(html):
            urls.append('https://www.umtuba.com' + i)
        for url in urls:
            aurl = theuseragent(url)
            lastpage(aurl)
            time.sleep(1)
            print('=' * 50)
            print(lastpage(aurl))
            for x1 in range(0,int(lastpage(aurl))):
                pingjie1 = re.sub('.html','',url)  # strip trailing .html before appending _<page>
                urls2.append(pingjie1+'_'+ str(x1) +'.html')
                print(urls2)

        for item in urls2:
            futures = []  # NOTE(review): reset every iteration — only the LAST future survives for wait()
            html2 = theuseragent(item)
            future = ex.submit(down_pic,html2)
            futures.append(future)
            time.sleep(1)
    wait(futures,return_when=ALL_COMPLETED)  # redundant after the with-block (shutdown already joins)
    # (commented-out BeautifulSoup experiment kept from the first draft)



main()

以下为2020年4月9日15:01再次更新,设置了timeout参数为10秒,经过测试是不够的,我会再次修改,但是这里不再提示
如果timeout了,就打印访问超时图片跳过的提醒,然后继续爬.
优化循环获取网址部分的速度,删除等待时间,加入random随机数导入time.sleep内,让每次下载图片之前的访问网址的等待时间随机生成,目前是1-6(包含)之间的随机数,经测试随机time.sleep(random(1,6))这一段代码有问题,一旦加入,程序就只会直接走入下边的except代码段,暂时屏蔽.
被爬网站没有任何反爬措施,所以估计很多人爬或者服务器本身在香港节点,访问较慢.
down_pic模块加入user-agent
第一次在程序内写入try,except 感觉还是蛮爽的.本身没有加入重试次数,也就是只要这张图片一次访问超时即放弃本图的下载.
以后会增加访问超时之后的重试次数
请忽略截图中未显示的图片,那是第一次修改代码down错的东西
加入注释部分!

import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
import random
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):# fetch with a User-Agent; returns the response text
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):# regex out the href links from the page text
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):# fetch each gallery's last-page number
    """Return the gallery's last page number (a string) scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage

def down_pic(content):# extract image name + URL and download to disk
    """Parse one gallery page and download its image; skip with a message on timeout."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    try:
        jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
        # time.sleep(random(1,6))
        print(jpgdown)
        with open(f'./meitu11/{picname1}.jpg','wb') as f:
            f.write(jpgget.content)
            f.close()  # NOTE(review): redundant inside `with`
            print(f'已完成{picname}的下载')
    except:  # NOTE(review): bare except swallows every error, not just timeouts
        print('该图片下载超时,自动跳过...')

def main():
    """Crawl loop, fifth draft: thread pool + timeout/skip handling."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)

    with ThreadPoolExecutor(max_workers=10) as ex:# thread-pool section
        for i in fenxi(html):# walk each gallery linked from the front page
            urls.append('https://www.umtuba.com' + i)# collect absolute gallery urls
        for url in urls:
            aurl = theuseragent(url)
            lastpage(aurl)
            print('=' * 50)
            print(lastpage(aurl))
            for x1 in range(0,int(lastpage(aurl))):# URL-building section
                pingjie1 = re.sub('.html','',url)# strip trailing .html
                urls2.append(pingjie1+'_'+ str(x1) +'.html')# rebuild as <base>_<page>.html
                # print(urls2)  # absolute per-image page urls
                print('=' * 50)
                print('网址库更新中,请耐心等待!')

        for item in urls2:
            futures = []  # NOTE(review): reset every iteration — only the LAST future survives for wait()
            html2 = theuseragent(item)
            future = ex.submit(down_pic,html2)
            futures.append(future)
    wait(futures,return_when=ALL_COMPLETED)# wait for all submitted work (redundant after with-block)
    print('=' * 50)
    print('程序运行完毕,请关闭!')
    # (commented-out BeautifulSoup experiment kept from the first draft)



main()

以下为随机数优化完成
time.sleep(random.randint(1,6))
更多random的解释点击这里了解
2020年4月9日18:15更新

import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):# fetch with a User-Agent; returns the response text
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):# regex out the href links from the page text
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):# fetch each gallery's last-page number
    """Return the gallery's last page number (a string) scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage


def down_pic(content):# extract image name + URL and download to disk
    """Parse one gallery page and download its image after a random 1-6s pause."""
    i = random.randint(1,6)  # per-call random politeness delay
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    try:
        time.sleep(i)
        print('=' * 20)
        print(f'已开启随机等待,等待时间{i}秒!!!')
        jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)

        # print(jpgdown)  # absolute image url
        with open(f'./meitu11/{picname1}.jpg','wb') as f:
            f.write(jpgget.content)
            f.close()  # NOTE(review): redundant inside `with`; removed in the next revision
            print(f'已完成{picname}的下载')
    except:  # NOTE(review): bare except swallows every error, not just timeouts
        print('该图片下载超时,自动跳过...')

def main():
    """Crawl loop, sixth draft: random sleep moved into down_pic."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)

    with ThreadPoolExecutor(max_workers=10) as ex:# thread-pool section
        for i in fenxi(html):# walk each gallery linked from the front page
            urls.append('https://www.umtuba.com' + i)# collect absolute gallery urls
        for url in urls:
            aurl = theuseragent(url)
            lastpage(aurl)
            print('=' * 50)
            # print(lastpage(aurl))  # max page per gallery
            for x1 in range(0,int(lastpage(aurl))):# URL-building section
                pingjie1 = re.sub('.html','',url)# strip trailing .html
                urls2.append(pingjie1+'_'+ str(x1) +'.html')# rebuild as <base>_<page>.html
                # print(urls2)  # absolute per-image page urls
                print('=' * 50)
                print('网址库更新中,请耐心等待!')
                # time.sleep(1)
        for item in urls2:
            futures = []  # NOTE(review): reset every iteration — only the LAST future survives for wait()
            html2 = theuseragent(item)
            future = ex.submit(down_pic,html2)
            futures.append(future)
    wait(futures,return_when=ALL_COMPLETED)# wait for all submitted work (redundant after with-block)
    print('=' * 50)
    print('程序运行完毕,请关闭!')

main()

2020年4月9日19:20分
修复程序假死问题,原因在于f.close()
多线程使用,且使用了with open,就不要再使用close方法了.

import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):# fetch with a User-Agent; returns the response text
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):# regex out the href links from the page text
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):# fetch each gallery's last-page number
    """Return the gallery's last page number (a string) scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage


def down_pic(content):# extract image name + URL and download to disk
    """Parse one gallery page and download its image; f.close() removed (hang fix)."""
    i = random.randint(1,6)  # per-call random politeness delay
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    try:
        time.sleep(i)
        print('=' * 20)
        print(f'已开启随机等待,等待时间{i}秒!!!')
        jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)

        # print(jpgdown)  # absolute image url
        with open(f'./meitu11/{picname1}.jpg','wb') as f:
            f.write(jpgget.content)
            print(f'已完成{picname}的下载')
    except:  # NOTE(review): bare except swallows every error, not just timeouts
        print('该图片下载超时,自动跳过...')

def main():
    """Crawl loop, seventh draft: identical flow to the sixth, minus f.close()."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)

    with ThreadPoolExecutor(max_workers=10) as ex:# thread-pool section
        for i in fenxi(html):# walk each gallery linked from the front page
            urls.append('https://www.umtuba.com' + i)# collect absolute gallery urls
        for url in urls:
            aurl = theuseragent(url)
            lastpage(aurl)
            print('=' * 50)
            # print(lastpage(aurl))  # max page per gallery
            for x1 in range(0,int(lastpage(aurl))):# URL-building section
                pingjie1 = re.sub('.html','',url)# strip trailing .html
                urls2.append(pingjie1+'_'+ str(x1) +'.html')# rebuild as <base>_<page>.html
                # print(urls2)  # absolute per-image page urls
                print('=' * 50)
                print('网址库更新中,请耐心等待!')
                # time.sleep(1)
        for item in urls2:
            futures = []  # NOTE(review): reset every iteration — only the LAST future survives for wait()
            html2 = theuseragent(item)
            future = ex.submit(down_pic,html2)
            futures.append(future)
    wait(futures,return_when=ALL_COMPLETED)# wait for all submitted work (redundant after with-block)
    print('=' * 50)
    print('程序运行完毕,请关闭!')

main()

因为依然卡死,所以更新了多线程部分,目前没发现卡死,现在出门做事了,挂在这里看看,应该没问题了.2020年4月9日20:10分

import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'  # download directory, created once at import time
if not os.path.exists(down_path):
    os.makedirs(down_path)

def theuseragent(html):# fetch with a User-Agent; returns the response text
    """GET the URL *html* with a browser User-Agent; return body text, or None on non-200."""
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    r = requests.get(html,headers=headers)
    if r.status_code == 200:
        return r.text
    return None

def fenxi(html):# regex out the href links from the page text
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
    first_pic_url = re.findall(zhengze,html)

    return first_pic_url

def lastpage(content):# fetch each gallery's last-page number
    """Return the gallery's last page number (a string) scraped from the "最后页" link."""

    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
    picurl = re.findall(zhengze,content)# max page of the gallery, e.g. 42 for a 42-page set
    lastpage = picurl[0]# findall returns a list; take the first match
    return lastpage


def down_pic(content):# extract image name + URL and download to disk
    """Parse one gallery page and download its image; sleep moved after the GET."""
    i = random.randint(1,6)  # per-call random politeness delay
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
    picname = re.findall(picnamezz,content)
    picname1 = re.sub('\W','',picname[0])  # strip non-word chars to get a safe file name
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
    jpgdown = re.findall(jpgnow,content)
    try:
        jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
        time.sleep(i)
        print('=' * 20)
        print(f'已开启随机等待,等待时间为{i}秒')
        # print(jpgdown)  # absolute image url
        with open(f'./meitu11/{picname1}.jpg','wb') as f:
            f.write(jpgget.content)
            print(f'已完成{picname}的下载')
    except:  # NOTE(review): bare except swallows every error, not just timeouts
        print('该图片下载超时,自动跳过...')

def main():
    """Crawl loop, eighth draft: pool created without a with-block."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)
    ex = ThreadPoolExecutor(max_workers=10)# 10-worker pool; NOTE(review): never shut down explicitly

    for i in fenxi(html):# walk each gallery linked from the front page
        urls.append('https://www.umtuba.com' + i)# collect absolute gallery urls
    for url in urls:
        aurl = theuseragent(url)
        lastpage(aurl)
        print('=' * 50)
        # print(lastpage(aurl))  # max page per gallery
        for x1 in range(0,int(lastpage(aurl))):# URL-building section
            pingjie1 = re.sub('.html','',url)# strip trailing .html
            urls2.append(pingjie1+'_'+ str(x1) +'.html')# rebuild as <base>_<page>.html
            # print(urls2)  # absolute per-image page urls
            print('=' * 50)
            print('网址库更新中,请耐心等待!')
            # time.sleep(1)

    for item in urls2:
        futures = []  # NOTE(review): reset every iteration — only the LAST future survives for wait()
        html2 = theuseragent(item)
        future = ex.submit(down_pic,html2)# submit to the 10-worker pool
        futures.append(future)
    wait(futures,return_when=ALL_COMPLETED)# wait for all submitted work
    print('=' * 50)
    print('程序运行完毕,请关闭!')

main()

2020年4月10日10:36程序假死,目前挂在这里继续让他运行,一会回来看动了没,卡住的地方:
已开启随机等待,等待时间为6秒

已完成['/siwameitui/58941_34.html']的下载

4月10日16:23已确认,程序确认假死,正在寻找问题
4月10日19:41测试修改后的程序,发现第一次程序运行完毕,且提示"程序运行完毕,请关闭!",证明了此次更新已经近乎完美,至于代码杂乱我就不修改了,本次学习到了很多东西,最后一次更新也修改了很多东西.本次图片爬取到本地共计1157张
我抽时间还会再更新一次,下次更新就是全站所有的图片都下载了.最好希望每个专辑专门自动建立一个文件夹

import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
# Final revision: per-request random delay, the image GET moved inside the
# try so timeouts are actually caught, regex metachars escaped, and the
# thread pool shut down deterministically via a context manager.
down_path = './meitu11'  # download target directory
os.makedirs(down_path, exist_ok=True)  # idempotent; replaces the exists()+makedirs() pair

def theuseragent(html):
    """GET the URL *html* with a browser User-Agent after a short random pause.

    Returns the response body text, or None on a non-200 status.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    # Draw the delay per call — the old module-level `i = random.randint(1,5)`
    # was fixed once at import, so every request slept the same amount.
    delay = random.randint(1, 5)
    time.sleep(delay)
    print('=' * 20)
    print(f'已开启随机等待,等待时间为{delay}秒')
    r = requests.get(html, headers=headers, timeout=10)
    if r.status_code == 200:
        return r.text  # (the old print after this return was unreachable — removed)
    return None

def fenxi(html):
    """Return every href captured from `li class ... a href="..."` in *html*."""
    zhengze = re.compile('li class.*?a href="(.*?)"', re.S)
    return re.findall(zhengze, html)

def lastpage(content):
    """Return the gallery's last page number (a string) from the "最后页" link.

    Raises IndexError if the pagination link is absent.
    """
    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>', re.S)
    picurl = re.findall(zhengze, content)  # e.g. ['42'] for a 42-page gallery
    return picurl[0]

def down_pic(content):
    """Parse one gallery page, extract the image name and URL, and save the image.

    Prints a skip message (and returns) on a missing page, failed match,
    or any download error such as a timeout.
    """
    if content is None or not content:
        # upstream fetch returned None (non-200) — nothing to parse
        print('该图片下载超时,自动跳过...')
        return
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张', re.S)
    picname = re.findall(picnamezz, content)
    jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"', re.S)
    jpgdown = re.findall(jpgnow, content)
    if not picname or not jpgdown:
        # page layout did not match the scrape patterns — skip instead of IndexError
        print('该图片下载超时,自动跳过...')
        return
    picname1 = re.sub(r'\W', '', picname[0])  # raw string: '\W' was an invalid escape; strips to a safe file name
    try:
        # The GET must live INSIDE the try: it is the call that can time out;
        # previously it sat outside and the except never fired for timeouts.
        jpgget = requests.get(jpgdown[0], headers=headers, timeout=(30, 30))  # (connect, read) timeouts
        with open(f'{down_path}/{picname1}.jpg', 'wb') as f:
            f.write(jpgget.content)
        print(f'已完成{picname1}的下载')  # was {picname}: printed the whole list repr
    except Exception:  # narrowed from bare except (which also ate KeyboardInterrupt)
        print('该图片下载超时,自动跳过...')

def main():
    """Crawl the front page, expand every gallery into per-image pages, and
    download each image on a 20-worker thread pool."""
    start_url = 'https://www.umtuba.com'
    urls = []
    urls2 = []
    html = theuseragent(start_url)

    for i in fenxi(html):  # each gallery linked from the front page
        urls.append(start_url + i)
    for url in urls:
        aurl = theuseragent(url)
        # r'\.html$' — the old '.html' pattern treated '.' as "any character"
        # and could match mid-URL; anchor it to the literal trailing suffix.
        base = re.sub(r'\.html$', '', url)
        for x1 in range(int(lastpage(aurl))):  # rebuild as <base>_<page>.html
            urls2.append(f'{base}_{x1}.html')
            print('=' * 50)
            print('网址库更新中,请耐心等待!')

    # NOTE(review): theuseragent(item) still runs serially in this comprehension;
    # only the image download itself is parallelized, as in the original.
    with ThreadPoolExecutor(max_workers=20) as ex:  # 20-worker pool, joined on exit
        future = [ex.submit(down_pic, theuseragent(item)) for item in urls2]
        wait(future, return_when=ALL_COMPLETED)
    print('=' * 50)
    print('程序运行完毕,请关闭!')

main()
原文来自:爬取上一篇文章的主页及分页所有图片(更新完结),尊重自己,尊重每一个人;转发请注明来源!
0 0

发表评论