mark一下,明天继续
没写完,只写到了获取分页的最大page数量
2020年4月5日0:49
其实代码不多,由于不熟悉码了很久...
import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
down_path = './美图11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):
urls2 = []
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,取出来使用
print(lastpage)
return lastpage
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
for i in fenxi(html):
urls.append('https://www.umtuba.com' + i)
for url in urls:
aurl = theuseragent(url)
print(url)
lastpage(aurl)
print(lastpage())
# print('已经加载完毕,请问您要下载第几页 : ')
# soup = BeautifulSoup(html.content,'lxml') #由于html属于requests对象,无法用于beautifulsoup,所以这里加上content
# print(soup.title.string) #title和string都属于beautifulsoup的方法,title取标题,但是包含标签,string纯输出标题的文本,不包含标签
main()
这些url的拼接真是让我第一次感觉到int和str之间转换的麻烦,耗时耗力了蛮久,2020年4月5日12:04更新,赶紧保存,进度已经只差download了,写完程序基本就完成了,之后就是如果再出现问题就引入time或者恶补一下网络i/o延迟的处理了
import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
down_path = './美图11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,取出来使用
return lastpage
def down_pic(content):
picnamezz = re.compile('a href.*?//(.*?).html',re.S)
picname = re.findall(picnamezz,content)
print(picname)
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
for i in fenxi(html):
urls.append('https://www.umtuba.com' + i)
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
for x1 in range(0,int(lastpage(aurl))):
pingjie1 = re.sub('.html','',url)
urls2.append(pingjie1+'_'+ str(x1) +'.html')
for item in urls2:
down_pic(item)
# print('已经加载完毕,请问您要下载第几页 : ')
# soup = BeautifulSoup(html.content,'lxml') #由于html属于requests对象,无法用于beautifulsoup,所以这里加上content
# print(soup.title.string) #title和string都属于beautifulsoup的方法,title取标题,但是包含标签,string纯输出标题的文本,不包含标签
main()
上面的for循环嵌套写错了,我绕一了会儿,正在修正
2020年4月5日13:25,已经处理完成,代码还有些许散乱,搞了一天了,搞完了,暂时不去优化了,以后优化了在更新本帖.
import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,取出来使用
return lastpage
def down_pic(content):
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
jpgget = requests.get(jpgdown[0])
print(jpgdown)
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
f.close()
print(f'已完成{picname}的下载')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
for i in fenxi(html):
urls.append('https://www.umtuba.com' + i)
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
pingjie1 = re.sub('.html','',url)
for x1 in range(0,int(lastpage(aurl))):
urls2.append(pingjie1+'_'+ str(x1) +'.html')
# print(urls2)
for item in urls2:
time.sleep(1)
html2 = theuseragent(item)
down_pic(html2)
# print('已经加载完毕,请问您要下载第几页 : ')
# soup = BeautifulSoup(html.content,'lxml') #由于html属于requests对象,无法用于beautifulsoup,所以这里加上content
# print(soup.title.string) #title和string都属于beautifulsoup的方法,title取标题,但是包含标签,string纯输出标题的文本,不包含标签
main()
饭后继续战斗,加入了多线程处理,依葫芦画瓢,摸着石头过河,发现加成功了,虽然还有很多不明白的地方,但是爬虫速度起码提速了5-10倍了,这样也没出现之前的延迟卡住的问题,然后就是剩下延迟处理的还没去学习了.
import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,取出来使用
return lastpage
def down_pic(content):
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
jpgget = requests.get(jpgdown[0])
print(jpgdown)
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
f.close()
print(f'已完成{picname}的下载')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
with ThreadPoolExecutor(max_workers=10) as ex:
for i in fenxi(html):
urls.append('https://www.umtuba.com' + i)
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
time.sleep(1)
print('=' * 50)
print(lastpage(aurl))
for x1 in range(0,int(lastpage(aurl))):
pingjie1 = re.sub('.html','',url)
urls2.append(pingjie1+'_'+ str(x1) +'.html')
print(urls2)
for item in urls2:
futures = []
html2 = theuseragent(item)
future = ex.submit(down_pic,html2)
futures.append(future)
time.sleep(1)
wait(futures,return_when=ALL_COMPLETED)
# print('已经加载完毕,请问您要下载第几页 : ')
# soup = BeautifulSoup(html.content,'lxml') #由于html属于requests对象,无法用于beautifulsoup,所以这里加上content
# print(soup.title.string) #title和string都属于beautifulsoup的方法,title取标题,但是包含标签,string纯输出标题的文本,不包含标签
main()
以下为2020年4月9日15:01再次更新,设置了timeout参数为10秒,经过测试是不够的,我会再次修改,但是这里不再提示
如果timeout了,就打印访问超时图片跳过的提醒,然后继续爬.
优化循环获取网址部分的速度,删除等待时间,加入random随机数导入time.sleep内,让每次下载图片之前的访问网址的等待时间随机生成,目前是1-6(包含)之间的随机数,经测试随机time.sleep(random(1,6))这一段代码有问题,一旦加入,程序就只会直接走入下边的except代码段,暂时屏蔽.
被爬网站没有任何反爬措施,所以估计很多人爬或者服务器本身在香港节点,访问较慢.
down_pic模块加入user-agent
第一次在程序内写入try,except 感觉还是蛮爽的.本身没有加入重试次数,也就是只要这张图片一次访问超时即放弃本图的下载.
以后会增加访问超时之后的重试次数
请忽略截图中未显示的图片,那是第一次修改代码down错的东西
加入注释部分!
import re
import requests
import os
from bs4 import BeautifulSoup
import lxml
import time
import random
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):#访问,且带着useragent,return的是text文本内容
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):#分析文本内容模块,正则寻找href的网址链接
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):#获取每一页的lastpage
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,赋值给lastpage变量为int整数,取出来使用
return lastpage
def down_pic(content):#获取图片的name及下载地址及下载到本地
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
try:
jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
# time.sleep(random(1,6))
print(jpgdown)
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
f.close()
print(f'已完成{picname}的下载')
except:
print('该图片下载超时,自动跳过...')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
with ThreadPoolExecutor(max_workers=10) as ex:#多线程处理模块
for i in fenxi(html):#循环分析首页的每一个分页
urls.append('https://www.umtuba.com' + i)#将这些分页append从尾部添加到urls列表内
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
print('=' * 50)
print(lastpage(aurl))
for x1 in range(0,int(lastpage(aurl))):#网址拼接部分
pingjie1 = re.sub('.html','',url)#将.html替换为空
urls2.append(pingjie1+'_'+ str(x1) +'.html')#将pingjie1的内容加上_再加上x1的值在加上.html
# print(urls2)#这里打印的是网图的绝对网址
print('=' * 50)
print('网址库更新中,请耐心等待!')
for item in urls2:
futures = []
html2 = theuseragent(item)
future = ex.submit(down_pic,html2)
futures.append(future)
wait(futures,return_when=ALL_COMPLETED)#多线程处理模块的wait,等待括号内的所有内容结束
print('=' * 50)
print('程序运行完毕,请关闭!')
# print('已经加载完毕,请问您要下载第几页 : ')
# soup = BeautifulSoup(html.content,'lxml') #由于html属于requests对象,无法用于beautifulsoup,所以这里加上content
# print(soup.title.string) #title和string都属于beautifulsoup的方法,title取标题,但是包含标签,string纯输出标题的文本,不包含标签
main()
以下为随机数优化完成
time.sleep(random,randint(1,6))
更多random的解释点击这里了解
2020年4月9日18:15更新
import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):#访问,且带着useragent,return的是text文本内容
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):#分析文本内容模块,正则寻找href的网址链接
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):#获取每一页的lastpage
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,赋值给lastpage变量为int整数,取出来使用
return lastpage
def down_pic(content):#获取图片的name及下载地址及下载到本地
i = random.randint(1,6)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
try:
time.sleep(i)
print('=' * 20)
print(f'已开启随机等待,等待时间{i}秒!!!')
jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
# print(jpgdown) #打印图片绝对网址
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
f.close()
print(f'已完成{picname}的下载')
except:
print('该图片下载超时,自动跳过...')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
with ThreadPoolExecutor(max_workers=10) as ex:#多线程处理模块
for i in fenxi(html):#循环分析首页的每一个分页
urls.append('https://www.umtuba.com' + i)#将这些分页append从尾部添加到urls列表内
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
print('=' * 50)
# print(lastpage(aurl))#这里打印每一页的最大page
for x1 in range(0,int(lastpage(aurl))):#网址拼接部分
pingjie1 = re.sub('.html','',url)#将.html替换为空
urls2.append(pingjie1+'_'+ str(x1) +'.html')#将pingjie1的内容加上_再加上x1的值在加上.html
# print(urls2)#这里打印的是网图的绝对网址
print('=' * 50)
print('网址库更新中,请耐心等待!')
# time.sleep(1)
for item in urls2:
futures = []
html2 = theuseragent(item)
future = ex.submit(down_pic,html2)
futures.append(future)
wait(futures,return_when=ALL_COMPLETED)#多线程处理模块的wait,等待括号内的所有内容结束
print('=' * 50)
print('程序运行完毕,请关闭!')
main()
2020年4月9日19:20分
修复程序假死问题,原因在于f.close()
多线程使用,且使用了with open,就不要再使用close方法了.
import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):#访问,且带着useragent,return的是text文本内容
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):#分析文本内容模块,正则寻找href的网址链接
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):#获取每一页的lastpage
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,赋值给lastpage变量为int整数,取出来使用
return lastpage
def down_pic(content):#获取图片的name及下载地址及下载到本地
i = random.randint(1,6)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
try:
time.sleep(i)
print('=' * 20)
print(f'已开启随机等待,等待时间{i}秒!!!')
jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
# print(jpgdown) #打印图片绝对网址
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
print(f'已完成{picname}的下载')
except:
print('该图片下载超时,自动跳过...')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
with ThreadPoolExecutor(max_workers=10) as ex:#多线程处理模块
for i in fenxi(html):#循环分析首页的每一个分页
urls.append('https://www.umtuba.com' + i)#将这些分页append从尾部添加到urls列表内
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
print('=' * 50)
# print(lastpage(aurl))#这里打印每一页的最大page
for x1 in range(0,int(lastpage(aurl))):#网址拼接部分
pingjie1 = re.sub('.html','',url)#将.html替换为空
urls2.append(pingjie1+'_'+ str(x1) +'.html')#将pingjie1的内容加上_再加上x1的值在加上.html
# print(urls2)#这里打印的是网图的绝对网址
print('=' * 50)
print('网址库更新中,请耐心等待!')
# time.sleep(1)
for item in urls2:
futures = []
html2 = theuseragent(item)
future = ex.submit(down_pic,html2)
futures.append(future)
wait(futures,return_when=ALL_COMPLETED)#多线程处理模块的wait,等待括号内的所有内容结束
print('=' * 50)
print('程序运行完毕,请关闭!')
main()
因为依然卡死,所以更新了多线程部分,目前没发现卡死,现在出门做事了,挂在这里看看,应该没问题了.2020年4月9日20:10分
import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):#访问,且带着useragent,return的是text文本内容
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
r = requests.get(html,headers=headers)
if r.status_code == 200:
return r.text
return None
def fenxi(html):#分析文本内容模块,正则寻找href的网址链接
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):#获取每一页的lastpage
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,赋值给lastpage变量为int整数,取出来使用
return lastpage
def down_pic(content):#获取图片的name及下载地址及下载到本地
i = random.randint(1,6)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
try:
jpgget = requests.get(jpgdown[0],headers=headers,timeout = 20)
time.sleep(i)
print('=' * 20)
print(f'已开启随机等待,等待时间为{i}秒')
# print(jpgdown) #打印图片绝对网址
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
print(f'已完成{picname}的下载')
except:
print('该图片下载超时,自动跳过...')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
ex = ThreadPoolExecutor(max_workers=10)#多线程处理模块,开启10线程赋值给变量ex
for i in fenxi(html):#循环分析首页的每一个分页
urls.append('https://www.umtuba.com' + i)#将这些分页append从尾部添加到urls列表内
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
print('=' * 50)
# print(lastpage(aurl))#这里打印每一页的最大page
for x1 in range(0,int(lastpage(aurl))):#网址拼接部分
pingjie1 = re.sub('.html','',url)#将.html替换为空
urls2.append(pingjie1+'_'+ str(x1) +'.html')#将pingjie1的内容加上_再加上x1的值在加上.html
# print(urls2)#这里打印的是网图的绝对网址
print('=' * 50)
print('网址库更新中,请耐心等待!')
# time.sleep(1)
for item in urls2:
futures = []
html2 = theuseragent(item)
future = ex.submit(down_pic,html2)#变量ex就是10线程,上面有说明,
futures.append(future)
wait(futures,return_when=ALL_COMPLETED)#多线程处理模块的wait,等待括号内的所有内容结束
print('=' * 50)
print('程序运行完毕,请关闭!')
main()
2020年4月10日10:36程序假死,目前挂在这里继续让他运行,一会回来看动了没,卡住的地方:
已开启随机等待,等待时间为6秒
已完成['/siwameitui/58941_34.html']的下载
4月10日16:23已确认,程序确认假死,正在寻找问题
4月10日19:41测试修改后的程序,发现第一次程序运行完毕,且提示"程序运行完毕,请关闭!",证明了此次更新已经近乎完美,至于代码杂乱我就不修改了,本次学习到了很多东西,最后一次更新也修改了很多东西.本次图片爬取到本地共计1157张
我抽时间还会再更新一次,下次更新就是全站所有的图片都下载了.最好希望每个专辑专门自动建立一个文件夹
import re,os,time,random,lxml
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,wait,ALL_COMPLETED
#coding:utf-8
i = random.randint(1,5)
down_path = './meitu11'
if not os.path.exists(down_path):
os.makedirs(down_path)
def theuseragent(html):#访问,且带着useragent,return的是text文本内容
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
time.sleep(i)#最大问题就是这里,time.sleep就应该放这里,然后下边的requests.get设置timeout
print('=' * 20)
print(f'已开启随机等待,等待时间为{i}秒')
r = requests.get(html,headers=headers,timeout = 10)
if r.status_code == 200:
return r.text
print('该地址已经return')
return None
def fenxi(html):#分析文本内容模块,正则寻找href的网址链接
zhengze = re.compile('li class.*?a href="(.*?)"',re.S)
first_pic_url = re.findall(zhengze,html)
return first_pic_url
def lastpage(content):#获取每一页的lastpage
zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>',re.S)
picurl = re.findall(zhengze,content)#取得每个页面的最大page,例如某图集最大42页
lastpage = picurl[0]#最大page是列表,赋值给lastpage变量为int整数,取出来使用
return lastpage
def down_pic(content):#获取图片的name及下载地址及下载到本地
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
picnamezz = re.compile('content_left.*?<a.*?"(.*?)".*?title.*?查看下一张',re.S)
picname = re.findall(picnamezz,content)
picname1 = re.sub('\W','',picname[0])
jpgnow = re.compile('content_left.*?<a.*?".*?".*?title.*?查看下一张.*?src="(.*?)"',re.S)
jpgdown = re.findall(jpgnow,content)
jpgget = requests.get(jpgdown[0],headers=headers,timeout = (30,30))#这里的timeout分别是链接时间和读取时间,超时自动放弃
try:
#print(jpgdown) #打印图片绝对网址
with open(f'./meitu11/{picname1}.jpg','wb') as f:
f.write(jpgget.content)
# time.sleep(i) #time.sleep不因该在这里,我移到上边的theuseragent模块了
print(f'已完成{picname}的下载')
except:
print('该图片下载超时,自动跳过...')
def main():
start_url = 'https://www.umtuba.com'
urls = []
urls2 = []
html = theuseragent(start_url)
ex = ThreadPoolExecutor(max_workers=20)#多线程处理模块,开启10线程赋值给变量ex
for i in fenxi(html):#循环分析首页的每一个分页
urls.append('https://www.umtuba.com' + i)#将这些分页append从尾部添加到urls列表内
for url in urls:
aurl = theuseragent(url)
lastpage(aurl)
# print('=' * 50)
# print(lastpage(aurl))#这里打印每一页的最大page
for x1 in range(0,int(lastpage(aurl))):#网址拼接部分
pingjie1 = re.sub('.html','',url)#将.html替换为空
urls2.append(pingjie1+'_'+ str(x1) +'.html')#将pingjie1的内容加上_再加上x1的值在加上.html
# print(urls2)#这里打印的是网图的绝对网址
print('=' * 50)
print('网址库更新中,请耐心等待!')
# time.sleep(0.1)
# for item in urls2:#如果从这里就开始循环,等于多线程没有用上
# html2 = theuseragent(item)
future = [ex.submit(down_pic,theuseragent(item)) for item in urls2]#应该从这里开始循环.(变量ex就是20线程,上面有说明,)
wait(future,return_when=ALL_COMPLETED)#多线程处理模块的wait,等待括号内的所有内容结束,加入timeout
print('=' * 50)
print('程序运行完毕,请关闭!')
main()