前几天看到一个阿朱的写真网站,心想先留下地址,以后拿来练习爬取。今天很晚了才匆匆忙忙赶来第一次练手,爬取是成功的,但代码写得有点乱,暂时记录一下,以后再优化。

import requests
import re
import os
from pyquery import PyQuery as pq

# Directory that downloaded pictures are written to.
down_path = './azhu'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(down_path, exist_ok=True)

def getonepage(url, timeout=10):
    """Fetch one page and return its HTML text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status 200;
        ``None`` for any other status, or when the request fails or times out.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f'{url} 请求失败: {err}')
        return None
    if response.status_code == 200:
        return response.text
    return None

def fenxi(html):
    """Find the picture on one page and save it into ``down_path``.

    Args:
        html: HTML text of the page, or ``None``/empty when the page could
            not be fetched (the page is then skipped).

    The function returns nothing; it writes one file per successful call and
    prints the picture URL and the file name it used.
    """
    if not html:
        print('页面内容为空,已跳过')
        return
    pic_url = re.findall(r'img src="(.*?)"', html, re.S)
    # The script uses the fourth match (index 3) as the picture; presumably
    # the earlier matches are other images on the page - verify against the site.
    if len(pic_url) < 4:
        print('页面中未找到图片地址,已跳过')
        return
    pic_url_rea = pic_url[3]
    print(pic_url_rea)
    try:
        # A timeout keeps one stalled download from blocking the whole run.
        picgit = requests.get(pic_url_rea, timeout=10)
    except requests.RequestException as err:
        print(f'{pic_url_rea} 下载失败: {err}')
        return
    if picgit.status_code != 200:
        # Do not save an error page under a picture file name.
        print(f'{pic_url_rea} 下载失败: HTTP {picgit.status_code}')
        return
    # Name the file after the last path segment of the URL. re.findall()
    # returns a list, so formatting its result into the path would put the
    # list's repr (brackets and quotes) into the file name.
    picname = pic_url_rea.split('?', 1)[0].rsplit('/', 1)[-1]
    if not os.path.splitext(picname)[1]:
        picname += '.jpg'
    print(picname)
    with open(os.path.join(down_path, picname), 'wb') as f:
        f.write(picgit.content)

def main():
    """Download the picture from pages 56995_1.html through 56995_67.html."""
    urls = [
        f'https://www.umtuba.com/siwameitui/56995_{i}.html'
        for i in range(1, 68)
    ]
    for url in urls:
        html = getonepage(url)
        if html is None:
            # getonepage() returns None when the page could not be fetched;
            # passing that to fenxi() would fail inside re.findall().
            print(f'{url} 获取失败,已跳过')
            continue
        fenxi(html)
main()

2020年4月4日:时隔一夜,发现程序总会卡在某一个地址上,可能是对方网站的服务器不稳定导致的。所以本次除了一些小修改之外,还加入了 time 模块,让程序每次访问后间隔 1 秒。

import requests
import re
import os
import time

# Directory that downloaded pictures are written to.
down_path = './azhu'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(down_path, exist_ok=True)

def getonepage(url, timeout=10):
    """Fetch one page and return its HTML text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status 200;
        ``None`` for any other status, or when the request fails or times out.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which makes the whole script appear stuck on one address.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f'{url} 请求失败: {err}')
        return None
    if response.status_code == 200:
        return response.text
    return None

def fenxi(html):
    """Find the picture on one page and save it into ``down_path``.

    Args:
        html: HTML text of the page, or ``None``/empty when the page could
            not be fetched (the page is then skipped).

    The function returns nothing; it writes one file per successful call and
    prints the picture URL, the file name and a completion message.
    """
    if not html:
        print('页面内容为空,已跳过')
        return
    pic_url = re.findall(r'img src="(.*?)"', html, re.S)
    # The script uses the fourth match (index 3) as the picture; presumably
    # the earlier matches are other images on the page - verify against the site.
    if len(pic_url) < 4:
        print('页面中未找到图片地址,已跳过')
        return
    pic_url_rea = pic_url[3]
    print(pic_url_rea)
    try:
        # A timeout keeps one stalled download from blocking the whole run.
        picgit = requests.get(pic_url_rea, timeout=10)
    except requests.RequestException as err:
        print(f'{pic_url_rea} 下载失败: {err}')
        return
    if picgit.status_code != 200:
        # Do not save an error page under a picture file name.
        print(f'{pic_url_rea} 下载失败: HTTP {picgit.status_code}')
        return
    # Name the file after the last path segment of the URL. re.findall()
    # returns a list, so formatting its result into the path would put the
    # list's repr (brackets and quotes) into the file name.
    picname = pic_url_rea.split('?', 1)[0].rsplit('/', 1)[-1]
    if not os.path.splitext(picname)[1]:
        picname += '.jpg'
    print(picname)
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(down_path, picname), 'wb') as f:
        f.write(picgit.content)
    print(f'已完成{picname}的下载')

def main():
    """Download the picture from pages 56995_1.html through 56995_67.html.

    Pauses one second after each page request before processing the page.
    """
    urls = [
        f'https://www.umtuba.com/siwameitui/56995_{i}.html'
        for i in range(1, 68)
    ]
    for url in urls:
        html = getonepage(url)
        # Wait between requests so the pages are not fetched back to back.
        time.sleep(1)
        if html is None:
            # getonepage() returns None when the page could not be fetched;
            # passing that to fenxi() would fail inside re.findall().
            print(f'{url} 获取失败,已跳过')
            continue
        fenxi(html)
    print('图片下载已全部完成!')
main()

图片打包下载地址:
链接:https://pan.baidu.com/s/1XgZE3Pvwlg0422-QQSotlA
提取码:7oug

最后修改:2022 年 12 月 05 日
如果觉得我的文章对你有用,请随意赞赏