
夏西cici Photo Collection Download (with Scraper Source Code)

Link: https://pan.baidu.com/s/1L4QVt2aHcoIueywcTyHLDA
Extraction code: vgk6
-- Shared by a Baidu Netdisk Super Member V5
Unzip password: www.5yang.cc
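The script below walks the model's index page on tw.kissgoddess.com, gathers the URL of every album, then every page inside each album, and saves the images into per-album subfolders under a 夏西CiCi directory.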

import os

import requests
from lxml import etree

down_path = '夏西CiCi'
if not os.path.exists(down_path):
    os.makedirs(down_path)

BASE = 'https://tw.kissgoddess.com'


class Cici:
    url = BASE + '/people/xia-xi-ci-ci.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
    }

    def __init__(self):
        self.session = requests.session()
        self.album_urls = []  # URL of every album on the model's index page
        self.all_pics = []    # one list of image URLs per album page (absolute, directly downloadable)
        self.all_names = []   # one list of image alt names per album page
        self.titles = []      # one album title per page, used to name the download folders

    def req(self, url):
        """GET a page and return its HTML text, failing loudly on a bad status."""
        r = self.session.get(url, headers=self.headers)
        r.raise_for_status()
        return r.text

    def get_album_urls(self):
        """Collect the URL of every album listed on the index page."""
        html = etree.HTML(self.req(self.url))
        for href in html.xpath('//*[@id="divAlbum"]/div/div/div/a/@href'):
            self.album_urls.append(BASE + href)

    def get_pic_urls(self):
        """Walk every page of every album, collecting image URLs, names and titles."""
        self.get_album_urls()
        for album in self.album_urls:
            html = etree.HTML(self.req(album))
            pages = html.xpath('//*[@id="pages"]/a/@href')[0:-1]  # pager links, minus the trailing entry

            for page in pages:
                html = etree.HTML(self.req(BASE + page))
                self.all_pics.append(html.xpath('//*[@id="td-outer-wrap"]/div[2]/div/div[2]/div/div/article/div[2]/img/@src'))
                self.all_names.append(html.xpath('//*[@id="td-outer-wrap"]/div[2]/div/div[2]/div/div/article/div[2]/img/@alt'))
                self.titles.append(html.xpath('//*[@id="td-outer-wrap"]/div[2]/div/div[2]/div/div/article/div[1]/header/h1/text()'))

    def down_pic(self):
        """Download every collected image into a folder named after its album."""
        self.get_pic_urls()
        i = 0
        for urls, title in zip(self.all_pics, self.titles):
            for t in title:
                folder = os.path.join(down_path, t)
                if not os.path.exists(folder):
                    os.makedirs(folder)
                    print(f'{t}: downloading...')
                for u in urls:
                    i += 1
                    r = self.session.get(u, headers=self.headers)
                    with open(os.path.join(folder, f'{i}.jpg'), 'wb') as f:
                        f.write(r.content)
                print(t, 'is done!')


if __name__ == '__main__':
    Cici().down_pic()
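Running the script only needs requests and lxml (pip install requests lxml). Note that the XPath selectors are tied to the site's layout at the time of writing, so they may need updating if the page structure changes. If the site throttles rapid requests, one simple guard is to pause and retry failed downloads; a minimal sketch, with an arbitrary two-second back-off (this fetch helper is an illustration, not part of the original script):

import time

def fetch(session, url, headers, retries=3):
    # try the request a few times, pausing between attempts
    for _ in range(retries):
        r = session.get(url, headers=headers)
        if r.status_code == 200:
            return r
        time.sleep(2)  # arbitrary back-off before retrying
    raise RuntimeError(f'failed to fetch {url}')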


Original post: 夏西cici Photo Collection Download (with Scraper Source Code). Respect yourself and others; please credit the source when reposting!