本代码采集某人才网的所有HR联系方式
自动保存为文本文件,自动剔除座机电话,自动剔除重复号码
为减轻他人服务器压力,设置延迟时间
为保证他人隐私,本代码中的URL隐藏
代码很多冗余,请自行修改

#coding=utf-8

import requests,re,os,time,pymysql,time,json
from lxml import etree

# Set the console window title (Windows `title` command; harmless no-op elsewhere).
os.system('title 人才网爬虫 @小伍的游乐场-www.5yang.cc')
# Output directory for the scraped phone numbers.
down_path = '人才网'
os.makedirs(down_path, exist_ok=True)  # exist_ok replaces the manual exists() check
# Module-level accumulators shared with nextpage() via `global`.
url3 = []          # raw onclick attributes scraped from the listing pages
url4 = []          # cleaned company-page URLs (rebound to a set later)
telnumlist = []    # phone numbers collected from the company pages
def nextpage(lastpage):
    """Crawl the job-listing pages, collect HR phone numbers from each
    company page, and append the de-duplicated results to
    '<down_path>/telnum.txt'.

    lastpage: total number of pages to crawl (string or int).  Historical
        callers passed a non-numeric value and the parameter was ignored,
        so anything that cannot be parsed as an int falls back to the
        original hard-coded range(1, 1200).

    Side effects: extends the module-level url3/telnumlist, rebinds url4
    to a set, and appends to the output text file.  Uses the module-level
    requests session `s`.
    """
    global url3, url4, telnumlist
    try:
        pages = int(lastpage)
    except (TypeError, ValueError):
        pages = 1199  # original hard-coded page count

    for i in range(1, pages + 1):
        print('页码:', i)
        nexturl = f'https://m.****&.com/index.php?m=Mobile&c=Jobs&a=index&m=Mobile&page={i}'
        r = s.get(url=nexturl).text
        html = etree.HTML(r)
        # The onclick attributes hold javascript redirects to each company page.
        href = html.xpath('/html/body/div/@onclick')
        url3.extend(href)

    # Strip the javascript wrapper to recover bare company URLs, de-duplicating
    # as we go.  (The old code named this loop variable `re`, shadowing the
    # regex module.)
    cleaned = set()
    for onclick in url3:
        cleaned.add(onclick.replace('javascript:location.href=', '').replace("'", ''))
    url4 = cleaned

    for h in url4:
        time.sleep(0.5)  # be gentle on the remote server
        try:
            r = s.get(url=h).text
            html = etree.HTML(r)
            telnum = html.xpath('/html/body/div[4]/div[1]/div[1]/div[2]/div[4]/span/text()')[0]
        except Exception:
            # One bad page no longer aborts the whole crawl (the old bare
            # `except:` wrapped the entire loop).
            print('发生异常,跳过')
            continue
        telnumlist.append(telnum)
        print(telnum)

    # De-duplicate while preserving order, dropping landline numbers
    # (the '07**-' area-code prefix marks a landline).
    newlist = []
    seen = set()
    for tel in telnumlist:
        if tel not in seen and '07**-' not in tel:
            seen.add(tel)
            newlist.append(tel)

    # Open the output file once instead of once per number.
    try:
        with open(f'{down_path}/telnum.txt', 'a') as f:
            for tel in newlist:
                f.write(tel + '\n')
                print(tel, '写入成功!')
    except OSError:
        # The old handler referenced `tel`, which is unbound when open() fails.
        print('写入失败!')


def cookies_read():
    """Load previously saved login cookies from 'cookies.txt'.

    Returns a CookieJar suitable for attaching to a requests session.
    Raises FileNotFoundError if the cookie file does not exist — the
    caller uses that to trigger a fresh sign-in.
    """
    # `with` guarantees the file handle is closed (the old code leaked it).
    with open('cookies.txt', 'r') as cookies_txt:
        cookies_dict = json.loads(cookies_txt.read())
    return requests.utils.cookiejar_from_dict(cookies_dict)

def sign_in():
    """Log in to the job site and persist the session cookies to 'cookies.txt'."""
    url = 'https://m.*****.com/index.php?m=mobile&c=members&a=login'

    headers = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
        'X-Requested-With': 'XMLHttpRequest'
    }
    data = {
        'username': '*****',  # login credentials redacted
        'password': '*****',
        'expire': '1'
    }
    session = requests.session()
    # NOTE: the original called session.headers.update() with no arguments
    # (a no-op); the real headers are passed per-request below.
    session.post(url=url, data=data, headers=headers, allow_redirects=True)
    cookies_dict = requests.utils.dict_from_cookiejar(session.cookies)
    # `with` closes the file even if the dump fails part-way.
    with open('cookies.txt', 'w') as f:
        f.write(json.dumps(cookies_dict))

def sign_2in():
    """Fetch the job-listing index page and return the last page number.

    Returns the page number as a string, extracted from the pagination
    link's `page=` query parameter.  Uses the module-level session `s`.
    Raises IndexError if the pagination link is missing or malformed.
    """
    url2 = 'https://m.*****.com/index.php?m=Mobile&c=Jobs&a=index'
    r = s.get(url=url2).text
    html = etree.HTML(r)
    lastpage = html.xpath('/html/body/div[44]/a[7]/@href')[0]
    # Raw string for the regex (the non-raw '\d' warns on modern Python),
    # and \d+ instead of \d{3}: the old pattern silently truncated page
    # counts with 4+ digits (e.g. captured '120' from 'page=1200').
    lastpage = re.findall(r'page=(\d+)', lastpage)[0]
    return lastpage
if __name__ == '__main__':

    s = requests.session()
    # Reuse saved cookies when available; only sign in when they are missing.
    # (The original signed in unconditionally a second time, and after a
    # fresh sign-in never loaded the new cookies into the session.)
    try:
        s.cookies = cookies_read()
    except FileNotFoundError:
        sign_in()
        s.cookies = cookies_read()
    # Pass the *result* of sign_2in(); the original passed the function
    # object itself and discarded the actual last-page number.
    nextpage(sign_2in())

最后修改:2022 年 12 月 05 日
如果觉得我的文章对你有用,请随意赞赏