51job Scraper
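
A simple 51job scraper: it fetches the search-result pages for a given keyword, extracts each listing (title, company, location, salary, posting date) with lxml XPath queries, and appends the rows to a CSV file named after the keyword.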

import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_html(page, keyword, header):
    # Fetch one page of search results; returns the HTML text, or None on failure.
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    response = requests.get(url, verify=False, headers=header)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    return None


def parse_html(html):
    data = etree.HTML(html)
    table_list = data.xpath("//div[@class='dw_table']/div[@class='el']")
    datas = []
    for info in table_list:
        name = info.xpath('p//a/text()')                 # job title
        comn = info.xpath('span/a/text()')               # company name
        region = info.xpath('span[@class="t3"]/text()')  # location
        salary = info.xpath('span[@class="t4"]/text()')  # salary
        times = info.xpath('span[@class="t5"]/text()')   # posting date
        if not name:  # skip the table header row and empty entries
            continue
        rest = [name[0].strip(), comn[0].strip(), region[0].strip()]
        # Some listings omit the salary; '面议' means "negotiable".
        rest.append(salary[0].strip() if salary else '面议')
        rest.append(times[0].strip())
        print(*rest)
        datas.append(rest)
    return datas


def save_data(data, filename='python'):
    # Append rows to <filename>.csv; 'a' mode lets successive pages accumulate.
    # Use encoding='utf-8-sig' instead if the file will be opened in Excel.
    with open(f'{filename}.csv', 'a', encoding='utf-8', newline='') as f:
        wo = csv.writer(f)
        wo.writerows(data)



if __name__ == '__main__':
    """
    
    """
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        "Referer": "https://www.51job.com/",
        "User-Agent": ua.random
    }
    keyword = input('Enter the job keyword to search: ')
    page = int(input('Enter the number of pages to fetch: '))
    for i in range(1, page + 1):
        print(f'Scraping page {i}')
        html = get_html(i, keyword, header)
        time.sleep(1.1)  # pause between requests to avoid hammering the server
        if html:
            datas = parse_html(html)
            save_data(datas, keyword)
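
save_data writes rows in append mode with no header, so the column names live only in parse_html's ordering. As a quick sanity check, the output can be read back with the csv module. This is a minimal sketch, assuming a prior run with the keyword python produced python.csv; the column names here are illustrative, not part of the original script:

import csv

# Column order matches what parse_html produces.
columns = ['title', 'company', 'location', 'salary', 'posted']

with open('python.csv', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        if len(row) == len(columns):  # skip any malformed rows
            record = dict(zip(columns, row))
            print(record['title'], '|', record['salary'])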
