51job Scraper
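
A simple 51job scraper: it fetches the search-result pages for a given keyword, extracts each listing (title, company, location, salary, posting date) with lxml XPath queries, and appends the rows to a CSV file named after the keyword.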

import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_html(page, keyword, header):
    # Fetch one page of search results; returns the HTML text, or None on failure.
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    response = requests.get(url, verify=False, headers=header)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    return None


def parse_html(html):
    data = etree.HTML(html)
    table_list = data.xpath("//div[@class='dw_table']/div[@class='el']")
    datas = []
    for info in table_list:
        name = info.xpath('p//a/text()')                 # job title
        comn = info.xpath('span/a/text()')               # company name
        region = info.xpath('span[@class="t3"]/text()')  # location
        salary = info.xpath('span[@class="t4"]/text()')  # salary
        times = info.xpath('span[@class="t5"]/text()')   # posting date
        if not name:  # skip the table header row and empty entries
            continue
        rest = [name[0].strip(), comn[0].strip(), region[0].strip()]
        # Some listings omit the salary; '面议' means "negotiable".
        rest.append(salary[0].strip() if salary else '面议')
        rest.append(times[0].strip())
        print(*rest)
        datas.append(rest)
    return datas


def save_data(data, filename='python'):
    # Append rows to <filename>.csv; 'a' mode lets successive pages accumulate.
    # Use encoding='utf-8-sig' instead if the file will be opened in Excel.
    with open(f'{filename}.csv', 'a', encoding='utf-8', newline='') as f:
        wo = csv.writer(f)
        wo.writerows(data)



if __name__ == '__main__':
    """
    
    """
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        "Referer": "https://www.51job.com/",
        "User-Agent": ua.random
    }
    keyword = input('Enter the job keyword to search: ')
    page = int(input('Enter the number of pages to fetch: '))
    for i in range(1, page + 1):
        print(f'Scraping page {i}')
        html = get_html(i, keyword, header)
        time.sleep(1.1)  # pause between requests to avoid hammering the server
        if html:
            datas = parse_html(html)
            save_data(datas, keyword)
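
save_data writes rows in append mode with no header, so the column names live only in parse_html's ordering. As a quick sanity check, the output can be read back with the csv module. This is a minimal sketch, assuming a prior run with the keyword python produced python.csv; the column names here are illustrative, not part of the original script:

import csv

# Column order matches what parse_html produces.
columns = ['title', 'company', 'location', 'salary', 'posted']

with open('python.csv', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        if len(row) == len(columns):  # skip any malformed rows
            record = dict(zip(columns, row))
            print(record['title'], '|', record['salary'])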
