from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import csv
def request_data(first_page=1, last_page=171, delay=1.0):
    """Fetch 51job search-result pages for the query "数据分析" with Selenium.

    Parameters
    ----------
    first_page : int
        First results page to fetch (default 1, matching the original run).
    last_page : int
        Last results page to fetch, inclusive (default 171, matching the
        original hard-coded ``range(1, 172)``).
    delay : float
        Seconds to wait after each page load so the page can render
        before the HTML is captured (default 1.0).

    Returns
    -------
    list[str]
        The raw ``page_source`` HTML of every fetched page, in order.

    NOTE(review): the browser window is deliberately left open afterwards
    (``detach`` option) and is never quit here.
    """
    # Keep the browser window open after the script finishes.
    option = webdriver.ChromeOptions()
    option.add_experimental_option("detach", True)
    browser = webdriver.Chrome(options=option)

    data_all = []
    for i in range(first_page, last_page + 1):
        browser.get(f'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{i}.html')
        # Crude render wait; the result list is injected by JavaScript.
        time.sleep(delay)
        data_all.append(browser.page_source)
    return data_all
def analysis_data(data):
    """Parse job postings out of raw 51job search-result HTML pages.

    Parameters
    ----------
    data : list[str]
        Raw HTML of each results page (as returned by ``request_data``).

    Returns
    -------
    list[list[list[str]]]
        One list per page; each inner entry is one posting:
        [title, publish time, company, salary, requirements,
         company scale, business scope, detail link].
        Missing fields come back as '' instead of crashing.
    """
    def _text(node, selector):
        # None-safe field extraction: sponsored/ad cards can lack fields,
        # and .get_text() (unlike the original .string) also works on
        # elements that contain child tags, where .string returns None.
        found = node.select_one(selector)
        return found.get_text(strip=True) if found is not None else ''

    finally_data = []
    for page_html in data:
        html = bs(page_html, 'lxml')
        all_info = []
        for item in html.select('.j_joblist>.e'):
            # Job title
            post = _text(item, '.t>.jname.at')
            # Publish time
            announce_time = _text(item, '.t>.time')
            # Company name
            company = _text(item, '.cname.at')
            # Salary
            wages = _text(item, '.sal')
            # Requirements (location / experience / education)
            ask = _text(item, '.d.at')
            # Company scale
            scale = _text(item, '.dc.at')
            # Business scope
            business = _text(item, '.int.at')
            # Detail-page link; guard both the element and the attribute
            # so one malformed card cannot abort the whole run.
            link_node = item.select_one('.el')
            postlink = link_node.attrs.get('href', '') if link_node is not None else ''
            all_info.append([post, announce_time, company, wages,
                             ask, scale, business, postlink])
        finally_data.append(all_info)
    return finally_data
if __name__ == '__main__':
    # Scrape every results page, then parse each page into rows.
    parsed_pages = analysis_data(request_data())
    header = ['招聘岗位', '发布时间', '公司名称', '工资',
              '招聘条件', '公司规模', '经营范围', '职位连接']
    with open('./51job.csv', 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        # Flatten the per-page grouping into a single stream of rows.
        out.writerows(row for page in parsed_pages for row in page)