[python爬虫]selenium模拟登录京东招聘网,爬取研发类,数据清洗,数据存储,最终数据分析和可视化

目录

引入包

模拟登陆京东网

利用XPath对网页源代码进行解析

数据清洗

每页数据以追加形式保存至csv文件

保存数据到MongoDB数据库,参数为字典组成的列表

数据分析与可视化

总函数


引入包

from selenium import webdriver
from selenium.webdriver.common.by import By
#WebDriverWait类,负责循环等待
from selenium.webdriver.support.ui import WebDriverWait
#expected_conditions类,负责条件触发
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from lxml import etree
import pandas as pd
import time
import os
import re
from pymongo import *
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib.pyplot import MultipleLocator
import wordcloud
import jieba
import imageio
import csv
import re

模拟登陆京东网

def login(url, user, password):
    """Log in to the JD recruitment site and filter listings to the R&D category.

    Relies on the module-level globals ``driver`` (selenium WebDriver) and
    ``wait`` (WebDriverWait) created in the ``__main__`` block.

    Args:
        url: Landing-page URL of the JD recruitment site.
        user: Account name typed into the login form.
        password: Password typed into the login form.

    Returns:
        Page source (HTML string) of the first page of R&D job listings.
    """
    driver.get(url)
    driver.maximize_window()
    # Open the login page
    driver.find_element_by_link_text('登录').click()
    # Choose the account-login tab (as opposed to QR-code login)
    driver.find_element_by_link_text('账户登录').click()
    # Fill in the account and password fields, then submit
    driver.find_element_by_id('loginname').send_keys(user)
    driver.find_element_by_id('nloginpwd').send_keys(password)
    driver.find_element_by_id('loginsubmit').click()
    # A slider CAPTCHA appears here and must be solved manually by the user

    time.sleep(8)  # time for the human to solve the CAPTCHA and the page to load
    # After the page switches, open the job-category dropdown and pick "研发类" (R&D)
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//i[@class="arrow-down"]')))
    search_btn.click()
    driver.find_element_by_id('YANFA').click()
    # Click the search button
    driver.find_element_by_link_text('搜索').click()
    time.sleep(8)
    # Now on the JD R&D job-listing page.
    # The category filter sometimes does not stick after the redirect,
    # so re-select it once more to be safe.
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@class="suggess-sel"]/i')))
    search_btn.click()
    driver.find_element_by_xpath('//input[@value="YANFA"]').click()
    search_btn.click()
    time.sleep(5)
    return driver.page_source


# Reach every page after the first by clicking the "next page" button.
def get_next_page():
    """Advance the browser to the next listing page and return its source.

    Uses the module-level ``driver`` and ``wait`` globals.
    """
    # Scroll to the bottom so the pagination controls become visible.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    # Wait for the "next" button to become clickable, then click it.
    next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.next')))
    next_button.click()
    # Give the next page time to render before the caller reads it.
    time.sleep(5)
    return driver.page_source

利用XPath对网页源代码进行解析

def parse_page(html, jobs_per_page=10):
    """Parse one listing page into a DataFrame of job postings.

    The description/requirement sections are hidden behind per-row
    drop-downs, so the rows are expanded in the live browser first, and
    the page source is then re-read. (Bug fix: the original parsed the
    ``html`` snapshot taken *before* expanding the rows, so the detail
    text could be missing from the parsed DOM.)

    Args:
        html: Page source captured by the caller; kept for interface
            compatibility (the live ``driver.page_source`` is preferred).
        jobs_per_page: Number of job rows per page (site default: 10;
            generalized from the old hard-coded constant).

    Returns:
        pandas.DataFrame with columns 职位名称, 职位类别, 工作地点,
        发布时间, 岗位描述, 任职要求.
    """
    # Expand every row's drop-down so the detail sections exist in the DOM.
    for i in range(1, jobs_per_page + 1):
        driver.find_element_by_xpath('//div[@class="line"][{}]/div/span/b[@class="drop-down"]'.format(i)).click()
    # Re-read the page source AFTER expanding (see docstring); fall back to
    # the caller's snapshot if the live source is somehow unavailable.
    dom = etree.HTML(driver.page_source or html)
    # Job title
    careers_name = dom.xpath('//div[@class="info"]/span[1]/text()')
    # Job category
    careers_class = dom.xpath('//span[@class="sel"][1]/text()')
    # Work location
    careers_place = dom.xpath('//span[@class="sel"][2]/text()')
    # Publish date
    published_time = dom.xpath('//div[@class="info"]/span[4]/text()')
    # Job description: string(...) collects all text under the parent node.
    published_info = []
    for p in range(1, jobs_per_page + 1):
        info = dom.xpath('string(//div[@class="line"][{}]/div[@class="detail"]/div[@class="par"][1])'.format(p))
        # Strip the leading "岗位描述:" label; keep raw text if the label
        # is absent instead of crashing on findall(...)[0].
        matches = re.findall(r"岗位描述:(.*)", info, re.S)
        published_info.append(matches[0] if matches else info)
    # Job requirements: same treatment as the description.
    careers_requirement = []
    for p in range(1, jobs_per_page + 1):
        requirement = dom.xpath('string(//div[@class="line"][{}]/div[@class="detail"]/div[@class="par"][2])'.format(p))
        matches = re.findall(r"任职要求:(.*)", requirement, re.S)
        careers_requirement.append(matches[0] if matches else requirement)
    data = pd.DataFrame({
        '职位名称': careers_name,
        '职位类别': careers_class,
        '工作地点': careers_place,
        '发布时间': published_time,
        '岗位描述': published_info,
        '任职要求': careers_requirement
    })
    return data

数据清洗

def data_cleaning(data):
    """Clean the scraped job DataFrame.

    Drops duplicate rows, fills missing values, inspects numeric outliers
    with an annotated boxplot, and sorts by publish date (newest first).

    Args:
        data: DataFrame of scraped jobs.

    Returns:
        The cleaned, sorted DataFrame.
    """
    # Shape of the raw data. (Bug fix: the original printed the global
    # `result` instead of the `data` parameter — NameError if called alone.)
    print(data.shape)
    # Remove duplicate rows. (Bug fix: the original discarded the return
    # value of drop_duplicates, so duplicates were never removed.)
    data = data.drop_duplicates()
    # Fill missing values with a placeholder ("无" = "none").
    data = data.fillna("无")
    # Outlier inspection via boxplot — only meaningful when at least one
    # numeric column exists (the scraped columns are all text, in which
    # case DataFrame.boxplot would raise).
    if not data.select_dtypes(include='number').empty:
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure()  # new figure for the boxplot
        p = data.boxplot(return_type='dict')
        x = p['fliers'][0].get_xdata()
        y = p['fliers'][0].get_ydata()
        y.sort()
        print(x)
        print(y)
        # Annotate each outlier: y[i] is the label, xy the point,
        # xytext a slightly offset label position.
        for i in range(len(x)):
            plt.annotate(y[i], xy=(x[i], y[i]), xytext=(x[i] + 0.05, y[i]))
        plt.show()  # display the boxplot
    # Sort by publish date descending. (Bug fix: the original discarded
    # the return value of sort_values, so the data was never sorted.)
    data = data.sort_values("发布时间", ascending=False)
    return data

每页数据以追加形式保存至csv文件

# Append each page's data to a CSV file.
def save_file(data, filename='./../data/京东研发类招聘信息.csv'):
    """Append one page of job data to a CSV file.

    The header row is written only when the file does not exist yet, so
    repeated calls append rows without duplicating the header.

    Args:
        data: DataFrame of scraped jobs (one page).
        filename: Target CSV path (generalized from the old hard-coded
            path; the default preserves the original behavior).
    """
    columns = ['职位名称', '职位类别', '工作地点', '发布时间', '岗位描述', '任职要求']
    # Ensure the target directory exists (bug fix: to_csv fails with an
    # OSError if the ../data directory was never created).
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Write the header only on first creation; otherwise append rows only.
    write_header = not os.path.exists(filename)
    data.to_csv(filename, mode='a', encoding='utf_8_sig', columns=columns, index=False, header=write_header)
    print("csv保存成功!")

保存数据到MongoDB数据库,参数为字典组成的列表

# Persist the scraped data to MongoDB (the DataFrame is converted to a
# list of per-row dicts before insertion).
def save_to_mongo(data):
    """Insert the DataFrame rows into the module-level Mongo ``collection``.

    Any failure is caught and printed rather than propagated.
    """
    try:
        # One dict per DataFrame row, keyed by column name.
        records = data.to_dict(orient='records')
        collection.insert_many(records)
    except Exception as e:
        print(e)
    else:
        print('Saved to Mongo')

数据分析与可视化

# Data analysis and visualization
def analysis_visualization():
    """Analyze the scraped CSV and produce charts and word clouds.

    Outputs: a bar chart of the 5 most common words in job titles, a pie
    chart of work locations, a line chart of publish dates, and word
    clouds for job descriptions and job requirements.
    """
    # Configure matplotlib so Chinese labels render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    data_path = "./../data/京东研发类招聘信息.csv"
    train_data = pd.read_csv(data_path)

    # --- Bar chart: top-5 words in job titles ---
    x, y = _top_word_counts(train_data["职位名称"], 5)
    ax = plt.gca()
    # Force one x-axis tick per category.
    ax.xaxis.set_major_locator(MultipleLocator(1))
    plt.bar(x, y)
    plt.xlabel('职位名称')
    plt.ylabel('数目')
    plt.title('职位名称')
    # Label each bar with its count.
    for i in range(len(x)):
        plt.text(x[i], y[i], y[i])
    plt.show()

    # --- Pie chart: distribution of work locations ---
    num = train_data['工作地点'].value_counts()
    plt.figure(figsize=(8, 8))  # square canvas for a round pie
    plt.pie(num, autopct='%.2f%%', labels=num.index)
    plt.axis("equal")
    plt.title('工作地点')
    plt.show()

    # --- Line chart: top-5 publish dates by posting count ---
    x, y = _top_word_counts(train_data["发布时间"], 5)
    plt.plot(x, y)
    plt.title('发布时间')
    plt.xlabel('发布时间')
    plt.ylabel('数目')
    plt.show()

    # --- Word clouds: descriptions (column 4) and requirements (column 5) ---
    _make_wordcloud(data_path, 4, './../img/京东岗位描述原图.jpg', './../img/岗位描述.png')
    _make_wordcloud(data_path, 5, './../img/京东任职要求原图.jpg', './../img/任职要求.png')


def _top_word_counts(series, top_n):
    """Split each cell on spaces, count word frequencies, and return the
    top ``top_n`` words and their counts as two parallel lists."""
    word_counts = Counter()
    for words in series.apply(lambda cell: cell.split(" ")):
        word_counts.update(words)
    top = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return [w for w, _ in top], [c for _, c in top]


def _make_wordcloud(data_path, column_index, mask_path, out_path):
    """Build a word cloud from one CSV column, save it, and display it.

    Args:
        data_path: CSV file to read.
        column_index: 0-based index of the column holding the text.
        mask_path: Image used as the word-cloud shape mask.
        out_path: Path the rendered PNG is written to.
    """
    mask_image = imageio.imread(mask_path)
    text = ''
    with open(data_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Bug fix: skip the header row so column names don't pollute the cloud.
        next(reader, None)
        for row in reader:
            text += row[column_index]
    # Tokenize the Chinese text; WordCloud expects space-separated words.
    segmented = " ".join(jieba.lcut(text))
    wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\微软雅黑\msyhbd.ttc',
                             mask=mask_image,
                             width=1000, height=700,
                             background_color='white',
                             )
    wc.generate(segmented)
    wc.to_file(out_path)
    wc.to_image().show()

 

 

  

总函数

if __name__ == '__main__':
    # Credentials are entered interactively at startup.
    user = input("请输入账号:")
    password = input("请输入密码:")
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    url = "http://zhaopin.jd.com/"
    # MongoDB storage: database `jingdong`, collection `zhaopin`.
    client = MongoClient()  # create the connection
    db = client.jingdong  # select the database
    collection = db.zhaopin  # select the collection
    # Log in, navigate to the recruitment page, and select the R&D category.
    html = login(url, user, password)
    # Collect each page's DataFrame and concatenate once at the end.
    # (Bug fix: DataFrame.append was deprecated and removed in pandas 2.x,
    # and appending inside the loop is quadratic; pd.concat on a list of
    # frames is the supported, linear approach.)
    pages = []
    for i in range(1, 75):
        # Any page after the first is reached via the "next" button.
        if i != 1:
            html = get_next_page()
        pages.append(parse_page(html))
        print("第{}页爬取完成!".format(i))
    result = pd.concat(pages, ignore_index=True)
    # Clean the data.
    result = data_cleaning(result)
    # Persist to CSV.
    save_file(result)
    # Persist to MongoDB.
    save_to_mongo(result)
    # Analyze and visualize.
    analysis_visualization()
    driver.close()

  • 3
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值