# 目录 (table of contents)
# 引入包 (package imports)
from selenium import webdriver
from selenium.webdriver.common.by import By
# WebDriverWait: polls repeatedly until a wait condition is satisfied
from selenium.webdriver.support.ui import WebDriverWait
# expected_conditions: predicate helpers used as WebDriverWait conditions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from lxml import etree
import pandas as pd
import time
import os
import re
from pymongo import *
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib.pyplot import MultipleLocator
import wordcloud
import jieba
import imageio
import csv
import re
# 模拟登陆京东网 — simulated login to the JD recruitment site
def login(url, user, password):
    """Log in to the JD recruitment site and filter jobs to the R&D category.

    Uses the module-level ``driver`` and ``wait`` objects.  The slider
    CAPTCHA must be solved manually within the 8-second sleep window.

    Parameters
    ----------
    url : str
        The recruitment site entry URL.
    user, password : str
        Account credentials typed into the login form.

    Returns
    -------
    str
        Page source of the first R&D job-listing page.
    """
    driver.get(url)
    driver.maximize_window()
    # Open the login page.
    # BUG FIX: find_element_by_* was removed in Selenium 4; use find_element(By...).
    driver.find_element(By.LINK_TEXT, '登录').click()
    # Switch to account/password login.
    driver.find_element(By.LINK_TEXT, '账户登录').click()
    # Fill in the credentials and submit.
    driver.find_element(By.ID, 'loginname').send_keys(user)
    driver.find_element(By.ID, 'nloginpwd').send_keys(password)
    driver.find_element(By.ID, 'loginsubmit').click()
    # Solve the slider CAPTCHA manually during this pause.
    time.sleep(8)
    # After the page switches, open the category drop-down and pick R&D.
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//i[@class="arrow-down"]')))
    search_btn.click()
    driver.find_element(By.ID, 'YANFA').click()
    # Trigger the search.
    driver.find_element(By.LINK_TEXT, '搜索').click()
    time.sleep(8)
    # The R&D filter sometimes does not stick after navigation; re-apply it.
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@class="suggess-sel"]/i')))
    search_btn.click()
    driver.find_element(By.XPATH, '//input[@value="YANFA"]').click()
    search_btn.click()
    time.sleep(5)
    return driver.page_source
# Fetch the source of every page after page 1 by clicking the "next page" button
def get_next_page():
    """Advance to the next result page and return its page source."""
    # Scroll to the bottom so the pagination control becomes visible.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    # Wait until the "next" button is clickable, then click it.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.next'))).click()
    # Give the new page time to render before the source is read.
    time.sleep(5)
    return driver.page_source
# 利用XPath对网页源代码进行解析 — parse the page source with XPath
def parse_page(html, jobs_per_page=10):
    """Parse one job-listing page into a DataFrame.

    Parameters
    ----------
    html : str
        Page source of one result page.
    jobs_per_page : int
        Number of job rows shown per page (the site shows 10); kept as a
        defaulted parameter for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        Columns: 职位名称, 职位类别, 工作地点, 发布时间, 岗位描述, 任职要求.
    """
    dom = etree.HTML(html)
    # Expand every drop-down in the live browser so the details show.
    # NOTE(review): `dom` is parsed from the `html` snapshot taken BEFORE these
    # clicks, so the clicks cannot affect `dom` — the detail text is assumed to
    # already be present (hidden) in the snapshot; confirm against the site.
    # BUG FIX: find_element_by_xpath was removed in Selenium 4.
    for i in range(1, jobs_per_page + 1):
        driver.find_element(
            By.XPATH,
            '//div[@class="line"][{}]/div/span/b[@class="drop-down"]'.format(i),
        ).click()
    # Job title
    careers_name = dom.xpath('//div[@class="info"]/span[1]/text()')
    # Job category
    careers_class = dom.xpath('//span[@class="sel"][1]/text()')
    # Work location
    careers_place = dom.xpath('//span[@class="sel"][2]/text()')
    # Publication date
    published_time = dom.xpath('//div[@class="info"]/span[4]/text()')
    # Job description: XPath string() concatenates all child text of the node.
    published_info = []
    for p in range(1, jobs_per_page + 1):
        info = dom.xpath('string(//div[@class="line"][{}]/div[@class="detail"]/div[@class="par"][1])'.format(p))
        # Strip the leading "岗位描述:" label via regex (re.S spans newlines).
        published_info.append(re.findall(r"岗位描述:(.*)", info, re.S)[0])
    # Job requirements: same structure as the description.
    careers_requirement = []
    for p in range(1, jobs_per_page + 1):
        requirement = dom.xpath('string(//div[@class="line"][{}]/div[@class="detail"]/div[@class="par"][2])'.format(p))
        careers_requirement.append(re.findall(r"任职要求:(.*)", requirement, re.S)[0])
    return pd.DataFrame({
        '职位名称': careers_name,
        '职位类别': careers_class,
        '工作地点': careers_place,
        '发布时间': published_time,
        '岗位描述': published_info,
        '任职要求': careers_requirement,
    })
# 数据清洗 — data cleaning
def data_cleaning(data):
    """Clean the scraped job DataFrame.

    Drops duplicate rows, fills missing values with "无", box-plots outliers
    of any numeric columns, and sorts by publication date (newest first).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw concatenated postings.

    Returns
    -------
    pandas.DataFrame
        The cleaned, sorted frame (the input frame is not mutated).
    """
    # Shape before cleaning (rows, columns).
    # BUG FIX: previously printed the module-global `result`, not the parameter.
    print(data.shape)
    # BUG FIX: drop_duplicates()/sort_values() return new frames; the original
    # discarded both results, so neither operation had any effect.
    data = data.drop_duplicates()
    data = data.fillna("无")
    # Outlier inspection only makes sense for numeric columns; the scraped
    # fields are all strings, so this section is usually skipped entirely.
    numeric = data.select_dtypes(include='number')
    if not numeric.empty:
        plt.rcParams['font.sans-serif'] = ['SimHei']   # render CJK labels
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure()
        p = numeric.boxplot(return_type='dict')
        x = p['fliers'][0].get_xdata()
        y = p['fliers'][0].get_ydata()
        y.sort()
        print(x)
        print(y)
        # Annotate each flier point; xytext nudges the label slightly right.
        for i in range(len(x)):
            plt.annotate(y[i], xy=(x[i], y[i]), xytext=(x[i] + 0.05, y[i]))
        plt.show()
    # Newest postings first.
    data = data.sort_values("发布时间", ascending=False)
    return data
# 每页数据以追加形式保存至csv文件 — append each page's data to the CSV file
def save_file(data, filename='./../data/京东研发类招聘信息.csv'):
    """Append one page of job data to a CSV file.

    The header row is written only when the file does not exist yet, so
    repeated calls accumulate rows under a single header.

    Parameters
    ----------
    data : pandas.DataFrame
        Postings containing the six expected columns.
    filename : str
        Target CSV path; kept as a default for backward compatibility.
    """
    columns = ['职位名称', '职位类别', '工作地点', '发布时间', '岗位描述', '任职要求']
    # utf_8_sig writes a BOM so Excel detects the Chinese text encoding.
    write_header = not os.path.exists(filename)
    data.to_csv(filename, mode='a', encoding='utf_8_sig',
                columns=columns, index=False, header=write_header)
    print("csv保存成功!")
# 保存数据到MongoDB数据库 — save the data to MongoDB as a list of per-row dicts
def save_to_mongo(data):
    """Persist the DataFrame to MongoDB, one document per row.

    Relies on the module-level ``collection``.  Failures are printed but
    never raised: persistence here is deliberately best-effort.
    """
    try:
        # records orientation -> [{column: value, ...}, ...] for insert_many.
        records = data.to_dict(orient='records')
        collection.insert_many(records)
    except Exception as e:
        print(e)
    else:
        print('Saved to Mongo')
# 数据分析与可视化 — data analysis and visualization
def _top_token_counts(series, k=5):
    """Split each cell on spaces, count tokens, return top-k (labels, counts)."""
    counts = Counter()
    for tokens in series.apply(lambda s: s.split(" ")):
        counts.update(tokens)
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return [kv[0] for kv in ranked], [kv[1] for kv in ranked]


def _save_wordcloud(data_path, column_index, mask_path, out_path):
    """Build a word cloud from one CSV column and save/show it."""
    mask = imageio.imread(mask_path)
    text = ''
    with open(data_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # BUG FIX: skip the header row, which used to leak the column
        # label (e.g. "岗位描述") into the word cloud text.
        next(reader, None)
        for row in reader:
            text += ''.join(row[column_index])
    # Segment the Chinese text with jieba; WordCloud expects space-separated words.
    txt = " ".join(jieba.lcut(text))
    wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\微软雅黑\msyhbd.ttc',
                             mask=mask,
                             width=1000, height=700,
                             background_color='white',
                             )
    wc.generate(txt)
    wc.to_file(out_path)
    wc.to_image().show()


def analysis_visualization():
    """Visualize the scraped data.

    Produces a bar chart of top job-title tokens, a pie chart of work
    locations, a line chart of top publication dates, and two word clouds
    (descriptions, requirements) built from the saved CSV.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']   # render CJK labels
    plt.rcParams['axes.unicode_minus'] = False
    data_path = "./../data/京东研发类招聘信息.csv"
    train_data = pd.read_csv(data_path)

    # --- Bar chart: top-5 job-title tokens ---
    x, y = _top_token_counts(train_data["职位名称"])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(1))  # one tick per category
    plt.bar(x, y)
    plt.xlabel('职位名称')
    plt.ylabel('数目')
    plt.title('职位名称')
    for i in range(len(x)):  # value labels above the bars
        plt.text(x[i], y[i], y[i])
    plt.show()

    # --- Pie chart: work locations ---
    num = train_data['工作地点'].value_counts()
    plt.figure(figsize=(8, 8))
    plt.pie(num, autopct='%.2f%%', labels=num.index)
    plt.axis("equal")
    plt.title('工作地点')
    plt.show()

    # --- Line chart: top-5 publication dates ---
    x, y = _top_token_counts(train_data["发布时间"])
    plt.plot(x, y)
    plt.title('发布时间')
    plt.xlabel('发布时间')
    plt.ylabel('数目')
    plt.show()

    # --- Word clouds: descriptions (col 4) and requirements (col 5) ---
    _save_wordcloud(data_path, 4, './../img/京东岗位描述原图.jpg', './../img/岗位描述.png')
    _save_wordcloud(data_path, 5, './../img/京东任职要求原图.jpg', './../img/任职要求.png')
# 总函数 — main entry point
if __name__ == '__main__':
    # Credentials are prompted interactively.
    user = input("请输入账号:")
    password = input("请输入密码:")
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    url = "http://zhaopin.jd.com/"
    # MongoDB connection (default localhost), database "jingdong",
    # collection "zhaopin".
    client = MongoClient()
    db = client.jingdong
    collection = db.zhaopin
    try:
        # Log in and land on the recruitment page filtered to R&D jobs.
        html = login(url, user, password)
        pages = []
        for i in range(1, 75):
            # Page 1 comes from login(); later pages need a "next" click.
            if i != 1:
                html = get_next_page()
            pages.append(parse_page(html))
            print("第{}页爬取完成!".format(i))
        # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.x
        # (and was quadratic when called per page); concatenate once instead.
        result = pd.concat(pages, ignore_index=True)
        # Clean, then persist to CSV and MongoDB, then visualize.
        result = data_cleaning(result)
        save_file(result)
        save_to_mongo(result)
        analysis_visualization()
    finally:
        # BUG FIX: the browser was only closed on full success; quit() also
        # shuts down the chromedriver process, unlike close().
        driver.quit()