# Scrape parameters: target city and search keyword for the Lagou crawl.
city, kw = '合肥', '算法'
# `lagou_main` is defined elsewhere in the project; it returns the scraped
# job postings as a DataFrame.
dffromlagou = lagou_main(city=city, kw=kw)
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import math  # NOTE(review): `scipy.math` was removed in modern SciPy; stdlib `math` is imported later
from scipy import stats
#import seaborn as sns
#sns.set()
# 修改需要计算的几项为数值型 — convert the columns used in calculations to numeric
# Preview the scraped table, then coerce the salary and company-size
# columns from strings to numbers so they can be aggregated.
dffromlagou.head()
numeric_cols = ['工资低位', '工资高位', '公司规模低位']
dffromlagou[numeric_cols] = dffromlagou[numeric_cols].apply(pd.to_numeric)
# 验证是否有为0的薪酬项目 — check whether any salary entries parsed to zero
# Sanity check: show any rows whose low-end salary is 0 (bare expression so
# the notebook displays the result).
dffromlagou[dffromlagou.工资低位==0]
# 保存初始表单到本机/追加写入 — save/append the raw table to a local CSV
import os

# Append this run's rows to a per-city/keyword CSV in the working directory.
csv_path = os.path.join(os.getcwd(), '拉钩网{}{}.csv'.format(city, kw))
# Write the header only when the file does not exist yet; plain mode='a+'
# repeated the header row on every appended run.
dffromlagou.to_csv(csv_path, mode='a+', header=not os.path.exists(csv_path))
# 查看本次抓取数据行数 — number of rows scraped this run
# Rows scraped this run: `.count()` returns per-column non-null counts;
# select the first positionally.  The original `count()[0]` does an
# integer *label* lookup, which is deprecated on a string-indexed Series
# in modern pandas.
dffromlagou.count().iloc[0]
# 分组统计初探 — first look at grouped statistics
# Group the postings by district for the per-area statistics below.
grouped = dffromlagou.groupby(dffromlagou["位置"])
# 查看各行政区县的岗位数量 — posting counts per district
# Non-null 职位 (job title) count per district.
grouped["职位"].count()
# 转学历项为布尔值编码 — one-hot encode the education requirement
def getcodeforEducation(List):
    """One-hot encode an education-requirement collection.

    Returns a fixed-order 0/1 list over ['不限','大专','本科','硕士','博士']
    marking which levels appear in *List*.
    """
    education = ['不限', '大专', '本科', '硕士', '博士']
    # int(i in List) replaces the fragile `x and 1 or 0` ternary hack.
    return [int(i in List) for i in education]
# 统计学百分比函数 — value-frequency / percentage helper
# Demo: bachelor + master requirement encodes to [0, 0, 1, 1, 0]
# (displayed by the notebook).
getcodeforEducation(['本科', '硕士'])
def getCount(List):
    """Return [value, count, proportion] triples for each distinct value.

    A single Counter pass replaces the original per-distinct-value
    List.count() calls (O(n * distinct)).  Triples come out in
    first-occurrence order, which is deterministic (the original iterated
    a set, whose order varies across runs).
    """
    total = len(List)
    return [[value, freq, freq / total]
            for value, freq in Counter(List).items()]
# Demo: two '本科' and one '大专' out of three entries.
getCount(['本科','大专','本科'])
# 生成内存数据存储的hash链表 — build an in-memory hash map of per-company features
def dataSetDict(data):
    """Build a per-company feature map for the companies in *data*.

    Returns {company: [feature_list]} where feature_list concatenates:
    first district, first company-size-low, first salary-low, the education
    one-hot vector, the posting count, first last-login value, and the
    education frequency table.  The per-company subset is computed once
    instead of eight times (the original also keyed on `[c][0]`, which is
    just `c`).
    """
    result = {}
    for c in set(data):
        rows = dffromlagou[dffromlagou.公司 == c]
        degrees = rows['学历'].to_list()
        result[c] = [rows['位置'].to_list()[:1]
                     + rows['公司规模低位'].to_list()[:1]
                     + rows['工资低位'].to_list()[:1]
                     + getcodeforEducation(set(degrees))
                     + [len(degrees)]
                     + rows['企业最后登陆'].to_list()[:1]
                     + [getCount(degrees)]]
    return result
# Nested view: district -> company -> feature list (notebook display).
{i: dataSetDict(set(dffromlagou[dffromlagou.位置 == i].公司)) for i in set(dffromlagou.位置)}
# 查看区域分布的柱状图 — bar chart of the regional distribution
# pyecharts charting setup for inline Jupyter rendering.
from pyecharts.charts import Bar
from pyecharts.globals import CurrentConfig, NotebookType
# Tell pyecharts to emit Jupyter-notebook-compatible HTML.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
from pyecharts import options as opts
from pyecharts.faker import Faker
def P1():
    """Render a bar chart of posting counts per district (notebook HTML).

    Builds the district list once and derives the counts from it — the
    original constructed two independent `set()` objects and relied on
    them iterating in the same order.
    """
    districts = list(set(dffromlagou.位置.to_list()))
    counts = [dffromlagou[dffromlagou.位置 == d]['职位'].count() for d in districts]
    table = pd.DataFrame({'分布地区': districts, '数量': counts})
    c = (
        Bar()
        .add_xaxis(table.分布地区.to_list())
        .add_yaxis("岗位分布地区", table.数量.to_list(), category_gap=0, color="#778899")
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-35)),
            # NOTE(review): `samplingTime` is a global defined outside this view.
            title_opts=opts.TitleOpts(
                title="{}{}{}".format(samplingTime, city, "不同地区岗位数据分析岗位数量分布"),
                subtitle="None"))
    )
    return c.render_notebook()  # alt palette: #ca8622, #61a0a8
P1()
# 分区描述统计 — per-district descriptive statistics
import datetime
def statisticsTable():
    """Per-district descriptive statistics of the low-end salary.

    Returns a DataFrame indexed by district with mean/std/mode/median/
    min/max/range/count/share-of-total/sampling-time columns.
    """
    # Total row count, hoisted out of the per-group loop; .iloc[0] replaces
    # the deprecated integer-label lookup `count()[0]`.
    total = dffromlagou.count().iloc[0]
    income = {d: np.array(dffromlagou[dffromlagou.位置 == d].工资低位.to_list())
              for d in set(dffromlagou.位置.to_list())}
    groupingStatistics = {
        name: [v.mean(),
               v.std(),
               # Mode via bincount — assumes non-negative integer salaries; TODO confirm.
               np.argmax(np.bincount(v)),
               np.median(v),
               v.min(),
               v.max(),
               v.ptp(),
               v.shape[0],
               v.shape[0] / total,
               samplingTime]  # NOTE(review): global defined outside this view
        for name, v in income.items() if name is not None}
    result = pd.DataFrame(groupingStatistics).transpose()
    result.columns = ['均值', '标准差', '众数', '中位数', '最小值', '最大值',
                      '极差', '计数', '区域占比', '采样时间']
    return result
statisticsTable()
# 公司收入均值排名,倒序 — companies ranked by mean salary (heading says descending)
# Group postings by company; reused below for the count ranking too.
groupedm=dffromlagou.groupby(dffromlagou["公司"])
# Mean low-end salary per company.
# NOTE(review): the heading above says "倒序" (descending) but sort_values()
# defaults to ascending — confirm which order was intended.
groupedm.工资低位.mean().sort_values()
# 招聘数量均值排名倒序 — companies ranked by number of postings
# Top-10 companies by posting count (non-null 工资低位 entries per company),
# shown ascending; .tail(10) is the explicit form of the original [-10:].
groupedm.工资低位.count().sort_values().tail(10)
# 从百度地图api抓取招聘企业公司地址坐标 — geocode company addresses via the Baidu Maps API
import requests
import json
def getUrl(*address):
    '''
    Yield a Baidu Maps geocoding API URL for each address string.

    Quota (per the original notes): 300k queries/day, 160 concurrent/sec.
    Called with no addresses, the generator simply yields nothing.
    '''
    # TODO: supply your Baidu Maps API key.  The original line was a bare
    # `ak =` (a SyntaxError) where the key had been scrubbed.
    ak = ''
    for add in address:
        yield ('http://api.map.baidu.com/geocoding/v3/'
               '?address={inputAddress}&output=json&ak={myAk}'
               .format(inputAddress=add, myAk=ak))
def getPosition(url):
    '''Fetch *url* (Baidu geocoding endpoint) and return the location data.

    On success (JSON status == 0) returns a 6-tuple:
    (lat, lng, precise, confidence, comprehension, level).
    On failure prints a message and returns the integer status code —
    NOTE(review): callers must cope with both return shapes.
    '''
    res = requests.get(url)
    json_data = json.loads(res.text)
    if json_data['status'] == 0:
        lat = json_data['result']['location']['lat']  # latitude
        lng = json_data['result']['location']['lng']  # longitude
        precise = json_data['result']['precise']
        confidence = json_data['result']['confidence']
        comprehension = json_data['result']['comprehension']
        level = json_data['result']['level']
    else:
        print("Error output!")
        return json_data['status']
    return lat,lng,precise,confidence,comprehension,level
def getStrings(address):
    """Geocode each address in *address* and print the result (demo helper).

    NOTE(review): the printed labels 经度/纬度 are swapped relative to the
    lat/lng values — kept byte-identical for now; fix together with getDict.
    """
    for add in address:
        add_url = next(getUrl(add))  # first (only) URL for this address
        print(add_url)
        try:
            lat,lng,precise,confidence,comprehension,level = getPosition(add_url)
            print("地址:{0}|经度:{1}|纬度:{2}|精度:{3}|置信度:{4}|解释性:{5}|分类等级:{6}".format(add,lat,lng,precise,confidence,comprehension,level))
        except Exception as e:
            # Was `except Error` — `Error` is undefined, so any failure
            # raised NameError.  A failed lookup returns an int status,
            # making the tuple unpack above raise TypeError.
            print(e)
#getStrings()
def getDict(address):
    """Geocode every address and return {column_name: value_list} suitable
    for pd.DataFrame().

    NOTE(review): the column order pairs "经度" (longitude) with *latitude*
    values and "纬度" (latitude) with *longitudes*.  Kept as-is because the
    downstream plotting code compensates for the swap — fix both together.
    """
    column = ["地址","经度","纬度","精度","置信度","解释性","分类等级"]
    addressL, latL, lngL, preciseL, confidenceL, comprehensionL, levelL = \
        [], [], [], [], [], [], []
    for add in address:
        addressL.append(add)
        # next() replaces list(getUrl(add))[0]; the dead `n = -1` counter
        # from the original is gone.
        lat, lng, precise, confidence, comprehension, level = \
            getPosition(next(getUrl(add)))
        latL.append(lat)
        lngL.append(lng)
        preciseL.append(precise)
        confidenceL.append(confidence)
        comprehensionL.append(comprehension)
        levelL.append(level)
    return dict(zip(column, (addressL, latL, lngL, preciseL,
                             confidenceL, comprehensionL, levelL)))
# 查看地理坐标列表的dataframe — DataFrame of the geocoded coordinates
def cityTable():
    """Geocode every distinct "city + district + company" string and return
    the coordinates as a DataFrame."""
    # `x or ''` replaces the `x != None and x or ''` hack (identical for
    # None and for empty strings); the loop variable no longer shadows the
    # module-level `city`.
    checkaddress = list(set(
        ''.join(((ct or ''), (region or ''), (name or '')))
        for ct, region, name
        in zip(dffromlagou.城市, dffromlagou.位置, dffromlagou.公司全称)))
    return pd.DataFrame(getDict(checkaddress))
citytable = cityTable()
citytable
# 对本次抓取的公司薪酬做描述统计 — per-company salary descriptive statistics
def _salary_stats(s):
    """Descriptive statistics for one company's low-end-salary Series.

    Note: `s.mode()` returns a Series, stored as-is in the cell (matching
    the original behavior).
    """
    return [s.mean(), s.sum(), s.median(), s.mode(), s.count(),
            s.std(), s.min(), s.max(), s.max() - s.min()]
# One per-company subset instead of nine identical filters per company.
company = pd.DataFrame({i: _salary_stats(dffromlagou[dffromlagou.公司全称 == i].工资低位)
                        for i in dffromlagou.公司全称}).transpose()
company.columns = ['mean','sum','median','mode','count','std','min','max','ptp']
company
# 查看地理位置在坐标系的相对距离 — relative positions in a plain coordinate plot
# Quick scatter of the geocoded points in plain x/y space.
import plotly.express as px
# NOTE(review): the "经度" column actually holds latitudes (see getDict's
# column ordering) — axis labels are swapped, but the shape still plots.
fig = px.scatter(x=citytable.经度,y=citytable.纬度)
fig.show()
# 查看招聘企业整体的地理位置在地图上的分布 — plot the companies on a map
# Reference point: geocode "<city> railway station" for map centering.
KMpoint = getDict([city+'火车站'])
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
#token='pk.eyJ1Ijoic3l2aW5jZSIsImEiOiJjazZrNTcwY3kwMHBrM2txaGJqZWEzNWExIn0.tLQHY_OoiR2NMxnYHXUBAA'
#token = 'pk.eyJ1IjoiYmxhY2tzaGVlcHdhbGwwMzA1IiwiYSI6ImNrMHo5ZnQxYjBjbG8zbm84b3hrb25vb24ifQ.K8tcDjJDsPcjdYFTSVgTxw'
# SECURITY NOTE(review): hard-coded Mapbox access token (plus two old ones
# left in comments) — move to an environment variable and revoke these.
token = 'pk.eyJ1IjoibWFubWFuemhhbmciLCJhIjoiY2thMDRkbGM3MDh0aDNybjZpM3hyam5yOCJ9.TOaUXDGA4hzIoAowDRSOqw'
# Load previously-saved company coordinates (file written outside this view).
university = pd.read_csv("{}{}{}".format(os.getcwd(),'/','拉钩网公司地址临时.csv'))
# Scale confidence up so it spans a visible color range.
university.置信度 = university.置信度*1000
university.经度.mean(),university.纬度.mean()
px.set_mapbox_access_token(token)
# NOTE(review): lat is fed the "经度"(longitude-named) column and lon the
# "纬度" one — this compensates for getDict's swapped column labels.  If
# getDict is ever fixed, swap these arguments (and `center` below) back.
fig4 = px.scatter_mapbox(university
    , lat="经度"
    , lon="纬度"
    , color="置信度"
    , size="解释性"
    , color_continuous_scale=px.colors.cyclical.IceFire
    , size_max=6
    , zoom=9
    ,center = {'lon': KMpoint['纬度'][0], 'lat': KMpoint['经度'][0]})
fig4.show()
# 二级页面抓取招聘岗位详细信息 — scrape job details from the second-level pages
from lxml import etree
def getPage(Num,url):
    """Scrape one job-detail page and return a 9-tuple of its fields.

    NOTE(review): several oddities are preserved as-is —
    * `Num` is accepted but immediately overwritten with 1 (dead parameter);
    * `getHTML` is defined elsewhere in the project;
    * the comprehension iterates the *children* of the parsed document and
      pours the per-child tuples into a set, so `tuple(...)[0]` returns an
      arbitrary element whenever the tuples differ — confirm this is the
      intended de-duplication.
    """
    html= getHTML(url)
    advertising = '//*[@id="job_detail"]/dd[1]/p/text()'  # job advert text
    demand = '//*[@id="job_detail"]/dd[2]/div/p/text()'  # job requirements
    location = '//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()'  # detailed address
    staffPosition = '//*[@id="job_detail"]/dd[4]/div/div/span[1]/text()'  # recruiter's position
    employerInformation = '//*[@id="job_detail"]/dd[4]/div/div/a/span[1]/text()'  # company + recruiter name
    industry = '/html/body/div[6]/div/div[1]/dd/ul/li[1]/text()'  # industry
    jobDescriptions = '/html/body/div[6]/div/div[1]/dd/h3/span/text()'  # job-requirement heading
    refreshTime = '/html/body/div[6]/div/div[1]/dd/p/text()'  # refresh time
    Num = 1
    return tuple({(''.join(tree.xpath(employerInformation))  # company + recruiter name
                   ,''.join(tree.xpath(staffPosition)).replace(' ','')  # recruiter's position
                   ,''.join(tree.xpath(location))  # detailed address
                   ,''.join(tree.xpath(demand))  # job requirements
                   ,''.join(tree.xpath(advertising))  # job advert text
                   ,''.join(tree.xpath(industry))  # industry
                   ,''.join(tree.xpath(jobDescriptions))  # job-requirement heading
                   ,''.join(tree.xpath(refreshTime)).split('\xa0')[0]  # refresh time, trimmed
                   ,samplingTime  # global defined outside this view
                   )
                  for tree in etree.HTML(html)})[0]
# 开始抓取每页详细信息 — crawl the detail page for every posting
def getDetails():
    """Scrape the detail page of every posting.

    Returns {row_index: (detail_list,)} where detail_list is getPage's
    fields plus the company name and the URL.  Sleeps a random 0-9 s
    between requests to stay polite — previously this side effect was
    smuggled into a tuple expression sliced with [:-1]; it is now an
    explicit statement.
    """
    urls = dffromlagou.具体链接
    names = dffromlagou.公司
    details = {}
    for idx, name in enumerate(names):
        row = list(getPage(idx, urls[idx])) + [name] + [urls[idx]]
        time.sleep(np.random.randint(10))
        details[idx] = (row,)  # 1-tuple, matching the original [:-1] result
    return details
info = getDetails()
# 余弦相似度模块,对招聘岗位的相关度做验证 — cosine-similarity module for validating posting relevance
import jieba
import math
from scipy import spatial
import numpy as np
class cosine_similarity_of_text:
    """Cosine similarity between two strings tokenized by jieba (full mode).

    The score is exposed as ``self.fit``:

    * ``model='python'`` — pure-Python cosine *similarity* rounded to two
      decimals; 0.0 when either vector is all zeros (the original generator
      swallowed the ZeroDivisionError and then raised StopIteration at the
      ``__next__()`` call).
    * ``model='numpy'`` — ``scipy.spatial.distance.cosine``, i.e. cosine
      *distance* (1 - similarity).  NOTE(review): the two models therefore
      disagree; preserved from the original implementation.
    * the default ``model='fitnumpy'`` matches neither branch, so ``fit``
      is never set (also preserved — accessing it raises AttributeError).
    """

    def __init__(self, strings1=None, strings2=None, model='fitnumpy'):
        self.strings1, self.strings2 = strings1, strings2
        self.strings1_cut, self.strings2_cut = self.cut(strings1), self.cut(strings2)
        # Kept for interface compatibility; these lazy generators were never
        # consumed by the original code either.
        self.strings1_cut_code = self.cut_code(self.strings1_cut)
        self.strings2_cut_code = self.cut_code(self.strings2_cut)
        self.frequency_of_word1 = self.frequency_of_word(self.strings1_cut)
        self.frequency_of_word2 = self.frequency_of_word(self.strings2_cut)
        if model == 'numpy':
            self.fit = self.full_npcosine(np.array(self.frequency_of_word1),
                                          np.array(self.frequency_of_word2))
        elif model == 'python':
            self.fit = self.full_pycosine(self.frequency_of_word1,
                                          self.frequency_of_word2)

    def cut(self, strings):
        """Tokenize *strings* with jieba full mode, dropping empty tokens."""
        return [w for w in jieba.cut(strings, cut_all=True) if w != '']

    def word_set(self):
        """Union vocabulary of both token lists."""
        return set(self.strings1_cut) | set(self.strings2_cut)

    def word_dict(self):
        """Map each vocabulary word to its vector index."""
        return {word: idx for idx, word in enumerate(tuple(self.word_set()))}

    def cut_code(self, cut):
        """Yield the vector index of each token (lazy; unused internally)."""
        return (self.word_dict()[word] for word in cut)

    def frequency_of_word(self, string_cut, string_cut_code=None):
        """Return the term-frequency vector of *string_cut*.

        *string_cut_code* is accepted (and ignored) only for backward
        compatibility — the original two-argument version discarded it too.
        """
        vocab = self.word_dict()
        counts = [0] * len(vocab)
        for word in string_cut:
            counts[vocab[word]] += 1
        return counts

    def full_pycosine(self, vector1, vector2):
        """Pure-Python cosine similarity rounded to 2 decimals; 0.0 for a
        zero vector."""
        dot = sum(a * b for a, b in zip(vector1, vector2))
        norm1 = math.sqrt(sum(a * a for a in vector1))
        norm2 = math.sqrt(sum(b * b for b in vector2))
        try:
            return round(float(dot) / (norm1 * norm2), 2)
        except ZeroDivisionError:
            return 0.0

    def full_npcosine(self, vector1, vector2):
        """SciPy cosine *distance* (1 - similarity) — kept as in the original."""
        return spatial.distance.cosine(vector1, vector2)

    def __del__(self):
        pass
# 匹配要求语料库 — vocabulary corpus for requirement matching
# Keyword -> relevance vocabulary used to score each posting's requirements.
Vectors = {'算法':['算法','神经网络','CNN','RNN','DNN','BP','Matlab','python','Python','C++','聚类','回归','研发']
,'数据分析师':['分析','数据分析','业务理解','理解业务','python','excel','Excel','sql']
,'人力资源':['招聘','绩效','薪酬','劳动关系','人力资源规划','培训','KPI','培训与开发','人员配置']
,'招聘':['招聘工作','指标']}
# Comma-joined vocabulary for the current keyword; ','.join replaces the
# original ''.join([','+i ...])[1:] round-trip (identical output).
wordVector = ','.join(Vectors[kw])
wordVector
# 生成新的匹配要求的表单 — build the table of postings matched against the requirements
def recommended(wordVector, info):
    """Score each posting's requirement text against *wordVector* and return
    the postings DataFrame extended with a COS similarity column, filtered
    to rows with COS > 0.

    NOTE(review): in the comprehension below the `or False` is parenthesized
    *inside* the cosine constructor's first argument instead of guarding the
    whole `len(...) > 2 and ...` expression; a non-empty string is always
    truthy, so it is effectively a no-op — confirm the original intent.
    """
    objForCosine = [{info[i][0][-2]:[
        len(info[i][0][3]) > 2 and cosine_similarity_of_text(wordVector,info[i][0][3].replace('\xa0','') or False
        ,model='python'),info[i][0][-1]]}
        for i in range(len(info))]
    # (company, similarity-or-0, url) triples; a False entry means the
    # requirement text was too short to score.
    result = [(list(fbj)[0]
        ,fbj[list(fbj)[0]][0] != False and fbj[list(fbj)[0]][0].fit or 0
        ,fbj[list(fbj)[0]][-1])
        for fbj in objForCosine]
    dict_ = {i[0]:i[1:] for i in result}
    example = pd.DataFrame({'COS':[dict_[i][0] for i in dffromlagou.公司]})
    #dffromlagou.insert(len(dffromlagou.columns),'COS', example)
    # Show all columns
    pd.set_option('display.max_columns', None)
    # Show all rows
    pd.set_option('display.max_rows', None)
    # Display width per value: 100 (default 50).
    # NOTE(review): the key must be 'display.max_colwidth' in pandas >= 1.0.
    pd.set_option('max_colwidth',100)
    Table = pd.merge(dffromlagou, example, left_index=True, right_index=True, how='left')
    return Table[Table.COS>0]
Table = recommended(wordVector,info)
# 企业匹配要求的柱状图显示 — bar chart of per-company requirement relevance
def P2():
    """Render a bar chart of per-company relevance (COS) scores.

    The district count/name/table computation copied from P1 was dead code
    here — its results were never used — and has been removed.
    """
    c = (
        Bar()
        .add_xaxis(Table.公司.to_list())
        .add_yaxis("岗位相关度匹配", Table.COS.to_list(), category_gap=0, color="#61a0a8")
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-35)),
            # NOTE(review): `samplingTime` is a global defined outside this view.
            title_opts=opts.TitleOpts(
                title="{}{}{}".format(samplingTime, city, "岗位相关度匹配"),
                subtitle="None"))
    )
    return c.render_notebook()  # alt palette: #ca8622, #61a0a8
P2()
# 展示匹配列表Table — display the matched table (`Table`)