基于requests numpy pandas生成行业分布调查

city= '合肥'
kw = '算法'
dffromlagou = lagou_main(city= city,kw = kw)
import matplotlib.pyplot as plt
import numpy as np
from scipy import math
from scipy import stats
#import seaborn as sns
#sns.set()

修改需要计算的几项为数值型

dffromlagou.head()
dffromlagou[['工资低位','工资高位','公司规模低位']] = dffromlagou[['工资低位','工资高位','公司规模低位']].apply(pd.to_numeric)

验证是否有为0的薪酬项目

dffromlagou[dffromlagou.工资低位==0]

保存初始表单到本机/追加写入

import os
# Append this crawl to the per-city/keyword CSV in the working directory.
# The header row is written only when the file is first created; otherwise
# every appended run would insert another header line into the data.
csv_path = os.path.join(os.getcwd(), '{}{}{}{}'.format('拉钩网', city, kw, '.csv'))
dffromlagou.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path))

查看本次抓取数据行数

# Non-null count of the first column. `.iloc[0]` is explicitly positional;
# a bare `[0]` on a label-indexed Series is a deprecated positional fallback.
dffromlagou.count().iloc[0]

分组统计初探

grouped=dffromlagou.groupby(dffromlagou["位置"])

查看各行政区县的岗位数量

grouped.count().职位

转学历项为布尔值编码

def getcodeforEducation(List):
    '''Encode education requirements as a fixed-order 0/1 vector.

    Args:
        List: any container supporting ``in`` (list, set, ...) holding
            education labels.

    Returns:
        A list of five ints, one per level in the order
        不限, 大专, 本科, 硕士, 博士: 1 when the level is in ``List``, else 0.
    '''
    education = ['不限','大专','本科','硕士','博士']
    return [1 if level in List else 0 for level in education]

统计学百分比函数

getcodeforEducation(['本科', '硕士'])
def getCount(List):
    '''Return the frequency and share of every distinct value in ``List``.

    Args:
        List: iterable of hashable values with a length.

    Returns:
        A list of ``[value, count, count / len(List)]`` entries, one per
        distinct value, in order of first occurrence. An empty input gives
        an empty list.
    '''
    # Local import: the file has no top-level import of collections.
    from collections import Counter

    total = len(List)
    # One counting pass instead of calling List.count() per distinct value.
    return [[value, hits, hits / total] for value, hits in Counter(List).items()]
getCount(['本科','大专','本科'])

生成内存数据存储的hash链表

def dataSetDict(data):
    '''Summarise each company in ``data`` from the module-level ``dffromlagou``.

    Args:
        data: iterable of company names (values of the 公司 column).

    Returns:
        A dict mapping each company to a one-element list that wraps the
        summary row: first 位置, first 公司规模低位, first 工资低位, the
        five-slot education encoding, the number of postings, first
        企业最后登陆, and the ``getCount`` breakdown of 学历.
    '''
    summary = {}
    for c in set(data):
        # Filter once per company; the original repeated this mask seven times.
        rows = dffromlagou[dffromlagou.公司 == c]
        degrees = rows['学历'].to_list()
        summary[c] = [
            rows['位置'].to_list()[:1]
            + rows['公司规模低位'].to_list()[:1]
            + rows['工资低位'].to_list()[:1]
            + getcodeforEducation(set(degrees))
            + [len(degrees)]
            + rows['企业最后登陆'].to_list()[:1]
            + [getCount(degrees)]
        ]
    return summary
{i : dataSetDict(set(dffromlagou[dffromlagou.位置==i].公司)) for i in set(dffromlagou.位置)}

查看区域分布的柱状图

from pyecharts.charts import Bar
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
from pyecharts import options as opts
from pyecharts.faker import Faker


def P1():
    '''Render a notebook bar chart of posting counts per district (位置).'''
    regions = list(set(dffromlagou.位置.to_list()))
    counts = [
        dffromlagou[dffromlagou.位置 == region]['职位'].count()
        for region in regions
    ]
    table = pd.DataFrame({'分布地区': regions, '数量': counts})

    chart_title = "{}{}{}".format(samplingTime, city, "不同地区岗位数据分析岗位数量分布")
    x_axis = opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-35))
    title = opts.TitleOpts(title=chart_title, subtitle="None")

    chart = Bar()
    chart = chart.add_xaxis(table.分布地区.to_list())
    chart = chart.add_yaxis(
        "岗位分布地区",
        table.数量.to_list(),
        category_gap=0,
        color="#778899",
    )
    chart = chart.set_global_opts(xaxis_opts=x_axis, title_opts=title)
    return chart.render_notebook()
P1()

在这里插入图片描述
分区描述统计

import datetime

def statisticsTable():
    '''Build per-district descriptive statistics of the lower salary bound.

    Reads the module-level ``dffromlagou`` and ``samplingTime``.

    Returns:
        A DataFrame indexed by district with the columns 均值, 标准差, 众数,
        中位数, 最小值, 最大值, 极差, 计数, 区域占比, 采样时间. Districts that
        are ``None`` or match no rows (e.g. a NaN label) are skipped.
    '''
    # Denominator for the district share: non-null count of the first column.
    total = dffromlagou.count().iloc[0]
    income = {
        region: np.array(dffromlagou[dffromlagou.位置 == region].工资低位.to_list())
        for region in set(dffromlagou.位置.to_list())
    }
    groupingStatistics = {
        n: [
            v.mean(),
            v.std(),
            np.argmax(np.bincount(v)),  # mode; bincount needs non-negative ints
            np.median(v),
            v.min(),
            v.max(),
            np.ptp(v),  # ndarray.ptp() was removed in NumPy 2.0
            v.shape[0],
            v.shape[0] / total,
            samplingTime,
        ]
        for n, v in income.items()
        # Empty arrays would make min()/max() raise, so skip them.
        if n is not None and v.size
    }
    result = pd.DataFrame(groupingStatistics).transpose()
    result.columns=['均值','标准差','众数','中位数','最小值','最大值','极差','计数','区域占比','采样时间']
    return result

statisticsTable()

在这里插入图片描述
公司工资低位均值排名(升序)

groupedm=dffromlagou.groupby(dffromlagou["公司"])
groupedm.工资低位.mean().sort_values()

在这里插入图片描述
招聘数量排名(取岗位数最多的10家公司,升序排列)

groupedm.工资低位.count().sort_values()[-10:]

在这里插入图片描述
从百度地图api抓取招聘企业公司地址坐标

import requests
import json

def getUrl(*address):
    '''
    Yield the Baidu geocoding API URL for each address to look up.

    Quota noted by the author: at most 300k queries per day, at most
    160 concurrent requests per second.

    The API key is read from the ``BAIDU_MAP_AK`` environment variable
    (the published source left ``ak`` blank, which does not even parse).
    With no addresses the generator yields nothing.
    '''
    # Local imports keep this block self-contained.
    import os
    from urllib.parse import quote

    ak = os.environ.get('BAIDU_MAP_AK', '')
    for add in address:
        # Percent-encode the address so '&', '#', spaces etc. cannot break the query.
        url = 'http://api.map.baidu.com/geocoding/v3/?address={inputAddress}&output=json&ak={myAk}'.format(
            inputAddress=quote(str(add), safe=''), myAk=ak)
        yield url
            
def getPosition(url):
    '''Fetch ``url`` and return the geocoding result.

    Returns:
        On success (``status == 0``) the tuple
        ``(lat, lng, precise, confidence, comprehension, level)``, where
        ``lat`` is the latitude and ``lng`` the longitude. Otherwise prints
        an error marker and returns the API's ``status`` value.

    Raises:
        requests.RequestException: on network failure or timeout.
    '''
    # A timeout stops a stalled connection from blocking the batch forever.
    res = requests.get(url, timeout=10)
    json_data = json.loads(res.text)

    if json_data['status'] != 0:
        print("Error output!")
        return json_data['status']

    result = json_data['result']
    lat = result['location']['lat']  # latitude
    lng = result['location']['lng']  # longitude
    return lat, lng, result['precise'], result['confidence'], result['comprehension'], result['level']

def getStrings(address):
    '''Print the request URL and the geocoding result for each address.

    A failed lookup is printed and skipped, so one bad address does not
    stop the loop.
    '''
    for add in address:
        add_url = next(getUrl(add))
        print(add_url)
        try:
            lat,lng,precise,confidence,comprehension,level = getPosition(add_url)
        # getPosition returns a bare status int on API errors, so unpacking
        # raises TypeError. The original `except Error` named an undefined
        # class and itself raised NameError.
        except (TypeError, ValueError, KeyError, requests.RequestException) as e:
            print(e)
        else:
            # 经度 is longitude (lng) and 纬度 is latitude (lat); the original
            # printed them under each other's label.
            print("地址:{0}|经度:{1}|纬度:{2}|精度:{3}|置信度:{4}|解释性:{5}|分类等级:{6}".format(add,lng,lat,precise,confidence,comprehension,level))
#getStrings()

def getDict(address):
    '''Geocode every address and collect the results column-wise.

    Args:
        address: iterable of address strings.

    Returns:
        A dict of equal-length lists keyed by 地址, 经度, 纬度, 精度, 置信度,
        解释性, 分类等级. An address whose lookup does not return a result
        tuple keeps its row, with ``None`` in every field but 地址.

    NOTE(review): as in the original, "经度" holds ``lat`` and "纬度" holds
    ``lng``. The mapbox plotting code in this file reads the columns with
    the same swap, so the labels are deliberately left unchanged here.
    '''
    column = ["地址","经度","纬度","精度","置信度","解释性","分类等级"]
    table = {cn: [] for cn in column}
    for add in address:
        position = getPosition(next(getUrl(add)))
        if not isinstance(position, tuple):
            # getPosition returned an API status code instead of a result.
            position = (None,) * (len(column) - 1)
        for cn, value in zip(column, (add, *position)):
            table[cn].append(value)
    return table

查看地理坐标列表的dataframe

def cityTable():
    '''Geocode the distinct "city + district + company full name" addresses.

    Missing (None or empty) parts contribute an empty string. The result is
    the DataFrame built from ``getDict`` over the de-duplicated addresses.
    '''
    unique_addresses = set()
    rows = zip(dffromlagou.城市, dffromlagou.位置, dffromlagou.公司全称)
    for town, region, name in rows:
        parts = (town or '', region or '', name or '')
        unique_addresses.add(''.join(parts))
    checkaddress = list(unique_addresses)
    return pd.DataFrame(getDict(checkaddress))

citytable=cityTable()
citytable

对本次抓取的公司薪酬做描述统计

# Per-company descriptive statistics of the lower salary bound.
# `for wages in [...]` binds the filtered column once per company without
# leaking a helper name into module scope.
company = pd.DataFrame({
    i: [
        wages.mean(),
        wages.sum(),
        wages.median(),
        wages.mode(),
        wages.count(),
        wages.std(),
        wages.min(),
        wages.max(),
        wages.max() - wages.min(),
    ]
    for i in dffromlagou.公司全称
    for wages in [dffromlagou[dffromlagou.公司全称 == i].工资低位]
}).transpose()
company.columns = ['mean','sum','median','mode','count','std','min','max','ptp']
company

查看地理位置在坐标系的相对距离

import plotly.express as px
fig = px.scatter(x=citytable.经度,y=citytable.纬度)
fig.show()

在这里插入图片描述
查看招聘企业整体的地理位置在地图上的分布

# Geocode "<city>火车站" and use the result as the map centre.
KMpoint = getDict([city+'火车站'])
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
# NOTE(review): Mapbox access tokens are hard-coded below (two commented out,
# one live). Consider loading the token from configuration instead.
#token='pk.eyJ1Ijoic3l2aW5jZSIsImEiOiJjazZrNTcwY3kwMHBrM2txaGJqZWEzNWExIn0.tLQHY_OoiR2NMxnYHXUBAA'
#token = 'pk.eyJ1IjoiYmxhY2tzaGVlcHdhbGwwMzA1IiwiYSI6ImNrMHo5ZnQxYjBjbG8zbm84b3hrb25vb24ifQ.K8tcDjJDsPcjdYFTSVgTxw'
token = 'pk.eyJ1IjoibWFubWFuemhhbmciLCJhIjoiY2thMDRkbGM3MDh0aDNybjZpM3hyam5yOCJ9.TOaUXDGA4hzIoAowDRSOqw'
# Company address table read from a CSV in the current working directory.
university = pd.read_csv("{}{}{}".format(os.getcwd(),'/','拉钩网公司地址临时.csv'))
# Scale the confidence column by 1000 before using it as the colour value.
university.置信度 = university.置信度*1000
# Bare expression: only displayed when run as a notebook cell.
university.经度.mean(),university.纬度.mean()
px.set_mapbox_access_token(token)
# getDict stores `lat` under "经度" and `lng` under "纬度", so passing
# lat="经度" / lon="纬度" here (and the matching swap in `center`) compensates
# for that labelling.
# NOTE(review): presumably the CSV was produced by getDict and shares the
# swapped labels — confirm against the file.
fig4 = px.scatter_mapbox(university
                         , lat="经度"
                         , lon="纬度"
                         , color="置信度"
                         , size="解释性"
                         , color_continuous_scale=px.colors.cyclical.IceFire
                         , size_max=6
                         , zoom=9
                         ,center = {'lon': KMpoint['纬度'][0], 'lat': KMpoint['经度'][0]})
fig4.show()

在这里插入图片描述
二级页面抓取招聘岗位详细信息

from lxml import etree

def getPage(Num,url):
    '''Scrape one job-detail page and return its fields as a tuple.

    Args:
        Num: unused; kept for caller compatibility.
        url: address of the detail page, fetched through ``getHTML``.

    Returns:
        ``(employer info, recruiter position, detailed location, job demand,
        job advertisement, industry, job description, refresh time,
        samplingTime)``. Every scraped field is the concatenation of the
        matching text nodes, or '' when nothing matches.
    '''
    html = getHTML(url)
    advertising = '//*[@id="job_detail"]/dd[1]/p/text()'  # job advertisement
    demand = '//*[@id="job_detail"]/dd[2]/div/p/text()'  # job demand
    location = '//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()'  # detailed location
    staffPosition = '//*[@id="job_detail"]/dd[4]/div/div/span[1]/text()'  # recruiter's position
    employerInformation = '//*[@id="job_detail"]/dd[4]/div/div/a/span[1]/text()'  # company and recruiter name
    industry = '/html/body/div[6]/div/div[1]/dd/ul/li[1]/text()'  # industry
    jobDescriptions = '/html/body/div[6]/div/div[1]/dd/h3/span/text()'  # job description
    refreshTime = '/html/body/div[6]/div/div[1]/dd/p/text()'  # refresh time

    # Every XPath above is absolute, so it gives the same result from any
    # node. The original evaluated them once per child of <html>, put the
    # identical tuples in a set and took element [0]; evaluating once on
    # the root is equivalent.
    tree = etree.HTML(html)

    def text(path):
        return ''.join(tree.xpath(path))

    return (text(employerInformation)
            ,text(staffPosition).replace(' ','')
            ,text(location)
            ,text(demand)
            ,text(advertising)
            ,text(industry)
            ,text(jobDescriptions)
            ,text(refreshTime).split('\xa0')[0]
            ,samplingTime)

开始抓取每页详细信息

def getDetails():
    '''Scrape the detail page of every posting in ``dffromlagou``.

    Returns:
        A dict keyed by row position. Each value is a one-element tuple
        wrapping ``[*getPage fields, company name, detail url]``, the shape
        the original produced and ``recommended`` indexes into.
    '''
    # Local import: the visible file never imports `time`.
    import time

    urls = dffromlagou.具体链接
    names = dffromlagou.公司
    details = {}
    # Pair names and urls positionally; the original mixed positional names
    # with label-based urls[i], which only agree on a default RangeIndex.
    for i, (n, url) in enumerate(zip(names, urls)):
        details[i] = (list(getPage(i, url)) + [n] + [url],)
        # Random 0-9 s pause between requests.
        time.sleep(np.random.randint(10))
    return details
info = getDetails()

余弦相似度模块,对招聘岗位的相关度做验证

import jieba
import math
from scipy import spatial
import numpy as np

class cosine_similarity_of_text:
    '''Cosine similarity of two texts from jieba word-frequency vectors.

    Both texts are tokenised, a shared vocabulary is built from the union
    of their tokens, and each text becomes a frequency vector over it.

    Attributes:
        strings1, strings2: the raw inputs.
        strings1_cut, strings2_cut: token lists.
        strings1_cut_code, strings2_cut_code: lazy iterables of vocabulary
            indices for each token list.
        frequency_of_word1, frequency_of_word2: frequency vectors.
        fit: the similarity for ``model='numpy'`` or ``model='python'``;
            ``None`` for any other model value, including the default.
    '''

    def __init__(self,strings1=None,strings2=None,model='fitnumpy'):
        self.strings1 , self.strings2 = strings1 , strings2
        self.strings1_cut, self.strings2_cut = self.cut(strings1), self.cut(strings2)
        self.strings1_cut_code ,self.strings2_cut_code = self.cut_code(self.strings1_cut) , self.cut_code(self.strings2_cut)
        self.frequency_of_word1 = self.frequency_of_word(self.strings1_cut,self.strings1_cut_code)
        self.frequency_of_word2 = self.frequency_of_word(self.strings2_cut,self.strings2_cut_code)
        # Always define `fit`, so reading it after an unrecognised model gives
        # None instead of raising AttributeError.
        self.fit = None
        if model == 'numpy':
            self.fit = self.full_npcosine(np.array(self.frequency_of_word1),np.array(self.frequency_of_word2))
        elif model == 'python':
            self.fit = next(self.full_pycosine(self.frequency_of_word1, self.frequency_of_word2))

    def cut(self,strings):
        '''Tokenise ``strings`` in jieba full mode, dropping empty tokens.

        Falsy input (None, '', False) yields an empty list.
        '''
        if not strings:
            return []
        return [i for i in jieba.cut(strings, cut_all=True) if i != '']

    def word_set(self):
        '''Return the union of both token lists.'''
        return set(self.strings1_cut) | (set(self.strings2_cut))

    def word_dict(self):
        '''Map every vocabulary word to its position in the set's iteration order.'''
        return {word: i for i, word in enumerate(self.word_set())}

    def cut_code(self,cut):
        '''Return a lazy iterable of vocabulary indices for the tokens in ``cut``.'''
        return (self.word_dict()[word] for word in cut)

    def frequency_of_word(self,string_cut,string_cut_code):
        '''Count how often each vocabulary word occurs in ``string_cut``.

        ``string_cut_code`` is unused and kept for caller compatibility.
        Returns a list of counts aligned with ``word_dict``.
        '''
        dict_ = self.word_dict()
        counts = [0] * len(dict_)
        for word in string_cut:
            counts[dict_[word]] += 1
        return counts

    def full_pycosine(self,vector1,vector2):
        '''Yield the cosine similarity of two vectors, rounded to 2 decimals.

        Remains a one-shot generator for compatibility. A zero vector yields
        0.0; the original fell through to a bare return, so the caller's
        ``__next__()`` raised StopIteration.
        '''
        dot = 0
        sq1 = 0
        sq2 = 0
        for a, b in zip(vector1, vector2):
            dot += a * b
            sq1 += a * a
            sq2 += b * b
        norm = math.sqrt(sq1) * math.sqrt(sq2)
        yield round(float(dot) / norm, 2) if norm else 0.0

    def full_npcosine(self,vector1,vector2):
        '''Return the cosine similarity of two vectors via SciPy.

        ``spatial.distance.cosine`` is a distance (1 - similarity); it is
        converted so both models report similarity. A zero vector gives 0.0.
        '''
        v1 = np.asarray(vector1, dtype=float)
        v2 = np.asarray(vector2, dtype=float)
        if not v1.any() or not v2.any():
            return 0.0
        return 1.0 - spatial.distance.cosine(v1, v2)

匹配要求语料库

# Keyword corpus per job family; the entry for the crawl keyword `kw` is
# flattened into one comma-separated string for the similarity check.
Vectors = {
    '算法': [
        '算法', '神经网络', 'CNN', 'RNN', 'DNN', 'BP', 'Matlab',
        'python', 'Python', 'C++', '聚类', '回归', '研发',
    ],
    '数据分析师': [
        '分析', '数据分析', '业务理解', '理解业务',
        'python', 'excel', 'Excel', 'sql',
    ],
    '人力资源': [
        '招聘', '绩效', '薪酬', '劳动关系', '人力资源规划',
        '培训', 'KPI', '培训与开发', '人员配置',
    ],
    '招聘': ['招聘工作', '指标'],
}
wordVector = ','.join(Vectors[kw])
wordVector

生成新的匹配要求的表单

def recommended(wordVector,info):
    '''Score every posting against the keyword corpus and keep the matches.

    Args:
        wordVector: comma-separated keyword string for the job family.
        info: dict keyed by row position as produced by ``getDetails``;
            ``info[i][0][3]`` is read as the posting's demand text.

    Returns:
        ``dffromlagou`` joined by index with a ``COS`` column holding the
        cosine similarity of each posting, restricted to rows with
        ``COS > 0``.

    Side effects:
        Sets the pandas display options for max columns, max rows and
        column width.
    '''
    # Score per posting. The original keyed the scores by company name, so
    # every posting of a company received that company's last score.
    scores = []
    for i in range(len(info)):
        demand = info[i][0][3]
        cleaned = demand.replace('\xa0','')
        if len(demand) > 2 and cleaned:
            score = cosine_similarity_of_text(wordVector, cleaned, model='python').fit or 0
        else:
            # Too short, or only non-breaking spaces: treat as no match.
            score = 0
        scores.append(score)
    example = pd.DataFrame({'COS': scores})
    # Show all columns.
    pd.set_option('display.max_columns', None)
    # Show all rows.
    pd.set_option('display.max_rows', None)
    # Show up to 100 characters per value (pandas default is 50).
    pd.set_option('max_colwidth',100)
    Table = pd.merge(dffromlagou, example, left_index=True, right_index=True, how='left')
    return Table[Table.COS>0]
Table = recommended(wordVector,info)

企业匹配要求的柱状图显示

def P2():
    '''Render a notebook bar chart of the similarity score (COS) per company.

    Reads the module-level ``Table``, ``samplingTime`` and ``city``.
    '''
    # The per-district counts copied over from P1 were never used here and
    # have been dropped.
    c = (
        Bar()
        .add_xaxis(Table.公司.to_list())
        .add_yaxis("岗位相关度匹配", Table.COS.to_list(), category_gap=0, color="#61a0a8")
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-35))
            ,title_opts=opts.TitleOpts(title="{}{}{}".format(samplingTime,city,"岗位相关度匹配")
                                       , subtitle="None"))
    )
    return c.render_notebook()
P2()

在这里插入图片描述
展示匹配列表Table
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值