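# Scraper for Zhaopin campus recruiting (xiaoyuan.zhaopin.com). The current search
# keyword is "数据分析" (data analysis); to search for something else, change the
# URL prefix accordingly. Example URLs:
#https://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E5%B8%82%E5%9C%BA_1_0 marketing roles (市场岗)
#https://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90_1_0 data-analysis roles (数据分析岗)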
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse
import re
# Fetch one page of search results.
def get_content(page, keyword='数据分析'):
    """Download one search-result page from xiaoyuan.zhaopin.com.

    Args:
        page: Page number; converted with ``str`` and inserted into the URL.
        keyword: Search term, percent-encoded into the URL path. The default
            yields the same URL as the previously hard-coded
            ``%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90`` segment.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        urllib.error.URLError: If the request fails (raised by ``urlopen``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url = (
        'https://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_'
        + urllib.parse.quote(keyword, safe='')
        + '_' + str(page) + '_0'
    )
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Extract the job fields from a result page.
def get(html):
    """Extract job postings from the HTML of a search-result page.

    Args:
        html: Page source as text.

    Returns:
        A tuple ``(items, items_length)``. ``items`` is a list of 5-tuples of
        captured strings in pattern order: company name, the text of the
        ``oTips oTips4 fl`` element, job category, job title and job-duty
        description. ``items_length`` is ``len(items)``.
    """
    # The pattern is split across adjacent raw literals purely for
    # readability; concatenated they form one pattern. re.S lets ``.*?``
    # cross line breaks, since each posting spans several lines of markup.
    reg = re.compile(
        r'<p class="searchResultCompanyname"><span>(.*?)</span></p>'
        r'.*?class="oTips oTips4 fl">(.*?)</span>'
        r'.*?span class="searchResultKeyval">.*?<span>职位类别:<em>(.*?)</em></span>'
        r'.*?class="fl __ga__fullResultcampuspostname_clicksfullresultcampuspostnames_001">(.*?)</a>'
        r'.*?class="searchResultJobdescription">.*?职责描述:<span>(.*?)</span>',
        re.S,
    )
    items = reg.findall(html)
    return items, len(items)
# Crawl the result pages and accumulate every extracted posting.
items_all = []
items_length_all = 0
# 30 is the number of pages requested; page numbers 0..29 are passed on.
# NOTE(review): the sample URLs in the file header use page 1 -- confirm that
# page 0 is a valid page and not a duplicate of page 1.
for i in range(0, 30):
    items, items_length = get(get_content(i))
    items_all += items
    items_length_all = items_length + items_length_all
    print(i)  # progress indicator: page just processed
import pandas as pd
# Build the de-duplicated result table. The column labels are passed to the
# constructor (instead of assigning .columns afterwards) so that an empty
# scrape yields an empty five-column frame rather than a length-mismatch
# ValueError.
items_all = pd.DataFrame(
    items_all,
    columns=["企业", "类型", "职位类型", "职位", "职责"],
).drop_duplicates()
# Export (disabled)
#items_all.to_excel("D:/Marketing.xls")
#词频统计
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
import numpy as np
from PIL import Image
# Join every job-duty description into one corpus. The cell values themselves
# are joined (not str() of the whole column) so that pandas' truncated Series
# repr -- index labels, "..." and the Name/dtype footer -- never reaches the
# tokenizer.
test_file = "\n".join(items_all["职责"].astype(str))
# jieba.cut is lazy (it returns a generator); nothing in the code shown here
# consumes str_quan2 or wordlist.
str_quan2 = jieba.cut(test_file, cut_all=True)
#print("$".join(str_quan2))
import jieba.analyse as anl
# Keyword extraction: the top 1000 keywords together with their weights.
seg = anl.extract_tags(test_file, topK=1000, withWeight=True)
# Labels go to the constructor so an empty keyword list still produces a
# frame with both columns.
dflist = pd.DataFrame(seg, columns=["word", "freq"])
wordlist = jieba.cut(test_file, cut_all=True)  # segmentation
# For each keyword, count the postings whose description contains it.
# regex=False matches the keyword literally, so keywords containing regex
# metacharacters (e.g. "c++") neither raise nor match the wrong text.
descriptions = items_all["职责"]
dflist["con"] = [
    int(descriptions.str.contains(word, regex=False).sum())
    for word in dflist["word"]
]
# Keep only keywords that occur in more than one posting.
dflist = dflist[dflist["con"] > 1]
# Word cloud
# Space-separated keyword string handed to the word cloud.
space_list = " ".join(dflist["word"].tolist())
# Image loaded as an array and used as the mask.
backgroud = np.array(Image.open("C:/Users/houshunqi/Desktop/326313207.jpg"))
cloud_options = dict(
    background_color="white",
    width=800,
    height=600,
    mask=backgroud,
    stopwords=STOPWORDS,
    max_font_size=100,
    random_state=30,  # fixed seed
    scale=1,
)
mywordcloud = WordCloud(**cloud_options).generate(space_list)
# Colour generator built from the mask image; it is created here but not
# applied to the cloud in the code shown.
image_color = ImageColorGenerator(backgroud)
plt.imshow(mywordcloud)
plt.axis("off")
plt.show()