# Word clouds are a common data-visualization exercise: this script scrapes
# python job postings from 51job and renders the descriptions as a word cloud.
import urllib.request
import re
from wordcloud import WordCloud,ImageColorGenerator
import matplotlib.pyplot as plt
import jieba.analyse
from bs4 import BeautifulSoup
from PIL import Image
import numpy as np
def geturllistsz(pages=1):
    """Build the 51job (Shenzhen) search-result page URLs for the 'python' query.

    Args:
        pages: number of result pages to generate URLs for.  Defaults to 1,
               matching the original hard-coded ``range(1, 2)``.

    Returns:
        list[str]: one search URL per page, numbered from 1.
    """
    listurl = []
    for num in range(1, pages + 1):
        url = ('https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,'
               + str(num) + '.html?')
        listurl.append(url)
    return listurl
def downloadurl(urllist):
    """Fetch each 51job search-result page and extract job-detail URLs.

    Args:
        urllist: iterable of search-result page URLs (see geturllistsz).

    Returns:
        list[list[str]]: for every <span> found in the result table of every
        page, the list of href values matched inside it (may be empty when a
        span holds no job link).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    # Patterns are loop-invariant: compile once, outside the loop.
    table_regix = re.compile("<!--\列表表格 start-->(.*?)<!-- getPageFormHtml end -->", re.DOTALL)
    span_regix = re.compile("<span>(.*?)</span>", re.DOTALL)
    # NOTE(review): 'οnmοusedοwn' below contains Greek omicrons on purpose —
    # it mirrors the (obfuscated) markup of the scraped page; do not "fix" it.
    href_regix = re.compile("href=\"(.*?)\" οnmοusedοwn=\"\">", re.DOTALL)
    returnlist = []
    for urlpage in urllist:
        request = urllib.request.Request(urlpage, headers=headers)
        response = urllib.request.urlopen(request)
        # 51job serves its pages in GBK encoding.
        res = response.read().decode('gbk')
        alllist = table_regix.findall(res)
        for url in span_regix.findall(alllist[0]):
            returnlist.append(href_regix.findall(url))
    return returnlist
def data(url):
    """Download one 51job job-detail page and extract the description text.

    Args:
        url: URL of a single job-detail page.

    Returns:
        list: the ``.string`` of every <p> tag inside the job-description
        div.  Entries may be None for <p> tags containing nested markup —
        callers must filter those out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    # Detail pages are GBK-encoded, like the search pages.
    res = response.read().decode('gbk')
    regix = re.compile("<div class=\"bmsg job_msg inbox\">(.*?) <div class=\"mt10\">", re.DOTALL)
    alllist = regix.findall(res)
    soup = BeautifulSoup(alllist[0], 'lxml')
    # Renamed from 'list' to avoid shadowing the builtin.
    paragraphs = []
    for ui in soup.find_all(name='p'):
        paragraphs.append(ui.string)
    return paragraphs
def printciyun():
    """Render a word cloud from shuju.txt, shaped and colored by tupian.jpg.

    Side effects: displays the figure via matplotlib and writes the result
    to wordcloud1.png.  Expects shuju.txt (UTF-8 scraped text), tupian.jpg
    (mask image) and STXIHEI.TTF (CJK-capable font) in the working directory.
    """
    # Read the scraped job descriptions; 'with' guarantees the handle is
    # closed (the original leaked the file object).
    with open('shuju.txt', encoding='utf-8') as f:
        text = f.read()
    # Chinese word segmentation.  Join with SPACES: WordCloud tokenizes on
    # whitespace, so the original ''.join reassembled the raw text and
    # silently defeated the segmentation.
    text = ' '.join(jieba.cut_for_search(text))
    # The mask image doubles as the cloud's shape and its color source.
    mask = np.array(Image.open('tupian.jpg'))
    wc = WordCloud(mask=mask, font_path='STXIHEI.TTF', width=800, height=600,
                   mode='RGBA', background_color='white').generate(text)
    # Recolor each word from the underlying pixels of the mask image.
    image_colors = ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)
    # Display, then persist.
    plt.imshow(wc, interpolation='bicubic')
    plt.axis('off')
    plt.show()
    wc.to_file('wordcloud1.png')
    return None
# Script entry point: scrape job descriptions into shuju.txt, then draw the
# word cloud.  Guarded so importing this module does not trigger the scrape.
if __name__ == '__main__':
    urllist = geturllistsz()
    url = downloadurl(urllist)
    shujudata = []
    for i in url:
        # Each entry is a (possibly empty) list of detail-page hrefs; only
        # the first href of each non-empty span is fetched.
        if i:
            shujudata.append(data(i[0]))
    # 'with' guarantees the output file is flushed and closed.
    with open('shuju.txt', 'w', encoding='utf-8') as file:
        for x in shujudata:
            for i in x:
                # data() yields None for <p> tags with nested markup.
                if i is not None:
                    file.write(i)
    printciyun()
# Questions are welcome in the comments — happy to discuss.