Scraping Page Data and Drawing a Word Cloud

Word clouds are a small piece of the AI toolbox, and knowing how to scrape data and then visualize it as a word cloud is a useful skill. This post scrapes Python job listings from 51job and renders the job descriptions as a word cloud.

import urllib.request
import re
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import jieba.analyse
from bs4 import BeautifulSoup
from PIL import Image
import numpy as np


def geturllistsz(): # build the list of 51job search-result pages
    listurl = []
    for num in range(1, 2):  # only page 1 here; widen the range to crawl more pages
        num = str(num)
        url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,' + num + '.html?'
        listurl.append(url)
    return listurl
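
For more than the first page of results, a parameterized variant might look like this (the pages argument and the helper name are my additions, not part of the original script):

def geturllistsz_pages(pages=5):  # hypothetical variant with a page-count parameter
    base = 'https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,{}.html?'
    return [base.format(num) for num in range(1, pages + 1)]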





def downloadurl(urllist): # collect the job-detail links from each search-result page
    returnlist = []
    for urlpage in urllist:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        request = urllib.request.Request(urlpage, headers=headers)
        response = urllib.request.urlopen(request)
        res = response.read().decode('gbk')  # 51job pages are GBK-encoded

        # grab the block between the list-table markers
        restr = "<!--列表表格 start-->(.*?)<!-- getPageFormHtml end -->"
        regix = re.compile(restr, re.DOTALL)
        alllist = re.findall(regix, res)

        # each job row sits inside a <span>
        restr = "<span>(.*?)</span>"
        regix = re.compile(restr, re.DOTALL)
        pageurl = re.findall(regix, alllist[0])

        for url in pageurl:
            restr = "href=\"(.*?)\"  onmousedown=\"\">"
            regix = re.compile(restr, re.DOTALL)
            trueurl = re.findall(regix, url)
            returnlist.append(trueurl)  # each entry is a (possibly empty) list of links
    return returnlist
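
Regex over raw HTML is fragile; as an alternative sketch, the same links could be collected with BeautifulSoup (the span > a[href] selector is my assumption based on the markup matched above, so verify it against the live page):

def downloadurl_bs(urllist):  # hypothetical BeautifulSoup variant of downloadurl
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    links = []
    for urlpage in urllist:
        request = urllib.request.Request(urlpage, headers=headers)
        html = urllib.request.urlopen(request).read().decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        for a in soup.select('span > a[href]'):  # assumed structure: job links inside <span>
            links.append(a['href'])
    return links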


def data(url): # fetch the job-description text from a detail page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)

    response = urllib.request.urlopen(request)
    res = response.read().decode('gbk')

    # the job description sits inside <div class="bmsg job_msg inbox">
    restr = "<div class=\"bmsg job_msg inbox\">(.*?) <div class=\"mt10\">"
    regix = re.compile(restr, re.DOTALL)
    alllist = re.findall(regix, res)

    soup = BeautifulSoup(alllist[0], 'lxml')
    paragraphs = []
    for ui in soup.find_all(name='p'):
        paragraphs.append(ui.string)  # .string is None for <p> tags with nested markup
    return paragraphs
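
A quick usage sketch (the URL below is a hypothetical placeholder; in practice pass a link returned by downloadurl):

# hypothetical detail-page URL, shown commented out for illustration only
# print(data('https://jobs.51job.com/shenzhen/12345678.html'))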


def printciyun():#draw the word cloud
    # open the scraped text file
    text = open('shuju.txt', encoding='utf-8').read()


    # Chinese word segmentation; join with spaces so WordCloud can split the tokens
    text = ' '.join(jieba.cut_for_search(text))


    # # optionally, extract keywords with weights instead
    # freq = jieba.analyse.extract_tags(text, topK=200, withWeight=True)
    # print(freq[:20])
    # freq = {i[0]: i[1] for i in freq}

    # generate the word cloud object, shaped by the mask image
    mask = np.array(Image.open('tupian.jpg'))
    wc = WordCloud(mask=mask, font_path='STXIHEI.TTF', width=800, height=600, mode='RGBA', background_color='white').generate(text)

    # take colors from the mask image
    image_colors = ImageColorGenerator(mask)
    wc.recolor(color_func=image_colors)

    # display the word cloud
    plt.imshow(wc, interpolation='bicubic')
    plt.axis('off')
    plt.show()

    # save to file
    wc.to_file('wordcloud1.png')
    return None
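
The commented-out keyword extraction above can also drive the cloud directly; here is a minimal sketch (the helper name is mine) using WordCloud.generate_from_frequencies on the raw, unsegmented text, since extract_tags tokenizes internally:

def printciyun_freq():  # hypothetical variant driven by keyword weights
    text = open('shuju.txt', encoding='utf-8').read()
    # extract_tags returns (word, weight) pairs; dict() turns them into a frequency map
    freq = dict(jieba.analyse.extract_tags(text, topK=200, withWeight=True))
    wc = WordCloud(font_path='STXIHEI.TTF', width=800, height=600, background_color='white')
    wc.generate_from_frequencies(freq)
    wc.to_file('wordcloud_freq.png')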




urllist = geturllistsz()
url = downloadurl(urllist)
shujudata = []
file = open('shuju.txt', 'w', encoding='utf-8')
for i in url:
    if len(i) > 0:  # skip spans that held no job link
        shujudata.append(data(i[0]))
# print(shujudata)
for x in shujudata:
    for i in x:
        if i is not None:  # .string is None for <p> tags with nested markup
            file.write(i)
file.close()
printciyun()
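
If you widen the page range in geturllistsz, it is polite to pause between requests; a simple throttle for the detail-page loop might look like this (the one-second interval is an arbitrary choice of mine):

import time

for i in url:
    if len(i) > 0:
        shujudata.append(data(i[0]))
        time.sleep(1)  # wait between detail-page requests to avoid hammering the server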


If you have any questions, feel free to leave a comment and we can discuss.
