Boss直聘数据爬取,词云图绘制

'''
BOSS爬虫,boss.py
'''

from pymongo import *
import requests
from lxml import etree

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
# Fill in your own cookies before running.
cookies = {
    'Cookie':''
}

# Raw HTML of every fetched page is dumped here.
DUMP_PATH = './boss.txt'
# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 10


def _first(nodes, default=''):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _parse_jobs(html):
    """Extract one record dict per job listing found in *html*.

    Listings without a job link are skipped because no URL can be built for
    them; any other missing field is stored as an empty string instead of
    raising IndexError.
    """
    if not html.strip():
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []

    records = []
    for li in tree.xpath('//div[@class="job-list"]/ul/li'):
        href = _first(li.xpath('.//span[@class="job-name"]/a/@href'), None)
        if href is None:
            continue
        records.append({
            "薪资": _first(li.xpath('.//span[@class="red"]/text()')),
            "职位名": _first(li.xpath('.//span[@class="job-name"]/a/text()')),
            "融资状态": _first(li.xpath('.//div[@class="company-text"]/p/text()')),
            "链接地址": 'https://www.zhipin.com' + href,
            "公司名": _first(li.xpath('.//h3[@class="name"]/a/text()')),
        })
    return records


# Records collected during this run. Deliberately NOT named `list`, so the
# builtin stays usable by any code that runs after this script.
job_records = []

# Open the dump once in 'w' mode so each run starts from a clean file; the
# previous append mode kept pages from earlier runs around.
with open(DUMP_PATH, 'w', encoding='utf8') as fp:
    for page in range(0, 11):
        response = requests.get(
            'https://www.zhipin.com/c101010100/?page='+str(page)+'&ka=page-10',
            headers=headers,
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        )
        response.encoding = 'utf8'
        fp.write(response.text)
        # Parse page by page rather than re-reading the whole dump as one
        # concatenated multi-document string.
        job_records.extend(_parse_jobs(response.text))


def insert(records=None):
    """Store job records in the ``message2`` collection of the ``BOSS`` database.

    Args:
        records: List of record dicts to insert. Defaults to the records
            scraped by this script (``job_records``).

    Errors are printed rather than raised, and the MongoDB client is always
    closed before returning.
    """
    if records is None:
        records = job_records
    if not records:
        # insert_many() rejects an empty list, so report it explicitly.
        print('No job records to insert.')
        return

    client = None
    try:
        client = MongoClient(host='localhost',port=27017)
        client.BOSS.message2.insert_many(records)
    except Exception as e:
        # Top-level boundary of the script: report the failure and carry on.
        print(e)
    finally:
        if client is not None:
            client.close()


if __name__ == '__main__':
    insert()
'''
mongodb数据库中提取数据,绘制词云图 worldcloud.py
'''
import pymongo
import wordcloud
import jieba
import imageio

# Read every stored job record from BOSS.message2 and build the text corpus.
client = pymongo.MongoClient(host='127.0.0.1',port=27017)
try:
    collection = client.BOSS.message2
    # Segment each company name with jieba and re-join the tokens with spaces
    # so that each token is a separate word in the corpus. The cursor is
    # iterated directly, and records without a company name are skipped
    # instead of raising KeyError.
    segmented_names = [
        ' '.join(jieba.lcut(doc['公司名']))
        for doc in collection.find()
        if doc.get('公司名')
    ]
finally:
    # Release the connection even if reading or segmenting fails.
    client.close()

corpus = ' '.join(segmented_names)

if corpus.strip():
    # Load the image used as the word-cloud mask.
    # NOTE(review): newer imageio releases deprecate the top-level imread in
    # favour of imageio.v2.imread -- confirm against the installed version.
    img = imageio.imread('wx2.png')

    # Build the word cloud.
    wc = wordcloud.WordCloud(
        # Per the wordcloud docs, width/height are ignored when a mask is
        # supplied; the mask's shape is used instead.
        width=100,
        height=80,
        background_color='black',
        # Font file used for rendering; presumably chosen because it contains
        # Chinese glyphs -- it must be resolvable on the machine running this.
        font_path='msyh.ttc',
        # Scaling between computation and drawing (not a font size).
        scale=1,
        # Image that defines the shape of the cloud.
        mask=img,
    )
    # Render the cloud from the corpus and save it.
    wc.generate(corpus)
    wc.to_file('out.png')
else:
    # WordCloud.generate() raises on an empty corpus; report it clearly.
    print('No company names found in BOSS.message2; word cloud not generated.')
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值