爬取网易新闻并生成词云

import requests
from requests.exceptions import RequestException
import json
import jieba
import re
from os import path
from wordcloud import WordCloud
import numpy as np

# HTTP request headers sent with every fetch: a Referer pointing at the NetEase
# world-news section plus a desktop Chrome User-Agent — presumably to look like
# a normal browser and avoid basic anti-scraping checks (TODO: confirm the site
# actually requires these).
headers = {
'Referer': 'http://news.163.com/world/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def get_json(url):
	"""Fetch *url* and return the response body decoded as GBK, or None.

	Returns None on any network error, timeout, or non-200 status so the
	caller can simply skip a failed feed. The NetEase JSONP feeds are
	GBK-encoded, hence the explicit decode instead of response.text.
	"""
	try:
		# timeout prevents the scraper from hanging forever on a dead host
		response = requests.get(url, headers=headers, timeout=10)
	except RequestException:
		return None
	if response.status_code == 200:
		return response.content.decode('gbk')
	return None

def get_info(json_data, wordcloud_list):
	"""Parse a NetEase 'data_callback(...)' JSONP payload and collect words.

	For each article object, appends one list to *wordcloud_list* consisting
	of the article's keyword names followed by the individual characters of
	its title (the characters are re-joined downstream before jieba
	segmentation, so the final text is unchanged).

	json_data: raw JSONP text as returned by get_json.
	wordcloud_list: output accumulator, mutated in place.
	"""
	# Strip the JSONP wrapper; raw string avoids the invalid-escape warning
	# that '\(' triggers in a plain string literal.
	matches = re.findall(r'data_callback\((.*?)\)', json_data, re.S)
	if not matches:
		# No payload found (e.g. the server returned an error page).
		return
	content = json.loads(matches[0])
	for obj in content:
		title = obj['title']
		# .get(): some articles omit the 'keywords' key entirely.
		keywords = obj.get('keywords')
		keyword_list = []
		try:
			for keyword in keywords:
				keyword_list.append(keyword['keyname'])
		except (TypeError, KeyError):
			# keywords may be null/missing or shaped differently; in that
			# case just fall back to the title characters alone.
			pass
		wordcloud_list.append(keyword_list + list(title))

def create(imgFile, text, outFile='D:/特朗普123.jpg'):
	"""Render *text* as a word cloud shaped like *imgFile* and save it.

	imgFile: path to the template image; its size sets the canvas size and
	         its pure-white pixels are forced white in the output, so the
	         cloud only shows where the template has color.
	text: space-separated words to draw.
	outFile: destination path (new keyword arg; default keeps the original
	         hard-coded path for backward compatibility).
	"""
	# Bug fix: Image was used but never imported anywhere in the file,
	# which raised NameError at runtime. Imported locally because PIL is
	# only needed by this function.
	from PIL import Image
	im = Image.open(imgFile)
	w, h = im.size
	# Build the word cloud at the template's exact dimensions; font_path
	# must point at a Chinese-capable font or CJK glyphs render as boxes.
	wc = WordCloud(
		font_path=r'C:\windows\fonts\simfang.ttf', width=w, height=h,
		background_color='white', font_step=3, min_font_size=4,
		random_state=False, prefer_horizontal=0.9, max_font_size=150,
	)
	cloud = wc.generate(text).to_image()
	# Mask: copy the template's white background over the rendered cloud so
	# words only remain inside the template's colored region.
	for x in range(w):
		for y in range(h):
			if im.getpixel((x, y))[:3] == (255, 255, 255):
				cloud.putpixel((x, y), (255, 255, 255))
	cloud.save(outFile)

if __name__ == '__main__':
	# Crawl two pages of the NetEase world-news JSONP feed, extract
	# keywords/titles, count word frequencies and render a word cloud.
	wordcloud_list = list()
	urls = (
		'http://temp.163.com/special/00804KVA/cm_guoji.js?callback=data_callback',
		'http://temp.163.com/special/00804KVA/cm_guoji_02.js?callback=data_callback',
	)
	for url in urls:
		json_data = get_json(url)
		# Bug fix: get_json returns None on failure; passing None into
		# get_info used to crash in re.findall.
		if json_data:
			get_info(json_data, wordcloud_list)

	# Flatten the 2D list and join into one string so jieba can segment it.
	flat = [token for row in wordcloud_list for token in row]
	content = ''.join(flat)

	# Strip digits and (half/full-width) punctuation before segmentation.
	content = re.sub('[0-9]|[!@#$%^&*()_+=";:\',.;‘,?《》<>。?“ ”:~·\"!"]', '', content)
	print(content)

	# Count word frequencies, skipping common stopwords (a set gives O(1)
	# membership tests versus the original tuple scan).
	stopwords = {'被', '是', '将', '在', '美', '中', '人', '警察', '的', '宣布',
	             '与', '回应', '前', '年', '会', '女子', '称', '为'}
	counts = dict()
	for word in jieba.lcut(content):
		if word not in stopwords:
			counts[word] = counts.get(word, 0) + 1

	# Keep the 100 most frequent words; slicing (instead of range(100))
	# avoids the IndexError the original raised when fewer than 100 distinct
	# words were collected.
	items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
	text = ' '.join(word for word, _ in items[:100])
	create('D:/timg.jpg', text)
	

要注意的点是

  • 进行jieba分词之前要把里面的特殊字符全部去掉
  • 拿去生成词云的一定是用空格分开每个词的字符串
  • 要实现二维列表的平铺可以使用一个列表推导式
  • **wordcloud_list_data = [j for i in wordcloud_list for j in i]**解释一下这个列表推导式,就是取出二维列表的每行,在从每行里面取出每个元素作为新列表里面的元素

效果图:
(原文此处为词云效果截图,导出时图片丢失,仅剩占位文字)

  • 3
    点赞
  • 18
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值