1. 先下载并安装nltk包,准备一张简单的图片存入代码所在文件目录,搜集英文停用词表
import nltk
# Opens the interactive NLTK downloader so the user can fetch the corpora
# used later (e.g. 'punkt' for word_tokenize/sent_tokenize, 'stopwords').
# NOTE(review): called with no arguments, this launches a GUI/interactive
# prompt rather than downloading a specific package — confirm this is intended.
nltk.download()
2. 绘制词云图
import re
import numpy as np
import pandas as pd
#import matplotlib
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud
from sklearn.datasets import fetch_20newsgroups
#from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter, defaultdict
def word_cut(contents, cut=','):
    """Split each document in *contents* into lowercase, non-empty tokens.

    Parameters
    ----------
    contents : iterable of str
        The raw documents to tokenize.
    cut : str, optional
        Regular-expression pattern used as the token delimiter
        (default: ``','``).

    Returns
    -------
    list[list[str]]
        One list of lowercased tokens per input document; empty strings
        produced by adjacent delimiters are dropped.
    """
    # Compile the delimiter pattern once instead of re-parsing it for
    # every document inside the loop.
    splitter = re.compile(cut)
    return [
        [token for token in splitter.split(content.lower()) if token]
        for content in contents
    ]
def word_count(contents):
#words_count = Counter(sum(contents,[])) #慢
word_count_dict = defau