1.选一个自己感兴趣的主题。选取移动互联网门户网站 www.91.com 作为研究对象。
2.从网络上爬取相关数据。先分析目标网页的 HTML 结构,然后提取页面中的段落文本进行爬取。
3.进行文本分析,生成词云。将爬取到的文本写入 wadee.txt,再用结巴(jieba)分词,并统计词频。
"""Scrape paragraph text from www.91.com, save it to wadee.txt, segment it
with jieba, and print the 10 most frequent words (length >= 2)."""
import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

url = "http://www.91.com/"
res = requests.get(url)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")

# Write the scraped paragraph text out.  Opened with "w" instead of the
# original "a+": append mode meant every rerun accumulated stale text and
# inflated the word counts.  `with` guarantees the handle is closed.
with open("wadee.txt", "w", encoding="utf-8") as output:
    for p in soup.find_all("p"):
        output.write(p.get_text() + "\n")

# Read the corpus back and segment it.  NOTE: `txt` is reused by the
# word-cloud section later in the file, so the name must stay.
with open("wadee.txt", "r", encoding="utf-8") as f:
    txt = f.read()
words = jieba.lcut(txt)

# Count only words of length >= 2 — single characters are mostly
# particles/punctuation noise in Chinese text.
counts = Counter(word for word in words if len(word) > 1)

# Top 10 by frequency.  most_common(10) safely returns fewer entries when
# the corpus has fewer than 10 distinct words (the original indexed
# items[i] over a fixed range(10) and could raise IndexError).
for word, count in counts.most_common(10):
    print("{:<5}{:>2}".format(word, count))
使用词云来展示词频。此前因字体不支持中文而显示异常,改用微软雅黑等支持中文的字体即可正常显示。
# Render the scraped text as a word cloud.  Full-mode segmentation
# (cut_all=True) emits every possible word, matching the original script.
# The font file must support Chinese glyphs or the cloud shows boxes.
segments = jieba.cut(txt, cut_all=True)
joined = "/".join(segments)
cloud = WordCloud(font_path='msyh.ttc').generate(joined)
plt.imshow(cloud)
plt.axis("off")
plt.show()