# [文本挖掘] 5 词频统计1:文本预处理 微博数据
# -*- coding: utf-8 -*-
# 词频统计 一:文本数据预处理
# 1 jieba分词
# 2 文本清洗:将非字母、非数字、非中文字符的字符转换成空字符
# 3 去除停用词语/修改停用词表、添加用户新词
# 4 合并同义词
# 5 词频统计
# Imports: segmentation (jieba), counting (Counter), tabular data (pandas),
# plotting (matplotlib), and regex-based text cleaning (re).
import jieba
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] # render Chinese labels correctly on macOS
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly on macOS
import re # regular-expression module
# 1 jieba segmentation-mode demo.
# The return values are deliberately discarded — these calls only illustrate
# the three cut modes. Renamed the sample variable from `str`, which shadowed
# the builtin `str` type.
sample_text = "波哥大与金迪奥"
jieba.cut(sample_text)                    # precise mode (returns a generator)
jieba.cut(sample_text, cut_all=True)      # full mode
jieba.cut_for_search(sample_text)         # search-engine mode
jieba.lcut(sample_text)                   # precise mode, returns a list
jieba.lcut(sample_text, cut_all=True)     # full mode, returns a list
jieba.lcut_for_search(sample_text)        # search-engine mode, returns a list
# Load the scraped Weibo CSV, then deduplicate on post text, keeping the
# copy of each post that has the highest comment count.
data_path = r'/Users/zitongqiu/Documents/weibo-search-master/结果文件/此沙 封神/此沙 封神.csv'
df = pd.read_csv(data_path)
# print(df.head())
print('原数据:', df.shape)
# Sorting by comment count (descending) first makes keep='first' retain the
# most-commented row within every duplicate group.
df = (
    df.sort_values(by='评论数', ascending=False)
      .drop_duplicates(subset='微博正文', keep='first')
      .reset_index(drop=True)
)
print("去重并保留评论最多微博正文:",df.shape)
# 2 Clean then tokenize: strip every character that is not an ASCII letter,
# digit, or CJK ideograph, then segment with jieba.
# NOTE: the original first tokenized the raw text and immediately overwrote
# that column with the cleaned tokenization — the dead first pass is removed.
# The pattern is compiled once and hoisted out of the per-row lambda.
_clean_pattern = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5]+")
df['分词结果'] = df['微博正文'].apply(
    lambda x: ' '.join(jieba.cut(_clean_pattern.sub("", x)))
)
corpus = df['分词结果'].tolist()
# print(corpus)
# 3 Load the user dictionary (new words), load the stop-word list, then
# re-tokenize with stop words filtered out.
user_file = "/Users/zitongqiu/PycharmProjects/pythonProject5/douban/userwords.txt"
jieba.load_userdict(user_file)
# Use a set: every `word not in stop_words` test becomes O(1) instead of a
# linear scan of the whole stop-word list per token.
stop_words = set()
with open('/Users/zitongqiu/PycharmProjects/pythonProject5/douban/stopwordslist.txt', 'r', encoding='utf-8') as swfile:
    for line in swfile:  # iterate the file lazily; readlines() not needed
        stop_words.add(line.strip())
df['分词结果'] = df['微博正文'].apply(
    lambda x: ' '.join(
        word
        for word in jieba.cut(re.sub(r"[^a-zA-Z0-9\u4e00-\u9fa5]+", "", x))
        if word not in stop_words
    )
)
corpus = df['分词结果'].tolist()
# print(corpus)
#
# 4 Synonym merging must happen AFTER counting (see next step).
# 5 Word-frequency count: Counter consumes the token generator directly,
# replacing the manual loop + update() with one idiomatic construction.
word_counts = Counter(
    word for sentence in corpus for word in sentence.split()
)
# print(word_counts)
# Canonical word -> variant spellings that should be folded into it.
synonyms_dict = {
    '封神第一部': ['第一部', '电影封神第一部','朝歌风云','封神第一部:朝歌风云'],
    '封神': ['封神三部曲'],
    '此沙': ['演员此沙', '吉乌此沙','演员吉乌此沙','木古惹古·吉乌此沙','沙沙','此沙此沙','此沙此沙此沙'],
    '封神第二部': ['第二部'],
    '封神第三部': ['第三部'],
    '姬发':['发发']
}
# Fold each variant's count into its canonical word and remove the variant.
# Counter[...] returns 0 for missing keys, and pop(..., 0) matches the
# original `del` (Counter.__delitem__ ignores missing keys) — behavior is
# unchanged, just stated in one step per variant.
for canonical, variants in synonyms_dict.items():
    total = word_counts[canonical]
    for variant in variants:
        total += word_counts.pop(variant, 0)
    word_counts[canonical] = total
# print(word_counts)
# word_counts is already a Counter; the original's extra Counter(...) copy
# was redundant, so word_frequency is kept only as an alias.
word_frequency = word_counts
top_300 = word_frequency.most_common(300)
print(top_300)
print(type(top_300))
# 词频统计与可视化
# 1 字频
# 2 词频与可视化-微博正文词云图
# 3 词频与可视化-微博话题柱状图/条形图
# 1 Character frequency: a Counter built from a string counts each character.
str1 = "fengshen is a phenomenal movie"
str2 = "《封神》这部电影是一部极为出色的电影"
cnt1, cnt2 = Counter(str1), Counter(str2)
print(cnt1,"\n",cnt2)
# Character frequency over a poem file.
# BUG FIX: the original did `'\n'.join(elpoema.read())`, which inserts a
# newline between every pair of characters, so '\n' dominated the counts.
# Read the text as-is instead. Also avoid rebinding the builtin `str`.
filepath = r'/Users/zitongqiu/Documents/data mining/data/许立志.txt'
with open(filepath,encoding='utf-8') as elpoema:
    poem_text = elpoema.read()
cnt = Counter(poem_text)
print(cnt)
# Top-20 characters by count, most frequent first.
print(sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
# 2 Word cloud of the Weibo text: gives a visual overview of the main
# themes by emphasising the high-frequency words.
from wordcloud import WordCloud
cloud = WordCloud(width=1000, height=500, background_color='white', colormap='winter_r',
                  font_path=r'/System/Library/Fonts/STHeiti Medium.ttc')
# generate_from_frequencies takes a word -> count mapping; top_300 is a
# list of (word, count) pairs, so dict() converts it directly.
cloud.generate_from_frequencies(dict(top_300))
plt.figure(figsize=(15, 9))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Silhouette word cloud: an image array as `mask` makes the cloud fill the
# image's shape instead of a plain rectangle.
from PIL import Image  # PIL = Python Imaging Library (Pillow)
import numpy as np
mask_array = np.array(Image.open(r'/Users/zitongqiu/Documents/data mining/data/cishaprofile2.jpg'))
wordcloud = WordCloud(
    width=800,
    height=400,
    colormap='plasma_r',
    background_color='white',
    margin=1,
    mask=mask_array,
    font_path=r'/System/Library/Fonts/STHeiti Medium.ttc',
)
wordcloud.generate_from_frequencies(dict(top_300))
plt.figure(figsize=(8,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# 3 Topic frequency: extract #topic# hashtags from the raw Weibo text and
# rank the top 20. The duplicate `import re` / `from collections import
# Counter` were removed — both are already imported at the top of the file.
corpus = df['微博正文'].tolist()
# Compile once and hoist out of the loop, e.g. matches 此沙 in #此沙##封神第一部#
topic_pattern = re.compile(r"#(.*?)#")
topic_list = []
for text in corpus:
    topic_list.extend(topic_pattern.findall(text))
print(topic_list)
topic_frequency = Counter(topic_list)
top_20 = topic_frequency.most_common(20)
print(top_20)
frequency = [item[1] for item in top_20]
topics = [item[0] for item in top_20]
# Bar chart of the top-20 raw topics. Synonyms are not merged yet, so
# near-duplicate topic names still appear as separate bars.
plt.figure(figsize=(16,10))
bars = plt.bar(topics, frequency,width=0.9, color='yellowgreen')
plt.xlabel('微博话题')
plt.ylabel('热度')
plt.title('排名前20话题-未合并同义词(关键字:此沙 封神)')
plt.xticks(rotation=70)
# Write each bar's count centered just above its top edge.
for column in bars:
    height = column.get_height()
    plt.text(column.get_x() + column.get_width() / 2, height, int(height), ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Merge synonymous topic names before re-ranking.
synonyms_dict = {
    # NOTE: the original list contained '封神第一部:朝歌风云' twice; the
    # duplicate was removed (duplicates in a membership list have no effect).
    '封神第一部': ['封神第一部', '电影封神第一部','朝歌风云','封神第一部:朝歌风云'],
    '封神': ['封神三部曲'],
    '此沙': ['演员此沙', '此沙杨戬','吉乌此沙','演员吉乌此沙','木古惹古·吉乌此沙','沙沙','此沙此沙','此沙此沙此沙','我沙'],
    '封神第二部': ['第二部'],
    '封神第三部': ['第三部'],
    '姬发':['发发']
}
# Invert the mapping once (variant -> canonical) so each topic lookup is
# O(1) instead of scanning every synonym list for every topic. No variant
# appears under two canonical keys, so the result is identical to the
# original first-match scan.
variant_to_canonical = {
    variant: canonical
    for canonical, variants in synonyms_dict.items()
    for variant in variants
}
merged_topic_list = [variant_to_canonical.get(topic, topic) for topic in topic_list]
merged_topic_frequency = Counter(merged_topic_list)
merged_top_20 = merged_topic_frequency.most_common(20)
print(merged_top_20)
merged_frequency = [item[1] for item in merged_top_20]
merged_topics = [item[0] for item in merged_top_20]
# Bar chart of the top-20 topics after synonym merging.
plt.figure(figsize=(16,10))
bars = plt.bar(merged_topics, merged_frequency,width=0.9,color='mediumvioletred')
plt.xlabel('微博话题')
plt.ylabel('热度')
plt.title('合并同义词后排名前20话题(关键字:此沙 封神)')
plt.xticks(rotation=70)
# Write each bar's count centered just above its top edge.
for column in bars:
    height = column.get_height()
    plt.text(column.get_x() + column.get_width() / 2, height, int(height), ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Horizontal bar chart (barh); the lists are reversed so the hottest topic
# is drawn at the top of the chart.
plt.figure(figsize=(14, 8))
bars = plt.barh(merged_topics[::-1], merged_frequency[::-1], height=0.9, color='darkturquoise')
plt.xlabel('热度')
plt.ylabel('微博话题')
plt.title('合并同义词后排名前20话题(关键字:此沙 封神)')
plt.xticks([])  # hide x-axis ticks; the count is written next to each bar
for row in bars:
    width = row.get_width()
    plt.text(width, row.get_y() + row.get_height() / 2, int(width), ha='left', va='center')
plt.tight_layout()
plt.show()