# [文本挖掘] 5 词频统计1:文本预处理 微博数据
# -*- coding: utf-8 -*-
# 词频统计 一:文本数据预处理
# 1 jieba分词
# 2 文本清洗:将非字母、非数字、非中文字符的字符转换成空字符
# 3 去除停用词语/修改停用词表、添加用户新词
# 4 合并同义词
# 5 词频统计
# Imports: segmentation (jieba), counting (Counter), tabular data (pandas),
# plotting (matplotlib), and regex-based text cleaning (re).
import jieba
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] # render Chinese labels correctly on macOS
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly on macOS
import re # regular-expression module
# 1 jieba segmentation-mode demo.
# The return values are deliberately discarded — these calls only illustrate
# the three cut modes. Renamed the sample variable from `str`, which shadowed
# the builtin `str` type.
sample_text = "波哥大与金迪奥"
jieba.cut(sample_text)                    # precise mode (returns a generator)
jieba.cut(sample_text, cut_all=True)      # full mode
jieba.cut_for_search(sample_text)         # search-engine mode
jieba.lcut(sample_text)                   # precise mode, returns a list
jieba.lcut(sample_text, cut_all=True)     # full mode, returns a list
jieba.lcut_for_search(sample_text)        # search-engine mode, returns a list
# Load the scraped Weibo CSV, then deduplicate on post text, keeping the
# copy of each post that has the highest comment count.
data_path = r'/Users/zitongqiu/Documents/weibo-search-master/结果文件/此沙 封神/此沙 封神.csv'
df = pd.read_csv(data_path)
# print(df.head())
print('原数据:', df.shape)
# Sorting by comment count (descending) first makes keep='first' retain the
# most-commented row within every duplicate group.
df = (
    df.sort_values(by='评论数', ascending=False)
      .drop_duplicates(subset='微博正文', keep='first')
      .reset_index(drop=True)
)
print("去重并保留评论最多微博正文:",df.shape)
# 2 Clean then tokenize: strip every character that is not an ASCII letter,
# digit, or CJK ideograph, then segment with jieba.
# NOTE: the original first tokenized the raw text and immediately overwrote
# that column with the cleaned tokenization — the dead first pass is removed.
# The pattern is compiled once and hoisted out of the per-row lambda.
_clean_pattern = re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5]+")
df['分词结果'] = df['微博正文'].apply(
    lambda x: ' '.join(jieba.cut(_clean_pattern.sub("", x)))
)
corpus = df['分词结果'].tolist()
# print(corpus)
# 3 Load the user dictionary (new words), load the stop-word list, then
# re-tokenize with stop words filtered out.
user_file = "/Users/zitongqiu/PycharmProjects/pythonProject5/douban/userwords.txt"
jieba.load_userdict(user_file)
# Use a set: every `word not in stop_words` test becomes O(1) instead of a
# linear scan of the whole stop-word list per token.
stop_words = set()
with open('/Users/zitongqiu/PycharmProjects/pythonProject5/douban/stopwordslist.txt', 'r', encoding='utf-8') as swfile:
    for line in swfile:  # iterate the file lazily; readlines() not needed
        stop_words.add(line.strip())
df['分词结果'] = df['微博正文'].apply(
    lambda x: ' '.join(
        word
        for word in jieba.cut(re.sub(r"[^a-zA-Z0-9\u4e00-\u9fa5]+", "", x))
        if word not in stop_words
    )
)
corpus = df['分词结果'].tolist()
# print(corpus)
#
# 4 Synonym merging must happen AFTER counting (see next step).
# 5 Word-frequency count: Counter consumes the token generator directly,
# replacing the manual loop + update() with one idiomatic construction.
word_counts = Counter(
    word for sentence in corpus for word in sentence.split()
)
# print(word_counts)
# Canonical word -> variant spellings that should be folded into it.
synonyms_dict = {
    '封神第一部': ['第一部', '电影封神第一部','朝歌风云','封神第一部:朝歌风云'],
    '封神': ['封神三部曲'],
    '此沙': ['演员此沙', '吉乌此沙','演员吉乌此沙','木古惹古·吉乌此沙','沙沙','此沙此沙','此沙此沙此沙'],
    '封神第二部': ['第二部'],
    '封神第三部': ['第三部'],
    '姬发':['发发']
}
# Fold each variant's count into its canonical word and remove the variant.
# Counter[...] returns 0 for missing keys, and pop(..., 0) matches the
# original `del` (Counter.__delitem__ ignores missing keys) — behavior is
# unchanged, just stated in one step per variant.
for canonical, variants in synonyms_dict.items():
    total = word_counts[canonical]
    for variant in variants:
        total += word_counts.pop(variant, 0)
    word_counts[canonical] = total
# print(word_counts)
# word_counts is already a Counter; the original's extra Counter(...) copy
# was redundant, so word_frequency is kept only as an alias.
word_frequency = word_counts
top_300 = word_frequency.most_common(300)
print(top_300)
print(type(top_300))
# 词频统计与可视化
# 1 字频
# 2 词频与可视化-微博正文词云图
# 3 词频与可视化-微博话题柱状图/条形图
# 1 Character frequency: a Counter built from a string counts each character.
str1 = "fengshen is a phenomenal movie"
str2 = "《封神》这部电影是一部极为出色的电影"
cnt1, cnt2 = Counter(str1), Counter(str2)
print(cnt1,"\n",cnt2)
# Character frequency over a poem file.
# BUG FIX: the original did `'\n'.join(elpoema.read())`, which inserts a
# newline between every pair of characters, so '\n' dominated the counts.
# Read the text as-is instead. Also avoid rebinding the builtin `str`.
filepath = r'/Users/zitongqiu/Documents/data mining/data/许立志.txt'
with open(filepath,encoding='utf-8') as elpoema:
    poem_text = elpoema.read()
cnt = Counter(poem_text)
print(cnt)
# Top-20 characters by count, most frequent first.
print(sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
# 2 Word cloud of the Weibo text: gives a visual overview of the main
# themes by emphasising the high-frequency words.
from wordcloud import WordCloud
cloud = WordCloud(width=1000, height=500, background_color='white', colormap='winter_r',
                  font_path=r'/System/Library/Fonts/STHeiti Medium.ttc')
# generate_from_frequencies takes a word -> count mapping; top_300 is a
# list of (word, count) pairs, so dict() converts it directly.
cloud.generate_from_frequencies(dict(top_300))
plt.figure(figsize=(15, 9))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Silhouette word cloud: an image array as `mask` makes the cloud fill the
# image's shape instead of a plain rectangle.
from PIL import Image  # PIL = Python Imaging Library (Pillow)
import numpy as np
mask_array = np.array(Image.open(r'/Users/zitongqiu/Documents/data mining/data/cishaprofile2.jpg'))
wordcloud = WordCloud(
    width=800,
    height=400,
    colormap='plasma_r',
    background_color='white',
    margin=1,
    mask=mask_array,
    font_path=r'/System/Library/Fonts/STHeiti Medium.ttc',
)
wordcloud.generate_from_frequencies(dict(top_300))
plt.figure(figsize=(8,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# 3 Topic frequency: extract #topic# hashtags from the raw Weibo text and
# rank the top 20. The duplicate `import re` / `from collections import
# Counter` were removed — both are already imported at the top of the file.
corpus = df['微博正文'].tolist()
# Compile once and hoist out of the loop, e.g. matches 此沙 in #此沙##封神第一部#
topic_pattern = re.compile(r"#(.*?)#")
topic_list = []
for text in corpus:
    topic_list.extend(topic_pattern.findall(text))
print(topic_list)
topic_frequency = Counter(topic_list)
top_20 = topic_frequency.most_common(20)
print(top_20)
frequency = [item[1] for item in top_20]
topics = [item[0] for item in top_20]
# Bar chart of the top-20 raw topics. Synonyms are not merged yet, so
# near-duplicate topic names still appear as separate bars.
plt.figure(figsize=(16,10))
bars = plt.bar(topics, frequency,width=0.9, color='yellowgreen')
plt.xlabel('微博话题')
plt.ylabel('热度')
plt.title('排名前20话题-未合并同义词(关键字:此沙 封神)')
plt.xticks(rotation=70)
# Write each bar's count centered just above its top edge.
for column in bars:
    height = column.get_height()
    plt.text(column.get_x() + column.get_width() / 2, height, int(height), ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Merge synonymous topic names before re-ranking.
synonyms_dict = {
    # NOTE: the original list contained '封神第一部:朝歌风云' twice; the
    # duplicate was removed (duplicates in a membership list have no effect).
    '封神第一部': ['封神第一部', '电影封神第一部','朝歌风云','封神第一部:朝歌风云'],
    '封神': ['封神三部曲'],
    '此沙': ['演员此沙', '此沙杨戬','吉乌此沙','演员吉乌此沙','木古惹古·吉乌此沙','沙沙','此沙此沙','此沙此沙此沙','我沙'],
    '封神第二部': ['第二部'],
    '封神第三部': ['第三部'],
    '姬发':['发发']
}
# Invert the mapping once (variant -> canonical) so each topic lookup is
# O(1) instead of scanning every synonym list for every topic. No variant
# appears under two canonical keys, so the result is identical to the
# original first-match scan.
variant_to_canonical = {
    variant: canonical
    for canonical, variants in synonyms_dict.items()
    for variant in variants
}
merged_topic_list = [variant_to_canonical.get(topic, topic) for topic in topic_list]
merged_topic_frequency = Counter(merged_topic_list)
merged_top_20 = merged_topic_frequency.most_common(20)
print(merged_top_20)
merged_frequency = [item[1] for item in merged_top_20]
merged_topics = [item[0] for item in merged_top_20]
# Bar chart of the top-20 topics after synonym merging.
plt.figure(figsize=(16,10))
bars = plt.bar(merged_topics, merged_frequency,width=0.9,color='mediumvioletred')
plt.xlabel('微博话题')
plt.ylabel('热度')
plt.title('合并同义词后排名前20话题(关键字:此沙 封神)')
plt.xticks(rotation=70)
# Write each bar's count centered just above its top edge.
for column in bars:
    height = column.get_height()
    plt.text(column.get_x() + column.get_width() / 2, height, int(height), ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Horizontal bar chart (barh); the lists are reversed so the hottest topic
# is drawn at the top of the chart.
plt.figure(figsize=(14, 8))
bars = plt.barh(merged_topics[::-1], merged_frequency[::-1], height=0.9, color='darkturquoise')
plt.xlabel('热度')
plt.ylabel('微博话题')
plt.title('合并同义词后排名前20话题(关键字:此沙 封神)')
plt.xticks([])  # hide x-axis ticks; the count is written next to each bar
for row in bars:
    width = row.get_width()
    plt.text(width, row.get_y() + row.get_height() / 2, int(width), ha='left', va='center')
plt.tight_layout()
plt.show()