自行测试,欢迎交流。使用前请先安装所需的依赖库(pandas、jieba、textblob、wordcloud、matplotlib、seaborn)。
一、下面是一次利用 Python 编程进行文本分析的测试
1.词频图生成代码
# Word-frequency bar chart: read one text column from a CSV file, segment it
# with jieba, drop stopwords and single-character tokens, and plot the most
# frequent words as a horizontal bar chart.
import re
from collections import Counter

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud

# Placeholders: replace with your CSV path and the header of the text column.
CSV_PATH = r'你的文件路径'
TEXT_COLUMN = '你需要提取的一列内容标题'
# Stopword list, one word per line. The file is one you download yourself
# (search for a comprehensive Chinese stopword list) and save next to this
# script; see the end of the article.
STOPWORDS_PATH = 'ting.txt'
# Extra stopwords merged into the loaded list. They are all single characters,
# so the length filter below would drop them anyway.
EXTRA_STOPWORDS = {'的', '了', '在', '是', '我'}
# Number of most frequent words shown in the chart.
TOP_N = 30
# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation (Chinese punctuation included).
PUNCTUATION = re.compile(r'[^\w\s]')

plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Read the data; note that pd.read_csv reads CSV files, not Excel workbooks.
data = pd.read_csv(CSV_PATH)

# Strip punctuation. Missing cells are turned into empty strings and every
# value is coerced to str first, so NaN or numeric cells cannot break the
# regex substitution.
data[TEXT_COLUMN] = (
    data[TEXT_COLUMN]
    .fillna('')
    .astype(str)
    .apply(lambda text: PUNCTUATION.sub('', text))
)

# Load the stopwords; the context manager closes the file afterwards.
with open(STOPWORDS_PATH, encoding='UTF-8') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}
stopwords |= EXTRA_STOPWORDS

# Segment every comment, keeping tokens longer than one character that are
# not stopwords.
words = []
for comment in data[TEXT_COLUMN]:
    words.extend(
        word
        for word in jieba.lcut(comment)
        if len(word) > 1 and word not in stopwords
    )

# Count word frequencies.
word_counts = Counter(words)

# Part-of-speech tagged tokens of the whole column. The chart below does not
# use them; they are kept available for further analysis.
words_classified = pseg.lcut(''.join(data[TEXT_COLUMN]))

# Pick the words to display; most_common returns (word, count) tuples.
top_words = word_counts.most_common(TOP_N)
if not top_words:
    # zip(*[]) cannot be unpacked into two names, so stop with a clear message.
    raise SystemExit('No words left after filtering; nothing to plot.')
top_labels, top_counts = zip(*top_words)

# Horizontal bar chart: counts on the x axis, words on the y axis.
sns.barplot(x=list(top_counts), y=list(top_labels))
plt.show()
2.词云图代码
# Word cloud: read one text column from a CSV file, segment it with jieba,
# drop stopwords and single-character tokens, and render the word
# frequencies as a word cloud.
import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from textblob import TextBlob
from wordcloud import WordCloud

# Placeholders: replace with your CSV path and the header of the text column.
CSV_PATH = r'读取文件名'
TEXT_COLUMN = '你需要提取的一列内容标题'
# Stopword list, one word per line. The file is one you download yourself
# (search for a comprehensive Chinese stopword list) and save next to this
# script; see the end of the article.
STOPWORDS_PATH = 'ting.txt'
# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation (Chinese punctuation included).
PUNCTUATION = re.compile(r'[^\w\s]')

# Read the data; note that pd.read_csv reads CSV files, not Excel workbooks.
data = pd.read_csv(CSV_PATH)

# Strip punctuation. Missing cells are turned into empty strings and every
# value is coerced to str first, so NaN or numeric cells cannot break the
# regex substitution.
data[TEXT_COLUMN] = (
    data[TEXT_COLUMN]
    .fillna('')
    .astype(str)
    .apply(lambda text: PUNCTUATION.sub('', text))
)

# Load the stopwords; the context manager closes the file afterwards.
with open(STOPWORDS_PATH, encoding='UTF-8') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}

# Segment every comment, keeping tokens longer than one character that are
# not stopwords.
words = []
for comment in data[TEXT_COLUMN]:
    words.extend(
        word
        for word in jieba.lcut(comment)
        if len(word) > 1 and word not in stopwords
    )

# Count word frequencies.
word_counts = Counter(words)

# Build the word cloud from the frequencies; font_path points at a font file
# (SimHei here) used to draw the words.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    width=800,
    height=400,
).generate_from_frequencies(word_counts)

# Show the word cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
3.情感分析代码
# Sentiment analysis: read one text column from a CSV file, score every
# comment with TextBlob, bucket the polarity into positive / neutral /
# negative, and plot how many comments fall into each class.
import re
from collections import Counter

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud

# Placeholders: replace with your CSV path and the header of the text column.
CSV_PATH = r'读取文件名'
TEXT_COLUMN = '你需要提取的一列内容标题'
# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation (Chinese punctuation included).
PUNCTUATION = re.compile(r'[^\w\s]')

plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Read the data; note that pd.read_csv reads CSV files, not Excel workbooks.
data = pd.read_csv(CSV_PATH)

# Strip punctuation. Missing cells are turned into empty strings and every
# value is coerced to str first, so NaN or numeric cells cannot break the
# regex substitution or the TextBlob call below.
data[TEXT_COLUMN] = (
    data[TEXT_COLUMN]
    .fillna('')
    .astype(str)
    .apply(lambda text: PUNCTUATION.sub('', text))
)

# Score every comment.
# NOTE(review): TextBlob's default sentiment analyzer appears to be built on
# an English lexicon, so Chinese text will probably score 0.0 (neutral) almost
# everywhere. Confirm against the TextBlob documentation and consider a
# Chinese-specific sentiment tool before trusting this chart.
data['情感极性'] = data[TEXT_COLUMN].apply(
    lambda text: TextBlob(text).sentiment.polarity
)

# Bucket the polarity into three classes. The bins start at -1 with
# include_lowest=True so that strongly negative scores are labelled as
# negative; a first edge of -0.02 would leave every score at or below -0.02
# unlabelled (NaN) and therefore missing from the chart.
# NOTE(review): assumes polarity lies in [-1, 1] -- confirm in TextBlob docs.
data['情感分类'] = pd.cut(
    data['情感极性'],
    bins=[-1, -0.01, 0.01, 1],
    labels=['消极', '中性', '积极'],
    include_lowest=True,
)

# Count plot of the sentiment classes, ordered positive / neutral / negative.
plt.figure(figsize=(8, 6))
sns.countplot(x='情感分类', data=data, order=['积极', '中性', '消极'])
plt.title('评论情感分类')
plt.xlabel('情感类型')
plt.ylabel('评论数量')
plt.show()
4.中文停用词
最全中文停用词:参见 CSDN 博客文章《最全中文停用词表(可直接复制)_停用词库》。
———————————————————— FXCZ————————————————————