# Chinese text analysis with jieba:
#   1. segment an article and tag each token with its part of speech;
#   2. summarise the tokens by part of speech with pandas;
#   3. extract the top weighted keywords from the same article.

# Import libraries
import jieba.analyse  # keyword extraction
import jieba.posseg as pseg  # word segmentation with part-of-speech tagging
import pandas as pd

# Read the whole article into one string. The same text feeds both the
# part-of-speech section and the keyword-extraction section below.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the encoding of article1.txt and pass encoding=... if needed.
with open('article1.txt') as fn:
    string_data = fn.read()

# ---------------------------------------------------------------------------
# Segmentation + part-of-speech tagging
# ---------------------------------------------------------------------------
words = pseg.cut(string_data)  # lazily yields items exposing .word and .flag
# Unpack each item explicitly so the frame does not depend on how pandas
# iterates jieba's pair objects.
words_pd = pd.DataFrame(
    [(item.word, item.flag) for item in words],
    columns=['word', 'type'],
)
print(words_pd.head(4))  # show the first 4 rows
# Sample output:
#         word type
# 0      Adobe  eng
# 1               x
# 2  Analytics  eng
# 3          和    c

# Summary by part of speech -- grouped on two columns (type, word):
# number of occurrences of every word within each part of speech.
words_gb = words_pd.groupby(['type', 'word'])['word'].count()
print(words_gb.head(4))
# Sample output:
# type  word
# a     不同      14
#       不足       2
#       不通       1
#       严谨       2
# Name: word, dtype: int64

# Summary by part of speech -- grouped on one column (type):
# number of tokens per part of speech, most frequent first.
words_gb2 = words_pd.groupby('type').count()
words_gb2 = words_gb2.sort_values(by='word', ascending=False)
print(words_gb2.head(4))
# Sample output:
#       word
# type
# x      994
# n      981
# v      834
# eng    295

# Keep only tokens whose part of speech is 'n' or 'eng'.
words_pd_index = words_pd['type'].isin(['n', 'eng'])  # boolean row mask
words_pd_select = words_pd[words_pd_index]
print(words_pd_select.head(4))
# Sample output:
#         word type
# 0      Adobe  eng
# 2  Analytics  eng
# 4   Webtrekk  eng
# 9         领域    n

# ---------------------------------------------------------------------------
# Keyword extraction
# ---------------------------------------------------------------------------
# Top 5 keywords restricted to the listed parts of speech. With
# withWeight=True and withFlag=True each result is (pair, weight), where the
# pair carries .word and .flag.
tags_pairs = jieba.analyse.extract_tags(
    string_data,
    topK=5,
    withWeight=True,
    allowPOS=['ns', 'n', 'vn', 'v', 'nr'],
    withFlag=True,
)
# Flatten every (pair, weight) result into a (word, flag, weight) row.
tags_list = [(i[0].word, i[0].flag, i[1]) for i in tags_pairs]
tags_pd = pd.DataFrame(tags_list, columns=['word', 'flag', 'weight'])
print(tags_pd)
# Sample output:
#   word flag    weight
# 0   数据    n  0.313395
# 1   报表    n  0.163367
# 2   功能    n  0.150263
# 3   分析   vn  0.134857
# 4   用户    n  0.126633