1. Chinese word-frequency statistics from a CSV file
Case 1:
Using collections.Counter for the frequency count is quite concise. The code is as follows:
Data: movie_comments.csv, a file of 230,000 movie reviews.
# -*- coding:utf-8 -*-
import jieba
import re
import pandas as pd
from collections import Counter
content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))
def token(string):
    # \w+ keeps word characters (including Chinese) and drops punctuation
    return re.findall(r'\w+', string)
articles_clean = [''.join(token(str(s))) for s in articles]
print(articles_clean[100])
def cut_word(string):
    return list(jieba.cut(string))
articles_words = [cut_word(string) for string in articles_clean]
list_set = []
for i in articles_words:
    list_set.extend(i)
words_count = Counter(list_set)
statics = words_count.most_common(50)
print(statics)
Result:
[('的', 328262), ('了', 102420), ('是', 73106), ('我', 50338), ('都', 36255), ('很', 34712), ('看', 34022), ('电影', 33675), ('也', 32065), ('和', 31290), ('在', 31245), ('不', 28435), ('有', 27939), ('就', 25685), ('人', 23909), ('好', 22858), ('啊', 20803), ('这', 17484), ('还', 17449), ('一个', 17343), ('你', 17282), ('还是', 16425), ('但', 15578), ('故事', 15010), ('没有', 14343), ('就是', 14007), ('喜欢', 13566), ('让', 13304), ('太', 12676), ('又', 11566), ('剧情', 11359), ('没', 10858), ('说', 10764), ('吧', 10747), ('他', 10675), ('不错', 10416), ('得', 10349), ('到', 10341), ('给', 10300), ('这个', 10058), ('上', 10054), ('被', 9939), ('对', 9824), ('最后', 9694), ('一部', 9693), ('片子', 9590), ('什么', 9571), ('能', 9532), ('与', 9168), ('多', 8977)]
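Note that in Python 3, \w in a regular expression matches Unicode word characters, so token() keeps Chinese characters and digits while stripping punctuation. A quick illustration:

import re
print(re.findall(r'\w+', '这部电影真好看!!!太棒了123'))
# ['这部电影真好看', '太棒了123']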
Case 2:
For a large corpus, or on a machine with limited memory, it is better to preprocess the text and save it locally first, and only then count word frequencies. The code is as follows:
First, preprocess and save to a local file:
# -*- coding:utf-8 -*-
import re
import pandas as pd
content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))
def token(string):
    return re.findall(r'\w+', string)
articles_clean = [''.join(token(str(s))) for s in articles]
# print(articles_clean[600])
with open('writedic.txt', 'w', encoding='utf-8') as f:
    for line in articles_clean:
        f.write(line + '\n')
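As a quick sanity check (a minimal sketch, run in the same session), the number of lines written should match the number of cleaned reviews:

with open('writedic.txt', encoding='utf-8') as f:
    n_lines = sum(1 for _ in f)
print(n_lines == len(articles_clean))  # expect True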
Then tokenize and count word frequencies:
import jieba
from collections import Counter
Token = []
for i, lines in enumerate(open('writedic.txt', encoding='utf-8')):
    if i % 10000 == 0:
        print(i)  # progress indicator every 10,000 lines
    Token += jieba.cut(lines)
print(Token[:10])
words_count = Counter(Token)
statics = words_count.most_common(50)
print(statics)
The result is the same as above.
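If even the flat Token list is too large for memory, the Counter can be updated line by line so the tokens are never stored; a minimal sketch of this variant, reading the same writedic.txt:

import jieba
from collections import Counter
words_count = Counter()
with open('writedic.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)  # progress indicator
        words_count.update(jieba.cut(line.strip()))  # count tokens without keeping them
print(words_count.most_common(50))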
2. Chinese word-frequency statistics from an Excel file, in three steps:
1. Read the file
2. Tokenize: load a user dictionary, strip digits, remove stopwords
3. Count word frequencies and sort
The code is as follows:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pandas as pd
import re
import jieba
"""
第一步:读取文件
"""
f1=open("write.txt", "w", encoding='utf-8')
reviews=pd.read_excel('1.xlsx',usecols=[0],skiprow=[0],sheetname='Sheet1')#skiprow=[0]去除第一行.usecols的第一列是B列
#打印前三行
# for i in range(3):
# print("Review #", i + 1)
# print(reviews.answer[i])
# print()
# ## 数据预处理
"""
第二步:分词,加载有用词典,去数字,去停用词
"""
# 清除不想要的单词(频率比较低的),停用词
jieba.load_userdict("jiebauserdict.txt")
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # strip digits
    text = re.sub(r'\n', '', text)    # strip newlines
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    # print(lineswords)
    return lineswords
# Preprocess the content (clean each answer)
clean_content = []
for summary in reviews.answer:
    clean_content.append(clean_text(summary))
print("Content cleaning is complete.")
# print the first three cleaned rows
# for i in range(3):
#     print("Clean Review #", i + 1)
#     print(clean_content[i])
#     print()
"""
第三步:统计文本内容中每个单词出现的频率并排序
"""
# 统计每个句子中每个单词出现的频率
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1   # first occurrence: add to the dict
            else:
                count_dict[word] += 1
    # print(count_dict)
    result = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)  # sort by frequency
    print(result)
    for key in result:
        f1.write(str(key) + '\n')  # each entry is a (word, count) tuple
        # alternatively: f1.write(key[0] + ' ' + str(key[1]) + '\n')
    f1.flush()
# Count the frequency of every word, e.g. {'hello': 7, 'good': 3}
word_counts = {}
count_words(word_counts, clean_content)
print("Vocabulary size:", len(word_counts))
Result: the sorted (word, count) pairs are printed and written to write.txt, followed by the vocabulary size.
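For comparison, the counting-and-writing step above can be written more compactly with collections.Counter; a minimal sketch, assuming clean_content as produced above:

from collections import Counter
word_counts = Counter(word for sentence in clean_content for word in sentence.split())
with open("write.txt", "w", encoding='utf-8') as f1:
    for word, count in word_counts.most_common():
        f1.write(word + ' ' + str(count) + '\n')
print("Vocabulary size:", len(word_counts))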
3. Chinese word-frequency statistics from a txt file:
Three steps: read the file, clean and tokenize the text, count word frequencies.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import jieba
from collections import Counter
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
# Text cleaning
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # strip digits
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    return lineswords
# Count word frequencies
def counter_word(data):
    # join with a space so words at line boundaries do not merge
    lines = ' '.join(data)
    data_list = lines.split(' ')
    words_count = Counter(data_list)
    # print(words_count)
    count_res = words_count.most_common(50)
    return count_res
# Read the txt file
def read_content():
    data = []
    contents = [line.strip() for line in open("langchao.txt", 'r', encoding='utf-8-sig')]
    for content in contents:
        text = clean_text(content)
        data.append(text)
    result = counter_word(data)
    return result
if __name__ == '__main__':
    result = read_content()
    print(result)
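The original comments mention part-of-speech filtering alongside stopword removal, although the code above only removes stopwords. A minimal sketch of adding POS filtering with jieba.posseg, assuming (purely as an illustration) that only nouns and verbs are kept:

import re
import jieba.posseg as pseg
def clean_text_pos(text):
    # keep a word only if it passes the stopword list and its POS tag starts with 'n' or 'v'
    words = pseg.cut(re.sub(r'\d+', ' ', text))
    kept = [w.word for w in words
            if w.word not in stopword and w.flag[:1] in ('n', 'v')]
    return ' '.join(kept)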
4. Filter by word frequency: if a word's count exceeds a set threshold, return the word and save it to a txt file:
Input: writedic.txt, one line of space-separated words per document.
# encoding: utf-8
from collections import Counter
f = open("userdic.txt", 'w', encoding='utf-8')
result = []
for line in open("writedic.txt", "r", encoding='utf-8'):
    result.append(line.strip().split(" "))
def get_keys(d, value):
    # return every key in d whose value equals value (helper; unused below)
    return [k for k, v in d.items() if v == value]
# print(result)
for words in result:
    words_count = Counter(words)
    for word in words_count:
        if words_count[word] > 1:  # threshold: keep words appearing more than once
            print(word)
            f.write(word + ' ')
    f.write('\n')
f.close()
Output: userdic.txt, one line per document containing the words above the threshold.
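The threshold above is hard-coded as 1; a minimal parameterized variant (THRESHOLD is a name introduced here for illustration):

from collections import Counter
THRESHOLD = 1  # keep words whose count exceeds this value
with open("writedic.txt", encoding='utf-8') as fin, \
     open("userdic.txt", 'w', encoding='utf-8') as fout:
    for line in fin:
        words_count = Counter(line.strip().split(" "))
        frequent = [w for w, c in words_count.items() if c > THRESHOLD]
        fout.write(' '.join(frequent) + '\n')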