Python: Reading Files and Counting Chinese Word Frequencies

1. Reading a CSV file and counting Chinese word frequencies

Case 1:

Using collections.Counter for the counting keeps the code concise; the code is below.

Data: movie_comments.csv, a file of 230,000 movie reviews.
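
As a quick illustration of what Counter does (a toy example of mine, not taken from the review data):

from collections import Counter

# Count occurrences in a tiny token list and show the top 2
tokens = ['电影', '好', '电影', '剧情', '好', '电影']
counts = Counter(tokens)
print(counts.most_common(2))  # [('电影', 3), ('好', 2)]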

# -*- coding:utf-8 -*-
import jieba
import re
import pandas as pd
from collections import Counter

# Load the reviews and pull the comment column into a list
content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))

# Keep only word characters, dropping punctuation and whitespace
def token(string):
    return re.findall(r'\w+', string)

articles_clean = [''.join(token(str(s))) for s in articles]
print(articles_clean[100])

# Tokenize one cleaned review with jieba
def cut_word(string):
    return list(jieba.cut(string))

articles_words = [cut_word(string) for string in articles_clean]

# Flatten the per-review token lists into a single list
list_set = []
for i in articles_words:
    list_set.extend(i)

# Count all tokens and print the 50 most common
words_count = Counter(list_set)
statics = words_count.most_common(50)
print(statics)

Result:

[('的', 328262), ('了', 102420), ('是', 73106), ('我', 50338), ('都', 36255), ('很', 34712), ('看', 34022), ('电影', 33675), ('也', 32065), ('和', 31290), ('在', 31245), ('不', 28435), ('有', 27939), ('就', 25685), ('人', 23909), ('好', 22858), ('啊', 20803), ('这', 17484), ('还', 17449), ('一个', 17343), ('你', 17282), ('还是', 16425), ('但', 15578), ('故事', 15010), ('没有', 14343), ('就是', 14007), ('喜欢', 13566), ('让', 13304), ('太', 12676), ('又', 11566), ('剧情', 11359), ('没', 10858), ('说', 10764), ('吧', 10747), ('他', 10675), ('不错', 10416), ('得', 10349), ('到', 10341), ('给', 10300), ('这个', 10058), ('上', 10054), ('被', 9939), ('对', 9824), ('最后', 9694), ('一部', 9693), ('片子', 9590), ('什么', 9571), ('能', 9532), ('与', 9168), ('多', 8977)]
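
The top of the list is dominated by function words ('的', '了', '是', ...). To keep content words only, filter against a stopword list before counting, as the Excel section below does. A minimal sketch, assuming a stopwords.txt with one stopword per line:

from collections import Counter

with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}  # a set makes membership tests fast

tokens = ['的', '电影', '了', '剧情']  # stands in for list_set from the script above
filtered = [w for w in tokens if w not in stopwords]
print(Counter(filtered).most_common(50))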

Case 2:

With a large corpus, or on a machine with modest specs, it is better to preprocess the text and save it locally first, then count word frequencies in a second pass. The code is below.

First, preprocess and save to a local file:

# -*- coding:utf-8 -*-
import re
import pandas as pd

content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))

# Keep only word characters, dropping punctuation and whitespace
def token(string):
    return re.findall(r'\w+', string)

articles_clean = [''.join(token(str(s))) for s in articles]
# print(articles_clean[600])

# Write one cleaned review per line
with open('writedic.txt', 'w', encoding='utf-8') as f:
    for line in articles_clean:
        f.write(line + '\n')

Then tokenize and count word frequencies:

import jieba
from collections import Counter

# Collect tokens line by line, printing progress every 10,000 lines
Token = []
with open('writedic.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)
        Token += jieba.cut(line)

print(Token[:10])
words_count = Counter(Token)
statics = words_count.most_common(50)
print(statics)

The result is the same as above.
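
If even the flat Token list is too large to hold in memory, the Counter can be updated incrementally so the full token list is never materialized. A sketch of that variant:

import jieba
from collections import Counter

words_count = Counter()
with open('writedic.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)
        # update() consumes the generator returned by jieba.cut directly
        words_count.update(jieba.cut(line))

print(words_count.most_common(50))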

2. Reading an Excel file and counting Chinese word frequencies, in three steps:

1. Read the file

2. Tokenize: load a custom dictionary, remove digits, remove stopwords (see the note on the dictionary files right after this list)

3. Count word frequencies and sort
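
The code below assumes two auxiliary plain-text files whose contents are not shown in the original: jiebauserdict.txt in jieba's user-dictionary format (one entry per line: the word, then an optional frequency and POS tag) and stopwords.txt with one stopword per line. A minimal sketch that creates such files, with made-up entries:

# Hypothetical contents, for illustration only
with open('jiebauserdict.txt', 'w', encoding='utf-8') as f:
    f.write('自定义词\n机器学习 10 n\n')  # word [freq] [POS tag], one entry per line
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('的\n了\n是\n')  # one stopword per line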

The full code is below:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import pandas as pd
import re
import jieba

"""
Step 1: read the file
"""
f1 = open("write.txt", "w", encoding='utf-8')
# skiprows=[0] skips the first row; usecols=[0] reads only the first column
reviews = pd.read_excel('1.xlsx', usecols=[0], skiprows=[0], sheet_name='Sheet1')
# Print the first three rows
# for i in range(3):
#     print("Review #", i + 1)
#     print(reviews.answer[i])
#     print()

"""
Step 2: tokenize, load the custom dictionary, remove digits and stopwords
"""
jieba.load_userdict("jiebauserdict.txt")
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]

def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # remove digits
    text = re.sub(r'\n', '', text)    # remove newlines
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    # print(lineswords)
    return lineswords

# Preprocess the content (the column here is named 'answer')
clean_content = []
for summary in reviews.answer:
    clean_content.append(clean_text(summary))
print("Content cleaning complete.")

# Print the first three cleaned rows
# for i in range(3):
#     print("Clean Review #", i + 1)
#     print(clean_content[i])
#     print()

"""
Step 3: count how often each word occurs and sort
"""
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1   # first occurrence: add to the dict
            else:
                count_dict[word] += 1
    # Sort by count, descending, and write each (word, count) pair to file
    result = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)
    print(result)
    for key in result:
        f1.write(str(key) + '\n')
        f1.flush()

# Count every word's frequency, e.g. {'hello': 7, 'good': 3}
word_counts = {}
count_words(word_counts, clean_content)
print("Vocabulary size:", len(word_counts))
f1.close()

Result: (shown as a screenshot in the original post)
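
As a side note, the manual dictionary in count_words can be replaced by Counter, as in the CSV section. A sketch, reusing clean_content from the script above:

from collections import Counter

word_counts = Counter()
for sentence in clean_content:  # clean_content comes from the script above
    word_counts.update(sentence.split())

with open("write.txt", "w", encoding='utf-8') as out:
    for word, count in word_counts.most_common():  # already sorted by count
        out.write('{} {}\n'.format(word, count))
print("Vocabulary size:", len(word_counts))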

3. Reading a txt file and counting Chinese word frequencies

Three steps: read the file, clean and tokenize the text, count word frequencies.

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import jieba
from collections import Counter

stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]

# Clean and tokenize one line of text
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # remove digits
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    return lineswords


# Count word frequencies
def counter_word(data):
    lines = ''
    for line in data:
        lines += line + ' '  # keep a separator so words from adjacent lines don't merge
    data_list = lines.split()
    words_count = Counter(data_list)
    # print(words_count)
    count_res = words_count.most_common(50)
    return count_res


# Read the txt file
def read_content():
    data = []
    contents = [line.strip() for line in open("langchao.txt", 'r', encoding='utf-8-sig')]
    for content in contents:
        text = clean_text(content)
        data.append(text)
    result = counter_word(data)
    return result


if __name__ == '__main__':
    result = read_content()
    print(result)

4. Filtering on the counted frequencies: if a word's count exceeds a set threshold, output the word and save it to a txt file.

The input file, writedic.txt, holds one line of space-separated words per document (shown as a screenshot in the original post):

#encoding: utf-8
from collections import Counter

f = open("userdic.txt", 'w', encoding='utf-8')
result = []
# Each line of writedic.txt is one document of space-separated words
for line in open("writedic.txt", "r", encoding='utf-8'):
    result.append(list(line.strip().split(" ")))

# Helper to look up all keys with a given value (defined but not used below)
def get_keys(d, value):
    return [k for k, v in d.items() if v == value]

# print(result)
# For each document, write every word whose count exceeds the threshold (here: 1)
for words in result:
    words_count = Counter(words)
    for word in words_count:
        if words_count[word] > 1:
            print(word)
            f.write(word + ' ')
    f.write('\n')
f.close()

The output file, userdic.txt (shown as a screenshot in the original post).
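
The threshold above is hard-coded to 1. A sketch of the same logic with the threshold as a parameter, using a comprehension over the Counter (my restructuring, not the original code):

from collections import Counter

def frequent_words(words, threshold=1):
    """Return the words that occur more than `threshold` times."""
    counts = Counter(words)
    return [w for w, c in counts.items() if c > threshold]

# Usage: keep words that appear at least twice in one document
print(frequent_words(['a', 'b', 'a', 'c', 'b', 'a'], threshold=1))  # ['a', 'b']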
