1. Chinese word-frequency statistics from a CSV file
Case 1:
Using collections.Counter for the frequency count is quite concise. The code is as follows:
Data: movie_comments.csv, a file of 230,000 movie reviews.
# -*- coding:utf-8 -*-
import jieba
import re
import pandas as pd
from collections import Counter
content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))
def token(string):
    # \w+ keeps word characters (including Chinese) and drops punctuation
    return re.findall(r'\w+', string)
articles_clean = [''.join(token(str(s))) for s in articles]
print(articles_clean[100])
def cut_word(string):
    return list(jieba.cut(string))
articles_words = [cut_word(string) for string in articles_clean]
list_set = []
for i in articles_words:
    list_set.extend(i)
words_count = Counter(list_set)
statics = words_count.most_common(50)
print(statics)
Result:
[('的', 328262), ('了', 102420), ('是', 73106), ('我', 50338), ('都', 36255), ('很', 34712), ('看', 34022), ('电影', 33675), ('也', 32065), ('和', 31290), ('在', 31245), ('不', 28435), ('有', 27939), ('就', 25685), ('人', 23909), ('好', 22858), ('啊', 20803), ('这', 17484), ('还', 17449), ('一个', 17343), ('你', 17282), ('还是', 16425), ('但', 15578), ('故事', 15010), ('没有', 14343), ('就是', 14007), ('喜欢', 13566), ('让', 13304), ('太', 12676), ('又', 11566), ('剧情', 11359), ('没', 10858), ('说', 10764), ('吧', 10747), ('他', 10675), ('不错', 10416), ('得', 10349), ('到', 10341), ('给', 10300), ('这个', 10058), ('上', 10054), ('被', 9939), ('对', 9824), ('最后', 9694), ('一部', 9693), ('片子', 9590), ('什么', 9571), ('能', 9532), ('与', 9168), ('多', 8977)]
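Note that in Python 3, \w in a regular expression matches Unicode word characters, so token() keeps Chinese characters and digits while stripping punctuation. A quick illustration:

import re
print(re.findall(r'\w+', '这部电影真好看!!!太棒了123'))
# ['这部电影真好看', '太棒了123']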
Case 2:
For a large corpus, or on a machine with limited memory, it is better to preprocess the text and save it locally first, and only then count word frequencies. The code is as follows:
First, preprocess and save to a local file:
# -*- coding:utf-8 -*-
import re
import pandas as pd
content = pd.read_csv(r"movie_comments.csv")
# print(content.head())
articles = content['comment'].tolist()
print(len(articles))
def token(string):
    return re.findall(r'\w+', string)
articles_clean = [''.join(token(str(s))) for s in articles]
# print(articles_clean[600])
with open('writedic.txt', 'w', encoding='utf-8') as f:
    for line in articles_clean:
        f.write(line + '\n')
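As a quick sanity check (a minimal sketch, run in the same session), the number of lines written should match the number of cleaned reviews:

with open('writedic.txt', encoding='utf-8') as f:
    n_lines = sum(1 for _ in f)
print(n_lines == len(articles_clean))  # expect True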
Then tokenize and count word frequencies:
import jieba
from collections import Counter
Token = []
for i, lines in enumerate(open('writedic.txt', encoding='utf-8')):
    if i % 10000 == 0:
        print(i)  # progress indicator every 10,000 lines
    Token += jieba.cut(lines)
print(Token[:10])
words_count = Counter(Token)
statics = words_count.most_common(50)
print(statics)
The result is the same as above.
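If even the flat Token list is too large for memory, the Counter can be updated line by line so the tokens are never stored; a minimal sketch of this variant, reading the same writedic.txt:

import jieba
from collections import Counter
words_count = Counter()
with open('writedic.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)  # progress indicator
        words_count.update(jieba.cut(line.strip()))  # count tokens without keeping them
print(words_count.most_common(50))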
2. Chinese word-frequency statistics from an Excel file, in three steps:
1. Read the file
2. Tokenize: load a user dictionary, strip digits, remove stopwords
3. Count word frequencies and sort
The code is as follows:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pandas as pd
import re
import jieba
"""
第一步:读取文件
"""
f1=open("write.txt", "w", encoding='utf-8')
reviews=pd.read_excel('1.xlsx',usecols=[0],skiprow=[0],sheetname='Sheet1')#skiprow=[0]去除第一行.usecols的第一列是B列
#打印前三行
# for i in range(3):
# print("Review #", i + 1)
# print(reviews.answer[i])
# print()
# ## 数据预处理
"""
第二步:分词,加载有用词典,去数字,去停用词
"""
# 清除不想要的单词(频率比较低的),停用词
jieba.load_userdict("jiebauserdict.txt")
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # strip digits
    text = re.sub(r'\n', '', text)    # strip newlines
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    # print(lineswords)
    return lineswords
# Preprocess the content (clean each answer)
clean_content = []
for summary in reviews.answer:
    clean_content.append(clean_text(summary))
print("Content cleaning is complete.")
# print the first three cleaned rows
# for i in range(3):
#     print("Clean Review #", i + 1)
#     print(clean_content[i])
#     print()
"""
第三步:统计文本内容中每个单词出现的频率并排序
"""
# 统计每个句子中每个单词出现的频率
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1   # first occurrence: add to the dict
            else:
                count_dict[word] += 1
    # print(count_dict)
    result = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)  # sort by frequency
    print(result)
    for key in result:
        f1.write(str(key) + '\n')  # each entry is a (word, count) tuple
        # alternatively: f1.write(key[0] + ' ' + str(key[1]) + '\n')
    f1.flush()
# Count the frequency of every word, e.g. {'hello': 7, 'good': 3}
word_counts = {}
count_words(word_counts, clean_content)
print("Vocabulary size:", len(word_counts))
Result: the sorted (word, count) pairs are printed and written to write.txt, followed by the vocabulary size.
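For comparison, the counting-and-writing step above can be written more compactly with collections.Counter; a minimal sketch, assuming clean_content as produced above:

from collections import Counter
word_counts = Counter(word for sentence in clean_content for word in sentence.split())
with open("write.txt", "w", encoding='utf-8') as f1:
    for word, count in word_counts.most_common():
        f1.write(word + ' ' + str(count) + '\n')
print("Vocabulary size:", len(word_counts))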
3. Chinese word-frequency statistics from a txt file:
Three steps: read the file, clean and tokenize the text, count word frequencies.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import jieba
from collections import Counter
stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
# Text cleaning
def clean_text(text):
    newtext = []
    text = re.sub(r'\d+', ' ', text)  # strip digits
    text = jieba.lcut(text)           # tokenize
    for word in text:
        if word not in stopword:      # remove stopwords
            newtext.append(word)
    lineswords = ' '.join(newtext)
    return lineswords
# Count word frequencies
def counter_word(data):
    # join with a space so words at line boundaries do not merge
    lines = ' '.join(data)
    data_list = lines.split(' ')
    words_count = Counter(data_list)
    # print(words_count)
    count_res = words_count.most_common(50)
    return count_res
# Read the txt file
def read_content():
    data = []
    contents = [line.strip() for line in open("langchao.txt", 'r', encoding='utf-8-sig')]
    for content in contents:
        text = clean_text(content)
        data.append(text)
    result = counter_word(data)
    return result
if __name__ == '__main__':
    result = read_content()
    print(result)
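The original comments mention part-of-speech filtering alongside stopword removal, although the code above only removes stopwords. A minimal sketch of adding POS filtering with jieba.posseg, assuming (purely as an illustration) that only nouns and verbs are kept:

import re
import jieba.posseg as pseg
def clean_text_pos(text):
    # keep a word only if it passes the stopword list and its POS tag starts with 'n' or 'v'
    words = pseg.cut(re.sub(r'\d+', ' ', text))
    kept = [w.word for w in words
            if w.word not in stopword and w.flag[:1] in ('n', 'v')]
    return ' '.join(kept)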
4. Filter by word frequency: if a word's count exceeds a set threshold, return the word and save it to a txt file:
Input: writedic.txt, one line of space-separated words per document.
# encoding: utf-8
from collections import Counter
f = open("userdic.txt", 'w', encoding='utf-8')
result = []
for line in open("writedic.txt", "r", encoding='utf-8'):
    result.append(line.strip().split(" "))
def get_keys(d, value):
    # return every key in d whose value equals value (helper; unused below)
    return [k for k, v in d.items() if v == value]
# print(result)
for words in result:
    words_count = Counter(words)
    for word in words_count:
        if words_count[word] > 1:  # threshold: keep words appearing more than once
            print(word)
            f.write(word + ' ')
    f.write('\n')
f.close()
Output: userdic.txt, one line per document containing the words above the threshold.
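The threshold above is hard-coded as 1; a minimal parameterized variant (THRESHOLD is a name introduced here for illustration):

from collections import Counter
THRESHOLD = 1  # keep words whose count exceeds this value
with open("writedic.txt", encoding='utf-8') as fin, \
     open("userdic.txt", 'w', encoding='utf-8') as fout:
    for line in fin:
        words_count = Counter(line.strip().split(" "))
        frequent = [w for w, c in words_count.items() if c > THRESHOLD]
        fout.write(' '.join(frequent) + '\n')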