#!/usr/bin/env python
# coding=utf-8
import jieba
import pandas as pd
class WordCut:
    def __init__(self, sentence):
        # Keep the raw sentences (e.g. a pandas Series of text)
        self.sentence = sentence

    def ConcatSentences(self, sentence):
        # Join all sentences into one comma-separated string
        return ','.join(sentence.values)
    # Build the stop-word list
    def stopwordslist(self):
        with open('D:/jupyter/dic_stop.txt', encoding='UTF-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords
    # Segment a Chinese sentence into words
    def seg_depart(self, sentence):
        # Segment each line of the document
        print("(%s): segmenting" % "**")
        sentence_depart = jieba.cut(sentence.strip())
        # Build the stop-word list
        stopwords = self.stopwordslist()
        # Collect the result in outstr
        outstr = ''
        # Drop stop words
        for word in sentence_depart:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return outstr
    def wordtank(self, word_list):
        # Split the space-separated words in word_list into separate columns
        word_list_df = word_list.str.split(' ', expand=True)
        # Stack the columns into rows, one word per row
        word_list_df_0 = word_list_df.stack()
        # Build a DataFrame of all the segmented words
        corpus = pd.DataFrame({'word': word_list_df_0})
        corpus['cnt'] = 1
        # Group by word and count occurrences, most frequent first
        g = corpus.groupby(['word']).agg({'cnt': 'count'}).sort_values('cnt', ascending=False)
        g = g.reset_index()
        return g
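

# Usage sketch (an assumption, not part of the original script): segment a small
# pandas Series of sentences and count word frequencies. The sample sentences are
# hypothetical, and the stop-word file at D:/jupyter/dic_stop.txt must exist for
# seg_depart to run.
if __name__ == '__main__':
    sentences = pd.Series(['我爱自然语言处理', '结巴分词很好用'])
    wc = WordCut(sentences)
    # Segment every sentence, dropping stop words
    segmented = sentences.apply(wc.seg_depart)
    # Count how often each word appears across all sentences
    freq = wc.wordtank(segmented)
    print(freq.head())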