import os
from collections import Counter
from itertools import chain

import jieba
import pandas as pd
from gensim import corpora

# Return a list of token lists: one flattened, stopword-filtered list per input file.
def get_list(root):
    # quoting=3 (csv.QUOTE_NONE): treat quote characters literally
    stopwords = pd.read_csv("C:/stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    stopwords = set(stopwords['stopword'].values)  # set for fast membership tests
    paths = []
    try:
        dirs = os.listdir(root)
        print(dirs)
        for dirx in dirs:
            paths.append(root + '/' + dirx)
            print(root + '/' + dirx)
    except OSError as e:
        print('error:', e)
    all_list = []
    for path in paths:
        df = pd.read_csv(path, encoding='gbk')
        df = df.dropna()  # drop rows with missing values
        content = df.content.values.tolist()
        sentence = []
        # segment each line into words
        for line in content:
            try:
                segs = jieba.lcut(line)
                segs = [x for x in segs if len(x) > 1]          # drop single-character tokens
                segs = [x for x in segs if x not in stopwords]  # drop stopwords
                sentence.append(segs)
            except Exception:
                print(line)
                continue
        sentence = list(chain.from_iterable(sentence))  # flatten into a 1-D token list
        all_list.append(sentence)
    return all_list
# Write one file per document and return the bag-of-words vectors.
# Output line format: word,index,frequency
def write(all_list):
    root = "C:/NLP_project/NLP_project/result/"
    corpus = set(chain.from_iterable(all_list))
    print(len(corpus))
    corpus_dict = dict(zip(corpus, range(len(corpus))))
    vector = []
    for i, sentence in enumerate(all_list):  # was iterating the global `sentences`
        line = root + str(i) + '.txt'
        counts = Counter(sentence)  # count tokens once, instead of list.count per vocabulary word
        vec = []
        with open(line, 'w', encoding='utf-8') as f:  # `with` guarantees the file is closed
            for key, idx in corpus_dict.items():
                freq = counts[key]  # 0 for words absent from this document
                f.write(key + ',' + str(idx) + ',' + str(freq) + '\n')
                vec.append((idx, freq))
        vec.sort(key=lambda x: x[0])  # sort by word index
        vector.append(vec)
    return vector
sentences = get_list("C:/NLP_project/NLP_project/train")
vec = write(sentences)
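
# The `corpora` import above is otherwise unused, which suggests a gensim
# bag-of-words may have been the eventual goal. A minimal sketch of the
# equivalent construction (an assumption, not part of the original pipeline):
# Dictionary assigns the word ids, and doc2bow yields sparse (id, frequency)
# pairs. Unlike write() above, it omits zero-frequency words.
dictionary = corpora.Dictionary(sentences)             # word -> id mapping
bow = [dictionary.doc2bow(doc) for doc in sentences]   # one sparse vector per file
print(bow[0][:10])  # first ten (id, frequency) pairs of the first document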