python3
Keyword extraction with jieba word segmentation: tokenize each document, remove stop words, then count document frequencies for IDF.
The code is as follows:
import jieba
import jieba.analyse
import math
import pandas as pd
import os
# Load the stop-word list (one word per line) used to filter tokenized output.
stopwords_path = 'data/stopword.txt'
# Use a context manager so the file handle is closed deterministically;
# the original bare open() leaked the handle. Iterating the file directly
# replaces the redundant readlines() call.
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords_list = [line.strip() for line in f]
# Load the corpus: one document per row, fields separated by '\001'.
# BUG FIX: with header=None pandas labels columns 0, 1, 2, ..., so the
# original data['text'] raised KeyError. Assign names explicitly.
# NOTE(review): assumes each row has exactly 3 fields (id, title, text) —
# confirm against data/all_docs.txt.
data = pd.read_csv('data/all_docs.txt', sep='\001', header=None,
                   names=['id', 'title', 'text'])
# Tokenize with jieba; join with spaces so the next step can split back.
data['text_cut'] = data['text'].map(lambda x: ' '.join(jieba.cut(x)))
# Hoist the stop-word collection into a set once: O(1) membership per token
# instead of an O(n) list scan inside the per-row lambda.
stopword_set = set(stopwords_list)
# Replace the joined string with a list of tokens, stop words removed.
data['text_cut'] = data['text_cut'].map(
    lambda x: [w for w in x.split(' ') if w not in stopword_set])
# data['text_cut'] holds the tokenized, stopword-filtered documents.
data_count = len(data['text_cut'])  # total number of documents (for the IDF denominator)
# Document frequency: idf_dic[word] = number of documents containing `word`.
idf_dic = {}
# Iterate the Series values directly instead of range(len(...)) with
# positional [i] lookups (which also breaks on a non-default index).
for tokens in data['text_cut']:
    # set() so each word is counted at most once per document.
    for word in set(tokens):
        if len(word) > 1:  # skip single-character tokens
            idf_dic[word] = idf_dic.get(word, 0.0) + 1.0