# pip install jieba
import pandas as pd
import jieba
Data source: http://www.sogou.com/labs/resource/ca.php
df_news = pd.read_table('./data/val.txt',names=['category','theme','URL','content'],encoding='utf-8')
df_news = df_news.dropna()
df_news.head()
df_news.shape  # (5000, 4)
Tokenization: use the jieba tokenizer
content = df_news.content.values.tolist()
print(content[1000])
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip lines that are just a newline
        content_S.append(current_segment)
content_S[1000]
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()
# Stopword list
stopwords = pd.read_csv("stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
stopwords.head(20)
# Remove stopwords
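The removal step itself is not shown above. Below is a minimal sketch, assuming a helper named drop_stopwords (an illustrative name, not from the source) that filters each tokenized line against the stopword set and also collects the kept words for later word-frequency statistics.
# Sketch of the stopword-removal step; drop_stopwords, stopword_set,
# contents_clean, and all_words are illustrative names.
stopword_set = set(stopwords.stopword.values.tolist())
def drop_stopwords(contents, stopword_set):
    contents_clean = []  # tokenized lines with stopwords removed
    all_words = []       # flat list of kept words, useful for word-frequency stats
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(line_clean)
    return contents_clean, all_words
contents_clean, all_words = drop_stopwords(content_S, stopword_set)
Converting the stopword column to a set keeps the membership test at constant time per word, which matters when filtering all 5000 articles.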