import pandas as pd
import jieba
#pip install jieba
数据源:
http://www.sogou.com/labs/resource/ca.php
# Load the Sogou news corpus: tab-separated columns
# (category, theme, URL, content), then drop rows with missing fields.
df_news = pd.read_csv(
    './data/val.txt',
    sep='\t',
    names=['category', 'theme', 'URL', 'content'],
    encoding='utf-8',
)
df_news = df_news.dropna()
df_news.head()   # notebook-style preview; no effect when run as a script
df_news.shape
分词:使用结巴(jieba)分词器
# 结巴(jieba)分词器要求使用list格式
# jieba works on plain Python lists, so pull the content column out of the frame.
content = df_news['content'].tolist()
print(content[1000])  # spot-check one raw document
进行分词:
# Tokenize every document with jieba; content_S holds one token list per document.
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # BUG FIX: the original tested `current_segment != '\r\n'`, which compares a
    # *list* to a *string* and is therefore always True (a vacuous check). The
    # intent — per the original "换行符" (newline) comment — was to skip
    # newline-only lines, so we test the raw line instead.
    if len(current_segment) > 1 and line != '\r\n':
        content_S.append(current_segment)
content_S[1000]  # notebook-style preview of one tokenized document
# Wrap the tokenized documents back into a DataFrame for downstream cleaning.
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()  # notebook-style preview; no effect when run as a script
清洗
# Load the stopword list: one word per line, read as a single-column frame.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal text.
stopwords = pd.read_csv(
    'stopwords.txt',
    index_col=False,
    sep='\t',
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)
stopwords.head(20)  # notebook-style preview; no effect when run as a script
def drop_stopwords(contents,stopwords):
contents_clean = []
all_words = []
for line in contents:
line_clean = []
for word in line:
if word in stopwords:
continue
line_clean.append(word)
all_words.append(str