中文文本八分类 —— 多项式朴素贝叶斯 (MultinomialNB)
数据集形状为 (30000, 2)：每行一条文本及其类别标签
训练代码：
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib
# Load the labelled corpus (text, label) and drop rows with missing values.
df_news = (
    pd.read_csv('./data/data.csv', names=['data', 'label'], encoding='utf-8')
    .dropna()
)
# Raw text column as a plain Python list for segmentation below.
content = df_news['data'].tolist()
# Segment every document with jieba; keep the token list per document.
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # BUG FIX: jieba.lcut returns a *list*, so the original comparison
    # `current_segment != '\r\n'` (list vs str) was always True and the
    # newline filter never fired. Compare against the one-token list
    # ['\r\n'] to actually skip lines that are only a line break.
    if len(current_segment) >= 1 and current_segment != ['\r\n']:
        content_S.append(current_segment)
# One row per document; each cell is the list of segmented tokens.
df_content = pd.DataFrame({'data_S': content_S})
stopwords=pd.read_csv("stopwords.txt",index_col=False,sep="\n",quoting=3,names=['stopword'], encoding='utf-8')
def drop_stopwords(contents,stopwords):
contents_clean = []
all_word