from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer: remove stop words, and keep only terms that
# appear in at least 40 documents (min_df) and in at most 30% of all
# documents (max_df) — trims both rare noise and near-ubiquitous words.
# NOTE(review): assumes `stopWord_list` (list of stop words) and `document`
# (iterable of article texts) are defined earlier in the file — confirm.
tfidf = TfidfVectorizer(stop_words=stopWord_list, min_df=40, max_df=0.3)

# Bug fix: all later code reads train_df['分类'] (category column), but the
# original rename used "类别", which would raise a KeyError on every
# subsequent lookup. Name the column "分类" so the accesses match.
train_df.columns = ["分类", "文章"]

# Fit the vectorizer on the corpus and transform it into the sparse
# TF-IDF feature matrix in one step.
X = tfidf.fit_transform(document)
print(X.shape)  # (number of documents, vocabulary size)
# 标签编码 (Label encoding):
# Show how many articles fall into each category ('分类').
# Bare expression — its result is only displayed in a notebook/REPL.
train_df['分类'].value_counts()
# 对这些类别用 label encoder (apply a label encoder to these categories)
from collections import Counter

from sklearn.preprocessing import LabelEncoder

# Encode the article category labels ('分类') as integer class ids,
# as required by scikit-learn classifiers.
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(train_df['分类'])

# Bare expressions (visible only in a notebook/REPL): the label vector's
# shape and the number of samples per encoded class.
y.shape
Counter(y)
# 逻辑回归解决多分类问题 (logistic regression for multi-class classification):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; a fixed random_state makes
# the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# multi_class='multinomial' fits a single softmax model over all classes
# jointly (instead of one-vs-rest); the lbfgs solver supports this setting.
logistic_model = LogisticRegression(
    multi_class='multinomial', solver='lbfgs', random_state=2
)
logistic_model.fit(train_X, train_y)

# Mean accuracy on the held-out split (bare expression: notebook display).
logistic_model.score(test_X, test_y)
# 保存模型 (save the model)
import pickle

# Bundle the fitted label encoder, TF-IDF vectorizer and classifier into a
# single artifact so they can be reloaded together later.
artifacts = {
    'labelEncoder' : labelEncoder,
    'tfidfVectorizer' : tfidf,
    'logistic_model' : logistic_model
}
with open('3.0tfidf.model', 'wb') as file:
    pickle.dump(artifacts, file)
# 交叉验证 (cross-validation):
import pickle

# Reload the bundled artifacts saved above. NOTE(review): pickle.load is
# only safe on trusted, self-produced files — never use it on external data.
with open('3.0tfidf.model', 'rb') as saved:
    bundle = pickle.load(saved)

tfidfVectorizer = bundle['tfidfVectorizer']
labelEncoder = bundle['labelEncoder']
logistic_model = bundle['logistic_model']

# Re-derive features and labels with the *already fitted* transformers
# (transform, not fit_transform, so the vocabulary/classes stay fixed).
X = tfidfVectorizer.transform(document)
y = labelEncoder.transform(train_df['分类'])
# 简单交叉验证 (simple hold-out validation):
# K 折交叉验证 (K-fold cross-validation):
# 交叉验证 (cross-validation):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Same multinomial model configuration as the single-split training above.
logistic_model = LogisticRegression(
    multi_class='multinomial', solver='lbfgs', random_state=2
)

# ShuffleSplit draws 5 independent random 80/20 train/test partitions;
# cross_val_score refits and scores the model on each of them.
splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)
scores = cross_val_score(logistic_model, X, y, cv=splitter)

print(scores)         # accuracy of each of the 5 splits
print(scores.mean())  # average accuracy across splits