KNN:
The results were not ideal; dropping the row_id column can help, since it is only an identifier and carries no predictive information.
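A minimal sketch of dropping an identifier column before fitting KNN; the file name 'train.csv' and the label column 'target' are hypothetical placeholders, not from the original notes:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hypothetical input: 'train.csv' with an identifier column 'row_id' and a label column 'target'
data = pd.read_csv('train.csv')
x = data.drop(columns=['row_id', 'target'])   # row_id is only an index, so drop it from the features
y = data['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
print('accuracy:', knn.score(x_test, y_test))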
Naive Bayes:
Naive Bayes prediction (using sklearn's MultinomialNB):
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def naive_bayes():
    # Get the data
    fet = fetch_20newsgroups(subset='all')
    # print(fet)
    # Basic data preparation
    ## Split the data
    x_train, x_test, y_train, y_test = train_test_split(fet.data, fet.target, test_size=0.2)
    ## Feature extraction (TF-IDF)
    tf = TfidfVectorizer()
    x_train = tf.fit_transform(x_train)
    x_test = tf.transform(x_test)
    # Note: get_feature_names() was removed in newer sklearn; use get_feature_names_out() there
    print('Extracted features:\n', tf.get_feature_names())
    # Machine learning
    estimator = MultinomialNB(alpha=1.0)
    # Note: densifying the full TF-IDF matrix is very memory-heavy on 20newsgroups
    print('Training data:\n', x_train.toarray())
    estimator.fit(x_train, y_train)
    y_pre = estimator.predict(x_test)
    score = estimator.score(x_test, y_test)
    print('Predictions:\n', y_pre)
    print('Accuracy:\n', score)

if __name__ == '__main__':
    # knn()
    naive_bayes()
Precision and recall:
Recall: of all samples that are actually positive, the fraction that are predicted positive; i.e. do not let a true positive slip through (the model casts a wide net).
Precision: of all samples predicted positive, the fraction that are actually positive.
F1-score: combines precision and recall (their harmonic mean), F1 = 2 * precision * recall / (precision + recall); it is high only when both are high.
API: sklearn.metrics.classification_report reports per-class precision, recall, and F1.
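A minimal sketch of this API, assuming it runs inside the naive_bayes() function above where y_test, y_pre, and fet are already defined:

from sklearn.metrics import classification_report

# Per-class precision, recall, and F1 for the naive Bayes predictions above
report = classification_report(y_test, y_pre, target_names=fet.target_names)
print(report)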
Cross-validation and grid search:
Cross-validation procedure: split the training data into n equal folds; each fold in turn serves as the validation set while the model trains on the remaining n-1 folds; the n validation scores are averaged to evaluate one hyperparameter setting (see the sketch below).
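A minimal sketch of cross-validation on its own (without the grid search), using sklearn's cross_val_score on the iris data; the 5-fold choice and n_neighbors=5 here are illustrative assumptions:

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

data = load_iris()
knn = KNeighborsClassifier(n_neighbors=5)
# cv=5: split into 5 folds, train on 4 and validate on the remaining 1, five times
scores = cross_val_score(knn, data.data, data.target, cv=5)
print('per-fold scores:', scores)
print('mean score:', scores.mean())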
Code (grid search with 10-fold cross-validation):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=22)
# Standardize the features
st = StandardScaler()
x_train = st.fit_transform(x_train)
x_test = st.transform(x_test)
# Grid search over n_neighbors with 10-fold cross-validation
knn = KNeighborsClassifier()
param = {'n_neighbors': [3, 5, 7, 9]}
estimator = GridSearchCV(knn, param_grid=param, cv=10)
estimator.fit(x_train, y_train)
y_pre = estimator.predict(x_test)
print('Predicted values:\n', y_pre)
print('Test accuracy:\n', estimator.score(x_test, y_test))
print('Best cross-validation score:\n', estimator.best_score_)
print('Best model from cross-validation:\n', estimator.best_estimator_)
print('Results of each cross-validation run:\n', estimator.cv_results_)
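GridSearchCV also exposes best_params_, which reports the winning hyperparameter values directly (here, the chosen n_neighbors), e.g. print(estimator.best_params_).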