sklearn的基本操作也不是很复杂

qq_34799796

已于 2024-07-28 22:59:54 修改

阅读量433

点赞数 8

文章标签： sklearn 人工智能 python

于 2024-07-28 22:37:34 首次发布

本文链接：https://blog.csdn.net/qq_34799796/article/details/140757813

版权

import pandas as pd
 
# Load the dataset; the path is resolved against the current working directory.
df = pd.read_csv("test.csv")
print(df)

# Keep only the first row of every group that shares the same values in the
# three columns below.  (Calling df.drop_duplicates() with no subset would
# compare whole rows instead.)
# The filtered frame is stored in dest_df; df itself is left untouched.
dest_df = df[
    ~df.duplicated(subset=['Pregnancies', 'Glucose', 'BloodPressure'])
]
print(dest_df)


# Row selection / conditional assignment with DataFrame.loc, kept for reference:
#   df.loc[df['AGE'] == 23]                         rows where AGE is 23
#   df.loc[~df['Y'].isna()]                         `~` negates the boolean mask
#   df.loc[df['DiabetesPedigreeFunction'].isna()]   rows whose value is NaN
#   df.loc[df['Pregnancies'] == 5]                  every column where Pregnancies is 5
#   df.loc[df['Pregnancies'] == 5, 'BMI'] = 1       set BMI to 1 on those rows
#   df.loc[df['SEX'] == 'Unknown', 'SEX'] = 1
#   df.loc[df['BMI'] == 33.6, 'BMI'] = 33.7         replace 33.6 with 33.7

# Fill missing DiabetesPedigreeFunction values with 0.672.
# A second, identical fill using 0.627 previously followed this one; it could
# never match a row because every NaN is already replaced here, so it has been
# removed.  The effective fill value (0.672) is unchanged.
df.loc[df['DiabetesPedigreeFunction'].isna(), 'DiabetesPedigreeFunction'] = 0.672

# Standardise BMI to z-scores: (value - mean) / standard deviation.
# The divisor has to be the column's standard deviation.  Dividing by the
# column itself computes 1 - mean / BMI, which is not a standardisation and
# yields an infinite value wherever BMI is 0.
# Series.std() uses the sample standard deviation (ddof=1) by default.
df['BMI'] = (df['BMI'] - df['BMI'].mean()) / df['BMI'].std()
print(df)
 
 
# 对boolean类型做one-hot处理
# one_hot_encoded_df = pd.get_dummies(df, columns=['SEX'])
# one_hot_encoded_df
 
 
from matplotlib import pyplot as plt
 
# Histogram of the Glucose column.  `bins` is the number of bins (a sequence
# of bin edges is also accepted).
# The histogram gets its own figure and is shown immediately: Series.hist
# draws on the current axes, and if those axes are left open the later
# plt.plot(...) calls for the PR curve are drawn on top of the histogram.
plt.figure()
df['Glucose'].hist(bins=100)
plt.show()
 
 
from sklearn.model_selection import train_test_split
# Feature matrix: the two predictor columns.
x = df[['Glucose', 'BMI']]
# Target as a 1-D Series.  scikit-learn estimators expect y of shape
# (n_samples,); a one-column DataFrame makes several of them (for example
# RandomForestClassifier and SVC, both imported by this script) emit a
# DataConversionWarning.
y = df['Outcome']
# Hold out 20% of the rows for evaluation.  The "_text" names denote the test
# split; they are kept as-is because the rest of the script refers to them.
# random_state pins the shuffle so that scores are reproducible between runs.
x_train, x_text, y_train, y_text = train_test_split(
    x, y, test_size=0.2, random_state=42
)
 
 
from sklearn.pipeline import Pipeline
# 分类模型
from sklearn.tree import DecisionTreeClassifier  # 分类决策树
from sklearn.svm import SVC #支持向量机
from sklearn.ensemble import RandomForestClassifier #随机森林分类
 
# 回归模型
from sklearn.linear_model import LinearRegression # 线性回归
from sklearn.tree import DecisionTreeRegressor # 回归决策树
from sklearn.ensemble import RandomForestRegressor # 随机森林回归
 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
 
# Preprocessing applied to the feature frame before it reaches the model.
x_preprocessor = ColumnTransformer(
    transformers=[
        # Standardise Glucose with statistics learned during fit.
        ('scale', StandardScaler(), ['Glucose']),
        # Example of one-hot encoding a categorical column:
        # ('encode', OneHotEncoder(), ['Sex']),
    ],
    # ColumnTransformer drops every column that is not listed in
    # `transformers` unless told otherwise.  With the default, BMI was
    # silently removed and the models were trained on Glucose alone, even
    # though BMI is selected as a feature.  Pass it through unchanged.
    remainder='passthrough',
)
# Preview the transformed features.  Only the model's feature columns are
# handed over so that 'passthrough' does not pull in Outcome or any other
# column of df.
r = x_preprocessor.fit_transform(df[['Glucose', 'BMI']])
# A bare `r` expression only displays in a notebook/REPL; print it so the
# preview is also visible when the file runs as a script.
print(r)
 
# Regression pipeline: shared preprocessing followed by a decision-tree
# regressor.
regression_steps = [
    ('preprocessor', x_preprocessor),
    ("regression", DecisionTreeRegressor()),
]
pipeline = Pipeline(steps=regression_steps)

# Train on the training split.
pipeline.fit(x_train, y_train)
# Evaluate on the held-out split (for a regressor, score() is R^2).
print("Model score:", pipeline.score(x_text, y_text))
 
 
 
# Regression metrics on the held-out split.
from sklearn.metrics import mean_squared_error, r2_score

# Predictions of the regression pipeline for the test features.
y_pred = pipeline.predict(x_text)

# Mean squared error (MSE).
mse = mean_squared_error(y_true=y_text, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

# Coefficient of determination (R-square).
score = r2_score(y_true=y_text, y_pred=y_pred)
print(f"R-square: {score}")
 
 
# Classification pipeline: shared preprocessing followed by a decision-tree
# classifier.
classification_steps = [
    ('preprocessor', x_preprocessor),
    ("classifier", DecisionTreeClassifier()),
]
pipeline2 = Pipeline(steps=classification_steps)

# Train on the training split.
pipeline2.fit(x_train, y_train)
# Evaluate on the held-out split (for a classifier, score() is mean accuracy).
print("Model score:", pipeline2.score(x_text, y_text))
 
 
 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, precision_recall_curve, recall_score, precision_score,roc_auc_score
 
# Classification metrics on the held-out split.

# Hard class labels, used by the threshold-dependent metrics below.
y_pred = pipeline2.predict(x_text)
# Predicted probability of the positive class, used by the ranking metric
# (AUC) and by the PR / ROC curves.  Those functions sweep a decision
# threshold over continuous scores; feeding them hard 0/1 labels collapses
# each curve to a single operating point and distorts the AUC.
# Column 1 of predict_proba corresponds to pipeline2.classes_[1]
# (assumes a binary 0/1 Outcome where 1 is the positive class - confirm
# against the data).
y_score = pipeline2.predict_proba(x_text)[:, 1]

# Accuracy
print("accuracy_score:", accuracy_score(y_text, y_pred))
# F1 score
print("f1_score:", f1_score(y_text, y_pred))
# Confusion matrix
print("confusion_matrix:", confusion_matrix(y_text, y_pred))
# Area under the ROC curve
print("roc_auc_score:", roc_auc_score(y_text, y_score))

# Precision-recall curve.  Single-point values are available through
# recall_score(y_text, y_pred) and precision_score(y_text, y_pred).
precision, recall, pr_thresholds = precision_recall_curve(y_text, y_score)
# Start a fresh figure so the curve is not drawn over an earlier plot.
plt.figure()
plt.plot(recall, precision, label='PR curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower right')
plt.show()

# ROC curve
fpr, tpr, roc_thresholds = roc_curve(y_text, y_score)
plt.figure()
plt.plot(fpr, tpr, 'b', label='ROC curve')
# Optional diagonal as the random-guess reference line:
# plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')

plt.show()