# pre_Data (notebook cell marker — commented out; a bare name would raise NameError)
# Feature-scaling demos: apply several sklearn scalers to one small matrix.
from sklearn import preprocessing
import numpy as np
import warnings

warnings.filterwarnings("ignore")

samples = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -3.]])

# Standardization: zero mean, unit variance per column.
transformer = preprocessing.StandardScaler()
x_scale = transformer.fit_transform(samples)
print(x_scale)
# print(x_scale.mean(0), x_scale.std(0))

# Min-max scaling: map each column into [0, 1].
transformer = preprocessing.MinMaxScaler()
x_scale = transformer.fit_transform(samples)
print(x_scale)
print(x_scale.mean(0), x_scale.std(0))

# Max-abs scaling: divide each column by its maximum absolute value.
transformer = preprocessing.MaxAbsScaler()
x_scale = transformer.fit_transform(samples)
print(x_scale)
print(x_scale.mean(0), x_scale.std(0))

# Robust scaling: centers on the median and scales by the IQR
# (Q3 - Q1), which makes it far less sensitive to outliers.
transformer = preprocessing.RobustScaler()
x_scale = transformer.fit_transform(samples)
print(x_scale)
print(x_scale.mean(0), x_scale.std(0))

# Row-wise normalization to unit L2 norm.
transformer = preprocessing.Normalizer(norm="l2")
x_scale = transformer.fit_transform(samples)
print(x_scale)
print(x_scale.mean(0), x_scale.std(0))

# Binarization: values above the threshold become 1, the rest 0.
transformer = preprocessing.Binarizer(threshold=0)
x_scale = transformer.fit_transform(samples)
print(x_scale)
# Missing-value imputation.
# FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the supported replacement. Its default behavior imputes
# per column, which matches the old Imputer(axis=0) semantics.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit and transform in one step: each NaN is replaced by its column mean.
y_imp = imp.fit_transform([[np.nan, 2], [6, np.nan], [7, 6]])
print(y_imp)
# Fit on one dataset, then impute a different one with the learned column means.
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
y_imp = imp.transform([[np.nan, 2], [6, np.nan], [7, 6]])
print(y_imp)
# pre_datas (notebook cell marker — commented out; a bare name would raise NameError)
import pandas as pd
import numpy as np

# Build a toy DataFrame mixing two numeric and two categorical columns.
_columns = {
    'num_col1': np.random.rand(10),
    'num_col2': np.random.rand(10),
    'cat_col1': np.random.choice(['A', 'B', 'C'], 10),
    'cat_col2': np.random.choice(['X', 'Y', 'Z'], 10),
}
data = pd.DataFrame(_columns)

# Show the raw data before any preprocessing.
print("Original data:")
print(data)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocessing: scale numeric features, one-hot encode the categorical feature.
# FIX (target leakage): cat_col2 is the prediction target below, so it must
# NOT appear among the preprocessed features — the original encoded it as an
# input, which leaks the label and yields a meaninglessly perfect score.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['num_col1', 'num_col2']),  # standardize numeric columns
        ('cat', OneHotEncoder(), ['cat_col1'])                # one-hot encode the categorical feature
    ])
# Unlisted columns (including cat_col2) are dropped by the default remainder.
r = preprocessor.fit_transform(data)
r

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Chain preprocessing and the classifier so the same transforms are applied
# consistently at both fit and predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Features: everything except the target column (avoids leakage).
X = data.drop('cat_col2', axis=1)
y = data['cat_col2']  # target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the whole pipeline on the training split.
pipeline.fit(X_train, y_train)
# Mean accuracy on the held-out split.
print("Model score:", pipeline.score(X_test, y_test))
# reg (notebook cell marker — commented out; a bare name would raise NameError)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes regression dataset as plain arrays.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep a single feature (column index 5) so the fit can be drawn in 2D.
diabetes_X = diabetes_X[:, np.newaxis, 5]

# Hold out the last 100 samples for evaluation.
diabetes_X_train, diabetes_X_test = diabetes_X[:-100], diabetes_X[-100:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-100], diabetes_y[-100:]

# Ordinary least-squares linear regression, fit on the training portion.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# Predict on the held-out samples.
diabetes_y_pred = regr.predict(diabetes_X_test)

# Report the learned coefficient and test-set error metrics.
print('Coefficients: \n', regr.coef_)
print('Mean squared error: %.2f'
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Coefficient of determination: 1 would mean perfect prediction.
print('Coefficient of determination: %.2f'
      % r2_score(diabetes_y_test, diabetes_y_pred))

# Visualize the test points and the fitted line.
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
plt.show()
# regression (notebook cell marker — commented out; a bare name would raise NameError)
import pandas as pd

# Load the diabetes table (expects diabetes.csv next to this script).
df = pd.read_csv("diabetes.csv")
df

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocessing: standardize the single numeric feature S1.
# (Columns not listed here are dropped by ColumnTransformer's default remainder.)
x_preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), ["S1"]),  # standardize the numeric column
        # ('encode', OneHotEncoder(), ["SEX"])  # one-hot encode, if SEX is added later
    ])
r = x_preprocessor.fit_transform(df)
r

from sklearn.model_selection import train_test_split

# Feature: S1 only; target: Y.
X = df[['S1']]
y = df[['Y']]  # target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(y_train)

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Chain scaling with ordinary least-squares regression so the same
# transform is applied at fit and predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', x_preprocessor),
    ('regression', LinearRegression())
])

# Fit the pipeline on the training split.
pipeline.fit(X_train, y_train)
# R^2 on the held-out split.
print("Model score:", pipeline.score(X_test, y_test))

from sklearn.metrics import mean_squared_error, r2_score

y_pred = pipeline.predict(X_test)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

import matplotlib.pyplot as plt

plt.scatter(X_test["S1"], y_test, color='black')
plt.plot(X_test["S1"], y_pred, color='blue', linewidth=3)
# FIX: plt.show() was missing, so the figure never appears when this is run
# as a plain script (the earlier plotting section does call it).
plt.show()
# classifier (notebook cell marker, typo fixed — commented out; a bare name would raise NameError)
# 基础结构.py — basic classification workflow on the breast-cancer dataset.
#
import numpy as np
from sklearn import linear_model, svm, neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings("ignore")
# FIX: np.random.RandomState(0) only constructed (and immediately discarded)
# a generator object — it never seeded anything. Seed the global RNG so the
# train/test split below is reproducible.
np.random.seed(0)
# Load data
cancer = datasets.load_breast_cancer()
x, y = cancer.data, cancer.target
# Train/test split (30% held out for evaluation)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Standardize features using statistics computed on the training set only,
# so no information from the test set leaks into preprocessing.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Model (alternatives left commented for experimentation)
# clf = neighbors.KNeighborsClassifier(n_neighbors=12,algorithm='ball_tree')
# clf = linear_model.SGDClassifier()
clf = linear_model.LogisticRegression()
# clf = svm.SVC(kernel='rbf')
# Fit
clf.fit(x_train, y_train)
# Predict
y_pred = clf.predict(x_test)
# Evaluation
print(accuracy_score(y_test, y_pred))
# f1_score
print(f1_score(y_test, y_pred, average='micro'))
# Classification report
print(classification_report(y_test, y_pred))
# Confusion matrix
print(confusion_matrix(y_test, y_pred))
# FIX: roc_curve's signature is roc_curve(y_true, y_score) — the original
# passed the arguments in reverse order. NOTE(review): hard 0/1 predictions
# give only a single-threshold curve; decision scores (e.g. predict_proba)
# would give a smoother ROC, but not every commented model supports them.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, 'b')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
# knn (notebook cell marker — commented out; a bare name would raise NameError)
# K-nearest-neighbours imputation of missing values.
# FIX: missingpy is unmaintained and its KNNImputer no longer imports against
# modern scikit-learn; sklearn.impute.KNNImputer is the drop-in replacement
# with the same n_neighbors / weights interface.
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3, weights="uniform")
# NOTE(review): `data` and its 'Age' column are not defined anywhere in this
# file — presumably loaded in an earlier notebook cell; confirm before running.
impute_age = imputer.fit_transform(data[['Age']])
impute_age

# Random-forest based imputation. There is no direct sklearn equivalent of
# MissForest (IterativeImputer with a tree regressor is the closest), so the
# missingpy import is kept — but note the library is unmaintained.
from missingpy import MissForest

imputer = MissForest()
data_imputed = imputer.fit_transform(data)
data_imputed