# Step 1: Importing the libraries
import numpy as np
import pandas as pd

# Step 2: Importing dataset
# Load the CSV into a DataFrame.
dataset = pd.read_csv('../datasets/Data.csv')

# iloc[rows, columns]: the part before the comma selects rows, the part
# after it selects columns.
#   X -> every row, the first three columns (indices 0, 1, 2)
#   Y -> every row, the fourth column (index 3)
# NOTE(review): the original author's note says the column bound used to be
# written as -1 rather than a literal index; confirm which form is intended
# if the CSV layout ever changes.
feature_frame = dataset.iloc[:, :3]
target_series = dataset.iloc[:, 3]
X = feature_frame.values
Y = target_series.values

print("Step 2: Importing dataset")
for _label, _values in (("X", X), ("Y", Y)):
    print(_label)
    print(_values)
# Step 3: Handling the missing data
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer` (available since 0.20).
# Prefer the new class and only fall back to the old one on installations
# that predate it.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
else:
    # SimpleImputer treats NaN as the missing marker by default and always
    # works column-wise, so no `missing_values` / `axis` arguments are needed.
    imputer = SimpleImputer(strategy="mean")

# Replace each missing value with the mean of its column.
# X[:, 1:3] selects every row of columns 1 and 2 (the slice end is exclusive).
imputer = imputer.fit(X[:, 1:3])
# Write the imputed values back into X in place.
X[:, 1:3] = imputer.transform(X[:, 1:3])

print("---------------------")
print("Step 3: Handling the missing data")
print("step2")
print("X")
print(X)
# Step 4: Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label-encode the first column: each distinct category becomes an
# integer code.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Creating a dummy variable
# One-hot encode column 0 only. The `categorical_features` constructor
# argument was removed in scikit-learn 0.22, so the column is encoded on its
# own and the dummy columns are stacked to the left of the remaining columns,
# which is the same layout the old argument produced.
onehotencoder = OneHotEncoder()
dummy_columns = onehotencoder.fit_transform(X[:, [0]]).toarray()
# X holds Python objects at this point; cast the remaining columns to float
# so the result is a plain float array, as the old encoder returned.
X = np.hstack([dummy_columns, X[:, 1:].astype(float)])

# fit_transform learns the label set and encodes Y in a single call.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

print("---------------------")
print("Step 4: Encoding categorical data")
print("X")
print(X)
print("Y")
print(Y)
# Step 5: Splitting the datasets into training sets and Test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
_split = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = _split

print("---------------------")
print("Step 5: Splitting the datasets into training sets and Test sets")
for _label, _values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(_label)
    print(_values)
# Step 6: Feature Scaling
# Standardize the features to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, then apply the
# same fitted scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

print("---------------------")
print("Step 6: Feature Scaling")
for _label, _values in (("X_train", X_train), ("X_test", X_test)):
    print(_label)
    print(_values)