# 一、导入库 (1. Import libraries)
import numpy as np
# pandas must be bound to `pd`: the data-loading step calls `pd.read_csv`,
# and aliasing pandas to `np` would silently shadow numpy.
import pandas as pd
# 二、导入数据集 (2. Import the dataset)
# Load the raw table; 'Data.csv' is resolved relative to the working directory.
dataset = pd.read_csv('Data.csv')
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target vector: the last column. It is selected relative to the end (rather
# than by the fixed index 3) so it always matches the feature slice above and
# the label can never end up inside X if the CSV has a different width.
Y = dataset.iloc[:, -1].values
# 三、处理丢失数据 (3. Handle missing data)
# Fill missing entries in columns 1-2 of X with the mean of each column.
# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
# `sklearn.impute.SimpleImputer` is its replacement. It always imputes
# column-wise, so the old `axis=0` argument is dropped.
from sklearn.impute import SimpleImputer

# A real NaN marks a missing value (the old API spelled this as the string
# 'NaN'). float("nan") is used so this step does not depend on the `np` alias.
imputer = SimpleImputer(missing_values=float("nan"), strategy="mean")
# Columns 1 and 2 are presumably the numeric features - confirm against Data.csv.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# 四、分类数据编码 (4. Encode categorical data)
# Encode the categorical feature in column 0 of X and the target Y.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# Label encoding treats X[:, 0] as a set: duplicates are dropped to obtain the
# sorted label array, e.g. [France, Germany, Spain], and every element is then
# replaced by the index of its label in that array. For example
# [France,Spain,Germany,Spain,Germany,France,Spain,France,Germany,France]
# becomes [0,2,1,2,1,0,2,0,1,0].

# `OneHotEncoder(categorical_features=[0])` is no longer accepted by current
# scikit-learn; selecting the column through a ColumnTransformer is the
# replacement. The one-hot columns come first and the untouched columns are
# appended after them (remainder="passthrough"), matching the old layout.
# sparse_threshold=0 forces a dense result, which replaces the old `.toarray()`.
onehotencoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
# The passthrough columns keep X's original dtype, so cast to float to get the
# plain numeric matrix the old encoder returned.
X = onehotencoder.fit_transform(X).astype(float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
# 五、划分测试集和训练集 (5. Split into training and test sets)
# Hold out 20% of the samples as a test set; random_state=0 fixes the shuffle
# so the split is reproducible.
# `sklearn.cross_validation` no longer exists in current scikit-learn;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# 六、特征缩放 (6. Feature scaling)
# Standardize the features (zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# Learn the per-column mean/std from the training data only, then scale it.
X_train = sc_X.fit_transform(X_train)
# Apply the training-set statistics to the test data. Re-fitting on X_test
# (fit_transform) would scale it with different parameters than the training
# set and overwrite the fitted scaler with test-set statistics.
X_test = sc_X.transform(X_test)