# 机器学习基础100天---day01 数据预处理

## 数据预处理

```text
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
```

# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer,LabelEncoder,OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

# Slice the dataset by integer position with iloc: the column at position 3
# is the target vector, and every column except the last one forms the
# feature matrix.
# NOTE(review): `dataset` is not defined in the visible code; presumably it is
# a DataFrame loaded beforehand (e.g. with pd.read_csv) -- confirm.
y = dataset.iloc[:, 3].values
X = dataset.iloc[:, :-1].values

# Handle missing data: fit an Imputer on columns 1 and 2 of X and replace each
# NaN in those columns with the mean of its own column (axis=0).
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer` -- confirm the scikit-learn version in use.
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Encode categorical data. Categorical variables hold labels rather than
# numbers (e.g. "Yes"/"No"); they cannot be used in numeric computation, so
# they are converted to integer codes first.

# Target vector: LabelEncoder maps each distinct label to an integer code.
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

# Feature column 0: same label -> integer encoding, written back in place.
label_x = LabelEncoder()
X[:, 0] = label_x.fit_transform(X[:, 0])

# Create dummy variables: one-hot encode column 0 and convert the result to a
# dense array.
# NOTE(review): the `categorical_features` argument was removed from
# OneHotEncoder in scikit-learn 0.22 -- confirm the scikit-learn version in use.
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

# Split the dataset into a training set and a test set; test_size=0.2 holds
# out 20% of the rows, and random_state=0 makes the split reproducible.
# The feature matrix is bound to the upper-case name `X`; no lower-case `x`
# exists, so passing `x` here would raise NameError.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Feature scaling: learn the scaling statistics from the training set only,
# then apply that same fitted transformation to the test set. Re-fitting on
# the test set would scale it with different statistics than the training
# data and leak test-set information into preprocessing.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)