# 机器学习的典型例子-数据预处理

# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
# `dataset` is the DataFrame holding the imported data. The original script
# used `dataset` without ever assigning it, so the first access raised
# NameError.
# NOTE(review): the file name below is assumed from the usual preprocessing
# template -- point it at the actual data file before running.
dataset = pd.read_csv('Data.csv')

# Feature matrix: `iloc` selects by position as [rows, columns].
# `:` keeps every row; `:-1` keeps every column except the last one, which
# per the original notes is the "Purchased" label column.
# `.values` extracts the selected cells as an array.
X = dataset.iloc[:, :-1].values

# Target vector: every row of the column at positional index 3
# (i.e. the fourth column, since positions start at 0).
y = dataset.iloc[:, 3].values



from sklearn.model_selection import train_test_split


# Tip: press Ctrl+I in the IDE to view the help for the selected object.

from sklearn.impute import SimpleImputer

# SimpleImputer fills in missing values. It replaces the old
# `sklearn.preprocessing.Imputer`, which no longer exists in current
# scikit-learn releases. SimpleImputer always works column by column (the old
# `axis=0` behavior), and missing entries are identified with `np.nan` rather
# than the string 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit: learn the per-column means from columns 1 and 2. The slice 1:3 includes
# the start index and excludes the stop index.
imputer = imputer.fit(X[:, 1:3])
# transform: write the imputed values back into the same two columns.
X[:, 1:3] = imputer.transform(X[:, 1:3])


# Definition:
# Imputer(
#     missing_values="NaN",
#     strategy="mean",
#     axis=0,
#     verbose=0,
#     copy=True
# )


# Parameters
# missing_values : integer or "NaN", optional (default="NaN")
#
# strategy : string, optional (default="mean")
#     The imputation strategy.
#     If "mean", then replace missing values using the mean along the axis.
#     If "median", then replace missing values using the median along the axis.
#     If "most_frequent", then replace missing values using the most frequent
#     value along the axis.
#
# axis : integer, optional (default=0)
#     The axis along which to impute.
#     If axis=0, then impute along columns
#     (axis=0: each column's mean is used).
#     If axis=1, then impute along rows
#     (axis=1: each row's mean is used).

# Categorical data
from sklearn.preprocessing import LabelEncoder

# LabelEncoder turns the distinct category names into integer codes.
labelencoder_X = LabelEncoder()
# Learn the categories present in column 0 first...
labelencoder_X.fit(X[:, 0])
# ...then overwrite that column with the corresponding integer codes.
X[:, 0] = labelencoder_X.transform(X[:, 0])


# Categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder turns the distinct category names in column 0 into integer
# codes (fit_transform = fit the encoder, then transform the data).
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# One-hot encode column 0 only. The `categorical_features` argument of
# OneHotEncoder no longer exists in current scikit-learn releases; choosing
# which columns to encode is now done by ColumnTransformer.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [('onehot', onehotencoder, [0])],
    # Keep the untouched columns, appended after the one-hot columns
    # (the same column order the old API produced).
    remainder='passthrough',
    # Always return a dense array, matching the old `.toarray()` call.
    sparse_threshold=0,
)
# NOTE: ColumnTransformer fits a clone, so the fitted encoder lives in
# `column_transformer.named_transformers_['onehot']`.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)


# Encode the target labels as integers. fit() hands back the encoder itself,
# so `labelencoder_y` is bound to the fitted instance.
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)


09-30 1366

01-21 301
06-17 3098
02-13 36
07-03 2万+
05-04
06-19
09-21
07-10 172
08-13 6998
02-02 1892
08-15 2624
12-01 1327
06-20 4万+
05-06 296
05-23 5万+