In [1]:
# Importing the libraries 导入库
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [24]:
# Load the dataset: every column except the last is a feature, the last is the target
dataset = pd.read_csv('./Data.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
dataset
Out[24]:
In [25]:
# Inspect the feature matrix
X
Out[25]:
In [26]:
# Inspect the target vector
y
Out[26]:
In [27]:
# Replace missing values (NaN) in columns 1 and 2 with the mean of their column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; `SimpleImputer`
# is its replacement and always imputes column-wise (the former `axis = 0`).
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X[:, 1:3])  # learn the per-column means
X[:, 1:3] = imputer.transform(X[:, 1:3])  # write the filled columns back into X
X
Out[27]:
In [28]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Turn the country names in column 0 into integer codes
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])
X
Out[28]:
In [29]:
# One-hot encode the country codes in column 0 into dummy variables.
# `OneHotEncoder(categorical_features = ...)` was removed in scikit-learn 0.22;
# selecting which columns to encode is now the job of `ColumnTransformer`.
from sklearn.compose import ColumnTransformer
onehotencoder = OneHotEncoder()
columntransformer = ColumnTransformer(
    [('country', onehotencoder, [0])],
    remainder = 'passthrough',  # keep the other columns, placed after the dummy columns
    sparse_threshold = 0,       # always return a dense array (replaces `.toarray()`)
)
# Cast to float so the result is a plain numeric array, as the old encoder returned
# (X is presumably an object array here because it originally held country strings).
X = np.asarray(columntransformer.fit_transform(X), dtype = float)
X
Out[29]:
In [30]:
# Turn the "purchased" labels in y into integer codes
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)
y
Out[30]:
In [31]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 0,
)
In [33]:
# Inspect the test-set features
X_test
Out[33]:
In [34]:
# Standardize the features: the scaler is fitted on the training set only and
# the same fitted scaler is then applied to both the training and the test set
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)
X_test
Out[34]:
In [35]:
# Inspect the target vector
y
Out[35]:
git项目地址: