代码依旧参考:https://github.com/maplezzz/NTU_ML2017_Hung-yi-Lee_HW
Requirement
Dataset and Task Introduction
-
TASK: Binary Classification
Determine whether a person makes over 50K a year
-
Dataset: ADULT
Extraction was done by Barry Becker from the 1994 Census database.
A set of reasonably clean records was extracted using the following conditions: ((AGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0)).
Data Attribute Information
-
train.csv 、test.csv:
age, workclass, fnlwgt, education, education num, marital-status, occupation
relationship, race, sex, capital-gain, capital-loss, hours-per-week,
native-country -
make over 50K a year or not
-
For more details please check out Kaggle’s Description Page
一、Probabilistic generation model概率生成模型
- 对于二分类问题,给一个 x(由它的特征 attribute 表示的一个向量 vector),考虑它属于某一类 $C_1$ 的概率:
$$P(C_1|x) = \frac{P(x|C_1)P(C_1)}{P(x|C_1)P(C_1) + P(x|C_2)P(C_2)}$$
其中 $P(x|C_1)$ 和 $P(C_1)$ 都是未知的。
- 假设 x 服从高斯分布(Gaussian distribution):
$$f_{\mu,\Sigma}(x) = \frac{1}{(2\pi)^{D/2}\,|\Sigma|^{1/2}} \exp\!\left(-\frac{1}{2}(x-\mu)^T \Sigma^{-1} (x-\mu)\right)$$
只要找到了高斯分布的参数 $\mu$ 和 $\Sigma$,我们就可以知道 x 属于 $C_1$ 的概率 $P(C_1|x)$。
- 极大似然估计 Maximum Likelihood 是一种参数估计方法。Likelihood of a Gaussian with mean $\mu$ and covariance matrix $\Sigma$ = the probability of the Gaussian samples:
$$L(\mu,\Sigma) = \prod_{n=1}^{N} f_{\mu,\Sigma}(x^n)$$
- 模型改进:Share the same covariance matrix to avoid overfitting.
- Posterior Probability 与 Sigmoid function
令 $z = \ln\dfrac{P(x|C_1)P(C_1)}{P(x|C_2)P(C_2)}$,则 $P(C_1|x) = \sigma(z) = \dfrac{1}{1+e^{-z}}$.
将高斯形式的 $P(x|C_1)$、$P(x|C_2)$ 和 $\dfrac{P(C_1)}{P(C_2)} = \dfrac{N_1}{N_2}$ 带入 $z$,化简后得到线性形式:
$$z = w \cdot x + b,\quad w^T = (\mu^1-\mu^2)^T\Sigma^{-1},\quad b = -\tfrac{1}{2}(\mu^1)^T\Sigma^{-1}\mu^1 + \tfrac{1}{2}(\mu^2)^T\Sigma^{-1}\mu^2 + \ln\tfrac{N_1}{N_2}$$
因此 $P(C_1|x) = \sigma(w \cdot x + b)$.
In the generative model, we estimate $N_1, N_2, \mu^1, \mu^2, \Sigma$ from data, then we have $w$ and $b$.
二、数据预处理
def dataProcess_X(rawData):
    """Convert raw ADULT census rows into a normalized numeric feature frame.

    - "sex" is dropped first and re-inserted as a 0/1 column (1 == " Female",
      note the leading space in the raw dataset values).
    - Remaining categorical (object-dtype) columns are one-hot encoded.
    - The "income" label column, when present, is removed.
    - Every feature is z-score normalized: (x - mean) / std.

    NOTE(review): a constant column would have std == 0 and produce NaNs —
    assumed not to occur in this dataset.
    """
    # Drop the label (if present) and the binary "sex" column; sex is handled
    # separately below so it does not get expanded into two one-hot columns.
    if "income" in rawData.columns:
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)

    # Partition columns into categorical (object dtype) and numeric.
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]
    ObjectData = Data[listObjectColumn]
    # .copy() so insert() below mutates an independent frame, not a view of Data.
    NonObjectData = Data[listNonObjedtColumn].copy()

    # Re-insert sex as a numeric 0/1 flag at column position 0.
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; builtin int is correct.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype(int))

    # One-hot encode every categorical column.
    ObjectData = pd.get_dummies(ObjectData)

    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")

    # z-score normalization per column (.std() is the sample standard deviation).
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x
#获取training data_y
def dataProcess_Y(rawData):
    """Return a one-column label frame: 1 where income is " >50K", else 0."""
    labels = (rawData['income'] == ' >50K').astype("int64")
    return pd.DataFrame(labels, columns=["income"])
- 原始数据 rawData:对于非数字型的数据进行 one-hot 编码处理;其中性别只有两个取值,直接用 0、1 表示即可:
List(Data)
Data.columns
查看所有数据:
- 将非数字型数据和数字型数据分开:
展开看一下:
把性别数据插入到数字型:
NonObjectData.insert(0 ,"sex", (rawData["sex"] == " Female").astype(np.int))#insert在0处插入"sex",astype强制类型转换
- 对字符型数据进行one-hot编码:
ObjectData = pd.get_dummies(ObjectData)#get_dummies可进行one-hot处理
- 归一化
Data_x = (Data_x - Data_x.mean()) / Data_x.std()#.std求标准差
- 处理data_y
三、生成模型Generative Model
def sigmoid(z):
    """Logistic function 1/(1+e^-z), clipped away from exactly 0 and 1.

    The clip keeps downstream np.log() calls finite.
    """
    value = 1.0 / (1.0 + np.exp(-z))
    return np.clip(value, 1e-8, 1 - 1e-8)
def _shuffle(X, Y): #X and Y are np.array
#shuffle洗牌,shuffle()将序列的所有元素随机排序
randomize = np.arange(X.shape[0])#trainingdata_x中有32560比数据,arange函数用于创建等差数组
np.random.shuffle(randomize)#np.random.shuffle(x):在原数组上进行,改变自身序列,无返回值。
return (X[randomize], Y[randomize])
#将Training set分为Training set和Validation set
def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y), then carve off the leading `percentage` fraction as validation.

    Returns (X_train, Y_train, X_valid, Y_valid).
    """
    total = X.shape[0]
    n_valid = int(floor(total * percentage))  # floor keeps the split size integral
    X, Y = _shuffle(X, Y)
    return X[n_valid:], Y[n_valid:], X[:n_valid], Y[:n_valid]
#求验证集上的结果及其精确度
def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Score the shared-covariance Gaussian generative classifier and print accuracy.

    Implements the closed-form posterior P(C1|x) = sigmoid(w.x + b) with
    w = (mu1-mu2) Sigma^-1 and the standard bias term including ln(N1/N2).
    Prints accuracy; returns None.
    """
    precision = inv(shared_sigma)  # inverse of the pooled covariance
    w = np.dot((mu1 - mu2), precision)
    b = (-0.5) * np.dot(np.dot(mu1.T, precision), mu1) \
        + (0.5) * np.dot(np.dot(mu2.T, precision), mu2) \
        + np.log(float(N1) / N2)
    z = np.dot(w, X.T) + b
    predictions = np.around(sigmoid(z))  # round posterior to a 0/1 label
    hits = (np.squeeze(Y) == predictions)
    print('Valid acc = %f' % (float(hits.sum()) / hits.shape[0]))
    return
#求高斯分布参数
def train(X_train, Y_train):
    """Fit a two-class Gaussian generative model with a shared covariance matrix.

    Class 1 is Y == 1 (income > 50K); class 2 is everything else.
    Returns (mu1, mu2, shared_sigma, N1, N2) where N1/N2 are class counts.

    Improvement: the feature dimension is read from X_train.shape[1] instead of
    the previous hard-coded 106, so the function works for any feature count.
    """
    train_data_size = X_train.shape[0]
    dim = X_train.shape[1]
    cnt1 = 0
    cnt2 = 0
    # Accumulate class-conditional means.
    mu1 = np.zeros((dim,))
    mu2 = np.zeros((dim,))
    for i in range(train_data_size):
        if Y_train[i] == 1:  # >50K
            mu1 += X_train[i]
            cnt1 += 1
        else:
            mu2 += X_train[i]
            cnt2 += 1
    mu1 /= cnt1
    mu2 /= cnt2
    # Maximum-likelihood class covariances (divide by N, not N-1).
    sigma1 = np.zeros((dim, dim))
    sigma2 = np.zeros((dim, dim))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            diff = [X_train[i] - mu1]
            sigma1 += np.dot(np.transpose(diff), diff)
        else:
            diff = [X_train[i] - mu2]
            sigma2 += np.dot(np.transpose(diff), diff)
    sigma1 /= cnt1
    sigma2 /= cnt2
    # Pool the two covariances weighted by class priors to reduce overfitting.
    shared_sigma = (float(cnt1) / train_data_size) * sigma1 \
                   + (float(cnt2) / train_data_size) * sigma2
    return mu1, mu2, shared_sigma, cnt1, cnt2
'''
当.py文件被直接运行时,if __name__ == '__main__'之下的代码块将被运行;
当.py文件以模块形式被导入时,if __name__ == '__main__'之下的代码块不被运行。
'''
if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # trainData has one extra one-hot column (a native country absent from
    # testData); drop it so the train/test feature matrices line up.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Hold out 10% for validation and report validation accuracy.
    vaild_set_percetange = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Retrain on the full training set, then score the test set with the
    # closed-form w and b of the shared-covariance generative model.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    sigma_inv = inv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1) \
        + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2) \
        + np.log(float(N1) / N2)
    a = np.dot(w, x_test.T) + b
    # np.int was removed in NumPy 1.24; builtin int is the correct dtype.
    y_ = np.around(sigmoid(a)).astype(int)

    result = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(result.sum()) / result.shape[0]))

    df = pd.DataFrame({"id": np.arange(1, 16282), "label": y_})
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # os.path.join takes directory and filename as separate arguments;
    # the original concatenated them into a single argument.
    df.to_csv(os.path.join(output_dir, 'gd_output.csv'), sep='\t', index=False)
y_为模型计算结果,Y为实际结果:
四、逻辑回归Logistic Regression
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os
output_dir = "output/"
def dataProcess_X(rawData):
    """Preprocess raw ADULT rows into a z-score-normalized numeric frame.

    Drops the "income" label when present, encodes "sex" as 0/1
    (1 == " Female" — raw values carry a leading space), one-hot encodes the
    remaining object-dtype columns, and normalizes every column.
    """
    # "sex" has exactly two values, so it is encoded manually rather than
    # through get_dummies (which would create two redundant columns).
    if "income" in rawData.columns:
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)

    # Split into categorical and numeric column sets.
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]
    ObjectData = Data[listObjectColumn]
    # Independent copy so insert() does not write into a view of Data.
    NonObjectData = Data[listNonObjedtColumn].copy()

    # Fix: np.int was removed in NumPy 1.24 — use the builtin int instead.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype(int))

    ObjectData = pd.get_dummies(ObjectData)  # one-hot expansion
    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")

    # Per-column z-score normalization.
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x
def dataProcess_Y(rawData):
    """Extract the binary target: 1 for " >50K", 0 otherwise, as a DataFrame."""
    flags = (rawData['income'] == ' >50K').astype("int64")
    return pd.DataFrame(flags, columns=["income"])
def sigmoid(z):
    """Clipped logistic function; keeps outputs in [1e-8, 1-1e-8] so log() stays finite."""
    activation = 1.0 / (1.0 + np.exp(-z))
    return np.clip(activation, 1e-8, 1 - 1e-8)
def _shuffle(X, Y): #X and Y are np.array
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
def split_valid_set(X, Y, percentage):
    """Random train/validation split; the first `percentage` of shuffled rows is validation.

    Returns (X_train, Y_train, X_valid, Y_valid).
    """
    count = X.shape[0]
    cut = int(floor(count * percentage))
    X, Y = _shuffle(X, Y)
    return X[cut:], Y[cut:], X[:cut], Y[:cut]
#求验证及上地实验结果
def valid(X, Y, w):
    """Print accuracy of logistic-regression weights w on (X, Y); return 0/1 predictions.

    Assumes the bias is folded into X as a leading column of ones by the caller.
    """
    scores = sigmoid(np.dot(w, X.T))
    preds = np.around(scores)  # threshold at 0.5
    correct = (np.squeeze(Y) == preds)
    print('Valid acc = %f' % (float(correct.sum()) / correct.shape[0]))
    return preds
def train(X_train, Y_train):
    """Train logistic regression with mini-batch gradient descent.

    Returns the learned weight vector; the bias is assumed to be folded into
    X_train as a leading column of ones by the caller. Also saves/shows a plot
    of the per-epoch cross-entropy.

    Fixes vs. the original:
    - the batch index now starts at 0, so the first mini-batch of every epoch
      is no longer skipped (was `range(1, step_num)`);
    - the epoch loop runs the full `epoch_num` epochs (was `range(1, epoch_num)`,
      one epoch short);
    - removed the dead `s_grad` accumulator and commented-out Adagrad code.
    """
    w = np.zeros(len(X_train[0]))
    l_rate = 0.001
    batch_size = 32
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 300
    list_cost = []
    for epoch in range(epoch_num):
        total_loss = 0.0
        # Reshuffle every epoch so mini-batches differ between epochs.
        X_train, Y_train = _shuffle(X_train, Y_train)
        for idx in range(step_num):
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]
            y = sigmoid(np.dot(X, w))
            # Mean binary cross-entropy of the batch (monitoring only).
            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y))
                                  + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y))) / len(Y)
            total_loss += cross_entropy
            # Gradient of the cross-entropy w.r.t. w, summed over the batch.
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)
            w = w - l_rate * grad
        list_cost.append(total_loss)

    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.savefig(os.path.join(os.path.dirname(output_dir), "TrainProcess"))
    plt.show()
    return w
if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # trainData carries one one-hot column missing from testData; drop it so
    # the two feature matrices have identical columns.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Prepend a bias column of ones to both matrices.
    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)

    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)
    w_train = train(X_train, Y_train)   # train on the 90% split
    # Fix: score the HELD-OUT split; the original scored the training split,
    # which made X_valid/Y_valid unused and over-reported accuracy.
    valid(X_valid, Y_valid, w_train)
    w = train(x_train, y_train)         # retrain on the full training set
    y_ = valid(x_test, y_ans, w)        # test-set accuracy + predictions

    # Cast the 0.0/1.0 predictions to int so the CSV holds clean labels.
    df = pd.DataFrame({"id": np.arange(1, 16282), "label": y_.astype(int)})
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # os.path.join takes directory and filename as separate arguments.
    df.to_csv(os.path.join(output_dir, 'lr_output.csv'), sep='\t', index=False)
1、步骤
- Step1:Function Set
- Step 2:Training Data:$(x^n, \hat{y}^n)$
$\hat{y}^n$:1 for class 1, 0 for class 2(与代码中的 0/1 标签一致)
Cross Entropy:$L(w,b) = -\sum_n \left[\hat{y}^n \ln f_{w,b}(x^n) + (1-\hat{y}^n)\ln\bigl(1-f_{w,b}(x^n)\bigr)\right]$
- Step 3:Update:$w \leftarrow w - \eta \sum_n -\bigl(\hat{y}^n - f_{w,b}(x^n)\bigr)x^n$
2、mini-batch 梯度下降
当数据集很大时,训练算法是非常慢的,
和 batch 梯度下降相比,使用 mini batch 梯度下降更新参数更快,有利于更鲁棒地收敛,避免局部最优。
和 stochastic 梯度下降相比,使用 mini batch 梯度下降的计算效率更高,可以帮助快速训练模型。
3、推广:多分类问题Softmax
五、神经网络模型
使用了Adam优化方法,看到训练过程,心情还是无比激动的!
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os
output_dir = "output/"
def dataProcess_X(rawData):
    """Build the normalized numeric feature matrix from raw ADULT rows.

    "sex" is mapped to 0/1 by hand (1 == " Female"; raw values have a leading
    space), other object-dtype columns are one-hot encoded, the "income"
    label is dropped when present, and all columns are z-score normalized.
    """
    # Handle "sex" separately: it is binary, so one 0/1 column suffices.
    if "income" in rawData.columns:
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)

    # Categorical vs. numeric column split.
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]
    ObjectData = Data[listObjectColumn]
    # Copy so insert() below operates on an independent frame, not a view.
    NonObjectData = Data[listNonObjedtColumn].copy()

    # Fix: np.int was removed in NumPy 1.24 — builtin int replaces it.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype(int))

    ObjectData = pd.get_dummies(ObjectData)  # one-hot encode categoricals
    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")

    # Normalize each column to zero mean, unit (sample) standard deviation.
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x
def dataProcess_Y(rawData):
    """One-column DataFrame of 0/1 labels; 1 means income " >50K"."""
    target = (rawData['income'] == ' >50K').astype("int64")
    return pd.DataFrame(target, columns=["income"])
if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # trainData has one extra one-hot column; drop it to match testData.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Two sigmoid hidden layers plus a sigmoid output for binary classification.
    # input_dim is taken from the data instead of the previous hard-coded 106.
    model = Sequential()
    model.add(Dense(units=600, activation='sigmoid', input_dim=x_train.shape[1]))
    model.add(Dense(units=600, activation='sigmoid'))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=32, epochs=50)

    score = model.evaluate(x_test, y_ans)  # [loss, accuracy] on the test set
    result = np.squeeze(model.predict(x_test))

    # Ensure the output directory exists BEFORE saving anything into it
    # (the original saved the model first and only then created the dir).
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # os.path.join takes directory and filename as separate arguments.
    model.save(os.path.join(output_dir, 'nn_model.h5'))

    # np.int was removed in NumPy 1.24; builtin int is the correct dtype.
    y_ = np.around(result).astype(int)
    correct = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(correct.sum()) / correct.shape[0]))

    df = pd.DataFrame({"id": np.arange(1, 16282), "label": y_})
    df.to_csv(os.path.join(output_dir, 'nn_output.csv'), sep='\t', index=False)