任务
二分类任务:确定一个人是否年收入超过5万美元。
借助于高斯概率生成模型进行二分类
数据预处理
首先,对我们的train集进行研究,共15列数据,其中有些数据为数值型,不需要进行处理直接使用即可。还有一些非数值类型的数据,需要使用独热编码进行处理。
我们要将train集分为X_train与Y_train两个部分,对X_train进行独热编码处理后,每行包含一个106维特征向量。
Y_train:label=0表示“<=50K”,label=1表示“>50K”。
def dataProcess_X(rawData):
    """One-hot encode categorical columns and z-score normalize all features.

    Drops the raw "sex"/"income" columns, re-inserts sex as a single binary
    feature (Male=0, Female=1), one-hot encodes the remaining object-dtype
    columns, and standardizes every column to zero mean / unit variance.

    :param rawData: pandas DataFrame of the raw census rows.
    :return: normalized feature DataFrame.
    """
    # "sex" has only two categories, so drop it here and re-add it as one
    # binary column below; "income" is the label (present in train only).
    if "income" in rawData.columns:
        # axis=0 refers to the index, axis=1 refers to columns
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)
    # Object-dtype (non-numeric) columns need one-hot encoding, e.g.
    # ['workclass', 'education', 'marital_status', 'occupation',
    #  'relationship', 'race', 'native_country']
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]  # numeric columns
    ObjectData = Data[listObjectColumn]
    # .copy() so the insert() below modifies an independent frame instead of
    # a view of Data (avoids SettingWithCopyWarning).
    NonObjectData = Data[listNonObjedtColumn].copy()
    # insert sex into the numeric data with male = 0 and female = 1
    # (np.int was removed in NumPy 1.24; the builtin int is the supported spelling)
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype(int))
    # turn every category of each object column into its own 0/1 attribute
    ObjectData = pd.get_dummies(ObjectData)
    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")
    # normalize: mean() is the per-column mean, std() the per-column std-dev
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/20191122212530942.png)

```python
def dataProcess_Y(rawData):
    """Extract the binary label column: 1 for ' >50K', 0 for ' <=50K'."""
    is_high_income = rawData['income'] == ' >50K'
    return is_high_income.astype("int64").to_frame(name="income")
概率生成模型
后验概率公式:
P(C1|x)即为对于x属于C1类的概率
高斯分布模型,求得结果即为P(x|C1)
如果P(C1|x) > 0.5,我们就认为x属于C1类;否则认为x属于C2类。
之后我们对后验概率公式进行推导,发现求得激活函数的结果就可以得出后验概率的结果。
最终通过复杂的函数推导,我们最终得出后验概率模型的公式
也就是说要想求得z(也就是激活函数),我们只要求得N1(train_Y中1类个数)、N2(train_Y中2类个数),训练数据的均值以及协方差就可以得出我们的公式了。
下面给出具体代码实现:
def train(X_train, Y_train):
    """Fit a shared-covariance Gaussian generative model for binary classification.

    :param X_train: (n, d) feature matrix.
    :param Y_train: length-n labels (1 = >50K, 0 = <=50K).
    :return: (mu1, mu2, shared_sigma, N1, N2) — class means, the
             class-prior-weighted shared covariance, and the class counts.
    """
    train_data_size = X_train.shape[0]
    # Derive the feature dimension from the data instead of hard-coding 106,
    # so the function works for any feature count.
    dim = X_train.shape[1]
    cnt1 = 0
    cnt2 = 0
    mu1 = np.zeros((dim,))
    mu2 = np.zeros((dim,))
    # accumulate per-class sums, then divide by the counts to get the means
    for i in range(train_data_size):
        if Y_train[i] == 1:  # >50k
            mu1 += X_train[i]
            cnt1 += 1
        else:
            mu2 += X_train[i]
            cnt2 += 1
    mu1 /= cnt1
    mu2 /= cnt2
    # class-conditional covariances: average outer products of centered rows
    sigma1 = np.zeros((dim, dim))
    sigma2 = np.zeros((dim, dim))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            sigma1 += np.dot(np.transpose([X_train[i] - mu1]), [X_train[i] - mu1])
        else:
            sigma2 += np.dot(np.transpose([X_train[i] - mu2]), [X_train[i] - mu2])
    sigma1 /= cnt1
    sigma2 /= cnt2
    # single covariance shared by both classes, weighted by the class priors
    shared_sigma = (float(cnt1) / train_data_size) * sigma1 \
                 + (float(cnt2) / train_data_size) * sigma2
    return mu1, mu2, shared_sigma, cnt1, cnt2
训练过程中,有时因为数据过少会划分出部分的训练数据作为验证集,这里我们就使用了这个方法,来优化我们的模型。
实现代码如下:
def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) together and split off the first `percentage` fraction
    of rows as a validation set.

    :return: (X_train, Y_train, X_valid, Y_valid)
    """
    total_rows = X.shape[0]  # number of samples
    n_valid = int(floor(total_rows * percentage))
    X, Y = _shuffle(X, Y)
    return X[n_valid:], Y[n_valid:], X[:n_valid], Y[:n_valid]
最后通过我们的模型,进行预测将结果写入predict.csv文件即可。
完整的实现代码如下
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import os
import argparse
import pdb
def dataProcess_X(rawData):
    """One-hot encode categorical columns and z-score normalize all features.

    Drops the raw "sex"/"income" columns, re-inserts sex as a single binary
    feature (Male=0, Female=1), one-hot encodes the remaining object-dtype
    columns, and standardizes every column to zero mean / unit variance.

    :param rawData: pandas DataFrame of the raw census rows.
    :return: normalized feature DataFrame.
    """
    # "sex" has only two categories, so drop it here and re-add it as one
    # binary column below; "income" is the label (present in train only).
    if "income" in rawData.columns:
        # axis=0 refers to the index, axis=1 refers to columns
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)
    # Object-dtype (non-numeric) columns need one-hot encoding, e.g.
    # ['workclass', 'education', 'marital_status', 'occupation',
    #  'relationship', 'race', 'native_country']
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]  # numeric columns
    ObjectData = Data[listObjectColumn]
    # .copy() so the insert() below modifies an independent frame instead of
    # a view of Data (avoids SettingWithCopyWarning).
    NonObjectData = Data[listNonObjedtColumn].copy()
    # insert sex into the numeric data with male = 0 and female = 1
    # (np.int was removed in NumPy 1.24; the builtin int is the supported spelling)
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype(int))
    # turn every category of each object column into its own 0/1 attribute
    ObjectData = pd.get_dummies(ObjectData)
    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")
    # normalize: mean() is the per-column mean, std() the per-column std-dev
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x
def dataProcess_Y(rawData):
    """Extract the binary label column: 1 for ' >50K', 0 for ' <=50K'."""
    is_high_income = rawData['income'] == ' >50K'
    return is_high_income.astype("int64").to_frame(name="income")
def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), clipped away from exact 0 and 1
    so that downstream log/division never sees a degenerate probability."""
    eps = 1e-8
    value = 1.0 / (1.0 + np.exp(-z))
    return np.clip(value, eps, 1 - eps)
def _shuffle(X, Y): #X and Y are np.array
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) together and split off the first `percentage` fraction
    of rows as a validation set.

    :return: (X_train, Y_train, X_valid, Y_valid)
    """
    total_rows = X.shape[0]  # number of samples
    n_valid = int(floor(total_rows * percentage))
    X, Y = _shuffle(X, Y)
    return X[n_valid:], Y[n_valid:], X[:n_valid], Y[:n_valid]
def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Print the accuracy of the Gaussian generative model on (X, Y).

    Evaluates the posterior via the linear form z = w.x + b with
    w = (mu1 - mu2) Sigma^-1 and
    b = -1/2 mu1' Sigma^-1 mu1 + 1/2 mu2' Sigma^-1 mu2 + ln(N1/N2),
    then thresholds sigmoid(z) at 0.5.
    """
    precision = inv(shared_sigma)
    w = np.dot(mu1 - mu2, precision)
    bias = (
        (-0.5) * np.dot(np.dot(mu1.T, precision), mu1)
        + 0.5 * np.dot(np.dot(mu2.T, precision), mu2)
        + np.log(float(N1) / N2)
    )
    scores = np.dot(w, X.T) + bias
    predictions = np.around(sigmoid(scores))
    hits = (np.squeeze(Y) == predictions)
    print('Valid acc = %f' % (float(hits.sum()) / hits.shape[0]))
    return
def train(X_train, Y_train):
    """Fit a shared-covariance Gaussian generative model for binary classification.

    :param X_train: (n, d) feature matrix.
    :param Y_train: length-n labels (1 = >50K, 0 = <=50K).
    :return: (mu1, mu2, shared_sigma, N1, N2) — class means, the
             class-prior-weighted shared covariance, and the class counts.
    """
    train_data_size = X_train.shape[0]
    # Derive the feature dimension from the data instead of hard-coding 106,
    # so the function works for any feature count.
    dim = X_train.shape[1]
    cnt1 = 0
    cnt2 = 0
    mu1 = np.zeros((dim,))
    mu2 = np.zeros((dim,))
    # accumulate per-class sums, then divide by the counts to get the means
    for i in range(train_data_size):
        if Y_train[i] == 1:  # >50k
            mu1 += X_train[i]
            cnt1 += 1
        else:
            mu2 += X_train[i]
            cnt2 += 1
    mu1 /= cnt1
    mu2 /= cnt2
    # class-conditional covariances: average outer products of centered rows
    sigma1 = np.zeros((dim, dim))
    sigma2 = np.zeros((dim, dim))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            sigma1 += np.dot(np.transpose([X_train[i] - mu1]), [X_train[i] - mu1])
        else:
            sigma2 += np.dot(np.transpose([X_train[i] - mu2]), [X_train[i] - mu2])
    sigma1 /= cnt1
    sigma2 /= cnt2
    # single covariance shared by both classes, weighted by the class priors
    shared_sigma = (float(cnt1) / train_data_size) * sigma1 \
                 + (float(cnt2) / train_data_size) * sigma2
    return mu1, mu2, shared_sigma, cnt1, cnt2
if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # Preprocess. The train set produces one extra one-hot column
    # ('native_country_ Holand-Netherlands') that never appears in the test
    # set, so drop it to keep train/test feature dimensions aligned.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Hold out 10% of the training rows to report a validation accuracy.
    vaild_set_percetange = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Retrain on the full training set, then score the test set with the
    # closed-form linear discriminant derived from the generative model.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    sigma_inv = inv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1) \
        + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2) \
        + np.log(float(N1) / N2)
    a = np.dot(w, x_test.T) + b
    y = sigmoid(a)
    # np.int was removed in NumPy 1.24; the builtin int is the supported dtype.
    y_ = np.around(y).astype(int)

    result = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(result.sum()) / result.shape[0]))

    # Bug fix: output_dir was referenced without ever being defined
    # (NameError), and the previous os.path.join('predict.csv') ignored the
    # directory this code creates. Write the predictions inside output_dir.
    output_dir = "output"
    df = pd.DataFrame({"id": np.arange(1, 16282), "label": y_})
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    df.to_csv(os.path.join(output_dir, 'predict.csv'), sep='\t', index=False)