基于概率生成模型的二分类

任务

二分类任务:确定一个人是否年收入超过5万美元。
借助于高斯概率生成模型进行二分类

数据预处理

首先,对我们的train集进行研究,共15列数据,其中有些数据为数值型,不需要进行处理直接使用即可。还有一些非数值类型的数据,需要使用独热编码进行处理。
我们要将train集分为X_train与Y_train两个部分。X_train经过独热编码处理后,每行是一个106维的特征向量。
Y_train: label=0表示"<=50K",label=1表示">50K"

def dataProcess_X(rawData):
    """Build the normalised feature table from a raw census DataFrame.

    The ``sex`` column is encoded as a single 0/1 column (1 when the value is
    exactly ``" Female"``), every other non-numeric column is one-hot encoded,
    and each resulting column is standardised with its own mean and standard
    deviation.

    Args:
        rawData: DataFrame holding a ``sex`` column plus the feature columns.
            An ``income`` column, when present, is dropped.

    Returns:
        A float DataFrame with ``sex`` first, then the numeric columns, then
        the one-hot columns, each standardised column-wise.
    """
    # "sex" has only two values, so it is handled separately as one 0/1 column.
    dropped = ["sex", "income"] if "income" in rawData.columns else ["sex"]
    Data = rawData.drop(dropped, axis=1)

    # Split by "is numeric" rather than ``dtype == "object"`` so that string
    # columns stored with a non-object dtype are still one-hot encoded.
    listObjectColumn = [
        col for col in Data.columns
        if not pd.api.types.is_numeric_dtype(Data[col])
    ]
    listNonObjectColumn = [
        col for col in Data.columns if col not in listObjectColumn
    ]

    ObjectData = Data[listObjectColumn]
    # Work on an explicit copy because ``insert`` mutates in place.
    NonObjectData = Data[listNonObjectColumn].copy()

    # male = 0, female = 1. ``np.int`` was removed from NumPy, so the dtype is
    # given by name instead.
    NonObjectData.insert(
        0, "sex", (rawData["sex"] == " Female").astype("int64")
    )

    # Turn every category of every non-numeric column into its own column.
    ObjectData = pd.get_dummies(ObjectData)

    Data_x = pd.concat([NonObjectData, ObjectData], axis=1).astype("int64")

    # Standardise. A constant column has zero std; dividing by 1 instead maps
    # it to all zeros rather than NaN.
    column_std = Data_x.std()
    column_std[column_std == 0] = 1.0
    Data_x = (Data_x - Data_x.mean()) / column_std

    return Data_x
```

![在这里插入图片描述](https://img-blog.csdnimg.cn/20191122212530942.png)

```python
def dataProcess_Y(rawData):
    """Encode the ``income`` column as a one-column 0/1 DataFrame.

    Rows whose income is exactly the string ``' >50K'`` (leading space
    included) become 1; all other rows become 0.

    Args:
        rawData: DataFrame that contains an ``income`` column.

    Returns:
        DataFrame with a single int64 column named ``income``.
    """
    is_high_income = rawData["income"] == " >50K"
    labels = is_high_income.astype("int64")
    return labels.to_frame(name="income")

概率生成模型

后验概率公式:
$$P(C_1 \mid x) = \frac{P(x \mid C_1)\,P(C_1)}{P(x \mid C_1)\,P(C_1) + P(x \mid C_2)\,P(C_2)}$$
P(C1|x)即为对于x属于C1类的概率
在这里插入图片描述
高斯分布模型,求得结果即为P(x|C1)
在这里插入图片描述
如果P(C1|x) > 0.5我们就认为x属于C1类,否则反之

在这里插入图片描述
之后我们对后验概率公式进行推导,发现求得激活函数的结果就可以得出后验概率的结果。

最终通过复杂的函数推导,我们最终得出后验概率模型的公式
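$$P(C_1 \mid x) = \sigma(z), \qquad z = (\mu_1 - \mu_2)^T \Sigma^{-1} x - \frac{1}{2}\mu_1^T \Sigma^{-1} \mu_1 + \frac{1}{2}\mu_2^T \Sigma^{-1} \mu_2 + \ln\frac{N_1}{N_2}$$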
也就是说,要想求得z(也就是激活函数sigmoid的输入),我们只要求得N1(train_Y中第1类的样本个数)、N2(train_Y中第2类的样本个数)、两类训练数据各自的均值以及共享的协方差矩阵,就可以得出我们的公式了。
下面给出具体代码实现:

def train(X_train, Y_train):
    """Estimate the parameters of a two-class Gaussian generative model.

    Rows labelled 1 form class 1 (">50K" in the calling script); every other
    row forms class 2. Each class gets its own mean vector, and the two
    per-class covariance matrices are blended into one shared covariance,
    weighted by class frequency.

    Args:
        X_train: array-like of shape (n_samples, n_features). The feature
            count is read from the data rather than fixed at 106.
        Y_train: array-like with one label per sample, either flat or of
            shape (n_samples, 1).

    Returns:
        Tuple ``(mu1, mu2, shared_sigma, N1, N2)``: the class means of shape
        (n_features,), the shared covariance of shape
        (n_features, n_features), and the two class sizes as ints.

    Raises:
        ValueError: if the label count does not match the sample count, or
            if either class has no samples (the means would be undefined).
    """
    features = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train).reshape(-1)
    train_data_size = features.shape[0]
    if labels.shape[0] != train_data_size:
        raise ValueError(
            "Y_train must contain exactly one label per row of X_train"
        )

    is_class1 = labels == 1
    class1 = features[is_class1]
    class2 = features[~is_class1]
    N1 = int(class1.shape[0])
    N2 = int(class2.shape[0])
    if N1 == 0 or N2 == 0:
        raise ValueError("both classes need at least one training sample")

    # Gaussian distribution parameters: per-class means.
    mu1 = class1.mean(axis=0)
    mu2 = class2.mean(axis=0)

    # Per-class covariance (divided by N, not N - 1), computed as one matrix
    # product instead of a sum of per-row outer products.
    centered1 = class1 - mu1
    centered2 = class2 - mu2
    sigma1 = np.dot(centered1.T, centered1) / N1
    sigma2 = np.dot(centered2.T, centered2) / N2

    shared_sigma = (
        (float(N1) / train_data_size) * sigma1
        + (float(N2) / train_data_size) * sigma2
    )

    return mu1, mu2, shared_sigma, N1, N2

训练过程中,有时因为数据过少会划分出部分的训练数据作为验证集,这里我们就使用了这个方法,来优化我们的模型。
实现代码如下:

def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) together and split off a validation part.

    The first ``floor(n_rows * percentage)`` shuffled rows become the
    validation set; the remaining rows become the training set.

    Args:
        X: feature array, indexed by row.
        Y: label array with the same number of rows as X.
        percentage: fraction of rows to hold out for validation.

    Returns:
        Tuple ``(X_train, Y_train, X_valid, Y_valid)``.
    """
    # Number of rows that go to the validation set.
    n_valid = int(floor(X.shape[0] * percentage))
    shuffled_X, shuffled_Y = _shuffle(X, Y)

    valid_part = (shuffled_X[:n_valid], shuffled_Y[:n_valid])
    train_part = (shuffled_X[n_valid:], shuffled_Y[n_valid:])

    return train_part[0], train_part[1], valid_part[0], valid_part[1]

最后通过我们的模型,进行预测将结果写入predict.csv文件即可。
完整的实现代码如下

import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import os
import argparse
import pdb

def dataProcess_X(rawData):
    """Build the normalised feature table from a raw census DataFrame.

    The ``sex`` column is encoded as a single 0/1 column (1 when the value is
    exactly ``" Female"``), every other non-numeric column is one-hot encoded,
    and each resulting column is standardised with its own mean and standard
    deviation.

    Args:
        rawData: DataFrame holding a ``sex`` column plus the feature columns.
            An ``income`` column, when present, is dropped.

    Returns:
        A float DataFrame with ``sex`` first, then the numeric columns, then
        the one-hot columns, each standardised column-wise.
    """
    # "sex" has only two values, so it is handled separately as one 0/1 column.
    dropped = ["sex", "income"] if "income" in rawData.columns else ["sex"]
    Data = rawData.drop(dropped, axis=1)

    # Split by "is numeric" rather than ``dtype == "object"`` so that string
    # columns stored with a non-object dtype are still one-hot encoded.
    listObjectColumn = [
        col for col in Data.columns
        if not pd.api.types.is_numeric_dtype(Data[col])
    ]
    listNonObjectColumn = [
        col for col in Data.columns if col not in listObjectColumn
    ]

    ObjectData = Data[listObjectColumn]
    # Work on an explicit copy because ``insert`` mutates in place.
    NonObjectData = Data[listNonObjectColumn].copy()

    # male = 0, female = 1. ``np.int`` was removed from NumPy, so the dtype is
    # given by name instead.
    NonObjectData.insert(
        0, "sex", (rawData["sex"] == " Female").astype("int64")
    )

    # Turn every category of every non-numeric column into its own column.
    ObjectData = pd.get_dummies(ObjectData)

    Data_x = pd.concat([NonObjectData, ObjectData], axis=1).astype("int64")

    # Standardise. A constant column has zero std; dividing by 1 instead maps
    # it to all zeros rather than NaN.
    column_std = Data_x.std()
    column_std[column_std == 0] = 1.0
    Data_x = (Data_x - Data_x.mean()) / column_std

    return Data_x

def dataProcess_Y(rawData):
    """Encode the ``income`` column as a one-column 0/1 DataFrame.

    Rows whose income is exactly the string ``' >50K'`` (leading space
    included) become 1; all other rows become 0.

    Args:
        rawData: DataFrame that contains an ``income`` column.

    Returns:
        DataFrame with a single int64 column named ``income``.
    """
    is_high_income = rawData["income"] == " >50K"
    labels = is_high_income.astype("int64")
    return labels.to_frame(name="income")

def sigmoid(z):
    """Return the logistic function of ``z``, clipped away from 0 and 1.

    The result is limited to the closed interval [1e-8, 1 - 1e-8].

    Args:
        z: scalar or NumPy array.

    Returns:
        The clipped sigmoid value(s), same shape as ``z``.
    """
    lower_bound = 1e-8
    upper_bound = 1 - (1e-8)
    activation = 1 / (1.0 + np.exp(-z))
    return np.clip(activation, lower_bound, upper_bound)

def _shuffle(X, Y):                                 #X and Y are np.array
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) together and split off a validation part.

    The first ``floor(n_rows * percentage)`` shuffled rows become the
    validation set; the remaining rows become the training set.

    Args:
        X: feature array, indexed by row.
        Y: label array with the same number of rows as X.
        percentage: fraction of rows to hold out for validation.

    Returns:
        Tuple ``(X_train, Y_train, X_valid, Y_valid)``.
    """
    # Number of rows that go to the validation set.
    n_valid = int(floor(X.shape[0] * percentage))
    shuffled_X, shuffled_Y = _shuffle(X, Y)

    valid_part = (shuffled_X[:n_valid], shuffled_Y[:n_valid])
    train_part = (shuffled_X[n_valid:], shuffled_Y[n_valid:])

    return train_part[0], train_part[1], valid_part[0], valid_part[1]

def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Print the classifier's accuracy on a labelled set.

    Builds the linear decision function from the Gaussian parameters
    (weights ``(mu1 - mu2) @ inv(shared_sigma)`` plus a bias that includes
    ``log(N1 / N2)``), passes it through ``sigmoid``, rounds to 0/1 and
    compares with ``Y``. The accuracy is printed; nothing is returned.

    Args:
        X: feature array with one sample per row.
        Y: labels for X; squeezed before comparison.
        mu1, mu2: class mean vectors.
        shared_sigma: shared covariance matrix (must be invertible).
        N1, N2: class sample counts.
    """
    precision = inv(shared_sigma)
    weights = np.dot(mu1 - mu2, precision)

    quad_term1 = np.dot(np.dot(mu1.T, precision), mu1)
    quad_term2 = np.dot(np.dot(mu2.T, precision), mu2)
    bias = (-0.5) * quad_term1 + (0.5) * quad_term2 + np.log(float(N1) / N2)

    # Samples become columns so that weights . X^T gives one score per sample.
    scores = np.dot(weights, X.T) + bias
    predicted = np.around(sigmoid(scores))

    hits = np.squeeze(Y) == predicted
    print('Valid acc = %f' % (float(hits.sum()) / hits.shape[0]))

def train(X_train, Y_train):
    """Estimate the parameters of a two-class Gaussian generative model.

    Rows labelled 1 form class 1 (">50K" in the calling script); every other
    row forms class 2. Each class gets its own mean vector, and the two
    per-class covariance matrices are blended into one shared covariance,
    weighted by class frequency.

    Args:
        X_train: array-like of shape (n_samples, n_features). The feature
            count is read from the data rather than fixed at 106.
        Y_train: array-like with one label per sample, either flat or of
            shape (n_samples, 1).

    Returns:
        Tuple ``(mu1, mu2, shared_sigma, N1, N2)``: the class means of shape
        (n_features,), the shared covariance of shape
        (n_features, n_features), and the two class sizes as ints.

    Raises:
        ValueError: if the label count does not match the sample count, or
            if either class has no samples (the means would be undefined).
    """
    features = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train).reshape(-1)
    train_data_size = features.shape[0]
    if labels.shape[0] != train_data_size:
        raise ValueError(
            "Y_train must contain exactly one label per row of X_train"
        )

    is_class1 = labels == 1
    class1 = features[is_class1]
    class2 = features[~is_class1]
    N1 = int(class1.shape[0])
    N2 = int(class2.shape[0])
    if N1 == 0 or N2 == 0:
        raise ValueError("both classes need at least one training sample")

    # Gaussian distribution parameters: per-class means.
    mu1 = class1.mean(axis=0)
    mu2 = class2.mean(axis=0)

    # Per-class covariance (divided by N, not N - 1), computed as one matrix
    # product instead of a sum of per-row outer products.
    centered1 = class1 - mu1
    centered2 = class2 - mu2
    sigma1 = np.dot(centered1.T, centered1) / N1
    sigma2 = np.dot(centered2.T, centered2) / N2

    shared_sigma = (
        (float(N1) / train_data_size) * sigma1
        + (float(N2) / train_data_size) * sigma2
    )

    return mu1, mu2, shared_sigma, N1, N2

if __name__ == "__main__":
    # Script entry point: load the data, report validation accuracy on a
    # 10% hold-out, retrain on the full training set, report test accuracy
    # and write the predictions to predict.csv.
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # One-hot encoding can produce columns in the training set that the test
    # set lacks (the original code dropped
    # 'native_country_ Holand-Netherlands' by name). Keeping only the columns
    # present in both sets gives the same result without the hard-coded name.
    # NOTE(review): assumes both sets list their shared columns in the same
    # order - confirm against the data.
    train_features = dataProcess_X(trainData)
    test_features = dataProcess_X(testData)
    shared_columns = [
        col for col in train_features.columns
        if col in test_features.columns
    ]
    x_train = train_features[shared_columns].values
    x_test = test_features[shared_columns].values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Hold out part of the training data to estimate accuracy.
    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(
        x_train, y_train, valid_set_percentage)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Retrain on all training rows before predicting the test set.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    sigma_inv = inv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    X_t = x_test.T
    b = ((-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1)
         + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2)
         + np.log(float(N1) / N2))
    a = np.dot(w, X_t) + b
    y = sigmoid(a)
    # ``np.int`` was removed from NumPy; name the dtype explicitly.
    y_ = np.around(y).astype("int64")

    result = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(result.sum()) / result.shape[0]))

    # Ids run 1..n, taken from the number of predictions instead of a
    # hard-coded row count.
    df = pd.DataFrame({"id": np.arange(1, y_.shape[0] + 1), "label": y_})

    # ``output_dir`` was never defined in the original script; the current
    # directory keeps predict.csv where it was written before.
    output_dir = "."
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    df.to_csv(os.path.join(output_dir, 'predict.csv'), sep='\t', index=False)
  • 7
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值