A Logistic Regression Example, with Feature Preprocessing

Given the training set spam_train.csv, the task is to decide from each ID's attribute values whether the corresponding person is a Winner or a Loser (that is, whether income exceeds 50K). This is a classic binary-classification problem.

About the training set:

(1) A CSV file, 4000 rows × 59 columns.

(2) The 4000 rows of data correspond to 4000 individuals, with ID numbers running from 1 to 4001.

(3) Of the 59 columns, the first is the ID and the last is the classification result, i.e. the label (0 or 1); the 57 columns in between are the 57 attribute values of that individual.

(4) Dataset: https://pan.baidu.com/s/1mG7ndtlT4jWYHH9V-Rj_5g, extraction code: hwzf.

```python
import pandas as pd
import numpy as np


# Train the model: update the parameters by gradient descent (Adagrad)
def train(x_train, y_train, epoch):
    num = x_train.shape[0]
    dim = x_train.shape[1]
    bias = 0  # initialize the bias
    weights = np.ones(dim)  # initialize the weights
    learning_rate = 1  # initial learning rate
    reg_rate = 0.001  # regularization coefficient
    bg2_sum = 0  # accumulated squared gradients of the bias
    wg2_sum = np.zeros(dim)  # accumulated squared gradients of the weights

    for i in range(epoch):
        b_g = 0
        w_g = np.zeros(dim)
        # compute the gradient over all samples (differentiate the loss function)
        for j in range(num):
            y_pre = weights.dot(x_train[j, :]) + bias
            sig = 1 / (1 + np.exp(-y_pre))
            b_g += (-1) * (y_train[j] - sig)
            for k in range(dim):
                w_g[k] += (-1) * (y_train[j] - sig) * x_train[j, k] + 2 * reg_rate * weights[k]
        b_g /= num
        w_g /= num

        # Adagrad: accumulate the squared gradients
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2
        # update the weights and the bias
        bias -= learning_rate / bg2_sum ** 0.5 * b_g
        weights -= learning_rate / wg2_sum ** 0.5 * w_g

        # Print the training-set accuracy every 3 epochs.
        # The loss involves log(), which can blow up to infinity, so a computed
        # loss may print as nan; if you are curious, uncomment the loss lines
        # below and watch what gets printed.
        if i % 3 == 0:
            # loss = 0
            acc = 0
            result = np.zeros(num)
            for j in range(num):
                y_pre = weights.dot(x_train[j, :]) + bias
                sig = 1 / (1 + np.exp(-y_pre))
                if sig >= 0.5:
                    result[j] = 1
                else:
                    result[j] = 0

                if result[j] == y_train[j]:
                    acc += 1.0
                # loss += (-1) * (y_train[j] * np.log(sig) + (1 - y_train[j]) * np.log(1 - sig))
            # print('after {} epochs, the loss on train data is:'.format(i), loss / num)
            print('after {} epochs, the acc on train data is:'.format(i), acc / num)

    return weights, bias


# Evaluate the model on the validation set
def validate(x_val, y_val, weights, bias):
    num = x_val.shape[0]  # number of validation samples (500 here)
    # loss = 0
    acc = 0
    result = np.zeros(num)
    for j in range(num):
        y_pre = weights.dot(x_val[j, :]) + bias
        sig = 1 / (1 + np.exp(-y_pre))
        if sig >= 0.5:
            result[j] = 1
        else:
            result[j] = 0

        if result[j] == y_val[j]:
            acc += 1.0
        # loss += (-1) * (y_val[j] * np.log(sig) + (1 - y_val[j]) * np.log(1 - sig))
    return acc / num


def main():
    # read the relevant fields from the csv
    df = pd.read_csv('spam_train.csv')
    # fill missing values with 0
    df = df.fillna(0)
    # (4000, 59)
    array = np.array(df)
    # (4000, 57): drop the ID column and the label column
    x = array[:, 1:-1]
    # scale: normalize the last two (much larger valued) columns by their means
    x[:, -1] /= np.mean(x[:, -1])
    x[:, -2] /= np.mean(x[:, -2])
    # (4000, )
    y = array[:, -1]

    # split into training and validation sets
    x_train, x_val = x[0:3500, :], x[3500:4000, :]
    y_train, y_val = y[0:3500], y[3500:4000]

    epoch = 30  # number of training epochs
    # train
    w, b = train(x_train, y_train, epoch)
    # evaluate on the validation set
    acc = validate(x_val, y_val, w, b)
    print('The acc on val data is:', acc)


if __name__ == '__main__':
    main()
```
```
after 0 epochs, the acc on train data is: 0.6134285714285714
after 3 epochs, the acc on train data is: 0.8994285714285715
after 6 epochs, the acc on train data is: 0.914
after 9 epochs, the acc on train data is: 0.9168571428571428
after 12 epochs, the acc on train data is: 0.9225714285714286
after 15 epochs, the acc on train data is: 0.9242857142857143
after 18 epochs, the acc on train data is: 0.9251428571428572
after 21 epochs, the acc on train data is: 0.9242857142857143
after 24 epochs, the acc on train data is: 0.9248571428571428
after 27 epochs, the acc on train data is: 0.9248571428571428
The acc on val data is: 0.94
```
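The nested Python loops above are easy to read but slow. As a minimal sketch (my own rewrite, not part of the original post), the same Adagrad update can be vectorized with NumPy; an `eps` term is added to guard the division, which the loop version omits:

```python
import numpy as np

def train_vectorized(x_train, y_train, epoch, learning_rate=1.0, reg_rate=0.001):
    num, dim = x_train.shape
    bias = 0.0
    weights = np.ones(dim)
    bg2_sum = 0.0
    wg2_sum = np.zeros(dim)
    eps = 1e-8  # keeps the Adagrad division away from zero

    for i in range(epoch):
        sig = 1 / (1 + np.exp(-(x_train @ weights + bias)))  # predictions, shape (num,)
        err = sig - y_train                                  # sig - y == -(y - sig)
        b_g = err.mean()
        w_g = (x_train.T @ err) / num + 2 * reg_rate * weights  # data term + L2 term
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2
        bias -= learning_rate / (bg2_sum ** 0.5 + eps) * b_g
        weights -= learning_rate / (wg2_sum ** 0.5 + eps) * w_g

        # numerically stable cross-entropy, if you want to watch the loss:
        # p = np.clip(sig, 1e-12, 1 - 1e-12)
        # loss = -(y_train * np.log(p) + (1 - y_train) * np.log(1 - p)).mean()
    return weights, bias
```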
```python
df = pd.read_csv('spam_train.csv')
df.head()
```

(df.head() output elided: 5 rows × 59 columns)

```python
df.shape    # (4000, 59)
df.columns
```

```
Index(['1', '0', '0.1', '1.13', '0.2', '0.37', '0.3', '0.4', '0.5', '0.6',
       '0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.37.1', '0.13', '0.37.2',
       '1.13.1', '0.14', '0.37.3', '0.15', '0.16', '0.17', '0.18', '0.19',
       '0.20', '0.21', '0.22', '0.23', '0.24', '0.25', '0.26', '0.27', '0.28',
       '0.29', '0.30', '0.31', '0.32', '0.33', '0.34', '0.35', '0.36',
       '0.37.4', '0.38', '0.39', '0.40', '0.41', '0.42', '0.145', '0.43',
       '0.436', '0.44', '0.45', '1.792', '55', '147', '0.46'],
      dtype='object')
```
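These column names look like data values ('1', '0.37', '1.792', …), which suggests the CSV has no header row and pandas has swallowed the first record as one. If that is the case (an assumption worth checking against the raw file), reading with `header=None` recovers that row:

```python
# assumes spam_train.csv really has no header line
df = pd.read_csv('spam_train.csv', header=None)
df.shape  # one row more than before, with integer column labels 0..58
```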
```python
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer()                                # mean imputation (the default)
imp_median = SimpleImputer(strategy='median')             # median imputation
imp_0 = SimpleImputer(strategy='constant', fill_value=0)  # constant-0 imputation
Age = data.loc[:, 'Age'].values.reshape(-1, 1)            # imputers expect 2-D input
imp_mean = imp_mean.fit_transform(Age)
data.loc[:, 'Age'] = imp_mean

# Imputation is even simpler with pandas and NumPy
import pandas as pd
import numpy as np
data.loc[:, 'Age'] = data.loc[:, 'Age'].fillna(data.loc[:, 'Age'].median())

# Encode the label column: LabelEncoder is meant for labels
from sklearn.preprocessing import LabelEncoder
y = data.iloc[:, -1]
le = LabelEncoder()
le = le.fit(y)
label = le.transform(y)
# or: le.fit_transform(y)
data.iloc[:, -1] = label
# equivalently, in one line:
data.iloc[:, -1] = LabelEncoder().fit_transform(data.iloc[:, -1])

# Binarize continuous data: 1 above the threshold, 0 otherwise
from sklearn.preprocessing import Binarizer
x = data.iloc[:, 0].values.reshape(-1, 1)  # reshape the array into a 2-D column
transformer = Binarizer(threshold=1).fit_transform(x)
data.iloc[:, 0] = transformer
```
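The snippets above assume a DataFrame named `data` with an 'Age' column and a categorical label, in the style of the Titanic data. A self-contained toy run (all values made up) might look like this:

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, Binarizer

# a stand-in for `data`; the values are invented for illustration
data = pd.DataFrame({'Age': [22.0, np.nan, 35.0, 58.0],
                     'Fare': [7.25, 71.3, 8.05, 151.55],
                     'Survived': ['no', 'yes', 'yes', 'no']})

# mean-impute Age (imputers want 2-D input, hence the reshape)
age = data.loc[:, 'Age'].values.reshape(-1, 1)
data.loc[:, 'Age'] = SimpleImputer().fit_transform(age).ravel()

# turn the string labels into 0/1
data.loc[:, 'Survived'] = LabelEncoder().fit_transform(data.loc[:, 'Survived'])

# binarize Fare: 1 above 30, 0 otherwise
fare = data.loc[:, 'Fare'].values.reshape(-1, 1)
data.loc[:, 'Fare'] = Binarizer(threshold=30).fit_transform(fare).ravel()
print(data)
```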
```python
import pandas as pd
df = pd.read_csv(r'digit_set/train.csv')
df.head()
```

(df.head() output elided: 5 rows × 785 columns; the columns are label, pixel0 … pixel783)

Feature selection: variance filtering

```python
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

from sklearn.feature_selection import VarianceThreshold
select = VarianceThreshold()        # instantiate; the default threshold is a variance of 0
X_var0 = select.fit_transform(df)   # feature matrix with zero-variance features removed
# (note this runs on the full df, label column included, hence 709 surviving columns)
# equivalently: X_var0 = VarianceThreshold().fit_transform(df)
X_var0.shape
```

```
(42000, 709)
```

```python
# For a Bernoulli feature, drop it when one class covers more than 80% of the
# samples; the variance of a Bernoulli variable is p*(1-p), so the cut-off is
# .8 * (1 - .8) = 0.16
X_bvar = VarianceThreshold(.8 * (1 - .8)).fit_transform(X)
```

```python
%%timeit
# %%timeit reports the running time of the whole cell
# Chi-square filtering (chi2 requires non-negative feature values)
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# k is the number of features to keep; x_fsvar is the variance-filtered matrix
x_fschi = SelectKBest(chi2, k=300).fit_transform(x_fsvar, y)
x_fschi.shape
cross_val_score(RFC(n_estimators=10, random_state=0), x_fschi, y, cv=5).mean()
```


 
```python
# If k is not known in advance, probe it with a learning curve
import matplotlib.pyplot as plt
score = []
for i in range(390, 200, -10):
    x_fschi = SelectKBest(chi2, k=i).fit_transform(x_fsvar, y)
    once = cross_val_score(RFC(n_estimators=10, random_state=0), x_fschi, y, cv=5).mean()
    score.append(once)
plt.plot(range(390, 200, -10), score)
plt.show()
```
```python
# Embedded selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC

RFC_ = RFC(n_estimators=10, random_state=0)
# threshold cuts features whose importance falls below it
X_embedded = SelectFromModel(RFC_, threshold=0.005).fit_transform(X, y)
```
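The 0.005 above is a guess. Since SelectFromModel cuts on the fitted model's `feature_importances_`, one way to pick the threshold (a sketch of my own, reusing the `X`, `y` and `RFC_` defined above; slow on the full digit data) is to sweep candidates across the observed importance range:

```python
import numpy as np
from sklearn.model_selection import cross_val_score

importances = RFC_.fit(X, y).feature_importances_
thresholds = np.linspace(0, importances.max(), 20)
scores = []
for t in thresholds:
    X_embedded = SelectFromModel(RFC_, threshold=t).fit_transform(X, y)
    scores.append(cross_val_score(RFC_, X_embedded, y, cv=5).mean())
print('best threshold:', thresholds[int(np.argmax(scores))])
```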

Filling missing values with a random forest


```python
# Treat the column to be filled as the prediction target: its non-missing part
# supplies the training labels, and the missing part is what we predict.
df = X.copy()
fill = df.loc[:, to_fill]              # to_fill is the name of the column to impute
# use the remaining columns together with the label column as features
df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
Ytrain = fill[fill.notnull()]
Ytest = fill[fill.isnull()]            # indexes the rows whose value is missing
Xtrain = df.iloc[Ytrain.index, :]      # rows without missing values form the training set
Xtest = df.iloc[Ytest.index, :]        # rows with missing values form the test set
from sklearn.ensemble import RandomForestRegressor as rfr
rfr = rfr(n_estimators=100)
rfr = rfr.fit(Xtrain, Ytrain)
Ypredict = rfr.predict(Xtest)          # predict the missing values
X.loc[Ytest.index, to_fill] = Ypredict # write them back into the original column
```

```python
# inspect extra percentiles to get a feel for outliers
data.describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.99])
```
```python
# Class imbalance: oversample the minority class with SMOTE
import imblearn
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)   # returns the oversampled X, y (fit_sample in older imblearn)
n_sample_ = X.shape[0]
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
```
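As a quick self-contained sanity check (toy data from `make_classification`; the numbers are only illustrative):

```python
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
print(pd.Series(y).value_counts())       # roughly 900 vs. 100 before resampling
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)
print(pd.Series(y_res).value_counts())   # both classes now at the majority count
```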

