groupby&count&size&nunique

大家好,中山大学研究生,医学生+计科学生的集合体,机器学习爱好者。继续万年易错点。。坚持总结和发布,总有收获。

#!/usr/bin/env python
# -*- coding:utf8 -*-
# @TIME  :2018/12/10 20:42
# @Author:Yolanda
# @File  :ceshi.py

#反击啊!少女

import pandas as pd
import numpy as np

operation=pd.DataFrame({'A':[1,1,1,4,5],'B':[11,'NaN',33,44,55],'C':[111,222,333,444,555],
                        'D':[6,7,8,9,10],'E':[66,77,88,99,1010],'F':[666,777,888,999,101010],
                        'hour':[10,11,12,13,14],'day':[2,3,4,5,6]})
                        

1、直接对表统计,生成series

print(operation.size)#列表总个数
# 40
print(operation.nunique())#每一列的不重复的值的个数
# A       4
# B       4
# C       5
# D       5
# E       5
# F       5
# hour    5
# day     5
# dtype: int64
print(operation.count())#每一列值的个数
# A       5
# B       5
# C       5
# D       5
# E       5
# F       5
# hour    5
# day     5
# dtype: int64

2、groupby分组单列特征且统计表里全部特征

#特征只有A一个的时候,eg1等价于eg2
#eg1:
print(operation['A'].value_counts())
# 1    2
# 5    1
# 4    1
# 3    1
# Name: A, dtype: int64
#eg2:
print(operation.groupby('A').size())
# A
# 1    2
# 3    1
# 4    1
# 5    1
# dtype: int64
print(operation.groupby('A').nunique())#对A列分组,再对整个表做nunique。nunique是有多少个不同值。
# 分组后A=1时候,B对应只有1个不同的值,C对应2个不同值
#    A  B  C  D  E  F  hour  day
# A
# 1  1  1  2  2  2  2     2    2
# 3  1  1  1  1  1  1     1    1
# 4  1  1  1  1  1  1     1    1
# 5  1  1  1  1  1  1     1    1
print(operation.groupby('A').count())
#    B  C  D  E  F  hour  day
# A
# 1  2  2  2  2  2     2    2
# 3  1  1  1  1  1     1    1
# 4  1  1  1  1  1     1    1
# 5  1  1  1  1  1     1    1

3、groupby分组单列特征且统计单列特征

print(operation.groupby('A')['B'].nunique())#等价于print(operation.groupby('A').nunique()['B']),先根据A列分组,再提出B列
# A
# 1    1
# 3    1
# 4    1
# 5    1
# Name: B, dtype: int64
print(operation.groupby('A')['B'].count())
# A
# 1    2
# 3    1
# 4    1
# 5    1
# Name: B, dtype: int64

4、groupby分组多列特征且统计多列特征

temp=operation.groupby(['A','B'])[('C','D')].nunique().reset_index()#依次根据AB分组,分组后依次统计CD特征
print(temp)
#    A   B  C  D
# 0  1  11  2  2
# 1  1  33  1  1
# 2  4  44  1  1
# 3  5  55  1  1
temp=operation.groupby(['A','B','C'])[('D','E')].nunique().reset_index()#依次根据ABC分组,分组后依次统计DE特征
print(temp)
#    A   B    C  D  E
# 0  1  11  111  1  1
# 1  1  11  222  1  1
# 2  1  33  333  1  1
# 3  4  44  444  1  1
# 4  5  55  555  1  1

5、count和size区别:count和size在无空值的时候等价,count统计非空个数,size统计空值个数
5.1、groupby分组单列特征且统计单列特征

path = 'E:/yolanda/algorithm/remote_ceshi/ceshi35/2018tiancheng/chusai/data'
operation_train = pd.read_csv(path+'/operation_train_new.csv',sep=',')
temp=operation_train.groupby('UID').apply(lambda x:x.shape[0])#以UID分组统计每个UID的个数
temp1=operation_train.groupby(['UID'])['ip1'].count()#以UID分组统计ip1列非空值的个数
temp2=operation_train.groupby(['UID'])['ip1'].size()#以UID分组统计ip1列非空值+空值的个数(等于以UID分组统计每个UID的个数)
print(temp.head())
# 10000     9
# 10001    67
# 10002    11
# 10003    15
# 10004    34
# dtype: int64
print(temp1.head())
# 10000     9
# 10001    50
# 10002    10
# 10003    15
# 10004    34
# Name: ip1, dtype: int64
print(temp2.head())
# 10000     9
# 10001    67
# 10002    11
# 10003    15
# 10004    34
# Name: ip1, dtype: int64
a=temp.head()-temp1.head()
print(a)
# 10000     0
# 10001    17
# 10002     1
# 10003     0
# 10004     0
# dtype: int64
b=temp.head()-temp2.head()
print(b)
# 10000    0
# 10001    0
# 10002    0
# 10003    0
# 10004    0
# dtype: int64

5.2、groupby分组多列特征且统计多列特征

temp0=operation_train.groupby(['UID','ip2']).apply(lambda x:x.shape[0])
#以UID分组,再以ip2分组,统计每组UID下每组ip2的个数
temp01=operation_train.groupby(['UID','ip2'])['ip1'].count()
#以UID分组,再以ip2分组,统计UID下再ip2下,ip1列非空值的个数
temp02=operation_train.groupby(['UID','ip2'])['ip1'].size()
#以UID分组,再以ip2分组,统计UID下再ip2下,ip1列非空值+空值的个数(等于以UID分组统计每个UID的个数)
print(temp0.head())
# UID    ip2
# 10001  e65a4b0133385267    1
# 10008  b5320af2fb8a7bd5    9
# 10012  6d00509237e58763    2
# 10017  671c59ee94964fbb    3
# 10028  ca8363953168280e    8
# dtype: int64
print(temp01.head())
# UID    ip2
# 10001  e65a4b0133385267    1
# 10008  b5320af2fb8a7bd5    0
# 10012  6d00509237e58763    0
# 10017  671c59ee94964fbb    0
# 10028  ca8363953168280e    0
# Name: ip1, dtype: int64
print(temp02.head())
# UID    ip2
# 10001  e65a4b0133385267    1
# 10008  b5320af2fb8a7bd5    9
# 10012  6d00509237e58763    2
# 10017  671c59ee94964fbb    3
# 10028  ca8363953168280e    8
# Name: ip1, dtype: int64
c=temp0.head()-temp01.head()
print(c)
# UID    ip2
# 10001  e65a4b0133385267    0
# 10008  b5320af2fb8a7bd5    9
# 10012  6d00509237e58763    2
# 10017  671c59ee94964fbb    3
# 10028  ca8363953168280e    8
# dtype: int64
c1=c.reset_index().rename(columns={0: 'ceshi'})
print(c1)
# UID    ip2                 ceshi
# 10001  e65a4b0133385267    0
# 10008  b5320af2fb8a7bd5    9
# 10012  6d00509237e58763    2
# 10017  671c59ee94964fbb    3
# 10028  ca8363953168280e    8
# dtype: int64
d=temp0.head()-temp02.head()
print(d)
# UID    ip2
# # 10001  e65a4b0133385267    0
# # 10008  b5320af2fb8a7bd5    0
# # 10012  6d00509237e58763    0
# # 10017  671c59ee94964fbb    0
# # 10028  ca8363953168280e    0
# # dtype: int64

5.3、groupby多列、size、count具体应用

isnull_cnt_list=[]
def isnull_cnt(data,features,value):
    new_feature = 'null_cnt'
    for i in features:
        new_feature += '_' + i
    temp=(data.groupby(features).apply(lambda x:x.shape[0])-data.groupby(features)[value].count()). \
        reset_index().rename(columns={0: new_feature + "_0_" + value})#只能命名一个特征
    temp1=(data.groupby(features)[value].size()-data.groupby(features)[value].count()). \
        reset_index().rename(columns={value: new_feature + "_1_" + value})#可以同时命名多个特征。充分表现出size和count的区别
    temp2=data.groupby(features)[value].apply(lambda x:x.shape[0]). \
        reset_index().rename(columns={value: new_feature + "_2_" + value})
    #与data.groupby(features)[value].size()和data.groupby(features).apply(lambda x:x.shape[0])等价
    temp3=data.groupby(features)[value].count(). \
        reset_index().rename(columns={value: new_feature + "_3_" + value})
    data = data.merge(temp, 'left', on=features)
    data = data.merge(temp1, 'left', on=features)
    data = data.merge(temp2, 'left', on=features)
    data = data.merge(temp3, 'left', on=features)
    isnull_cnt_list.append(new_feature)
    return data
operation_train=isnull_cnt(operation_train,['UID'],'ip1')
print(operation_train.head()['null_cnt_UID_0_ip1'])
# 0     2
# 1    85
# 2     1
# 3     0
# 4    67
# Name: null_cnt_UID_0_ip1, dtype: int64
print(operation_train.head()['null_cnt_UID_1_ip1'])
# 0     2
# 1    85
# 2     1
# 3     0
# 4    67
# Name: null_cnt_UID_1_ip1, dtype: int64
print(operation_train.head()['null_cnt_UID_2_ip1'])
# 0    112
# 1    241
# 2     17
# 3     49
# 4    179
# Name: null_cnt_UID_2_ip1, dtype: int64
print(operation_train.head()['null_cnt_UID_3_ip1'])
# 0    110
# 1    156
# 2     16
# 3     49
# 4    112
# Name: null_cnt_UID_3_ip1, dtype: int64

  • 2
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值