大家好,我是中山大学研究生,医学生+计科学生的集合体,机器学习爱好者。继续总结万年易错点。坚持总结和发布,总有收获。
#!/usr/bin/env python
# -*- coding:utf8 -*-
# @TIME :2018/12/10 20:42
# @Author:Yolanda
# @File :ceshi.py
#反击啊!少女
import pandas as pd
import numpy as np
# Toy DataFrame used throughout the examples below.
# np.nan (a true missing value) replaces the original string 'NaN': pandas
# treats the string 'NaN' as ordinary text, which would defeat the
# count()/size()/nunique() null-handling demonstrations in this article.
# NOTE(review): the expected outputs quoted in the comments below were taken
# from the blog's original runs and do not all match this exact literal —
# verify against a fresh run before relying on them.
operation = pd.DataFrame({'A': [1, 1, 1, 4, 5], 'B': [11, np.nan, 33, 44, 55],
                          'C': [111, 222, 333, 444, 555],
                          'D': [6, 7, 8, 9, 10], 'E': [66, 77, 88, 99, 1010],
                          'F': [666, 777, 888, 999, 101010],
                          'hour': [10, 11, 12, 13, 14], 'day': [2, 3, 4, 5, 6]})
# 1. Statistics directly on the whole table (each call yields a scalar or a Series).
print(operation.size)  # total number of cells (rows * columns)
# 40
print(operation.nunique())  # number of distinct (non-null) values in each column
# A 4
# B 4
# C 5
# D 5
# E 5
# F 5
# hour 5
# day 5
# dtype: int64
print(operation.count())  # number of non-null values in each column
# A 5
# B 5
# C 5
# D 5
# E 5
# F 5
# hour 5
# day 5
# dtype: int64
# 2. groupby one column, then statistics over every column of the table.
# With a single grouping key 'A', eg1 is equivalent to eg2.
# NOTE(review): expected outputs below come from the blog's original run and
# do not all match the DataFrame literal defined above — verify before use.
# eg1:
print(operation['A'].value_counts())
# 1 2
# 5 1
# 4 1
# 3 1
# Name: A, dtype: int64
# eg2:
print(operation.groupby('A').size())
# A
# 1 2
# 3 1
# 4 1
# 5 1
# dtype: int64
print(operation.groupby('A').nunique())  # group by A, then nunique over the table: distinct values per column within each group
# Within the A=1 group, B has only 1 distinct value while C has 2 distinct values.
# A B C D E F hour day
# A
# 1 1 1 2 2 2 2 2 2
# 3 1 1 1 1 1 1 1 1
# 4 1 1 1 1 1 1 1 1
# 5 1 1 1 1 1 1 1 1
print(operation.groupby('A').count())
# B C D E F hour day
# A
# 1 2 2 2 2 2 2 2
# 3 1 1 1 1 1 1 1
# 4 1 1 1 1 1 1 1
# 5 1 1 1 1 1 1 1
# 3. groupby one column, then statistics on a single column.
print(operation.groupby('A')['B'].nunique())  # equivalent to operation.groupby('A').nunique()['B']: group by A, then take column B
# A
# 1 1
# 3 1
# 4 1
# 5 1
# Name: B, dtype: int64
print(operation.groupby('A')['B'].count())
# A
# 1 2
# 3 1
# 4 1
# 5 1
# Name: B, dtype: int64
# 4. groupby several columns, then statistics on several columns.
# Select several columns from a GroupBy with a LIST, not a tuple:
# tuple indexing (e.g. [('C','D')]) was deprecated and later removed in pandas.
temp = operation.groupby(['A', 'B'])[['C', 'D']].nunique().reset_index()  # group by A then B; report C and D
print(temp)
# A B C D
# 0 1 11 2 2
# 1 1 33 1 1
# 2 4 44 1 1
# 3 5 55 1 1
temp = operation.groupby(['A', 'B', 'C'])[['D', 'E']].nunique().reset_index()  # group by A, B, C; report D and E
print(temp)
# A B C D E
# 0 1 11 111 1 1
# 1 1 11 222 1 1
# 2 1 33 333 1 1
# 3 4 44 444 1 1
# 4 5 55 555 1 1
# 5. Difference between count and size: they are equivalent when there are no
#    nulls; count tallies NON-NULL values, size tallies ALL values (null + non-null).
# 5.1 groupby one column, then statistics on a single column.
path = 'E:/yolanda/algorithm/remote_ceshi/ceshi35/2018tiancheng/chusai/data'
operation_train = pd.read_csv(path+'/operation_train_new.csv', sep=',')
temp = operation_train.groupby('UID').apply(lambda x: x.shape[0])  # rows per UID
temp1 = operation_train.groupby(['UID'])['ip1'].count()  # per UID: non-null values in ip1
temp2 = operation_train.groupby(['UID'])['ip1'].size()  # per UID: null + non-null values in ip1 (= rows per UID)
print(temp.head())
# 10000 9
# 10001 67
# 10002 11
# 10003 15
# 10004 34
# dtype: int64
print(temp1.head())
# 10000 9
# 10001 50
# 10002 10
# 10003 15
# 10004 34
# Name: ip1, dtype: int64
print(temp2.head())
# 10000 9
# 10001 67
# 10002 11
# 10003 15
# 10004 34
# Name: ip1, dtype: int64
a = temp.head() - temp1.head()  # rows minus non-null count = nulls per UID
print(a)
# 10000 0
# 10001 17
# 10002 1
# 10003 0
# 10004 0
# dtype: int64
b = temp.head() - temp2.head()  # rows minus size = all zeros (size counts every row)
print(b)
# 10000 0
# 10001 0
# 10002 0
# 10003 0
# 10004 0
# dtype: int64
# 5.2 groupby several columns, then statistics on several columns.
temp0 = operation_train.groupby(['UID', 'ip2']).apply(lambda x: x.shape[0])
# Group by UID, then by ip2: number of rows in each (UID, ip2) group.
temp01 = operation_train.groupby(['UID', 'ip2'])['ip1'].count()
# Per (UID, ip2) group: number of NON-NULL values in ip1.
temp02 = operation_train.groupby(['UID', 'ip2'])['ip1'].size()
# Per (UID, ip2) group: number of values in ip1, null + non-null (= rows per group).
print(temp0.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
print(temp01.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 0
# 10012 6d00509237e58763 0
# 10017 671c59ee94964fbb 0
# 10028 ca8363953168280e 0
# Name: ip1, dtype: int64
print(temp02.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# Name: ip1, dtype: int64
c = temp0.head() - temp01.head()  # nulls of ip1 per (UID, ip2) group
print(c)
# UID ip2
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
c1 = c.reset_index().rename(columns={0: 'ceshi'})  # unnamed series becomes column 0 after reset_index
print(c1)
# UID ip2 ceshi
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
d = temp0.head() - temp02.head()  # rows minus size = all zeros
print(d)
# UID ip2
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 0
# 10012 6d00509237e58763 0
# 10017 671c59ee94964fbb 0
# 10028 ca8363953168280e 0
# dtype: int64
# 5.3 Concrete application: groupby on several columns combined with size/count.
isnull_cnt_list = []  # module-level record of every generated feature-name prefix


def isnull_cnt(data, features, value):
    """Merge four per-group counting features of column `value` onto `data`.

    Grouping `data` by the columns in `features`, four new columns are built
    and left-merged back row by row (prefix = 'null_cnt_' + joined features):
      *_0_<value>: rows per group minus non-null count  -> nulls per group
      *_1_<value>: size() minus count()                 -> same nulls per group
      *_2_<value>: rows per group (equivalent to size())
      *_3_<value>: non-null values of `value` per group (count())

    Returns the merged DataFrame. Side effect: appends the feature prefix to
    the module-level `isnull_cnt_list`.
    """
    new_feature = 'null_cnt_' + '_'.join(features)
    grouped = data.groupby(features)  # hoisted: reused by every statistic below
    col = grouped[value]
    rows_per_group = grouped.apply(lambda x: x.shape[0])  # unnamed Series
    # Nulls per group computed two equivalent ways; the subtraction of an
    # unnamed Series yields column 0 after reset_index, hence the {0: ...} rename.
    temp = (rows_per_group - col.count()).reset_index().rename(
        columns={0: new_feature + "_0_" + value})
    # size() - count() keeps the column's name, so rename by `value` — this
    # cleanly shows the size/count difference.
    temp1 = (col.size() - col.count()).reset_index().rename(
        columns={value: new_feature + "_1_" + value})
    # Equivalent to both grouped[value].size() and grouped.apply(lambda x: x.shape[0]).
    temp2 = col.apply(lambda x: x.shape[0]).reset_index().rename(
        columns={value: new_feature + "_2_" + value})
    temp3 = col.count().reset_index().rename(
        columns={value: new_feature + "_3_" + value})
    for t in (temp, temp1, temp2, temp3):
        data = data.merge(t, 'left', on=features)
    isnull_cnt_list.append(new_feature)
    return data
# Build the UID-level null-count features for 'ip1', then inspect the head of
# each of the four generated columns in order (_0 through _3).
operation_train = isnull_cnt(operation_train, ['UID'], 'ip1')
for suffix in ('0', '1', '2', '3'):
    print(operation_train.head()[f'null_cnt_UID_{suffix}_ip1'])
# Expected output, one Series per iteration:
# null_cnt_UID_0_ip1:  2, 85, 1, 0, 67    (nulls per UID: rows - count)
# null_cnt_UID_1_ip1:  2, 85, 1, 0, 67    (nulls per UID: size - count)
# null_cnt_UID_2_ip1:  112, 241, 17, 49, 179   (rows per UID)
# null_cnt_UID_3_ip1:  110, 156, 16, 49, 112   (non-null ip1 per UID)