大家好,我是中山大学研究生,医学生+计科学生的集合体,机器学习爱好者。继续总结万年易错点。坚持总结和发布,总有收获。
#!/usr/bin/env python
# -*- coding:utf8 -*-
# @TIME :2018/12/10 20:42
# @Author:Yolanda
# @File :ceshi.py
#反击啊!少女
import pandas as pd
import numpy as np
# Toy DataFrame used throughout the examples below.
# np.nan (a true missing value) replaces the original string 'NaN': pandas
# treats the string 'NaN' as ordinary text, which would defeat the
# count()/size()/nunique() null-handling demonstrations in this article.
# NOTE(review): the expected outputs quoted in the comments below were taken
# from the blog's original runs and do not all match this exact literal —
# verify against a fresh run before relying on them.
operation = pd.DataFrame({'A': [1, 1, 1, 4, 5], 'B': [11, np.nan, 33, 44, 55],
                          'C': [111, 222, 333, 444, 555],
                          'D': [6, 7, 8, 9, 10], 'E': [66, 77, 88, 99, 1010],
                          'F': [666, 777, 888, 999, 101010],
                          'hour': [10, 11, 12, 13, 14], 'day': [2, 3, 4, 5, 6]})
# 1. Statistics directly on the whole table (each call yields a scalar or a Series).
print(operation.size)  # total number of cells (rows * columns)
# 40
print(operation.nunique())  # number of distinct (non-null) values in each column
# A 4
# B 4
# C 5
# D 5
# E 5
# F 5
# hour 5
# day 5
# dtype: int64
print(operation.count())  # number of non-null values in each column
# A 5
# B 5
# C 5
# D 5
# E 5
# F 5
# hour 5
# day 5
# dtype: int64
# 2. groupby one column, then statistics over every column of the table.
# With a single grouping key 'A', eg1 is equivalent to eg2.
# NOTE(review): expected outputs below come from the blog's original run and
# do not all match the DataFrame literal defined above — verify before use.
# eg1:
print(operation['A'].value_counts())
# 1 2
# 5 1
# 4 1
# 3 1
# Name: A, dtype: int64
# eg2:
print(operation.groupby('A').size())
# A
# 1 2
# 3 1
# 4 1
# 5 1
# dtype: int64
print(operation.groupby('A').nunique())  # group by A, then nunique over the table: distinct values per column within each group
# Within the A=1 group, B has only 1 distinct value while C has 2 distinct values.
# A B C D E F hour day
# A
# 1 1 1 2 2 2 2 2 2
# 3 1 1 1 1 1 1 1 1
# 4 1 1 1 1 1 1 1 1
# 5 1 1 1 1 1 1 1 1
print(operation.groupby('A').count())
# B C D E F hour day
# A
# 1 2 2 2 2 2 2 2
# 3 1 1 1 1 1 1 1
# 4 1 1 1 1 1 1 1
# 5 1 1 1 1 1 1 1
# 3. groupby one column, then statistics on a single column.
print(operation.groupby('A')['B'].nunique())  # equivalent to operation.groupby('A').nunique()['B']: group by A, then take column B
# A
# 1 1
# 3 1
# 4 1
# 5 1
# Name: B, dtype: int64
print(operation.groupby('A')['B'].count())
# A
# 1 2
# 3 1
# 4 1
# 5 1
# Name: B, dtype: int64
# 4. groupby several columns, then statistics on several columns.
# Select several columns from a GroupBy with a LIST, not a tuple:
# tuple indexing (e.g. [('C','D')]) was deprecated and later removed in pandas.
temp = operation.groupby(['A', 'B'])[['C', 'D']].nunique().reset_index()  # group by A then B; report C and D
print(temp)
# A B C D
# 0 1 11 2 2
# 1 1 33 1 1
# 2 4 44 1 1
# 3 5 55 1 1
temp = operation.groupby(['A', 'B', 'C'])[['D', 'E']].nunique().reset_index()  # group by A, B, C; report D and E
print(temp)
# A B C D E
# 0 1 11 111 1 1
# 1 1 11 222 1 1
# 2 1 33 333 1 1
# 3 4 44 444 1 1
# 4 5 55 555 1 1
# 5. Difference between count and size: they are equivalent when there are no
#    nulls; count tallies NON-NULL values, size tallies ALL values (null + non-null).
# 5.1 groupby one column, then statistics on a single column.
path = 'E:/yolanda/algorithm/remote_ceshi/ceshi35/2018tiancheng/chusai/data'
operation_train = pd.read_csv(path+'/operation_train_new.csv', sep=',')
temp = operation_train.groupby('UID').apply(lambda x: x.shape[0])  # rows per UID
temp1 = operation_train.groupby(['UID'])['ip1'].count()  # per UID: non-null values in ip1
temp2 = operation_train.groupby(['UID'])['ip1'].size()  # per UID: null + non-null values in ip1 (= rows per UID)
print(temp.head())
# 10000 9
# 10001 67
# 10002 11
# 10003 15
# 10004 34
# dtype: int64
print(temp1.head())
# 10000 9
# 10001 50
# 10002 10
# 10003 15
# 10004 34
# Name: ip1, dtype: int64
print(temp2.head())
# 10000 9
# 10001 67
# 10002 11
# 10003 15
# 10004 34
# Name: ip1, dtype: int64
a = temp.head() - temp1.head()  # rows minus non-null count = nulls per UID
print(a)
# 10000 0
# 10001 17
# 10002 1
# 10003 0
# 10004 0
# dtype: int64
b = temp.head() - temp2.head()  # rows minus size = all zeros (size counts every row)
print(b)
# 10000 0
# 10001 0
# 10002 0
# 10003 0
# 10004 0
# dtype: int64
# 5.2 groupby several columns, then statistics on several columns.
temp0 = operation_train.groupby(['UID', 'ip2']).apply(lambda x: x.shape[0])
# Group by UID, then by ip2: number of rows in each (UID, ip2) group.
temp01 = operation_train.groupby(['UID', 'ip2'])['ip1'].count()
# Per (UID, ip2) group: number of NON-NULL values in ip1.
temp02 = operation_train.groupby(['UID', 'ip2'])['ip1'].size()
# Per (UID, ip2) group: number of values in ip1, null + non-null (= rows per group).
print(temp0.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
print(temp01.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 0
# 10012 6d00509237e58763 0
# 10017 671c59ee94964fbb 0
# 10028 ca8363953168280e 0
# Name: ip1, dtype: int64
print(temp02.head())
# UID ip2
# 10001 e65a4b0133385267 1
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# Name: ip1, dtype: int64
c = temp0.head() - temp01.head()  # nulls of ip1 per (UID, ip2) group
print(c)
# UID ip2
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
c1 = c.reset_index().rename(columns={0: 'ceshi'})  # unnamed series becomes column 0 after reset_index
print(c1)
# UID ip2 ceshi
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 9
# 10012 6d00509237e58763 2
# 10017 671c59ee94964fbb 3
# 10028 ca8363953168280e 8
# dtype: int64
d = temp0.head() - temp02.head()  # rows minus size = all zeros
print(d)
# UID ip2
# 10001 e65a4b0133385267 0
# 10008 b5320af2fb8a7bd5 0
# 10012 6d00509237e58763 0
# 10017 671c59ee94964fbb 0
# 10028 ca8363953168280e 0
# dtype: int64
# 5.3 Concrete application: groupby on several columns combined with size/count.
isnull_cnt_list = []  # module-level record of every generated feature-name prefix


def isnull_cnt(data, features, value):
    """Merge four per-group counting features of column `value` onto `data`.

    Grouping `data` by the columns in `features`, four new columns are built
    and left-merged back row by row (prefix = 'null_cnt_' + joined features):
      *_0_<value>: rows per group minus non-null count  -> nulls per group
      *_1_<value>: size() minus count()                 -> same nulls per group
      *_2_<value>: rows per group (equivalent to size())
      *_3_<value>: non-null values of `value` per group (count())

    Returns the merged DataFrame. Side effect: appends the feature prefix to
    the module-level `isnull_cnt_list`.
    """
    new_feature = 'null_cnt_' + '_'.join(features)
    grouped = data.groupby(features)  # hoisted: reused by every statistic below
    col = grouped[value]
    rows_per_group = grouped.apply(lambda x: x.shape[0])  # unnamed Series
    # Nulls per group computed two equivalent ways; the subtraction of an
    # unnamed Series yields column 0 after reset_index, hence the {0: ...} rename.
    temp = (rows_per_group - col.count()).reset_index().rename(
        columns={0: new_feature + "_0_" + value})
    # size() - count() keeps the column's name, so rename by `value` — this
    # cleanly shows the size/count difference.
    temp1 = (col.size() - col.count()).reset_index().rename(
        columns={value: new_feature + "_1_" + value})
    # Equivalent to both grouped[value].size() and grouped.apply(lambda x: x.shape[0]).
    temp2 = col.apply(lambda x: x.shape[0]).reset_index().rename(
        columns={value: new_feature + "_2_" + value})
    temp3 = col.count().reset_index().rename(
        columns={value: new_feature + "_3_" + value})
    for t in (temp, temp1, temp2, temp3):
        data = data.merge(t, 'left', on=features)
    isnull_cnt_list.append(new_feature)
    return data
# Build the UID-level null-count features for 'ip1', then inspect the head of
# each of the four generated columns in order (_0 through _3).
operation_train = isnull_cnt(operation_train, ['UID'], 'ip1')
for suffix in ('0', '1', '2', '3'):
    print(operation_train.head()[f'null_cnt_UID_{suffix}_ip1'])
# Expected output, one Series per iteration:
# null_cnt_UID_0_ip1:  2, 85, 1, 0, 67    (nulls per UID: rows - count)
# null_cnt_UID_1_ip1:  2, 85, 1, 0, 67    (nulls per UID: size - count)
# null_cnt_UID_2_ip1:  112, 241, 17, 49, 179   (rows per UID)
# null_cnt_UID_3_ip1:  110, 156, 16, 49, 112   (non-null ip1 per UID)