pandas groupby 练习笔记(运行这些代码前需要:import pandas as pd、import numpy as np)
# Demo frame: two categorical key columns (A, B) and two random float columns (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
# Group on a single key; count() reports non-null entries per group.
grouped = df.groupby('A')
print(grouped.count())
'''
B C D
A
bar 3 3 3
foo 5 5 5
'''
# Group on two keys -> the result carries an (A, B) MultiIndex.
grouped = df.groupby(['A', 'B'])
print(grouped.count())
'''
C D
A B
bar one 1 1
three 1 1
two 1 1
foo one 2 2
three 1 1
two 2 2
'''
根据自定义设置,例如构造函数等:
def get_letter_type(letter):
    """Classify a column label: 'a' if it is a vowel, 'b' otherwise.

    BUG FIX: the body's indentation was lost in the original paste,
    which made the definition a SyntaxError; restored here.
    """
    return 'a' if letter.lower() in 'aeiou' else 'b'
# Group the COLUMNS (axis=1) by a function applied to each column label.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas 2.x and removed
# later — confirm the installed pandas version before reusing this pattern.
grouped = df.groupby(get_letter_type,axis=1)
# First row of the per-group non-null counts: 1 vowel-named column (A), 3 others.
print(grouped.count().iloc[0])
'''
a 1
b 3
'''
可以使用np中的统计方法:
# Aggregate every group with a NumPy reduction.
# NOTE(review): the expected output below corresponds to df.groupby(['A','B']),
# not to the axis=1 letter grouping assigned just above — the notes are out of
# sync with the code order here.
print(grouped.aggregate(np.sum))
'''
C D
A B
bar one 0.400157 0.410599
three 2.240893 1.454274
two -0.977278 0.121675
foo one 2.714141 0.340644
three -0.151357 0.333674
two 2.846296 0.905081
'''
取消groupby的索引,或自定义一个新的索引
# as_index=False keeps the group keys as ordinary columns instead of
# moving them into the result's index.
grouped = df.groupby(['A','B'],as_index= False)
# Alternative: rebuild the index afterwards with
# df.groupby(['A','B']).sum().reset_index()
print(grouped.aggregate(np.sum))
'''
A B C D
0 bar one 0.400157 0.410599
1 bar three 2.240893 1.454274
2 bar two -0.977278 0.121675
3 foo one 2.714141 0.340644
4 foo three -0.151357 0.333674
5 foo two 2.846296 0.905081
'''
用 size() 查看每个组合出现的次数,用 describe() 查看每组更详细的统计信息
grouped = df.groupby(['A','B'])
# size() counts rows per group (it includes NaN rows, unlike count()).
print(grouped.size())
'''
A B
bar one 1
three 1
two 1
foo one 2
three 1
two 2
'''
grouped = df.groupby('A')
# Several aggregations at once on a single column -> one result column per function.
print(grouped['C'].agg([np.sum,np.mean,np.std]))
# Renamed aggregations: passing a dict to SeriesGroupBy.agg was deprecated in
# pandas 0.20 and removed in 1.0 (it raises SpecificationError).
# BUG FIX: use named aggregation (keyword=func) instead of the dict form.
print(grouped['C'].agg(res_sum=np.sum, res_mean=np.mean, res_std=np.std))
'''
sum mean std
A
bar 1.663773 0.554591 1.614634
foo 5.409080 1.081816 0.811100
'''
get想要的group
# get_group() pulls out the sub-DataFrame for one specific group key.
print(df.groupby(['A']).get_group('bar')) # fetch whichever group you want by key
'''
A B C D
1 bar one 0.400157 0.410599
3 bar three 2.240893 1.454274
5 bar two -0.977278 0.121675
'''
取消索引的默认排序
np.random.seed(0)
# A Series with a duplicated, unsorted index.
s = pd.Series([1,2,3,4,5,7],[2,4,6,2,6,7])
print(s)
grouped = s.groupby(level=0)
print(grouped.first())
# first() keeps the value at the first occurrence of each index label.
# Group keys are sorted by default; pass sort=False, i.e.
# s.groupby(level=0, sort=False), to keep encounter order instead.
'''
2 1
4 2
6 3
7 7
'''
构造多重索引
# Build a two-level MultiIndex from parallel arrays of labels.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
print(s)
'''
first second
bar one 1.764052
two 0.400157
baz one 0.978738
two 2.240893
foo one 1.867558
two -0.977278
qux one 0.950088
two -0.151357
'''
# Sum within the outer level ('first').
grouped = s.groupby(level=0)
print(grouped.sum())
'''
first
bar 2.164210
baz 3.219631
foo 0.890280
qux 0.798731
'''
# Sum within the inner level; level='second' (by name) works the same way.
grouped = s.groupby(level=1)
print(grouped.sum())
'''
second
one 5.560437
two 1.512415
'''
字符串操作
# Vectorized string methods live under the .str accessor; NaN passes through.
s = pd.Series(['A', 'b', 'B', 'gear', 'Ager', np.nan])
s.str.lower()   # lowercase every element
s.str.upper()   # uppercase every element
s.str.len()     # element-wise string length

index = pd.Index([' tang', ' yu ', 'di '])
index.str.strip()   # trim whitespace on both sides
index.str.lstrip()  # trim the left side only
index.str.rstrip()  # trim the right side only

# Bulk-clean column labels, e.g. 'A a' -> 'A_a'.
df = pd.DataFrame(np.random.randn(3, 2), columns=['A a', 'B b'], index=range(3))
df.columns = df.columns.str.replace(' ', '_')

s = pd.Series(['a_b_c', 'd_e_f', 'g_h_i'])
print(s.str.split('_'))
'''
0 [a, b, c]
1 [d, e, f]
2 [g, h, i]
'''
# expand=True spreads the split pieces into DataFrame columns.
print(s.str.split('_', expand=True))
'''
0 1 2
0 a b c
1 d e f
2 g h i
'''
# n=1 limits the number of splits.
print(s.str.split('_', expand=True, n=1))
'''
0 1
0 a b_c
1 d e_f
2 g h_i
'''
print(s.str.contains('a'))
'''
0 True
1 False
2 False
'''
# One-hot encode multi-valued strings on a separator.
s = pd.Series(['a', 'a|b', 'a|c'])
print(s.str.get_dummies(sep='|'))
'''
a b c
0 1 0 0
1 1 1 0
2 1 0 1
'''
索引
# A series whose index runs opposite to its values.
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
print(s)
'''
4 0
3 1
2 2
1 3
0 4
'''
# isin() yields a boolean mask over the VALUES; filter with s[s.isin([...])].
print(s.isin([1,3,4]))
'''
4 False
3 True
2 False
1 True
0 True
'''
# Index.isin also works on a MultiIndex, with tuples as the keys.
s2 = pd.Series(np.arange(6), index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
print(s2)
'''
0 a 0
b 1
c 2
1 a 3
b 4
c 5
'''
print(s2.iloc[s2.index.isin([(1, 'a'), (2, 'b')])])
# 1 a 3
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s[s > 2]
dates = pd.date_range('20220317', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
# print(df.select(lambda x:x=='A',axis='columns'))
df.where(df < 0)        # entries that fail the condition (>= 0) become NaN
df.where(df < 0, -df)   # ...or are replaced by the second argument (-df)
df = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
df.query('(a<b) & (b<c)')  # row filter written as an expression string
对于大数据的处理技巧
# Load the sample dataset (the CSV is not shipped with these notes).
g1 = pd.read_csv('data.csv')
查询文件占用内存
# memory_usage='deep' makes info() report true memory use, introspecting
# object (string) columns instead of just counting their pointers.
g1.info(memory_usage = 'deep')
# prints the total memory usage line
循环遍历每种类型数据,查询每种类型的数据所占用的内存
# Per-dtype memory profile: mean deep memory usage of columns of each dtype.
for dtype in ['float64', 'int64', 'object']:
    # BUG FIX: the method is DataFrame.select_dtypes (plural), not select_dtype.
    selected_dtype = g1.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print('{}平均占用内存:{:.2f}MB'.format(dtype, mean_usage_mb))
#float64平均占用内存:1.29MB
#int64平均占用内存:1.12MB
#object平均占用内存:9.51MB
常用数据类型的范围:
uint8:0到255
int8:-128到127
int16:-32768到32767
int32:-2147483648到2147483647
int64:-9223372036854775808到9223372036854775807
由于pandas读入表之后很多数据都采用的int64或者float64,但是根本就没有这么大的数据,因此可以将64位降为8到32位,来减小内存
def mem_usage(pandas_obj):
    """Return the total deep memory usage of a DataFrame or Series as '<X>MB'.

    For a DataFrame, memory_usage(deep=True) returns a per-column Series that
    must be summed; for a Series it is already a scalar.

    BUG FIX: the original paste lost the body's indentation (SyntaxError);
    restored here with identical logic.
    """
    if isinstance(pandas_obj, pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2
    return '{:03.2f}MB'.format(usage_mb)
对数字类型使用to_numeric()
进行downcast
# BUG FIX: the original line was missing its closing parenthesis.
g1_int = g1.select_dtypes(include=['int64'])
# Downcast int64 columns to the smallest unsigned type that fits the data.
converted_int = g1_int.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(g1_int)) #7.87MB
print(mem_usage(converted_int)) #1.48MB
#同理float
# BUG FIX: the original line was missing its closing parenthesis.
g1_float = g1.select_dtypes(include=['float64'])
# Downcast float64 columns to float32 where the values allow it.
converted_float = g1_float.apply(pd.to_numeric, downcast='float')
print(mem_usage(g1_float)) #100.99MB
print(mem_usage(converted_float)) #50.49MB
可以看到对float64类型的数据进行to_numeric()
操作,可以节省一半的空间,但对于原本860MB的数据来说,只减小到了800MB左右
# Write the downcast columns back into a copy of the original frame and
# compare total memory before/after.
optimized_g1 = g1.copy()
optimized_g1[converted_int.columns] = converted_int
optimized_g1[converted_float.columns] = converted_float
print(mem_usage(g1)) #860.50MB
print(mem_usage(optimized_g1)) #803.61MB
因此可以对object类型的数据进行一些处理
对于object类型的数据,可能存在一列数据,他都是由重复的值构成,例如星期数(day_of_week)等字段,可以利用describe()
来查询表中每一列的unique值
# BUG FIX: the original line was missing its closing parenthesis.
g1_obj = g1.select_dtypes(include=['object'])
g1_obj.describe() # per-column summary: count, unique, top, freq
dow = g1_obj.day_of_week
dow.head()
'''
0 Thu
1 Fri
2 Sat
3 Mon
4 Tue
'''
# category dtype stores each distinct string once plus small integer codes.
dow_cat = dow.astype('category')
dow_cat.head()
'''
0 Thu
1 Fri
2 Sat
3 Mon
4 Tue
Name:day_of_week, dtype:category
Categories(7,object):[Fri,Mon,Sat,Sun,Thu,Tue,Wed]
'''
# BUG FIX: the closing quote of the sample-output literal above was missing,
# which swallowed the next code line into the string and left the output
# below as bare, syntactically invalid lines. Also added the missing 'Thu'
# so the category list actually shows 7 entries.
# .cat.codes exposes the integer code assigned to each category value.
dow_cat.head(6).cat.codes
'''
0 4
1 0
2 2
3 1
4 5
5 4
'''
print(mem_usage(dow)) #9.84MB
print(mem_usage(dow_cat)) #0.16MB
对每一列循环,判断每列的重复值比例,若比例高,则转换为category类型
# Convert object columns to category when fewer than half the values are
# distinct; otherwise keep them as-is.
# BUG FIX: the loop body's indentation was lost in the original paste
# (SyntaxError); restored here with identical logic.
converted_obj = pd.DataFrame()
for col in g1_obj.columns:
    num_unique = len(g1_obj[col].unique())  # includes NaN, unlike nunique()
    num_total = len(g1_obj[col])
    if num_unique / num_total < 0.5:
        converted_obj.loc[:, col] = g1_obj[col].astype('category')
    else:
        converted_obj.loc[:, col] = g1_obj[col]
print(mem_usage(g1_obj)) #751.64MB
print(mem_usage(converted_obj)) #51.67MB
这些值原本在表中都存储在不同的位置,我们可以将其转换为category类型数据,将相同的值指向同一个位置,从而节省空间。Category类型数据我们定义后,是一个固定的列表了,我们不能直接添加没有定义的类别,而没有定义在列表的值被置成了NaN值。
在数据处理的过程当中,通过使用特殊功能(例如 cut()),可以将数据分组为离散的bin,默认分组标签就是category类型,但若想在category类型下处理数据(例如添加category以外的值)会很麻烦,解决办法:
# A Categorical only admits values from its fixed category list, so register
# the fill value as a category first (otherwise fillna raises).
data['group']=data['group'].cat.add_categories(['空缺'])
# BUG FIX: fillna returns a new Series; the original discarded the result.
data['group'] = data['group'].fillna('空缺')
日期也可以进行转换
# Parse the yyyymmdd string column into datetime64 — enables date arithmetic
# and is more compact than object strings.
date = optimized_g1.date
optimized_g1['date'] = pd.to_datetime(date,format='%Y%m%d')
今日遇到问题:
#列筛选值
# Row filter on a column's value: keep only GNB / ENB network elements.
data1 = data1[(data1.loc[:,'网元类型'] == 'GNB')|(data1.loc[:,'网元类型'] == 'ENB')]
# Rename the columns wholesale.
data2.columns = ['1','告警名称','2','3','rank','4']
# Sort by rank, then de-duplicate on the location column so the
# best-ranked row per location survives.
data3 = data3.sort_values(by='rank',ascending=True)
# BUG FIX: drop_duplicates returns a new frame; the original discarded it.
data3 = data3.drop_duplicates(subset='定位信息')
# Drop rows with NaN in the scene-category column.
data.dropna(subset=['场景类别'],inplace=True)
# Drop rows whose location text contains any of these keywords.
data5 = data[~data.loc[:,'定位信息'].str.contains('auto|license|应急|保障|超级|卫星')]
# Pivot: count rows per (city, scene-category) pair.
data_count = data5.loc[:,['地市_y','场景类别']]
data_count['count_num'] = 1
data_pivot = data_count.pivot_table(index='地市_y',columns='场景类别',values = 'count_num',aggfunc='count')
data_pivot = data_pivot.fillna(0)
data_pivot = data_pivot.astype(np.int8)
# Row and column totals (sum(axis=1) replaces the equivalent apply/lambda).
data_pivot['col_sum'] = data_pivot.sum(axis=1)
# NOTE: row_sum is computed after col_sum was added, so its last cell
# is the grand total.
data_pivot.loc['row_sum'] = data_pivot.sum()
data_pivot