import pandas as pd
import numpy as np
import seaborn as sns
from pandas import Series,DataFrame
# Load the tipping dataset from the local CSV file, then preview its first 20 rows.
tips = pd.read_csv('tips.csv')
tips.head(n=20)
# Output of tips.head(20):
#
# |    | total_bill | tip  | sex    | smoker | day | time   | size |
# |----|------------|------|--------|--------|-----|--------|------|
# | 0  | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    |
# | 1  | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    |
# | 2  | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    |
# | 3  | 23.68      | 3.31 | Male   | No     | Sun | Dinner | 2    |
# | 4  | 24.59      | 3.61 | Female | No     | Sun | Dinner | 4    |
# | 5  | 25.29      | 4.71 | Male   | No     | Sun | Dinner | 4    |
# | 6  | 8.77       | 2.00 | Male   | No     | Sun | Dinner | 2    |
# | 7  | 26.88      | 3.12 | Male   | No     | Sun | Dinner | 4    |
# | 8  | 15.04      | 1.96 | Male   | No     | Sun | Dinner | 2    |
# | 9  | 14.78      | 3.23 | Male   | No     | Sun | Dinner | 2    |
# | 10 | 10.27      | 1.71 | Male   | No     | Sun | Dinner | 2    |
# | 11 | 35.26      | 5.00 | Female | No     | Sun | Dinner | 4    |
# | 12 | 15.42      | 1.57 | Male   | No     | Sun | Dinner | 2    |
# | 13 | 18.43      | 3.00 | Male   | No     | Sun | Dinner | 4    |
# | 14 | 14.83      | 3.02 | Female | No     | Sun | Dinner | 2    |
# | 15 | 21.58      | 3.92 | Male   | No     | Sun | Dinner | 2    |
# | 16 | 10.33      | 1.67 | Female | No     | Sun | Dinner | 3    |
# | 17 | 16.29      | 3.71 | Male   | No     | Sun | Dinner | 3    |
# | 18 | 16.97      | 3.50 | Female | No     | Sun | Dinner | 3    |
# | 19 | 20.65      | 3.35 | Male   | No     | Sat | Dinner | 3    |
# Two ways to group by sex: a single column grouped by an external key Series,
# and the whole frame grouped by that same key. The second assignment replaces
# the first, so the aggregations below run on the whole-frame grouping.
grouped = tips['tip'].groupby(tips['sex'])
grouped = tips.groupby(tips['sex'])
grouped
# tips also holds text columns (sex, smoker, day, time). Restrict the
# aggregations to numeric columns: on pandas >= 2.0 mean() raises TypeError on
# text data and sum() would concatenate the strings instead of skipping them.
grouped.mean(numeric_only=True)
grouped.sum(numeric_only=True)
# Mean tip for every (sex, time) combination, drawn as a horizontal bar chart.
sex_time_keys = [tips['sex'], tips['time']]
data_mean = tips['tip'].groupby(sex_time_keys).mean()
data_mean
data_mean.plot(kind='barh')
# Iterating over a GroupBy yields (group key, sub-Series) pairs; print each
# sex label followed by the tips belonging to it.
for name, group in tips['tip'].groupby(tips['sex']):
    print(name)
    print(group)
# Number of rows recorded for each sex.
tips.groupby('sex').size()
# Rebuild the grouping: first over the tip column only, then over the whole
# frame (the second assignment is the one that remains bound to `grouped`).
sex_key = tips['sex']
grouped = tips['tip'].groupby(sex_key)
grouped = tips.groupby(sex_key)
# Per-group means of the numeric columns. numeric_only=True keeps the text
# columns (sex, day, time, ...) out of the mean, which raises TypeError on
# pandas >= 2.0 otherwise.
smoker_mean = tips.groupby('smoker').mean(numeric_only=True)
# Column name fixed: the original spelled it 'somker', which raises KeyError.
smoker = tips.groupby('smoker', group_keys=False)['tip']
# Grouping by two keys gives a MultiIndex result ...
smoker_mean = tips.groupby(['sex', 'smoker']).mean(numeric_only=True)
# ... while as_index=False keeps the keys as ordinary columns instead.
smoker_mean = tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)
# Two spellings of "mean tip per party size"; the element-wise comparison at
# the end shows whether they produce the same values.
tip_by_size = tips.groupby('size')['tip']
size_mean1 = tip_by_size.mean()
size_key = tips['size']
size_mean2 = tips['tip'].groupby(size_key).mean()
size_mean1 == size_mean2
# Group rows by their (repeated) index labels.
df = DataFrame(np.arange(16).reshape(4, 4), index=list('abab'))
df.groupby(level=0).mean()
# Same grouping, but the labels come from a separate list aligned with the rows.
df = DataFrame(np.arange(16).reshape(4, 4))
list1 = list('abab')
df.groupby(list1).mean()
# Group rows through a mapping: each index label is looked up in dict1 and the
# mapped value ('one' / 'two') becomes the group key.
df = DataFrame(np.arange(16).reshape(4, 4), index=list('abAB'))
dict1 = dict(a='one', A='one', b='two', B='two')
df.groupby(dict1).mean()
def _sign_label(value):
    """Return 'a' for a non-negative value and 'b' otherwise."""
    if value >= 0:
        return 'a'
    return 'b'


# Random 4x4 frame; rows are grouped by the sign of their value in column 3.
df = DataFrame(np.random.randn(4, 4))
df
df.groupby(df[3].map(_sign_label)).sum()
# 4x4 frame with two-level labels on both axes.
df = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['one', 'one', 'two', 'two'], ['a', 'b', 'a', 'b']],
    columns=[['apple', 'apple', 'orange', 'orange'], ['red', 'green', 'red', 'green']],
)
df
# Sum the columns that share the same outer column label. groupby(axis=1) is
# deprecated in recent pandas, so group the transposed frame by its outer row
# level and transpose the result back.
df.T.groupby(level=0).sum().T
# Largest tip per sex, shown as a vertical bar chart.
tip_by_sex = tips.groupby('sex')['tip']
max_tip = tip_by_sex.max()
max_tip
max_tip.plot(kind='bar')
def get_range(x):
    """Return the range (maximum minus minimum) of a sequence of numbers.

    ``x`` must provide ``max()`` and ``min()`` methods, as a pandas Series or
    a NumPy array does.
    """
    return x.max() - x.min()
# Spread of the tips (largest minus smallest) for each sex, computed with a
# custom aggregation function.
tip_by_sex = tips.groupby('sex')['tip']
tips_range = tip_by_sex.agg(lambda tip_values: tip_values.max() - tip_values.min())
tips_range
def get_range(x):
    """Return the range (maximum minus minimum) of a sequence of numbers.

    ``x`` must provide ``max()`` and ``min()`` methods, as a pandas Series or
    a NumPy array does.
    """
    return x.max() - x.min()
# Several aggregations at once: built-in names and a custom function.
tips.groupby(['sex', 'smoker'])['tip'].agg(['mean', 'std', get_range])
# (label, function) pairs rename the resulting columns.
tips.groupby(['sex', 'smoker'])['tip'].agg([('tip_mean', 'mean'), ('range', get_range)])
# Selecting several columns from a GroupBy needs a list (double brackets);
# the bare 'a','b' form is a tuple key and raises on pandas >= 2.0.
tips.groupby(['day', 'time'])[['tip', 'total_bill']].agg([('tip_mean', 'mean'), ('range', get_range)])
# A dict applies different aggregations to different columns.
tips.groupby(['day', 'time'])[['total_bill', 'tip']].agg({'total_bill': ['sum', 'mean'], 'tip': 'mean'})
# transform returns a result aligned with the original rows: every row gets
# the mean of its own group. numeric_only=True skips the text columns, on
# which mean raises TypeError on pandas >= 2.0.
tips.groupby('sex').transform('mean', numeric_only=True)
tips.groupby('sex')['tip'].transform('mean')
# apply with a reducing function returns one row / one value per group.
tips.groupby('sex').apply(lambda x: x.mean(numeric_only=True))
tips.groupby('sex')['tip'].apply(lambda x: x.mean())
tips.groupby('sex').transform('mean', numeric_only=True)
tips.groupby('sex')['tip'].transform('mean')
# Fill missing values in each sex group with that group's mean.
# NOTE(review): in file order, the most recent assignment to `df` before this
# point is the 4x4 frame with two-level row/column labels, which has no 'sex'
# or 'math' column; a frame with those columns is only built further down.
# Run top to bottom these two lines presumably raise KeyError -- confirm the
# intended execution order.
df.groupby('sex').apply(lambda x:x.fillna(x.mean()))
df.groupby('sex')['math'].apply(lambda x:x.fillna(x.mean()))
# Approach 1: transform gives a row-aligned Series, so the per-sex mean tip
# can be attached to tips directly as a new column.
tips['tip_mean_by_sex'] = tips.groupby('sex')['tip'].transform('mean')

# Approach 2: aggregate to one mean per sex, then merge it back onto the rows.
tip_mean_by_sex = tips.groupby('sex')['tip'].mean()
tip_mean_by_sex
tip_mean_by_sex_df = DataFrame(tip_mean_by_sex)
tip_mean_by_sex_df
# Both frames have a 'tip' column; the suffixes rename the right-hand one to
# 'tip_mean_by_sex'. tips already carries a column of that name from the
# assignment above, so drop it from the left side first -- otherwise the merge
# would produce a duplicate column name, which pandas >= 2.0 rejects.
new_tips = pd.merge(
    tips.drop(columns='tip_mean_by_sex'),
    tip_mean_by_sex_df,
    left_on='sex',
    right_index=True,
    suffixes=('', '_mean_by_sex'),
    how='left',
)
new_tips.head(10)

# Approach 1 again, this time on a copy so that tips itself is not modified.
new_tips = tips.copy()
new_tips['tip_mean_by_sex'] = tips.groupby('sex')['tip'].transform('mean')
new_tips.head(10)
def _top_five_tips(group):
    """Return the five rows of ``group`` with the largest tip, highest first."""
    ranked = group.sort_values(by='tip', ascending=False)
    return ranked.head(5)


# Top five tips within each sex; the result index carries the group key ...
tips.groupby('sex').apply(_top_five_tips)
# ... unless group_keys=False, which keeps only the original row index.
tips.groupby('sex', group_keys=False).apply(_top_five_tips)
# Small score table with missing names and missing scores.
data = {
    'name': ['张三', '李四', np.nan, '王五', '小明', '马六'],
    'sex': ['female', 'female', 'male', 'male', 'male', 'female'],
    'math': [67, 77, np.nan, 82, 90, np.nan],
    'English': [67, 77, np.nan, 82, 90, np.nan],
}
df = DataFrame(data)
# Fill each score column with its own overall mean. The dict form restricts
# the fill to that column; a bare scalar would also be written into the
# missing 'name' entry.
df.fillna({'math': df['math'].mean()})
df.fillna({'English': df['English'].mean()})
# Fill with the mean of the row's own sex group instead. numeric_only=True is
# needed because each group also contains text columns, on which mean()
# raises TypeError on pandas >= 2.0.
df.groupby('sex').apply(lambda x: x.fillna(x.mean(numeric_only=True)))
df.groupby('sex')['math'].apply(lambda x: x.fillna(x.mean()))
# Shared layout for the pivot tables below: tip values, one row per sex, one
# column per smoker flag.
pivot_layout = dict(values='tip', index='sex', columns='smoker')
# Default aggregation (mean).
tips.pivot_table(**pivot_layout)
# Sum instead of mean.
tips.pivot_table(aggfunc='sum', **pivot_layout)
# Sum, with 'All' margins added for the row and column totals.
tips.pivot_table(aggfunc='sum', margins=True, **pivot_layout)
# The same tables built by hand: group by both keys, aggregate, then move the
# inner key (smoker) into the columns.
tips.groupby(['sex', 'smoker'])['tip'].mean().unstack()
tips.groupby(['sex', 'smoker'])['tip'].sum().unstack()
sex_smoker = tips.groupby(['sex', 'smoker'])['tip'].sum().unstack()
# Row totals.
sex_smoker['All'] = sex_smoker['No'] + sex_smoker['Yes']
# Column totals as a new 'All' row. DataFrame.append was removed in pandas
# 2.0, and summing after the 'All' column exists also fills in the grand
# total, which the appended dict left as NaN.
sex_smoker.loc['All'] = sex_smoker.sum()
# Name the index last so the name is not lost again.
sex_smoker.index.name = 'Sex'
sex_smoker
# Frequency table: number of rows for each (day, party size) combination.
# The pandas function is pd.crosstab; pd.cross_table does not exist.
cross_table = pd.crosstab(index=tips['day'], columns=tips['size'])
cross_table
# Divide every row by its own total so each day's shares add up to 1.
df = cross_table.div(cross_table.sum(axis=1), axis=0)
df
df.plot(kind='bar', stacked=True)