pandas常用使用样例及速查表
1.重建index
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
print(frame)
# Promote columns 'c' and 'd' to a hierarchical row index;
# drop=False keeps them as ordinary columns as well.
print(frame.set_index(['c', 'd'], drop=False))
# reset_index moves the index level(s) back into ordinary columns.
print(frame.reset_index())
2.调换索引级别
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
print(frame)
# Swap the outer and inner levels of the column MultiIndex.
print(frame.swaplevel(0, 1, axis=1))
3.数据合并
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
left = DataFrame({'key1': ['foo', 'foo', 'bar'], 'key2': ['one', 'two', 'one'], 'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], 'key2': ['one', 'one', 'one', 'two'], 'rval': [4, 5, 6, 7]})
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
# Shared column 'key' as the join key; the default is an inner join.
print(pd.merge(df1, df2, on='key'))
# Different key names on each side; an outer join keeps unmatched rows.
print(pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='outer'))
# Join on a composite key.
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))
# Join on key1 only; suffixes disambiguate the other duplicated columns.
print(pd.merge(left, right, on='key1', suffixes=('_left', '_right'), how='outer'))
# Left side joins on a column, right side on its index.
print(pd.merge(left1, right1, left_on='key', right_index=True))
# Same idea with a MultiIndex on the right.
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True))
4.数据拼接
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
s4 = pd.concat([s1 * 5, s3])
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
# Stack Series end to end (axis=0) or side by side (axis=1).
print(pd.concat([s1, s2, s3]))
print(pd.concat([s1, s2, s3], axis=1))
# join='inner' keeps only index labels shared by all inputs.
print(pd.concat([s1, s4], axis=1, join='inner'))
# Force a specific result index. join_axes was removed in pandas 1.0;
# reindex() after the concat is the documented replacement.
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
# keys labels each input, producing a hierarchical index (or columns).
print(pd.concat([s1, s1, s4], keys=['one', 'two', 'three']))
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# ignore_index=True discards the original row labels.
print(pd.concat([df1, df2], ignore_index=True))
5.数据转换
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'pastrami', 'corned beef',
                           'bacon', 'pastrami', 'honey ham', 'nova lox', 'apple'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6, 0]})
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
                  'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}
# map: element-wise translation through a dict (unmapped values -> NaN).
print(data['food'].map(meat_to_animal))
# transform with a custom function; dict.get supplies the fallback value.
print(data['food'].transform(lambda x: meat_to_animal.get(x, 'fruit')))
6.数据切割
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# Bin values into intervals; right-closed (18, 25] by default.
print(pd.cut(ages, bins).value_counts())
# right=False makes the bins left-closed instead: [18, 25).
print(pd.cut(ages, bins, right=False).value_counts())
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
# labels= gives the bins readable names.
print(pd.cut(ages, bins, labels=group_names).value_counts())
7.字符串处理
data = Series({'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
               'Rob': 'rob@gmail.com', 'Wes': np.nan})
# Raw string avoids double-escaping the regex (same pattern bytes).
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# Vectorized substring test; NaN entries propagate as NaN.
print(data.str.contains('google'))
# Extract all (user, domain, suffix) matches per string.
print(data.str.findall(pattern, flags=re.IGNORECASE))
8.数据聚合
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5), 'data2': np.random.randn(5)})
# Group one column by several key columns.
print(df['data1'].groupby([df['key1'], df['key2']]).mean())
# Same aggregation, pivoted so the inner key level becomes columns.
print(df['data1'].groupby([df['key1'], df['key2']]).mean().unstack())
# Restrict the aggregation to selected column(s).
print(df.groupby('key1')['data1'].mean())
people = DataFrame(np.arange(25).reshape(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
key_list = ['one', 'one', 'one', 'two', 'two']
# Group COLUMNS via a dict. groupby(..., axis=1) was deprecated in
# pandas 2.1 and removed in 3.0; transpose, group rows, transpose back.
print(people.T.groupby(mapping).sum().T)
# Group by a function applied to the index (here: name length).
print(people.groupby(len).sum())
# Mixed keys: a function plus a parallel list.
print(people.groupby([len, key_list]).sum())
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.arange(20).reshape(4, 5), columns=columns)
# Group columns by one level of their MultiIndex (same transpose trick).
print(hier_df.T.groupby(level='tenor').sum().T)
tips_dict = {'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59, 25.29],
             'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 4.71],
             'sex': ['F', 'M', 'M', 'M', 'F', 'M'],
             'smoker': ['N', 'Y', 'N', 'N', 'Y', 'Y'],
             'sizes': [1, 2, 3, 4, 5, 6]}
tips = DataFrame(tips_dict)
tips['tip_pct'] = tips['tip'] / tips['total_bill']
print(tips)
# as_index=False leaves the group keys as ordinary columns.
print(tips.groupby(['sex', 'smoker'], as_index=False).mean())
# apply runs an arbitrary function on each group's sub-frame.
print(tips.groupby('smoker').apply(lambda x: x.sort_values(by='tip_pct')))
# Pivot table; the default aggregation is the mean.
print(tips.pivot_table(index=['sex'], columns=['smoker']))
# margins=True adds an 'All' row/column of grand totals.
print(tips.pivot_table(index=['sex'], columns=['smoker'], margins=True))
# aggfunc chooses the aggregation. Use the string alias: passing the
# builtin sum is deprecated in pandas 2.x.
print(tips.pivot_table(index='sex', columns='smoker', aggfunc='sum'))
# crosstab counts group frequencies by default (not sums).
print(pd.crosstab([tips.sex, tips.sizes], tips.smoker, margins=True))
grouped = tips.groupby(['sex', 'smoker'])
print(grouped['tip_pct'].mean())
# Named aggregations: each tuple is (result column name, aggregation).
print(grouped['tip_pct'].agg([('foo', 'mean'), ('bar', np.std)]))
# Several aggregations per column, with optional renaming.
print(grouped.agg({'tip_pct': ['min', 'max', 'mean', ('std_name', np.std)], 'tip': 'sum'}))
frame = DataFrame({'data1': np.arange(1, 1001), 'data2': np.arange(1001, 2001)})

def get_stats(group):
    """Return summary statistics for one group as a dict."""
    return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}

# Split data1 into 4 equal-width bins, then summarise data2 per bin.
print(frame.data2.groupby(pd.cut(frame.data1, 4)).apply(get_stats).unstack())
# Split data1 into 10 equal-sized (quantile) bins instead.
print(frame.data2.groupby(pd.qcut(frame.data1, 10, labels=False)).apply(get_stats).unstack())
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
# Build the Series with float dtype up front: assigning NaN into an
# int64 Series relies on silent upcasting, which is deprecated in
# pandas 2.1 and an error in 3.0.
data = Series(np.arange(8.0), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
fill_values = {'East': 50, 'West': 100}
# Fill missing values with each group's own mean.
print(data.groupby(group_key).apply(lambda x: x.fillna(x.mean())))
# Or with a fixed per-group value, looked up by the group's name.
print(data.groupby(group_key).apply(lambda x: x.fillna(fill_values[x.name])))