本次介绍 Pandas 总结归纳的8节至15节内容
import pandas as pd
import numpy as np
import matplotlib as plt
import math
8. Pandas分组(GroupBy)
ipl_data = { 'Team' : [ 'Riders' , 'Riders' , 'Devils' , 'Devils' , 'Kings' ,
'kings' , 'Kings' , 'Kings' , 'Riders' , 'Royals' , 'Royals' , 'Riders' ] ,
'Rank' : [ 1 , 2 , 2 , 3 , 3 , 4 , 1 , 1 , 2 , 4 , 1 , 2 ] ,
'Year' : [ 2014 , 2015 , 2014 , 2015 , 2014 , 2015 , 2016 , 2017 , 2016 , 2014 , 2015 , 2017 ] ,
'Points' : [ 876 , 789 , 863 , 673 , 741 , 812 , 756 , 788 , 694 , 701 , 804 , 690 ] }
df = pd. DataFrame( ipl_data)
df. groupby( 'Team' ) . groups
df. groupby( [ 'Team' , 'Year' ] ) . groups
df. groupby( 'Year' ) . get_group( 2014 )
{‘Devils’: Int64Index([2, 3], dtype=‘int64’), ‘Kings’: Int64Index([4, 6, 7], dtype=‘int64’), ‘Riders’: Int64Index([0, 1, 8, 11], dtype=‘int64’), ‘Royals’: Int64Index([9, 10], dtype=‘int64’), ‘kings’: Int64Index([5], dtype=‘int64’)}
{(‘Devils’, 2014): Int64Index([2], dtype=‘int64’), (‘Devils’, 2015): Int64Index([3], dtype=‘int64’), (‘Kings’, 2014): Int64Index([4], dtype=‘int64’), (‘Kings’, 2016): Int64Index([6], dtype=‘int64’), (‘Kings’, 2017): Int64Index([7], dtype=‘int64’), (‘Riders’, 2014): Int64Index([0], dtype=‘int64’), (‘Riders’, 2015): Int64Index([1], dtype=‘int64’), (‘Riders’, 2016): Int64Index([8], dtype=‘int64’), (‘Riders’, 2017): Int64Index([11], dtype=‘int64’), (‘Royals’, 2014): Int64Index([9], dtype=‘int64’), (‘Royals’, 2015): Int64Index([10], dtype=‘int64’), (‘kings’, 2015): Int64Index([5], dtype=‘int64’)}
Points Rank Team Year 0 876 1 Riders 2014 2 863 2 Devils 2014 4 741 3 Kings 2014 9 701 4 Royals 2014
df. groupby( 'Year' ) [ 'Points' ] . agg( np. mean)
df. groupby( 'Team' ) . agg( np. size)
df1= df. groupby( 'Team' )
df1[ 'Points' ] . agg( [ np. sum , np. mean, np. std] )
Year 2014 795.25 2015 769.50 2016 725.00 2017 739.00 Name: Points, dtype: float64
Points Rank Year Team Devils 2 2 2 Kings 3 3 3 Riders 4 4 4 Royals 2 2 2 kings 1 1 1
sum mean std Team Devils 1536 768.000000 134.350288 Kings 2285 761.666667 24.006943 Riders 3049 762.250000 88.567771 Royals 1505 752.500000 72.831998 kings 812 812.000000 NaN
df. groupby( 'Team' ) . filter ( lambda x: len ( x) >= 4 )
Points Rank Team Year 0 876 1 Riders 2014 1 789 2 Riders 2015 8 694 2 Riders 2016 11 690 2 Riders 2017
9. Pandas合并/连接
pd.merge(left, right, how=‘inner’, on=None, left_on=None, right_on=None,left_index=False, right_index=False, sort=True) left - 一个DataFrame对象。 right - 另一个DataFrame对象。 on - 列(名称)连接,必须在左和右DataFrame对象中存在(找到)。 left_on - 左侧DataFrame中的列用作键,可以是列名或长度等于DataFrame长度的数组。 right_on - 来自右的DataFrame的列作为键,可以是列名或长度等于DataFrame长度的数组。 left_index - 如果为True,则使用左侧DataFrame中的索引(行标签)作为其连接键。 在具有MultiIndex(分层)的DataFrame的情况下,级别的数量必须与来自右DataFrame的连接键的数量相匹配。 right_index - 与右DataFrame的left_index具有相同的用法。 how - 它是left, right, outer以及inner之中的一个,默认为内inner。 下面将介绍每种方法的用法。 sort - 按照字典顺序通过连接键对结果DataFrame进行排序。默认为True,设置为False时,在很多情况下大大提高性能。
leftdata = pd. DataFrame( {
'id' : [ 1 , 2 , 3 , 4 , 5 ] ,
'Name' : [ 'Alex' , 'Amy' , 'Allen' , 'Alice' , 'Ayoung' ] ,
'subject_id' : [ 'sub1' , 'sub2' , 'sub4' , 'sub6' , 'sub5' ] } )
rightdata = pd. DataFrame(
{ 'id' : [ 1 , 2 , 3 , 4 , 5 ] ,
'Name' : [ 'Billy' , 'Brian' , 'Bran' , 'Bryce' , 'Betty' ] ,
'subject_id' : [ 'sub2' , 'sub4' , 'sub3' , 'sub6' , 'sub5' ] } )
pd. merge( leftdata, rightdata, on= 'id' )
pd. merge( leftdata, rightdata, on= [ 'id' , 'subject_id' ] )
Name_x id subject_id_x Name_y subject_id_y 0 Alex 1 sub1 Billy sub2 1 Amy 2 sub2 Brian sub4 2 Allen 3 sub4 Bran sub3 3 Alice 4 sub6 Bryce sub6 4 Ayoung 5 sub5 Betty sub5
Name_x id subject_id Name_y 0 Alice 4 sub6 Bryce 1 Ayoung 5 sub5 Betty
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'left' )
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'right' )
Name_x id_x subject_id Name_y id_y 0 Alex 1 sub1 NaN NaN 1 Amy 2 sub2 Billy 1.0 2 Allen 3 sub4 Brian 2.0 3 Alice 4 sub6 Bryce 4.0 4 Ayoung 5 sub5 Betty 5.0
Name_x id_x subject_id Name_y id_y 0 Amy 2.0 sub2 Billy 1 1 Allen 3.0 sub4 Brian 2 2 Alice 4.0 sub6 Bryce 4 3 Ayoung 5.0 sub5 Betty 5 4 NaN NaN sub3 Bran 3
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'inner' )
pd. merge( leftdata, rightdata, on= 'subject_id' , how= 'outer' )
Name_x id_x subject_id Name_y id_y 0 Amy 2 sub2 Billy 1 1 Allen 3 sub4 Brian 2 2 Alice 4 sub6 Bryce 4 3 Ayoung 5 sub5 Betty 5
Name_x id_x subject_id Name_y id_y 0 Alex 1.0 sub1 NaN NaN 1 Amy 2.0 sub2 Billy 1.0 2 Allen 3.0 sub4 Brian 2.0 3 Alice 4.0 sub6 Bryce 4.0 4 Ayoung 5.0 sub5 Betty 5.0 5 NaN NaN sub3 Bran 3.0
10. Pandas级联函数
函数 函数形式 参数解释 concat ( ) pd.concat ( objs,axis=0,join=‘outer’,join_axes=None,ignore_index=False ) bjs - 这是Series,DataFrame或Panel对象的序列或映射 axis - {0,1,…},默认为0,这是连接的轴 join - {‘inner’, ‘outer’},默认inner,如何处理其他轴上的索引,联合的外部和交叉的内部 ignore_index − 布尔值,默认为False。如果指定为True,结果轴将被标记为:0,…,n-1 join_axes - 这是Index对象的列表,用于其他(n-1)轴的特定索引,而不是执行内部/外部集逻辑 append ( ) append ( [object ] ) object - 数据框对象
df1 = pd. DataFrame( { 'Name' : [ 'Alex' , 'Amy' , 'Allen' , 'Alice' , 'Ayoung' ] ,
'subject_id' : [ 'sub1' , 'sub2' , 'sub4' , 'sub6' , 'sub5' ] ,
'Marks_scored' : [ 98 , 90 , 87 , 69 , 78 ] } , index= [ 1 , 2 , 3 , 4 , 5 ] )
df2 = pd. DataFrame( { 'Name' : [ 'Billy' , 'Brian' , 'Bran' , 'Bryce' , 'Betty' ] ,
'subject_id' : [ 'sub2' , 'sub4' , 'sub3' , 'sub6' , 'sub5' ] ,
'Marks_scored' : [ 89 , 80 , 79 , 97 , 88 ] } , index= [ 1 , 2 , 3 , 4 , 5 ] )
pd. concat( [ df1, df2] )
pd. concat( [ df1, df2] , keys= [ 'a' , 'b' ] )
pd. concat( [ df1, df2] , keys= [ 'a' , 'b' ] , ignore_index= True )
Marks_scored Name subject_id 1 98 Alex sub1 2 90 Amy sub2 3 87 Allen sub4 4 69 Alice sub6 5 78 Ayoung sub5 1 89 Billy sub2 2 80 Brian sub4 3 79 Bran sub3 4 97 Bryce sub6 5 88 Betty sub5
Marks_scored Name subject_id a 1 98 Alex sub1 2 90 Amy sub2 3 87 Allen sub4 4 69 Alice sub6 5 78 Ayoung sub5 b 1 89 Billy sub2 2 80 Brian sub4 3 79 Bran sub3 4 97 Bryce sub6 5 88 Betty sub5
Marks_scored Name subject_id 0 98 Alex sub1 1 90 Amy sub2 2 87 Allen sub4 3 69 Alice sub6 4 78 Ayoung sub5 5 89 Billy sub2 6 80 Brian sub4 7 79 Bran sub3 8 97 Bryce sub6 9 88 Betty sub5
pd. concat( [ df1, df2] , axis= 1 )
df1. append( [ df2, df1, df1[ 1 : 3 ] ] )
Marks_scored Name subject_id Marks_scored Name subject_id 1 98 Alex sub1 89 Billy sub2 2 90 Amy sub2 80 Brian sub4 3 87 Allen sub4 79 Bran sub3 4 69 Alice sub6 97 Bryce sub6 5 78 Ayoung sub5 88 Betty sub5
Marks_scored Name subject_id 1 98 Alex sub1 2 90 Amy sub2 3 87 Allen sub4 4 69 Alice sub6 5 78 Ayoung sub5 1 89 Billy sub2 2 80 Brian sub4 3 79 Bran sub3 4 97 Bryce sub6 5 88 Betty sub5 1 98 Alex sub1 2 90 Amy sub2 3 87 Allen sub4 4 69 Alice sub6 5 78 Ayoung sub5 2 90 Amy sub2 3 87 Allen sub4
11. Pandas日期时间函数
函数 函数作用 pd.datetime.now( ) 用于获取当前的日期和时间 pd.Timestamp( ) 创建一个时间戳 pd.to_datetime( [date1,date2,…. ] ) 转换为时间戳 pd.date_range( start_date, end_date ,freq=’ ’ ).time 创建一个时间范围 pd.date_range( date, periods=int_number ) 创建日期序列 pd.date_range( date, periods=int_number, freq=‘M’ ) 更改日期频率 pd.bdate_range( date, periods=int_number ) 用来创建商业日期范围,不包括周六和周天
偏移别名 描述说明 偏移别名 描述说明 B 工作日频率 H 小时频率 BQS 商务季度开始频率 MS 月起始频率 D 日历/自然日频率 T, min 分钟的频率 A 年度(年)结束频率 SMS SMS半开始频率 W 每周频率 S 秒频率 BA 商务年底结束 BMS 商务月开始频率 M 月结束频率 L, ms 毫秒 BAS 商务年度开始频率 Q 季度结束频率 SM 半月结束频率 U, us 微秒 BH 商务时间频率 BQ 商务季度结束频率 SM 半月结束频率 N 纳秒 BH 商务时间频率 BQ 商务季度结束频率 BM 商务月结束频率 QS 季度开始频率
pd. datetime. now( )
pd. Timestamp( '2018-12-20' )
pd. Timestamp( 1588612345 , unit= 's' )
pd. date_range( "12:00" , "23:59" , freq= "H" ) . time
pd. to_datetime( [ 'Jul 31, 2009' , '2019-10-10' , '2009/11/23' , '2019.12.31' , None ] )
datetime.datetime(2018, 12, 21, 14, 20, 25, 4339)
Timestamp(‘2018-12-20 00:00:00’)
Timestamp(‘2020-05-04 17:12:25’)
array([datetime.time(12, 0), datetime.time(13, 0), datetime.time(14, 0), datetime.time(15, 0), datetime.time(16, 0), datetime.time(17, 0), datetime.time(18, 0), datetime.time(19, 0), datetime.time(20, 0), datetime.time(21, 0), datetime.time(22, 0), datetime.time(23, 0)], dtype=object)
DatetimeIndex([‘2009-07-31’, ‘2019-10-10’, ‘2009-11-23’, ‘2019-12-31’, ‘NaT’], dtype=‘datetime64[ns]’, freq=None)
pd. date_range( '2018/11/21' , periods= 5 )
pd. date_range( '2018/11/21' , periods= 5 , freq= 'M' )
pd. bdate_range( '2018/12/21' , periods= 5 )
DatetimeIndex([‘2018-11-21’, ‘2018-11-22’, ‘2018-11-23’, ‘2018-11-24’, ‘2018-11-25’], dtype=‘datetime64[ns]’, freq=‘D’)
DatetimeIndex([‘2018-11-30’, ‘2018-12-31’, ‘2019-01-31’, ‘2019-02-28’, ‘2019-03-31’], dtype=‘datetime64[ns]’, freq=‘M’)
DatetimeIndex([‘2018-12-21’, ‘2018-12-24’, ‘2018-12-25’, ‘2018-12-26’, ‘2018-12-27’], dtype=‘datetime64[ns]’, freq=‘B’)
12. Pandas分类构造函数
函数 函数作用 pd.Categorical(values, categories, ordered) 创建一个类别对象
pd. Series( [ "a" , "b" , "c" , "a" ] , dtype= "category" )
pd. Categorical( [ 'a' , 'b' , 'c' , 'a' , 'b' , 'c' ] , ordered= True )
0 a 1 b 2 c 3 a dtype: category Categories (3, object): [a, b, c]
[a, b, c, a, b, c] Categories (3, object): [a < b < c]
cat = pd. Categorical( [ "a" , "c" , "c" , np. nan] , categories= [ "b" , "a" , "c" ] )
df = pd. DataFrame( { "cat" : cat, "s" : [ "a" , "c" , "c" , np. nan] } )
df. describe( )
df[ "cat" ] . describe( )
cat s count 3 3 unique 2 2 top c c freq 2 2
count 3 unique 2 top c freq 2 Name: cat, dtype: object
cat. ordered
cat. categories = [ "Group %s" % g for g in cat. categories]
cat. add_categories( [ 4 ] )
cat. remove_categories( "Group a" )
False
[Group a, Group c, Group c, NaN] Categories (4, object): [Group b, Group a, Group c, 4]
[NaN, Group c, Group c, NaN] Categories (2, object): [Group b, Group c]
13. Pandas可视化
绘图函数 图形 df.plot( ) 绘图 df.plot.bar()或plot.barh() 条形图 df.plot.hist() 直方图 df.plot.box() 盒型图 df.plot.area() 面积 df.plot.scatter(x=,y=) 散点图 df.plot.pie( ) 饼状图
14. Pandas其他相关函数应用
函数 函数作用 形式 pipe() 可以通过将函数和适当数量的参数作为管道参数来执行自定义函数对DataFrame进行操作 df.piper ( function, perp ) apply() apply()方法沿DataFrame或Panel的轴应用任意函数,它与描述性统计方法一样,采用可选的轴参数 df.apply(np.mean,axis=1) reindex() 采用可选参数方法,它是一个填充方法 df.reindex( index= , columns= ) df2.reindex_like( df1, method=‘ffill’ ) pad/ffill - 向前填充值;bfill/backfill - 向后填充值;nearest - 从最近的索引值填充 rename() 允许基于一些映射(字典或者系列)或任意函数来重新标记一个轴 df.rename( index= , columns= ) isnull() 检查缺失值 notnull() 检查缺失值 fillna() 函数可以通过几种方法用非空数据“填充”NA值 df.fillna(number) 用标量值替换NaN df.fillna(method=‘pad’) 填写NA前进和后退method=pad/fill /bfill/backfill dropna() 如果行内的任何值是NA,那么整个行被排除 df.dropna(axis=0/1 )
def adder ( ele1, ele2) :
return ele1+ ele2
df = pd. DataFrame( np. random. randn( 5 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df. pipe( adder, 2 )
col1 col2 col3 0 0.490731 0.926610 2.269177 1 1.154097 2.823656 2.141345 2 1.475614 2.054501 1.195006 3 1.984239 2.726985 2.644600 4 1.836258 2.737405 1.790489
df. apply ( np. mean)
df. apply ( np. mean, axis= 1 )
df. apply ( lambda x: x. max ( ) - x. min ( ) )
col1 -0.611812 col2 0.253831 col3 0.008124 dtype: float64
0 -0.771160 1 0.039699 2 -0.424960 3 0.451941 4 0.121384 dtype: float64
col1 1.493508 col2 1.897046 col3 1.449594 dtype: float64
df1 = pd. DataFrame( np. random. randn( 6 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df2 = pd. DataFrame( np. random. randn( 2 , 3 ) , columns= [ 'col1' , 'col2' , 'col3' ] )
df_reindexed = df1. reindex( index= [ 0 , 2 , 3 ] , columns= [ 'col1' , 'col2' ] )
df2. reindex_like( df1)
df2. reindex_like( df1, method= 'ffill' )
df2. reindex_like( df1, method= 'ffill' , limit= 1 )
col1 col2 col3 0 -2.311801 0.782469 0.696058 1 -1.351515 -0.019053 1.087809 2 NaN NaN NaN 3 NaN NaN NaN 4 NaN NaN NaN 5 NaN NaN NaN
col1 col2 col3 0 -2.311801 0.782469 0.696058 1 -1.351515 -0.019053 1.087809 2 -1.351515 -0.019053 1.087809 3 -1.351515 -0.019053 1.087809 4 -1.351515 -0.019053 1.087809 5 -1.351515 -0.019053 1.087809
col1 col2 col3 0 -2.311801 0.782469 0.696058 1 -1.351515 -0.019053 1.087809 2 -1.351515 -0.019053 1.087809 3 NaN NaN NaN 4 NaN NaN NaN 5 NaN NaN NaN
df. rename( columns= { 'col1' : 'c1' , 'col2' : 'c2' } , index = { 0 : 'apple' , 1 : 'banana' , 2 : 'durian' } )
c1 c2 col3 apple -1.509269 -1.073390 0.269177 banana -0.845903 0.823656 0.141345 durian -0.524386 0.054501 -0.804994 3 -0.015761 0.726985 0.644600 4 -0.163742 0.737405 -0.209511
df3= df_reindexed. reindex_like( df1)
df3
df3[ 'col1' ] . isnull( )
df3[ 'col1' ] . notnull( )
df3. fillna( 0 )
col1 col2 col3 0 -0.321847 0.838906 NaN 1 NaN NaN NaN 2 1.113037 -1.161580 NaN 3 -0.256907 -1.525070 NaN 4 NaN NaN NaN 5 NaN NaN NaN
0 False 1 True 2 False 3 False 4 True 5 True Name: col1, dtype: bool
0 True 1 False 2 True 3 True 4 False 5 False Name: col1, dtype: bool
col1 col2 col3 0 -0.321847 0.838906 0.0 1 0.000000 0.000000 0.0 2 1.113037 -1.161580 0.0 3 -0.256907 -1.525070 0.0 4 0.000000 0.000000 0.0 5 0.000000 0.000000 0.0
df3. fillna( method= 'pad' )
df3. fillna( method= 'pad' ) . dropna( axis= 1 )
col1 col2 col3 0 -0.321847 0.838906 NaN 1 -0.321847 0.838906 NaN 2 1.113037 -1.161580 NaN 3 -0.256907 -1.525070 NaN 4 -0.256907 -1.525070 NaN 5 -0.256907 -1.525070 NaN
col1 col2 0 -0.321847 0.838906 1 -0.321847 0.838906 2 1.113037 -1.161580 3 -0.256907 -1.525070 4 -0.256907 -1.525070 5 -0.256907 -1.525070
15. Pandas读取外部数据
读取函数 函数作用 pd.read_csv(filename) 导入CSV文档 pd.read_table(filename) 导入分隔的文本文件 (如TSV) pd.read_excel(filename) 导入Excel文档 pd.read_sql(query, connection_object) 读取SQL 表/数据库 pd.read_json(json_string) 读取JSON格式的字符串, URL或文件. pd.read_html(url) 解析html URL,字符串或文件,并将表提取到数据框列表 pd.read_clipboard() 获取剪贴板的内容并将其传递给read_table() 写入函数 函数作用 df.to_csv(filename) 写入CSV文件 df.to_excel(filename) 写入Excel文件 df.to_sql(table_name, connection_object) 写入一个SQL表 df.to_json(filename) 写入JSON格式的文件 以上函数具体参数用法自行查阅