Python3–Pandas
1. 导包
import numpy as np
import pandas as pd
2. Series的创建
data = pd.Series(["skey" ,"syl" ,"earth" ])
print(data)
0 skey 1 syl 2 earth dtype: object
data.values
array([‘skey’, ‘syl’, ‘earth’], dtype=object)
data.index
RangeIndex(start=0, stop=3, step=1)
data[:2 ]
0 skey 1 syl dtype: object
info = pd.Series(["job" ,"15" ,"student" ],index=["name" ,"age" ,"work" ])
print(info)
name job age 15 work student dtype: object
info["age" ]
‘15’
dic = {
"name" :"zha" ,
"age" :"15"
}
dic2ser = pd.Series(dic)
print(dic2ser,type(dic2ser))
name zha age 15 dtype: object
print(
pd.Series({
1 :1 ,
3 :13 ,
2 :14
})
)
1 1 3 13 2 14 dtype: int64
3.Dataframe的创建与读取
df = pd.DataFrame(np.array(range(10 )).reshape(2 ,5 ),index=["one" ,"two" ],columns=[1 ,2 ,3 ,4 ,5 ])
print(df)
1 2 3 4 5 one 0 1 2 3 4 two 5 6 7 8 9
print(df.index,"\n" ,df.columns)
Index([‘one’, ‘two’], dtype=’object’) Int64Index([1, 2, 3, 4, 5], dtype=’int64’)
print(df[1 ])
one 0 two 5 Name: 1, dtype: int32
print(df.loc["one" ])
print(df.iloc[0 ])
1 0 2 1 3 2 4 3 5 4 Name: one, dtype: int32 1 0 2 1 3 2 4 3 5 4 Name: one, dtype: int32
4. As joins across datasets
indA = pd.Index([1 ,2 ,3 ,4 ,5 ,6 ])
indB = pd.Index([1 ,3 ,5 ,7 ])
indA & indB
Int64Index([1, 3, 5], dtype=’int64’)
indA | indB
Int64Index([1, 2, 3, 4, 5, 6, 7], dtype=’int64’)
indA ^ indB
Int64Index([2, 4, 6, 7], dtype=’int64’)
Se = pd.Series(range(5 ),index=["one" ,"two" ,"three" ,"four" ,"five" ])
print(Se)
print(Se[0 :2 ])
one 0 two 1 three 2 four 3 five 4 dtype: int64 one 0 two 1 dtype: int64
print(Se["one" :"three" ])
one 0 two 1 three 2 dtype: int64
5.Dataframe数据读取
df = pd.read_excel("G:\\python3\\Python_notebook\\第3关:数据分析的基本过程\\朝阳医院2018年销售数据.xlsx" )
print(df.head(5 ))
购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1.616528e+06 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1.616528e+06 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 1.260283e+07 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 1.007034e+10 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 1.015543e+08 236701.0 三九感冒灵 8.0 224.0 208.00
print(df.columns)
Index([‘购药时间’, ‘社保卡号’, ‘商品编码’, ‘商品名称’, ‘销售数量’, ‘应收金额’, ‘实收金额’], dtype=’object’)
print(df.shape,df.count())
(6578, 7) 购药时间 6576 社保卡号 6576 商品编码 6577 商品名称 6577 销售数量 6577 应收金额 6577 实收金额 6577 dtype: int64
print(df.dtypes)
购药时间 object 社保卡号 float64 商品编码 float64 商品名称 object 销售数量 float64 应收金额 float64 实收金额 float64 dtype: object
print(df.head(5 ))
print(df["社保卡号" ].dtype)
df["社保卡号" ] = df["社保卡号" ].map(lambda x : str(x))
print(df.head(5 ))
购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1.616528e+06 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1.616528e+06 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 1.260283e+07 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 1.007034e+10 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 1.015543e+08 236701.0 三九感冒灵 8.0 224.0 208.00 float64 购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1616528.0 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1616528.0 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 12602828.0 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 10070343428.0 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 101554328.0 236701.0 三九感冒灵 8.0 224.0 208.00
print(df.loc[:3 ,"购药时间" :"商品编码" ])
print(df.iloc[:4 ,:3 ])
购药时间 社保卡号 商品编码 0 2018-01-01 星期五 1616528.0 236701.0 1 2018-01-02 星期六 1616528.0 236701.0 2 2018-01-06 星期三 12602828.0 236701.0 3 2018-01-11 星期一 10070343428.0 236701.0 购药时间 社保卡号 商品编码 0 2018-01-01 星期五 1616528.0 236701.0 1 2018-01-02 星期六 1616528.0 236701.0 2 2018-01-06 星期三 12602828.0 236701.0 3 2018-01-11 星期一 10070343428.0 236701.0
ReName = {"购药时间" :"售药时间" }
df.rename(columns = ReName,inplace = True )
print(df.head(5 ))
售药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1616528.0 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1616528.0 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 12602828.0 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 10070343428.0 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 101554328.0 236701.0 三九感冒灵 8.0 224.0 208.00
6. 缺失值处理
6.1. 空值类型–None
Python中内置的None值 Pandas中,将缺失值表示为NA,表示不可用not available。 对于数值数据,pandas使用浮点值NaN(Not a Number)表示缺失数据。
df = pd.DataFrame({"name" : ['Alfred' , 'Batman' , 'Catwoman' ],
... "toy" : [np.nan, '' , 'Bullwhip' ],
... "born" : [pd.NaT, pd.Timestamp("1940-04-25" ),
... pd.NaT],
"age" :[12 ,None ,13 ]})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born age 0 Alfred NaN NaT 12.0 1 Batman 1940-04-25 NaN 2 Catwoman Bullwhip NaT 13.0
6.2 Detecting null values 检测空值
print(df.isnull())
print(df.isna())
print(df.isnull()==df.isna())
name toy born age 0 False True True False 1 False False False True 2 False False True False name toy born age 0 False True True False 1 False False False True 2 False False True False name toy born age 0 True True True True 1 True True True True 2 True True True True
df[df.loc[:,"age" ].notna()]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born age 0 Alfred NaN NaT 12.0 2 Catwoman Bullwhip NaT 13.0
6.3 Dropping null values 删除空值
DataFrame.dropna(axis=0, how=’any’, thresh=None, subset=None, inplace=False)
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born age 0 Alfred NaN NaT 12.0 1 Batman 1940-04-25 NaN 2 Catwoman Bullwhip NaT 13.0
print(df.dropna(axis = 0 ))
print(df.dropna(axis=1 ))
Empty DataFrame Columns: [name, toy, born, age] Index: [] name 0 Alfred 1 Batman 2 Catwoman
print(df.dropna(how = 'any' ,axis=1 ))
print(df.dropna(how = 'all' ,axis=1 ))
name 0 Alfred 1 Batman 2 Catwoman name toy born age 0 Alfred NaN NaT 12.0 1 Batman 1940-04-25 NaN 2 Catwoman Bullwhip NaT 13.0
df.dropna()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
df.dropna(subset=["toy" ,"age" ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born age 2 Catwoman Bullwhip NaT 13.0
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born age 0 Alfred NaN NaT 12.0 1 Batman 1940-04-25 NaN 2 Catwoman Bullwhip NaT 13.0
df.dropna(axis=1 ,inplace=True )
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name 0 Alfred 1 Batman 2 Catwoman
6.4 Filling null values 填充空值
df = pd.DataFrame({"name" : ['Alfred' , 'Batman' , 'Catwoman' ],
... "toy" : [np.nan, 'Batmobile' , 'Bullwhip' ],
... "born" : [pd.NaT, pd.Timestamp("1940-04-25" ),
... pd.NaT]})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT
df.fillna(0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born 0 Alfred 0 0 1 Batman Batmobile 1940-04-25 00:00:00 2 Catwoman Bullwhip 0
df.fillna(method='ffill' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip 1940-04-25
df.fillna(method='bfill' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born 0 Alfred Batmobile 1940-04-25 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT
df.fillna(method='ffill' ,axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name toy born 0 Alfred Alfred Alfred 1 Batman Batmobile 1940-04-25 00:00:00 2 Catwoman Bullwhip Bullwhip
7. Combing Datasets 数据合并
7.1. pd.concat(objs,axis = 0,join = ‘outer’,join_axes = None,ignore_index = False,key = None,levels = None,names = None,verify_intrgrity = False,copy = true)
df1 = pd.DataFrame(np.array(range(12 )).reshape(4 ,3 ),index=[1 ,2 ,3 ,4 ],columns=["one" ,"two" ,"three" ])
df2 = pd.DataFrame(np.array(range(12 ,24 )).reshape(4 ,3 ),index=[1 ,2 ,3 ,4 ],columns=["four" ,"five" ,"six" ])
print(df1)
print(df2)
pd.concat([df1,df2])
one two three 1 0 1 2 2 3 4 5 3 6 7 8 4 9 10 11 four five six 1 12 13 14 2 15 16 17 3 18 19 20 4 21 22 23 D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “””
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
five four one six three two 1 NaN NaN 0.0 NaN 2.0 1.0 2 NaN NaN 3.0 NaN 5.0 4.0 3 NaN NaN 6.0 NaN 8.0 7.0 4 NaN NaN 9.0 NaN 11.0 10.0 1 13.0 12.0 NaN 14.0 NaN NaN 2 16.0 15.0 NaN 17.0 NaN NaN 3 19.0 18.0 NaN 20.0 NaN NaN 4 22.0 21.0 NaN 23.0 NaN NaN
pd.concat([df1,df2],ignore_index=True )
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
five four one six three two 0 NaN NaN 0.0 NaN 2.0 1.0 1 NaN NaN 3.0 NaN 5.0 4.0 2 NaN NaN 6.0 NaN 8.0 7.0 3 NaN NaN 9.0 NaN 11.0 10.0 4 13.0 12.0 NaN 14.0 NaN NaN 5 16.0 15.0 NaN 17.0 NaN NaN 6 19.0 18.0 NaN 20.0 NaN NaN 7 22.0 21.0 NaN 23.0 NaN NaN
pd.concat([df1,df2],axis = 1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three four five six 1 0 1 2 12 13 14 2 3 4 5 15 16 17 3 6 7 8 18 19 20 4 9 10 11 21 22 23
pd.concat([df1,df2],keys=["df1" ,"df2" ])
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
five four one six three two df1 1 NaN NaN 0.0 NaN 2.0 1.0 2 NaN NaN 3.0 NaN 5.0 4.0 3 NaN NaN 6.0 NaN 8.0 7.0 4 NaN NaN 9.0 NaN 11.0 10.0 df2 1 13.0 12.0 NaN 14.0 NaN NaN 2 16.0 15.0 NaN 17.0 NaN NaN 3 19.0 18.0 NaN 20.0 NaN NaN 4 22.0 21.0 NaN 23.0 NaN NaN
reColName = {"four" :"two" ,"five" :"three" }
reRowName = {3 :5 ,4 :6 }
df2.rename(columns=reColName,index=reRowName,inplace=True )
print(df2)
two three six 1 12 13 14 2 15 16 17 5 18 19 20 6 21 22 23
pd.concat([df1,df2])
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one six three two 1 0.0 NaN 2 1 2 3.0 NaN 5 4 3 6.0 NaN 8 7 4 9.0 NaN 11 10 1 NaN 14.0 13 12 2 NaN 17.0 16 15 5 NaN 20.0 19 18 6 NaN 23.0 22 21
pd.concat([df1,df2],join_axes = [df1.columns])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three 1 0.0 1 2 2 3.0 4 5 3 6.0 7 8 4 9.0 10 11 1 NaN 12 13 2 NaN 15 16 5 NaN 18 19 6 NaN 21 22
pd.concat([df1,df2],join = "inner" )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
two three 1 1 2 2 4 5 3 7 8 4 10 11 1 12 13 2 15 16 5 18 19 6 21 22
pd.concat([df1,df2],join = "outer" ,axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three two three six 1 0.0 1.0 2.0 12.0 13.0 14.0 2 3.0 4.0 5.0 15.0 16.0 17.0 3 6.0 7.0 8.0 NaN NaN NaN 4 9.0 10.0 11.0 NaN NaN NaN 5 NaN NaN NaN 18.0 19.0 20.0 6 NaN NaN NaN 21.0 22.0 23.0
7.2 pd.merge()
print(df2)
df2.drop(columns=["two" ,"three" ],inplace=True )
print(df2)
two three six 1 12 13 14 2 15 16 17 5 18 19 20 6 21 22 23 six 1 14 2 17 5 20 6 23
df1 = pd.DataFrame({"name" :["zhangsan" ,"lisi" ,"wangwu" ,"maliu" ],
"work" :["policy" ,"teacher" ,"nuers" ,"students" ],
"age" :[23 ,34 ,45 ,56 ]
})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name work age 0 zhangsan policy 23 1 lisi teacher 34 2 wangwu nuers 45 3 maliu students 56
df2 = pd.DataFrame({"name" :["zhangsan" ,"lisi" ,"wangwu" ,"maliu" ],
"sex" :["f" ,"w" ,"w" ,"f" ]
})
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name sex 0 zhangsan f 1 lisi w 2 wangwu w 3 maliu f
pd.merge(df1,df2)
pd.merge(df1,df2,on="name" )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name work age sex 0 zhangsan policy 23 f 1 lisi teacher 34 w 2 wangwu nuers 45 w 3 maliu students 56 f
df3 = pd.DataFrame({"name" :["zhouqi" ],"sex" :["w" ]})
df2 = pd.concat([df2,df3],ignore_index=True )
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name sex 0 zhangsan f 1 lisi w 2 wangwu w 3 maliu f 4 zhouqi w
pd.merge(df1,df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name work age sex 0 zhangsan policy 23 f 1 lisi teacher 34 w 2 wangwu nuers 45 w 3 maliu students 56 f
8. 时间格式
date = np.array(np.datetime64('2012-05-17' ))
date
array('2012-05-17', dtype='datetime64[D]')
date + range(12 )
array(['2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
'2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
'2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28'],
dtype='datetime64[D]')
print(np.datetime64('2012-05-27' ,'Y' ))
2012
Code Meaning Y Year M Month W Week D Day h Hour m minute s Second
pd.date_range('2018-9-14' ,'2018-9-24' )
DatetimeIndex(['2018-09-14', '2018-09-15', '2018-09-16', '2018-09-17',
'2018-09-18', '2018-09-19', '2018-09-20', '2018-09-21',
'2018-09-22', '2018-09-23', '2018-09-24'],
dtype='datetime64[ns]', freq='D')
pd.date_range('2018-9-14' ,periods=12 ,freq='H' )
DatetimeIndex(['2018-09-14 00:00:00', '2018-09-14 01:00:00',
'2018-09-14 02:00:00', '2018-09-14 03:00:00',
'2018-09-14 04:00:00', '2018-09-14 05:00:00',
'2018-09-14 06:00:00', '2018-09-14 07:00:00',
'2018-09-14 08:00:00', '2018-09-14 09:00:00',
'2018-09-14 10:00:00', '2018-09-14 11:00:00'],
dtype='datetime64[ns]', freq='H')
9. 计算各列数据总和并作为新列添加到末尾
df['Col_sum' ] = df.apply(lambda x: x.sum(), axis=1 )
10. 计算各行数据总和并作为新行添加到末尾
df.loc['Row_sum' ] = df.apply(lambda x: x.sum())