Python3----Pandas

Python3–Pandas

1. 导包
import numpy as np
import pandas as pd
2. Series的创建
data = pd.Series(["skey","syl","earth"]) #Series is a one-dimensional array of indexed data.
print(data)
0 skey 1 syl 2 earth dtype: object
data.values #取值
array([‘skey’, ‘syl’, ‘earth’], dtype=object)
data.index
RangeIndex(start=0, stop=3, step=1)
data[:2] #Series取值,总的来讲Series更像是一个版本的Array
0 skey 1 syl dtype: object
info = pd.Series(["job","15","student"],index=["name","age","work"]) #编辑索引
print(info) 
name job age 15 work student dtype: object
info["age"] #取值方法
‘15’
dic = {
    "name":"zha",
    "age":"15"
}
dic2ser = pd.Series(dic)
print(dic2ser,type(dic2ser))  # 利用字典来创建一个Series
name zha age 15 dtype: object
print(
pd.Series({
    1:1,
    3:13,
    2:14
})
)
1 1 3 13 2 14 dtype: int64
3.Dataframe的创建与读取
df = pd.DataFrame(np.array(range(10)).reshape(2,5),index=["one","two"],columns=[1,2,3,4,5])
print(df)
1 2 3 4 5 one 0 1 2 3 4 two 5 6 7 8 9
print(df.index,"\n",df.columns)
Index([‘one’, ‘two’], dtype=’object’) Int64Index([1, 2, 3, 4, 5], dtype=’int64’)
print(df[1]) #取列数据
one 0 two 5 Name: 1, dtype: int32
#.ix is deprecated. Please use
#.loc for label based indexing or
#.iloc for positional indexing

print(df.loc["one"]) #取行数据,即记录
print(df.iloc[0])
1 0 2 1 3 2 4 3 5 4 Name: one, dtype: int32 1 0 2 1 3 2 4 3 5 4 Name: one, dtype: int32
4. As joins across datasets
indA = pd.Index([1,2,3,4,5,6])
indB = pd.Index([1,3,5,7])
indA & indB #即数据库中的内连接,两个集合的交集
Int64Index([1, 3, 5], dtype=’int64’)
indA | indB #即数据库中的外链接,两个集合的并集
Int64Index([1, 2, 3, 4, 5, 6, 7], dtype=’int64’)
indA ^ indB #两个集合中不重复的部分
Int64Index([2, 4, 6, 7], dtype=’int64’)
Se = pd.Series(range(5),index=["one","two","three","four","five"])
print(Se)
print(Se[0:2]) #按照下标取值,不包含尾端元素
one 0 two 1 three 2 four 3 five 4 dtype: int64 one 0 two 1 dtype: int64
print(Se["one":"three"]) #按照index取值,包含尾端元素
one 0 two 1 three 2 dtype: int64
5.Dataframe数据读取
df = pd.read_excel("G:\\python3\\Python_notebook\\第3关:数据分析的基本过程\\朝阳医院2018年销售数据.xlsx")
print(df.head(5)) #查看前五行数据
购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1.616528e+06 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1.616528e+06 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 1.260283e+07 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 1.007034e+10 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 1.015543e+08 236701.0 三九感冒灵 8.0 224.0 208.00
print(df.columns)
Index([‘购药时间’, ‘社保卡号’, ‘商品编码’, ‘商品名称’, ‘销售数量’, ‘应收金额’, ‘实收金额’], dtype=’object’)
print(df.shape,df.count())
(6578, 7) 购药时间 6576 社保卡号 6576 商品编码 6577 商品名称 6577 销售数量 6577 应收金额 6577 实收金额 6577 dtype: int64
print(df.dtypes)
购药时间 object 社保卡号 float64 商品编码 float64 商品名称 object 销售数量 float64 应收金额 float64 实收金额 float64 dtype: object
print(df.head(5))
print(df["社保卡号"].dtype)
df["社保卡号"] = df["社保卡号"].map(lambda x : str(x)) #map方法和匿名函数使用,改变某一列的数据类型
print(df.head(5))
购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1.616528e+06 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1.616528e+06 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 1.260283e+07 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 1.007034e+10 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 1.015543e+08 236701.0 三九感冒灵 8.0 224.0 208.00 float64 购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1616528.0 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1616528.0 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 12602828.0 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 10070343428.0 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 101554328.0 236701.0 三九感冒灵 8.0 224.0 208.00
print(df.loc[:3,"购药时间":"商品编码"])
print(df.iloc[:4,:3])
购药时间 社保卡号 商品编码 0 2018-01-01 星期五 1616528.0 236701.0 1 2018-01-02 星期六 1616528.0 236701.0 2 2018-01-06 星期三 12602828.0 236701.0 3 2018-01-11 星期一 10070343428.0 236701.0 购药时间 社保卡号 商品编码 0 2018-01-01 星期五 1616528.0 236701.0 1 2018-01-02 星期六 1616528.0 236701.0 2 2018-01-06 星期三 12602828.0 236701.0 3 2018-01-11 星期一 10070343428.0 236701.0
ReName = {"购药时间":"售药时间"} #创建一个重命名字典序
df.rename(columns = ReName,inplace = True) #重新命名列名,并更新inplace = True
print(df.head(5))
售药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额 0 2018-01-01 星期五 1616528.0 236701.0 强力VC银翘片 6.0 82.8 69.00 1 2018-01-02 星期六 1616528.0 236701.0 清热解毒口服液 1.0 28.0 24.64 2 2018-01-06 星期三 12602828.0 236701.0 感康 2.0 16.8 15.00 3 2018-01-11 星期一 10070343428.0 236701.0 三九感冒灵 1.0 28.0 28.00 4 2018-01-15 星期五 101554328.0 236701.0 三九感冒灵 8.0 224.0 208.00
6. 缺失值处理
6.1. 空值类型–None
  1. Python中内置的None值
  2. Pandas中,将缺失值表示为NA,表示不可用not available。
  3. 对于数值数据,pandas使用浮点值NaN(Not a Number)表示缺失数据。
 df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
...                    "toy": [np.nan, '', 'Bullwhip'],
...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
...                             pd.NaT],
                   "age":[12,None,13]}) # Pandas自动将Python中的数值空值转化为NAN
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
0AlfredNaNNaT12.0
1Batman1940-04-25NaN
2CatwomanBullwhipNaT13.0
6.2 Detecting null values 检测空值
print(df.isnull()) #The isnull() and isnull() methods produce similar Boolean results for DataFrames.
print(df.isna())

print(df.isnull()==df.isna())
name toy born age 0 False True True False 1 False False False True 2 False False True False name toy born age 0 False True True False 1 False False False True 2 False False True False name toy born age 0 True True True True 1 True True True True 2 True True True True
df[df.loc[:,"age"].notna()] #选择年龄不为空的记录
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
0AlfredNaNNaT12.0
2CatwomanBullwhipNaT13.0
6.3 Dropping null values 删除空值

DataFrame.dropna(axis=0, how=’any’, thresh=None, subset=None, inplace=False)

df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
0AlfredNaNNaT12.0
1Batman1940-04-25NaN
2CatwomanBullwhipNaT13.0
print(df.dropna(axis = 0)) #返回没有空值的行

print(df.dropna(axis=1)) #返回没有空值的列
Empty DataFrame Columns: [name, toy, born, age] Index: [] name 0 Alfred 1 Batman 2 Catwoman
print(df.dropna(how = 'any',axis=1)) #how = 'any',axis=1:列中有任意数量的空值我们都要删除
print(df.dropna(how = 'all',axis=1)) #列的值都为空,我们才删除该列
name 0 Alfred 1 Batman 2 Catwoman name toy born age 0 Alfred NaN NaT 12.0 1 Batman 1940-04-25 NaN 2 Catwoman Bullwhip NaT 13.0
df.dropna() #行中有空值即删除改行
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
df.dropna(subset=["toy","age"]) #删除toy、age中有空值的记录
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
2CatwomanBullwhipNaT13.0
df #所有的操作均没有改变原来的数据
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoybornage
0AlfredNaNNaT12.0
1Batman1940-04-25NaN
2CatwomanBullwhipNaT13.0
df.dropna(axis=1,inplace=True)
df #这个修改了原数据,尽量不用
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
name
0Alfred
1Batman
2Catwoman
6.4 Filling null values 填充空值
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
...                             pd.NaT]})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoyborn
0AlfredNaNNaT
1BatmanBatmobile1940-04-25
2CatwomanBullwhipNaT
df.fillna(0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoyborn
0Alfred00
1BatmanBatmobile1940-04-25 00:00:00
2CatwomanBullwhip0
df.fillna(method='ffill') #forward-fill 向前填充
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoyborn
0AlfredNaNNaT
1BatmanBatmobile1940-04-25
2CatwomanBullwhip1940-04-25
df.fillna(method='bfill') #back-fill 向后填充
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoyborn
0AlfredBatmobile1940-04-25
1BatmanBatmobile1940-04-25
2CatwomanBullwhipNaT
df.fillna(method='ffill',axis=1) #左右控制
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nametoyborn
0AlfredAlfredAlfred
1BatmanBatmobile1940-04-25 00:00:00
2CatwomanBullwhipBullwhip
7. Combing Datasets 数据合并
7.1. pd.concat(objs,axis = 0,join = ‘outer’,join_axes = None,ignore_index = False,key = None,levels = None,names = None,verify_intrgrity = False,copy = true)
df1 = pd.DataFrame(np.array(range(12)).reshape(4,3),index=[1,2,3,4],columns=["one","two","three"])
df2 = pd.DataFrame(np.array(range(12,24)).reshape(4,3),index=[1,2,3,4],columns=["four","five","six"])
print(df1)
print(df2)
pd.concat([df1,df2]) #default axis = 0
one two three 1 0 1 2 2 3 4 5 3 6 7 8 4 9 10 11 four five six 1 12 13 14 2 15 16 17 3 18 19 20 4 21 22 23 D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “””
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
fivefouronesixthreetwo
1NaNNaN0.0NaN2.01.0
2NaNNaN3.0NaN5.04.0
3NaNNaN6.0NaN8.07.0
4NaNNaN9.0NaN11.010.0
113.012.0NaN14.0NaNNaN
216.015.0NaN17.0NaNNaN
319.018.0NaN20.0NaNNaN
422.021.0NaN23.0NaNNaN
pd.concat([df1,df2],ignore_index=True) #忽略原来的索引,新建索引
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
fivefouronesixthreetwo
0NaNNaN0.0NaN2.01.0
1NaNNaN3.0NaN5.04.0
2NaNNaN6.0NaN8.07.0
3NaNNaN9.0NaN11.010.0
413.012.0NaN14.0NaNNaN
516.015.0NaN17.0NaNNaN
619.018.0NaN20.0NaNNaN
722.021.0NaN23.0NaNNaN
pd.concat([df1,df2],axis = 1) 
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onetwothreefourfivesix
1012121314
2345151617
3678181920
491011212223
pd.concat([df1,df2],keys=["df1","df2"]) #标识数据的来源
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
fivefouronesixthreetwo
df11NaNNaN0.0NaN2.01.0
2NaNNaN3.0NaN5.04.0
3NaNNaN6.0NaN8.07.0
4NaNNaN9.0NaN11.010.0
df2113.012.0NaN14.0NaNNaN
216.015.0NaN17.0NaNNaN
319.018.0NaN20.0NaNNaN
422.021.0NaN23.0NaNNaN
# df1 = pd.DataFrame(np.array(range(12)).reshape(4,3),index=[1,2,3,4],columns=["one","two","three"])
# df2 = pd.DataFrame(np.array(range(12,24)).reshape(4,3),index=[1,2,3,4],columns=["four","five","six"])
reColName = {"four":"two","five":"three"}
reRowName = {3:5,4:6}
df2.rename(columns=reColName,index=reRowName,inplace=True)
print(df2)
two three six 1 12 13 14 2 15 16 17 5 18 19 20 6 21 22 23
pd.concat([df1,df2])
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass ‘sort=True’. To retain the current behavior and silence the warning, pass sort=False “”“Entry point for launching an IPython kernel.
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onesixthreetwo
10.0NaN21
23.0NaN54
36.0NaN87
49.0NaN1110
1NaN14.01312
2NaN17.01615
5NaN20.01918
6NaN23.02221
pd.concat([df1,df2],join_axes = [df1.columns]) #根据df1的列来填充表,类似于数据库的左联接
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onetwothree
10.012
23.045
36.078
49.01011
1NaN1213
2NaN1516
5NaN1819
6NaN2122
pd.concat([df1,df2],join = "inner") #列名的交集
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
twothree
112
245
378
41011
11213
21516
51819
62122
pd.concat([df1,df2],join = "outer",axis=1) #索引名的并集
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onetwothreetwothreesix
10.01.02.012.013.014.0
23.04.05.015.016.017.0
36.07.08.0NaNNaNNaN
49.010.011.0NaNNaNNaN
5NaNNaNNaN18.019.020.0
6NaNNaNNaN21.022.023.0
7.2 pd.merge()
print(df2)
df2.drop(columns=["two","three"],inplace=True) #删除制定列
print(df2)
two three six 1 12 13 14 2 15 16 17 5 18 19 20 6 21 22 23 six 1 14 2 17 5 20 6 23
df1 = pd.DataFrame({"name":["zhangsan","lisi","wangwu","maliu"],
                  "work":["policy","teacher","nuers","students"],
                   "age":[23,34,45,56]
                   })
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nameworkage
0zhangsanpolicy23
1lisiteacher34
2wangwunuers45
3maliustudents56
df2 = pd.DataFrame({"name":["zhangsan","lisi","wangwu","maliu"],
                  "sex":["f","w","w","f"]
                   })
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
namesex
0zhangsanf
1lisiw
2wangwuw
3maliuf
pd.merge(df1,df2)
pd.merge(df1,df2,on="name") #按照name属性合并
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nameworkagesex
0zhangsanpolicy23f
1lisiteacher34w
2wangwunuers45w
3maliustudents56f
df3 = pd.DataFrame({"name":["zhouqi"],"sex":["w"]})
df2 = pd.concat([df2,df3],ignore_index=True)
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
namesex
0zhangsanf
1lisiw
2wangwuw
3maliuf
4zhouqiw
pd.merge(df1,df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
nameworkagesex
0zhangsanpolicy23f
1lisiteacher34w
2wangwunuers45w
3maliustudents56f
8. 时间格式
date = np.array(np.datetime64('2012-05-17')) #Numpy中的时间格式
date
array('2012-05-17', dtype='datetime64[D]')
date + range(12)
array(['2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
       '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
       '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28'],
      dtype='datetime64[D]')
print(np.datetime64('2012-05-27','Y'))
2012
CodeMeaning
YYear
MMonth
WWeek
DDay
hHour
mminute
sSecond
pd.date_range('2018-9-14','2018-9-24')
DatetimeIndex(['2018-09-14', '2018-09-15', '2018-09-16', '2018-09-17',
               '2018-09-18', '2018-09-19', '2018-09-20', '2018-09-21',
               '2018-09-22', '2018-09-23', '2018-09-24'],
              dtype='datetime64[ns]', freq='D')
pd.date_range('2018-9-14',periods=12,freq='H')
DatetimeIndex(['2018-09-14 00:00:00', '2018-09-14 01:00:00',
               '2018-09-14 02:00:00', '2018-09-14 03:00:00',
               '2018-09-14 04:00:00', '2018-09-14 05:00:00',
               '2018-09-14 06:00:00', '2018-09-14 07:00:00',
               '2018-09-14 08:00:00', '2018-09-14 09:00:00',
               '2018-09-14 10:00:00', '2018-09-14 11:00:00'],
              dtype='datetime64[ns]', freq='H')
9. 计算各列数据总和并作为新列添加到末尾
df['Col_sum'] = df.apply(lambda x: x.sum(), axis=1)
10. 计算各行数据总和并作为新行添加到末尾
df.loc['Row_sum'] = df.apply(lambda x: x.sum())
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值