#为什么要学习pandas,numpy能够帮我们处理数值型的数据,但是这还不够
#而pandas能处理字符串,还有时间序列
import pandas as pd
import numpy as np
import string
#创建一维Series的方法:
# #1,默认索引为从0开始的数值
# a=pd.Series([1,2,3,4,5,6,])
# print (a)
# print(type(a))
# #返回:
# # 0 1
# # 1 2
# # 2 3
# # 3 4
# # 4 5
# # 5 6
# # dtype: int64
# # <class 'pandas.core.series.Series'>
# #2.指定索引创建Series
# a=pd.Series([1,2,3,4,5],index=list("abcde"))
# print (a)
# print(type(a))
# #返回
# # a 1
# # b 2
# # c 3
# # d 4
# # e 5
# # dtype: int64
# # <class 'pandas.core.series.Series'>
# #3.用字典来创建Series,Series的键就是字典的键,Series的值就是字典的值
# a={"name":'python',"age":25,"tel":10000}
# print(pd.Series(a))
# print(type(a))
# #返回:
# # name python
# # age 25
# # tel 10000
# # dtype: object
# # <class 'dict'>
# #另一种方法,通过for循环生成字典,索引用string.ascii_uppercase生成的大写字母
# a={string.ascii_uppercase[i]:i for i in range(10)}
# print (a)
# #{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9}
# #将上面的字典生成一个Series
# print(pd.Series(a))
# # A 0
# # B 1
# # C 2
# # D 3
# # E 4
# # F 5
# # G 6
# # H 7
# # I 8
# # J 9
# # dtype: int64
# #和上面的有所区别,指定索引从字母F~O,那么只有F~J能对应到字典的数值,
# # 其它的没有数值,所以值为NaN,并且这时的dtype为float64了
# print(pd.Series(a,index=list(string.ascii_uppercase[5:15])))
# # F 5.0
# # G 6.0
# # H 7.0
# # I 8.0
# # J 9.0
# # K NaN
# # L NaN
# # M NaN
# # N NaN
# # O NaN
# # dtype: float64
# #取Series的值,可以按键名取,也可以切片取
# a={"name":'python',"age":25,"tel":10000}
# b=(pd.Series(a))
# print(b["name"])
# #返回:
# #python
#也可以按位置来取值(新版pandas推荐写成b.iloc[0],在标签索引的Series上用b[0]按位置取值已被弃用)
# print(b[0])
# #返回:
# #python
# #Series的两个函数index和values的用法
# print(b.index)
# #返回:
# # Index(['name', 'age', 'tel'], dtype='object')
# print(b.values)
# # 返回:
# # ['python' 25 10000]
#
# #通过对值做bool(布尔)判断来取值
# a=pd.Series([1,2,3,4,5,6,])
# print(a[a>3])
# #返回
# # 3 4
# # 4 5
# # 5 6
# # dtype: int64
# #取外部数据,pandas取外部数据的方法非常方便,取csv的格式用read_csv,
# #excel,json,html,sql都有对应的方法,下面以read_csv为例:
# df=pd.read_csv("./pandas_data.csv")
# print(df)
# #输出结果,最左边被加了一个Index列,从0开始
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.00
# # 1 dd dd 66.00
# # 2 bb aat 55.00
# # 3 ww ww 0.30
# # 4 CR Cereal 0.85
# # .. ... ... ...
# # 76 CEM CEM 1.00
# # 77 WMA WMAWMA 1.00
# # 78 jgIILLLLLL jgIILLLLLLJ 0.10
# # 79 2 adfa 2.00
# # 80 l5m l5m 12.00
# #
# # [81 rows x 3 columns]
# #用DataFrame构建二维series
# a=pd.DataFrame(np.arange(12).reshape(3,4))
# print(a)
# #返回结果:
# #第一列是行索引,表明不同行,横向索引,叫index,0轴,axis=0
# #第一行是列索引,表明不同列,纵向索引,叫columns,1轴,axis=1
# # 0 1 2 3
# # 0 0 1 2 3
# # 1 4 5 6 7
# # 2 8 9 10 11
# #除了上面自动的column和index名,也可以指定名
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# #返回结果
# # W X Y Z
# # a 0 1 2 3
# # b 4 5 6 7
# # c 8 9 10 11
# #用字典来创建DataFrame
# d1={"name":["xiaoming","xiaowang"],"aga":[19,20],"tel":["10000","10086"]}
# t1=pd.DataFrame(d1)
# print (t1)
# #返回结果:
# # name aga tel
# # 0 xiaoming 19 10000
# # 1 xiaowang 20 10086
#
# #另一种用字典创建DataFrame方法
# d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}]
# t2=pd.DataFrame(d2)
# print (t2)
# #返回结果:
# #没有指定的位置是NaN
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang NaN 20010.0
# # 2 xiaowang 28.0 NaN
# #DataFrame的一些方法
# print(t2.index)
# #RangeIndex(start=0, stop=3, step=1)
# print(t2.columns)
# #Index(['name', 'age', 'tel'], dtype='object')
# print(t2.values)
# # [['xiaohong' 23.0 10010.0]
# # ['xiaogang' nan 20010.0]
# # ['xiaowang' 28.0 nan]]
# print(t2.shape)
# #(3, 3)
# print(t2.dtypes)
# #name object
# # age float64
# # tel float64
# # dtype: object
# print(t2.ndim) #显示维度,结果为2说明是二维series
# #2
# print(t2.head(2))#取前两行数据
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang NaN 20010.0
# print(t2.tail(2))#取后两行数据
# # name age tel
# # 1 xiaogang NaN 20010.0
# # 2 xiaowang 28.0 NaN
# print(t2.info())#取t2的信息概览,包括行数,列数,列非空值数,列类型,行类型,占用内存大小
# # <class 'pandas.core.frame.DataFrame'>
# # RangeIndex: 3 entries, 0 to 2
# # Data columns (total 3 columns):
# # # Column Non-Null Count Dtype
# # --- ------ -------------- -----
# # 0 name 3 non-null object
# # 1 age 2 non-null float64
# # 2 tel 2 non-null float64
# # dtypes: float64(2), object(1)
# # memory usage: 200.0+ bytes
# # None
# print(t2.describe())#快速综合统计结果:计数,均值,标准差,最大值,最小值,四分位数
# # age tel
# # count 2.000000 2.000000
# # mean 25.500000 15010.000000
# # std 3.535534 7071.067812
# # min 23.000000 10010.000000
# # 25% 24.250000 12510.000000
# # 50% 25.500000 15010.000000
# # 75% 26.750000 17510.000000
# # max 28.000000 20010.000000
# #DataFrame排序,sort_values的参数by是指排序指定的列,ascending默认为True(顺序)
# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.0
# df=df.sort_values(by="GROSS_FACTOR",ascending=False)
# print(df.head(5))
# #返回的值,是以GROSS_FACTOR为倒序排序的
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 1 dd dd 66.0
# # 2 bb aat 55.0
# # 71 test3 test3 22.0
# # 64 tes yt 0219 V2 15.0
# # 44 yt_up yt <html> @0219_up 13.0
#DataFrame的切片操作
# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.0
# #方括号写切片(数字范围),表示取行,对行进行操作
# print(df[:20]) #取df的前20行
# print(df[20:])#取前20行之后的所有数据(即从第21行开始)
# #方括号写字符串,表示取列索引,对列进行操作
# print(df["GROSS_FACTOR"])#取其"GROSS_FACTOR"的列
# print(df[:20]["GROSS_FACTOR"])#取前20行的"GROSS_FACTOR"值
# #用loc切片
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# # W X Y Z
# # a 0 1 2 3
# # b 4 5 6 7
# # c 8 9 10 11
# print(a.loc["a"])#取a行数据
# # W 0
# # X 1
# # Y 2
# # Z 3
# #Name: a, dtype: int32
# print(a.loc["a","Z"])#取a行Z列数据
# #3
# print(a.loc[["a","c"],"Z"])#取a和c行的Z列
# # a 3
# # c 11
# # Name: Z, dtype: int32
# print(a.loc["a":"c","Z"])
# #这里用标签切片"a":"c"取a到c行的Z列(比上面多了b行),需要注意这里的结果是包括c行的,和其它的切片操作有区别,其它的切片操作都是包头不包尾
# # a 3
# # b 7
# # c 11
# # Name: Z, dtype: int32
# #用iloc切片,这里的i表示integer(整数位置),用行/列的位置序号来切片
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# # W X Y Z
# # a 0 1 2 3
# # b 4 5 6 7
# # c 8 9 10 11
# print(a.iloc[1])#取第2行的数据
# # W 4
# # X 5
# # Y 6
# # Z 7
# # Name: b, dtype: int32
# print(a.iloc[:,1])#取第2列
# # a 1
# # b 5
# # c 9
# # Name: X, dtype: int32
# print(a.iloc[1,1])#取第2行第2列
# #5
# print(a.iloc[1:,1])#取从第2行开始的第2列
# # b 5
# # c 9
# #赋值改变DataFrame的数值
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# # W X Y Z
# # a 0 1 2 3
# # b 4 5 6 7
# # c 8 9 10 11
#
# a.iloc[1:,1]=100#将第2行开始的第2列值都改为100
# print (a)
# # W X Y Z
# # a 0 1 2 3
# # b 4 100 6 7
# # c 8 100 10 11
# a.iloc[1:,1]=np.nan#直接改成NaN也是可以的
# print (a)
# # W X Y Z
# # a 0 1.0 2 3
# # b 4 NaN 6 7
# # c 8 NaN 10 11
# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.0
# print(df["GROSS_FACTOR"])#通过指定columns条件查出所有的行
# # 0 12.00
# # 1 66.00
# # 2 55.00
# # 3 0.30
# # 4 0.85
# # ...
# # 76 1.00
# # 77 1.00
# # 78 0.10
# # 79 2.00
# # 80 12.00
# print(df[df["GROSS_FACTOR"]>10])#再根据所有行找出>10的行
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.0
# # 1 dd dd 66.0
# # 2 bb aat 55.0
# # 44 yt_up yt <html> @0219_up 13.0
# # 47 yzh''test yaozihe test % 's 12.0
# # 48 test bus yzh test bus 12.0
# # 57 yzh''test yaozihe test % 's 12.0
# # 58 yzh tst yzh test 2 12.0
# # 61 123'13 yzh test 12.0
# # 64 tes yt 0219 V2 15.0
# # 71 test3 test3 22.0
# # 80 l5m l5m 12.0
#
# #找出"GROSS_FACTOR">10并且<20的值
# #&是且,|是或
# print(df[(df["GROSS_FACTOR"]>10)&(df["GROSS_FACTOR"]<20)])
# # DIVISION_CODE DIVISION_NAME GROSS_FACTOR
# # 0 jg gf 12.0
# # 44 yt_up yt <html> @0219_up 13.0
# # 47 yzh''test yaozihe test % 's 12.0
# # 48 test bus yzh test bus 12.0
# # 57 yzh''test yaozihe test % 's 12.0
# # 58 yzh tst yzh test 2 12.0
# # 61 123'13 yzh test 12.0
# # 64 tes yt 0219 V2 15.0
# # 80 l5m l5m 12.0
# #处理缺失数据NaN
# #判断数据是否是NaN,用isnull方法
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# # W X Y Z
# # a 0 1.0 2 3
# # b 4 NaN 6 7
# # c 8 NaN 10 11
# print(pd.isnull(a))#isnull方法判断是否为NaN
# # W X Y Z
# # a False False False False
# # b False True False False
# # c False True False False
# print(pd.notnull(a))#另一个方法notnull,结果与isnull相反
# # W X Y Z
# # a True True True True
# # b True False True True
# # c True False True True
# print(pd.notnull(a["X"]))#只取"X"列的notnull,返回的结果为bool类型
# # a True
# # b False
# # c False
# # Name: X, dtype: bool
# print(a[pd.notnull(a["X"])])
# #接着将上面的内容再套一层,因为上面只有第一行结果返回为True,所以这里只返回第一行的数据
# #bool索引可以看成一个位置矩阵,然后把位置矩阵传给数组,True的取出来,False的不取
# #这里是按"X"列中是否为NaN得到的一个bool序列,只有第一行返回为True,把它作为行的掩码传给DataFrame,就取出了第一行的数据
# # W X Y Z
# # a 0 1.0 2 3
# #dropna的用法,可以将值为NaN的值删除
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# # W X Y Z
# # a 0 1.0 2 3
# # b 4 NaN 6 7
# # c 8 NaN 10 11
#
# print(a.dropna(axis=0))#axis=0表示删除行,所以删除矩阵里面有NaN的所在行
# # W X Y Z
# # a 0 1.0 2 3
# print(a.dropna(axis=1))#axis=1表示删除列,所以删除矩阵里面有NaN的所在列
# # W Y Z
# # a 0 2 3
# # b 4 6 7
# # c 8 10 11
# print(a.dropna(axis=0,how="all"))#how参数说明这一行所有的值都为NaN才删除,默认为"any"
# # W X Y Z
# # a 0 1.0 2 3
# # b 4 NaN 6 7
# # c 8 NaN 10 11
# print(a.dropna(axis=0,how="any"))#how="any"是默认值,所以和上面不加此参数的结果一样
# # W X Y Z
# # a 0 1.0 2 3
# #另一个参数inplace,此参数默认为False,为True时原地修改矩阵
# #相当于把结果重新赋值给自己
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# #a.dropna(axis=0,inplace=True)
# a.dropna(axis=0,inplace=True)
# #加了inplace=True参数,相当于a=a.dropna(axis=0)
# print(a)
# #返回的结果已改变了a的值
# # W X Y Z
# # a 0 1.0 2 3
# #填充NaN数据
# d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}]
# t2=pd.DataFrame(d2)
# print(t2)
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang NaN 20010.0
# # 2 xiaowang 28.0 NaN
# #1.通过fillna方法直接填充指定的值,把NaN的值都填充为100
# print(t2.fillna(100))
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang 100.0 20010.0
# # 2 xiaowang 28.0 100.0
# #2.填充平均数
# print(t2.fillna(t2.mean(numeric_only=True)))#name列是字符串,numeric_only=True表示只对数值列求均值(pandas 2.0起不加会报TypeError)
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang 25.5 20010.0
# # 2 xiaowang 28.0 15010.0
# t2["age"]=t2["age"].fillna(t2["age"].mean())
# #如果只想更改其中一列的NaN值为平均数
# print (t2)
# # name age tel
# # 0 xiaohong 23.0 10010.0
# # 1 xiaogang 25.5 20010.0
# # 2 xiaowang 28.0 NaN
# print(t2["age"].mean())
# # 25.5
# #age列的平均值为25.5,这里和numpy的结果不同
# #numpy的矩阵,如果这一列或一行的数据只要有NaN的值,mean()的结果为NaN
# Python之pandas基本操作基础
# 最新推荐文章于 2021-10-29 00:17:58 发布