Python之pandas基本操作基础

#为什么要学习pandas,numpy能够帮我们处理数值型的数据,但是这还不够
#而pandas能处理字符串,还有时间序列

import pandas as pd
import numpy as np
import string

#创建一维Series的方法:
# #1,默认索引为从0开始的数值
# a=pd.Series([1,2,3,4,5,6,])
# print (a)
# print(type(a))
# #返回:
# # 0    1
# # 1    2
# # 2    3
# # 3    4
# # 4    5
# # 5    6
# # dtype: int64
# # <class 'pandas.core.series.Series'>
# #2.指定索引创建Series
# a=pd.Series([1,2,3,4,5],index=list("abcde"))
# print (a)
# print(type(a))
# #返回
# # a    1
# # b    2
# # c    3
# # d    4
# # e    5
# # dtype: int64
# # <class 'pandas.core.series.Series'>
# #3.用字典来创建Series,Series的索引(index)就是字典的键,Series的值就是字典的值
# a={"name":'python',"age":25,"tel":10000}
# print(pd.Series(a))
# print(type(a))
# #返回:
# # name    python
# # age         25
# # tel      10000
# # dtype: object
# # <class 'dict'>

# #另一种方法,通过for循环生成字典,索引用string.ascii_uppercase生成的大写字母
# a={string.ascii_uppercase[i]:i for i in range(10)}
# print (a)
# #{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9}
# #将上面的字典生成一个Series
# print(pd.Series(a))
# # A    0
# # B    1
# # C    2
# # D    3
# # E    4
# # F    5
# # G    6
# # H    7
# # I    8
# # J    9
# # dtype: int64
# #和上面的有所区别,指定索引从字母F~O,那么只有F~J能对应到字典的数值,
# # 其它的没有数值,所以值为NaN,并且这时的dtype为float64了
# print(pd.Series(a,index=list(string.ascii_uppercase[5:15])))
# # F    5.0
# # G    6.0
# # H    7.0
# # I    8.0
# # J    9.0
# # K    NaN
# # L    NaN
# # M    NaN
# # N    NaN
# # O    NaN
# # dtype: float64

# #取Series的值,可以按键名取,也可以切片取
# a={"name":'python',"age":25,"tel":10000}
# b=(pd.Series(a))
# print(b["name"])
# #返回:
# #python
#也可以按位置取值(这里取的是单个元素,不是切片;新版pandas建议写成b.iloc[0])
# print(b[0])
# #返回:
# #python

# #Series的两个函数index和values的用法
# print(b.index)
# #返回:
# # Index(['name', 'age', 'tel'], dtype='object')
# print(b.values)
# # 返回:
# # ['python' 25 10000]
#
# #通过布尔(bool)条件索引取值:a>3会得到一个布尔Series,只取出其中为True的元素
# a=pd.Series([1,2,3,4,5,6,])
# print(a[a>3])
# #返回
# # 3    4
# # 4    5
# # 5    6
# # dtype: int64

# #取外部数据,pandas取外部数据的方法非常方便,取csv的格式用read_csv,
# #excel,json,html,sql都有对应的方法,下面以read_csv为例:
# df=pd.read_csv("./pandas_data.csv")
# print(df)
# #输出结果,最左边被自动加了一个Index列(行索引),从0开始
# #    DIVISION_CODE DIVISION_NAME  GROSS_FACTOR
# # 0             jg            gf         12.00
# # 1             dd            dd         66.00
# # 2             bb           aat         55.00
# # 3             ww            ww          0.30
# # 4             CR        Cereal          0.85
# # ..           ...           ...           ...
# # 76           CEM           CEM          1.00
# # 77           WMA        WMAWMA          1.00
# # 78    jgIILLLLLL   jgIILLLLLLJ          0.10
# # 79             2          adfa          2.00
# # 80           l5m           l5m         12.00
# #
# # [81 rows x 3 columns]

# #用DataFrame构建二维series
# a=pd.DataFrame(np.arange(12).reshape(3,4))
# print(a)
# #返回结果:
# #第一列是行索引,表明不同行,横向索引,叫index,0轴,axis=0
# #第一行是列索引,表明不同列,纵向索引,叫columns,1轴,axis=1
# #    0  1   2   3
# # 0  0  1   2   3
# # 1  4  5   6   7
# # 2  8  9  10  11

# #除了上面自动的column和index名,也可以指定名
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# #返回结果
# #    W  X   Y   Z
# # a  0  1   2   3
# # b  4  5   6   7
# # c  8  9  10  11

# #用字典来创建DataFrame
# d1={"name":["xiaoming","xiaowang"],"age":[19,20],"tel":["10000","10086"]}
# t1=pd.DataFrame(d1)
# print (t1)
# #返回结果:
# #        name  age    tel
# # 0  xiaoming   19  10000
# # 1  xiaowang   20  10086
#
# #另一种用字典创建DataFrame方法
# d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}]
# t2=pd.DataFrame(d2)
# print (t2)
# #返回结果:
# #没有指定的位置是NaN
# #        name   age      tel
# # 0  xiaohong  23.0  10010.0
# # 1  xiaogang   NaN  20010.0
# # 2  xiaowang  28.0      NaN
# #DataFrame的一些方法
# print(t2.index)
# #RangeIndex(start=0, stop=3, step=1)
# print(t2.columns)
# #Index(['name', 'age', 'tel'], dtype='object')
# print(t2.values)
# # [['xiaohong' 23.0 10010.0]
# #  ['xiaogang' nan 20010.0]
# #  ['xiaowang' 28.0 nan]]
# print(t2.shape)
# #(3, 3)
# print(t2.dtypes)
# #name     object
# # age     float64
# # tel     float64
# # dtype: object
# print(t2.ndim) #显示维度,结果为2说明是二维series
# #2
# print(t2.head(2))#取前两行数据
# #        name   age      tel
# # 0  xiaohong  23.0  10010.0
# # 1  xiaogang   NaN  20010.0
# print(t2.tail(2))#取后两行数据
# #        name   age      tel
# # 1  xiaogang   NaN  20010.0
# # 2  xiaowang  28.0      NaN
# print(t2.info())#取t2的信息概览,包括行数,列数,列非空值数,列类型,行类型,占用内存大小
# # <class 'pandas.core.frame.DataFrame'>
# # RangeIndex: 3 entries, 0 to 2
# # Data columns (total 3 columns):
# #  #   Column  Non-Null Count  Dtype
# # ---  ------  --------------  -----
# #  0   name    3 non-null      object
# #  1   age     2 non-null      float64
# #  2   tel     2 non-null      float64
# # dtypes: float64(2), object(1)
# # memory usage: 200.0+ bytes
# # None
# print(t2.describe())#快速综合统计结果:计数,均值,标准差,最大值,最小值,四分位数
# #              age           tel
# # count   2.000000      2.000000
# # mean   25.500000  15010.000000
# # std     3.535534   7071.067812
# # min    23.000000  10010.000000
# # 25%    24.250000  12510.000000
# # 50%    25.500000  15010.000000
# # 75%    26.750000  17510.000000
# # max    28.000000  20010.000000

# #DataFrame排序,sort_values的参数by是指排序指定的列,ascending默认为True(升序),为False时是降序
# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# #   DIVISION_CODE DIVISION_NAME  GROSS_FACTOR
# # 0            jg            gf          12.0
# df=df.sort_values(by="GROSS_FACTOR",ascending=False)
# print(df.head(5))
# #返回的值,是以GROSS_FACTOR为倒序排序的
# #    DIVISION_CODE       DIVISION_NAME  GROSS_FACTOR
# # 1             dd                  dd          66.0
# # 2             bb                 aat          55.0
# # 71         test3               test3          22.0
# # 64           tes          yt 0219 V2          15.0
# # 44         yt_up  yt <html> @0219_up          13.0

#DataFrame的切片操作
# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# #   DIVISION_CODE DIVISION_NAME  GROSS_FACTOR
# # 0            jg            gf          12.0
# #方括号里写切片(数字范围),表示取行,对行进行操作
# print(df[:20]) #取df的前20行
# print(df[20:])#取位置20(即第21行)开始到末尾的所有行
# #方括号写字符串,表示取列索引,对列进行操作
# print(df["GROSS_FACTOR"])#取其"GROSS_FACTOR"的列
# print(df[:20]["GROSS_FACTOR"])#取前20行的"GROSS_FACTOR"值

# #用loc切片
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# #    W  X   Y   Z
# # a  0  1   2   3
# # b  4  5   6   7
# # c  8  9  10  11
# print(a.loc["a"])#取a行数据
# # W    0
# # X    1
# # Y    2
# # Z    3
# #Name: a, dtype: int32
# print(a.loc["a","Z"])#取a行Z列数据
# #3
# print(a.loc[["a","c"],"Z"])#取a和c行的Z列
# # a     3
# # c    11
# # Name: Z, dtype: int32
# print(a.loc["a":"c","Z"])
# #这里是按标签切片,取a行到c行的Z列,所以比上面的结果多了b行;需要注意loc的标签切片是包括结尾c行的,和其它的切片操作有区别,其它的切片操作都是包头不包尾
# # a     3
# # b     7
# # c    11
# # Name: Z, dtype: int32

# #用iloc切片,这里的i表示integer(整数位置),即按行、列的位置序号(从0开始)来切片
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# #    W  X   Y   Z
# # a  0  1   2   3
# # b  4  5   6   7
# # c  8  9  10  11
# print(a.iloc[1])#取第2行的数据
# # W    4
# # X    5
# # Y    6
# # Z    7
# # Name: b, dtype: int32
# print(a.iloc[:,1])#取第2列
# # a    1
# # b    5
# # c    9
# # Name: X, dtype: int32
# print(a.iloc[1,1])#取第2行第2列
# #5
# print(a.iloc[1:,1])#取从第2行开始的第2列
# # b    5
# # c    9

# #赋值改变DataFrame的数值
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# print(a)
# #    W  X   Y   Z
# # a  0  1   2   3
# # b  4  5   6   7
# # c  8  9  10  11
#
# a.iloc[1:,1]=100#将第2行开始的第2列值都改为100
# print (a)
# #    W    X   Y   Z
# # a  0    1   2   3
# # b  4  100   6   7
# # c  8  100  10  11
# a.iloc[1:,1]=np.nan#直接改成NaN也是可以的
# print (a)
# #    W    X   Y   Z
# # a  0  1.0   2   3
# # b  4  NaN   6   7
# # c  8  NaN  10  11

# df=pd.read_csv("./pandas_data.csv")
# print(df.head(1))
# #   DIVISION_CODE DIVISION_NAME  GROSS_FACTOR
# # 0            jg            gf          12.0
# print(df["GROSS_FACTOR"])#通过指定columns条件查出所有的行
# # 0     12.00
# # 1     66.00
# # 2     55.00
# # 3      0.30
# # 4      0.85
# #       ...
# # 76     1.00
# # 77     1.00
# # 78     0.10
# # 79     2.00
# # 80    12.00
# print(df[df["GROSS_FACTOR"]>10])#再根据所有行找出>10的行
# #    DIVISION_CODE       DIVISION_NAME  GROSS_FACTOR
# # 0             jg                  gf          12.0
# # 1             dd                  dd          66.0
# # 2             bb                 aat          55.0
# # 44         yt_up  yt <html> @0219_up          13.0
# # 47     yzh''test   yaozihe test % 's          12.0
# # 48      test bus        yzh test bus          12.0
# # 57     yzh''test   yaozihe test % 's          12.0
# # 58       yzh tst          yzh test 2          12.0
# # 61        123'13            yzh test          12.0
# # 64           tes          yt 0219 V2          15.0
# # 71         test3               test3          22.0
# # 80           l5m                 l5m          12.0
#
# #找出"GROSS_FACTOR">10并且<20的值
# #&是且,|是或
# print(df[(df["GROSS_FACTOR"]>10)&(df["GROSS_FACTOR"]<20)])
# #    DIVISION_CODE       DIVISION_NAME  GROSS_FACTOR
# # 0             jg                  gf          12.0
# # 44         yt_up  yt <html> @0219_up          13.0
# # 47     yzh''test   yaozihe test % 's          12.0
# # 48      test bus        yzh test bus          12.0
# # 57     yzh''test   yaozihe test % 's          12.0
# # 58       yzh tst          yzh test 2          12.0
# # 61        123'13            yzh test          12.0
# # 64           tes          yt 0219 V2          15.0
# # 80           l5m                 l5m          12.0

# #处理缺失数据NaN
# #判断数据是否是NaN,用isnull方法
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# #    W    X   Y   Z
# # a  0  1.0   2   3
# # b  4  NaN   6   7
# # c  8  NaN  10  11
# print(pd.isnull(a))#isnull方法判断是否为NaN
# #        W      X      Y      Z
# # a  False  False  False  False
# # b  False   True  False  False
# # c  False   True  False  False
# print(pd.notnull(a))#另一个方法notnull,结果与isnull相反
# #       W      X     Y     Z
# # a  True   True  True  True
# # b  True  False  True  True
# # c  True  False  True  True
# print(pd.notnull(a["X"]))#只取"X"列的notnull,返回的结果为bool类型
# # a     True
# # b    False
# # c    False
# # Name: X, dtype: bool
# print(a[pd.notnull(a["X"])])
# #接着将上面的结果作为布尔索引再套一层,因为上面只有第一行(a行)的结果为True,所以这里只返回第一行的数据
# #bool索引可以看成一个位置矩阵,然后把位置矩阵传给数组,True的取出来,False的不取
# #这里是按"X"列中是否不为NaN得到的一个布尔Series,只有第一行(a行)为True,然后按行索引对齐到DataFrame,取出了第一行的数据
# #    W    X  Y  Z
# # a  0  1.0  2  3

# #dropna的用法,可以将值为NaN的值删除
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# #    W    X   Y   Z
# # a  0  1.0   2   3
# # b  4  NaN   6   7
# # c  8  NaN  10  11
#
# print(a.dropna(axis=0))#axis=0表示删除行,所以删除矩阵里面有NaN的所在行
# #    W    X  Y  Z
# # a  0  1.0  2  3
# print(a.dropna(axis=1))#axis=1表示删除列,所以删除矩阵里面有NaN的所在列
# #    W   Y   Z
# # a  0   2   3
# # b  4   6   7
# # c  8  10  11
# print(a.dropna(axis=0,how="all"))#how="all"表示这一行所有的值都为NaN才删除,how的默认值为"any"
# #    W    X   Y   Z
# # a  0  1.0   2   3
# # b  4  NaN   6   7
# # c  8  NaN  10  11
# print(a.dropna(axis=0,how="any"))#how="any"是默认值,所以和上面不加此参数的结果一样
# #    W    X  Y  Z
# # a  0  1.0  2  3

# #另一个参数inplace,此参数默认为False,为True时原地修改矩阵
# #相当于把结果重新赋值给自己
# a=pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
# a.iloc[1:,1]=np.nan#将第2行开始的第2列值改为NaN
# print(a)
# #a.dropna(axis=0,inplace=True)
# a.dropna(axis=0,inplace=True)
# #加了inplace参数,相当于a=a.dropna(axis=0)
# print(a)
# #返回的结果已改变了a的值
# #    W    X  Y  Z
# # a  0  1.0  2  3

# #填充NaN数据
# d2=[{"name":"xiaohong","age":23,"tel":10010},{"name":"xiaogang","tel":20010},{"name":"xiaowang","age":28}]
# t2=pd.DataFrame(d2)
# print(t2)
# #        name   age      tel
# # 0  xiaohong  23.0  10010.0
# # 1  xiaogang   NaN  20010.0
# # 2  xiaowang  28.0      NaN
# #1.通过fillna方法直接填充指定的值,把NaN的值都填充为100
# print(t2.fillna(100))
# #        name    age      tel
# # 0  xiaohong   23.0  10010.0
# # 1  xiaogang  100.0  20010.0
# # 2  xiaowang   28.0    100.0
# #2.填充平均数
# print(t2.fillna(t2.mean()))
# #        name   age      tel
# # 0  xiaohong  23.0  10010.0
# # 1  xiaogang  25.5  20010.0
# # 2  xiaowang  28.0  15010.0
# t2["age"]=t2["age"].fillna(t2["age"].mean())
# #如果只想更改其中一列的NaN值为平均数
# print (t2)
# #        name   age      tel
# # 0  xiaohong  23.0  10010.0
# # 1  xiaogang  25.5  20010.0
# # 2  xiaowang  28.0      NaN
# print(t2["age"].mean())
# # 25.5
# #age列的平均值为25.5,pandas的mean()默认会跳过NaN(skipna=True),这里和numpy的结果不同
# #numpy的数组,如果这一列或一行的数据只要有NaN的值,mean()的结果为NaN(除非改用np.nanmean)
已标记关键词 清除标记
©️2020 CSDN 皮肤主题: Age of Ai 设计师:meimeiellie 返回首页