pandas_DataFrame基础

3 篇文章 0 订阅
1 篇文章 0 订阅
'''
【课程2.5】  Pandas数据结构Dataframe:基本概念及创建

"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。

Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。

'''
import pandas as pd
import numpy as np
# DataFrame 数据结构
# DataFrame是表格型的数据结构“带有标签的二维数组--index--columns”
# index--行标签 , columns--列标签
# Build a small labelled table: each dict key becomes a column and the
# `index` argument supplies the row labels.
df = pd.DataFrame(
    data={
        'name': ['J', 'M', 'D'],
        'age': [18, 19, 20],
        'gender': ['m', 'w', 'm'],
    },
    index=['a', 'b', 'c'],
)
print(df)
   age gender name
a   18      m    J
b   19      w    M
c   20      m    D
Index(['a', 'b', 'c'], dtype='object')
print(df.index)
Index(['a', 'b', 'c'], dtype='object')
print(df.columns)
Index(['age', 'gender', 'name'], dtype='object')
# DataFrame创建方法
# 由数组/list组成的字典
# Creation method 1: a dict whose values are equal-length lists.
# No index is given here, so the rows get the default 0..n-1 labels.
df1 = pd.DataFrame(data=dict(name=['a', 'b', 'c'], age=[20, 21, 22]))
print(df1)

# Same approach with NumPy arrays as the column values and custom row labels.
df2 = pd.DataFrame(
    data={'one': np.random.rand(3), 'two': np.random.rand(3)},
    index=['a', 'b', 'c'],
)
print(df2)
# 字典的key就是df的columns
# 字典的value是df的values
# 字典中各个value的长度需要保持一致
   age name
0   20    a
1   21    b
2   22    c
        one       two
a  0.154260  0.372618
b  0.544828  0.497320
c  0.811307  0.071709
# Passing `columns` selects and orders the columns; a requested column that
# is absent from the data ('three') is filled with NaN.
df2 = pd.DataFrame(
    data={'one': np.random.rand(3), 'two': np.random.rand(3)},
    index=['a', 'b', 'c'],
    columns=['two', 'three', 'one'],
)
print(df2)
# columns 可以重新排序,若不存在则填充为NaN
        two three       one
a  0.593346   NaN  0.772806
b  0.556199   NaN  0.025852
c  0.705040   NaN  0.173555
# 创建方法2 由Series组成的字典
# Creation method 2: a dict of Series. With no explicit index on the Series,
# both share the default integer labels.
df1 = pd.DataFrame(
    data=dict(
        one=pd.Series(np.random.rand(3)),
        two=pd.Series(np.random.rand(3)),
    )
)
print(df1)
        one       two
0  0.414850  0.665624
1  0.249675  0.699969
2  0.702770  0.900113
# Series of different lengths are aligned on their index labels; the label
# that 'two' lacks ('c') shows up as NaN.
df2 = pd.DataFrame(
    data=dict(
        one=pd.Series(np.random.rand(3), index=['a', 'b', 'c']),
        two=pd.Series(np.random.rand(2), index=['a', 'b']),
    )
)
print(df2)
# 字典的key为df的columns,df的index为Series的index
# Series的长度可以不一样,会产生NaN
        one       two
a  0.288482  0.007029
b  0.314350  0.708146
c  0.891189       NaN
# DataFrame 通过二维数组直接创建
# Creation method 3: wrap a 2-D NumPy array, naming rows and columns explicitly.
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['a', 'b', 'c'],
    columns=['x', 'y', 'z'],
)
print(df1)
   x  y  z
a  0  1  2
b  3  4  5
c  6  7  8
# DataFrame 字典组成的列表
# Creation method 4: a list of dicts, one dict per row. The union of all keys
# forms the columns; keys missing from a row become NaN.
df1 = pd.DataFrame(
    data=[
        dict(one='a', two='b'),
        dict(three='c', four='d'),
    ],
    index=['a', 'b'],
)
print(df1)
# index不指定则为默认整数索引
# 字典的key为df的columns,缺失的值填充为NaN
  four  one three  two
a  NaN    a   NaN    b
b    d  NaN     c  NaN
# 由字典组成的字典创建
# Creation method 5: a dict of dicts. Outer keys become the columns and the
# inner keys become the row labels.
df = pd.DataFrame(
    data={
        'A': dict(math=80, art=90),
        'B': dict(math=80, art=90),
    }
)
print(df)
# df的columns为外层字典的key,df的index为内层字典的key
       A   B
art   90  90
math  80  80
'''
【课程2.6】  Pandas数据结构Dataframe:索引

Dataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)

选择列 / 选择行 / 切片 / 布尔判断

'''
# Sample 3x4 table of random values scaled to [0, 100), used by the
# column/row selection examples that follow.
df = pd.DataFrame(
    data=np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
               a          b          c          d
one    43.412719  76.530491  16.054262   1.146777
two    69.950579  67.093175  23.407335  43.765119
three  70.160305  81.423366  20.290999  78.110449
# 选择列
data1=df['a']
print(data1)
data2=df[['a','c','d']]
print(data2)
# 按照列名选择列,一列输出Series,多列输出DataFrame
one      43.412719
two      69.950579
three    70.160305
Name: a, dtype: float64
               a          c          d
one    43.412719  16.054262   1.146777
two    69.950579  23.407335  43.765119
three  70.160305  20.290999  78.110449
# 选择行
data1=df.loc['one']
print(data1)
data2=df.loc[['one','three']]
print(data2)
# 选择行是loc,一行为Series,多行为dataframe
a    43.412719
b    76.530491
c    16.054262
d     1.146777
Name: one, dtype: float64
               a          b          c          d
one    43.412719  76.530491  16.054262   1.146777
three  70.160305  81.423366  20.290999  78.110449
# df[] 一般都用来做选择列
# 也可以选择行
data1=df[:1]
print(data1)
# 利用切片选择

             a          b          c         d
one  43.412719  76.530491  16.054262  1.146777
# 选择行
# 当index为默认的整数索引时,可以直接用 loc 加整数标签来选择行
# With the default integer index, .loc accepts the integer labels directly.
df = pd.DataFrame(data=np.random.rand(3, 3))
print(df)

# Pick the rows labelled 1 and 2.
data = df.loc[[1, 2]]
print(data)
          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462
          0         1         2
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462
# df.iloc[] - 按照整数位置(从轴的0到length-1)选择行
# 类似list的索引,其顺序就是dataframe的整数位置,从0开始计
print(df)
          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462
# 选择行
# 选择第一行
print(df.iloc[0])
# 选择倒数第一行
print(df.iloc[-1])
0    0.373822
1    0.144124
2    0.919020
Name: 0, dtype: float64
0    0.432987
1    0.870418
2    0.060462
Name: 2, dtype: float64
#选择第一和第三行
print(df.iloc[[0,2]])
          0         1         2
0  0.373822  0.144124  0.919020
2  0.432987  0.870418  0.060462
# 切片索引
print(df.iloc[0:2])
print(df.iloc[::2])
          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
          0         1         2
0  0.373822  0.144124  0.919020
2  0.432987  0.870418  0.060462
# 布尔型索引
# 与Series原理相同
# Sample 3x4 table of random values in [0, 100) for the boolean-mask examples.
df = pd.DataFrame(data=np.random.rand(3, 4) * 100)
print(df)
           0          1          2          3
0  29.892506  58.911433   8.225744  82.950538
1   0.025366  38.071510  63.716461  48.258320
2  45.380903  70.560190  30.495624   4.703547
print(df>50)
       0      1      2      3
0  False   True  False   True
1  False  False   True  False
2  False   True  False  False
print(df[df>50])
    0          1          2          3
0 NaN  58.911433        NaN  82.950538
1 NaN        NaN  63.716461        NaN
2 NaN  70.560190        NaN        NaN
# 多重索引
# Sample 4x4 table with letter row labels and named columns, used by the
# combined row/column selection examples.
df = pd.DataFrame(
    data=np.random.rand(4, 4),
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two', 'three', 'four'],
)
print(df)
        one       two     three      four
a  0.325516  0.274562  0.876918  0.627544
b  0.303586  0.012632  0.387609  0.237904
c  0.826723  0.575291  0.560848  0.001186
d  0.730478  0.284428  0.822887  0.095292
# Select rows 'a' and 'c' of column 'one' with a single .loc[rows, column]
# call. This replaces the chained form df['one'].loc[...], which performs two
# separate lookups; the printed Series is the same.
print(df.loc[['a', 'c'], 'one'])
a    0.325516
c    0.826723
Name: one, dtype: float64
# Select rows 'a' and 'c' of columns 'two', 'three' and 'four' with a single
# .loc[rows, columns] call instead of chaining df[[...]].loc[...]; the printed
# DataFrame is the same.
print(df.loc[['a', 'c'], ['two', 'three', 'four']])
        two     three      four
a  0.274562  0.876918  0.627544
c  0.575291  0.560848  0.001186
'''
【课程2.7】  Pandas数据结构Dataframe:基本技巧

数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

'''
# 数据查看、转置
# Sample 8x2 table of random values for the head / tail / transpose examples.
df = pd.DataFrame(data=np.random.rand(8, 2))
print(df)
          0         1
0  0.134166  0.010255
1  0.316007  0.524728
2  0.994353  0.544601
3  0.926491  0.686111
4  0.317319  0.109836
5  0.871562  0.614354
6  0.444111  0.805243
7  0.256888  0.037670
# 头部
print(df.head())
          0         1
0  0.134166  0.010255
1  0.316007  0.524728
2  0.994353  0.544601
3  0.926491  0.686111
4  0.317319  0.109836
# 尾部
print(df.tail())
          0         1
3  0.926491  0.686111
4  0.317319  0.109836
5  0.871562  0.614354
6  0.444111  0.805243
7  0.256888  0.037670
# 转置 .T
print(df.T)
          0         1         2         3         4         5         6  \
0  0.134166  0.316007  0.994353  0.926491  0.317319  0.871562  0.444111   
1  0.010255  0.524728  0.544601  0.686111  0.109836  0.614354  0.805243   

          7  
0  0.256888  
1  0.037670  
# 添加与修改
# Sample 4x4 table with columns a-d for the add / modify / delete examples.
df = pd.DataFrame(
    data=np.random.rand(4, 4),
    columns=['a', 'b', 'c', 'd'],
)
print(df)
          a         b         c         d
0  0.833651  0.422714  0.064032  0.176095
1  0.438021  0.878705  0.102511  0.468040
2  0.811842  0.080692  0.127141  0.154094
3  0.507186  0.920764  0.549470  0.935110
# 直接修改
df['d']=100
print(df)
df.loc[2]=200
print(df)
          a         b         c    d
0  0.833651  0.422714  0.064032  100
1  0.438021  0.878705  0.102511  100
2  0.811842  0.080692  0.127141  100
3  0.507186  0.920764  0.549470  100
            a           b           c    d
0    0.833651    0.422714    0.064032  100
1    0.438021    0.878705    0.102511  100
2  200.000000  200.000000  200.000000  200
3    0.507186    0.920764    0.549470  100
# 增加
df['e']=300
print(df)
            a           b           c    d    e
0    0.833651    0.422714    0.064032  100  300
1    0.438021    0.878705    0.102511  100  300
2  200.000000  200.000000  200.000000  200  300
3    0.507186    0.920764    0.549470  100  300
#删除 del/drop
print(df)
            a           b           c    d    e
0    0.833651    0.422714    0.064032  100  300
1    0.438021    0.878705    0.102511  100  300
2  200.000000  200.000000  200.000000  200  300
3    0.507186    0.920764    0.549470  100  300
# 删除列
# 修改原数据
del df['a']
print(df)
            b           c    d    e
0    0.422714    0.064032  100  300
1    0.878705    0.102511  100  300
2  200.000000  200.000000  200  300
3    0.920764    0.549470  100  300
# 删除行
# drop 默认不修改原数据,而是返回一个新的DataFrame
print(df.drop(0))
# 如果使用drop删除列的话,需要指定axis=1
print(df.drop(['b'],axis=1))
            b           c    d    e
1    0.878705    0.102511  100  300
2  200.000000  200.000000  200  300
3    0.920764    0.549470  100  300
            c    d    e
0    0.064032  100  300
1    0.102511  100  300
2  200.000000  200  300
3    0.549470  100  300
# 对齐
# Two frames of different shape: df1 is 10x4 (columns a-d), df2 is 7x3
# (columns a-c). Adding them aligns on both row and column labels, so any
# position present in only one frame comes out as NaN.
df1 = pd.DataFrame(data=np.random.randn(10, 4), columns=list('abcd'))
df2 = pd.DataFrame(data=np.random.randn(7, 3), columns=list('abc'))
print(df1.add(df2))
# DataFrame 会自动根据行索引和列标签对齐数据,没有对应值的位置补NaN
          a         b         c   d
0 -0.118693 -0.587134  1.240605 NaN
1 -0.439958  2.960476 -0.728936 NaN
2  0.855115 -0.659808  1.018583 NaN
3 -3.006156  0.376281  0.559385 NaN
4 -2.946353  0.428331 -1.788409 NaN
5  0.820590 -1.077892 -0.506990 NaN
6 -0.628339  0.215887 -2.513543 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN
# 排序, 按值排序 .sort_values
# 同样适用于Series
# Sample 4x4 table of random values in [0, 100) for the sorting examples.
df = pd.DataFrame(
    data=np.random.rand(4, 4) * 100,
    columns=list('abcd'),
)
print(df)
           a          b          c          d
0  53.044153   6.949537  76.201332  72.157734
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
3  64.429832  14.861729  16.369679  72.744620
print(df.sort_values(['a']))
print(df.sort_values(['b'],ascending=False))
# ascending=True 为升序,False 为降序
# 默认为升序
           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
0  53.044153   6.949537  76.201332  72.157734
3  64.429832  14.861729  16.369679  72.744620
           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
3  64.429832  14.861729  16.369679  72.744620
0  53.044153   6.949537  76.201332  72.157734
2  21.737004   2.606586  76.871439  52.364927
# 多列排序
df = df.sort_values(['a','b'])
# 先排a,如果a有重复,则排b
# 排序,索引排序 sort_index
print(df)
           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
0  53.044153   6.949537  76.201332  72.157734
3  64.429832  14.861729  16.369679  72.744620
print(df.sort_index())
           a          b          c          d
0  53.044153   6.949537  76.201332  72.157734
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
3  64.429832  14.861729  16.369679  72.744620
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值