pandas_DataFrame基础

最新推荐文章于 2024-05-01 22:51:03 发布

MrRenLG

最新推荐文章于 2024-05-01 22:51:03 发布

阅读量372

点赞数 1

分类专栏： pandas 基础 dataframe

本文链接：https://blog.csdn.net/MrRenLG/article/details/90516271

版权

基础同时被 3 个专栏收录

4 篇文章 0 订阅

订阅专栏

pandas

3 篇文章 0 订阅

订阅专栏

dataframe

1 篇文章 0 订阅

订阅专栏

'''
【课程2.5】  Pandas数据结构DataFrame：基本概念及创建

"二维数组"DataFrame：是一个表格型的数据结构，包含一组有序的列，其列的值类型可以是数值、字符串、布尔值等。

DataFrame中的数据以一个或多个二维块存放，不是列表、字典或一维数组结构。

'''

import pandas as pd
import numpy as np

# DataFrame data structure
# A DataFrame is a table-like structure: a two-dimensional array whose rows
# and columns both carry labels.
# index -> row labels, columns -> column labels
people = {
    'name': ['J', 'M', 'D'],
    'age': [18, 19, 20],
    'gender': ['m', 'w', 'm'],
}
row_labels = list('abc')
df = pd.DataFrame(people, index=row_labels)
print(df)

   age gender name
a   18      m    J
b   19      w    M
c   20      m    D
Index(['a', 'b', 'c'], dtype='object')

# Row labels of the frame (an Index object)
print(df.index)

Index(['a', 'b', 'c'], dtype='object')

# Column labels of the frame (also an Index object)
print(df.columns)

Index(['age', 'gender', 'name'], dtype='object')

# Ways to build a DataFrame
# Method 1: from a dict whose values are arrays / lists
name_age = {
    'name': ['a', 'b', 'c'],
    'age': [20, 21, 22],
}
df1 = pd.DataFrame(name_age)
print(df1)

random_pair = {
    'one': np.random.rand(3),
    'two': np.random.rand(3),
}
df2 = pd.DataFrame(random_pair, index=list('abc'))
print(df2)
# Each dict key becomes a column label.
# Each dict value supplies that column's values.
# All of the dict's values must have the same length.

   age name
0   20    a
1   21    b
2   22    c
        one       two
a  0.154260  0.372618
b  0.544828  0.497320
c  0.811307  0.071709

random_columns = {
    'one': np.random.rand(3),
    'two': np.random.rand(3),
}
df2 = pd.DataFrame(
    random_columns,
    index=list('abc'),
    columns=['two', 'three', 'one'],
)
print(df2)
# `columns` chooses and orders the columns; a requested label that is not a
# key of the dict ('three' here) is filled with NaN.

        two three       one
a  0.593346   NaN  0.772806
b  0.556199   NaN  0.025852
c  0.705040   NaN  0.173555

# Method 2: from a dict of Series
first = pd.Series(np.random.rand(3))
second = pd.Series(np.random.rand(3))
df1 = pd.DataFrame({'one': first, 'two': second})
print(df1)

        one       two
0  0.414850  0.665624
1  0.249675  0.699969
2  0.702770  0.900113

long_series = pd.Series(np.random.rand(3), index=list('abc'))
short_series = pd.Series(np.random.rand(2), index=list('ab'))
df2 = pd.DataFrame({'one': long_series, 'two': short_series})
print(df2)
# The dict keys become the columns; the Series indexes supply the row labels.
# The Series may differ in length; labels missing from a Series become NaN.

        one       two
a  0.288482  0.007029
b  0.314350  0.708146
c  0.891189       NaN

# Build a DataFrame directly from a two-dimensional array
grid = np.arange(9).reshape((3, 3))
df1 = pd.DataFrame(grid, index=list('abc'), columns=list('xyz'))
print(df1)

# Build a DataFrame from a list of dicts (each dict becomes one row)
rows = [
    {'one': 'a', 'two': 'b'},
    {'three': 'c', 'four': 'd'},
]
df1 = pd.DataFrame(rows, index=list('ab'))
print(df1)
# When index is omitted the default integer index is used.
# The dict keys become the columns.

  four  one three  two
a  NaN    a   NaN    b
b    d  NaN     c  NaN

# Build a DataFrame from a dict of dicts
scores = {
    'A': {'math': 80, 'art': 90},
    'B': {'math': 80, 'art': 90},
}
df = pd.DataFrame(scores)
print(df)
# Outer dict keys become the columns; inner dict keys become the index.

       A   B
art   90  90
math  80  80

'''
【课程2.6】  Pandas数据结构DataFrame：索引

DataFrame既有行索引也有列索引，可以被看做由Series组成的字典（共用一个索引）

选择列 / 选择行 / 切片 / 布尔判断

'''

# 3x4 frame of random values scaled by 100, used by the selection examples
values = np.random.rand(12).reshape(3, 4) * 100
df = pd.DataFrame(values, index=['one', 'two', 'three'], columns=list('abcd'))
print(df)

               a          b          c          d
one    43.412719  76.530491  16.054262   1.146777
two    69.950579  67.093175  23.407335  43.765119
three  70.160305  81.423366  20.290999  78.110449

# Selecting columns by label
# A single label yields a Series; a list of labels yields a DataFrame.
wanted_columns = ['a', 'c', 'd']
data1 = df['a']
data2 = df[wanted_columns]
print(data1)
print(data2)

one      43.412719
two      69.950579
three    70.160305
Name: a, dtype: float64
               a          c          d
one    43.412719  16.054262   1.146777
two    69.950579  23.407335  43.765119
three  70.160305  20.290999  78.110449

# Selecting rows by label with .loc
# A single label yields a Series; a list of labels yields a DataFrame.
wanted_rows = ['one', 'three']
data1 = df.loc['one']
data2 = df.loc[wanted_rows]
print(data1)
print(data2)

a    43.412719
b    76.530491
c    16.054262
d     1.146777
Name: one, dtype: float64
               a          b          c          d
one    43.412719  76.530491  16.054262   1.146777
three  70.160305  81.423366  20.290999  78.110449

# df[...] is mostly used to select columns,
# but a slice inside the brackets selects rows instead.
head_slice = slice(None, 1)
data1 = df[head_slice]
print(data1)

             a          b          c         d
one  43.412719  76.530491  16.054262  1.146777

# Selecting rows
# With the default integer index, .loc accepts the integer labels directly.
df = pd.DataFrame(np.random.rand(9).reshape((3, 3)))
print(df)
wanted_rows = [1, 2]
data = df.loc[wanted_rows]
print(data)

          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462
          0         1         2
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462

# df.iloc[] selects rows by integer position (0 through length-1 along the axis)
# It works like list indexing: positions follow the frame's row order, counted from 0
print(df)

          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
2  0.432987  0.870418  0.060462

# Selecting single rows by position
first_row = df.iloc[0]     # first row
last_row = df.iloc[-1]     # last row
print(first_row)
print(last_row)

0    0.373822
1    0.144124
2    0.919020
Name: 0, dtype: float64
0    0.432987
1    0.870418
2    0.060462
Name: 2, dtype: float64

# Select the first and third rows by position
positions = [0, 2]
print(df.iloc[positions])

          0         1         2
0  0.373822  0.144124  0.919020
2  0.432987  0.870418  0.060462

# Positional slicing
first_two = df.iloc[0:2]      # positions 0 and 1
every_other = df.iloc[::2]    # every second row
print(first_two)
print(every_other)

          0         1         2
0  0.373822  0.144124  0.919020
1  0.946216  0.037750  0.719912
          0         1         2
0  0.373822  0.144124  0.919020
2  0.432987  0.870418  0.060462

# Boolean indexing
# Works on the same principle as for a Series
scaled = np.random.rand(12).reshape((3, 4)) * 100
df = pd.DataFrame(scaled)
print(df)

           0          1          2          3
0  29.892506  58.911433   8.225744  82.950538
1   0.025366  38.071510  63.716461  48.258320
2  45.380903  70.560190  30.495624   4.703547

# Element-wise comparison gives a boolean frame of the same shape
above_50 = df > 50
print(above_50)

       0      1      2      3
0  False   True  False   True
1  False  False   True  False
2  False   True  False  False

# Indexing with the boolean frame keeps matching cells and shows NaN elsewhere
above_50 = df > 50
print(df[above_50])

    0          1          2          3
0 NaN  58.911433        NaN  82.950538
1 NaN        NaN  63.716461        NaN
2 NaN  70.560190        NaN        NaN

# Selecting by column and row together
# (combined label selection on a plain index, not a pandas MultiIndex)
cells = np.random.rand(16).reshape((4, 4))
df = pd.DataFrame(
    cells,
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'],
)
print(df)

        one       two     three      four
a  0.325516  0.274562  0.876918  0.627544
b  0.303586  0.012632  0.387609  0.237904
c  0.826723  0.575291  0.560848  0.001186
d  0.730478  0.284428  0.822887  0.095292

# Select rows 'a' and 'c' of column 'one'.
# One .loc[rows, column] call replaces the chained df['one'].loc[...] lookup:
# it yields the same Series and avoids indexing an intermediate object.
print(df.loc[['a', 'c'], 'one'])

a    0.325516
c    0.826723
Name: one, dtype: float64

# Select rows 'a' and 'c' of columns 'two', 'three' and 'four'.
# One .loc[rows, columns] call replaces the chained df[[...]].loc[...] lookup:
# it yields the same DataFrame without building the intermediate column subset.
print(df.loc[['a', 'c'], ['two', 'three', 'four']])

        two     three      four
a  0.274562  0.876918  0.627544
c  0.575291  0.560848  0.001186

'''
【课程2.7】  Pandas数据结构DataFrame：基本技巧

数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

'''

# Inspecting and transposing data
samples = np.random.rand(16).reshape((8, 2))
df = pd.DataFrame(samples)
print(df)

          0         1
0  0.134166  0.010255
1  0.316007  0.524728
2  0.994353  0.544601
3  0.926491  0.686111
4  0.317319  0.109836
5  0.871562  0.614354
6  0.444111  0.805243
7  0.256888  0.037670

# Leading rows of the frame
top_rows = df.head()
print(top_rows)

          0         1
0  0.134166  0.010255
1  0.316007  0.524728
2  0.994353  0.544601
3  0.926491  0.686111
4  0.317319  0.109836

# Trailing rows of the frame
bottom_rows = df.tail()
print(bottom_rows)

          0         1
3  0.926491  0.686111
4  0.317319  0.109836
5  0.871562  0.614354
6  0.444111  0.805243
7  0.256888  0.037670

# Transpose with .T: rows become columns and columns become rows
transposed = df.T
print(transposed)

          0         1         2         3         4         5         6  \
0  0.134166  0.316007  0.994353  0.926491  0.317319  0.871562  0.444111   
1  0.010255  0.524728  0.544601  0.686111  0.109836  0.614354  0.805243   

          7  
0  0.256888  
1  0.037670

# Adding and modifying values
cells = np.random.rand(16).reshape((4, 4))
df = pd.DataFrame(cells, columns=list('abcd'))
print(df)

          a         b         c         d
0  0.833651  0.422714  0.064032  0.176095
1  0.438021  0.878705  0.102511  0.468040
2  0.811842  0.080692  0.127141  0.154094
3  0.507186  0.920764  0.549470  0.935110

# Modify values by direct assignment
# Assigning a scalar to a column sets every row of that column
df['d']=100
print(df)
# Assigning a scalar to a row label via .loc sets every column of that row
df.loc[2]=200
print(df)

          a         b         c    d
0  0.833651  0.422714  0.064032  100
1  0.438021  0.878705  0.102511  100
2  0.811842  0.080692  0.127141  100
3  0.507186  0.920764  0.549470  100
            a           b           c    d
0    0.833651    0.422714    0.064032  100
1    0.438021    0.878705    0.102511  100
2  200.000000  200.000000  200.000000  200
3    0.507186    0.920764    0.549470  100

# Add a column: assigning to a column label that does not exist yet creates it
df['e']=300
print(df)

            a           b           c    d    e
0    0.833651    0.422714    0.064032  100  300
1    0.438021    0.878705    0.102511  100  300
2  200.000000  200.000000  200.000000  200  300
3    0.507186    0.920764    0.549470  100  300

# Deleting with del / drop
# Show the frame before anything is removed
print(df)

            a           b           c    d    e
0    0.833651    0.422714    0.064032  100  300
1    0.438021    0.878705    0.102511  100  300
2  200.000000  200.000000  200.000000  200  300
3    0.507186    0.920764    0.549470  100  300

# Delete a column
# del modifies the original frame in place
del df['a']
print(df)

            b           c    d    e
0    0.422714    0.064032  100  300
1    0.878705    0.102511  100  300
2  200.000000  200.000000  200  300
3    0.920764    0.549470  100  300

# drop does not modify the original frame; it returns a new one.
# Delete a row: pass the row label.
without_row_0 = df.drop(0)
print(without_row_0)
# Delete a column: pass the column label(s) together with axis=1.
without_b = df.drop(['b'], axis=1)
print(without_b)

            b           c    d    e
1    0.878705    0.102511  100  300
2  200.000000  200.000000  200  300
3    0.920764    0.549470  100  300
            c    d    e
0    0.064032  100  300
1    0.102511  100  300
2  200.000000  200  300
3    0.549470  100  300

# Alignment
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('abcd'))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('abc'))
total = df1 + df2
print(total)
# Arithmetic aligns the two frames on their row and column labels;
# any position present in only one of them comes out as NaN.

          a         b         c   d
0 -0.118693 -0.587134  1.240605 NaN
1 -0.439958  2.960476 -0.728936 NaN
2  0.855115 -0.659808  1.018583 NaN
3 -3.006156  0.376281  0.559385 NaN
4 -2.946353  0.428331 -1.788409 NaN
5  0.820590 -1.077892 -0.506990 NaN
6 -0.628339  0.215887 -2.513543 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN

# Sorting by value with .sort_values
# The same method is available on a Series
scaled = np.random.rand(16).reshape((4, 4)) * 100
df = pd.DataFrame(scaled, columns=list('abcd'))
print(df)

           a          b          c          d
0  53.044153   6.949537  76.201332  72.157734
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
3  64.429832  14.861729  16.369679  72.744620

# ascending=True sorts smallest first, ascending=False largest first.
# Ascending order is the default.
ascending_by_a = df.sort_values(by=['a'])
descending_by_b = df.sort_values(by=['b'], ascending=False)
print(ascending_by_a)
print(descending_by_b)

           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
0  53.044153   6.949537  76.201332  72.157734
3  64.429832  14.861729  16.369679  72.744620
           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
3  64.429832  14.861729  16.369679  72.744620
0  53.044153   6.949537  76.201332  72.157734
2  21.737004   2.606586  76.871439  52.364927

# Sort by several columns:
# order by 'a' first, and use 'b' to break ties among equal 'a' values.
sort_keys = ['a', 'b']
df = df.sort_values(by=sort_keys)

# Sorting by index with sort_index
# Show the frame as it stands before sorting by index
print(df)

           a          b          c          d
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
0  53.044153   6.949537  76.201332  72.157734
3  64.429832  14.861729  16.369679  72.744620

# Reorder the rows by their index labels
by_index = df.sort_index()
print(by_index)

           a          b          c          d
0  53.044153   6.949537  76.201332  72.157734
1   2.389659  21.536710  62.266274  86.264873
2  21.737004   2.606586  76.871439  52.364927
3  64.429832  14.861729  16.369679  72.744620