pandas基础

最新推荐文章于 2023-11-30 15:03:23 发布

qq_40488951

最新推荐文章于 2023-11-30 15:03:23 发布

阅读量160

点赞数

本文链接：https://blog.csdn.net/qq_40488951/article/details/111418267

版权

1、Series

1.1、创建数组

1）从ndarray创建

import pandas as pd
import numpy as np

# Build a Series from a NumPy array of strings.
data = np.array(list('abc'))
custom_labels = [101, 102, 103]
s = pd.Series(data)  # no index given: default integer index 0..n-1
s_index = pd.Series(data, index=custom_labels)  # explicit index labels
for series in (s, s_index):
    print(series)
---------------------------------------------
0    a
1    b
2    c
dtype: object
101    a
102    b
103    c
dtype: object

2）从字典创建

import pandas as pd
import numpy as np
# Build a Series from a dict: the keys become the index, the values the data.
data = dict(a=0.0, b=1.0, c=2.0)
s = pd.Series(data)
# An explicit index looks each label up in the dict (repeats are allowed);
# a label that is not a key of the dict gets NaN.
s_index = pd.Series(data, index=list('abda'))
print(s)
print(s_index)
----------------------------------
a    0.0
b    1.0
c    2.0
dtype: float64
a    0.0
b    1.0
d    NaN
a    0.0
dtype: float64

3）从标量创建

import pandas as pd
import numpy as np
# Build a Series from a scalar: the value is repeated once per index label,
# so the index is what determines the length of the result.
s_index = pd.Series(5, index=['a', 'b', 'd', 'a'])
print(s_index)
---------------------------------
a    5
b    5
d    5
a    5
dtype: int64

1.2、提取数据

import pandas as pd
import numpy as np
# A scalar broadcast over a label index that contains the label 'a' twice.
s_index = pd.Series(5, index=['a', 'b', 'd', 'a'])
print(s_index)
# Positional access goes through .iloc. Plain s_index[0] on a non-integer
# index only works through a deprecated positional fallback.
print('s_index[0]\n', s_index.iloc[0])  # element at position 0
print('s_index[:2]\n', s_index.iloc[:2])  # first two elements
print('s_index[-2:]\n', s_index.iloc[-2:])  # last two elements
# Label access: because 'a' occurs twice, the result is a two-element Series.
print("s_index['a']\n", s_index["a"])
----------------------------------------------------
a    5
b    5
d    5
a    5
dtype: int64
s_index[0]
 5
s_index[:2]
 a    5
b    5
dtype: int64
s_index[-2:]
 d    5
a    5
dtype: int64
s_index['a']
 a    5
a    5
dtype: int64

1.3、方法属性

import pandas as pd
import numpy as np
# Tour of the basic Series attributes, using four standard-normal samples.
s = pd.Series(np.random.randn(4))
print(s)
for attribute in (
    'axes',    # list holding the index (row labels)
    'empty',   # True when the Series has no elements
    'size',    # number of elements
    'values',  # the data as a NumPy ndarray
):
    print(getattr(s, attribute))
print(s.head(2))  # first two elements
print(s.tail(2))  # last two elements

2、DataFrame

2.1、创建数组

1）从列表创建：需要长度一致

import pandas as pd
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
# Row-oriented construction: each inner list is one row; `columns` names the
# columns and `index` labels the rows.
df = pd.DataFrame(data, columns=['Name', 'Age'], index=[101, 102, 103])
# Cast only the numeric column. A frame-wide dtype=float cannot convert the
# names, and recent pandas raises ValueError instead of ignoring the dtype.
df = df.astype({'Age': float})
print(df)
--------------------------------------------
       Name   Age
101    Alex  10.0
102     Bob  12.0
103  Clarke  13.0

2）从数组创建：需要长度一致

import pandas as pd
# Column-oriented construction: each dict key is a column name and its list
# (an ndarray works the same way) holds that column's values, so every list
# must have the same length.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
row_labels = [f'rank{i}' for i in range(1, 5)]
df = pd.DataFrame(data, index=row_labels)
print(df)
-------------------------------------------
        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42

3）从字典列表创建：各字典的键可以不一致

import pandas as pd
# Row-oriented construction from a list of dicts: each dict is one row, its
# keys are column names, and a key missing from a row becomes NaN.
data = [dict(a=1, b=2), dict(a=5, b=10, c=20)]
df = pd.DataFrame(data, index=['first', 'second'])
print(df)
# `index` relabels the rows, whereas `columns` selects among the dict keys:
# names that no dict contains ('e', 'f') come out as all-NaN columns.
wanted_columns = ['a', 'e', 'f']
df1 = pd.DataFrame(data, columns=wanted_columns, index=['101', '102'])
print(df1)
---------------------------------------------
        a   b     c
first   1   2   NaN
second  5  10  20.0
     a   e   f
101  1 NaN NaN
102  5 NaN NaN

4）从Series创建：长度可以不一致

import pandas as pd
# Dict of Series: each entry becomes one column named by its key. The rows
# are the union of the Series indexes; a column is NaN where its Series has
# no value for that label.
one = pd.Series([1, 2, 3], index=list('abc'))
two = pd.Series([1, 2, 3, 4], index=list('abcd'))
d = {'one': one, 'two': two}
df = pd.DataFrame(d)
print(df)
---------------------------------------------
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4

2.2、读取

import pandas as pd
# Ways of reading data out of a DataFrame.
d = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df = pd.DataFrame(d)
print(df['one'])  # one column, selected by column name
print(df.loc['a', ['one', 'two']])  # label-based: [row label, column labels]
print(df.iloc[2, [0, 1]])  # position-based: [row position, column positions]
print(df[:2])  # a slice inside [] selects rows
print(df.one)  # attribute access to the 'one' column
------------------------------------------------------
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
one    1.0
two    1.0
Name: a, dtype: float64
one    3.0
two    3.0
Name: c, dtype: float64
   one  two
a  1.0    1
b  2.0    2
a    1.0
b    2.0
c    3.0
d    NaN

2.3、添加

import pandas as pd
d = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
# Assigning to a new column name adds a column, aligned on the index (row 'd'
# has no value and becomes NaN). Assigning to an existing name overwrites it.
df['three'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(df)
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])
# Add rows with pd.concat; DataFrame.append was removed in pandas 2.0.
# Columns are unioned, and cells that one of the frames lacks are NaN.
df = pd.concat([df, df2])
print(df)
-------------------------------------------
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
   one  two  three    a    b
a  1.0  1.0   10.0  NaN  NaN
b  2.0  2.0   20.0  NaN  NaN
c  3.0  3.0   30.0  NaN  NaN
d  NaN  4.0    NaN  NaN  NaN
0  NaN  NaN    NaN  5.0  6.0
1  NaN  NaN    NaN  7.0  8.0

2.4、删除

import pandas as pd
# Removing columns and rows from a DataFrame.
abc = ['a', 'b', 'c']
d = {
    'one': pd.Series([1, 2, 3], index=abc),
    'two': pd.Series([1, 2, 3, 4], index=abc + ['d']),
    'three': pd.Series([10, 20, 30], index=abc),
}
df = pd.DataFrame(d)
print(df)
del df['one']  # remove a column in place, by column name
print(df)
df.pop('two')  # also removes a column in place (and returns it)
print(df)
df = df.drop(index=['a', 'b'])  # drop rows by label; returns a new frame
print(df)
----------------------------------------------
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
   two  three
a    1   10.0
b    2   20.0
c    3   30.0
d    4    NaN
   three
a   10.0
b   20.0
c   30.0
d    NaN
   three
c   30.0
d    NaN

2.5、方法属性

import pandas as pd
# Tour of the basic DataFrame attributes on a small three-column frame.
d = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Minsu', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df = pd.DataFrame(d)
print(df)
for attribute in (
    'T',       # transpose
    'axes',    # list with the row index and the column index
    'dtypes',  # data type of each column
    'empty',   # True when the frame has no elements
    'shape',   # (rows, columns)
    'size',    # total number of elements
    'values',  # the data as a NumPy ndarray
):
    print(getattr(df, attribute))
print(df.head(2))  # first two rows
print(df.tail(2))  # last two rows

2.6、常用函数

import pandas as pd
d = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Minsu', 'Jack',
                        'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
     'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
     'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65])}
df = pd.DataFrame(d)
print(df)
# Reductions run down each column by default (axis=0); axis=1 reduces across
# each row. numeric_only=True skips the string column 'Name': since pandas 2.0
# non-numeric columns are no longer dropped silently, so without it these
# calls raise TypeError.
print(df.sum(axis=1, numeric_only=True))  # per-row sum
print(df.mean(numeric_only=True))  # per-column mean
print(df.std(axis=1, numeric_only=True))  # per-row standard deviation
# describe(include=...) chooses which columns are summarised:
#   default  - numeric columns
#   'object' - string columns
#   'all'    - every column (pass the string itself, not a list)
print(df.describe())  # summary statistics
------------------------------------------------------
             Age     Rating
count  12.000000  12.000000
mean   31.833333   3.743333
std     9.232682   0.661628
min    23.000000   2.560000
25%    25.000000   3.230000
50%    29.500000   3.790000
75%    35.500000   4.132500
max    51.000000   4.800000

2.6.1、groupby：分组统计

import pandas as pd
import numpy as np

# Group rows by key1 and aggregate data1 inside each group.
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
print(df)
groups = df.groupby('key1')
# transform takes a function (or its name) and writes each group's result
# back onto every row of that group, so the output lines up with df and can
# be stored as the new column key3.
df['key3'] = groups['data1'].transform('sum')
print(df)
# A plain aggregation instead returns one value per group.
print(df.groupby('key1')['data1'].sum())
----------------------------------------------------------
  key1 key2     data1     data2
0    a  one -0.067532 -0.231632
1    a  two -1.097676 -1.240643
2    b  one  0.435301 -0.614660
3    b  two -0.262712 -0.305120
4    a  one  0.437135  1.264481
  key1 key2     data1     data2      key3
0    a  one -0.067532 -0.231632 -0.728073
1    a  two -1.097676 -1.240643 -0.728073
2    b  one  0.435301 -0.614660  0.172588
3    b  two -0.262712 -0.305120  0.172588
4    a  one  0.437135  1.264481 -0.728073
key1
a   -0.728073
b    0.172588
Name: data1, dtype: float64

2.7、函数应用

1）表达式函数应用

import pandas as pd
import numpy as np
# DataFrame.pipe(func, *args) calls func(df, *args), which lets a custom
# function that takes extra arguments operate on a whole DataFrame.
def adder(ele1, ele2):
    """Return ele1 + ele2 (element-wise when both are DataFrames)."""
    return ele1 + ele2


columns = ['col1', 'col2', 'col3']
df = pd.DataFrame(np.random.randn(5, 3), columns=columns)
df1 = pd.DataFrame(np.random.randn(5, 3), columns=columns)
# Same as adder(df, df1): matching cells are added into the result frame.
a = df.pipe(adder, df1)
print(a)
--------------------------------------------
       col1      col2      col3
0  0.554267 -0.799278  4.513507
1  1.566448 -0.676099  3.954296
2 -1.431411 -1.013234  0.419875
3  2.842543  0.717818 -1.791243
4  0.690443 -1.266336 -1.596015

2）行或列函数应用

import pandas as pd
import numpy as np
# DataFrame.apply hands one column at a time to the function by default;
# axis=1 hands it one row at a time instead.
df = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
a = df.apply(np.mean, axis=1)  # mean of each row
print(a)


def value_range(column):
    """Return the spread (max - min) of the values passed in."""
    return column.max() - column.min()


b = df.apply(value_range)  # spread of each column
print(b)
--------------------------------------------
0    0.821802
1   -0.234692
2    0.772914
3    0.237424
4   -0.028231
dtype: float64
col1    2.495410
col2    3.089016
col3    1.681998
dtype: float64

3）元素函数应用

import pandas as pd
import numpy as np
# Series.map applies a function to every element; any function that takes
# one value and returns one value will do.
def times_hundred(value):
    """Return the value multiplied by 100."""
    return value * 100


df = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
a = df['col1'].map(times_hundred)
print(a)
----------------------------------------
0    -42.930172
1     -9.736097
2    -19.023640
3    142.077451
4    180.125545
Name: col1, dtype: float64

2.8、重新索引

import pandas as pd
import numpy as np
N = 20
df = pd.DataFrame(dict(
    # N consecutive days starting at the given date (start, periods, frequency)
    A=pd.date_range(start='2016-01-01', periods=N, freq='D'),
    # N evenly spaced numbers from 0 to N-1 (start, stop, count)
    x=np.linspace(0, stop=N - 1, num=N),
    # N samples drawn uniformly from [0, 1)
    y=np.random.rand(N),
    # N random picks from the given labels, converted to a list
    C=np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    # N samples from a normal distribution (mean 100, standard deviation 10)
    D=np.random.normal(100, 10, size=(N)).tolist(),
))
# reindex matches LABELS on both axes. The row labels 0, 2, 5 exist because
# df has the default 0..N-1 index; the column label 'f' does not exist in df,
# so it comes out as an all-NaN column.
df_reindexed = df.reindex(index=[0, 2, 5], columns=['A', 'C', 'f'])
print(df_reindexed)
------------------------------------------------
           A       C   f
0 2016-01-01    High NaN
2 2016-01-03  Medium NaN
5 2016-01-06  Medium NaN

2.9、迭代

import pandas as pd
import numpy as np
N = 5
df = pd.DataFrame({
    'A': pd.date_range(start='2016-01-01', periods=N, freq='D'),
    'x': np.linspace(0, stop=N - 1, num=N),
})
# Iteration is meant for reading: do not rely on changes made to the yielded
# objects showing up in df.
for col in df:  # iterating the frame itself yields the column labels
    print(col)
# items() replaces iteritems(), which was removed in pandas 2.0.
for key, value in df.items():  # (column label, whole column as a Series)
    print(key)
    print(value)
for row_index, row in df.iterrows():  # (index label, row as a Series keyed by column name)
    print(row_index)
    print(row)
for row in df.itertuples():  # one namedtuple per row: the index first, then each column
    print(row)

2.10、排序

import pandas as pd
import numpy as np

row_labels = [1, 4, 6, 2, 3, 5, 9, 8, 0, 7]
unsorted_df = pd.DataFrame(np.random.randn(10, 2), index=row_labels, columns=['col2', 'col1'])
print(unsorted_df)
# Sort by label: axis=1 orders the column labels, here in descending order
# (the defaults are axis=0, i.e. row labels, ascending).
sorted_df = unsorted_df.sort_index(axis=1, ascending=False)
print(sorted_df)
# Sort by value: rows are ordered by column 'col1' using the chosen
# algorithm; `by` also accepts a list of several column names.
sorted_df = unsorted_df.sort_values(by='col1', kind='mergesort')
print(sorted_df)
# Strictly speaking take() is not sorting but selection: it returns the rows
# found at the listed positions, in the order given (repeats allowed).
new_order = [3, 4, 2, 2]
print(unsorted_df.take(new_order))

2.11、窗口

1）移动窗口

import pandas as pd
import numpy as np

dates = pd.date_range('1/1/2020', periods=10)
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
print(df)
# Rolling window of 3 rows that slides one row at a time. With center=True
# each result is stored at the window's middle row, so the first and last
# rows (no complete window around them) are NaN.
rolling_mean = df.rolling(window=3, center=True).mean()
print(rolling_mean)
---------------------------------------------------------------
                   A         B         C         D
2020-01-01 -1.329515 -0.327150 -1.637397 -1.074427
2020-01-02 -0.682514  0.073182  0.150801  1.623448
2020-01-03 -0.630081 -1.640845 -0.073798  0.164683
2020-01-04 -0.235115 -1.887705 -0.607426  1.434585
2020-01-05  0.707332  0.610329  2.206267 -0.600586
2020-01-06  1.583110 -0.205962  0.306968 -0.944161
2020-01-07 -1.223039  0.465538 -1.236737  1.950642
2020-01-08 -1.495124 -0.899027 -0.423346  0.487664
2020-01-09 -0.523139  0.449063 -1.957372 -0.704989
2020-01-10 -0.712558  0.207098  0.388138  1.730874
                   A         B         C         D
2020-01-01       NaN       NaN       NaN       NaN
2020-01-02 -0.880703 -0.631604 -0.520131  0.237901
2020-01-03 -0.515904 -1.151789 -0.176808  1.074238
2020-01-04 -0.052622 -0.972740  0.508347  0.332894
2020-01-05  0.685109 -0.494446  0.635269 -0.036721
2020-01-06  0.355801  0.289968  0.425499  0.135298
2020-01-07 -0.378351 -0.213150 -0.451039  0.498048
2020-01-08 -1.080434  0.005191 -1.205818  0.577772
2020-01-09 -0.910274 -0.080955 -0.664193  0.504516
2020-01-10       NaN       NaN       NaN       NaN

2）扩展窗口

import pandas as pd
import numpy as np

dates = pd.date_range('1/1/2018', periods=10)
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
print(df)
# Expanding window: the value at each row is the mean of all rows from the
# first one up to and including that row. min_periods=3 leaves rows with
# fewer than three observations as NaN.
expanding_mean = df.expanding(min_periods=3).mean()
print(expanding_mean)
--------------------------------------------------------------
                 A         B         C         D
2018-01-01  1.018750  0.150684 -1.032303  1.593390
2018-01-02 -0.401192  0.708518 -0.017426  1.589136
2018-01-03  2.224556 -0.144970  0.997834 -0.184756
2018-01-04  1.031094 -0.361314  0.082602 -0.057918
2018-01-05 -0.167457  0.695748  0.002473 -1.239786
2018-01-06 -0.714424  0.359726 -0.260429 -0.877045
2018-01-07  0.288361 -0.859253 -0.040149 -0.395638
2018-01-08 -1.458254  1.002058  0.318921 -0.264168
2018-01-09  0.803575 -0.934905 -0.467661  1.785015
2018-01-10  1.243235 -1.089097 -0.727002 -0.773102
                   A         B         C         D
2018-01-01       NaN       NaN       NaN       NaN
2018-01-02       NaN       NaN       NaN       NaN
2018-01-03  0.947371  0.238077 -0.017298  0.999257
2018-01-04  0.968302  0.088229  0.007677  0.734963
2018-01-05  0.741150  0.209733  0.006636  0.340013
2018-01-06  0.498555  0.234732 -0.037875  0.137170
2018-01-07  0.468527  0.078448 -0.038200  0.061055
2018-01-08  0.227679  0.193900  0.006440  0.020402
2018-01-09  0.291668  0.068477 -0.046237  0.216470
2018-01-10  0.386824 -0.047280 -0.114314  0.117513

2.12、缺失值处理

1）赋值为缺失值

import pandas as pd
import numpy as np

# A missing value can be written straight into the data as np.nan
# (the np.NaN alias was removed in NumPy 2.0).
ser = pd.Series([0, 1, 2, np.nan, 9], index=['red', 'blue', 'yellow', 'white', 'green'])
print(ser)
print('-------------')
ser['red'] = None  # assigning None to an element of a float Series stores NaN
print(ser)
---------------------------------------
red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64
-------------
red       NaN
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

2）删除缺失值

import pandas as pd
import numpy as np

missing = np.nan
df = pd.DataFrame(
    [[6, missing, 6], [missing, missing, missing], [2, missing, 5]],
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(df)
print("--------------")
# how='all' drops a row only when every value in it is missing.
print(df.dropna(how='all'))
-----------------------------------------------------
       ball  mug  pen
blue    6.0  NaN  6.0
green   NaN  NaN  NaN
red     2.0  NaN  5.0
--------------
      ball  mug  pen
blue   6.0  NaN  6.0
red    2.0  NaN  5.0

3）为缺失值赋值

import pandas as pd
import numpy as np

missing = np.nan
df = pd.DataFrame(
    [[6, missing, 6], [missing, missing, missing], [2, missing, 5]],
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(df)
print("--------------")
print(df.fillna(10))  # every missing value becomes 10
per_column_fill = {'ball': 1, 'mug': 0, 'pen': 99}
print(df.fillna(per_column_fill))  # a separate fill value for each column
----------------------------------------------------------
       ball  mug  pen
blue    6.0  NaN  6.0
green   NaN  NaN  NaN
red     2.0  NaN  5.0
--------------
       ball   mug   pen
blue    6.0  10.0   6.0
green  10.0  10.0  10.0
red     2.0  10.0   5.0
       ball  mug   pen
blue    6.0  0.0   6.0
green   1.0  0.0  99.0
red     2.0  0.0   5.0

2.13、替换

import pandas as pd

data = pd.read_excel('test_replace.xlsx')  # load a sheet from the Excel file at this path
print(data)
# replace() on the frame matches across the whole table and returns a new
# frame; `data` itself is left untouched.
new_data = data.replace(52, 100)
print(new_data)
# Replace within one column and assign the result back. Calling
# replace(..., inplace=True) on data['年龄'] is chained assignment: under
# Copy-on-Write it updates a temporary object and leaves `data` unchanged.
data['年龄'] = data['年龄'].replace(52, 200)
print(data)
-------------------------------------------
  国家  姓名  年龄   武力值
0  魏  曹操  52  72.6
1  魏  张辽  41  72.6
2  魏  徐晃  28  52.0
3  魏  曹昂  33  92.5
4  魏  于禁  46  79.3
  国家  姓名   年龄    武力值
0  魏  曹操  100   72.6
1  魏  张辽   41   72.6
2  魏  徐晃   28  100.0
3  魏  曹昂   33   92.5
4  魏  于禁   46   79.3
  国家  姓名   年龄   武力值
0  魏  曹操  200  72.6
1  魏  张辽   41  72.6
2  魏  徐晃   28  52.0
3  魏  曹昂   33  92.5
4  魏  于禁   46  79.3

2.14、表格连接

1）合并：merge

import pandas as pd
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K5'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K4', 'K2', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# Join the two tables on the column 'key', which must exist in both.
# how='inner' keeps only keys present in both tables; 'left', 'right' and
# 'outer' are also supported, and `on` may be a list for a composite key.
result = left.merge(right, on='key', how='inner')
print(result)
-------------------------------------------------
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K5  A3  B3
  key   C   D
0  K4  C0  D0
1  K2  C1  D1
2  K2  C2  D2
3  K3  C3  D3
  key   A   B   C   D
0  K2  A2  B2  C1  D1
1  K2  A2  B2  C2  D2

2）交叉：crosstab

import pandas as pd
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K5'],
    'A': ['A0', 'A1', 'A1', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
print(left)
# crosstab(rows, columns) counts how often each (key, A) pair occurs;
# margins=True appends an 'All' row and column holding the totals.
cross_table = pd.crosstab(left['key'], left['A'], margins=True)
print(cross_table)
-------------------------------------
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A1  B2
3  K5  A3  B3
A    A0  A1  A3  All
key                 
K0    1   0   0    1
K1    0   1   0    1
K2    0   1   0    1
K5    0   0   1    1
All   1   2   1    4

qq_40488951

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
pandas基础

1、series1.1、创建数组1）从ndarray创建import pandas as pdimport numpy as npdata = np.array(['a','b','c'])s = pd.Series(data) #默认索引s_index = pd.Series(data,index=[101,102,103]) #index，索引print (s)print(s_index)-------------------------------------------
复制链接

扫一扫