An introductory pandas code demo

# -*- coding: utf-8 -*-
"""
Created on Wed May 23 09:41:24 2018

@author: Loulch C.C
"""
import numpy as np
import pandas as pd 

#pd.Series builds a default integer index for the given sequence
s = pd.Series((1,3,5,np.nan,6,8))
#0    1.0
#1    3.0
#2    5.0
#3    NaN
#4    6.0
#5    8.0
#dtype: float64
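
#Not in the original demo: a quick sketch showing an explicit index in place of
#the default integer one.
s2 = pd.Series([1, 3, 5], index=['x', 'y', 'z'])
#x    1
#y    3
#z    5
#dtype: int64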

#Building a DataFrame

dates=pd.date_range('20180501',periods=6)

#Method 1: build a DataFrame from a value matrix with a row index and column labels
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
#                   a         b         c         d
#2018-05-01  0.001029  1.004829 -0.842121  0.484472
#2018-05-02  0.302972 -0.569209 -1.590139  0.296509
#2018-05-03 -1.250095  0.368155  0.334202  0.502626
#2018-05-04 -0.970516  0.898725  0.983519 -3.175286
#2018-05-05  0.446584 -0.155259  0.521254  0.366596
#2018-05-06  0.415108  0.698861  0.044719  2.223663

#when the index and columns arguments of pd.DataFrame() are omitted, it returns a DataFrame with default integer labels:
df1=pd.DataFrame(np.arange(12).reshape(3,4))
#   0  1   2   3
#0  0  1   2   3
#1  4  5   6   7
#2  8  9  10  11

#Method 2: build a DataFrame from a dict of column data.
#Each key names a column and its value supplies that column's elements;
#the resulting row index counts up from 0.
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })
#     A          B    C  D      E    F
#0  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#3  1.0 2013-01-02  1.0  3  train  foo

#Inspecting some properties of a DataFrame

df.head(3)  #view the first 3 rows of the data
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800

df.tail(2)  #view the last 2 rows of the data
#                   a         b         c         d
#2018-05-05 -0.044390 -0.469293  0.239750 -2.238378
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030


df2.dtypes   #view the data type of each column
#A           float64
#B    datetime64[ns]
#C           float32
#D             int32
#E          category
#F            object
#dtype: object

df2.index  #view the row index
#Int64Index([0, 1, 2, 3], dtype='int64')

df2.columns  #view the column labels
#Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

df2.values  #view the underlying value matrix
#array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
#      dtype=object)

df2.describe()  #quick descriptive statistics for the numeric columns
#         A    C    D
#count  4.0  4.0  4.0  (count)
#mean   1.0  1.0  3.0  (mean)
#std    0.0  0.0  0.0  (standard deviation)
#min    1.0  1.0  3.0
#25%    1.0  1.0  3.0
#50%    1.0  1.0  3.0
#75%    1.0  1.0  3.0
#max    1.0  1.0  3.0

df.T  #transpose the DataFrame
#   2018-05-01  2018-05-02  2018-05-03  2018-05-04  2018-05-05  2018-05-06
#a    2.690739   -0.104225   -0.026593    0.467239   -0.044390    1.222172
#b   -1.801108    0.823937   -0.052256    0.505369   -0.469293   -0.590590
#c   -0.079628    0.050710   -0.900132    1.386622    0.239750   -0.128827
#d   -0.348525   -1.254029   -0.570800   -0.289954   -2.238378   -1.495030

df2.sort_index(axis=1,ascending=False) #sort the columns into reverse label order
#     F      E  D    C          B    A
#0  foo   test  3  1.0 2013-01-02  1.0
#1  foo  train  3  1.0 2013-01-02  1.0
#2  foo   test  3  1.0 2013-01-02  1.0
#3  foo  train  3  1.0 2013-01-02  1.0

df2.sort_index(axis=0,ascending=False) #sort the rows into reverse index order
#     A          B    C  D      E    F
#3  1.0 2013-01-02  1.0  3  train  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#0  1.0 2013-01-02  1.0  3   test  foo

df2.sort_values(by='E')   #sort the rows by the values of one column
#     A          B    C  D      E    F
#0  1.0 2013-01-02  1.0  3   test  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#3  1.0 2013-01-02  1.0  3  train  foo

df.sort_values(by='a',ascending=False)  #sort df by column 'a' in descending order
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030
#2018-05-04  0.467239  0.505369  1.386622 -0.289954
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
#2018-05-05 -0.044390 -0.469293  0.239750 -2.238378
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029

#Selecting data

#put a single column name in brackets to get a Series; equivalent to df.a
df['a']
#2018-05-01    2.690739
#2018-05-02   -0.104225
#2018-05-03   -0.026593
#2018-05-04    0.467239
#2018-05-05   -0.044390
#2018-05-06    1.222172
#Freq: D, Name: a, dtype: float64
df.a
#2018-05-01    2.690739
#2018-05-02   -0.104225
#2018-05-03   -0.026593
#2018-05-04    0.467239
#2018-05-05   -0.044390
#2018-05-06    1.222172
#Freq: D, Name: a, dtype: float64

df[0:3]  #select rows with a positional slice
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800

df['20180502':'20180504']  #select rows with a label slice of the index
#                   a         b         c         d
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
#2018-05-04  0.467239  0.505369  1.386622 -0.289954

#label-based selection with .loc: supply a label for each axis you index
df.loc[dates[0]]  #select one row; equivalent to df.loc['20180501']
#a    2.690739
#b   -1.801108
#c   -0.079628
#d   -0.348525
#Name: 2018-05-01 00:00:00, dtype: float64

#use .loc to select all rows and two columns by label
#for a contiguous range use slice notation; for a non-contiguous set pass a
#list of labels (this works on the row axis too, e.g. df.loc[dates[[0, 2]]])
df.loc[:,['a','b']]  #equivalent to df.loc[:,'a':'b']
#                   a         b
#2018-05-01  2.690739 -1.801108
#2018-05-02 -0.104225  0.823937
#2018-05-03 -0.026593 -0.052256
#2018-05-04  0.467239  0.505369
#2018-05-05 -0.044390 -0.469293
#2018-05-06  1.222172 -0.590590

#use .loc to select a label range of rows and two columns
df.loc['20180502':'20180504',['a','b']]
#                   a         b
#2018-05-02 -0.104225  0.823937
#2018-05-03 -0.026593 -0.052256
#2018-05-04  0.467239  0.505369

#use .loc to select two columns of a single row
df.loc['20180502',['a','b']]
#a   -0.104225
#b    0.823937
#Name: 2018-05-02 00:00:00, dtype: float64

#getting a single scalar value
df.loc[dates[0],'a']
#2.6907392029547235

#faster scalar access (same result as the previous method)
df.at[dates[0],'a']
#2.6907392029547235

#position-based selection with .iloc: supply an integer position for each axis;
#used the same way as .loc
df.iloc[3]
#a    0.467239
#b    0.505369
#c    1.386622
#d   -0.289954
#Name: 2018-05-04 00:00:00, dtype: float64

df.iloc[3:5,0:2]
#                   a         b
#2018-05-04  0.467239  0.505369
#2018-05-05 -0.044390 -0.469293

#fast scalar access by position
df.iat[1,1]
#0.8239374131622427

#.ix mixes label- and position-based lookup (a blend of .loc and .iloc),
#but it is deprecated (removed in pandas 1.0) and best avoided
df.ix[2:4,['a','b']]
#                   a         b
#2018-05-03  0.884561  1.461977
#2018-05-04  0.109828  0.505714

df.ix['20180502':'20180504',0:2]
#                   a         b
#2018-05-02  1.109674  0.364267
#2018-05-03  0.884561  1.461977
#2018-05-04  0.109828  0.505714
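
#Since .ix is deprecated, a sketch of the same selections using only .loc/.iloc:
df.loc[df.index[2:4], ['a','b']]                #rows by position, columns by label
df.loc['20180502':'20180504', df.columns[0:2]]  #rows by label, columns by position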

#Boolean indexing
#select rows using the values of a single column
df[df.a > 0]
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-04  0.467239  0.505369  1.386622 -0.289954
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030

#select from the whole DataFrame with a Boolean condition; non-matching cells become NaN
df[df > 0]
#                   a         b         c   d
#2018-05-01  2.690739       NaN       NaN NaN
#2018-05-02       NaN  0.823937  0.050710 NaN
#2018-05-03       NaN       NaN       NaN NaN
#2018-05-04  0.467239  0.505369  1.386622 NaN
#2018-05-05       NaN       NaN  0.239750 NaN
#2018-05-06  1.222172       NaN       NaN NaN

#filter rows with the isin() method
df3 = df.copy()
df3['E'] = ['one', 'one','two','three','four','three']
df3['E'].isin(['two','four'])  
#2018-05-01    False
#2018-05-02    False
#2018-05-03     True
#2018-05-04    False
#2018-05-05     True
#2018-05-06    False
#Freq: D, Name: E, dtype: bool
df3[df3['E'].isin(['two','four'])] 
#                   a         b         c         d     E
#2018-05-03 -0.560460  1.274713 -0.203275 -1.096028   two
#2018-05-05 -0.004281  0.495102  1.256900  0.170312  four

#Setting / replacing values
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180501', periods=6))
#insert a new column; the Series is aligned on the index
df['F'] = s1
#                   a         b         c         d  F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6

#setting a value by label
df.at[dates[0],'a'] = 0  #label-based equivalent: df.loc[dates[0],'a'] = 0
#                   a         b         c         d  F
#2018-05-01  0.000000  0.233254  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6

#setting a value by position
df.iat[0,1] = 0   #position-based equivalent: df.iloc[0,1] = 0
#                   a         b         c         d  F
#2018-05-01  0.908912  0.000000  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6

#setting values with a Boolean condition
#every row where a > 1 is overwritten across the whole DataFrame
df[df.a>1] = np.nan
#                   a         b         c         d    F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1.0
#2018-05-02       NaN       NaN       NaN       NaN  NaN
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3.0
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4.0
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5.0
#2018-05-06       NaN       NaN       NaN       NaN  NaN

#where b > 1, overwrite only column 'a'
#(chained assignment such as df.a[df.b>1] = ... is unreliable; use .loc instead)
df.loc[df.b > 1, 'a'] = np.nan
#                   a         b         c         d    F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1.0
#2018-05-02       NaN       NaN       NaN       NaN  NaN
#2018-05-03       NaN  1.461977 -0.188587 -0.798350  3.0
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4.0
#2018-05-05       NaN  1.952128 -0.810650 -0.434227  5.0
#2018-05-06       NaN       NaN       NaN       NaN  NaN

#set a whole column by assigning a NumPy array
df.loc[:,'d'] = np.array([5] * len(df))
#                   a         b         c  d    F
#2018-05-01  0.908912  0.233254  2.409890  5  1.0
#2018-05-02       NaN       NaN       NaN  5  NaN
#2018-05-03       NaN  1.461977 -0.188587  5  3.0
#2018-05-04  0.109828  0.505714  0.315985  5  4.0
#2018-05-05       NaN  1.952128 -0.810650  5  5.0
#2018-05-06       NaN       NaN       NaN  5  NaN

#pandas uses np.nan to represent missing data; missing values are excluded from computations by default.

#df.reindex builds a new index and conforms the DataFrame to it.
#By default, positions without a corresponding record under the new index are assigned NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.iloc[1:3,0]=0
df1
#                   a         b         c  d    F   E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0 NaN
#2018-05-02  0.000000       NaN       NaN -5  NaN NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0 NaN
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0 NaN

#how='any' drops where any value is NaN; how='all' only where every value is NaN.
#dropna removes rows or columns holding missing values; axis selects which.
df1.dropna(axis=1,how='any')  #returns the result below without changing df1
#                  a  d
#2018-05-01 -0.908912 -5
#2018-05-02  0.000000 -5
#2018-05-03  0.000000 -5
#2018-05-04 -0.109828 -5

#drop only the columns whose values are all NaN
df1.dropna(axis=1,how='all')  #returns the result below without changing df1
#                   a         b         c  d    F
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0
#2018-05-02  0.000000       NaN       NaN -5  NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0

#replace every NaN with a specified value
df1.fillna(value=0)  #returns the result below without changing df1
#                   a         b         c  d    F    E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0  0.0
#2018-05-02  0.000000  0.000000  0.000000 -5  0.0  0.0
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0  0.0
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0  0.0

#test each cell of the DataFrame for missing values
pd.isnull(df1)  #returns the result below without changing df1
#                a      b      c      d      F     E
#2018-05-01  False  False  False  False  False  True
#2018-05-02  False   True   True  False   True  True
#2018-05-03  False  False  False  False  False  True
#2018-05-04  False  False  False  False  False  True

#with lots of data, get a single Boolean answer to whether any value is missing
np.any(df1.isnull())==True
#True

df1  #df1 itself is still unchanged
#                   a         b         c  d    F   E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0 NaN
#2018-05-02  0.000000       NaN       NaN -5  NaN NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0 NaN
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0 NaN
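
#A small extra sketch (not in the original): count the missing values per
#column instead of getting a single yes/no answer.
df1.isnull().sum()
#a    0
#b    1
#c    1
#d    0
#F    1
#E    4
#dtype: int64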

#Some statistical operations

#first, build a fresh DataFrame
dates=pd.date_range('20180501',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
df
#                   a         b         c         d
#2018-05-01  1.894068 -1.417584  0.719539  0.272695
#2018-05-02 -0.604556  0.463475 -1.492414  0.335559
#2018-05-03 -0.731261  1.866179  1.900834 -0.793897
#2018-05-04  0.093965 -1.552160  0.686152  0.876278
#2018-05-05  0.269545 -0.925049  2.492038  0.152587
#2018-05-06 -0.008658  0.566519  0.947896 -0.222369

df.mean(0)   #mean of each column (axis=0)
#a    0.152184
#b   -0.166437
#c    0.875674
#d    0.103476
#dtype: float64

df.mean(1)   #mean of each row (axis=1)
#2018-05-01    0.367179
#2018-05-02   -0.324484
#2018-05-03    0.560464
#2018-05-04    0.026059
#2018-05-05    0.497280
#2018-05-06    0.320847
#Freq: D, dtype: float64

#Counting value frequencies
s = pd.Series(np.random.randint(0, 7, size=10))
#0    1
#1    6
#2    1
#3    0
#4    0
#5    2
#6    6
#7    1
#8    4
#9    6
s.count()  #number of non-NA values in s
#10
s.value_counts()  #how many times each distinct value occurs in s
#4    4
#0    3
#5    2
#2    1

#Applying functions

df.apply(np.cumsum)  #cumulative sum along a given axis (down each column by default)
#                   a         b         c         d
#2018-05-01  1.894068 -1.417584  0.719539  0.272695
#2018-05-02  1.289511 -0.954110 -0.772875  0.608254
#2018-05-03  0.558250  0.912069  1.127959 -0.185643
#2018-05-04  0.652215 -0.640091  1.814111  0.690635
#2018-05-05  0.921760 -1.565141  4.306149  0.843223
#2018-05-06  0.913101 -0.998622  5.254045  0.620854

df.apply(lambda x: x.max() - x.min())  #the spread (max minus min) of each column
#a    2.625329
#b    3.418339
#c    3.984452
#d    1.670175
#dtype: float64
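
#A sketch (not in the original): apply also accepts axis=1 to work row-wise,
#e.g. a cumulative sum across the columns of each row.
df.apply(np.cumsum, axis=1)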

#Reading data (assumes name.xls and name.csv exist in the working directory)

pd.read_excel('name.xls')
pd.read_csv('name.csv')

#   ID   name  age  gender
#0   0   Mike   23    male
#1   1   Lily   22  female
#2   2  David   25    male
#3   3  Catty   23  female
#4   4  Kelly   21  female
#5   5   Habe   26  female
#6   6   Tony   25    male

#Writing data (to_excel / to_csv are DataFrame methods, not top-level functions)
df.to_excel('name.xls')
df.to_csv('name.csv')
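
#A minimal write-then-read roundtrip (a sketch; 'demo.csv' is a placeholder
#filename, assuming the working directory is writable):
df.to_csv('demo.csv')                           #the index is written by default
df_back = pd.read_csv('demo.csv', index_col=0)  #read it back, first column as index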

#Joining pandas objects with the concat() function
#concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, 
#       keys=None, levels=None, names=None, verify_integrity=False, copy=True)

df = pd.DataFrame(np.random.randn(10, 4))
#          0         1         2         3
#0 -1.148774 -1.636030 -1.377984 -0.036423
#1  0.427075  1.140022 -0.247616  1.380535
#2 -0.685177 -0.178192  0.960987 -0.655759
#3  0.576935 -1.300454  1.132967 -0.311107
#4 -0.056756  0.768039  1.820931 -0.306055
#5 -1.627412  1.008629 -0.584862  0.413822
#6  0.976862 -1.579591  2.364852 -1.291575
#7  0.076845 -2.311444 -1.019777  0.139305
#8 -1.876678 -0.254856 -1.594575  0.175033
#9 -1.816396 -1.480799 -1.754511 -1.150694

pieces = [df[:3], df[3:7], df[7:]]
#[          0         1         2         3
# 0 -1.148774 -1.636030 -1.377984 -0.036423
# 1  0.427075  1.140022 -0.247616  1.380535
# 2 -0.685177 -0.178192  0.960987 -0.655759,
#           0         1         2         3
# 3  0.576935 -1.300454  1.132967 -0.311107
# 4 -0.056756  0.768039  1.820931 -0.306055
# 5 -1.627412  1.008629 -0.584862  0.413822
# 6  0.976862 -1.579591  2.364852 -1.291575,
#           0         1         2         3
# 7  0.076845 -2.311444 -1.019777  0.139305
# 8 -1.876678 -0.254856 -1.594575  0.175033
# 9 -1.816396 -1.480799 -1.754511 -1.150694]

pd.concat(pieces)
#          0         1         2         3
#0 -1.148774 -1.636030 -1.377984 -0.036423
#1  0.427075  1.140022 -0.247616  1.380535
#2 -0.685177 -0.178192  0.960987 -0.655759
#3  0.576935 -1.300454  1.132967 -0.311107
#4 -0.056756  0.768039  1.820931 -0.306055
#5 -1.627412  1.008629 -0.584862  0.413822
#6  0.976862 -1.579591  2.364852 -1.291575
#7  0.076845 -2.311444 -1.019777  0.139305
#8 -1.876678 -0.254856 -1.594575  0.175033
#9 -1.816396 -1.480799 -1.754511 -1.150694
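
#Not in the original: concat can tag each piece with a key, producing a
#hierarchical index on the result.
pd.concat(pieces, keys=['p1', 'p2', 'p3'])  #index becomes (key, original label) pairs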

df1=pd.DataFrame(np.ones((3,4)),columns=['a','b','c','d'])
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
df2=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,columns=['a','b','c','d'])

res=pd.concat([df1,df2,df3],axis=0,ignore_index=True)  #stack vertically and renumber the index
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
#3  2.0  2.0  2.0  2.0
#4  2.0  2.0  2.0  2.0
#5  2.0  2.0  2.0  2.0
#6  3.0  3.0  3.0  3.0
#7  3.0  3.0  3.0  3.0
#8  3.0  3.0  3.0  3.0

#the join parameter of pd.concat ('outer' or 'inner')
df2=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,columns=['b','c','d','e'])

res=pd.concat([df2,df3],join='outer',axis=0,ignore_index=True)  #keep the union of columns
#     a    b    c    d    e
#0  2.0  2.0  2.0  2.0  NaN
#1  2.0  2.0  2.0  2.0  NaN
#2  2.0  2.0  2.0  2.0  NaN
#3  NaN  3.0  3.0  3.0  3.0
#4  NaN  3.0  3.0  3.0  3.0
#5  NaN  3.0  3.0  3.0  3.0

res=pd.concat([df2,df3],join='inner',axis=0,ignore_index=True)
#     b    c    d
#0  2.0  2.0  2.0
#1  2.0  2.0  2.0
#2  2.0  2.0  2.0
#3  3.0  3.0  3.0
#4  3.0  3.0  3.0
#5  3.0  3.0  3.0

#the join_axes parameter of pd.concat picks which DataFrame's index the result aligns on
df2=pd.DataFrame(np.ones((3,4))*2,index=[1,2,3],columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,index=[2,3,4],columns=['b','c','d','e'])
res=pd.concat([df2,df3],axis=1,join_axes=[df2.index])
#     a    b    c    d    b    c    d    e
#1  2.0  2.0  2.0  2.0  NaN  NaN  NaN  NaN
#2  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#3  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0

res=pd.concat([df2,df3],axis=1)
#     a    b    c    d    b    c    d    e
#1  2.0  2.0  2.0  2.0  NaN  NaN  NaN  NaN
#2  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#3  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#4  NaN  NaN  NaN  NaN  3.0  3.0  3.0  3.0
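
#join_axes was removed in pandas 1.0; a sketch of the documented replacement
#using reindex on the concatenated result (same df2/df3 as above):
res = pd.concat([df2, df3], axis=1).reindex(df2.index)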

#Appending rows to a DataFrame
res=df1.append([df2,df3],ignore_index=True)
#     a    b    c    d    e
#0  1.0  1.0  1.0  1.0  NaN
#1  1.0  1.0  1.0  1.0  NaN
#2  1.0  1.0  1.0  1.0  NaN
#3  2.0  2.0  2.0  2.0  NaN
#4  2.0  2.0  2.0  2.0  NaN
#5  2.0  2.0  2.0  2.0  NaN
#6  NaN  3.0  3.0  3.0  3.0
#7  NaN  3.0  3.0  3.0  3.0
#8  NaN  3.0  3.0  3.0  3.0
s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
df1.append(s1,ignore_index=True)
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
#3  1.0  2.0  3.0  4.0
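
#DataFrame.append was removed in pandas 2.0; sketches of the concat equivalents
#for the two calls above:
pd.concat([df1, df2, df3], ignore_index=True)         #same as the append above
pd.concat([df1, s1.to_frame().T], ignore_index=True)  #append the Series as a row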

#Merging with merge()

left = pd.DataFrame({'key': ['K0','K1','K2','K3'],
                     'A': ['A0','A1','A2','A3'],
                     'B': ['B0','B1','B2','B3']})
right = pd.DataFrame({'key': ['K0','K1','K2','K3'],'C': ['C0','C1','C2','C3'],
                     'D': ['D0','D1','D2','D3']})
pd.merge(left, right, on='key')
#    A   B key   C   D
#0  A0  B0  K0  C0  D0
#1  A1  B1  K1  C1  D1
#2  A2  B2  K2  C2  D2
#3  A3  B3  K3  C3  D3

left = pd.DataFrame({'key1': ['K0','K0','K1','K2'],'key2': ['K0','K1','K0','K1'],
                     'A': ['A0','A1','A2','A3'],
                     'B': ['B0','B1','B2','B3']})
right = pd.DataFrame({'key1': ['K0','K1','K1','K2'],'key2': ['K0','K0','K0','K0'],
                      'C': ['C0','C1','C2','C3'],
                      'D': ['D0','D1','D2','D3']})
#how={'inner','outer','left','right'}
pd.merge(left, right, on=['key1','key2'],how='inner')
#    A   B key1 key2   C   D
#0  A0  B0   K0   K0  C0  D0
#1  A2  B2   K1   K0  C1  D1
#2  A2  B2   K1   K0  C2  D2
pd.merge(left, right, on=['key1','key2'],how='outer')
#     A    B key1 key2    C    D
#0   A0   B0   K0   K0   C0   D0
#1   A1   B1   K0   K1  NaN  NaN
#2   A2   B2   K1   K0   C1   D1
#3   A2   B2   K1   K0   C2   D2
#4   A3   B3   K2   K1  NaN  NaN
#5  NaN  NaN   K2   K0   C3   D3

#the indicator parameter adds a _merge column recording each row's source
left = pd.DataFrame({'key': ['foo','bar','laa'], 'lval': [1, 2,3]})
right = pd.DataFrame({'key': ['foo','bar','haa'], 'rval': [4, 5,6]})
pd.merge(left, right, on='key',how='outer',indicator=True)
#   key  lval  rval      _merge
#0  foo   1.0   4.0        both
#1  bar   2.0   5.0        both
#2  laa   3.0   NaN   left_only
#3  haa   NaN   6.0  right_only

good_students = pd.DataFrame({'score': ['K0','K1','K2'], 'age': [1, 2,3]})
girls = pd.DataFrame({'score': ['K0','K1','K3'], 'age': [4, 5,6]})
pd.merge(good_students,girls, on='score',how='outer',
         suffixes=['_good_students','_girls'])
#   age_good_students score  age_girls
#0                1.0    K0        4.0
#1                2.0    K1        5.0
#2                3.0    K2        NaN
#3                NaN    K3        6.0

#Grouped computations
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
#     A      B         C         D
#0  foo    one  0.672596  0.099568
#1  bar    one -1.903051 -0.008498
#2  foo    two -0.640851 -0.246812
#3  bar  three  1.514539  1.220103
#4  foo    two  0.837496  0.754398
#5  bar    two -0.790593 -0.123624
#6  foo    one -1.037150  0.042985
#7  foo  three -1.158803 -1.120753

#group, then apply sum to each group
df.groupby('A').sum()
#            C         D
#A                      
#bar -1.179105  1.087980
#foo -1.326713 -0.470613

#grouping by multiple columns yields a MultiIndex to apply the function over
df.groupby(['A','B']).sum()
#                  C         D
#A   B                        
#bar one   -1.903051 -0.008498
#    three  1.514539  1.220103
#    two   -0.790593 -0.123624
#foo one   -0.364554  0.142553
#    three -1.158803 -1.120753
#    two    0.196645  0.507586
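
#Not in the original: agg applies several aggregations to the grouped data at once.
df.groupby('A')['C'].agg(['sum', 'mean', 'count'])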
    
tuples = list(zip(['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux','qux'],
                    ['one', 'two', 'one','two',
                     'one', 'two', 'one', 'two']))
#a multi-level index
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
#MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
#           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
#           names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
#                     A         B
#first second                    
#bar   one    -0.694858  0.527970
#      two     0.666057  1.199938
#baz   one    -0.900507  1.248533
#      two     0.858889 -1.110924
#foo   one     0.980166 -1.090929
#      two    -0.959029 -0.683805
#qux   one     0.371380 -0.617407
#      two    -0.975026 -0.507172

#the stack() method "compresses" the DataFrame's columns into the index, producing stacked data
stacked = df.stack()
#first  second   
#bar    one     A   -0.673415
#               B   -0.808613
#       two     A    0.134393
#               B   -1.670767
#baz    one     A    1.862296
#               B   -0.557928
#       two     A    0.891700
#               B   -0.733257
#foo    one     A    0.478092
#               B    0.337301
#       two     A   -0.840462
#               B   -1.702561
#qux    one     A   -1.240449
#               B    0.429788
#       two     A    0.580564
#               B    0.819463
#dtype: float64

#for stacked data the inverse of stack() is unstack(), which by default unstacks the last index level.
stacked.unstack()
#                     A         B
#first second                    
#bar   one    -0.673415 -0.808613
#      two     0.134393 -1.670767
#baz   one     1.862296 -0.557928
#      two     0.891700 -0.733257
#foo   one     0.478092  0.337301
#      two    -0.840462 -1.702561
#qux   one    -1.240449  0.429788
#      two     0.580564  0.819463
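
#unstack also accepts an explicit level to unstack (a sketch with the same
#stacked data):
stacked.unstack(1)  #unstack the 'second' level; columns become one/two
stacked.unstack(0)  #unstack the 'first' level; columns become bar/baz/foo/qux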

#Pivot tables
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
#        A  B    C         D         E
#0     one  A  foo -0.824777 -0.338986
#1     one  B  foo  0.093007 -0.557541
#2     two  C  foo -0.381217  0.605170
#3   three  A  bar  0.650311 -0.302759
#4     one  B  bar -0.070880  1.585431
#5     one  C  bar  0.367186 -0.335522
#6     two  A  foo -1.587308 -0.139651
#7   three  B  foo -0.429347  0.977659
#8     one  C  foo -0.285123 -1.319764
#9     one  A  bar -1.118810  1.101361
#10    two  B  bar  0.280708  0.738208
#11  three  C  bar -0.184328 -0.519279

#pd.pivot_table makes a pivot table from this data easily:
#choose one or more columns of the original DataFrame as the row index and
#another column as the column index, then tabulate the values
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
#C             bar       foo
#A     B                    
#one   A  2.059555 -1.002014
#      B -1.091296 -1.905552
#      C  1.561891 -1.336773
#three A  0.482432       NaN
#      B       NaN -1.072260
#      C -0.854480       NaN
#two   A       NaN  1.140933
#      B -0.591507       NaN
#      C       NaN -0.131300

#Time series
#pandas has simple, powerful, and efficient tools for resampling while
#converting frequencies (e.g. turning 1-second data into 5-minute data);
#common in finance but by no means limited to it.
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
#sum within each 10-second bin, without accumulating across bins.
ts.resample('10S').sum()
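
#Other aggregations follow the same pattern (a sketch, not in the original):
ts.resample('1Min').mean()  #downsample to 1-minute bins and average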

#pandas can hold categorical data inside a DataFrame.
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
#convert raw_grade to the category dtype.
df["grade"] = df["raw_grade"].astype("category")
#0    a
#1    b
#2    b
#3    a
#4    a
#5    e
#Name: grade, dtype: category
#Categories (3, object): [a, b, e]

#rename the categories to more meaningful names.
df["grade"].cat.categories = ["very good", "good", "very bad"]

#reorder the categories, adding the missing ones at the same time
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
#0    very good
#1         good
#2         good
#3    very good
#4    very good
#5     very bad
#Name: grade, dtype: category
#Categories (5, object): [very bad, bad, medium, good, very good]

df
#   id raw_grade      grade
#0   1         a  very good
#1   2         b       good
#2   3         b       good
#3   4         a  very good
#4   5         a  very good
#5   6         e   very bad

#sorting by a categorical column follows the category order, not lexical order.
df.sort_values(by="grade")
#   id raw_grade      grade
#0   1         a  very good
#3   4         a  very good
#4   5         a  very good
#1   2         b       good
#2   3         b       good
#5   6         e   very bad

#group by the categorical column and view the size of each group
df.groupby("grade").size()
#grade
#very good    3
#good         2
#very bad     1
#dtype: int64

#Plotting
import matplotlib.pyplot as plt

#for a Series object
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
plt.show()

#for a DataFrame object
df=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('abcd'))
df.head()
#          a         b         c         d
#0  1.967384  1.079496 -0.489001  0.255740
#1  0.222863  1.876075 -1.454127 -0.184069
#2 -0.590762  1.534659  0.698180 -0.954305
#3  0.394999 -0.167066 -0.013934 -0.244500
#4 -0.131368 -0.761295  0.116348  1.336276

df_cum=df.cumsum()
df_cum.plot()
plt.show()

ax=df_cum.plot.scatter(x='a',y='b',label='1',c='B')     #scatter of column b against a
df_cum.plot.scatter(x='a',y='c',c='G',label='2',ax=ax)  #second scatter drawn on the same axes
plt.show()


