# -*- coding: utf-8 -*-
"""
Created on Wed May 23 09:41:24 2018
@author: Loulch C.C
"""
import numpy as np
import pandas as pd
# pd.Series builds a one-dimensional labeled array with a default integer index
s = pd.Series([1, 3, 5, np.nan, 6, 8])
#0    1.0
#1    3.0
#2    5.0
#3    NaN
#4    6.0
#5    8.0
#dtype: float64
# Building DataFrames
dates = pd.date_range('20180501', periods=6)
# Method 1: from a value matrix plus an explicit row index and column labels
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('abcd'))
#                   a         b         c         d
#2018-05-01  0.001029  1.004829 -0.842121  0.484472
#2018-05-02  0.302972 -0.569209 -1.590139  0.296509
#2018-05-03 -1.250095  0.368155  0.334202  0.502626
#2018-05-04 -0.970516  0.898725  0.983519 -3.175286
#2018-05-05  0.446584 -0.155259  0.521254  0.366596
#2018-05-06  0.415108  0.698861  0.044719  2.223663
# When index and columns are omitted, both default to 0..n-1:
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
#   0  1   2   3
#0  0  1   2   3
#1  4  5   6   7
#2  8  9  10  11
# Method 2: from a dict of column data.
# Each key becomes a column label, each value supplies that column's data;
# the resulting row index defaults to 0..n-1.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1.0, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
#     A          B    C  D      E    F
#0  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#3  1.0 2013-01-02  1.0  3  train  foo
#Inspecting basic properties of a DataFrame
df.head(3) #view the first 3 rows
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
df.tail(2) #view the last 2 rows
#                   a         b         c         d
#2018-05-05 -0.044390 -0.469293  0.239750 -2.238378
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030
df2.dtypes #per-column data types
#A           float64
#B    datetime64[ns]
#C           float32
#D             int32
#E          category
#F            object
#dtype: object
df2.index #the row index labels
#Int64Index([0, 1, 2, 3], dtype='int64')
df2.columns #the column labels
#Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
df2.values #the underlying data as a numpy array
#array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
#       dtype=object)
df2.describe() #quick summary statistics (numeric columns only)
#         A    C    D
#count  4.0  4.0  4.0   (count)
#mean   1.0  1.0  3.0   (mean)
#std    0.0  0.0  0.0   (standard deviation)
#min    1.0  1.0  3.0
#25%    1.0  1.0  3.0
#50%    1.0  1.0  3.0
#75%    1.0  1.0  3.0
#max    1.0  1.0  3.0
df.T #transpose the DataFrame
#   2018-05-01  2018-05-02  2018-05-03  2018-05-04  2018-05-05  2018-05-06
#a    2.690739   -0.104225   -0.026593    0.467239   -0.044390    1.222172
#b   -1.801108    0.823937   -0.052256    0.505369   -0.469293   -0.590590
#c   -0.079628    0.050710   -0.900132    1.386622    0.239750   -0.128827
#d   -0.348525   -1.254029   -0.570800   -0.289954   -2.238378   -1.495030
df2.sort_index(axis=1,ascending=False) #sort the columns into reverse order
#     F      E  D    C          B    A
#0  foo   test  3  1.0 2013-01-02  1.0
#1  foo  train  3  1.0 2013-01-02  1.0
#2  foo   test  3  1.0 2013-01-02  1.0
#3  foo  train  3  1.0 2013-01-02  1.0
df2.sort_index(axis=0,ascending=False) #sort the rows into reverse index order
#     A          B    C  D      E    F
#3  1.0 2013-01-02  1.0  3  train  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#0  1.0 2013-01-02  1.0  3   test  foo
df2.sort_values(by='E') #sort the rows by the values of column 'E'
#     A          B    C  D      E    F
#0  1.0 2013-01-02  1.0  3   test  foo
#2  1.0 2013-01-02  1.0  3   test  foo
#1  1.0 2013-01-02  1.0  3  train  foo
#3  1.0 2013-01-02  1.0  3  train  foo
df.sort_values(by='a',ascending=False) #sort df by column 'a', descending
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030
#2018-05-04  0.467239  0.505369  1.386622 -0.289954
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
#2018-05-05 -0.044390 -0.469293  0.239750 -2.238378
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#Selecting data
#A single column name in brackets returns a Series; equivalent to df.a
df['a']
#2018-05-01    2.690739
#2018-05-02   -0.104225
#2018-05-03   -0.026593
#2018-05-04    0.467239
#2018-05-05   -0.044390
#2018-05-06    1.222172
#Freq: D, Name: a, dtype: float64
df.a
#2018-05-01    2.690739
#2018-05-02   -0.104225
#2018-05-03   -0.026593
#2018-05-04    0.467239
#2018-05-05   -0.044390
#2018-05-06    1.222172
#Freq: D, Name: a, dtype: float64
df[0:3] #select rows with a positional slice
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
df['20180502':'20180504'] #select rows by an index-label range (both endpoints inclusive)
#                   a         b         c         d
#2018-05-02 -0.104225  0.823937  0.050710 -1.254029
#2018-05-03 -0.026593 -0.052256 -0.900132 -0.570800
#2018-05-04  0.467239  0.505369  1.386622 -0.289954
#Label-based selection with .loc: supply a row label and/or column label
df.loc[dates[0]] #equivalent to df.loc['20180501']
#a    2.690739
#b   -1.801108
#c   -0.079628
#d   -0.348525
#Name: 2018-05-01 00:00:00, dtype: float64
#.loc selecting all rows and two columns.
#Contiguous labels can use slice syntax; non-contiguous labels go in a list.
#NOTE(review): the original claimed row selection accepts only slices, not
#lists — lists of row labels also work in modern pandas; verify for your version.
df.loc[:,['a','b']] #equivalent to df.loc[:,'a':'b']
#                   a         b
#2018-05-01  2.690739 -1.801108
#2018-05-02 -0.104225  0.823937
#2018-05-03 -0.026593 -0.052256
#2018-05-04  0.467239  0.505369
#2018-05-05 -0.044390 -0.469293
#2018-05-06  1.222172 -0.590590
#.loc with a row-label range and two columns
df.loc['20180502':'20180504',['a','b']]
#                   a         b
#2018-05-02 -0.104225  0.823937
#2018-05-03 -0.026593 -0.052256
#2018-05-04  0.467239  0.505369
#.loc selecting two columns of a single row
df.loc['20180502',['a','b']]
#a   -0.104225
#b    0.823937
#Name: 2018-05-02 00:00:00, dtype: float64
#Getting a single scalar value
df.loc[dates[0],'a']
#2.6907392029547235
#Faster scalar access (same result as the previous call)
df.at[dates[0],'a']
#2.6907392029547235
#Positional selection with .iloc: supply integer positions for rows/columns.
#Usage mirrors .loc, but with positions instead of labels.
df.iloc[3]
#a    0.467239
#b    0.505369
#c    1.386622
#d   -0.289954
#Name: 2018-05-04 00:00:00, dtype: float64
df.iloc[3:5,0:2]
#                   a         b
#2018-05-04  0.467239  0.505369
#2018-05-05 -0.044390 -0.469293
#Fast scalar access by position
df.iat[1,1]
#0.8239374131622427
#Mixed label/positional indexing used to be done with df.ix, but .ix was
#deprecated in pandas 0.20 and removed in pandas 1.0; combine .iloc/.loc instead.
df.iloc[2:4][['a','b']] #rows by position, columns by label (was df.ix[2:4,['a','b']])
#                   a         b
#2018-05-03  0.884561  1.461977
#2018-05-04  0.109828  0.505714
df.loc['20180502':'20180504', df.columns[0:2]] #rows by label, columns by position (was df.ix['20180502':'20180504',0:2])
#                   a         b
#2018-05-02  1.109674  0.364267
#2018-05-03  0.884561  1.461977
#2018-05-04  0.109828  0.505714
#Boolean indexing
#Select rows using a single column's values
df[df.a > 0]
#                   a         b         c         d
#2018-05-01  2.690739 -1.801108 -0.079628 -0.348525
#2018-05-04  0.467239  0.505369  1.386622 -0.289954
#2018-05-06  1.222172 -0.590590 -0.128827 -1.495030
#Select values from the whole frame with a boolean condition
#(entries where the condition is False become NaN)
df[df > 0]
#                   a         b         c         d
#2018-05-01  2.690739       NaN       NaN       NaN
#2018-05-02       NaN  0.823937  0.050710       NaN
#2018-05-03       NaN       NaN       NaN       NaN
#2018-05-04  0.467239  0.505369  1.386622       NaN
#2018-05-05       NaN       NaN  0.239750       NaN
#2018-05-06  1.222172       NaN       NaN       NaN
#Filtering with the isin() method
df3 = df.copy()
df3['E'] = ['one', 'one','two','three','four','three']
df3['E'].isin(['two','four'])
#2018-05-01    False
#2018-05-02    False
#2018-05-03     True
#2018-05-04    False
#2018-05-05     True
#2018-05-06    False
#Freq: D, Name: E, dtype: bool
df3[df3['E'].isin(['two','four'])]
#                   a         b         c         d     E
#2018-05-03 -0.560460  1.274713 -0.203275 -1.096028   two
#2018-05-05 -0.004281  0.495102  1.256900  0.170312  four
#Setting / replacing values
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180501', periods=6))
#Insert a new column (values are aligned on the index)
df['F'] = s1
#                   a         b         c         d  F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6
#Set a value by label
df.at[dates[0],'a'] = 0 #label equivalent: df.loc[dates[0],'a'] = 0
#                   a         b         c         d  F
#2018-05-01  0.000000  0.233254  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6
#Set a value by position
df.iat[0,1] = 0 #positional equivalent: df.iloc[0,1] = 0
#                   a         b         c         d  F
#2018-05-01  0.908912  0.000000  2.409890 -0.070449  1
#2018-05-02  1.109674  0.364267 -0.355951 -1.412754  2
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5
#2018-05-06  1.537330 -0.711214  1.236344  1.027827  6
#Set values with a boolean condition.
#Replace every value in rows where a > 1 across the whole frame:
df[df.a>1] = np.nan
#                   a         b         c         d    F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1.0
#2018-05-02       NaN       NaN       NaN       NaN  NaN
#2018-05-03  0.884561  1.461977 -0.188587 -0.798350  3.0
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4.0
#2018-05-05  0.573763  1.952128 -0.810650 -0.434227  5.0
#2018-05-06       NaN       NaN       NaN       NaN  NaN
#Replace column 'a' values in rows where b > 1.
#NOTE(review): this is chained assignment; it raises SettingWithCopyWarning and
#may not update df under copy-on-write — prefer df.loc[df.b>1, 'a'] = np.nan.
df.a[df.b>1] = np.nan
#                   a         b         c         d    F
#2018-05-01  0.908912  0.233254  2.409890 -0.070449  1.0
#2018-05-02       NaN       NaN       NaN       NaN  NaN
#2018-05-03       NaN  1.461977 -0.188587 -0.798350  3.0
#2018-05-04  0.109828  0.505714  0.315985 -0.397701  4.0
#2018-05-05       NaN  1.952128 -0.810650 -0.434227  5.0
#2018-05-06       NaN       NaN       NaN       NaN  NaN
#Overwrite a column by assigning a numpy array
df.loc[:,'d'] = np.array([5] * len(df))
#                   a         b         c  d    F
#2018-05-01  0.908912  0.233254  2.409890  5  1.0
#2018-05-02       NaN       NaN       NaN  5  NaN
#2018-05-03       NaN  1.461977 -0.188587  5  3.0
#2018-05-04  0.109828  0.505714  0.315985  5  4.0
#2018-05-05       NaN  1.952128 -0.810650  5  5.0
#2018-05-06       NaN       NaN       NaN  5  NaN
#pandas uses np.nan to represent missing data; missing values are excluded
#from computations by default.
#df.reindex conforms the DataFrame to a new index.
#By default, new labels with no corresponding record are filled with NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns))
df1.iloc[1:3,0]=0
df1
#                   a         b         c  d    F   E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0 NaN
#2018-05-02  0.000000       NaN       NaN -5  NaN NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0 NaN
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0 NaN
#how='any': drop when any value is NaN; how='all': drop only when all are NaN.
#Drop every column containing a missing value (axis chooses rows or columns):
df1.dropna(axis=1,how='any') #returns a new frame; df1 itself is unchanged
#                   a  d
#2018-05-01 -0.908912 -5
#2018-05-02  0.000000 -5
#2018-05-03  0.000000 -5
#2018-05-04 -0.109828 -5
#Drop columns that are entirely NaN
df1.dropna(axis=1,how='all') #returns a new frame; df1 itself is unchanged
#                   a         b         c  d    F
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0
#2018-05-02  0.000000       NaN       NaN -5  NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0
#Replace each NaN with the given value
df1.fillna(value=0) #returns a new frame; df1 itself is unchanged
#                   a         b         c  d    F    E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0  0.0
#2018-05-02  0.000000  0.000000  0.000000 -5  0.0  0.0
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0  0.0
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0  0.0
#Test which entries are missing
pd.isnull(df1) #returns a boolean frame; df1 itself is unchanged
#                a      b      c      d      F     E
#2018-05-01  False  False  False  False  False  True
#2018-05-02  False   True   True  False   True  True
#2018-05-03  False  False  False  False  False  True
#2018-05-04  False  False  False  False  False  True
#For large data, check whether any value at all is missing:
np.any(df1.isnull())==True
#True
df1 #df1 is still the original, unmodified frame
#                   a         b         c  d    F   E
#2018-05-01 -0.908912 -0.233254 -2.409890 -5 -1.0 NaN
#2018-05-02  0.000000       NaN       NaN -5  NaN NaN
#2018-05-03  0.000000 -1.461977 -0.188587 -5 -3.0 NaN
#2018-05-04 -0.109828 -0.505714 -0.315985 -5 -4.0 NaN
#Basic statistics
#Build a fresh DataFrame first
dates=pd.date_range('20180501',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
df
#                   a         b         c         d
#2018-05-01  1.894068 -1.417584  0.719539  0.272695
#2018-05-02 -0.604556  0.463475 -1.492414  0.335559
#2018-05-03 -0.731261  1.866179  1.900834 -0.793897
#2018-05-04  0.093965 -1.552160  0.686152  0.876278
#2018-05-05  0.269545 -0.925049  2.492038  0.152587
#2018-05-06 -0.008658  0.566519  0.947896 -0.222369
df.mean(0) #per-column means (axis=0)
#a    0.152184
#b   -0.166437
#c    0.875674
#d    0.103476
#dtype: float64
df.mean(1) #per-row means (axis=1)
#2018-05-01    0.367179
#2018-05-02   -0.324484
#2018-05-03    0.560464
#2018-05-04    0.026059
#2018-05-05    0.497280
#2018-05-06    0.320847
#Freq: D, dtype: float64
#Counting value frequencies
s = pd.Series(np.random.randint(0, 7, size=10))
#0    1
#1    6
#2    1
#3    0
#4    0
#5    2
#6    6
#7    1
#8    4
#9    6
s.count() #number of (non-NA) values in s
#10
s.value_counts() #occurrence count of each distinct value
#1    3
#6    3
#0    2
#4    1
#2    1
#dtype: int64
#Applying functions
df.apply(np.cumsum)# cumulative sum down each column
#                   a         b         c         d
#2018-05-01  1.894068 -1.417584  0.719539  0.272695
#2018-05-02  1.289511 -0.954110 -0.772875  0.608254
#2018-05-03  0.558250  0.912069  1.127959 -0.185643
#2018-05-04  0.652215 -0.640091  1.814111  0.690635
#2018-05-05  0.921760 -1.565141  4.306149  0.843223
#2018-05-06  0.913101 -0.998622  5.254045  0.620854
df.apply(lambda x: x.max() - x.min()) #range (max - min) of each column
#a    2.625329
#b    3.418339
#c    3.984452
#d    1.670175
#dtype: float64
#Reading data from files
pd.read_excel('name.xls') #requires an Excel engine (xlrd/openpyxl) to be installed
pd.read_csv('name.csv')
#   ID   name  age  gender
#0   0   Mike   23    male
#1   1   Lily   22  female
#2   2  David   25    male
#3   3  Catty   23  female
#4   4  Kelly   21  female
#5   5   Habe   26  female
#6   6   Tony   25    male
#Writing data to files.
#to_excel/to_csv are DataFrame methods, not pandas module-level functions
#(pd.to_excel / pd.to_csv do not exist and raise AttributeError).
df.to_excel('name.xls')
df.to_csv('name.csv')
#Concatenating pandas objects with concat()
#concat(objs, axis=0, join='outer', ignore_index=False,
#       keys=None, levels=None, names=None, verify_integrity=False, copy=True)
df = pd.DataFrame(np.random.randn(10, 4))
#          0         1         2         3
#0 -1.148774 -1.636030 -1.377984 -0.036423
#1  0.427075  1.140022 -0.247616  1.380535
#2 -0.685177 -0.178192  0.960987 -0.655759
#3  0.576935 -1.300454  1.132967 -0.311107
#4 -0.056756  0.768039  1.820931 -0.306055
#5 -1.627412  1.008629 -0.584862  0.413822
#6  0.976862 -1.579591  2.364852 -1.291575
#7  0.076845 -2.311444 -1.019777  0.139305
#8 -1.876678 -0.254856 -1.594575  0.175033
#9 -1.816396 -1.480799 -1.754511 -1.150694
#Split the frame into pieces, then reassemble them
pieces = [df[:3], df[3:7], df[7:]]
#[          0         1         2         3
# 0 -1.148774 -1.636030 -1.377984 -0.036423
# 1  0.427075  1.140022 -0.247616  1.380535
# 2 -0.685177 -0.178192  0.960987 -0.655759,
#           0         1         2         3
# 3  0.576935 -1.300454  1.132967 -0.311107
# 4 -0.056756  0.768039  1.820931 -0.306055
# 5 -1.627412  1.008629 -0.584862  0.413822
# 6  0.976862 -1.579591  2.364852 -1.291575,
#           0         1         2         3
# 7  0.076845 -2.311444 -1.019777  0.139305
# 8 -1.876678 -0.254856 -1.594575  0.175033
# 9 -1.816396 -1.480799 -1.754511 -1.150694]
pd.concat(pieces)
#          0         1         2         3
#0 -1.148774 -1.636030 -1.377984 -0.036423
#1  0.427075  1.140022 -0.247616  1.380535
#2 -0.685177 -0.178192  0.960987 -0.655759
#3  0.576935 -1.300454  1.132967 -0.311107
#4 -0.056756  0.768039  1.820931 -0.306055
#5 -1.627412  1.008629 -0.584862  0.413822
#6  0.976862 -1.579591  2.364852 -1.291575
#7  0.076845 -2.311444 -1.019777  0.139305
#8 -1.876678 -0.254856 -1.594575  0.175033
#9 -1.816396 -1.480799 -1.754511 -1.150694
df1=pd.DataFrame(np.ones((3,4)),columns=['a','b','c','d'])
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
df2=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,columns=['a','b','c','d'])
#Stack the frames vertically, renumbering the rows 0..8.
#(BUGFIX: the original passed axis=1, which would place the frames side by
# side; the output shown below requires axis=0.)
res=pd.concat([df1,df2,df3],axis=0,ignore_index=True)
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
#3  2.0  2.0  2.0  2.0
#4  2.0  2.0  2.0  2.0
#5  2.0  2.0  2.0  2.0
#6  3.0  3.0  3.0  3.0
#7  3.0  3.0  3.0  3.0
#8  3.0  3.0  3.0  3.0
#The join parameter of pd.concat ('outer' keeps all columns, 'inner' only shared ones)
df2=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,columns=['b','c','d','e'])
#(BUGFIX: the original passed axis=1 here, which would place the frames side
# by side; the 6-row output shown below requires axis=0.)
res=pd.concat([df2,df3],join='outer',axis=0,ignore_index=True)
#     a    b    c    d    e
#0  2.0  2.0  2.0  2.0  NaN
#1  2.0  2.0  2.0  2.0  NaN
#2  2.0  2.0  2.0  2.0  NaN
#3  NaN  3.0  3.0  3.0  3.0
#4  NaN  3.0  3.0  3.0  3.0
#5  NaN  3.0  3.0  3.0  3.0
res=pd.concat([df2,df3],join='inner',axis=0,ignore_index=True)
#     b    c    d
#0  2.0  2.0  2.0
#1  2.0  2.0  2.0
#2  2.0  2.0  2.0
#3  3.0  3.0  3.0
#4  3.0  3.0  3.0
#5  3.0  3.0  3.0
#Aligning a concat result to one frame's row index.
#(The join_axes=[df2.index] argument was removed in pandas 1.0; calling
# .reindex(df2.index) on the result achieves the same thing.)
df2=pd.DataFrame(np.ones((3,4))*2,index=[1,2,3],columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*3,index=[2,3,4],columns=['b','c','d','e'])
res=pd.concat([df2,df3],axis=1).reindex(df2.index)
#     a    b    c    d    b    c    d    e
#1  2.0  2.0  2.0  2.0  NaN  NaN  NaN  NaN
#2  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#3  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
res=pd.concat([df2,df3],axis=1) #without alignment, the row indexes are unioned
#     a    b    c    d    b    c    d    e
#1  2.0  2.0  2.0  2.0  NaN  NaN  NaN  NaN
#2  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#3  2.0  2.0  2.0  2.0  3.0  3.0  3.0  3.0
#4  NaN  NaN  NaN  NaN  3.0  3.0  3.0  3.0
#Appending rows to a DataFrame.
#(DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement.)
res=pd.concat([df1,df2,df3],ignore_index=True)
#     a    b    c    d    e
#0  1.0  1.0  1.0  1.0  NaN
#1  1.0  1.0  1.0  1.0  NaN
#2  1.0  1.0  1.0  1.0  NaN
#3  2.0  2.0  2.0  2.0  NaN
#4  2.0  2.0  2.0  2.0  NaN
#5  2.0  2.0  2.0  2.0  NaN
#6  NaN  3.0  3.0  3.0  3.0
#7  NaN  3.0  3.0  3.0  3.0
#8  NaN  3.0  3.0  3.0  3.0
s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
#Append a Series as a single new row (transpose it into a one-row frame first)
pd.concat([df1,s1.to_frame().T],ignore_index=True)
#     a    b    c    d
#0  1.0  1.0  1.0  1.0
#1  1.0  1.0  1.0  1.0
#2  1.0  1.0  1.0  1.0
#3  1.0  2.0  3.0  4.0
#Merging (database-style joins)
#(BUGFIX: the original built `left` with columns key1/key2 but merged
# on='key', which raises KeyError; the output shown requires a 'key' column.)
left = pd.DataFrame({'key': ['K0','K1','K2','K3'],
'A': ['A0','A1','A2','A3'],
'B': ['B0','B1','B2','B3']})
right = pd.DataFrame({'key': ['K0','K1','K2','K3'],'C': ['C0','C1','C2','C3'],
'D': ['D0','D1','D2','D3']})
pd.merge(left, right, on='key')
#    A   B key   C   D
#0  A0  B0  K0  C0  D0
#1  A1  B1  K1  C1  D1
#2  A2  B2  K2  C2  D2
#3  A3  B3  K3  C3  D3
#Merging on multiple key columns
left = pd.DataFrame({'key1': ['K0','K0','K1','K2'],'key2': ['K0','K1','K0','K1'],
'A': ['A0','A1','A2','A3'],
'B': ['B0','B1','B2','B3']})
right = pd.DataFrame({'key1': ['K0','K1','K1','K2'],'key2': ['K0','K0','K0','K0'],
'C': ['C0','C1','C2','C3'],
'D': ['D0','D1','D2','D3']})
#how is one of {'inner','outer','left','right'}
pd.merge(left, right, on=['key1','key2'],how='inner')
#    A   B key1 key2   C   D
#0  A0  B0   K0   K0  C0  D0
#1  A2  B2   K1   K0  C1  D1
#2  A2  B2   K1   K0  C2  D2
pd.merge(left, right, on=['key1','key2'],how='outer')
#     A    B key1 key2    C    D
#0   A0   B0   K0   K0   C0   D0
#1   A1   B1   K0   K1  NaN  NaN
#2   A2   B2   K1   K0   C1   D1
#3   A2   B2   K1   K0   C2   D2
#4   A3   B3   K2   K1  NaN  NaN
#5  NaN  NaN   K2   K0   C3   D3
#The indicator parameter adds a _merge column recording each row's source
left = pd.DataFrame({'key': ['foo','bar','laa'], 'lval': [1, 2,3]})
right = pd.DataFrame({'key': ['foo','bar','haa'], 'rval': [4, 5,6]})
pd.merge(left, right, on='key',how='outer',indicator=True)
#   key  lval  rval      _merge
#0  foo   1.0   4.0        both
#1  bar   2.0   5.0        both
#2  laa   3.0   NaN   left_only
#3  haa   NaN   6.0  right_only
#suffixes disambiguate overlapping non-key column names
good_students = pd.DataFrame({'score': ['K0','K1','K2'], 'age': [1, 2,3]})
girls = pd.DataFrame({'score': ['K0','K1','K3'], 'age': [4, 5,6]})
pd.merge(good_students,girls, on='score',how='outer',
suffixes=['_good_students','_girls'])
#   age_good_students score  age_girls
#0                1.0    K0        4.0
#1                2.0    K1        5.0
#2                3.0    K2        NaN
#3                NaN    K3        6.0
#Grouped computation
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
#     A      B         C         D
#0  foo    one  0.672596  0.099568
#1  bar    one -1.903051 -0.008498
#2  foo    two -0.640851 -0.246812
#3  bar  three  1.514539  1.220103
#4  foo    two  0.837496  0.754398
#5  bar    two -0.790593 -0.123624
#6  foo    one -1.037150  0.042985
#7  foo  three -1.158803 -1.120753
#Group, then apply sum to each group.
#NOTE(review): in pandas >= 2.0, sum() without numeric_only=True also
#"sums" object column B by string concatenation — pass numeric_only=True
#to reproduce the output below; confirm against your pandas version.
df.groupby('A').sum()
#            C         D
#A
#bar -1.179105  1.087980
#foo -1.326713 -0.470613
#Group by multiple columns, producing a hierarchical index
df.groupby(['A','B']).sum()
#                  C         D
#A   B
#bar one   -1.903051 -0.008498
#    three  1.514539  1.220103
#    two   -0.790593 -0.123624
#foo one   -0.364554  0.142553
#    three -1.158803 -1.120753
#    two    0.196645  0.507586
#Build (first, second) label pairs for a hierarchical index
tuples = list(zip(['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux','qux'],
['one', 'two', 'one','two',
'one', 'two', 'one', 'two']))
#Hierarchical (multi-level) index
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
#MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
#           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
#           names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
#                     A         B
#first second
#bar   one    -0.694858  0.527970
#      two     0.666057  1.199938
#baz   one    -0.900507  1.248533
#      two     0.858889 -1.110924
#foo   one     0.980166 -1.090929
#      two    -0.959029 -0.683805
#qux   one     0.371380 -0.617407
#      two    -0.975026 -0.507172
#stack() "compresses" the columns into a new innermost index level,
#producing stacked data
stacked = df.stack()
#first  second
#bar    one     A   -0.673415
#               B   -0.808613
#       two     A    0.134393
#               B   -1.670767
#baz    one     A    1.862296
#               B   -0.557928
#       two     A    0.891700
#               B   -0.733257
#foo    one     A    0.478092
#               B    0.337301
#       two     A   -0.840462
#               B   -1.702561
#qux    one     A   -1.240449
#               B    0.429788
#       two     A    0.580564
#               B    0.819463
#dtype: float64
#unstack() is the inverse of stack(); by default it unstacks the last level
stacked.unstack()
#                     A         B
#first second
#bar   one    -0.673415 -0.808613
#      two     0.134393 -1.670767
#baz   one     1.862296 -0.557928
#      two     0.891700 -0.733257
#foo   one     0.478092  0.337301
#      two    -0.840462 -1.702561
#qux   one    -1.240449  0.429788
#      two     0.580564  0.819463
#Pivot tables
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
#        A  B    C         D         E
#0     one  A  foo -0.824777 -0.338986
#1     one  B  foo  0.093007 -0.557541
#2     two  C  foo -0.381217  0.605170
#3   three  A  bar  0.650311 -0.302759
#4     one  B  bar -0.070880  1.585431
#5     one  C  bar  0.367186 -0.335522
#6     two  A  foo -1.587308 -0.139651
#7   three  B  foo -0.429347  0.977659
#8     one  C  foo -0.285123 -1.319764
#9     one  A  bar -1.118810  1.101361
#10    two  B  bar  0.280708  0.738208
#11  three  C  bar -0.184328 -0.519279
#pd.pivot_table builds a spreadsheet-style pivot table from this data:
#pick column(s) of the original frame for the row index, a column for the
#column index, and a column to aggregate as the values
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
#C             bar       foo
#A     B
#one   A  2.059555 -1.002014
#      B -1.091296 -1.905552
#      C  1.561891 -1.336773
#three A  0.482432       NaN
#      B       NaN -1.072260
#      C -0.854480       NaN
#two   A       NaN  1.140933
#      B -0.591507       NaN
#      C       NaN -0.131300
#Time series
#pandas has simple, powerful, efficient tools for resampling during
#frequency conversion (e.g. converting 1-second data to 5-minute data).
#This is common in finance but not limited to it.
#NOTE(review): the 'S'/'10S' frequency aliases are deprecated in favor of
#lowercase 's' in pandas 2.2+ — confirm against your pandas version.
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
#Sum within each 10-second bin (binned sums, not a cumulative sum)
ts.resample('10S').sum()
#pandas supports categorical data inside a DataFrame
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
#Convert raw_grade to a categorical dtype
df["grade"] = df["raw_grade"].astype("category")
#0    a
#1    b
#2    b
#3    a
#4    a
#5    e
#Name: grade, dtype: category
#Categories (3, object): [a, b, e]
#Rename the categories to more meaningful names.
#(BUGFIX: assigning to df["grade"].cat.categories was deprecated and removed
# in pandas 2.0; rename_categories is the supported replacement.)
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
#Reorder the categories and add the missing ones at the same time
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
#0    very good
#1         good
#2         good
#3    very good
#4    very good
#5     very bad
#Name: grade, dtype: category
#Categories (5, object): [very bad, bad, medium, good, very good]
df
#   id raw_grade      grade
#0   1         a  very good
#1   2         b       good
#2   3         b       good
#3   4         a  very good
#4   5         a  very good
#5   6         e   very bad
#Sorting a categorical column uses the category order, not lexical order
#(ascending: 'very bad' < 'good' < 'very good')
df.sort_values(by="grade")
#   id raw_grade      grade
#5   6         e   very bad
#1   2         b       good
#2   3         b       good
#0   1         a  very good
#3   4         a  very good
#4   5         a  very good
#Group by the categorical column and count each group's size
#(empty categories are also reported)
df.groupby("grade").size()
#grade
#very bad     1
#bad          0
#medium       0
#good         2
#very good    3
#dtype: int64
#Plotting
import matplotlib.pyplot as plt
#Plotting a Series
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
plt.show()
#Plotting a DataFrame
df=pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('abcd'))
df.head()
#          a         b         c         d
#0  1.967384  1.079496 -0.489001  0.255740
#1  0.222863  1.876075 -1.454127 -0.184069
#2 -0.590762  1.534659  0.698180 -0.954305
#3  0.394999 -0.167066 -0.013934 -0.244500
#4 -0.131368 -0.761295  0.116348  1.336276
df_cum=df.cumsum()
df_cum.plot()
plt.show()
#Draw two scatter plots onto the same axes by passing ax=
#NOTE(review): c='B'/'G' are intended as colors; matplotlib's single-letter
#color codes are lowercase ('b', 'g') — confirm these render as expected.
ax=df_cum.plot.scatter(x='a',y='b',label='1',c='B')
df_cum.plot.scatter(x='a',y='c',c='G',label='2',ax=ax)
plt.show()
#pandas introductory code demonstration
#(blog footer: latest recommended article published 2023-10-07 01:17:24)