Pandas学习(3)DataFrame操作1

import numpy as np
import pandas as pd
#创建一个以日期为索引的DataFrame,数据由randint随机生成,取值范围为1-23(上界24不包含在内)
d=pd.DataFrame(np.random.randint(1,24,size=(6,4)),
               index=pd.date_range("19980102",periods=6),
               columns=["a","b","c","d"])
print(d)
             a   b   c   d
1998-01-02  15  21  23   7
1998-01-03  11   6   5   2
1998-01-04  13   4  23  23
1998-01-05  21  20  22  20
1998-01-06   5  23  19  22
1998-01-07   4   9  11   4
#数组+Series创建多个格式的DataFrames
d2 = pd.DataFrame({'A': np.random.rand(4),
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(np.arange(1,5), index=list(range(4)), dtype='float32'),
                   'D': np.array([3] * 4, dtype='int32'),
                   'E': pd.Categorical(["test", "train", "test", "train"]),
                   'F': 'foo'})
d2
ABCDEF
00.0757412013-01-021.03testfoo
10.4100902013-01-022.03trainfoo
20.6206532013-01-023.03testfoo
30.0618352013-01-024.03trainfoo

查看数据


查看数据类型,dtypes输出每一列数据类型

print(d.dtypes)
print(d2.dtypes)
a    int32
b    int32
c    int32
d    int32
dtype: object
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

查看 DataFrame 头部

print(d.head())

查看 DataFrame 尾部,可选择数量

print(d.tail(3))
             a   b   c   d
1998-01-02  15  21  23   7
1998-01-03  11   6   5   2
1998-01-04  13   4  23  23
1998-01-05  21  20  22  20
1998-01-06   5  23  19  22
             a   b   c   d
1998-01-05  21  20  22  20
1998-01-06   5  23  19  22
1998-01-07   4   9  11   4

行索引列标签

print(d.index)
print(d.columns)
DatetimeIndex(['1998-01-02', '1998-01-03', '1998-01-04', '1998-01-05',
               '1998-01-06', '1998-01-07'],
              dtype='datetime64[ns]', freq='D')
Index(['a', 'b', 'c', 'd'], dtype='object')

输出数组对象

print(d.to_numpy())
print(d2.to_numpy())#数组形式输出
[[15 21 23  7]
 [11  6  5  2]
 [13  4 23 23]
 [21 20 22 20]
 [ 5 23 19 22]
 [ 4  9 11  4]]
[[0.07574114092524209 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [0.41008997302466055 Timestamp('2013-01-02 00:00:00') 2.0 3 'train'
  'foo']
 [0.6206525618816419 Timestamp('2013-01-02 00:00:00') 3.0 3 'test' 'foo']
 [0.061835078456274895 Timestamp('2013-01-02 00:00:00') 4.0 3 'train'
  'foo']]

describe() 快速查看数据的统计摘要:

数字数据
DataFrame.count
计算非NA /空观测值的数量。
DataFrame.max
对象中的最大值。
DataFrame.min
对象中的最小值。
DataFrame.mean
值的平均值。
DataFrame.std
观测值的标准差。
DataFrame.select_dtypes
根据列的dtype包含或排除某些列,返回DataFrame的子集。

对象的数据(例如字符串或时间戳)
结果的索引将包括count、unique、top和freq。
该top 是最常见的值。该freq是最常见的值的频率。时间戳记还包括first和last项目。

对于混合类型的数据,describe()默认只输出数字列的统计摘要:

d.describe()
abcd
count6.0000006.0000006.0000006.000000
mean11.50000013.83333317.16666713.000000
std6.3796558.4241727.4944429.674709
min4.0000004.0000005.0000002.000000
25%6.5000006.75000013.0000004.750000
50%12.00000014.50000020.50000013.500000
75%14.50000020.75000022.75000021.500000
max21.00000023.00000023.00000023.000000

25%、50%、75%为分位值(默认采用线性插值)。
以a列为例,先把6个数从小到大排序:4, 5, 11, 13, 15, 21。
一共6个数,共5个数字间隔,每个四分位间5/4=1.25个间隔。

计算25分位:
第1个四分位数为上面6个数中的第1+1.25=2.25个数
指的是第2个数+第2个和第3个数中间的0.25位置处,即:
5+0.25*(11-5)= 6.5

计算50分位:
第2个四分位数为上面6个数中的第1+1.25*2=3.5个数
指的是第3个数+第3个和第4个数中间的0.5位置处,即:
11+0.5*(13-11)=12

计算75分位:
第3个四分位数为上面6个数中的第1+1.25*3=4.75个数
指的是第4个数+第4个和第5个数中间的0.75位置处,即:
13+0.75*(15-13)=14.5

d2.describe()
ACD
count4.0000004.0000004.0
mean0.2920802.5000003.0
std0.2718461.2909940.0
min0.0618351.0000003.0
25%0.0722651.7500003.0
50%0.2429162.5000003.0
75%0.4627313.2500003.0
max0.6206534.0000003.0

转置

d.T#行标签、列索引互换
1998-01-021998-01-031998-01-041998-01-051998-01-061998-01-07
a1511132154
b216420239
c23523221911
d722320224

排序

索引轴排序

d.sort_index(axis=0, ascending=False)#行标签倒序
abcd
1998-01-0749114
1998-01-065231922
1998-01-0521202220
1998-01-041342323
1998-01-0311652
1998-01-021521237
d.sort_index(axis=1, ascending=False)#列标签倒序
             d   c   b   a
1998-01-02   7  23  21  15
1998-01-03   2   5   6  11
1998-01-04  23  23   4  13
1998-01-05  20  22  20  21
1998-01-06  22  19  23   5
1998-01-07   4  11   9   4

值排序

d.sort_values(by=['a'],axis=0)#通过by和axis参数选择排序的列标签
abcd
1998-01-0749114
1998-01-065231922
1998-01-0311652
1998-01-041342323
1998-01-021521237
1998-01-0521202220
d.sort_values(by='1998-01-02',axis=1)#通过by和axis参数选择排序的行标签
dabc
1998-01-027152123
1998-01-0321165
1998-01-042313423
1998-01-0520212022
1998-01-062252319
1998-01-0744911

索引和切片

[ ] 切片

#选择单列,产生 Series
print(d.a)
print(d['a'])
1998-01-02    15
1998-01-03    11
1998-01-04    13
1998-01-05    21
1998-01-06     5
1998-01-07     4
Freq: D, Name: a, dtype: int32
1998-01-02    15
1998-01-03    11
1998-01-04    13
1998-01-05    21
1998-01-06     5
1998-01-07     4
Freq: D, Name: a, dtype: int32

用 [ ] 切片选择多行(可以用整数位置,也可以用行标签):

d[:3]
abcd
1998-01-021521237
1998-01-0311652
1998-01-041342323
d['1998-01-03':'1998-01-05']
abcd
1998-01-0311652
1998-01-041342323
1998-01-0521202220

按标签选择切片loc

d.loc['1998-01-03':'1998-01-05']#用行标签切片
abcd
1998-01-0311652
1998-01-041342323
1998-01-0521202220
d.loc['1998-01-03':'1998-01-06', 'b':'d']#加上列标签切片
bcd
1998-01-03652
1998-01-0442323
1998-01-05202220
1998-01-06231922

使用布尔数组获取值:

print(d.loc['1998-01-04'] > 0)
print(d.loc[:, d.loc['1998-01-04'] > 0])
a    True
b    True
c    True
d    True
Name: 1998-01-04 00:00:00, dtype: bool
             a   b   c   d
1998-01-02  15  21  23   7
1998-01-03  11   6   5   2
1998-01-04  13   4  23  23
1998-01-05  21  20  22  20
1998-01-06   5  23  19  22
1998-01-07   4   9  11   4

iloc属性类似NumPy 用整数切片:

d.iloc[3]
a    21
b    20
c    22
d    20
Name: 1998-01-05 00:00:00, dtype: int32
d.iloc[3:5, 0:2]
ab
1998-01-052120
1998-01-06523
#提取指定值
d.iloc[1, 1]
6

用 isin() 插入、筛选:

d['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
d
abcdE
1998-01-021521237one
1998-01-0311652one
1998-01-041342323two
1998-01-0521202220three
1998-01-065231922four
1998-01-0749114three
d[d['E'].isin(['two', 'four'])]
abcdE
1998-01-041342323two
1998-01-065231922four
d
abcdE
1998-01-021521237one
1998-01-0311652one
1998-01-041342323two
1998-01-0521202220three
1998-01-065231922four
1998-01-0749114three

reindex更改、添加、删除指定轴的索引,并返回数据副本,即不更改原数据。

df = pd.DataFrame(np.random.randn(6, 4),
                  index=pd.date_range('20130101', periods=6),
                  columns=list('ABCD'))
#dates为1998年的日期范围;这些日期不在df的索引(2013年)中,所以reindex后A-D列全部为NaN
dates = pd.date_range("19980102", periods=6)
df1=df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
ABCDE
1998-01-02NaNNaNNaNNaN1.0
1998-01-03NaNNaNNaNNaN1.0
1998-01-04NaNNaNNaNNaNNaN
1998-01-05NaNNaNNaNNaNNaN

删除所有含缺失值的行:

df1.dropna(how='any')
ABCDE

填充缺失值:

df1.fillna(value=5)
ABCDE
1998-01-025.05.05.05.01.0
1998-01-035.05.05.05.01.0
1998-01-045.05.05.05.05.0
1998-01-055.05.05.05.05.0

3、运算


统计mean()

print(df)
print(df.mean())#列
                   A         B         C         D
2013-01-01  0.725530  0.304938  0.988725  0.749843
2013-01-02 -0.697489 -0.916037  0.019967  0.256584
2013-01-03 -0.590450 -0.261403 -0.414659 -0.344422
2013-01-04 -1.720918 -0.553150 -1.047237  0.222394
2013-01-05 -3.493505  1.665040  0.356288  0.953887
2013-01-06  0.936351 -1.692798  1.251221  1.718479
A   -0.806747
B   -0.242235
C    0.192384
D    0.592794
dtype: float64
df.mean(1)#行
2013-01-01    0.692259
2013-01-02   -0.334244
2013-01-03   -0.402733
2013-01-04   -0.774728
2013-01-05   -0.129572
2013-01-06    0.553313
Freq: D, dtype: float64

## 不同维度对象运算时,要先对齐。 此外,Pandas 自动沿指定维度广播。

s = pd.Series([1, 3, 5, np.nan, 6, 8], 
              index=pd.date_range('20130101', periods=6)).shift(2)
s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
df
ABCD
2013-01-010.7255300.3049380.9887250.749843
2013-01-02-0.697489-0.9160370.0199670.256584
2013-01-03-0.590450-0.261403-0.414659-0.344422
2013-01-04-1.720918-0.553150-1.0472370.222394
2013-01-05-3.4935051.6650400.3562880.953887
2013-01-060.936351-1.6927981.2512211.718479
df.sub(s, axis='index')
ABCD
2013-01-01NaNNaNNaNNaN
2013-01-02NaNNaNNaNNaN
2013-01-03-1.590450-1.261403-1.414659-1.344422
2013-01-04-4.720918-3.553150-4.047237-2.777606
2013-01-05-8.493505-3.334960-4.643712-4.046113
2013-01-06NaNNaNNaNNaN

合并(Merge)

# 结合(Concat)
pd.concat([df[:2],df[4:5]])
ABCD
2013-01-010.7255300.3049380.9887250.749843
2013-01-02-0.697489-0.9160370.0199670.256584
2013-01-05-3.4935051.6650400.3562880.953887

连接(join)


# 连接(join)

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
keylval
0foo1
1foo2
right
keyrval
0foo4
1foo5
pd.merge(left, right, on='key')
keylvalrval
0foo14
1foo15
2foo24
3foo25

追加(Append)

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
pd.concat([df, df], ignore_index=True)#DataFrame.append已在pandas 2.0中移除,改用pd.concat实现追加
ABCD
0-0.7610270.4300540.4527841.122863
1-2.276889-0.9435611.823242-0.716462
20.430023-0.8122280.938351-0.839029
30.1699740.890258-0.3872690.510224
4-1.6473501.135522-1.064858-0.303383
5-0.3824460.890663-0.052855-0.548905
6-0.963716-1.3072391.8308300.106964
71.481288-0.0228461.371338-0.227230
8-0.7610270.4300540.4527841.122863
9-2.276889-0.9435611.823242-0.716462
100.430023-0.8122280.938351-0.839029
110.1699740.890258-0.3872690.510224
12-1.6473501.135522-1.064858-0.303383
13-0.3824460.890663-0.052855-0.548905
14-0.963716-1.3072391.8308300.106964
151.481288-0.0228461.371338-0.227230

未完待续。。。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值