pandas的简单操作(3):数据整形、数据透视、时间序列、数据可视化、数据载入与保存

input:

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as plt

input:

tuples = list(zip(*[['bar','bar','baz','baz',
                    'foo','foo','qux','qux'],
                   ['one','two','one','two',
                   'one','two','one','two']]))
tuples

output:

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

input:

index = pd.MultiIndex.from_tuples(tuples,names=['first','second']) #双层索引
index

output:

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

input:

df = pd.DataFrame(np.random.randn(8,2),index = index ,columns = ['A','B'])
df

output:

AB
firstsecond
barone0.537622-0.577700
two2.0281410.281219
bazone-0.0705681.000496
two0.0054781.907888
fooone-0.305029-0.149257
two0.104812-0.166493
quxone-0.3403780.849778
two1.137278-0.002255

input:

stacked = df.stack() #stacked函数:把行索引和列索引进行转换
stacked

output:

first  second   
bar    one     A    0.537622
               B   -0.577700
       two     A    2.028141
               B    0.281219
baz    one     A   -0.070568
               B    1.000496
       two     A    0.005478
               B    1.907888
foo    one     A   -0.305029
               B   -0.149257
       two     A    0.104812
               B   -0.166493
qux    one     A   -0.340378
               B    0.849778
       two     A    1.137278
               B   -0.002255
dtype: float64

input:

stacked = df.unstack() #转换回来

input:

df = pd.DataFrame({'A':['one','one','two','three']*3,
                   'B':['A','B','C']*4,
                   'C':['foo','foo','foo','bar','bar','bar']*2,
                   'D': np.random.randn(12),
                   'E':np.random.randn(12)})
df

output:

ABCDE
0oneAfoo-0.7644171.005010
1oneBfoo0.774953-1.087644
2twoCfoo0.205976-1.945244
3threeAbar-0.503414-1.890938
4oneBbar0.5733730.380991
5oneCbar-0.9730880.647617
6twoAfoo0.777202-1.975924
7threeBfoo-1.1527220.639917
8oneCfoo-1.1376950.178866
9oneAbar0.266740-0.599844
10twoBbar1.459513-0.665667
11threeCbar-0.8545460.136330

input:

df.pivot_table(values=['D'],index=['A','B'], columns=['C']) #以A B 为行索引,C为列索引,值为D 数据透视表

output:

D
Cbarfoo
AB
oneA0.266740-0.764417
B0.5733730.774953
C-0.973088-1.137695
threeA-0.503414NaN
BNaN-1.152722
C-0.854546NaN
twoANaN0.777202
B1.459513NaN
CNaN0.205976

input:

df.pivot_table(values=['E'],index=['A'],columns=['C'])

output:

E
Cbarfoo
A
one0.1429210.032077
three-0.8773040.639917
two-0.665667-1.960584

input:

df[df.A =='one'].groupby('C').mean()

output:

DE
C
bar-0.0443250.142921
foo-0.3757200.032077

input:

rng = pd.date_range('20200301',periods=600, freq='s')
rng

output:

DatetimeIndex(['2020-03-01 00:00:00', '2020-03-01 00:00:01',
               '2020-03-01 00:00:02', '2020-03-01 00:00:03',
               '2020-03-01 00:00:04', '2020-03-01 00:00:05',
               '2020-03-01 00:00:06', '2020-03-01 00:00:07',
               '2020-03-01 00:00:08', '2020-03-01 00:00:09',
               ...
               '2020-03-01 00:09:50', '2020-03-01 00:09:51',
               '2020-03-01 00:09:52', '2020-03-01 00:09:53',
               '2020-03-01 00:09:54', '2020-03-01 00:09:55',
               '2020-03-01 00:09:56', '2020-03-01 00:09:57',
               '2020-03-01 00:09:58', '2020-03-01 00:09:59'],
              dtype='datetime64[ns]', length=600, freq='S')

input:

s = pd.Series(np.random.randint(0,500,len(rng)),index=rng) #时间序列
s

output:

2020-03-01 00:00:00    434
2020-03-01 00:00:01    200
2020-03-01 00:00:02    429
2020-03-01 00:00:03    498
2020-03-01 00:00:04    431
2020-03-01 00:00:05     33
2020-03-01 00:00:06    431
2020-03-01 00:00:07    360
2020-03-01 00:00:08    101
2020-03-01 00:00:09     89
2020-03-01 00:00:10     27
2020-03-01 00:00:11    257
2020-03-01 00:00:12    493
2020-03-01 00:00:13    309
2020-03-01 00:00:14    431
2020-03-01 00:00:15    346
2020-03-01 00:00:16    378
2020-03-01 00:00:17    267
2020-03-01 00:00:18     25
2020-03-01 00:00:19    271
2020-03-01 00:00:20    425
2020-03-01 00:00:21    169
2020-03-01 00:00:22     92
2020-03-01 00:00:23    220
2020-03-01 00:00:24    391
2020-03-01 00:00:25     81
2020-03-01 00:00:26    283
2020-03-01 00:00:27    304
2020-03-01 00:00:28    344
2020-03-01 00:00:29    316
                      ... 
2020-03-01 00:09:30     84
2020-03-01 00:09:31     70
2020-03-01 00:09:32     74
2020-03-01 00:09:33     57
2020-03-01 00:09:34    110
2020-03-01 00:09:35    111
2020-03-01 00:09:36    173
2020-03-01 00:09:37    239
2020-03-01 00:09:38    273
2020-03-01 00:09:39    200
2020-03-01 00:09:40    246
2020-03-01 00:09:41     75
2020-03-01 00:09:42    392
2020-03-01 00:09:43    262
2020-03-01 00:09:44    480
2020-03-01 00:09:45    154
2020-03-01 00:09:46     66
2020-03-01 00:09:47     51
2020-03-01 00:09:48    178
2020-03-01 00:09:49    395
2020-03-01 00:09:50    375
2020-03-01 00:09:51    228
2020-03-01 00:09:52    485
2020-03-01 00:09:53    161
2020-03-01 00:09:54    483
2020-03-01 00:09:55     92
2020-03-01 00:09:56     45
2020-03-01 00:09:57    488
2020-03-01 00:09:58      3
2020-03-01 00:09:59    469
Freq: S, Length: 600, dtype: int64

input:

s.resample('2Min',how='mean') #因为上面数据太多不利于分析,所以进行重采样,这里每两分钟采样一次,可以取平均值采样的方法或者求和的方法
/Users/mac/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...).mean()
  """Entry point for launching an IPython kernel.





2020-03-01 00:00:00    246.425000
2020-03-01 00:02:00    272.933333
2020-03-01 00:04:00    253.975000
2020-03-01 00:06:00    268.491667
2020-03-01 00:08:00    242.025000
Freq: 2T, dtype: float64

input:

rng = pd.period_range('2000Q1','2016Q1',freq='Q')
rng

output:

PeriodIndex(['2000Q1', '2000Q2', '2000Q3', '2000Q4', '2001Q1', '2001Q2',
             '2001Q3', '2001Q4', '2002Q1', '2002Q2', '2002Q3', '2002Q4',
             '2003Q1', '2003Q2', '2003Q3', '2003Q4', '2004Q1', '2004Q2',
             '2004Q3', '2004Q4', '2005Q1', '2005Q2', '2005Q3', '2005Q4',
             '2006Q1', '2006Q2', '2006Q3', '2006Q4', '2007Q1', '2007Q2',
             '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3', '2008Q4',
             '2009Q1', '2009Q2', '2009Q3', '2009Q4', '2010Q1', '2010Q2',
             '2010Q3', '2010Q4', '2011Q1', '2011Q2', '2011Q3', '2011Q4',
             '2012Q1', '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2',
             '2013Q3', '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4',
             '2015Q1', '2015Q2', '2015Q3', '2015Q4', '2016Q1'],
            dtype='period[Q-DEC]', freq='Q-DEC')

input:

rng.to_timestamp()

output:

DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01',
               '2001-01-01', '2001-04-01', '2001-07-01', '2001-10-01',
               '2002-01-01', '2002-04-01', '2002-07-01', '2002-10-01',
               '2003-01-01', '2003-04-01', '2003-07-01', '2003-10-01',
               '2004-01-01', '2004-04-01', '2004-07-01', '2004-10-01',
               '2005-01-01', '2005-04-01', '2005-07-01', '2005-10-01',
               '2006-01-01', '2006-04-01', '2006-07-01', '2006-10-01',
               '2007-01-01', '2007-04-01', '2007-07-01', '2007-10-01',
               '2008-01-01', '2008-04-01', '2008-07-01', '2008-10-01',
               '2009-01-01', '2009-04-01', '2009-07-01', '2009-10-01',
               '2010-01-01', '2010-04-01', '2010-07-01', '2010-10-01',
               '2011-01-01', '2011-04-01', '2011-07-01', '2011-10-01',
               '2012-01-01', '2012-04-01', '2012-07-01', '2012-10-01',
               '2013-01-01', '2013-04-01', '2013-07-01', '2013-10-01',
               '2014-01-01', '2014-04-01', '2014-07-01', '2014-10-01',
               '2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01',
               '2016-01-01'],
              dtype='datetime64[ns]', freq='QS-OCT')

input:

pd.Timestamp('20160301')-pd.Timestamp('20160201') #pandas里时间运算非常方便

output:

Timedelta('29 days 00:00:00')

input:

pd.Timestamp('20160301')+pd.Timedelta(days=5)

output:

Timestamp('2016-03-06 00:00:00')

input:

df = pd.DataFrame({'id':[1,2,3,4,5,6],'raw_grade':['a','b','b','a','a','d']})
df

output:

idraw_grade
01a
12b
23b
34a
45a
56d

input:

df['grade'] = df.raw_grade.astype('category') #增加grade 列
df

output:

idraw_gradegrade
01aa
12bb
23bb
34aa
45aa
56dd

input:

df.grade

output:

0    a
1    b
2    b
3    a
4    a
5    d
Name: grade, dtype: category
Categories (3, object): [a, b, d]

input:

df.grade.cat.categories

output:

Index(['a', 'b', 'd'], dtype='object')

input:

df.grade.cat.categories =['very good','good','bad'] #可以替换成任意内容 这里是以id值排序
df

output:

idraw_gradegrade
01avery good
12bgood
23bgood
34avery good
45avery good
56dbad

input:

df.sort_values(by='grade',ascending=False) #以grade排序

output:

idraw_gradegrade
56dbad
23bgood
12bgood
45avery good
34avery good
01avery good

input:

s = pd.Series(np.random.randn(1000),index=pd.date_range('20000101',periods=1000))
s

output:

2000-01-01   -0.532396
2000-01-02   -0.438004
2000-01-03    1.211030
2000-01-04    0.736409
2000-01-05   -0.721090
2000-01-06   -0.978725
2000-01-07    1.189487
2000-01-08    0.017212
2000-01-09    0.014859
2000-01-10    0.807690
2000-01-11    1.201846
2000-01-12   -1.622267
2000-01-13   -0.670456
2000-01-14    1.860123
2000-01-15   -0.355513
2000-01-16   -1.697407
2000-01-17   -0.349072
2000-01-18    0.493644
2000-01-19   -0.202796
2000-01-20    0.443553
2000-01-21    0.579038
2000-01-22    0.015395
2000-01-23   -1.703529
2000-01-24    0.200692
2000-01-25   -0.649918
2000-01-26    0.103963
2000-01-27    0.044044
2000-01-28   -0.211696
2000-01-29    0.016201
2000-01-30    0.350856
                ...   
2002-08-28   -0.145103
2002-08-29   -1.800730
2002-08-30   -1.216044
2002-08-31   -0.052010
2002-09-01    0.932612
2002-09-02   -1.063313
2002-09-03   -1.739508
2002-09-04   -1.398266
2002-09-05   -0.209526
2002-09-06   -0.048596
2002-09-07   -1.707406
2002-09-08   -0.117668
2002-09-09    0.208738
2002-09-10   -1.751093
2002-09-11   -0.143911
2002-09-12   -1.176041
2002-09-13    1.148417
2002-09-14    0.785341
2002-09-15   -1.165482
2002-09-16    1.184074
2002-09-17   -1.252482
2002-09-18   -2.602539
2002-09-19   -0.335973
2002-09-20    1.390396
2002-09-21    0.289487
2002-09-22   -0.121280
2002-09-23   -0.229991
2002-09-24    0.684140
2002-09-25   -0.391517
2002-09-26    0.003391
Freq: D, Length: 1000, dtype: float64

input:

s = s.cumsum() #累加求和
s
2000-01-01    -0.532396
2000-01-02    -0.970400
2000-01-03     0.240631
2000-01-04     0.977040
2000-01-05     0.255949
2000-01-06    -0.722775
2000-01-07     0.466711
2000-01-08     0.483923
2000-01-09     0.498782
2000-01-10     1.306472
2000-01-11     2.508318
2000-01-12     0.886052
2000-01-13     0.215595
2000-01-14     2.075718
2000-01-15     1.720205
2000-01-16     0.022798
2000-01-17    -0.326274
2000-01-18     0.167370
2000-01-19    -0.035426
2000-01-20     0.408127
2000-01-21     0.987165
2000-01-22     1.002560
2000-01-23    -0.700968
2000-01-24    -0.500276
2000-01-25    -1.150195
2000-01-26    -1.046232
2000-01-27    -1.002187
2000-01-28    -1.213884
2000-01-29    -1.197683
2000-01-30    -0.846828
                ...    
2002-08-28   -13.662979
2002-08-29   -15.463709
2002-08-30   -16.679753
2002-08-31   -16.731763
2002-09-01   -15.799151
2002-09-02   -16.862464
2002-09-03   -18.601972
2002-09-04   -20.000239
2002-09-05   -20.209765
2002-09-06   -20.258361
2002-09-07   -21.965767
2002-09-08   -22.083435
2002-09-09   -21.874697
2002-09-10   -23.625790
2002-09-11   -23.769701
2002-09-12   -24.945741
2002-09-13   -23.797324
2002-09-14   -23.011983
2002-09-15   -24.177466
2002-09-16   -22.993391
2002-09-17   -24.245873
2002-09-18   -26.848412
2002-09-19   -27.184385
2002-09-20   -25.793989
2002-09-21   -25.504502
2002-09-22   -25.625782
2002-09-23   -25.855773
2002-09-24   -25.171632
2002-09-25   -25.563150
2002-09-26   -25.559759
Freq: D, Length: 1000, dtype: float64

input:

s.plot() #数据可视化

output:

<matplotlib.axes._subplots.AxesSubplot at 0x116707f60>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jV2jN8Ku-1604852354405)(output_25_1.png)]

input:

df = pd.DataFrame(np.random.randn(100,4),columns=list('ABCD'))
df

output:

ABCD
01.564117-0.463773-0.091550-1.857433
1-1.1844170.937198-1.788886-0.380206
2-0.2960870.4126531.936886-1.718887
31.671879-0.2039850.015520-1.426769
40.258212-0.599114-1.5715012.172205
5-1.0998292.1781600.215177-1.248541
60.8604450.958647-1.098336-0.023771
71.956408-1.9141370.1942430.855831
80.396876-0.4550981.1245360.526166
90.1364700.0955131.2972671.154325
10-0.214534-0.9508001.6079500.828318
11-1.2980540.100998-0.5150460.476188
121.163658-1.994909-0.7580150.036630
130.9054101.2329970.633099-0.450712
14-0.4887990.904530-0.226913-0.027718
15-1.3382790.607139-0.7142410.678542
161.5729351.6677770.597482-1.846735
171.2996550.7553830.553495-0.816247
18-0.1380440.213682-0.6854820.187415
190.8492370.442252-1.808233-0.408269
20-1.257406-0.005541-0.588033-1.417979
21-1.725082-0.411008-0.4157891.219006
22-1.093083-0.0174840.1774620.444620
23-0.674220-0.209629-0.9877570.804393
240.858732-0.9616481.765393-0.946591
250.835091-0.1195950.276223-0.499499
261.2930171.683998-1.2399530.578722
271.4550330.492396-1.3971962.129232
280.035302-0.8131831.1816860.199079
29-0.246104-1.071980-0.092767-0.871213
...............
700.523143-1.5838700.037297-0.359236
71-1.186931-0.921222-1.634302-0.231714
720.647745-0.667466-0.555059-0.169008
730.7032170.035210-0.5505780.064030
74-1.2396520.458127-0.102111-0.812303
750.730404-0.1094440.3939240.902011
761.503345-0.163920-0.381882-0.758856
770.7580331.4120920.8936780.966148
78-2.095461-1.3437931.0092651.021844
79-0.2434990.8120860.713547-0.315148
80-1.842734-0.3825230.8426320.326414
810.2933150.715708-0.603879-0.822051
820.320864-0.750298-0.9373380.231808
83-0.506367-0.3785030.7998950.465828
84-1.2193930.2956971.3116842.146081
850.186327-0.1265601.477611-1.200722
860.0736171.448020-0.5283520.512307
870.7247850.075949-0.618813-1.652172
881.238503-0.3190080.157979-0.025204
890.2484230.278020-0.4535520.588062
90-0.629882-2.1253690.2372800.030703
911.500885-0.7250881.012414-1.572393
92-0.751267-0.235083-1.0529011.387676
930.1514001.313931-0.424040-0.983791
940.667903-0.1055571.6143670.426456
951.8715450.072405-0.303439-0.347861
96-0.0518751.7186290.1983100.383673
971.3901780.866700-0.019794-0.125795
98-0.571301-1.722597-2.2292140.400489
990.649530-0.262061-0.694856-0.566315

100 rows × 4 columns

input:

df.to_csv('data.cav') #保存到磁盘

input:

%ls #查看内容

output:

[30m[43mAdlm[m[m/                       Untitled.ipynb
[34mApplications[m[m/               [34mVirtual Machines.localized[m[m/
[34mApplications (Parallels)[m[m/   [34manaconda3[m[m/
[34mCreative Cloud Files[m[m/       data.cav
[34mDesktop[m[m/                    matlab_crash_dump.1087-1
[34mDocuments[m[m/                  matlab_crash_dump.1095-1
[34mDownloads[m[m/                  matlab_crash_dump.661-1
[34mLibrary[m[m/                    [34mopt[m[m/
[34mMovies[m[m/                     pandas_tutorial(2).ipynb
[34mMusic[m[m/                      pandas_tutorials(1).ipynb
[34mPictures[m[m/                   pandas_tutorials(3).ipynb
[34mPublic[m[m/                     pandas_tutorials_1.ipynb
[34mQt5.14.1[m[m/

.14.1[m[m/

input:

%more data.csv   #查看内容

input:

pd.read_csv('data.csv') #读回来
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值