pandas简单操作(1):创建、删除、处理等
input:
import os

import pandas as pd

# Build a one-dimensional labelled array from a plain list and display it.
values = [1, 3, 4, 5, 6]
a = pd.Series(values)
print(a)

# Display the absolute path that '.' (the current directory) resolves to.
current_dir = os.path.abspath('.')
print(current_dir)
output:
0 1
1 3
2 4
3 5
4 6
dtype: int64
/Users/mac
input:
# Build a daily DatetimeIndex two ways: by explicit end date (datas_1)
# and by number of periods (datas). Both start on the same day.
first_day = '20160301'
datas_1 = pd.date_range(start=first_day, end='20160306')
datas = pd.date_range(start=first_day, periods=6)
# Bare expression so the notebook displays the index.
datas_1
output:
DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
'2016-03-05', '2016-03-06'],
dtype='datetime64[ns]', freq='D')
input:
import numpy as np

# 6x4 matrix of random samples: one row per date in `datas`,
# one column per letter in 'ABCD'.
samples = np.random.randn(6, 4)
data = pd.DataFrame(samples, index=datas, columns=list('ABCD'))
# Other attributes worth inspecting:
#data.shape
#data.values
# Bare expression so the notebook displays the frame.
data
output:
A | B | C | D | |
---|---|---|---|---|
2016-03-01 | -0.271952 | -1.217686 | -0.037302 | -0.179983 |
2016-03-02 | -1.803772 | 0.689236 | -2.532408 | -0.819420 |
2016-03-03 | -0.596284 | 0.747474 | -2.169957 | 0.173793 |
2016-03-04 | 0.349248 | -1.013605 | 0.963074 | -0.306067 |
2016-03-05 | 0.981165 | 0.348213 | 0.011119 | 0.721957 |
2016-03-06 | -0.576315 | 1.312909 | -0.407155 | 0.793780 |
input:
# Peek at the beginning and end of the frame.
#data.head()
data.head(2) #show only the first two rows
#data.tail()
data.tail(2)#show only the last two rows; as the final expression, this is the result displayed below
output:
A | B | C | D | |
---|---|---|---|---|
2016-03-05 | 0.981165 | 0.348213 | 0.011119 | 0.721957 |
2016-03-06 | -0.576315 | 1.312909 | -0.407155 | 0.793780 |
input:
data.index #row labels
data.columns #column labels
# Rows of the describe() summary:
#   count       number of non-missing values in the column
#   mean        arithmetic mean
#   std         standard deviation (not the variance)
#   min / max   smallest / largest value
#   25%/50%/75% quartiles (50% is the median)
data.describe() #per-column summary statistics
output:
A | B | C | D | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | -0.319652 | 0.144424 | -0.695438 | 0.064010 |
std | 0.947295 | 1.026036 | 1.364903 | 0.625212 |
min | -1.803772 | -1.217686 | -2.532408 | -0.819420 |
25% | -0.591291 | -0.673150 | -1.729257 | -0.274546 |
50% | -0.424133 | 0.518724 | -0.222229 | -0.003095 |
75% | 0.193948 | 0.732915 | -0.000986 | 0.584916 |
max | 0.981165 | 1.312909 | 0.963074 | 0.793780 |
input:
data.T #transpose: the dates become column headers and A-D become row labels
output:
2016-03-01 00:00:00 | 2016-03-02 00:00:00 | 2016-03-03 00:00:00 | 2016-03-04 00:00:00 | 2016-03-05 00:00:00 | 2016-03-06 00:00:00 | |
---|---|---|---|---|---|---|
A | -0.271952 | -1.803772 | -0.596284 | 0.349248 | 0.981165 | -0.576315 |
B | -1.217686 | 0.689236 | 0.747474 | -1.013605 | 0.348213 | 1.312909 |
C | -0.037302 | -2.532408 | -2.169957 | 0.963074 | 0.011119 | -0.407155 |
D | -0.179983 | -0.819420 | 0.173793 | -0.306067 | 0.721957 | 0.793780 |
input:
data.sort_index(axis=1,ascending=False) #sort by column labels (descending)
data.sort_index(axis=0,ascending=True) #sort by row labels (ascending); axis=0 is the row index, not the columns
data.sort_values(by='A') #sort rows by the values in column A (ascending); this is the result displayed below
output:
A | B | C | D | |
---|---|---|---|---|
2016-03-02 | -1.803772 | 0.689236 | -2.532408 | -0.819420 |
2016-03-03 | -0.596284 | 0.747474 | -2.169957 | 0.173793 |
2016-03-06 | -0.576315 | 1.312909 | -0.407155 | 0.793780 |
2016-03-01 | -0.271952 | -1.217686 | -0.037302 | -0.179983 |
2016-03-04 | 0.349248 | -1.013605 | 0.963074 | -0.306067 |
2016-03-05 | 0.981165 | 0.348213 | 0.011119 | 0.721957 |
input:
data['A'] #select a single column; data.A is an alternative spelling
data[2:4] #positional slice: rows at positions 2 and 3 (the end position 4 is excluded)
data.iloc[2:4] #same positional slice via iloc, which accepts integer positions only; the original author recommends this form as more efficient
data['20160302':'20160305'] #select rows by a label slice
data.loc['20160302':'20160305'] #label slice via loc, which accepts labels only; the original author recommends this form as more efficient
# NOTE(review): the output recorded below shows the two rows of the [2:4]
# slice rather than the result of the final .loc expression - confirm which
# statement produced it.
output:
A | B | C | D | |
---|---|---|---|---|
2016-03-03 | -0.596284 | 0.747474 | -2.169957 | 0.173793 |
2016-03-04 | 0.349248 | -1.013605 | 0.963074 | -0.306067 |
input:
data.loc[:,['B','C']] #all rows, columns B and C only
data.loc['20160302':'20160304',['B','C']] #columns B and C for the rows labelled 2016-03-02 through 2016-03-04 (both endpoints appear in the output below)
output:
B | C | |
---|---|---|
2016-03-02 | 0.689236 | -2.532408 |
2016-03-03 | 0.747474 | -2.169957 |
2016-03-04 | -1.013605 | 0.963074 |
input:
data.loc['20160302','B'] #fetch one specific value from the table
data.at[pd.Timestamp('20160302'),'B'] #scalar accessor the original author recommends as faster; per the original note the row key must be passed as a Timestamp
output:
0.6892360151598519
input:
# Overwrite a single cell addressed by integer position (row 0, column 0).
data.iat[0, 0] = 100
# Replace column A with one value per row: 0 through 5.
data['A'] = range(6)
# Assign a scalar to column A so that every row holds 100.
data['A'] = 100
# Column specification mixing a scalar, a timestamp, a range and an array.
d = dict(
    A=1,
    B=pd.Timestamp('20201106'),
    C=range(4),
    D=np.arange(4),
)
# Bare expression so the notebook displays the dict.
d
output:
{'A': 1,
'B': Timestamp('2020-11-06 00:00:00'),
'C': range(0, 4),
'D': array([0, 1, 2, 3])}
input:
# Build a DataFrame from the dict d. In the output below the scalar A and the
# single Timestamp B are repeated on each of the four rows supplied by C and D.
df = pd.DataFrame(d)
df
output:
A | B | C | D | |
---|---|---|---|---|
0 | 1 | 2020-11-06 | 0 | 0 |
1 | 1 | 2020-11-06 | 1 | 1 |
2 | 1 | 2020-11-06 | 2 | 2 |
3 | 1 | 2020-11-06 | 3 | 3 |
input:
# Data type of each column.
df.dtypes
# Column A as a Series; as the final expression, this is the result displayed below.
df.A
output:
0 1
1 1
2 1
3 1
Name: A, dtype: int64