(本专栏是我的慕课学习笔记,后续发现不足之处会更新)
pandas基础操作:
>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range("20200806", periods=8) # 连续的8天
>>> # randn为标准正态分布,index为key值,columns为属性值
>>> df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
>>> print("整个表:\n", df) # 打印整个表
A B C D E
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
2020-08-09 -0.012218 -1.224213 -1.319811 0.323090 0.063972
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873
2020-08-12 -0.572311 -2.700809 0.258386 0.114140 0.487276
2020-08-13 0.050315 0.475426 0.581819 -2.242387 1.220767
>>> print("\n表的前3行:\n", df.head(3)) # 打印表的前3行
A B C D E
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
>>> print("\n表的后3行:\n", df.tail(3)) # 打印表的后3行
A B C D E
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873
2020-08-12 -0.572311 -2.700809 0.258386 0.114140 0.487276
2020-08-13 0.050315 0.475426 0.581819 -2.242387 1.220767
>>> print("\nindex值:\n", df.index) # 只打印index值
index值:
DatetimeIndex(['2020-08-06', '2020-08-07', '2020-08-08', '2020-08-09',
'2020-08-10', '2020-08-11', '2020-08-12', '2020-08-13'],
dtype='datetime64[ns]', freq='D')
>>> print("\nvalues值:\n", df.values) # 只打印values值
[[ 0.70728844 0.4999616 0.13061059 1.76455701 -1.1653581 ]
[-0.99735496 1.22000868 -0.45235699 -0.43247666 1.31265841]
[-0.52401434 0.37303048 -0.01251882 -0.05932329 -2.15578928]
[-0.01221819 -1.22421337 -1.31981069 0.32308968 0.06397238]
[ 0.29782485 0.54606854 -0.19202779 0.41367944 -1.11302353]
[-1.07056955 -0.53002278 -0.30193276 0.53860645 -1.23187343]
[-0.57231052 -2.70080883 0.2583857 0.11414044 0.48727611]
[ 0.05031547 0.47542649 0.58181917 -2.24238729 1.22076725]]
>>> print("\n转置:\n", df.T) # 转置
2020-08-06 2020-08-07 2020-08-08 ... 2020-08-11 2020-08-12 2020-08-13
A 0.707288 -0.997355 -0.524014 ... -1.070570 -0.572311 0.050315
B 0.499962 1.220009 0.373030 ... -0.530023 -2.700809 0.475426
C 0.130611 -0.452357 -0.012519 ... -0.301933 0.258386 0.581819
D 1.764557 -0.432477 -0.059323 ... 0.538606 0.114140 -2.242387
E -1.165358 1.312658 -2.155789 ... -1.231873 0.487276 1.220767
[5 rows x 8 columns]
>>> print("\n按A属性排序:\n", df.sort_values("A")) # 按A属性递增排序
# 注:此说明针对下面的sort_index(ascending=False):axis=0时,按index倒序输出;axis=1时,按列名倒序,属性从"ABCDE"变成"EDCBA"
A B C D E
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-12 -0.572311 -2.700809 0.258386 0.114140 0.487276
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
2020-08-09 -0.012218 -1.224213 -1.319811 0.323090 0.063972
2020-08-13 0.050315 0.475426 0.581819 -2.242387 1.220767
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358
>>> print("\n对index排序:\n", df.sort_index(axis=0, ascending=False))
A B C D E
2020-08-13 0.050315 0.475426 0.581819 -2.242387 1.220767
2020-08-12 -0.572311 -2.700809 0.258386 0.114140 0.487276
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024
2020-08-09 -0.012218 -1.224213 -1.319811 0.323090 0.063972
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358
>>> # 所有属性值的数量,平均值,标准差,最小值,下四分位数,中位数,上四分位数,最大值
>>> print("\n描述:\n", df.describe())
A B C D E
count 8.000000 8.000000 8.000000 8.000000 8.000000
mean -0.265130 -0.167569 -0.163479 0.052486 -0.322671
std 0.629314 1.267794 0.571121 1.126887 1.275246
min -1.070570 -2.700809 -1.319811 -2.242387 -2.155789
25% -0.678572 -0.703570 -0.339539 -0.152612 -1.181987
50% -0.268116 0.424228 -0.102273 0.218615 -0.524526
75% 0.112193 0.511488 0.162554 0.444911 0.670649
max 0.707288 1.220009 0.581819 1.764557 1.312658
>>> print("\n只打印A属性:\n", df["A"]) # 只打印A属性
2020-08-06 0.707288
2020-08-07 -0.997355
2020-08-08 -0.524014
2020-08-09 -0.012218
2020-08-10 0.297825
2020-08-11 -1.070570
2020-08-12 -0.572311
2020-08-13 0.050315
Freq: D, Name: A, dtype: float64
>>> print("\n使用切片方式打印前3行:\n", df[:3]) # 打印前3行
A B C D E
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
>>> print("\n打印指定index区间:\n", df["20200808":"20200811"]) # 打印指定index区间(按标签切片,首尾两端都包含)
A B C D E
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
2020-08-09 -0.012218 -1.224213 -1.319811 0.323090 0.063972
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873
>>> print("\n打印第1行数据:\n", df.loc[dates[0]]) # 打印第1行数据
A 0.707288
B 0.499962
C 0.130611
D 1.764557
E -1.165358
Name: 2020-08-06 00:00:00, dtype: float64
>>> print("\n打印指定index指定属性:\n", df.loc["20200808":"20200811", ["A", "C"]]) # 打印指定index指定属性
A C
2020-08-08 -0.524014 -0.012519
2020-08-09 -0.012218 -1.319811
2020-08-10 0.297825 -0.192028
2020-08-11 -1.070570 -0.301933
>>> print("\n打印指定index指定属性的值:\n", df.at[dates[0], "B"]) # 打印指定index指定属性的值
0.4999616023174961
>>> print("\n通过下标打印指定index指定属性:\n", df.iloc[1:4, 2:5]) # 第2~4个index的第3~5列属性
C D E
2020-08-07 -0.452357 -0.432477 1.312658
2020-08-08 -0.012519 -0.059323 -2.155789
2020-08-09 -1.319811 0.323090 0.063972
>>> print("\n设定条件打印指定值:\n", df[(df.B > 0) & (df.C < 0)]) # 设定条件打印指定值
A B C D E
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024
>>> print("\n设定条件值打印总表(不满足条件为NaN):\n", df[df > 0]) # 设定条件值打印总表(不满足条件为NaN)
A B C D E
2020-08-06 0.707288 0.499962 0.130611 1.764557 NaN
2020-08-07 NaN 1.220009 NaN NaN 1.312658
2020-08-08 NaN 0.373030 NaN NaN NaN
2020-08-09 NaN NaN NaN 0.323090 0.063972
2020-08-10 0.297825 0.546069 NaN 0.413679 NaN
2020-08-11 NaN NaN NaN 0.538606 NaN
2020-08-12 NaN NaN 0.258386 0.114140 0.487276
2020-08-13 0.050315 0.475426 0.581819 NaN 1.220767
>>> s1 = pd.Series(list(range(101, 109)), index=pd.date_range("20200806", periods=8))
>>> df["F"] = s1 # 添加一列"F",属性为101~108
>>> print("\n添加一列'F',属性为101~108:\n", df)
A B C D E F
2020-08-06 0.707288 0.499962 0.130611 1.764557 -1.165358 101
2020-08-07 -0.997355 1.220009 -0.452357 -0.432477 1.312658 102
2020-08-08 -0.524014 0.373030 -0.012519 -0.059323 -2.155789 103
2020-08-09 -0.012218 -1.224213 -1.319811 0.323090 0.063972 104
2020-08-10 0.297825 0.546069 -0.192028 0.413679 -1.113024 105
2020-08-11 -1.070570 -0.530023 -0.301933 0.538606 -1.231873 106
2020-08-12 -0.572311 -2.700809 0.258386 0.114140 0.487276 107
2020-08-13 0.050315 0.475426 0.581819 -2.242387 1.220767 108
>>> df[df < 0] = -df # 将表格中所有的负数改为绝对值
>>> print("\n将表格中所有的负数改为绝对值:\n", df)
A B C D E F
2020-08-06 0.707288 0.499962 0.130611 1.764557 1.165358 101
2020-08-07 0.997355 1.220009 0.452357 0.432477 1.312658 102
2020-08-08 0.524014 0.373030 0.012519 0.059323 2.155789 103
2020-08-09 0.012218 1.224213 1.319811 0.323090 0.063972 104
2020-08-10 0.297825 0.546069 0.192028 0.413679 1.113024 105
2020-08-11 1.070570 0.530023 0.301933 0.538606 1.231873 106
2020-08-12 0.572311 2.700809 0.258386 0.114140 0.487276 107
2020-08-13 0.050315 0.475426 0.581819 2.242387 1.220767 108
缺失值处理:
>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range("20200806", periods=8) # 连续的8天
>>> # randn为标准正态分布,index为key值,columns为属性值
>>> df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
>>> print("整个表:\n", df) # 打印整个表
A B C D E
2020-08-06 1.480559 2.011120 -0.446669 0.660951 -0.206286
2020-08-07 -0.776498 1.367207 -1.829035 1.205739 -1.435127
2020-08-08 -0.313502 -0.639334 0.623946 0.238073 -1.537367
2020-08-09 0.756416 0.748446 0.561026 -0.651465 0.567481
2020-08-10 0.472599 -1.200840 -1.622537 0.198806 0.991217
2020-08-11 -0.179116 0.477361 0.019794 -0.522681 0.750547
2020-08-12 0.833937 0.567930 0.427194 0.592522 1.281558
2020-08-13 0.797320 -0.833195 0.389737 0.516310 0.276850
>>> df1 = df.reindex(index=dates[:5], columns=list("AC") + ["G"]) # 5行数据,有ACG三个属性
>>> df1.loc[dates[0:3], "G"] = 1234 # 给前3行的G属性赋值为1234
>>> print("整个表:\n", df1) # 打印整个表
A C G
2020-08-06 1.480559 -0.446669 1234.0
2020-08-07 -0.776498 -1.829035 1234.0
2020-08-08 -0.313502 0.623946 1234.0
2020-08-09 0.756416 0.561026 NaN
2020-08-10 0.472599 -1.622537 NaN
>>> print("丢弃有空值的行:\n", df1.dropna()) # 丢弃有空值的行
A C G
2020-08-06 1.480559 -0.446669 1234.0
2020-08-07 -0.776498 -1.829035 1234.0
2020-08-08 -0.313502 0.623946 1234.0
>>> print("给空值赋指定值:\n", df1.fillna(value=9999)) # 用指定值(9999)填充空值
A C G
2020-08-06 1.480559 -0.446669 1234.0
2020-08-07 -0.776498 -1.829035 1234.0
2020-08-08 -0.313502 0.623946 1234.0
2020-08-09 0.756416 0.561026 9999.0
2020-08-10 0.472599 -1.622537 9999.0
表格数据分析:
>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range("20200806", periods=5) # 连续的5天
>>> # randn为标准正态分布,index为key值,columns为属性值
>>> df = pd.DataFrame(np.random.randn(5, 3), index=dates, columns=list("ABC"))
>>> print("整个表:\n", df) # 打印整个表
A B C
2020-08-06 -0.231186 -0.971756 1.702146
2020-08-07 -0.431149 -0.025855 -0.887768
2020-08-08 -0.693589 -0.253785 1.686983
2020-08-09 -1.332854 -1.194220 0.606153
2020-08-10 0.580246 -0.080983 -0.154577
>>> print("\n各属性平均值:\n", df.mean()) # 各属性平均值
A -0.421706
B -0.505320
C 0.590587
dtype: float64
>>> print("\n各属性方差:\n", df.var()) # 各属性方差
A 0.486102
B 0.291341
C 1.294673
dtype: float64
>>> s = pd.Series([1, 3, 5, np.nan, 7], index=dates)
>>> print("\n创建新表格:\n", s) # 打印新创建的Series(一维带index的数据,不是DataFrame表格)
2020-08-06 1.0
2020-08-07 3.0
2020-08-08 5.0
2020-08-09 NaN
2020-08-10 7.0
Freq: D, dtype: float64
>>> print("\n值后移2位,前2位为空,后2位抛弃\n", s.shift(2)) # 值后移2位,前2位为空,后2位抛弃
2020-08-06 NaN
2020-08-07 NaN
2020-08-08 1.0
2020-08-09 3.0
2020-08-10 5.0
Freq: D, dtype: float64
>>> print("\n每个值出现的次数:\n", s.value_counts()) # 每个值出现的次数(默认不统计NaN,所以结果中没有空值)
7.0 1
5.0 1
3.0 1
1.0 1
dtype: int64
>>> print("\n每个属性的累加值:\n", df.apply(np.cumsum)) # 每个属性的累加值
A B C
2020-08-06 -0.231186 -0.971756 1.702146
2020-08-07 -0.662336 -0.997611 0.814378
2020-08-08 -1.355924 -1.251396 2.501361
2020-08-09 -2.688778 -2.445616 3.107514
2020-08-10 -2.108532 -2.526599 2.952937
>>> print("\n每个属性的极差值:\n", df.apply(lambda x: x.max() - x.min())) # 每个属性的极差值
A 1.913100
B 1.168365
C 2.589914
dtype: float64
氷鸢鸢鸢
2020.8.6