03.pandas
笔记
import pandas as pd
import numpy as np
# --- Building Series and DataFrame objects ---

# A Series from a plain list gets a default integer index (0..n-1).
s1_values = [1, 2, 2, 2, 5, 4, 5, 5, 5, 5, 6, 8]
s1 = pd.Series(s1_values)
print(s1)

# The same data with an explicit label index, one letter per element.
s1 = pd.Series(s1_values, index=list("abcdefghijkl"))
print(s1)

# A Series from a dict: the keys become the index labels.
temp_dict = {
    "name": "xxx",
    "age": 25,
    "tel": 123456789,
}
s2 = pd.Series(temp_dict)
print(s2)

# A DataFrame from a 2-D array; index is axis 0 (rows), columns is axis 1.
s3 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(s3)

# A DataFrame from a dict of equal-length lists: each key becomes a column.
d1 = {
    "name": ["xiaoming", "xiaohong"],
    "sge": [20, 30],
    "tel": [12345215, 1544646],
}
s4 = pd.DataFrame(d1)
print(s4)
# --- Loading a CSV file and taking a first look at it ---
# The path is relative to the working directory the script is run from.
df = pd.read_csv("../BJXS1_df_test_with_needsnow.csv")
print(df.head(10))
print(df.describe())
# NOTE: info() writes its report to stdout itself and returns None, so
# wrapping it in print() also emits a trailing "None" line.
print(df.info())

# Sort the rows by the AAGA_max column, largest values first.
df = df.sort_values(
    by="AAGA_max",
    ascending=False,
)
print(df.head(10))

# Slicing with [:20] takes the first 20 rows (of the sorted frame).
print(df[:20])

# First 20 rows, then only the "AAGA_max" column (result is a Series).
print(df[:20]["AAGA_max"])
# --- Label-based selection with .loc[] ---
s5 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("wxyz"),
)
print(s5)

# A single cell: row "a", column "z".
print(s5.loc["a", "z"])

# One whole row (returned as a Series).
print(s5.loc["a", :])

# One whole column (returned as a Series).
print(s5.loc[:, "w"])

# Several rows and several columns at once, by label lists.
print(s5.loc[["a", "c"], ["w", "z"]])

# --- Position-based selection with .iloc[] ---
# All rows; columns at positions 2 and 1, in that order.
print(s5.iloc[:, [2, 1]])

# Rows at positions 0 and 2; columns at positions 2 and 1.
print(s5.iloc[[0, 2], [2, 1]])

# Rows from position 1 onward; the first two columns.
print(s5.iloc[1:, :2])
# Boolean indexing: keep rows whose AAFB_mean is strictly between 4 and 6.
# Each comparison must be parenthesised because & binds tighter than < and >.
print(df[(df["AAFB_mean"] > 4) & (df["AAFB_mean"] < 6)])
# --- Handling missing values ---
s6 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("wxyz"),
)
# Blank out row position 1, column positions 1 and 2 ("x" and "y" of row "b").
s6.iloc[[1], [1, 2]] = None
print(s6)

# Boolean frame: True where a value is missing.
print(s6.isnull())

# Boolean frame: True where a value is present.
print(s6.notnull())

# Keep only the rows whose "x" value is present.
print(s6[s6["x"].notnull()])

# Drop every row that contains at least one missing value.
print(s6.dropna(axis=0, how="any"))

# Drop a row only when all of its values are missing.
print(s6.dropna(axis=0, how="all"))

# With inplace=True, dropna() modifies s6 itself and returns None, so
# print(s6.dropna(axis=0, how="any", inplace=True)) would just print None.

# Fill every missing value with the mean of its own column.
a = s6.fillna(s6.mean())
print(a)

# Fill a single column with that column's mean and write it back into s6.
s6["x"] = s6["x"].fillna(s6["x"].mean())
print(s6["x"])
输出
0 1
1 2
2 2
3 2
4 5
5 4
6 5
7 5
8 5
9 5
10 6
11 8
dtype: int64
a 1
b 2
c 2
d 2
e 5
f 4
g 5
h 5
i 5
j 5
k 6
l 8
dtype: int64
name xxx
age 25
tel 123456789
dtype: object
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
name sge tel
0 xiaoming 20 12345215
1 xiaohong 30 1544646
areaDate AAAA_max AAGA_max ... AAFB_mean AAHA1D_sum needsnow
0 2019-01-03 -11 793 ... 5 66 0
1 2019-01-04 -17 793 ... 5 0 0
2 2019-01-05 -7 793 ... 1 0 0
3 2019-01-06 -10 792 ... 3 0 0
4 2019-01-07 -16 792 ... 4 0 0
5 2019-01-08 -13 795 ... 3 0 0
6 2019-01-09 -8 795 ... 2 0 0
7 2019-01-10 -8 790 ... 4 66 0
8 2019-01-11 -6 790 ... 1 66 0
9 2019-01-12 -8 791 ... 2 0 0
[10 rows x 18 columns]
AAAA_max AAGA_max AADA_max ... AAFB_mean AAHA1D_sum needsnow
count 140.000000 140.000000 140.000000 ... 140.000000 140.000000 140.000000
mean 1.757143 790.928571 62.535714 ... 2.685714 9.564286 -0.421429
std 10.529077 3.170554 19.070264 ... 1.330796 23.625133 0.495561
min -20.000000 782.000000 24.000000 ... 1.000000 0.000000 -1.000000
25% -7.000000 789.000000 48.000000 ... 2.000000 0.000000 -1.000000
50% 1.000000 791.000000 63.000000 ... 2.000000 0.000000 0.000000
75% 11.000000 792.250000 78.000000 ... 4.000000 0.000000 0.000000
max 23.000000 799.000000 99.000000 ... 6.000000 81.000000 0.000000
[8 rows x 17 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 areaDate 140 non-null object
1 AAAA_max 140 non-null int64
2 AAGA_max 140 non-null int64
3 AADA_max 140 non-null int64
4 AAEB_max 140 non-null int64
5 AAFB_max 140 non-null int64
6 AAAA_min 140 non-null int64
7 AAGA_min 140 non-null int64
8 AADA_min 140 non-null int64
9 AAEB_min 140 non-null int64
10 AAFB_min 140 non-null int64
11 AAAA_mean 140 non-null int64
12 AAGA_mean 140 non-null int64
13 AADA_mean 140 non-null int64
14 AAEB_mean 140 non-null int64
15 AAFB_mean 140 non-null int64
16 AAHA1D_sum 140 non-null int64
17 needsnow 140 non-null int64
dtypes: int64(17), object(1)
memory usage: 19.8+ KB
None
areaDate AAAA_max AAGA_max ... AAFB_mean AAHA1D_sum needsnow
107 2019-05-05 4 799 ... 5 0 -1
108 2019-05-06 8 799 ... 4 0 -1
23 2019-01-26 -5 799 ... 2 0 0
22 2019-01-25 -11 799 ... 5 0 0
83 2019-04-03 6 797 ... 2 0 -1
82 2019-04-02 2 797 ... 2 77 -1
125 2019-05-27 10 796 ... 6 0 -1
42 2019-02-20 -5 796 ... 3 0 0
43 2019-02-21 2 796 ... 1 0 0
126 2019-05-28 17 796 ... 3 0 -1
[10 rows x 18 columns]
areaDate AAAA_max AAGA_max ... AAFB_mean AAHA1D_sum needsnow
107 2019-05-05 4 799 ... 5 0 -1
108 2019-05-06 8 799 ... 4 0 -1
23 2019-01-26 -5 799 ... 2 0 0
22 2019-01-25 -11 799 ... 5 0 0
83 2019-04-03 6 797 ... 2 0 -1
82 2019-04-02 2 797 ... 2 77 -1
125 2019-05-27 10 796 ... 6 0 -1
42 2019-02-20 -5 796 ... 3 0 0
43 2019-02-21 2 796 ... 1 0 0
126 2019-05-28 17 796 ... 3 0 -1
44 2019-02-22 2 795 ... 1 0 0
45 2019-02-23 1 795 ... 1 0 0
5 2019-01-08 -13 795 ... 3 0 0
6 2019-01-09 -8 795 ... 2 0 0
104 2019-05-01 11 795 ... 2 0 -1
109 2019-05-07 13 795 ... 2 0 -1
56 2019-03-07 4 795 ... 2 0 0
55 2019-03-06 -8 795 ... 4 0 0
72 2019-03-23 -4 794 ... 3 0 0
21 2019-01-24 -4 794 ... 3 0 0
[20 rows x 18 columns]
107 799
108 799
23 799
22 799
83 797
82 797
125 796
42 796
43 796
126 796
44 795
45 795
5 795
6 795
104 795
109 795
56 795
55 795
72 794
21 794
Name: AAGA_max, dtype: int64
w x y z
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
3
w 0
x 1
y 2
z 3
Name: a, dtype: int32
a 0
b 4
c 8
Name: w, dtype: int32
w z
a 0 3
c 8 11
y x
a 2 1
b 6 5
c 10 9
y x
a 2 1
c 10 9
w x
b 4 5
c 8 9
areaDate AAAA_max AAGA_max ... AAFB_mean AAHA1D_sum needsnow
107 2019-05-05 4 799 ... 5 0 -1
22 2019-01-25 -11 799 ... 5 0 0
1 2019-01-04 -17 793 ... 5 0 0
0 2019-01-03 -11 793 ... 5 66 0
12 2019-01-15 -19 793 ... 5 0 0
135 2019-06-09 15 791 ... 5 0 -1
24 2019-01-27 -8 790 ... 5 0 0
13 2019-01-16 -12 790 ... 5 0 0
19 2019-01-22 -7 789 ... 5 0 0
31 2019-02-03 -13 786 ... 5 0 0
61 2019-03-12 -9 786 ... 5 0 0
60 2019-03-11 -4 785 ... 5 0 0
[12 rows x 18 columns]
w x y z
a 0 1.0 2.0 3
b 4 NaN NaN 7
c 8 9.0 10.0 11
w x y z
a False False False False
b False True True False
c False False False False
w x y z
a True True True True
b True False False True
c True True True True
w x y z
a 0 1.0 2.0 3
c 8 9.0 10.0 11
w x y z
a 0 1.0 2.0 3
c 8 9.0 10.0 11
w x y z
a 0 1.0 2.0 3
b 4 NaN NaN 7
c 8 9.0 10.0 11
w x y z
a 0 1.0 2.0 3
b 4 5.0 6.0 7
c 8 9.0 10.0 11
a 1.0
b 5.0
c 9.0
Name: x, dtype: float64
Process finished with exit code 0
PPT