本学习系列介绍:
本学习系列主要针对对编程略有了解或有其他语言基础并要进一步学习Python的同学,通过简易的代码快速入门掌握Python语言。
系列总目录: |
---|
用简易代码快速学习Python(一) |
用简易代码快速学习Python(二) |
用简易代码快速学习Python(三) |
用简易代码快速学习Python(四) |
用简易代码快速学习Python(五) |
用简易代码快速学习Python(六) |
用简易代码快速学习Python(七) |
用简易代码快速学习Python(八) |
用简易代码快速学习Python(九) |
用简易代码快速学习Python(十) |
Day10:
pandas的一些基本操作:
import pandas as pd
import numpy as np
# A Series is a labelled one-dimensional array; with np.nan among the values
# the result is printed as float64.
s = pd.Series([1,2,3,5,8,np.nan,10])
print(s)
# Output:
#0 1.0
#1 2.0
#2 3.0
#3 5.0
#4 8.0
#5 NaN
#6 10.0
#dtype: float64
# Five consecutive daily timestamps starting at 2020-09-01.
datas = pd.date_range('20200901',periods=5)
print(datas)
# Output:
#DatetimeIndex(['2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04',
# '2020-09-05'],
# dtype='datetime64[ns]', freq='D')
# 5x4 frame of random samples, using the dates above as the row index and
# 'a'..'d' as column labels.
df = pd.DataFrame(np.random.randn(5,4),index=datas,columns=['a','b','c','d'])
print(df)
# Output (example only; the values are random and differ on every run):
# a b c d
#2020-09-01 0.424217 1.027256 -0.116358 0.795805
#2020-09-02 -0.356010 -0.263972 1.792676 -1.047615
#2020-09-03 -0.658466 -0.423116 -0.090830 0.924449
#2020-09-04 0.900463 -0.966019 -1.153638 -0.438660
#2020-09-05 -1.053531 1.567020 0.676001 -0.866624
# With no index/columns arguments both axes get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)
# Output:
# 0 1 2 3
#0 0 1 2 3
#1 4 5 6 7
#2 8 9 10 11
# Build a frame from a dict: each key becomes a column. The scalar values
# (A, B, F) are repeated on every row, as the printed frame below shows.
df2 = pd.DataFrame({'A':1.,
'B':pd.Timestamp('20200901'),
'C':pd.Series(1,index=list(range(4)), dtype='float32'),
'D':np.array([3] * 4, dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})
print(df2)
# Output:
# A B C D E F
#0 1.0 2020-09-01 1.0 3 test foo
#1 1.0 2020-09-01 1.0 3 train foo
#2 1.0 2020-09-01 1.0 3 test foo
#3 1.0 2020-09-01 1.0 3 train foo
# Each column keeps its own dtype.
print(df2.dtypes)
# Output:
#A float64
#B datetime64[ns]
#C float32
#D int32
#E category
#F object
#dtype: object
# NOTE: the Int64Index repr below comes from an older pandas; pandas 2.x
# prints Index([0, 1, 2, 3], dtype='int64') instead.
print(df2.index) # Row index labels: Int64Index([0, 1, 2, 3], dtype='int64')
print(df2.columns) # Column labels: Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.values)
# Output: every value, laid out as a two-dimensional array:
#[[1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'train' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'train' 'foo']]
print(df2.describe()) # Summary statistics of the numeric columns: count, mean, standard deviation, min, quartiles and max.
# Output:
# A C D
#count 4.0 4.0 4.0
#mean 1.0 1.0 3.0
#std 0.0 0.0 0.0
#min 1.0 1.0 3.0
#25% 1.0 1.0 3.0
#50% 1.0 1.0 3.0
#75% 1.0 1.0 3.0
#max 1.0 1.0 3.0
print(df2.T) # Transpose df2 (rows become columns).
print(df2.sort_index(axis=0, ascending=False)) # Sort rows by index label, in descending order.
print(df2.sort_values(by='E')) # Sort rows by the values in column 'E'.
pandas选择(查询)数据:
import numpy as np
import pandas as pd
# 3x4 frame holding 0..11. The row labels deliberately mix integers and a
# string (7, 8, "A") so that label-based and position-based lookups in the
# examples that follow give visibly different results.
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=[7,8,"A"], columns=["A", "B", "C", "D"])
具体输出请同学自己动手尝试。
直接选择某列:(按列名查询)
# Indexing with a column name returns that single column.
print(data['A'])
# Output:
#7 0
#8 4
#A 8
#Name: A, dtype: int32
直接选择行:(按行号查询,需要有冒号)
# A slice inside [] selects rows by position: here every row from position 1 on.
print(data[1:])
# Output:
# A B C D
#8 4 5 6 7
#A 8 9 10 11
按名查询:
# Select by label: loc (row labels first, then column labels; the result
# follows the order in which the labels are listed).
print(data.loc[["A",7],["A","C"]])
# Output:
# A C
#A 8 10
#7 0 2
按号查询:
# Select by position: iloc
# A single integer returns the row at that position (here the row labelled 8).
print(data.iloc[1])
# Output:
#A 4
#B 5
#C 6
#D 7
#Name: 8, dtype: int32
# Row and column slices: rows from position 1 on, columns from position 2 on.
print(data.iloc[1:,2:])
# Output:
# C D
#8 6 7
#A 10 11
按大小布尔查询:
# Boolean selection
# A boolean Series built from one column keeps the rows where it is True.
print(data[data["B"] < 9])
# Output:
# A B C D
#7 0 1 2 3
#8 4 5 6 7
# Here the mask is a frame built from row position 1 only (label 8). Cells
# where the mask is not True are printed as NaN, as the output shows.
print(data[data[1:2] > 4])
# Output:
# A B C D
#7 NaN NaN NaN NaN
#8 NaN 5.0 6.0 7.0
#A NaN NaN NaN NaN
pandas修改数据:
import numpy as np
import pandas as pd
# Three daily timestamps used as the row index of a 3x4 frame holding 0..11.
dates = pd.date_range("20200920",periods=3)
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=dates, columns=["A", "B", "C", "D"])
具体输出请同学自己动手尝试。
# Overwrite a single cell by position (row 1, column 2).
data.iloc[1,2] = 0
print(data)
# Output:
# A B C D
#2020-09-20 0 1 2 3
#2020-09-21 4 5 0 7
#2020-09-22 8 9 10 11
# Overwrite every row where B > 2. This is done on a copy so that the steps
# below still start from the frame printed above, which is what their
# documented outputs assume.
masked = data.copy()
masked[masked["B"] > 2] = 2
print(masked)
# Output:
# A B C D
#2020-09-20 0 1 2 3
#2020-09-21 2 2 2 2
#2020-09-22 2 2 2 2
# Set column A only on the rows where B > 2. A single .loc call is used
# instead of chained indexing (data["A"][mask] = ...), which may assign to a
# temporary object and leave data unchanged.
data.loc[data["B"] > 2, "A"] = 10
print(data)
# Output:
# A B C D
#2020-09-20 0 1 2 3
#2020-09-21 10 5 0 7
#2020-09-22 10 9 10 11
# Add a new column from a Series that shares the frame's index.
data["E"] = pd.Series(["q", "w", "e"], index=dates)
print(data)
# Output:
# A B C D E
#2020-09-20 0 1 2 3 q
#2020-09-21 10 5 0 7 w
#2020-09-22 10 9 10 11 e
# Assigning a scalar fills the whole new column with that value.
data["F"] = np.nan
print(data)
# Output:
# A B C D E F
#2020-09-20 0 1 2 3 q NaN
#2020-09-21 10 5 0 7 w NaN
#2020-09-22 10 9 10 11 e NaN
# Assigning to a label that does not exist yet appends a new row.
# pd.Timestamp replaces pd.datetime, which was removed in pandas 2.0.
data.loc[pd.Timestamp(2020,9,23)] = pd.Series([1,2,3,4,5,6], index=["A", "B", "C", "D", "E", "F"])
print(data)
# Output:
# A B C D E F
#2020-09-20 0 1 2 3 q NaN
#2020-09-21 10 5 0 7 w NaN
#2020-09-22 10 9 10 11 e NaN
#2020-09-23 1 2 3 4 5 6.0
pandas丢失数据处理:
import numpy as np
import pandas as pd
# 4x4 frame holding 0..15, indexed by four daily timestamps.
dates = pd.date_range("20200920", periods=4)
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=dates, columns=["A", "B", "C", "D"])
data.iloc[1,1] = np.nan # Treat NaN as a missing value.
data.iloc[3,2] = np.nan # Treat NaN as a missing value.
具体输出请同学自己动手尝试。
data1 = data.dropna(axis=0, how="any") # Drop rows containing any NaN. Note: how="all" drops a row only when the whole row is NaN.
print(data1)
# Output:
# A B C D
#2020-09-20 0 1.0 2.0 3
#2020-09-22 8 9.0 10.0 11
data2 = data.dropna(axis=1, how="any") # Drop columns containing any NaN. Note: how="all" drops a column only when the whole column is NaN.
print(data2)
# Output:
# A D
#2020-09-20 0 3
#2020-09-21 4 7
#2020-09-22 8 11
#2020-09-23 12 15
# Replace every NaN with 0 instead of dropping anything.
data3 = data.fillna(0)
print(data3)
# Output:
# A B C D
#2020-09-20 0 1.0 2.0 3
#2020-09-21 4 0.0 6.0 7
#2020-09-22 8 9.0 10.0 11
#2020-09-23 12 13.0 0.0 15
# Boolean frame marking which cells are NaN.
print(data.isnull())
# Output:
# A B C D
#2020-09-20 False False False False
#2020-09-21 False True False False
#2020-09-22 False False False False
#2020-09-23 False False True False
print(np.any(data.isnull())) # Check whether data contains any NaN at all. Output: True
pandas处理数据合并:
import numpy as np
import pandas as pd
# Two frames that share only columns C, D and E: data1 is 4x5 holding 0..19,
# data2 is 5x5 filled with ones.
data1 = pd.DataFrame(np.arange(20).reshape((4,5)), index=range(4), columns=["A", "B", "C", "D", "E"])
data2 = pd.DataFrame(np.ones((5,5)), index=range(5), columns=["C", "D", "E", "F", "G"])
具体输出请同学自己动手尝试。
concat:
# Stack the frames vertically. ignore_index=True renumbers the rows 0..8;
# columns missing from one frame are filled with NaN.
data3 = pd.concat([data1,data2], axis=0, ignore_index=True)
print(data3)
# Output:
# A B C D E F G
#0 0.0 1.0 2.0 3.0 4.0 NaN NaN
#1 5.0 6.0 7.0 8.0 9.0 NaN NaN
#2 10.0 11.0 12.0 13.0 14.0 NaN NaN
#3 15.0 16.0 17.0 18.0 19.0 NaN NaN
#4 NaN NaN 1.0 1.0 1.0 1.0 1.0
#5 NaN NaN 1.0 1.0 1.0 1.0 1.0
#6 NaN NaN 1.0 1.0 1.0 1.0 1.0
#7 NaN NaN 1.0 1.0 1.0 1.0 1.0
#8 NaN NaN 1.0 1.0 1.0 1.0 1.0
data4 = pd.concat([data1,data2], axis=0, ignore_index=True, join="inner")
# join defaults to "outer"; "inner" keeps only the columns both frames share.
print(data4)
# Output:
# C D E
#0 2.0 3.0 4.0
#1 7.0 8.0 9.0
#2 12.0 13.0 14.0
#3 17.0 18.0 19.0
#4 1.0 1.0 1.0
#5 1.0 1.0 1.0
#6 1.0 1.0 1.0
#7 1.0 1.0 1.0
#8 1.0 1.0 1.0
# axis=1 places the frames side by side, matched on the row index; data1 has
# no row 4, so its cells on that row are NaN.
data5 = pd.concat([data1,data2], axis=1)
print(data5)
# Output:
# A B C D E C D E F G
#0 0.0 1.0 2.0 3.0 4.0 1.0 1.0 1.0 1.0 1.0
#1 5.0 6.0 7.0 8.0 9.0 1.0 1.0 1.0 1.0 1.0
#2 10.0 11.0 12.0 13.0 14.0 1.0 1.0 1.0 1.0 1.0
#3 15.0 16.0 17.0 18.0 19.0 1.0 1.0 1.0 1.0 1.0
#4 NaN NaN NaN NaN NaN 1.0 1.0 1.0 1.0 1.0
append(注:DataFrame.append 已在 pandas 2.0 中移除,新版本请使用 pd.concat 实现相同效果):
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and gives the same row-wise result.
data6 = pd.concat([data1, data2], ignore_index=True)
print(data6)
# Output:
# A B C D E F G
#0 0.0 1.0 2.0 3.0 4.0 NaN NaN
#1 5.0 6.0 7.0 8.0 9.0 NaN NaN
#2 10.0 11.0 12.0 13.0 14.0 NaN NaN
#3 15.0 16.0 17.0 18.0 19.0 NaN NaN
#4 NaN NaN 1.0 1.0 1.0 1.0 1.0
#5 NaN NaN 1.0 1.0 1.0 1.0 1.0
#6 NaN NaN 1.0 1.0 1.0 1.0 1.0
#7 NaN NaN 1.0 1.0 1.0 1.0 1.0
#8 NaN NaN 1.0 1.0 1.0 1.0 1.0
s1 = pd.Series([9,8,7,6,5,4,3], index=["A", "B", "C", "D", "E", "F", "G"])
s2 = pd.Series([1,2,3], index=["A", "C", "D"])
# Wrap the Series in a DataFrame first so that each Series becomes one row,
# with its index labels matched against the column names. Passing the Series
# to concat directly would not line them up as rows.
data7 = pd.concat([data1, pd.DataFrame([s1, s2])], ignore_index=True)
print(data7)
# Output:
# A B C D E F G
#0 0.0 1.0 2.0 3.0 4.0 NaN NaN
#1 5.0 6.0 7.0 8.0 9.0 NaN NaN
#2 10.0 11.0 12.0 13.0 14.0 NaN NaN
#3 15.0 16.0 17.0 18.0 19.0 NaN NaN
#4 9.0 8.0 7.0 6.0 5.0 4.0 3.0
#5 1.0 NaN 2.0 3.0 NaN NaN NaN
merge:
# The merge outputs below were produced with data2 filled with 7s rather than
# the 1s used in the concat examples, so that data2's "C" value matches one
# row of data1. Rebuild data2 accordingly so the code reproduces them.
data2 = pd.DataFrame(np.ones((5,5)) * 7, index=range(5), columns=["C", "D", "E", "F", "G"])
# indicator=True adds a _merge column saying which side each row came from;
# suffixes distinguish non-key columns that have the same name in both frames.
data8 = pd.merge(data1, data2, on="C", how="outer", indicator=True, suffixes=("_left","_right"))
print(data8)
# Output:
# A B C D_left E_left D_right E_right F G _merge
#0 0 1 2 3 4 NaN NaN NaN NaN left_only
#1 5 6 7 8 9 7.0 7.0 7.0 7.0 both
#2 5 6 7 8 9 7.0 7.0 7.0 7.0 both
#3 5 6 7 8 9 7.0 7.0 7.0 7.0 both
#4 5 6 7 8 9 7.0 7.0 7.0 7.0 both
#5 5 6 7 8 9 7.0 7.0 7.0 7.0 both
#6 10 11 12 13 14 NaN NaN NaN NaN left_only
#7 15 16 17 18 19 NaN NaN NaN NaN left_only
# how may be "outer", "inner", "left" or "right". With several key columns a
# row matches only when all of them are equal, so nothing matches here.
data9 = pd.merge(data1, data2, on=["C", "D", "E"], how="left")
print(data9)
# Output:
# A B C D E F G
#0 0 1 2 3 4 NaN NaN
#1 5 6 7 8 9 NaN NaN
#2 10 11 12 13 14 NaN NaN
#3 15 16 17 18 19 NaN NaN
# Join on the row index instead of on column values; overlapping column names
# get the default _x / _y suffixes.
data10 = pd.merge(data1, data2, left_index=True, right_index=True, how="outer")
print(data10)
# Output:
# A B C_x D_x E_x C_y D_y E_y F G
#0 0.0 1.0 2.0 3.0 4.0 7.0 7.0 7.0 7.0 7.0
#1 5.0 6.0 7.0 8.0 9.0 7.0 7.0 7.0 7.0 7.0
#2 10.0 11.0 12.0 13.0 14.0 7.0 7.0 7.0 7.0 7.0
#3 15.0 16.0 17.0 18.0 19.0 7.0 7.0 7.0 7.0 7.0
#4 NaN NaN NaN NaN NaN 7.0 7.0 7.0 7.0 7.0
pandas画图:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Line plot of a Series: 1000 random samples turned into a running total.
data1 = pd.Series(np.random.randn(1000))
data1 = data1.cumsum()
data1.plot()
plt.show()
# Same for a four-column frame; the cumulative sum is taken per column.
data2 = pd.DataFrame(np.random.randn(1000,4),columns=["A","B","C","D"])
data2 = data2.cumsum()
data2.plot()
plt.show()
# data3 holds what the first scatter call returns; passing it as ax= to the
# second call draws both point sets on the same plot.
data3 = data2.plot.scatter(x="A", y="B", color='red') # scatter draws a scatter plot.
data2.plot.scatter(x="A", y="C", color="blue",ax=data3)
plt.show()
具体输出请同学自己动手尝试。
注意:plot methods:‘bar’,‘hist’,‘box’,‘kde’,‘area’,‘scatter’,‘hexbin’,‘pie’.