用简易代码快速学习Python(十)

本学习系列介绍:

本学习系列主要面向对编程略有了解、已有其他语言基础,并希望进一步学习Python的同学,通过简易的代码帮助大家快速入门并掌握Python语言。

系列总目录:
用简易代码快速学习Python(一)
用简易代码快速学习Python(二)
用简易代码快速学习Python(三)
用简易代码快速学习Python(四)
用简易代码快速学习Python(五)
用简易代码快速学习Python(六)
用简易代码快速学习Python(七)
用简易代码快速学习Python(八)
用简易代码快速学习Python(九)
用简易代码快速学习Python(十)

Day10:

pandas的一些基本操作:

import pandas as pd
import numpy as np
# Build a Series from a plain list; pandas assigns the default 0..6 index.
# Because one entry is np.nan, every value is stored as float64 (see dtype below).
s = pd.Series([1,2,3,5,8,np.nan,10])
print(s)
# Output:
#0     1.0
#1     2.0
#2     3.0
#3     5.0
#4     8.0
#5     NaN
#6    10.0
#dtype: float64
# Five consecutive daily timestamps starting at 2020-09-01 (freq='D' in the output).
datas = pd.date_range('20200901',periods=5)
print(datas)
# Output:
#DatetimeIndex(['2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04',
#               '2020-09-05'],
#              dtype='datetime64[ns]', freq='D')

# 5x4 frame of random normal samples, with the dates above as row labels and
# 'a'..'d' as column labels. The values are random, so the numbers differ on every run.
df = pd.DataFrame(np.random.randn(5,4),index=datas,columns=['a','b','c','d'])
print(df)
# Output (example):
#                   a         b         c         d
#2020-09-01  0.424217  1.027256 -0.116358  0.795805
#2020-09-02 -0.356010 -0.263972  1.792676 -1.047615
#2020-09-03 -0.658466 -0.423116 -0.090830  0.924449
#2020-09-04  0.900463 -0.966019 -1.153638 -0.438660
#2020-09-05 -1.053531  1.567020  0.676001 -0.866624

# Without index/columns arguments, rows and columns are both labelled 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)
# Output:
#   0  1   2   3
#0  0  1   2   3
#1  4  5   6   7
#2  8  9  10  11

# Build a frame from a dict: each key becomes a column. Scalars ('A', 'B', 'F')
# are repeated for every row; 'C', 'D' and 'E' supply four values each.
df2 = pd.DataFrame({'A':1.,
                    'B':pd.Timestamp('20200901'),
                    'C':pd.Series(1,index=list(range(4)), dtype='float32'),
                    'D':np.array([3] * 4, dtype='int32'),
                    'E':pd.Categorical(["test","train","test","train"]),
                    'F':'foo'})
print(df2)
# Output:
#     A          B    C  D      E    F
#0  1.0 2020-09-01  1.0  3   test  foo
#1  1.0 2020-09-01  1.0  3  train  foo
#2  1.0 2020-09-01  1.0  3   test  foo
#3  1.0 2020-09-01  1.0  3  train  foo
print(df2.dtypes)
# Output (one dtype per column):
#A           float64
#B    datetime64[ns]
#C           float32
#D             int32
#E          category
#F            object
#dtype: object
print(df2.index) # Row labels: Int64Index([0, 1, 2, 3], dtype='int64'). NOTE(review): newer pandas releases print Index([0, 1, 2, 3], dtype='int64') instead - confirm for your version.
print(df2.columns) # Column labels: Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.values)
# Output: all values as a two-dimensional array:
#[[1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'train' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2020-09-01 00:00:00') 1.0 3 'train' 'foo']]
print(df2.describe()) # Summary statistics for the numeric columns only: count, mean, standard deviation, min, quartiles and max.
# Output:
#         A    C    D
#count  4.0  4.0  4.0
#mean   1.0  1.0  3.0
#std    0.0  0.0  0.0
#min    1.0  1.0  3.0
#25%    1.0  1.0  3.0
#50%    1.0  1.0  3.0
#75%    1.0  1.0  3.0
#max    1.0  1.0  3.0
print(df2.T) # Transpose df2 (rows become columns).
print(df2.sort_index(axis=0, ascending=False)) # Sort the rows by index label, in descending order.
print(df2.sort_values(by='E')) # Sort the rows by the values in column 'E'.

pandas选择(查询)数据:

# Shared setup for the selection examples: a 3x4 integer frame whose row labels
# mix integers with a string ("A" is both a row label and a column name).
import numpy as np
import pandas as pd
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=[7,8,"A"], columns=["A", "B", "C", "D"])

具体输出请同学自己动手尝试。

直接选择某列:(按列名查询)

# Plain [] with a column name returns that column as a Series.
print(data['A'])
# Output:
#7    0
#8    4
#A    8
#Name: A, dtype: int32
# NOTE(review): the integer width shown (int32) depends on the platform/NumPy
# default; int64 may be printed instead - confirm on your machine.

直接选择行:(按行号查询,需要有冒号)

# Plain [] with a slice selects rows by position; [1:] skips the first row.
print(data[1:])
# Output:
#   A  B   C   D
#8  4  5   6   7
#A  8  9  10  11

按名查询:

# Select by label: loc
# The first list names row labels, the second names column labels; the result
# follows the order given in the lists ("A" before 7).
print(data.loc[["A",7],["A","C"]])
# Output:
#   A   C
#A  8  10
#7  0   2

按号查询:

# Select by position: iloc
# A single integer returns that row (position 1, labelled 8) as a Series.
print(data.iloc[1])
# Output:
#A    4
#B    5
#C    6
#D    7
#Name: 8, dtype: int32
# Row positions from 1 onward, column positions from 2 onward.
print(data.iloc[1:,2:])
# Output:
#    C   D
#8   6   7
#A  10  11

按大小布尔查询:

# Boolean selection
# A boolean Series keeps only the rows where the condition holds.
print(data[data["B"] < 9])
# Output:
#   A  B  C  D
#7  0  1  2  3
#8  4  5  6  7
# A boolean frame acts as a cell-wise mask: data[1:2] is the single row labelled 8,
# and every cell that is not matched by a True comes back as NaN (see output).
print(data[data[1:2] > 4])
# Output:
#    A    B    C    D
#7 NaN  NaN  NaN  NaN
#8 NaN  5.0  6.0  7.0
#A NaN  NaN  NaN  NaN

pandas修改数据:

import numpy as np
import pandas as pd
# Shared setup for the modification examples: a 3x4 integer frame (values 0..11)
# indexed by three consecutive days starting at 2020-09-20.
dates = pd.date_range("20200920",periods=3)
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=dates, columns=["A", "B", "C", "D"])

具体输出请同学自己动手尝试。

# Overwrite a single cell addressed by position: row 1, column 2.
data.iloc[1,2] = 0
print(data)
# Output:
#            A  B   C   D
#2020-09-20  0  1   2   3
#2020-09-21  4  5   0   7
#2020-09-22  8  9  10  11
# Row-wide boolean assignment: every row whose "B" value exceeds 2 is overwritten
# with 2 in all columns. It is demonstrated on a copy so that `data` keeps the
# values that the outputs printed further down are based on.
data_copy = data.copy()
data_copy[data_copy["B"] > 2] = 2
print(data_copy)
# Output:
#            A  B  C  D
#2020-09-20  0  1  2  3
#2020-09-21  2  2  2  2
#2020-09-22  2  2  2  2
# Overwrite column "A" only, in the rows where "B" > 2. A single .loc call is used
# instead of chained indexing (data["A"][mask] = ...): the chained form may assign
# to a temporary object and triggers pandas' chained-assignment warning.
data.loc[data["B"] > 2, "A"] = 10
print(data)
# Output:
#             A  B   C   D
#2020-09-20   0  1   2   3
#2020-09-21  10  5   0   7
#2020-09-22  10  9  10  11
# Add a new column from a Series; values are matched to rows through the shared index.
data["E"] = pd.Series(["q", "w", "e"], index=dates)
print(data)
# Output:
#             A  B   C   D  E
#2020-09-20   0  1   2   3  q
#2020-09-21  10  5   0   7  w
#2020-09-22  10  9  10  11  e
# Assigning a scalar fills the whole new column with that value (here NaN).
data["F"] = np.nan
print(data)
# Output:
#             A  B   C   D  E   F
#2020-09-20   0  1   2   3  q NaN
#2020-09-21  10  5   0   7  w NaN
#2020-09-22  10  9  10  11  e NaN
# Add a new row by assigning to a label that is not yet in the index; the Series
# index maps each value to its column.
# pd.Timestamp is used for the label because the pd.datetime alias was removed in pandas 2.0.
data.loc[pd.Timestamp(2020,9,23)] = pd.Series([1,2,3,4,5,6], index=["A", "B", "C", "D", "E", "F"])
print(data)
# Output:
#             A  B   C   D  E    F
#2020-09-20   0  1   2   3  q  NaN
#2020-09-21  10  5   0   7  w  NaN
#2020-09-22  10  9  10  11  e  NaN
#2020-09-23   1  2   3   4  5  6.0

pandas丢失数据处理:

import numpy as np
import pandas as pd
# Shared setup for the missing-data examples: a 4x4 frame (values 0..15) indexed
# by four consecutive days, with two cells replaced by NaN.
dates = pd.date_range("20200920", periods=4)
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=dates, columns=["A", "B", "C", "D"])
# An integer column cannot hold NaN. Convert the two affected columns to float
# up front rather than relying on the implicit int -> float upcast during
# assignment, which recent pandas releases deprecate.
data = data.astype({"B": "float64", "C": "float64"})
data.iloc[1,1] = np.nan # NaN stands for a missing value.
data.iloc[3,2] = np.nan # NaN stands for a missing value.

具体输出请同学自己动手尝试。

data1 = data.dropna(axis=0, how="any") # Drop every row that contains any NaN; how="all" would drop a row only when all of its values are NaN.
print(data1)
# Output:
#            A    B     C   D
#2020-09-20  0  1.0   2.0   3
#2020-09-22  8  9.0  10.0  11
data2 = data.dropna(axis=1, how="any") # Drop every column that contains any NaN; how="all" would drop a column only when all of its values are NaN.
print(data2)
# Output:
#             A   D
#2020-09-20   0   3
#2020-09-21   4   7
#2020-09-22   8  11
#2020-09-23  12  15
# Replace every NaN with 0 instead of dropping anything.
data3 = data.fillna(0)
print(data3)
# Output:
#             A     B     C   D
#2020-09-20   0   1.0   2.0   3
#2020-09-21   4   0.0   6.0   7
#2020-09-22   8   9.0  10.0  11
#2020-09-23  12  13.0   0.0  15
# Boolean mask with True wherever a value is missing.
print(data.isnull())
# Output:
#                A      B      C      D
#2020-09-20  False  False  False  False
#2020-09-21  False   True  False  False
#2020-09-22  False  False  False  False
#2020-09-23  False  False   True  False
print(np.any(data.isnull())) # Checks whether data contains any NaN at all. Output: True

pandas处理数据合并:

import numpy as np
import pandas as pd
# Shared setup for the combining examples: data1 is 4x5 (values 0..19, columns A-E),
# data2 is 5x5 (all ones, columns C-G). The two frames overlap on columns C, D and E.
data1 = pd.DataFrame(np.arange(20).reshape((4,5)), index=range(4), columns=["A", "B", "C", "D", "E"])
data2 = pd.DataFrame(np.ones((5,5)), index=range(5), columns=["C", "D", "E", "F", "G"])

具体输出请同学自己动手尝试。

concat:

# Stack the frames vertically (axis=0). ignore_index=True renumbers the rows 0..8;
# columns missing from one of the inputs are filled with NaN.
data3 = pd.concat([data1,data2], axis=0, ignore_index=True)
print(data3)
# Output:
#      A     B     C     D     E    F    G
#0   0.0   1.0   2.0   3.0   4.0  NaN  NaN
#1   5.0   6.0   7.0   8.0   9.0  NaN  NaN
#2  10.0  11.0  12.0  13.0  14.0  NaN  NaN
#3  15.0  16.0  17.0  18.0  19.0  NaN  NaN
#4   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#5   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#6   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#7   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#8   NaN   NaN   1.0   1.0   1.0  1.0  1.0
data4 = pd.concat([data1,data2], axis=0, ignore_index=True, join="inner")
# join defaults to "outer"; "inner" keeps only the columns present in both frames.
print(data4)
# Output:
#      C     D     E
#0   2.0   3.0   4.0
#1   7.0   8.0   9.0
#2  12.0  13.0  14.0
#3  17.0  18.0  19.0
#4   1.0   1.0   1.0
#5   1.0   1.0   1.0
#6   1.0   1.0   1.0
#7   1.0   1.0   1.0
#8   1.0   1.0   1.0
# axis=1 places the frames side by side, aligned on the row index; data1 has no
# row 4, so its columns are NaN there. Duplicate column names are kept as they are.
data5 = pd.concat([data1,data2], axis=1)
print(data5)
# Output:
#      A     B     C     D     E    C    D    E    F    G
#0   0.0   1.0   2.0   3.0   4.0  1.0  1.0  1.0  1.0  1.0
#1   5.0   6.0   7.0   8.0   9.0  1.0  1.0  1.0  1.0  1.0
#2  10.0  11.0  12.0  13.0  14.0  1.0  1.0  1.0  1.0  1.0
#3  15.0  16.0  17.0  18.0  19.0  1.0  1.0  1.0  1.0  1.0
#4   NaN   NaN   NaN   NaN   NaN  1.0  1.0  1.0  1.0  1.0

append(注意:DataFrame.append 自 pandas 1.4 起弃用,并已在 pandas 2.0 中移除,新代码请使用 pd.concat 实现相同的按行追加):

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat performs the same row-wise stacking and works on old and new versions.
data6 = pd.concat([data1, data2], ignore_index=True)
print(data6)
# Output:
#      A     B     C     D     E    F    G
#0   0.0   1.0   2.0   3.0   4.0  NaN  NaN
#1   5.0   6.0   7.0   8.0   9.0  NaN  NaN
#2  10.0  11.0  12.0  13.0  14.0  NaN  NaN
#3  15.0  16.0  17.0  18.0  19.0  NaN  NaN
#4   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#5   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#6   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#7   NaN   NaN   1.0   1.0   1.0  1.0  1.0
#8   NaN   NaN   1.0   1.0   1.0  1.0  1.0
s1 = pd.Series([9,8,7,6,5,4,3], index=["A", "B", "C", "D", "E", "F", "G"])
s2 = pd.Series([1,2,3], index=["A", "C", "D"])
# Wrap the Series in a DataFrame first so that each Series becomes one row whose
# index entries are matched to column names; entries a Series lacks become NaN.
data7 = pd.concat([data1, pd.DataFrame([s1, s2])], ignore_index=True)
print(data7)
# Output:
#      A     B     C     D     E    F    G
#0   0.0   1.0   2.0   3.0   4.0  NaN  NaN
#1   5.0   6.0   7.0   8.0   9.0  NaN  NaN
#2  10.0  11.0  12.0  13.0  14.0  NaN  NaN
#3  15.0  16.0  17.0  18.0  19.0  NaN  NaN
#4   9.0   8.0   7.0   6.0   5.0  4.0  3.0
#5   1.0   NaN   2.0   3.0   NaN  NaN  NaN

merge:

# Inputs for the merge examples, re-created here so the section runs on its own.
# data2 is filled with 7 (not 1): the outputs below need its key column "C" to
# match the value 7 found in data1.
data1 = pd.DataFrame(np.arange(20).reshape((4,5)), index=range(4), columns=["A", "B", "C", "D", "E"])
data2 = pd.DataFrame(np.ones((5,5)) * 7, index=range(5), columns=["C", "D", "E", "F", "G"])
# Outer join on the single key "C". indicator=True adds the _merge column, which
# records where each row came from; suffixes distinguishes the non-key columns
# that exist in both frames (D and E).
data8 = pd.merge(data1, data2, on="C", how="outer", indicator=True, suffixes=("_left","_right"))
print(data8)
# Output (number formatting of the key column may vary by pandas version):
#    A   B   C  D_left  E_left  D_right  E_right    F    G     _merge
#0   0   1   2       3       4      NaN      NaN  NaN  NaN  left_only
#1   5   6   7       8       9      7.0      7.0  7.0  7.0       both
#2   5   6   7       8       9      7.0      7.0  7.0  7.0       both
#3   5   6   7       8       9      7.0      7.0  7.0  7.0       both
#4   5   6   7       8       9      7.0      7.0  7.0  7.0       both
#5   5   6   7       8       9      7.0      7.0  7.0  7.0       both
#6  10  11  12      13      14      NaN      NaN  NaN  NaN  left_only
#7  15  16  17      18      19      NaN      NaN  NaN  NaN  left_only
# Left join on three keys at once; how may be "outer", "inner", "left" or "right".
# No row of data2 matches on all of C, D and E, so F and G are NaN throughout.
data9 = pd.merge(data1, data2, on=["C", "D", "E"], how="left")
print(data9)
# Output:
#    A   B   C   D   E   F   G
#0   0   1   2   3   4 NaN NaN
#1   5   6   7   8   9 NaN NaN
#2  10  11  12  13  14 NaN NaN
#3  15  16  17  18  19 NaN NaN
# Join on the row index instead of on columns; same-named columns receive the
# default _x / _y suffixes.
data10 = pd.merge(data1, data2, left_index=True, right_index=True, how="outer")
print(data10)
# Output:
#      A     B   C_x   D_x   E_x  C_y  D_y  E_y    F    G
#0   0.0   1.0   2.0   3.0   4.0  7.0  7.0  7.0  7.0  7.0
#1   5.0   6.0   7.0   8.0   9.0  7.0  7.0  7.0  7.0  7.0
#2  10.0  11.0  12.0  13.0  14.0  7.0  7.0  7.0  7.0  7.0
#3  15.0  16.0  17.0  18.0  19.0  7.0  7.0  7.0  7.0  7.0
#4   NaN   NaN   NaN   NaN   NaN  7.0  7.0  7.0  7.0  7.0

pandas画图:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Line plot of a Series: the cumulative sum of 1000 random normal samples.
data1 = pd.Series(np.random.randn(1000))
data1 = data1.cumsum()
data1.plot()
plt.show()
# Line plot of a DataFrame: the same construction for four columns A-D.
data2 = pd.DataFrame(np.random.randn(1000,4),columns=["A","B","C","D"])
data2 = data2.cumsum()
data2.plot()
plt.show()
# Two scatter series on one chart: the object returned by the first call is
# passed as ax= to the second call.
data3 = data2.plot.scatter(x="A", y="B", color='red') # scatter draws a scatter plot.
data2.plot.scatter(x="A", y="C", color="blue",ax=data3)
plt.show()

具体输出请同学自己动手尝试。

注意:plot methods:‘bar’,‘hist’,‘box’,‘kde’,‘area’,‘scatter’,‘hexbin’,‘pie’.

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

刘学长丶

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值