Matplotlib、Numpy、Pandas学习笔记

Matplotlib

import matplotlib.pyplot as plt
import random

折线图绘制

单图表绘制

import matplotlib.pyplot as plt
import random
# Line chart demo: 60 sample points, each with a random "temperature" in [10, 15].

# Prepare the data
x_data = range(60)
y_data = [random.uniform(10, 15) for _ in x_data]

# Create the canvas; figsize sets the size and dpi the resolution
plt.figure(figsize=(20, 8), dpi=200)

# Draw the line; the label is the text plt.legend() shows later
plt.plot(x_data, y_data, color='blue', linestyle='-', label='Legend Info')

# Axis ticks: every 5th x value, every 2nd integer of range(10, 20) on y
plt.xticks(x_data[::5])
plt.yticks(range(10, 20)[::2])

# Axis labels: the x axis carries the date index and the y axis the
# temperature (the two labels were swapped before)
plt.xlabel("Date", fontsize=15)
plt.ylabel("Temp", fontsize=15)

# Grid: grid(show, line style, alpha); alpha is the opacity of the grid lines
plt.grid(True, linestyle='--', alpha=0.5)

# Chart title
plt.title("Date of Temp", fontsize=20)

# Legend (requires the label argument on the plot call above)
plt.legend()

# Save the picture. NOTE: hard-coded Windows path; adjust it for your machine.
plt.savefig("D://Desktop//test.png")

# Show the figure. This comes after savefig because, per the original note,
# the figure resources are released once it has been shown.
plt.show()


在这里插入图片描述

# 补充
'''
【关于线段样式】
在linestyle中可以填写的参数有
1. -   表示实线
2. --  表示虚线
3. -.  表示点划线
4. :   表示点线
5. 空  表示无线段 但在每个刻度有标记点
【关于线段颜色】
color参数可以使用green等常见颜色的英文或者使用RGB三原色表示
如color='blue'  或 color = (0, 0.87, 0.43)(使用RGB元组时,各分量的取值范围为0~1)
【关于图例】
可以在plt.legend()函数中传入位置参数loc=xx  默认为loc=0 此时会选择最适宜的位置添加图例
'''
import matplotlib.pyplot as plt
# Sample data: the first five perfect squares
x = [1, 2, 3, 4, 5]
y = [1, 4, 9, 16, 25]

# One curve per linestyle. Each entry is (legend label, extra plot kwargs);
# every curve is shifted 2 units above the previous one so they stay apart.
variants = [
    ('Solid line', {'linestyle': '-'}),
    ('Dashed line', {'linestyle': '--'}),
    ('Dash-dot line', {'linestyle': '-.'}),
    ('Dotted line', {'linestyle': ':'}),
    ('No line, only markers', {'linestyle': ' ', 'marker': 'o'}),
]
for position, (caption, style) in enumerate(variants):
    shift = 2 * position
    plt.plot(x, [value + shift for value in y], label=caption, **style)

plt.legend()
plt.show()

在这里插入图片描述

多图表绘制

在单坐标轴下创建多图表
# Several curves on one set of axes: after creating the canvas, simply call
# plt.plot (or another drawing function) once per curve.
import matplotlib.pyplot as plt

x = range(10)
y1 = [value * 2 for value in x]
y2 = [value * 3 for value in x]

plt.figure()
for series, caption in ((y1, 'x * 2'), (y2, 'x * 3')):
    plt.plot(x, series, label=caption)
plt.legend()
plt.show()

在这里插入图片描述

在多坐标轴下创建多图表
# Multiple charts on separate axes: the steps match the single-chart case,
# but Axes methods replace the pyplot functions.
import matplotlib.pyplot as plt
import random

# Prepare the data: two independent random series in [15, 25]
x = range(30)
y1 = [random.uniform(15, 25) for _ in x]
y2 = [random.uniform(15, 25) for _ in x]

# Create the canvas (pyplot form: plt.figure(figsize=(20,8),dpi=200)).
# fig is the figure object; axes holds one Axes per subplot, here two in a row.
fig, axes = plt.subplots(ncols=2, nrows=1)

# Configure both subplots identically. pyplot -> Axes method mapping:
#   plt.plot   -> ax.plot          plt.xticks -> ax.set_xticks
#   plt.yticks -> ax.set_yticks    plt.xlabel -> ax.set_xlabel
#   plt.ylabel -> ax.set_ylabel    plt.grid   -> ax.grid
#   plt.title  -> ax.set_title     plt.legend -> ax.legend
panels = (
    (y1, 'Data 1', "Date of Temp  1"),
    (y2, 'Data 2', "Date of Temp  2"),
)
for ax, (series, legend_label, title) in zip(axes, panels):
    # Draw the curve
    ax.plot(x, series, label=legend_label)

    # Axis ticks: every 5th x value, every 2nd integer of range(15, 25) on y
    ax.set_xticks(x[::5])
    ax.set_yticks(range(15, 25)[::2])

    # Axis labels: x is the date index, y the temperature
    # (the two labels were swapped before)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Temp", fontsize=15)

    # Grid: grid(show, line style, alpha)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Subplot title
    ax.set_title(title, fontsize=15)

    # Legend (requires the label argument on the plot call above)
    ax.legend()

# Save the picture. NOTE: hard-coded Windows path; adjust it for your machine.
plt.savefig("D://Desktop//test.png")

# Show the figure
plt.show()

在这里插入图片描述

其他图形绘制

'''
# 散点图
# 一般用于查看数据分布规律
plt.scatter(x,y) # 数据1,数据2

# 柱状图
# 用于统计数据与对比
plt.bar(x,height,width,align) # 数据,柱状高度,柱宽,柱间对齐方式center默认

# 直方图
plt.hist(x,bins)  # 数据,划分区间数

# 饼图
plt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色
'''
'\n# 散点图\n# 一般用于查看数据分布规律\nplt.scatter(x,y) # 数据1,数据2\n\n# 柱状图\n# 用于统计数据与对比\nplt.bar(x,width,align) # 数据,柱状高度,柱间对齐方式center默认\n\n# 直方图\nplt.hist(x,bins)  # 数据,划分区间数\n\n# 饼图\nplt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色\n'
# Scatter-plot demo
import random

# Prepare the data: 100 points with both coordinates drawn from [1, 200]
sample_size = 100
x = [random.uniform(1, 200) for _ in range(sample_size)]
y = [random.uniform(1, 200) for _ in range(sample_size)]

# Canvas, drawing and display
plt.figure()
plt.scatter(x, y)
plt.show()

在这里插入图片描述

# Bar-chart demo
# Font settings so Chinese labels and the minus sign render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Prepare the data: one score per weapon
x = ['M4A1-雷神', 'M4A1-星象', 'M4A1-仲达', 'AK47-奉先']
y = [67, 89, 86, 80]
bar_colors = ['blue', 'purple', 'green', 'red']

# Canvas
plt.figure()

# Draw one bar per weapon, each with its own colour
plt.bar(x, y, width=0.5, color=bar_colors)

# Title
plt.title("部分武器得分榜")

# Faint grid
plt.grid(alpha=0.2)

# Display
plt.show()

在这里插入图片描述

更多绘制方法,详见网址:https://matplotlib.org/index.html

Numpy

import numpy as np

ndarray

[注意] ndarray中所有元素为同一数据类型

创建

# 1. Create an ndarray from existing data
# 1.1 np.array(object): object may be a (nested) list literal or a variable
#     holding one. np.array copies its input by default, so later changes to
#     the source data do not affect the ndarray.
myArray = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
myNDArray = np.array(myArray)
print("使用np.array创建")
print(myNDArray)
print("--------------------------")

# 1.2 np.asarray(object): accepts the same inputs as np.array.
#     asarray skips the copy only when the input is already an ndarray of a
#     compatible dtype (the result then shares data with the input). A Python
#     list such as myArray is still converted into a new, independent array.
myNDArray = np.asarray(myArray)
print("使用np.asarray创建")
print(myNDArray)
print("--------------------------")



# 2. Create arrays with factory functions
# 2.1 np.zeros(shape): array filled with 0
myNDArray = np.zeros((2, 3))
print("使用np.zeros((2,3))创建")
print(myNDArray)
print("--------------------------")

# 2.2 np.ones(shape): array filled with 1
myNDArray = np.ones((2, 3))
print("使用np.ones((2,3))创建")
print(myNDArray)
print("--------------------------")

# 2.3 np.eye(n): n x n identity matrix
myNDArray = np.eye(3)
print("使用np.eye(3)创建")
print(myNDArray)
print("--------------------------")

# 2.4 np.diag(data): square matrix with data on the diagonal
myNDArray = np.diag([1, 2, 3, 4])
print("使用np.diag([1,2,3,4])创建")
print(myNDArray)
print("--------------------------")

# 2.5 np.arange(start, stop, step): 1-D array over a range with a given step
myNDArray = np.arange(1, 11, 1)
print("使用np.arange(1,11,1)创建")
print(myNDArray)
print("--------------------------")

# 2.6 np.linspace(start, stop, num, endpoint=True): num evenly spaced values
#     (arithmetic progression) from start to stop; stop is included by default
myNDArray = np.linspace(1, 11, 5)
print("使用np.linspace(1,11,5)创建")
print(myNDArray)
print("--------------------------")

# 2.7 np.logspace(start, stop, num, endpoint=True, base=10): num values in
#     geometric progression from base**start to base**stop (stop included by
#     default, base defaults to 10)
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")


# 2.8 The np.random module
# 2.8.1 np.random.random(size): 1-D array of `size` random values between 0 and 1
myNDArray = np.random.random(10)
print("使用np.random.random(10)创建")
print(myNDArray)
print("--------------------------")

# 2.8.2 np.random.normal(loc, scale, size): `size` samples from a normal
#       distribution with mean loc and standard deviation scale
myNDArray = np.random.normal(1.75, 1, 100)
print("使用 np.random.normal(1.75,1,100)创建")
print(myNDArray)
print("--------------------------")
# Visualise the normal samples as a 10-bin histogram
import matplotlib.pyplot as plt
plt.figure()
plt.hist(myNDArray, 10)
plt.show()

# 2.8.3 np.random.uniform(low, high, size): samples drawn uniformly from
#       [low, high). low must not exceed high; the two bounds were passed in
#       the wrong order before.
myNDArray = np.random.uniform(1, 1.10, (20, 30))
print("使用 np.random.uniform(1,1.10,(20,30))创建")
print(myNDArray)
print("--------------------------")
# Visualise with a histogram.
# NOTE: plt.hist treats a 2-D input as one dataset per column; pass
# myNDArray.ravel() instead to get a single histogram of all samples.
plt.figure()
plt.hist(myNDArray, 10)
plt.show()

# 2.8.4 np.random.randint(a, b, size=(), dtype=int): array of the given size
#       filled with random integers from [a, b) (values may repeat)

# 2.8.5 np.random.randn(d0, d1, ..., dn): samples from the standard normal
#       distribution (mean 0, standard deviation 1); di is the length of axis i
# print(np.random.randn(2,3))

使用np.array创建
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
--------------------------
使用np.asarray创建
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
--------------------------
使用np.zeros((2,3))创建
[[0. 0. 0.]
 [0. 0. 0.]]
--------------------------
使用np.ones((2,3))创建
[[1. 1. 1.]
 [1. 1. 1.]]
--------------------------
使用np.zeros(2,3)创建
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
--------------------------
使用np.diag([1,2,3,4])创建
[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]
--------------------------
使用np.arange(1,11,1))创建
[ 1  2  3  4  5  6  7  8  9 10]
--------------------------
使用np.linspace(1,11,5)创建
[ 1.   3.5  6.   8.5 11. ]
--------------------------
使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
使用np.np.random.random(10)创建
[0.98694144 0.76442002 0.68214597 0.14857849 0.77693909 0.80805074
 0.21922838 0.18226575 0.26151965 0.15434984]
--------------------------
使用 np.random.normal(1.75,1,100)创建
[ 1.67505971  0.40113656  2.48161824 -0.01845878  0.90496477  2.69802984
  2.94877678  0.11782976  2.07285894  3.16597878  3.31993686  1.27704351
  1.9537056   0.84782021  2.40737044  0.2981676   0.54453057  3.48002726
  2.29480154  0.98207877  1.80277311  2.14887504  1.3656154   1.60483568
  2.67246384  2.28251879  1.90566007  2.68032271  0.68770623  1.65787999
  2.59625459  1.7460682  -0.39109142  3.1986754   2.02785007  1.76944876
  2.45046819  1.31492081  2.62908256  2.2628327   1.43874272  1.31190384
  2.30522717 -0.15768705  2.30893341  3.31765409  2.23322137  2.07029301
  1.3442506   1.4111489   3.80380232  0.42765345  0.96485353 -0.28374991
  3.73684752  1.378377    0.37245836  0.54526902  2.16979223  0.23042717
  2.29261266  0.63283867 -0.4741276   1.26395204  1.42796786 -0.42374254
  3.60312459  1.79117582  2.92906889  1.48240636  2.36196976  2.28228635
  2.43635078 -0.38269342  2.09099631  1.86993982  0.74433223  1.20943986
  1.55256999  1.46762698  0.83533645  1.33022822  1.05657318  1.77632771
  0.43736867  1.09809372  0.47584479  1.99188425  3.88644217  2.40961493
  1.06239141  2.79239086  2.09479233  1.01569475  2.71373938  1.75166344
  2.65374284  2.19328214  1.4190753   1.90988844]
--------------------------

在这里插入图片描述

使用 np.random.uniform(1.10,1,(2,3))创建
[[1.0646373  1.04106273 1.0493372  1.01323874 1.02643403 1.08756943
  1.02212617 1.06738376 1.02425757 1.03069668 1.07832839 1.06473457
  1.07000729 1.07619312 1.02304455 1.05318189 1.02412238 1.01569068
  1.00526404 1.0782949  1.04618623 1.06822866 1.00146653 1.01051038
  1.00344965 1.01393925 1.05490811 1.00784    1.05518212 1.04311717]
 [1.0161972  1.03336977 1.08515415 1.00538872 1.0048099  1.01892535
  1.02187729 1.07759793 1.0044576  1.05871996 1.09279977 1.01613777
  1.04812817 1.07807016 1.08747444 1.03193286 1.09220697 1.03715259
  1.08607635 1.09940974 1.0531829  1.06049616 1.02344541 1.02135377
  1.04657634 1.05591031 1.07599861 1.03680375 1.09862066 1.04289038]
 [1.02593125 1.00965726 1.06444982 1.02401025 1.07185022 1.05705167
  1.09746228 1.04192553 1.0069116  1.05229566 1.05951107 1.02913307
  1.04204475 1.08807504 1.05549678 1.00606048 1.09828326 1.06536738
  1.01245207 1.01919512 1.05593884 1.09891337 1.01178349 1.06798646
  1.06798943 1.03214976 1.03997211 1.04422143 1.05580566 1.00940351]
 [1.08527616 1.0407684  1.01089278 1.00474172 1.08707804 1.01449114
  1.074704   1.09791494 1.07646629 1.03379115 1.03625898 1.06829178
  1.03051412 1.04159155 1.02251988 1.08115572 1.08276213 1.09643571
  1.0586965  1.0872508  1.04649539 1.08920002 1.00349078 1.07687868
  1.01710053 1.08420061 1.08024906 1.09516048 1.09877514 1.04388992]
 [1.0612709  1.05243644 1.08522038 1.00013121 1.03534703 1.08780921
  1.0193435  1.00894882 1.00302732 1.01461643 1.0072089  1.03730753
  1.05381665 1.09445759 1.04131616 1.06119876 1.04852764 1.00175366
  1.08286298 1.00279791 1.05527854 1.0361621  1.09378048 1.03111487
  1.00669541 1.09474825 1.01639442 1.09015377 1.02394172 1.090647  ]
 [1.00015613 1.03282398 1.03716522 1.02517442 1.07579854 1.0482251
  1.06349351 1.01553608 1.03999816 1.04254074 1.06792544 1.05475944
  1.00114976 1.01739353 1.00824129 1.00200623 1.04649481 1.0051243
  1.0740545  1.05106591 1.00720557 1.00800414 1.02753216 1.09822785
  1.05727602 1.03426387 1.04683858 1.06691865 1.01377834 1.00705324]
 [1.0067312  1.07247999 1.09593779 1.01641581 1.00291297 1.0114776
  1.00105711 1.07605113 1.07579346 1.03026725 1.05393696 1.09963711
  1.05859099 1.09947237 1.01763998 1.03029485 1.0197936  1.00983296
  1.01802012 1.03844166 1.03516277 1.05402403 1.0927023  1.05876193
  1.087854   1.02137064 1.07775551 1.04664903 1.04622491 1.00956262]
 [1.02989735 1.08207482 1.04191933 1.09785752 1.05924577 1.09635408
  1.033684   1.09391279 1.01870238 1.05519453 1.01011376 1.01483874
  1.08141842 1.0880821  1.08154995 1.05923363 1.05106973 1.07919393
  1.01963205 1.05369732 1.07702226 1.01450227 1.03855182 1.08305393
  1.0236216  1.050381   1.0292057  1.01331392 1.08992239 1.00374043]
 [1.05060161 1.08649405 1.07435916 1.01101315 1.06029571 1.0613883
  1.070457   1.01786738 1.02438536 1.07860248 1.04866478 1.01978034
  1.08169093 1.03963785 1.02493817 1.0066989  1.09030658 1.05992054
  1.04863499 1.04604901 1.07653593 1.08996868 1.03754703 1.04983558
  1.01738205 1.02245851 1.08774079 1.03978456 1.06811202 1.00394928]
 [1.03458782 1.08135183 1.01569664 1.03430978 1.04429564 1.01601518
  1.01097922 1.01675885 1.09723926 1.04780064 1.07696447 1.08148946
  1.00054562 1.08183679 1.05303976 1.00145098 1.03689606 1.07795835
  1.029287   1.06312322 1.0250803  1.00676038 1.0140419  1.04726964
  1.0274964  1.02649829 1.00096425 1.01156186 1.01754458 1.05531346]
 [1.09650902 1.0501937  1.06202465 1.00478206 1.00743854 1.0153268
  1.03781729 1.08975352 1.04737951 1.05132598 1.08042364 1.02959953
  1.09620665 1.07454291 1.03555717 1.00811896 1.07945615 1.04025569
  1.03182324 1.02389092 1.05788885 1.07156594 1.08126521 1.01094597
  1.05453652 1.03866728 1.02859898 1.07011385 1.0639741  1.04873718]
 [1.01289679 1.06088697 1.01836315 1.09152328 1.04616927 1.05753735
  1.0534251  1.06095748 1.03362683 1.09954528 1.04185414 1.07635396
  1.03060254 1.05683366 1.06240245 1.09820495 1.04275747 1.03383959
  1.04278926 1.05108132 1.07775571 1.07758766 1.06246876 1.04136862
  1.0263422  1.09159908 1.02655434 1.04284162 1.09320124 1.02140362]
 [1.03076926 1.08489914 1.05564373 1.05989604 1.0298435  1.07130194
  1.08288339 1.04831859 1.06030481 1.09380775 1.06243543 1.02529793
  1.0719787  1.036513   1.07617001 1.06622073 1.09859239 1.0777177
  1.09110076 1.03473311 1.08305644 1.00216531 1.04208478 1.07066882
  1.09204254 1.02274071 1.06660437 1.05474587 1.06650928 1.02337433]
 [1.06936665 1.09303364 1.00260744 1.05244023 1.03261219 1.04621298
  1.01556954 1.01877958 1.01521382 1.09869627 1.08669236 1.07016579
  1.0952169  1.08934902 1.06328953 1.02382227 1.01876839 1.0009345
  1.027934   1.09276243 1.04391487 1.09443389 1.01542218 1.0921251
  1.08244328 1.01936138 1.00763194 1.08467298 1.02930919 1.01444137]
 [1.03241116 1.00601585 1.06875779 1.08777646 1.00891861 1.02913552
  1.06298963 1.05072194 1.06741635 1.03932451 1.04495906 1.0447338
  1.04151223 1.06903579 1.0628366  1.08174608 1.03152726 1.06632495
  1.09201743 1.03176641 1.032397   1.02919504 1.08260514 1.08569347
  1.06081318 1.0317379  1.05953802 1.02522307 1.04075903 1.00608679]
 [1.05553041 1.02101534 1.05290095 1.0641407  1.06111128 1.08564706
  1.04200773 1.0997679  1.04809944 1.00981487 1.04163062 1.07267272
  1.05242328 1.09129901 1.08763547 1.0178737  1.04900177 1.02927875
  1.03146633 1.04696562 1.03558471 1.05745922 1.07103769 1.03829319
  1.02809598 1.08237567 1.0357102  1.03505655 1.03042399 1.09207557]
 [1.0611547  1.02720924 1.0083003  1.01197453 1.06908812 1.02726194
  1.05819931 1.09970103 1.08127893 1.05669232 1.01912968 1.09735505
  1.02897018 1.03846417 1.0599642  1.01318714 1.00821409 1.07682408
  1.06714662 1.02513284 1.07503411 1.07189177 1.0973542  1.03331157
  1.04732424 1.09255911 1.04441842 1.08784725 1.09638214 1.08449857]
 [1.05152983 1.00141425 1.0890675  1.07067413 1.0472271  1.0992028
  1.07063033 1.02805586 1.01499495 1.00119302 1.02332388 1.06995337
  1.02036626 1.08210417 1.04553065 1.02409501 1.04450532 1.00192104
  1.00793566 1.0414471  1.02430948 1.07021291 1.06904127 1.03358178
  1.04097847 1.05477634 1.09583159 1.06618711 1.02113397 1.07613749]
 [1.09239319 1.05635017 1.07690173 1.02985581 1.00288046 1.0082043
  1.09042651 1.06242188 1.01469005 1.03368665 1.0700558  1.03243217
  1.09059841 1.09951115 1.03634514 1.06982106 1.06986148 1.06205987
  1.00002476 1.04247229 1.0447468  1.02022938 1.07420861 1.06889111
  1.03558028 1.05827583 1.01506662 1.0296531  1.02743054 1.05307535]
 [1.02543541 1.0190161  1.08180094 1.02548265 1.01674936 1.05028411
  1.00746477 1.04199418 1.0908696  1.09962999 1.06596327 1.05225282
  1.00740661 1.01910418 1.07142337 1.08127211 1.06233714 1.05716245
  1.01916161 1.0900991  1.00976151 1.00663847 1.00053414 1.02582093
  1.08658318 1.00868502 1.01652466 1.00617985 1.08167276 1.02615394]]
--------------------------

在这里插入图片描述

属性

# Prepare the data
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")

# Inspecting ndarray attributes
# 1. ndarray.ndim: number of dimensions
print("维度:\t\t",myNDArray.ndim)

# 2. ndarray.shape: shape of the array, as a tuple
print("形状:\t\t",myNDArray.shape)

# 3. ndarray.size: total number of elements
print("元素个数:\t",myNDArray.size)

# 4. ndarray.dtype: data type of the elements
print("元素数据类型:\t",myNDArray.dtype)

# 5. ndarray.itemsize: storage used by one element, in bytes
print("元素存储空间:\t",myNDArray.itemsize)

# 6. ndarray.T: the transposed array (a 1-D array is unchanged by it)
print("转置数组:\t\n",myNDArray.T)
使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
维度:		 1
形状:		 (10,)
元素个数:	 10
元素数据类型:	 float64
元素存储空间:	 8
转置数组:	
 [  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]

形状修改

# Three ways to change the shape of an array: reshape, resize and T.
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)

print("--------------------------")
print("使用 ndarray.reshape()")
print("修改后的多维数组")
# -1 means "unknown": that dimension is computed from the element count
print(myNDArray.reshape([-1,2]))
print("修改后的原始数组")
print(myNDArray)
print("【结论】reshape不修改原始数据 且执行函数后返回修改的值")

print("--------------------------")
print("使用 ndarray.resize()")
print("修改后的多维数组")
# resize works in place and returns None; -1 is not accepted here
print(myNDArray.resize([5,2]))
print("修改后的原始数组")
print(myNDArray)
print("【结论】resize修改原始数据 且执行函数后返回None")

print("--------------------------")
print("使用 ndarray.T")
print("修改前")
print(myNDArray)
print("修改后的多维数组")
# T is an attribute, not a method call; it yields the transposed array
print(myNDArray.T)
print("修改后的原始数组")
print(myNDArray)
print("【结论】T属性不修改原始数据 且执行函数后返回结果")
使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
使用 ndarray.reshape()
修改后的多维数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
修改后的原始数组
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
【结论】reshape不修改原始数据 且执行函数后返回修改的值
--------------------------
使用 ndarray.resize()
修改后的多维数组
None
修改后的原始数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
【结论】resize修改原始数据 且执行函数后返回None
--------------------------
使用 ndarray.T
修改前
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
修改后的多维数组
[[  1.           2.7825594    7.74263683  21.5443469   59.94842503]
 [  1.66810054   4.64158883  12.91549665  35.93813664 100.        ]]
修改后的原始数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
【结论】T属性不修改原始数据 且执行函数后返回结果

索引与切片

# Indexing
# An N-dimensional array takes one index per axis, outermost axis first:
# ndarray[i_n, i_(n-1), ..., i_2, i_1]
pointArray = np.random.uniform(10, 20, (3, 4, 5))
print(pointArray)
# Fetch the last element with a single tuple index (same element as the
# chained form pointArray[2][3][4])
last_position = (2, 3, 4)
print(pointArray[last_position])

[[[16.64716349 18.17656348 13.04133624 16.55571974 11.32178617]
  [17.79890946 16.34938001 13.85174534 16.56768269 11.5153287 ]
  [19.45768797 12.15795393 17.32689244 10.38788172 10.78220753]
  [16.2920733  18.05794596 12.96980799 18.02098106 13.50622008]]

 [[11.95759017 14.98805551 19.64466193 13.59526949 19.53866727]
  [11.15464832 15.46968762 12.76440722 12.41821119 16.24981854]
  [11.44377375 17.23954935 19.12779827 11.0169205  17.4642939 ]
  [11.38382855 19.36749907 13.84876092 12.8668164  10.55189115]]

 [[17.5450068  14.95714515 18.77974534 18.32744931 18.16252158]
  [19.16806758 18.05432418 16.60055023 11.384959   15.57648868]
  [18.85300183 16.99346902 17.08702811 12.82609623 16.3777613 ]
  [11.27290271 17.1267105  11.09846292 16.73752603 19.22497525]]]
19.224975245576598

运算

# 1. Element-wise comparison and masked assignment
# 1.1 A comparison yields a boolean array with one entry per element
nda = np.arange(1, 11, 2)
print(nda)
above_five = nda > 5
print(above_five)

# 1.2 Use the boolean mask to overwrite the matching elements
nda[above_five] = 99
print(nda)

# 2. Generic truth tests
above_four = nda > 4
# 2.1 np.any(): True if at least one element satisfies the condition
print(np.any(above_four))
# 2.2 np.all(): True only if every element satisfies the condition
print(np.all(above_four))

# 3. np.where as an element-wise ternary
# np.where(condition, value_if_true, value_if_false)
print(np.where(nda > 10, 1, 0))
# Compound conditions
inside_bounds = np.logical_and(nda > 0, nda < 100)
print(np.where(inside_bounds, 1, 0))
either_side = np.logical_or(nda > 0, nda < -1)
print(np.where(either_side, 1, 0))

# 4. Statistical functions
# min max sum mean std var cumsum cumprod argmin argmax

# 5. Broadcasting
# Arrays of different shapes can be combined when their shapes are compatible:
# comparing the shapes dimension by dimension, starting from the trailing one,
# each pair of sizes must be equal or one of them must be 1 (a missing
# dimension counts as 1). NumPy then stretches the size-1 dimensions
# virtually, without creating copies, so the computation can proceed.
[1 3 5 7 9]
[False False False  True  True]
[ 1  3  5 99 99]
True
False
[0 0 0 1 1]
[1 1 1 1 1]
[1 1 1 1 1]

matrix

import numpy as np
# A matrix is a specialised 2-D array: np.matrix subclasses ndarray, so the
# ndarray methods and attributes are available on it.
# np.asmatrix is used below because the np.mat alias was removed in NumPy 2.0.


# 1. Creating matrices
# 1.1 From literal data (a MATLAB-style string or a nested list)
matrix1 = np.asmatrix('1,2,3;4,5,6;7,8,9')
matrix2 = np.matrix([[1,2,3],[4,5,6],[7,8,9]])
print(matrix1,matrix2)

# 1.2 From an existing list or ndarray variable
array = [[1,2,3],[4,5,6],[7,8,9]]
matrix3= np.asmatrix(array)
matrix4= np.matrix(array)
print(matrix3,matrix4)
ndarray = np.array([[1,2,3],[4,5,6],[7,8,9]])
matrix5= np.asmatrix(ndarray)
matrix6= np.matrix(ndarray)


# 2. Matrix addition
# Both operands must be matrices of the same shape, or one may be a scalar
print("Matrix加法运算示例")
print(matrix4+2)
print(matrix3+matrix4)


# 3. Matrix multiplication
# Shapes must be compatible: (m, n) * (n, l) -> (m, l)
# matrix1 * matrix2            the * operator is the matrix product for np.matrix
# np.dot(object1, object2)     also accepts a scalar operand; same result as *
# np.matmul(matrix1, matrix2)  matrix-by-matrix only, scalars are not accepted
print("Matrix乘法运算示例")
print(matrix5*2)
print(np.dot(matrix5,5))
print(np.matmul(matrix5,matrix6))
# 4. Attributes specific to np.matrix
# T (transpose)  H (conjugate transpose)  I (inverse)  A (the data as a 2-D ndarray)
[[1 2 3]
 [4 5 6]
 [7 8 9]] [[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2 3]
 [4 5 6]
 [7 8 9]] [[1 2 3]
 [4 5 6]
 [7 8 9]]
Matrix加法运算示例
[[ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]
Matrix乘法运算示例
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]
[[ 5 10 15]
 [20 25 30]
 [35 40 45]]
[[ 30  36  42]
 [ 66  81  96]
 [102 126 150]]

Pandas

介绍

Pandas基于:

  • Numpy
  • matplotlib

其主要应用领域为数据挖掘
具有独特的数据结构:

  • Series
  • DataFrame
  • So on

具有的独特优势:

  • 更好的图表可读性
  • 更强的数据处理能力
  • 方便的文件操作

Series

创建

import pandas as pd
import numpy as np

# pd.Series(list / ndarray)  [note the capital S in Series]
divider = "------------------------------------------"

# Default index: 0..n-1
s1 = pd.Series(np.arange(10))
print(s1)

# Explicit index labels
print(divider)
letter_labels = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(np.arange(5), index=letter_labels)
print(s2)

# From a dict: the keys become the index
print(divider)
colour_codes = {'red': 1, 'green': 2, 'blue': 3}
s3 = pd.Series(colour_codes)
print(s3)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
------------------------------------------
a    0
b    1
c    2
d    3
e    4
dtype: int32
------------------------------------------
red      1
green    2
blue     3
dtype: int64

属性

'''
index,values属性
'''

import pandas as pd

# Build a small Series from a dict, then show its values and index attributes
colour_codes = {'red': 1, 'green': 2, 'blue': 3}
s3 = pd.Series(colour_codes)
print(s3)
print("------------------------------------------")
series_values = s3.values
series_index = s3.index
print("s3的值为:", series_values)
print("s3的索引为:", series_index)
red      1
green    2
blue     3
dtype: int64
------------------------------------------
s3的值为: [1 2 3]
s3的索引为: Index(['red', 'green', 'blue'], dtype='object')

DataFrame

创建

import pandas as pd
import numpy as np

# Default integer row and column labels
df1 = pd.DataFrame(np.random.randn(2,3))
print(df1)

# Explicit labels: 10 students (rows) by 5 subjects (columns), random scores
score = np.random.randint(40,100,(10,5))
rowIndex = ["STU_No"+str(i) for i in range(1,11)]
columnIndex =  ['Chinese',"Math",'English','Physical','PE']
df2 = pd.DataFrame(score,index=rowIndex,columns=columnIndex)
print(df2)

# Replace the row index.
# The index can only be replaced as a whole; assigning a single entry
# (df2.index[0] = xxx) is not allowed.
df2.index = ["STU_No."+str(i) for i in range(1,11)]
print(df2)

# reset_index(drop=True) discards the old index in favour of 0..n-1.
# It returns a NEW DataFrame and leaves df2 untouched, so print the returned
# frame; printing df2 afterwards would show no change.
print(df2.reset_index(drop=True))

# set_index promotes a column to the row index; it also returns a new DataFrame
print(df2.set_index('Chinese'))
          0         1         2
0 -0.157809  0.320192  0.231431
1 -0.761845  0.638310  0.539367
          Chinese  Math  English  Physical  PE
STU_No1        59    58       83        52  74
STU_No2        86    97       78        56  55
STU_No3        57    75       75        43  96
STU_No4        91    81       78        55  96
STU_No5        72    82       54        68  40
STU_No6        75    64       96        91  41
STU_No7        42    50       92        91  47
STU_No8        64    51       67        98  71
STU_No9        95    49       68        45  81
STU_No10       84    56       47        45  43
           Chinese  Math  English  Physical  PE
STU_No.1        59    58       83        52  74
STU_No.2        86    97       78        56  55
STU_No.3        57    75       75        43  96
STU_No.4        91    81       78        55  96
STU_No.5        72    82       54        68  40
STU_No.6        75    64       96        91  41
STU_No.7        42    50       92        91  47
STU_No.8        64    51       67        98  71
STU_No.9        95    49       68        45  81
STU_No.10       84    56       47        45  43
           Chinese  Math  English  Physical  PE
STU_No.1        59    58       83        52  74
STU_No.2        86    97       78        56  55
STU_No.3        57    75       75        43  96
STU_No.4        91    81       78        55  96
STU_No.5        72    82       54        68  40
STU_No.6        75    64       96        91  41
STU_No.7        42    50       92        91  47
STU_No.8        64    51       67        98  71
STU_No.9        95    49       68        45  81
STU_No.10       84    56       47        45  43
         Math  English  Physical  PE
Chinese
59         58       83        52  74
86         97       78        56  55
57         75       75        43  96
91         81       78        55  96
72         82       54        68  40
75         64       96        91  41
42         50       92        91  47
64         51       67        98  71
95         49       68        45  81
84         56       47        45  43

属性

# Prepare the data
import pandas as pd
import numpy as np  # was aliased as "mp" although the code below uses np
score = np.random.randint(40,100,(10,5))
rowIndex = ["STU_No"+str(i) for i in range(1,11)]
columnIndex =  ['Chinese',"Math",'English','Physical','PE']
df2 = pd.DataFrame(score,index=rowIndex,columns=columnIndex)

# shape: the shape of the DataFrame
print("df2的形状:",df2.shape)

# index: the row index of the DataFrame
print("df2的行索引:\n",df2.index)

# columns: the column index of the DataFrame
print("df2的列索引:\n",df2.columns)

# values: the values of the DataFrame
print("df2的值:\n",df2.values)

# T: the transposed DataFrame
print("df2的转置:\n",df2.T)

# head(num=5): the first num rows of the DataFrame
print("df2的前3行数据:\n",df2.head(3))

# tail(num=5): the last num rows of the DataFrame
print("df2的后5行数据:\n",df2.tail())
df2的形状: (10, 5)
df2的行索引:
 Index(['STU_No1', 'STU_No2', 'STU_No3', 'STU_No4', 'STU_No5', 'STU_No6',
       'STU_No7', 'STU_No8', 'STU_No9', 'STU_No10'],
      dtype='object')
df2的列索引:
 Index(['Chinese', 'Math', 'English', 'Physical', 'PE'], dtype='object')
df2的值:
 [[64 77 53 67 70]
 [48 89 99 68 43]
 [63 57 53 59 91]
 [44 72 78 96 40]
 [44 88 64 82 79]
 [80 42 46 54 96]
 [97 52 89 68 58]
 [57 53 76 78 96]
 [86 57 46 76 75]
 [78 73 61 91 67]]
df2的转置:
           STU_No1  STU_No2  STU_No3  STU_No4  STU_No5  STU_No6  STU_No7  \
Chinese        64       48       63       44       44       80       97   
Math           77       89       57       72       88       42       52   
English        53       99       53       78       64       46       89   
Physical       67       68       59       96       82       54       68   
PE             70       43       91       40       79       96       58   

          STU_No8  STU_No9  STU_No10  
Chinese        57       86        78  
Math           53       57        73  
English        76       46        61  
Physical       78       76        91  
PE             96       75        67  
df2的前3行数据:
          Chinese  Math  English  Physical  PE
STU_No1       64    77       53        67  70
STU_No2       48    89       99        68  43
STU_No3       63    57       53        59  91
df2的后5行数据:
           Chinese  Math  English  Physical  PE
STU_No6        80    42       46        54  96
STU_No7        97    52       89        68  58
STU_No8        57    53       76        78  96
STU_No9        86    57       46        76  75
STU_No10       78    73       61        91  67

Pandas的基本数据操作

索引与切片

# Prepare the data
import pandas as pd
import numpy as np  # was aliased as "mp" although the code below uses np
score = np.random.randint(40,100,(10,5))
rowIndex = ["STU_No"+str(i) for i in range(1,11)]
columnIndex =  ['Chinese',"Math",'English','Physical','PE']
df2 = pd.DataFrame(score,index=rowIndex,columns=columnIndex)
print(df2)
print()

# 1. Direct label indexing (column first, then row)
# NOT supported: row first then column, or positional numbers such as array[1,1]
print("使用直接索引")
print(df2['Chinese']['STU_No1'])
print()

# 2. loc: label-based selection (a label slice includes both endpoints)
print("使用loc函数")
print(df2.loc["STU_No1":"STU_No5","Chinese"])
print()

# 3. iloc: position-based selection
print("使用iloc函数")
print(df2.iloc[0:3,0:2])
print()

# 4. ix: mixed positional/label indexing (deprecated in pandas 0.20 and
#    removed in 1.0, so the example stays commented out)
# print("使用ix函数")
# print(df2.ix[0:4,["Chinese","PE"]])
          Chinese  Math  English  Physical  PE
STU_No1        48    62       52        55  94
STU_No2        75    94       79        84  85
STU_No3        81    76       51        88  88
STU_No4        85    79       60        49  62
STU_No5        43    90       84        56  48
STU_No6        70    95       71        71  55
STU_No7        97    95       50        77  50
STU_No8        68    51       54        45  79
STU_No9        49    82       53        58  63
STU_No10       86    91       69        98  86

使用直接索引
48

使用loc函数
STU_No1    48
STU_No2    75
STU_No3    81
STU_No4    85
STU_No5    43
Name: Chinese, dtype: int32

使用iloc函数
         Chinese  Math
STU_No1       48    62
STU_No2       75    94
STU_No3       81    76

赋值与排序

# Prepare the data
import pandas as pd
import numpy as np  # was aliased as "mp" although the code below uses np
score = np.random.randint(40,100,(5,5))
rowIndex = ["STU_No"+str(i) for i in range(1,6)]
columnIndex =  ['Chinese',"Math",'English','Physical','PE']
df2 = pd.DataFrame(score,index=rowIndex,columns=columnIndex)
print("原始数据")
print(df2)
print()

# 1. Assignment
# Select the elements by index first, then assign with =
# (here: every row of the column at position 4, "PE")
df2.iloc[0:5,4] = 100
print("赋值后的数据")
print(df2,"\n")

# 2. Sorting
# 2.1 By values: df.sort_values(by=, ascending=True)
#     by = column(s) to sort on, ascending = whether to sort in ascending order
print("多指标排序的结果")
print(df2.sort_values(by=['Chinese','Math'],ascending = True))
print()

# 2.2 By index: df.sort_index(ascending=True)
print("索引排序结果")
print(df2.sort_index(ascending=False))

# 2.3 Note
# The calls above are shown on a DataFrame. They also work on a Series, which
# is one-dimensional, so the by argument is not needed there.

原始数据
         Chinese  Math  English  Physical  PE
STU_No1       47    55       48        85  54
STU_No2       66    91       47        50  62
STU_No3       72    72       86        82  84
STU_No4       70    73       55        78  43
STU_No5       93    87       85        88  93

赋值后的数据
         Chinese  Math  English  Physical   PE
STU_No1       47    55       48        85  100
STU_No2       66    91       47        50  100
STU_No3       72    72       86        82  100
STU_No4       70    73       55        78  100
STU_No5       93    87       85        88  100 

多指标排序的结果
         Chinese  Math  English  Physical   PE
STU_No1       47    55       48        85  100
STU_No2       66    91       47        50  100
STU_No4       70    73       55        78  100
STU_No3       72    72       86        82  100
STU_No5       93    87       85        88  100

索引排序结果
         Chinese  Math  English  Physical   PE
STU_No5       93    87       85        88  100
STU_No4       70    73       55        78  100
STU_No3       72    72       86        82  100
STU_No2       66    91       47        50  100
STU_No1       47    55       48        85  100

算术、逻辑、统计运算

# [Arithmetic operations]
# Addition:       .add(i)  or  + i
# Subtraction:    .sub(i)  or  - i
# Multiplication: .mul(i)  or  * i
# Division:       .div(i)  or  / i
# Prepare the data
import pandas as pd
import numpy as np  # was aliased as "mp" although the code below uses np
score = np.random.randint(40,100,(5,5))
rowIndex = ["STU_No"+str(i) for i in range(1,6)]
columnIndex =  ['Chinese',"Math",'English','Physical','PE']
df2 = pd.DataFrame(score,index=rowIndex,columns=columnIndex)
df2
ChineseMathEnglishPhysicalPE
STU_No16186666557
STU_No29997576364
STU_No34989468463
STU_No47681865265
STU_No56188705184
df2["Chinese"].add(10)
STU_No1     71
STU_No2    109
STU_No3     59
STU_No4     86
STU_No5     71
Name: Chinese, dtype: int32
df2*10
ChineseMathEnglishPhysicalPE
STU_No1610860660650570
STU_No2990970570630640
STU_No3490890460840630
STU_No4760810860520650
STU_No5610880700510840
# 【逻辑运算】
# 使用逻辑符号
df2[(df2["Chinese"]>80) & (df2["Chinese"]<90)]
ChineseMathEnglishPhysicalPE
# query函数
df2.query("Chinese>80 & Chinese<95")
ChineseMathEnglishPhysicalPE
# isin函数
df2[df2["Chinese"].isin([86,96])]
ChineseMathEnglishPhysicalPE
# 【统计运算】
# 1.综合统计函数 describe
df2.describe()
ChineseMathEnglishPhysicalPE
count5.0000005.000005.0000005.0000005.000000
mean69.20000088.2000065.00000063.00000066.600000
std19.2145785.8051714.93318513.32291310.212737
min49.00000081.0000046.00000051.00000057.000000
25%61.00000086.0000057.00000052.00000063.000000
50%61.00000088.0000066.00000063.00000064.000000
75%76.00000089.0000070.00000065.00000065.000000
max99.00000097.0000086.00000084.00000084.000000
# 2.使用统计函数 min max mode abs median sum idxmax idxmin  向函数传入参数axis
#    以及累计统计函数 cumsum cumprod cummax cummin
# 3.自定义运算
# apply(func,axis) func自定义函数 axis运算轴
df2[["Chinese","Math"]]
ChineseMath
STU_No16186
STU_No29997
STU_No34989
STU_No47681
STU_No56188
df2[["Chinese","Math"]].apply(lambda x:x.max()-x.min(),axis=0)
Chinese    50
Math       16
dtype: int64

Pandas的绘图

# DataFrame_OR_Series_Object.plot(kind='图形的样式(如line直线,bar柱状图,barh横向柱状图,hist直方图,pie饼图,scatter散点图)')
df2[["Chinese"]].plot(kind='line')
<Axes: >

在这里插入图片描述


Pandas的文件操作

CSV文件

import pandas as pd
# 读 Pandas.read_csv(file_Path,sep=',',usecols=) 文件路径 分隔符 读取的数据列名
data = pd.read_csv("./data/参会与请假名单.csv",usecols=["序号",'年级','姓名','班级','是否请假'])
data
序号年级班级姓名是否请假
012020级智管2001徐顺明
122020级智管2001付芸
232020级智管2001晏程博
342020级智管2001李澎宣
452021级智管2001李雯
562021级2021级电子信息专硕1班李轩
672022级2022级电子信息专硕1班彭玉洁
782021级2021级电子信息专硕1班贾啸宇
892022级2022级电子信息专硕1班唐振瀚
9102021级2021级电子信息专硕1班朱旭炜
# 写文件 fileObject.to_csv( file_Path , columns= , mode = w , encoding=)
data = pd.read_csv("./data/参会与请假名单.csv",usecols=["序号",'年级','姓名','班级','是否请假'])
data.to_csv("./data/simpleList.csv",columns=['姓名','是否请假'],index=False)
data = pd.read_csv("./data/simpleList.csv")
data
姓名是否请假
0徐顺明
1付芸
2晏程博
3李澎宣
4李雯
5李轩
6彭玉洁
7贾啸宇
8唐振瀚
9朱旭炜

缺失值处理

处理思路

  • 获取缺失值的标记方式(NaN 或者 ?等其他标记)
  • 如果缺失值为NaN
    • 判断数据中是否包含NaN
      • pd.isnull(df)
      • pd.notnull(df)
    • 存在缺失值
      • 删除缺失值 dropna(axis)
        • 此方法不会修改原始数据,而是返回新对象
      • 替换缺失值 fillna(value,inplace = True)
        • value: 替换的值
        • inplace:是否修改原始数据
  • 如果缺失值不为NaN
    - 先替换缺失标记为NaN,然后按照上述方法执行

案例说明

import pandas as pd
## 导入数据
data = pd.read_csv("./data/IMDB-Movie-Data.csv")
data.head()
RankTitleGenreDescriptionDirectorActorsYearRuntime (Minutes)RatingVotesRevenue (Millions)Metascore
01Guardians of the GalaxyAction,Adventure,Sci-FiA group of intergalactic criminals are forced ...James GunnChris Pratt, Vin Diesel, Bradley Cooper, Zoe S...20141218.1757074333.1376.0
12PrometheusAdventure,Mystery,Sci-FiFollowing clues to the origin of mankind, a te...Ridley ScottNoomi Rapace, Logan Marshall-Green, Michael Fa...20121247.0485820126.4665.0
23SplitHorror,ThrillerThree girls are kidnapped by a man with a diag...M. Night ShyamalanJames McAvoy, Anya Taylor-Joy, Haley Lu Richar...20161177.3157606138.1262.0
34SingAnimation,Comedy,FamilyIn a city of humanoid animals, a hustling thea...Christophe LourdeletMatthew McConaughey,Reese Witherspoon, Seth Ma...20161087.260545270.3259.0
45Suicide SquadAction,Adventure,FantasyA secret government agency recruits some of th...David AyerWill Smith, Jared Leto, Margot Robbie, Viola D...20161236.2393727325.0240.0
import numpy as np
# 检查是否存在缺失值
print("存在缺失值:",np.any(pd.isna(data)))
存在缺失值: True
# 缺失值处理
# 删除dropna
data = data.dropna()
print("存在缺失值:",np.any(pd.isna(data)))
存在缺失值: False
# Missing-value handling
# Replace with fillna(): each column is filled with its own mean, so walk the
# columns one at a time.
# NOTE: if `data` has already gone through data.dropna(), no NaN is left and
# this loop changes nothing; reload the CSV to see the replacement take effect.
for col in data.columns:
    if data[col].isna().any():
        # Assign the filled column back. Calling fillna(inplace=True) on the
        # result of data[col] acts on a selection and may leave `data` unchanged.
        data[col] = data[col].fillna(data[col].mean())
data.head()
RankTitleGenreDescriptionDirectorActorsYearRuntime (Minutes)RatingVotesRevenue (Millions)Metascore
01Guardians of the GalaxyAction,Adventure,Sci-FiA group of intergalactic criminals are forced ...James GunnChris Pratt, Vin Diesel, Bradley Cooper, Zoe S...20141218.1757074333.1376.0
12PrometheusAdventure,Mystery,Sci-FiFollowing clues to the origin of mankind, a te...Ridley ScottNoomi Rapace, Logan Marshall-Green, Michael Fa...20121247.0485820126.4665.0
23SplitHorror,ThrillerThree girls are kidnapped by a man with a diag...M. Night ShyamalanJames McAvoy, Anya Taylor-Joy, Haley Lu Richar...20161177.3157606138.1262.0
34SingAnimation,Comedy,FamilyIn a city of humanoid animals, a hustling thea...Christophe LourdeletMatthew McConaughey,Reese Witherspoon, Seth Ma...20161087.260545270.3259.0
45Suicide SquadAction,Adventure,FantasyA secret government agency recruits some of th...David AyerWill Smith, Jared Leto, Margot Robbie, Viola D...20161236.2393727325.0240.0

数据离散化

说明:
数据离散化是指在连续属性的值域上,将值域分为若干个离散的区间,并用不同的符号表示落在该离散区间内的连续属性

# 数据准备
data = pd.read_csv("./data/stock_day.csv")
data.head()
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
data = data["p_change"] # 仅使用p-change作为演示
data.head()
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
Name: p_change, dtype: float64
# 数据离散化
# 自动分组 qcut( object , number ) 分组的对象 分组的数量
# 此方法将会给每个离散区间分配近乎一致的数量的元素
cut1 = pd.qcut(data,10)
cut1.value_counts()
p_change
(-10.030999999999999, -4.836]    65
(-0.462, 0.26]                   65
(0.26, 0.94]                     65
(5.27, 10.03]                    65
(-4.836, -2.444]                 64
(-2.444, -1.352]                 64
(-1.352, -0.462]                 64
(1.738, 2.938]                   64
(2.938, 5.27]                    64
(0.94, 1.738]                    63
Name: count, dtype: int64
# 数据离散化
# 手动分组 cut( object , list ) 分组的对象 分组区间
bins = [-100,-75,-10.0,10,75,100]
cut2 = pd.cut(data,bins)
cut2.value_counts()
p_change
(-10.0, 10.0]      622
(-75.0, -10.0]      11
(10.0, 75.0]        10
(-100.0, -75.0]      0
(75.0, 100.0]        0
Name: count, dtype: int64
# 数据离散化
# 独热编码 one-hot pd.get_dummies(data,prefix)
dummies = pd.get_dummies(cut1,dtype=int)
dummies.head()
(-10.030999999999999, -4.836](-4.836, -2.444](-2.444, -1.352](-1.352, -0.462](-0.462, 0.26](0.26, 0.94](0.94, 1.738](1.738, 2.938](2.938, 5.27](5.27, 10.03]
2018-02-270000000100
2018-02-260000000010
2018-02-230000000100
2018-02-220000001000
2018-02-140000000100

数据表的合并

说明:
应用场景为模型需要的数据在多张表中,此时可以使用合并操作将多张表合为一张

# 数据展示
cut1
2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64, right]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
cut2
2018-02-27    (-10.0, 10.0]
2018-02-26    (-10.0, 10.0]
2018-02-23    (-10.0, 10.0]
2018-02-22    (-10.0, 10.0]
2018-02-14    (-10.0, 10.0]
                  ...      
2015-03-06    (-10.0, 10.0]
2015-03-05    (-10.0, 10.0]
2015-03-04    (-10.0, 10.0]
2015-03-03    (-10.0, 10.0]
2015-03-02    (-10.0, 10.0]
Name: p_change, Length: 643, dtype: category
Categories (5, interval[float64, right]): [(-100.0, -75.0] < (-75.0, -10.0] < (-10.0, 10.0] < (10.0, 75.0] < (75.0, 100.0]]
# pd.concat( [ df1 , df2 ...] , axis= )
pd.concat([cut1,cut2],axis=1)
p_changep_change
2018-02-27(1.738, 2.938](-10.0, 10.0]
2018-02-26(2.938, 5.27](-10.0, 10.0]
2018-02-23(1.738, 2.938](-10.0, 10.0]
2018-02-22(0.94, 1.738](-10.0, 10.0]
2018-02-14(1.738, 2.938](-10.0, 10.0]
.........
2015-03-06(5.27, 10.03](-10.0, 10.0]
2015-03-05(1.738, 2.938](-10.0, 10.0]
2015-03-04(0.94, 1.738](-10.0, 10.0]
2015-03-03(0.94, 1.738](-10.0, 10.0]
2015-03-02(1.738, 2.938](-10.0, 10.0]

643 rows × 2 columns

# pd.merge( left , right , how , on )
# left, right : the tables to join
# how         : join type (left, right, outer, inner), similar to database joins
# on          : the shared key column(s)
left_columns = {
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
right_columns = {
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
left = pd.DataFrame(left_columns)
right = pd.DataFrame(right_columns)
left
key1key2AB
0K0K0A0B0
1K0K1A1B1
2K1K0A2B2
3K2K1A3B3
right
key1key2CD
0K0K0C0D0
1K1K0C1D1
2K1K0C2D2
3K2K0C3D3
pd.merge(left, right, on=["key1", "key2"])
key1key2ABCD
0K0K0A0B0C0D0
1K1K0A2B2C1D1
2K1K0A2B2C2D2

交叉表与透视表

# 交叉表 pd.crosstab( Series1 , Series2 ) 展示两列数据的关系
# 数据准备
data = pd.read_csv("./data/stock_day.csv")
data.head()
data
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
.............................................
2015-03-0613.1714.4814.2813.13179831.721.128.5113.11213.11213.112115090.18115090.18115090.186.16
2015-03-0512.8813.4513.1612.8793180.390.262.0212.82012.82012.82098904.7998904.7998904.793.19
2015-03-0412.8012.9212.9012.6167075.440.201.5712.70712.70712.707100812.93100812.93100812.932.30
2015-03-0312.5213.0612.7012.52139071.610.181.4412.61012.61012.610117681.67117681.67117681.674.76
2015-03-0212.2512.6712.5212.2096291.730.322.6212.52012.52012.52096291.7396291.7396291.733.30

643 rows × 14 columns

time = pd.to_datetime(data.index)
time
DatetimeIndex(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22',
               '2018-02-14', '2018-02-13', '2018-02-12', '2018-02-09',
               '2018-02-08', '2018-02-07',
               ...
               '2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10',
               '2015-03-09', '2015-03-06', '2015-03-05', '2015-03-04',
               '2015-03-03', '2015-03-02'],
              dtype='datetime64[ns]', length=643, freq=None)
data['week'] = time.weekday
data.head()
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnoverweek
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.391
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.530
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.324
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.903
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.582
data['result'] = np.where(data['p_change']>0,1,0)
data.head()
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnoverweekresult
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.3911
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.5301
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.3241
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.9031
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.5821
# Count, for each weekday, how many rows have result == 0 and result == 1
result = pd.crosstab(data['week'],data['result'])
# Turn the counts into per-weekday proportions before plotting.
# The totals are indexed by `week`, so they must be aligned along the row
# index (axis=0). Dividing with axis=1 matches them against the column labels
# instead and yields spurious all-NaN columns.
row_totals = result.sum(axis=1)
result = result.div(row_totals,axis=0)
result
01234
week
00.5040.473282NaNNaNNaN
10.4400.580153NaNNaNNaN
20.4880.541985NaNNaNNaN
30.5040.496183NaNNaNNaN
40.4720.519084NaNNaNNaN
result.plot(kind='bar',title='Picture 1')
result.plot(kind='bar',title='Picture 2',stacked=True)
<Axes: title={'center': 'Picture 2'}, xlabel='week'>

在这里插入图片描述

在这里插入图片描述

# 透视表 dataFrame.pivot_table( Series1(Variable) , Series2(Index) )
data.pivot_table(['result'],['week'])
result
week
00.496000
10.580153
20.537879
30.507812
40.535433

分组与聚合

import pandas as pd
# 读 Pandas.read_csv(file_Path,sep=',',usecols=) 文件路径 分隔符 读取的数据列名
data = pd.read_csv("./data/参会与请假名单.csv",usecols=["序号",'年级','姓名','班级','是否请假'])
data
序号年级班级姓名是否请假
012020级智管2001徐顺明
122020级智管2001付芸
232020级智管2001晏程博
342020级智管2001李澎宣
452021级智管2001李雯
562021级2021级电子信息专硕1班李轩
672022级2022级电子信息专硕1班彭玉洁
782021级2021级电子信息专硕1班贾啸宇
892022级2022级电子信息专硕1班唐振瀚
9102021级2021级电子信息专硕1班朱旭炜
data.groupby(['年级']).count()
序号班级姓名是否请假
年级
2020级4444
2021级4444
2022级2222

案例分析

星巴克零售店数据分析

目的:
按照国家、国内省份两种方式进行划分 查看星巴克零售店数量分布

import pandas as pd
data = pd.read_csv("./data/starbucks/directory.csv")
data.head()
BrandStore NumberStore NameOwnership TypeStreet AddressCityState/ProvinceCountryPostcodePhone NumberTimezoneLongitudeLatitude
0Starbucks47370-257954Meritxell, 96LicensedAv. Meritxell, 96Andorra la Vella7ADAD500376818720GMT+1:00 Europe/Andorra1.5342.51
1Starbucks22331-212325Ajman Drive ThruLicensed1 Street 69, Al JarfAjmanAJAENaNNaNGMT+04:00 Asia/Dubai55.4725.42
2Starbucks47089-256771Dana MallLicensedSheikh Khalifa Bin Zayed St.AjmanAJAENaNNaNGMT+04:00 Asia/Dubai55.4725.39
3Starbucks22126-218024Twofour 54LicensedAl Salam StreetAbu DhabiAZAENaNNaNGMT+04:00 Asia/Dubai54.3824.48
4Starbucks17127-178586Al Ain TowerLicensedKhaldiya Area, Abu Dhabi IslandAbu DhabiAZAENaNNaNGMT+04:00 Asia/Dubai54.5424.51
# Group by country and count the non-null Brand entries in each group,
# then draw the counts as a bar chart
count = data.groupby(['Country'])['Brand'].count()
count.plot.bar(figsize=(20,8))
<Axes: xlabel='Country'>

在这里插入图片描述

# Group by (country, state/province) and count the non-null Brand entries in
# each group, then draw the counts as a (very wide) bar chart
count = data.groupby(['Country','State/Province'])['Brand'].count()
count.plot.bar(figsize=(200,8))

图片

电影数据分析

目标:

  • 电影的平均分与电影的导演人数
  • 电影的rating与runtime的分布
  • 统计电影的分类数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("./data/IMDB-Movie-Data.csv")
data.head()
RankTitleGenreDescriptionDirectorActorsYearRuntime (Minutes)RatingVotesRevenue (Millions)Metascore
01Guardians of the GalaxyAction,Adventure,Sci-FiA group of intergalactic criminals are forced ...James GunnChris Pratt, Vin Diesel, Bradley Cooper, Zoe S...20141218.1757074333.1376.0
12PrometheusAdventure,Mystery,Sci-FiFollowing clues to the origin of mankind, a te...Ridley ScottNoomi Rapace, Logan Marshall-Green, Michael Fa...20121247.0485820126.4665.0
23SplitHorror,ThrillerThree girls are kidnapped by a man with a diag...M. Night ShyamalanJames McAvoy, Anya Taylor-Joy, Haley Lu Richar...20161177.3157606138.1262.0
34SingAnimation,Comedy,FamilyIn a city of humanoid animals, a hustling thea...Christophe LourdeletMatthew McConaughey,Reese Witherspoon, Seth Ma...20161087.260545270.3259.0
45Suicide SquadAction,Adventure,FantasyA secret government agency recruits some of th...David AyerWill Smith, Jared Leto, Margot Robbie, Viola D...20161236.2393727325.0240.0
print("电影平均分",data['Rating'].mean())
电影平均分 6.723199999999999
print("电影导演人数:(去重)",np.unique(data['Director']).shape[0])
电影导演人数:(去重) 644
data['Rating'].plot(kind='hist',figsize=(20,8),title='Score OF Film')
<Axes: title={'center': 'Score OF Film'}, ylabel='Frequency'>

在这里插入图片描述

data['Runtime (Minutes)'].plot(kind='hist',figsize=(20,8),title='Runtime OF Film')
<Axes: title={'center': 'Runtime OF Film'}, ylabel='Frequency'>

在这里插入图片描述

# Genre analysis: prepare an all-zero (movies x genres) table
# One list of genre names per movie, split from the comma-separated Genre text
tempList = [genres.split(',') for genres in data['Genre']]
# Distinct genre names across all movies
genreList = np.unique([name for genres in tempList for name in genres])
# One row per movie, one column per genre, every cell starting at 0.0
zero_table = np.zeros((len(data), len(genreList)))
tempDf = pd.DataFrame(zero_table, columns=genreList)
tempDf
ActionAdventureAnimationBiographyComedyCrimeDramaFamilyFantasyHistoryHorrorMusicMusicalMysteryRomanceSci-FiSportThrillerWarWestern
00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
...............................................................
9950.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
9960.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
9970.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
9980.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
9990.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0

1000 rows × 20 columns

# Mark each movie's genres with 1 in the zero table.
# tempList is a plain Python list and tempDf has a default 0..n-1 index, so
# iterate by position. Using data.index labels to index tempList is only
# correct while `data` keeps its default index, e.g. it breaks after dropna().
for row, genres in enumerate(tempList):
    tempDf.loc[row, genres] = 1
print(tempDf)
# Column sums give the number of movies per genre
print("\n分类结果\n",tempDf.sum(axis=0))
     Action  Adventure  Animation  Biography  Comedy  Crime  Drama  Family  \
0       1.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
1       0.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
2       0.0        0.0        0.0        0.0     0.0    0.0    0.0     0.0   
3       0.0        0.0        1.0        0.0     1.0    0.0    0.0     1.0   
4       1.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
..      ...        ...        ...        ...     ...    ...    ...     ...   
995     0.0        0.0        0.0        0.0     0.0    1.0    1.0     0.0   
996     0.0        0.0        0.0        0.0     0.0    0.0    0.0     0.0   
997     0.0        0.0        0.0        0.0     0.0    0.0    1.0     0.0   
998     0.0        1.0        0.0        0.0     1.0    0.0    0.0     0.0   
999     0.0        0.0        0.0        0.0     1.0    0.0    0.0     1.0   

     Fantasy  History  Horror  Music  Musical  Mystery  Romance  Sci-Fi  \
0        0.0      0.0     0.0    0.0      0.0      0.0      0.0     1.0   
1        0.0      0.0     0.0    0.0      0.0      1.0      0.0     1.0   
2        0.0      0.0     1.0    0.0      0.0      0.0      0.0     0.0   
3        0.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
4        1.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
..       ...      ...     ...    ...      ...      ...      ...     ...   
995      0.0      0.0     0.0    0.0      0.0      1.0      0.0     0.0   
996      0.0      0.0     1.0    0.0      0.0      0.0      0.0     0.0   
997      0.0      0.0     0.0    1.0      0.0      0.0      1.0     0.0   
998      0.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
999      1.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   

     Sport  Thriller  War  Western  
0      0.0       0.0  0.0      0.0  
1      0.0       0.0  0.0      0.0  
2      0.0       1.0  0.0      0.0  
3      0.0       0.0  0.0      0.0  
4      0.0       0.0  0.0      0.0  
..     ...       ...  ...      ...  
995    0.0       0.0  0.0      0.0  
996    0.0       0.0  0.0      0.0  
997    0.0       0.0  0.0      0.0  
998    0.0       0.0  0.0      0.0  
999    0.0       0.0  0.0      0.0  

[1000 rows x 20 columns]

分类结果
 Action       303.0
Adventure    259.0
Animation     49.0
Biography     81.0
Comedy       279.0
Crime        150.0
Drama        513.0
Family        51.0
Fantasy      101.0
History       29.0
Horror       119.0
Music         16.0
Musical        5.0
Mystery      106.0
Romance      141.0
Sci-Fi       120.0
Sport         18.0
Thriller     195.0
War           13.0
Western        7.0
dtype: float64


  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

NUDTer2026

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值