Matplotlib、Numpy、Pandas学习笔记

最新推荐文章于 2025-03-06 22:04:15 发布

NUDTer2026

最新推荐文章于 2025-03-06 22:04:15 发布

阅读量577

点赞数 6

分类专栏：学习笔记文章标签： matplotlib numpy pandas 机器学习学习笔记

本文链接：https://blog.csdn.net/m0_73867959/article/details/140776893

版权

学习笔记专栏收录该内容

7 篇文章

订阅专栏

Matplotlib

import matplotlib.pyplot as plt
import random

折线图绘制

单图表绘制

import matplotlib.pyplot as plt
import random
# Draw a single line chart of 60 simulated temperature readings.

# Prepare the data: x is the sample (date) index, y is a random
# temperature drawn from [10, 15].
x_data = range(60)
y_data = [random.uniform(10, 15) for _ in x_data]

# Create the canvas; figsize sets the size and dpi the sharpness.
plt.figure(figsize=(20, 8), dpi=200)

# Plot the line. `label` is the text plt.legend() will display.
plt.plot(x_data, y_data, color='blue', linestyle='-', label='Legend Info')

# Axis ticks: every 5th x value, and every 2nd integer from 10 to 18.
plt.xticks(x_data[::5])
plt.yticks(range(10, 20, 2))

# Axis labels. The x axis carries the date index and the y axis the
# temperature (the two labels used to be swapped).
plt.xlabel("Date", fontsize=15)
plt.ylabel("Temp", fontsize=15)

# Grid: grid(visible, linestyle, alpha), where alpha is the line opacity.
plt.grid(True, linestyle='--', alpha=0.5)

# Chart title.
plt.title("Date of Temp", fontsize=20)

# Legend (only works because `label` was passed to plt.plot above).
plt.legend()

# Save the picture. This must happen before plt.show(), because the
# figure resources are released once the figure has been shown.
plt.savefig("D://Desktop//test.png")

# Display the figure.
plt.show()

在这里插入图片描述

# Supplementary notes
#
# [Line style] `linestyle` accepts:
#   '-'        solid line
#   '--'       dashed line
#   '-.'       dash-dot line
#   ':'        dotted line
#   ' ' or ''  no line at all; points only show up if `marker` is also set
#
# [Line colour] `color` accepts a common colour name such as 'green', a hex
# string such as '#00DD6E', or an RGB tuple whose components are floats in
# the range 0-1, e.g. color='blue' or color=(0, 0.87, 0.43).
#
# [Legend] plt.legend() accepts loc=...; with the default (loc=0, i.e.
# 'best') Matplotlib picks the most suitable position itself.
import matplotlib.pyplot as plt

# Sample data
x = [1, 2, 3, 4, 5]
y = [1, 4, 9, 16, 25]

# Draw the same curve once per line style, shifting each copy upwards so
# the lines do not overlap.
plt.plot(x, y, linestyle='-', label='Solid line')
plt.plot(x, [v + 2 for v in y], linestyle='--', label='Dashed line')
plt.plot(x, [v + 4 for v in y], linestyle='-.', label='Dash-dot line')
plt.plot(x, [v + 6 for v in y], linestyle=':', label='Dotted line')
# No connecting line here, so a marker is required to see the points.
plt.plot(x, [v + 8 for v in y], linestyle=' ', marker='o', label='No line, only markers')

plt.legend()
plt.show()

在这里插入图片描述

多图表绘制

在单坐标轴下创建多图表

# Several curves on one set of axes: create the canvas once, then call
# plt.plot (or any other drawing function) once per curve.
import matplotlib.pyplot as plt

x = range(10)
y1 = [2 * value for value in x]
y2 = [3 * value for value in x]

plt.figure()
for series, caption in ((y1, 'x * 2'), (y2, 'x * 3')):
    plt.plot(x, series, label=caption)
plt.legend()
plt.show()

在这里插入图片描述

在多坐标轴下创建多图表

# Same workflow as the single-chart example, but every chart lives on its
# own Axes object, so the plt.xxx() calls become ax.set_xxx() methods.
import matplotlib.pyplot as plt
import random

# Prepare the data: two independent random series drawn from [15, 25].
x = range(30)
y1 = [random.uniform(15, 25) for _ in x]
y2 = [random.uniform(15, 25) for _ in x]

# Create the canvas: `fig` is the figure object and `axes` holds one Axes
# object per column.
fig, axes = plt.subplots(ncols=2, nrows=1)

# Both charts are configured identically; only the data, the legend label
# and the title differ, so they are set up in one loop.
chart_specs = (
    (y1, 'Data 1', "Date of Temp  1"),
    (y2, 'Data 2', "Date of Temp  2"),
)
for ax, (series, legend_label, title) in zip(axes, chart_specs):
    # Draw the curve
    ax.plot(x, series, label=legend_label)

    # Axis ticks: every 5th x value, every 2nd integer from 15 to 23
    ax.set_xticks(x[::5])
    ax.set_yticks(range(15, 25, 2))

    # Axis labels: x carries the date index, y the temperature (the two
    # labels used to be swapped)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Temp", fontsize=15)

    # Grid, title and legend (the legend needs the `label` given to plot)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.set_title(title, fontsize=15)
    ax.legend()

# Save the picture, then display it.
plt.savefig("D://Desktop//test.png")
plt.show()

在这里插入图片描述

其他图形绘制

'''
# 散点图
# 一般用于查看数据分布规律
plt.scatter(x,y) # 数据1，数据2

# 柱状图
# 用于统计数据与对比
plt.bar(x,height,width,align) # 数据，柱状高度，柱宽，柱间对齐方式center默认

# 直方图
plt.hist(x,bins)  # 数据，划分区间数

# 饼图
plt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色
'''

'\n# 散点图\n# 一般用于查看数据分布规律\nplt.scatter(x,y) # 数据1，数据2\n\n# 柱状图\n# 用于统计数据与对比\nplt.bar(x,height,width,align) # 数据，柱状高度，柱宽，柱间对齐方式center默认\n\n# 直方图\nplt.hist(x,bins)  # 数据，划分区间数\n\n# 饼图\nplt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色\n'

# Scatter-plot demo
import random

# Data: 100 points whose coordinates are each drawn from [1, 200]
POINT_COUNT = 100
x = [random.uniform(1, 200) for _ in range(POINT_COUNT)]
y = [random.uniform(1, 200) for _ in range(POINT_COUNT)]

plt.figure()        # new canvas
plt.scatter(x, y)   # one marker per (x, y) pair
plt.show()          # display the figure

在这里插入图片描述

# Bar-chart demo
# Use a font that contains Chinese glyphs, and keep the minus sign
# rendering correctly with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Data: one score per weapon (dicts keep insertion order, so x and y line up)
scores = {'M4A1-雷神': 67, 'M4A1-星象': 89, 'M4A1-仲达': 86, 'AK47-奉先': 80}
x = list(scores.keys())
y = list(scores.values())
bar_colors = ['blue', 'purple', 'green', 'red']

# Canvas and bars
plt.figure()
plt.bar(x, y, width=0.5, color=bar_colors)

# Title and a faint grid
plt.title("部分武器得分榜")
plt.grid(alpha=0.2)

# Display the figure
plt.show()

在这里插入图片描述

更多绘制方法，详见网址：https://matplotlib.org/index.html

Numpy

import numpy as np

ndarray

[注意] ndarray中所有元素为同一数据类型

创建

# 1. Create an ndarray from existing data
# 1.1 np.array(object): `object` may be a one- or multi-dimensional list
# literal or a variable holding one. np.array copies its input by default,
# so later changes to the source do not show up in the new ndarray.
myArray = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
myNDArray = np.array(myArray)
print("使用np.array创建")
print(myNDArray)
print("--------------------------")

# 1.2 np.asarray(object): accepts the same inputs as np.array. The copy is
# skipped only when the input is already an ndarray of the requested dtype;
# in that case the result shares its data with the input. myArray is a
# Python list, so a new array is still built here.
myNDArray = np.asarray(myArray)
print("使用np.asarray创建")
print(myNDArray)
print("--------------------------")


# 2. Create arrays with constructor functions
# 2.1 np.zeros(shape): array filled with 0
myNDArray = np.zeros((2, 3))
print("使用np.zeros((2,3))创建")
print(myNDArray)
print("--------------------------")

# 2.2 np.ones(shape): array filled with 1
myNDArray = np.ones((2, 3))
print("使用np.ones((2,3))创建")
print(myNDArray)
print("--------------------------")

# 2.3 np.eye(n): n x n identity matrix
myNDArray = np.eye(3)
print("使用np.eye(3)创建")
print(myNDArray)
print("--------------------------")

# 2.4 np.diag(data): square matrix with `data` on the main diagonal
myNDArray = np.diag([1, 2, 3, 4])
print("使用np.diag([1,2,3,4])创建")
print(myNDArray)
print("--------------------------")

# 2.5 np.arange(start, stop, step): 1-D array over [start, stop) with the
# given step
myNDArray = np.arange(1, 11, 1)
print("使用np.arange(1,11,1)创建")
print(myNDArray)
print("--------------------------")

# 2.6 np.linspace(start, stop, num, endpoint=True): `num` evenly spaced
# values (arithmetic progression); `stop` is included by default
myNDArray = np.linspace(1, 11, 5)
print("使用np.linspace(1,11,5)创建")
print(myNDArray)
print("--------------------------")

# 2.7 np.logspace(start, stop, num, endpoint=True, base=10): `num` values
# from base**start to base**stop forming a geometric progression; `stop`
# is included by default and base defaults to 10
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")


# 2.8 The np.random module
# 2.8.1 np.random.random(size): 1-D array of `size` random floats in [0, 1)
myNDArray = np.random.random(10)
print("使用np.random.random(10)创建")
print(myNDArray)
print("--------------------------")

# 2.8.2 np.random.normal(loc, scale, size): `size` samples from a normal
# distribution with mean `loc` and standard deviation `scale`
myNDArray = np.random.normal(1.75, 1, 100)
print("使用 np.random.normal(1.75,1,100)创建")
print(myNDArray)
print("--------------------------")
# Show the normal sample created above as a 10-bin histogram
import matplotlib.pyplot as plt
plt.figure()
plt.hist(myNDArray, 10)
plt.show()

# 2.8.3 np.random.uniform(low, high, size): samples drawn uniformly from
# [low, high). NumPy documents the result as undefined when high < low, so
# the bounds are passed in ascending order.
myNDArray = np.random.uniform(1, 1.10, (20, 30))
print("使用 np.random.uniform(1,1.10,(20,30))创建")
print(myNDArray)
print("--------------------------")
# Show the sample as a histogram. The array is flattened first because
# plt.hist treats every column of a 2-D array as a separate data set.
plt.figure()
plt.hist(myNDArray.ravel(), 10)
plt.show()

# 2.8.4 np.random.randint(a, b, size=(), dtype=int): ndarray of the given
# size holding random integers from [a, b); values may repeat

# 2.8.5 np.random.randn(d0, d1, ..., dn): samples from the standard normal
# distribution (mean 0, standard deviation 1); di is the length of axis i
# print(np.random.randn(2,3))

使用np.array创建
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
--------------------------
使用np.asarray创建
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
--------------------------
使用np.zeros((2,3))创建
[[0. 0. 0.]
 [0. 0. 0.]]
--------------------------
使用np.ones((2,3))创建
[[1. 1. 1.]
 [1. 1. 1.]]
--------------------------
使用np.zeros(2,3)创建
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
--------------------------
使用np.diag([1,2,3,4])创建
[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]
--------------------------
使用np.arange(1,11,1))创建
[ 1  2  3  4  5  6  7  8  9 10]
--------------------------
使用np.linspace(1,11,5)创建
[ 1.   3.5  6.   8.5 11. ]
--------------------------
使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
使用np.np.random.random(10)创建
[0.98694144 0.76442002 0.68214597 0.14857849 0.77693909 0.80805074
 0.21922838 0.18226575 0.26151965 0.15434984]
--------------------------
使用 np.random.normal(1.75,1,100)创建
[ 1.67505971  0.40113656  2.48161824 -0.01845878  0.90496477  2.69802984
  2.94877678  0.11782976  2.07285894  3.16597878  3.31993686  1.27704351
  1.9537056   0.84782021  2.40737044  0.2981676   0.54453057  3.48002726
  2.29480154  0.98207877  1.80277311  2.14887504  1.3656154   1.60483568
  2.67246384  2.28251879  1.90566007  2.68032271  0.68770623  1.65787999
  2.59625459  1.7460682  -0.39109142  3.1986754   2.02785007  1.76944876
  2.45046819  1.31492081  2.62908256  2.2628327   1.43874272  1.31190384
  2.30522717 -0.15768705  2.30893341  3.31765409  2.23322137  2.07029301
  1.3442506   1.4111489   3.80380232  0.42765345  0.96485353 -0.28374991
  3.73684752  1.378377    0.37245836  0.54526902  2.16979223  0.23042717
  2.29261266  0.63283867 -0.4741276   1.26395204  1.42796786 -0.42374254
  3.60312459  1.79117582  2.92906889  1.48240636  2.36196976  2.28228635
  2.43635078 -0.38269342  2.09099631  1.86993982  0.74433223  1.20943986
  1.55256999  1.46762698  0.83533645  1.33022822  1.05657318  1.77632771
  0.43736867  1.09809372  0.47584479  1.99188425  3.88644217  2.40961493
  1.06239141  2.79239086  2.09479233  1.01569475  2.71373938  1.75166344
  2.65374284  2.19328214  1.4190753   1.90988844]
--------------------------

在这里插入图片描述

使用 np.random.uniform(1.10,1,(2,3))创建
[[1.0646373  1.04106273 1.0493372  1.01323874 1.02643403 1.08756943
  1.02212617 1.06738376 1.02425757 1.03069668 1.07832839 1.06473457
  1.07000729 1.07619312 1.02304455 1.05318189 1.02412238 1.01569068
  1.00526404 1.0782949  1.04618623 1.06822866 1.00146653 1.01051038
  1.00344965 1.01393925 1.05490811 1.00784    1.05518212 1.04311717]
 [1.0161972  1.03336977 1.08515415 1.00538872 1.0048099  1.01892535
  1.02187729 1.07759793 1.0044576  1.05871996 1.09279977 1.01613777
  1.04812817 1.07807016 1.08747444 1.03193286 1.09220697 1.03715259
  1.08607635 1.09940974 1.0531829  1.06049616 1.02344541 1.02135377
  1.04657634 1.05591031 1.07599861 1.03680375 1.09862066 1.04289038]
 [1.02593125 1.00965726 1.06444982 1.02401025 1.07185022 1.05705167
  1.09746228 1.04192553 1.0069116  1.05229566 1.05951107 1.02913307
  1.04204475 1.08807504 1.05549678 1.00606048 1.09828326 1.06536738
  1.01245207 1.01919512 1.05593884 1.09891337 1.01178349 1.06798646
  1.06798943 1.03214976 1.03997211 1.04422143 1.05580566 1.00940351]
 [1.08527616 1.0407684  1.01089278 1.00474172 1.08707804 1.01449114
  1.074704   1.09791494 1.07646629 1.03379115 1.03625898 1.06829178
  1.03051412 1.04159155 1.02251988 1.08115572 1.08276213 1.09643571
  1.0586965  1.0872508  1.04649539 1.08920002 1.00349078 1.07687868
  1.01710053 1.08420061 1.08024906 1.09516048 1.09877514 1.04388992]
 [1.0612709  1.05243644 1.08522038 1.00013121 1.03534703 1.08780921
  1.0193435  1.00894882 1.00302732 1.01461643 1.0072089  1.03730753
  1.05381665 1.09445759 1.04131616 1.06119876 1.04852764 1.00175366
  1.08286298 1.00279791 1.05527854 1.0361621  1.09378048 1.03111487
  1.00669541 1.09474825 1.01639442 1.09015377 1.02394172 1.090647  ]
 [1.00015613 1.03282398 1.03716522 1.02517442 1.07579854 1.0482251
  1.06349351 1.01553608 1.03999816 1.04254074 1.06792544 1.05475944
  1.00114976 1.01739353 1.00824129 1.00200623 1.04649481 1.0051243
  1.0740545  1.05106591 1.00720557 1.00800414 1.02753216 1.09822785
  1.05727602 1.03426387 1.04683858 1.06691865 1.01377834 1.00705324]
 [1.0067312  1.07247999 1.09593779 1.01641581 1.00291297 1.0114776
  1.00105711 1.07605113 1.07579346 1.03026725 1.05393696 1.09963711
  1.05859099 1.09947237 1.01763998 1.03029485 1.0197936  1.00983296
  1.01802012 1.03844166 1.03516277 1.05402403 1.0927023  1.05876193
  1.087854   1.02137064 1.07775551 1.04664903 1.04622491 1.00956262]
 [1.02989735 1.08207482 1.04191933 1.09785752 1.05924577 1.09635408
  1.033684   1.09391279 1.01870238 1.05519453 1.01011376 1.01483874
  1.08141842 1.0880821  1.08154995 1.05923363 1.05106973 1.07919393
  1.01963205 1.05369732 1.07702226 1.01450227 1.03855182 1.08305393
  1.0236216  1.050381   1.0292057  1.01331392 1.08992239 1.00374043]
 [1.05060161 1.08649405 1.07435916 1.01101315 1.06029571 1.0613883
  1.070457   1.01786738 1.02438536 1.07860248 1.04866478 1.01978034
  1.08169093 1.03963785 1.02493817 1.0066989  1.09030658 1.05992054
  1.04863499 1.04604901 1.07653593 1.08996868 1.03754703 1.04983558
  1.01738205 1.02245851 1.08774079 1.03978456 1.06811202 1.00394928]
 [1.03458782 1.08135183 1.01569664 1.03430978 1.04429564 1.01601518
  1.01097922 1.01675885 1.09723926 1.04780064 1.07696447 1.08148946
  1.00054562 1.08183679 1.05303976 1.00145098 1.03689606 1.07795835
  1.029287   1.06312322 1.0250803  1.00676038 1.0140419  1.04726964
  1.0274964  1.02649829 1.00096425 1.01156186 1.01754458 1.05531346]
 [1.09650902 1.0501937  1.06202465 1.00478206 1.00743854 1.0153268
  1.03781729 1.08975352 1.04737951 1.05132598 1.08042364 1.02959953
  1.09620665 1.07454291 1.03555717 1.00811896 1.07945615 1.04025569
  1.03182324 1.02389092 1.05788885 1.07156594 1.08126521 1.01094597
  1.05453652 1.03866728 1.02859898 1.07011385 1.0639741  1.04873718]
 [1.01289679 1.06088697 1.01836315 1.09152328 1.04616927 1.05753735
  1.0534251  1.06095748 1.03362683 1.09954528 1.04185414 1.07635396
  1.03060254 1.05683366 1.06240245 1.09820495 1.04275747 1.03383959
  1.04278926 1.05108132 1.07775571 1.07758766 1.06246876 1.04136862
  1.0263422  1.09159908 1.02655434 1.04284162 1.09320124 1.02140362]
 [1.03076926 1.08489914 1.05564373 1.05989604 1.0298435  1.07130194
  1.08288339 1.04831859 1.06030481 1.09380775 1.06243543 1.02529793
  1.0719787  1.036513   1.07617001 1.06622073 1.09859239 1.0777177
  1.09110076 1.03473311 1.08305644 1.00216531 1.04208478 1.07066882
  1.09204254 1.02274071 1.06660437 1.05474587 1.06650928 1.02337433]
 [1.06936665 1.09303364 1.00260744 1.05244023 1.03261219 1.04621298
  1.01556954 1.01877958 1.01521382 1.09869627 1.08669236 1.07016579
  1.0952169  1.08934902 1.06328953 1.02382227 1.01876839 1.0009345
  1.027934   1.09276243 1.04391487 1.09443389 1.01542218 1.0921251
  1.08244328 1.01936138 1.00763194 1.08467298 1.02930919 1.01444137]
 [1.03241116 1.00601585 1.06875779 1.08777646 1.00891861 1.02913552
  1.06298963 1.05072194 1.06741635 1.03932451 1.04495906 1.0447338
  1.04151223 1.06903579 1.0628366  1.08174608 1.03152726 1.06632495
  1.09201743 1.03176641 1.032397   1.02919504 1.08260514 1.08569347
  1.06081318 1.0317379  1.05953802 1.02522307 1.04075903 1.00608679]
 [1.05553041 1.02101534 1.05290095 1.0641407  1.06111128 1.08564706
  1.04200773 1.0997679  1.04809944 1.00981487 1.04163062 1.07267272
  1.05242328 1.09129901 1.08763547 1.0178737  1.04900177 1.02927875
  1.03146633 1.04696562 1.03558471 1.05745922 1.07103769 1.03829319
  1.02809598 1.08237567 1.0357102  1.03505655 1.03042399 1.09207557]
 [1.0611547  1.02720924 1.0083003  1.01197453 1.06908812 1.02726194
  1.05819931 1.09970103 1.08127893 1.05669232 1.01912968 1.09735505
  1.02897018 1.03846417 1.0599642  1.01318714 1.00821409 1.07682408
  1.06714662 1.02513284 1.07503411 1.07189177 1.0973542  1.03331157
  1.04732424 1.09255911 1.04441842 1.08784725 1.09638214 1.08449857]
 [1.05152983 1.00141425 1.0890675  1.07067413 1.0472271  1.0992028
  1.07063033 1.02805586 1.01499495 1.00119302 1.02332388 1.06995337
  1.02036626 1.08210417 1.04553065 1.02409501 1.04450532 1.00192104
  1.00793566 1.0414471  1.02430948 1.07021291 1.06904127 1.03358178
  1.04097847 1.05477634 1.09583159 1.06618711 1.02113397 1.07613749]
 [1.09239319 1.05635017 1.07690173 1.02985581 1.00288046 1.0082043
  1.09042651 1.06242188 1.01469005 1.03368665 1.0700558  1.03243217
  1.09059841 1.09951115 1.03634514 1.06982106 1.06986148 1.06205987
  1.00002476 1.04247229 1.0447468  1.02022938 1.07420861 1.06889111
  1.03558028 1.05827583 1.01506662 1.0296531  1.02743054 1.05307535]
 [1.02543541 1.0190161  1.08180094 1.02548265 1.01674936 1.05028411
  1.00746477 1.04199418 1.0908696  1.09962999 1.06596327 1.05225282
  1.00740661 1.01910418 1.07142337 1.08127211 1.06233714 1.05716245
  1.01916161 1.0900991  1.00976151 1.00663847 1.00053414 1.02582093
  1.08658318 1.00868502 1.01652466 1.00617985 1.08167276 1.02615394]]
--------------------------

在这里插入图片描述

属性

# Prepare the data: 10 values from 10**0 to 10**2 in geometric progression
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")

# Inspecting ndarray attributes
# 1. ndarray.ndim: number of dimensions
print("维度：\t\t",myNDArray.ndim)

# 2. ndarray.shape: shape of the array, as a tuple
print("形状：\t\t",myNDArray.shape)

# 3. ndarray.size: total number of elements
print("元素个数：\t",myNDArray.size)

# 4. ndarray.dtype: data type of the elements
print("元素数据类型：\t",myNDArray.dtype)

# 5. ndarray.itemsize: storage used by one element, in bytes
print("元素存储空间：\t",myNDArray.itemsize)

# 6. ndarray.T: the transposed array (for this 1-D array the transpose
# has the same shape as the original)
print("转置数组：\t\n",myNDArray.T)

使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
维度：		 1
形状：		 (10,)
元素个数：	 10
元素数据类型：	 float64
元素存储空间：	 8
转置数组：	
 [  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]

形状修改

# Compare three ways of changing an array's shape: reshape(), resize(), .T
myNDArray = np.logspace(0, 2, 10)
print("使用np.logspace(0,2,10)创建")
print(myNDArray)

print("--------------------------")
print("使用 ndarray.reshape()")
print("修改后的多维数组")
# -1 means "unknown": NumPy works this dimension out from the array size
print(myNDArray.reshape([-1, 2]))
print("修改后的原始数组")
print(myNDArray)
print("【结论】reshape不修改原始数据 且执行函数后返回修改的值")

print("--------------------------")
print("使用 ndarray.resize()")
print("修改后的多维数组")
# resize() works in place and returns None, which is what gets printed
# here; unlike reshape() it does not accept -1
print(myNDArray.resize([5, 2]))
print("修改后的原始数组")
print(myNDArray)
print("【结论】resize修改原始数据 且执行函数后返回None")

print("--------------------------")
print("使用 ndarray.T")
print("修改前")
print(myNDArray)
print("修改后的多维数组")
# .T is an attribute that yields the transpose; myNDArray keeps its shape
print(myNDArray.T)
print("修改后的原始数组")
print(myNDArray)
print("【结论】T属性不修改原始数据 且执行函数后返回结果")

使用np.logspace(0,2,105)创建
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
--------------------------
使用 ndarray.reshape()
修改后的多维数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
修改后的原始数组
[  1.           1.66810054   2.7825594    4.64158883   7.74263683
  12.91549665  21.5443469   35.93813664  59.94842503 100.        ]
【结论】reshape不修改原始数据 且执行函数后返回修改的值
--------------------------
使用 ndarray.resize()
修改后的多维数组
None
修改后的原始数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
【结论】resize修改原始数据 且执行函数后返回None
--------------------------
使用 ndarray.T
修改前
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
修改后的多维数组
[[  1.           2.7825594    7.74263683  21.5443469   59.94842503]
 [  1.66810054   4.64158883  12.91549665  35.93813664 100.        ]]
修改后的原始数组
[[  1.           1.66810054]
 [  2.7825594    4.64158883]
 [  7.74263683  12.91549665]
 [ 21.5443469   35.93813664]
 [ 59.94842503 100.        ]]
【结论】T属性不修改原始数据 且执行函数后返回结果

索引与切片

# Indexing
# An N-dimensional array takes one index per axis, outermost axis first:
# ndarray[i_axis0, i_axis1, ..., i_last_axis]
pointArray = np.random.uniform(10, 20, (3, 4, 5))
print(pointArray)
# Fetch the last element with a single multi-axis index
print(pointArray[2, 3, 4])

[[[16.64716349 18.17656348 13.04133624 16.55571974 11.32178617]
  [17.79890946 16.34938001 13.85174534 16.56768269 11.5153287 ]
  [19.45768797 12.15795393 17.32689244 10.38788172 10.78220753]
  [16.2920733  18.05794596 12.96980799 18.02098106 13.50622008]]

 [[11.95759017 14.98805551 19.64466193 13.59526949 19.53866727]
  [11.15464832 15.46968762 12.76440722 12.41821119 16.24981854]
  [11.44377375 17.23954935 19.12779827 11.0169205  17.4642939 ]
  [11.38382855 19.36749907 13.84876092 12.8668164  10.55189115]]

 [[17.5450068  14.95714515 18.77974534 18.32744931 18.16252158]
  [19.16806758 18.05432418 16.60055023 11.384959   15.57648868]
  [18.85300183 16.99346902 17.08702811 12.82609623 16.3777613 ]
  [11.27290271 17.1267105  11.09846292 16.73752603 19.22497525]]]
19.224975245576598

运算

# 1. Element-wise comparison and assignment through a boolean mask
# 1.1 A comparison yields a boolean array with the same shape
nda = np.arange(1, 11, 2)
print(nda)
print(nda > 5)

# 1.2 Assigning through the mask overwrites only the matching elements
nda[nda > 5] = 99
print(nda)

# 2. Generic predicates
above_four = nda > 4
# 2.1 np.any(): True if at least one element is True
print(np.any(above_four))
# 2.2 np.all(): True only if every element is True
print(np.all(above_four))

# 3. np.where as a vectorised ternary:
# np.where(condition, value_if_true, value_if_false)
print(np.where(nda > 10, 1, 0))
# Compound conditions: & is element-wise AND, | is element-wise OR
print(np.where((nda > 0) & (nda < 100), 1, 0))
print(np.where((nda > 0) | (nda < -1), 1, 0))

# 4. Statistics functions
# min max sum mean std var cumsum cumprod argmin argmax

# 5. Broadcasting
# Arrays of different shapes can be combined when their shapes are
# compatible: the shapes are compared axis by axis starting from the last
# (trailing) axis, and two axis lengths are compatible when they are equal
# or one of them is 1. If every compared axis is compatible, NumPy stretches
# the smaller array to match, without making needless copies of the data.

[1 3 5 7 9]
[False False False  True  True]
[ 1  3  5 99 99]
True
False
[0 0 0 1 1]
[1 1 1 1 1]
[1 1 1 1 1]

matrix

import numpy as np
# A matrix is a specialised two-dimensional array, so it offers the methods
# and attributes of ndarray as well.
# NOTE: the np.mat alias was removed in NumPy 2.0; np.asmatrix is the same
# function under its remaining name and also exists in older releases.


# 1. Creating matrices
# 1.1 From literal data: a string ("," separates columns, ";" separates
# rows) or a nested list
matrix1 = np.asmatrix('1,2,3;4,5,6;7,8,9')
matrix2 = np.matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix1, matrix2)

# 1.2 From a variable that holds a nested list or an ndarray
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix3 = np.asmatrix(array)
matrix4 = np.matrix(array)
print(matrix3, matrix4)
ndarray = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix5 = np.asmatrix(ndarray)
matrix6 = np.matrix(ndarray)


# 2. Matrix addition
# The operands must be matrices of the same shape, or a matrix and a scalar
print("Matrix加法运算示例")
print(matrix4 + 2)
print(matrix3 + matrix4)


# 3. Matrix multiplication (shapes must satisfy (m, n) * (n, l) -> (m, l))
# matrix1 * matrix2           the * operator multiplies np.matrix objects as
#                             matrices; with a scalar it scales every element
# np.dot(object1, object2)    also accepts a scalar operand and gives the
#                             same result as *
# np.matmul(matrix1, matrix2) matrix-by-matrix only; scalars are rejected
print("Matrix乘法运算示例")
print(matrix5 * 2)
print(np.dot(matrix5, 5))
print(np.matmul(matrix5, matrix6))
# 4. Attributes specific to np.matrix
# T (transpose), H (conjugate transpose), I (inverse), A (as a 2-D ndarray)

[[1 2 3]
 [4 5 6]
 [7 8 9]] [[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2 3]
 [4 5 6]
 [7 8 9]] [[1 2 3]
 [4 5 6]
 [7 8 9]]
Matrix加法运算示例
[[ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]
Matrix乘法运算示例
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]
[[ 5 10 15]
 [20 25 30]
 [35 40 45]]
[[ 30  36  42]
 [ 66  81  96]
 [102 126 150]]

Pandas

介绍

Pandas基于：

Numpy
Matplotlib

其主要应用领域为数据挖掘
具有独特的数据结构：

Series
DataFrame
So on

具有的独特优势：

更好的图表可读性
更强的数据处理能力
方便的文件操作

Series

创建

import pandas as pd
import numpy as np
# pd.Series(data) builds a Series from a list or an ndarray
# (note the capital S in Series).
divider = "------------------------------------------"

# No index given: the default index is used
s1 = pd.Series(np.arange(10))
print(s1)

# Explicit index labels
print(divider)
s2 = pd.Series(np.arange(5), index=list('abcde'))
print(s2)

# From a dict: the keys become the index labels
print(divider)
s3 = pd.Series(dict(red=1, green=2, blue=3))
print(s3)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
------------------------------------------
a    0
b    1
c    2
d    3
e    4
dtype: int32
------------------------------------------
red      1
green    2
blue     3
dtype: int64

属性

'''
index,values属性
'''

import pandas as pd
# Build a small Series, then read its two core attributes:
# .values (the data) and .index (the labels).
colour_codes = dict(red=1, green=2, blue=3)
s3 = pd.Series(colour_codes)
print(s3)
print("------------------------------------------")
series_values = s3.values
series_index = s3.index
print("s3的值为：", series_values)
print("s3的索引为：", series_index)

red      1
green    2
blue     3
dtype: int64
------------------------------------------
s3的值为： [1 2 3]
s3的索引为： Index(['red', 'green', 'blue'], dtype='object')

DataFrame

创建

import pandas as pd
import numpy as np

# Default row and column labels
df1 = pd.DataFrame(np.random.randn(2, 3))
print(df1)

# Explicit row and column labels
score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print(df2)

# Changing the row labels: the index can only be replaced as a whole.
# Assigning to a single position (df2.index[0] = xxx) is not allowed.
df2.index = ["STU_No." + str(i) for i in range(1, 11)]
print(df2)

# reset_index(drop=True) throws the current labels away (drop=False would
# keep them as a column). It returns a NEW frame and leaves df2 untouched,
# so the result has to be captured to be seen.
df2_reset = df2.reset_index(drop=True)
print(df2_reset)

# set_index(column) makes a column the index. It also returns a new frame.
df2_by_chinese = df2.set_index('Chinese')
print(df2_by_chinese)

          0         1         2
0 -0.157809  0.320192  0.231431
1 -0.761845  0.638310  0.539367
          Chinese  Math  English  Physical  PE
STU_No1        59    58       83        52  74
STU_No2        86    97       78        56  55
STU_No3        57    75       75        43  96
STU_No4        91    81       78        55  96
STU_No5        72    82       54        68  40
STU_No6        75    64       96        91  41
STU_No7        42    50       92        91  47
STU_No8        64    51       67        98  71
STU_No9        95    49       68        45  81
STU_No10       84    56       47        45  43
           Chinese  Math  English  Physical  PE
STU_No.1        59    58       83        52  74
STU_No.2        86    97       78        56  55
STU_No.3        57    75       75        43  96
STU_No.4        91    81       78        55  96
STU_No.5        72    82       54        68  40
STU_No.6        75    64       96        91  41
STU_No.7        42    50       92        91  47
STU_No.8        64    51       67        98  71
STU_No.9        95    49       68        45  81
STU_No.10       84    56       47        45  43
           Chinese  Math  English  Physical  PE
STU_No.1        59    58       83        52  74
STU_No.2        86    97       78        56  55
STU_No.3        57    75       75        43  96
STU_No.4        91    81       78        55  96
STU_No.5        72    82       54        68  40
STU_No.6        75    64       96        91  41
STU_No.7        42    50       92        91  47
STU_No.8        64    51       67        98  71
STU_No.9        95    49       68        45  81
STU_No.10       84    56       47        45  43

	Math	English	Physical	PE
Chinese
59	58	83	52	74
86	97	78	56	55
57	75	75	43	96
91	81	78	55	96
72	82	54	68	40
75	64	96	91	41
42	50	92	91	47
64	51	67	98	71
95	49	68	45	81
84	56	47	45	43

属性

# Prepare the data: random scores for 10 students in 5 subjects
import pandas as pd
import numpy as np  # was imported as `mp`, while the code below uses `np`
score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)

# shape: the shape of the DataFrame
print("df2的形状：",df2.shape)

# index: the row labels
print("df2的行索引：\n",df2.index)

# columns: the column labels
print("df2的列索引：\n",df2.columns)

# values: the underlying data
print("df2的值：\n",df2.values)

# T: the transposed DataFrame
print("df2的转置：\n",df2.T)

# head(n=5): the first n rows
print("df2的前3行数据：\n",df2.head(3))

# tail(n=5): the last n rows
print("df2的后5行数据：\n",df2.tail())

df2的形状： (10, 5)
df2的行索引：
 Index(['STU_No1', 'STU_No2', 'STU_No3', 'STU_No4', 'STU_No5', 'STU_No6',
       'STU_No7', 'STU_No8', 'STU_No9', 'STU_No10'],
      dtype='object')
df2的列索引：
 Index(['Chinese', 'Math', 'English', 'Physical', 'PE'], dtype='object')
df2的值：
 [[64 77 53 67 70]
 [48 89 99 68 43]
 [63 57 53 59 91]
 [44 72 78 96 40]
 [44 88 64 82 79]
 [80 42 46 54 96]
 [97 52 89 68 58]
 [57 53 76 78 96]
 [86 57 46 76 75]
 [78 73 61 91 67]]
df2的转置：
           STU_No1  STU_No2  STU_No3  STU_No4  STU_No5  STU_No6  STU_No7  \
Chinese        64       48       63       44       44       80       97   
Math           77       89       57       72       88       42       52   
English        53       99       53       78       64       46       89   
Physical       67       68       59       96       82       54       68   
PE             70       43       91       40       79       96       58   

          STU_No8  STU_No9  STU_No10  
Chinese        57       86        78  
Math           53       57        73  
English        76       46        61  
Physical       78       76        91  
PE             96       75        67  
df2的前3行数据：
          Chinese  Math  English  Physical  PE
STU_No1       64    77       53        67  70
STU_No2       48    89       99        68  43
STU_No3       63    57       53        59  91
df2的后5行数据：
           Chinese  Math  English  Physical  PE
STU_No6        80    42       46        54  96
STU_No7        97    52       89        68  58
STU_No8        57    53       76        78  96
STU_No9        86    57       46        76  75
STU_No10       78    73       61        91  67

Pandas的基本数据操作

索引与切片

# Prepare the data: random scores for 10 students in 5 subjects
import pandas as pd
import numpy as np  # was imported as `mp`, while the code below uses `np`
score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print(df2)
print()

# 1. Direct label indexing: column first, then row.
# Row-first access and NumPy-style numeric access (array[1, 1]) are NOT
# supported this way.
print("使用直接索引")
print(df2['Chinese']['STU_No1'])
print()

# 2. .loc: label-based selection (rows, columns)
print("使用loc函数")
print(df2.loc["STU_No1":"STU_No5", "Chinese"])
print()

# 3. .iloc: position-based selection (rows, columns)
print("使用iloc函数")
print(df2.iloc[0:3, 0:2])
print()

# 4. .ix mixed positions and labels; it was deprecated in pandas 0.20 and
# later removed, so the example below is kept for reference only.
# print("使用ix函数")
# print(df2.ix[0:4,["Chinese","PE"]])

          Chinese  Math  English  Physical  PE
STU_No1        48    62       52        55  94
STU_No2        75    94       79        84  85
STU_No3        81    76       51        88  88
STU_No4        85    79       60        49  62
STU_No5        43    90       84        56  48
STU_No6        70    95       71        71  55
STU_No7        97    95       50        77  50
STU_No8        68    51       54        45  79
STU_No9        49    82       53        58  63
STU_No10       86    91       69        98  86

使用直接索引
48

使用loc函数
STU_No1    48
STU_No2    75
STU_No3    81
STU_No4    85
STU_No5    43
Name: Chinese, dtype: int32

使用iloc函数
         Chinese  Math
STU_No1       48    62
STU_No2       75    94
STU_No3       81    76

赋值与排序

# Prepare the data: random scores for 5 students in 5 subjects
import pandas as pd
import numpy as np  # was imported as `mp`, while the code below uses `np`
score = np.random.randint(40, 100, (5, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 6)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print("原始数据")
print(df2)
print()

# 1. Assignment
# Select the target elements with an indexer, then assign with `=`.
# Here every row of the column at position 4 (PE) is set to 100.
df2.iloc[0:5, 4] = 100
print("赋值后的数据")
print(df2,"\n")

# 2. Sorting
# 2.1 By value: df.sort_values(by=..., ascending=True)
#     by: the column(s) to sort on; ascending: sort order
print("多指标排序的结果")
print(df2.sort_values(by=['Chinese', 'Math'], ascending=True))
print()

# 2.2 By index label: df.sort_index(ascending=True)
print("索引排序结果")
print(df2.sort_index(ascending=False))

# 2.3 Note
# The same methods work on a Series; it is one-dimensional, so the `by`
# argument is not needed there.

原始数据
         Chinese  Math  English  Physical  PE
STU_No1       47    55       48        85  54
STU_No2       66    91       47        50  62
STU_No3       72    72       86        82  84
STU_No4       70    73       55        78  43
STU_No5       93    87       85        88  93

赋值后的数据
         Chinese  Math  English  Physical   PE
STU_No1       47    55       48        85  100
STU_No2       66    91       47        50  100
STU_No3       72    72       86        82  100
STU_No4       70    73       55        78  100
STU_No5       93    87       85        88  100 

多指标排序的结果
         Chinese  Math  English  Physical   PE
STU_No1       47    55       48        85  100
STU_No2       66    91       47        50  100
STU_No4       70    73       55        78  100
STU_No3       72    72       86        82  100
STU_No5       93    87       85        88  100

索引排序结果
         Chinese  Math  English  Physical   PE
STU_No5       93    87       85        88  100
STU_No4       70    73       55        78  100
STU_No3       72    72       86        82  100
STU_No2       66    91       47        50  100
STU_No1       47    55       48        85  100

算术、逻辑、统计运算

# [Arithmetic operations]
# Addition        .add(i)  or  + i
# Subtraction     .sub(i)  or  - i
# Multiplication  .mul(i)  or  * i
# Division        .div(i)

# Prepare the data: random scores for 5 students in 5 subjects
import pandas as pd
import numpy as np  # was imported as `mp`, while the code below uses `np`
score = np.random.randint(40, 100, (5, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 6)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
# Bare expression: a notebook displays the frame as the cell result
df2

	Chinese	Math	English	Physical	PE
STU_No1	61	86	66	65	57
STU_No2	99	97	57	63	64
STU_No3	49	89	46	84	63
STU_No4	76	81	86	52	65
STU_No5	61	88	70	51	84

# Add 10 to every value of the "Chinese" column; the result is a new Series.
df2["Chinese"].add(10)

STU_No1     71
STU_No2    109
STU_No3     59
STU_No4     86
STU_No5     71
Name: Chinese, dtype: int32

# Multiply every cell of the table by 10 (element-wise).
df2*10

	Chinese	Math	English	Physical	PE
STU_No1	610	860	660	650	570
STU_No2	990	970	570	630	640
STU_No3	490	890	460	840	630
STU_No4	760	810	860	520	650
STU_No5	610	880	700	510	840

# [Logical operations]
# Boolean indexing: combine element-wise comparisons with `&`. Each comparison
# is wrapped in its own parentheses because `&` binds tighter than `>` / `<`.
df2[(df2["Chinese"]>80) & (df2["Chinese"]<90)]

	Chinese	Math	English	Physical	PE

# query(): the same kind of row filter written as an expression string.
# Note the upper bound here is 95, not 90 as in the boolean-indexing example.
df2.query("Chinese>80 & Chinese<95")

	Chinese	Math	English	Physical	PE

# isin(): keep the rows whose "Chinese" value is one of the listed values.
df2[df2["Chinese"].isin([86,96])]

	Chinese	Math	English	Physical	PE

# [Statistical operations]
# 1. describe(): per-column summary (count, mean, std, min, quartiles, max).
df2.describe()

	Chinese	Math	English	Physical	PE
count	5.000000	5.00000	5.000000	5.000000	5.000000
mean	69.200000	88.20000	65.000000	63.000000	66.600000
std	19.214578	5.80517	14.933185	13.322913	10.212737
min	49.000000	81.00000	46.000000	51.000000	57.000000
25%	61.000000	86.00000	57.000000	52.000000	63.000000
50%	61.000000	88.00000	66.000000	63.000000	64.000000
75%	76.000000	89.00000	70.000000	65.000000	65.000000
max	99.000000	97.00000	86.000000	84.000000	84.000000

# 2. Individual statistics: min, max, mode, abs, median, sum, idxmax, idxmin
#    (pass the `axis` argument to choose the direction),
#    plus the cumulative functions cumsum, cumprod, cummax, cummin.

# 3. Custom computation
# apply(func, axis): func is a user-defined function, axis is the axis it runs along.
df2[["Chinese","Math"]]

	Chinese	Math
STU_No1	61	86
STU_No2	99	97
STU_No3	49	89
STU_No4	76	81
STU_No5	61	88

# Range (max - min) of each selected column; with axis=0 the lambda receives one column at a time.
df2[["Chinese","Math"]].apply(lambda x:x.max()-x.min(),axis=0)

Chinese    50
Math       16
dtype: int64

Pandas的绘图

# DataFrame_or_Series.plot(kind=...) -- `kind` selects the chart type:
# 'line', 'bar', 'barh' (horizontal bar), 'hist', 'pie', 'scatter'.

df2[["Chinese"]].plot(kind='line')

<Axes: >

在这里插入图片描述

Pandas的文件操作

CSV文件

import pandas as pd
# Read: pd.read_csv(file_path, sep=',', usecols=...)
#   file_path: path of the CSV file; sep: field delimiter; usecols: names of the columns to load.
data = pd.read_csv(
    "./data/参会与请假名单.csv",
    usecols=["序号", '年级', '姓名', '班级', '是否请假'],
)
data

	序号	年级	班级	姓名	是否请假
0	1	2020级	智管2001	徐顺明	是
1	2	2020级	智管2001	付芸	是
2	3	2020级	智管2001	晏程博	是
3	4	2020级	智管2001	李澎宣	是
4	5	2021级	智管2001	李雯	是
5	6	2021级	2021级电子信息专硕1班	李轩	是
6	7	2022级	2022级电子信息专硕1班	彭玉洁	是
7	8	2021级	2021级电子信息专硕1班	贾啸宇	是
8	9	2022级	2022级电子信息专硕1班	唐振瀚	是
9	10	2021级	2021级电子信息专硕1班	朱旭炜	是

# Write: DataFrame.to_csv(file_path, columns=..., mode='w', encoding=...)
# Load the full list, write only two of its columns (without the index) to a
# new file, then read that file back to show what was written.
source_path = "./data/参会与请假名单.csv"
target_path = "./data/simpleList.csv"

data = pd.read_csv(source_path, usecols=["序号", '年级', '姓名', '班级', '是否请假'])
data.to_csv(target_path, columns=['姓名', '是否请假'], index=False)
data = pd.read_csv(target_path)
data

	姓名	是否请假
0	徐顺明	是
1	付芸	是
2	晏程博	是
3	李澎宣	是
4	李雯	是
5	李轩	是
6	彭玉洁	是
7	贾啸宇	是
8	唐振瀚	是
9	朱旭炜	是

缺失值处理

处理思路

获取缺失值的标记方式（NaN 或者？等其他标记）
如果缺失值为NaN
- 判断数据中是否包含NaN
  - pd.isnull(df)
  - pd.notnull(df)
- 存在缺失值
  - 删除缺失值 dropna(axis)
    - 此方法不会修改原始数据，而是返回新对象
  - 替换缺失值 fillna(value,inplace = True)
    - value: 替换的值
    - inplace：是否修改原始数据
如果缺失值不为NaN
- 先替换缺失标记为NaN，然后按照上述方法执行

案例说明

import pandas as pd
# Load the IMDB movie data set and preview the first five rows.
data = pd.read_csv("./data/IMDB-Movie-Data.csv")
data.head()

	Rank	Title	Genre	Description	Director	Actors	Year	Runtime (Minutes)	Rating	Votes	Revenue (Millions)	Metascore
0	1	Guardians of the Galaxy	Action,Adventure,Sci-Fi	A group of intergalactic criminals are forced ...	James Gunn	Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...	2014	121	8.1	757074	333.13	76.0
1	2	Prometheus	Adventure,Mystery,Sci-Fi	Following clues to the origin of mankind, a te...	Ridley Scott	Noomi Rapace, Logan Marshall-Green, Michael Fa...	2012	124	7.0	485820	126.46	65.0
2	3	Split	Horror,Thriller	Three girls are kidnapped by a man with a diag...	M. Night Shyamalan	James McAvoy, Anya Taylor-Joy, Haley Lu Richar...	2016	117	7.3	157606	138.12	62.0
3	4	Sing	Animation,Comedy,Family	In a city of humanoid animals, a hustling thea...	Christophe Lourdelet	Matthew McConaughey,Reese Witherspoon, Seth Ma...	2016	108	7.2	60545	270.32	59.0
4	5	Suicide Squad	Action,Adventure,Fantasy	A secret government agency recruits some of th...	David Ayer	Will Smith, Jared Leto, Margot Robbie, Viola D...	2016	123	6.2	393727	325.02	40.0

import numpy as np
# Check for missing values: pd.isna marks each missing cell, np.any reports
# whether at least one cell is marked.
print("存在缺失值：",np.any(pd.isna(data)))

存在缺失值： True

# Missing-value handling, option 1: drop.
# dropna() returns a new object, so the result is assigned back to `data`.
data = data.dropna()
print("存在缺失值：",np.any(pd.isna(data)))

存在缺失值： False

# Missing-value handling, option 2: replace with fillna().
# Every column that contains missing values is filled with its own mean, so the
# columns are processed one by one. Only numeric columns have a mean; text
# columns (e.g. Title, Genre) are skipped instead of raising on .mean().
for i in data.select_dtypes(include="number").columns:
    if data[i].isna().any():
        # Assign the filled column back. Calling fillna(inplace=True) on
        # data[i] works on an intermediate object (chained assignment) and
        # may leave `data` itself unchanged.
        data[i] = data[i].fillna(data[i].mean())
data.head()

	Rank	Title	Genre	Description	Director	Actors	Year	Runtime (Minutes)	Rating	Votes	Revenue (Millions)	Metascore
0	1	Guardians of the Galaxy	Action,Adventure,Sci-Fi	A group of intergalactic criminals are forced ...	James Gunn	Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...	2014	121	8.1	757074	333.13	76.0
1	2	Prometheus	Adventure,Mystery,Sci-Fi	Following clues to the origin of mankind, a te...	Ridley Scott	Noomi Rapace, Logan Marshall-Green, Michael Fa...	2012	124	7.0	485820	126.46	65.0
2	3	Split	Horror,Thriller	Three girls are kidnapped by a man with a diag...	M. Night Shyamalan	James McAvoy, Anya Taylor-Joy, Haley Lu Richar...	2016	117	7.3	157606	138.12	62.0
3	4	Sing	Animation,Comedy,Family	In a city of humanoid animals, a hustling thea...	Christophe Lourdelet	Matthew McConaughey,Reese Witherspoon, Seth Ma...	2016	108	7.2	60545	270.32	59.0
4	5	Suicide Squad	Action,Adventure,Fantasy	A secret government agency recruits some of th...	David Ayer	Will Smith, Jared Leto, Margot Robbie, Viola D...	2016	123	6.2	393727	325.02	40.0

数据离散化

说明：
数据离散化是指在连续属性的值域上，将值域分为若干个离散的区间，并用不同的符号表示落在该离散区间内的连续属性

# Data preparation: load the daily stock data and preview the first five rows.
# NOTE(review): the preview shows the dates as the index although no index_col
# is passed -- presumably the header row has one field fewer than the data rows; verify.
data = pd.read_csv("./data/stock_day.csv")
data.head()

	open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58

# Keep only the p_change column for the demonstration; `data` is now a Series.
data = data["p_change"]
data.head()

2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
Name: p_change, dtype: float64

# Discretisation
# Automatic binning: pd.qcut(obj, q) -- obj is the data to bin, q the number of bins.
# The bin edges are chosen so that every bin receives roughly the same number of elements.
cut1 = pd.qcut(data,10)
cut1.value_counts()

p_change
(-10.030999999999999, -4.836]    65
(-0.462, 0.26]                   65
(0.26, 0.94]                     65
(5.27, 10.03]                    65
(-4.836, -2.444]                 64
(-2.444, -1.352]                 64
(-1.352, -0.462]                 64
(1.738, 2.938]                   64
(2.938, 5.27]                    64
(0.94, 1.738]                    63
Name: count, dtype: int64

# Discretisation
# Manual binning: pd.cut(obj, bins) -- obj is the data to bin, bins the list of bin edges.
bins = [-100,-75,-10.0,10,75,100]
cut2 = pd.cut(data,bins)
cut2.value_counts()

p_change
(-10.0, 10.0]      622
(-75.0, -10.0]      11
(10.0, 75.0]        10
(-100.0, -75.0]      0
(75.0, 100.0]        0
Name: count, dtype: int64

# Discretisation
# One-hot encoding: pd.get_dummies(data, prefix=...) creates one indicator
# column per category; dtype=int makes the indicators 0/1 integers.
dummies = pd.get_dummies(cut1,dtype=int)
dummies.head()

	(0.94, 1.738]	(1.738, 2.938]	(2.938, 5.27]
2018-02-27	0	1	0
2018-02-26	0	0	1
2018-02-23	0	1	0
2018-02-22	1	0	0
2018-02-14	0	1	0

数据表的合并

说明：
应用场景为模型需要的数据在多张表中，此时可以使用合并操作将多张表合为一张

# Show the data: the quantile-binned series produced by pd.qcut.
cut1

2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64, right]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]

# Show the data: the manually binned series produced by pd.cut.
cut2

2018-02-27    (-10.0, 10.0]
2018-02-26    (-10.0, 10.0]
2018-02-23    (-10.0, 10.0]
2018-02-22    (-10.0, 10.0]
2018-02-14    (-10.0, 10.0]
                  ...      
2015-03-06    (-10.0, 10.0]
2015-03-05    (-10.0, 10.0]
2015-03-04    (-10.0, 10.0]
2015-03-03    (-10.0, 10.0]
2015-03-02    (-10.0, 10.0]
Name: p_change, Length: 643, dtype: category
Categories (5, interval[float64, right]): [(-100.0, -75.0] < (-75.0, -10.0] < (-10.0, 10.0] < (10.0, 75.0] < (75.0, 100.0]]

# pd.concat([obj1, obj2, ...], axis=...) -- with axis=1 the objects are placed
# side by side as columns, aligned on their index.
pd.concat([cut1,cut2],axis=1)

	p_change	p_change
2018-02-27	(1.738, 2.938]	(-10.0, 10.0]
2018-02-26	(2.938, 5.27]	(-10.0, 10.0]
2018-02-23	(1.738, 2.938]	(-10.0, 10.0]
2018-02-22	(0.94, 1.738]	(-10.0, 10.0]
2018-02-14	(1.738, 2.938]	(-10.0, 10.0]
...	...	...
2015-03-06	(5.27, 10.03]	(-10.0, 10.0]
2015-03-05	(1.738, 2.938]	(-10.0, 10.0]
2015-03-04	(0.94, 1.738]	(-10.0, 10.0]
2015-03-03	(0.94, 1.738]	(-10.0, 10.0]
2015-03-02	(1.738, 2.938]	(-10.0, 10.0]

643 rows × 2 columns

# pd.merge(left, right, how, on)
# left, right : the two tables to join
# how         : join type (left, right, outer, inner), analogous to database joins
# on          : the shared key column(s) to join on
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A' + str(n) for n in range(4)],
    B=['B' + str(n) for n in range(4)],
))

right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0'] * 4,
    C=['C' + str(n) for n in range(4)],
    D=['D' + str(n) for n in range(4)],
))

left

	key1	key2	A	B
0	K0	K0	A0	B0
1	K0	K1	A1	B1
2	K1	K0	A2	B2
3	K2	K1	A3	B3

# Show the right-hand table of the join.
right

	key1	key2	C	D
0	K0	K0	C0	D0
1	K1	K0	C1	D1
2	K1	K0	C2	D2
3	K2	K0	C3	D3

# Join on both key columns; `how` is not given, so pandas uses its default inner join
# and only key pairs present in both tables are kept.
pd.merge(left, right, on=["key1", "key2"])

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

交叉表与透视表

# Cross table: pd.crosstab(series1, series2) shows the relationship between two columns.
# Data preparation: reload the full daily stock table.
data = pd.read_csv("./data/stock_day.csv")
data.head()
data

	open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2015-03-06	13.17	14.48	14.28	13.13	179831.72	1.12	8.51	13.112	13.112	13.112	115090.18	115090.18	115090.18	6.16
2015-03-05	12.88	13.45	13.16	12.87	93180.39	0.26	2.02	12.820	12.820	12.820	98904.79	98904.79	98904.79	3.19
2015-03-04	12.80	12.92	12.90	12.61	67075.44	0.20	1.57	12.707	12.707	12.707	100812.93	100812.93	100812.93	2.30
2015-03-03	12.52	13.06	12.70	12.52	139071.61	0.18	1.44	12.610	12.610	12.610	117681.67	117681.67	117681.67	4.76
2015-03-02	12.25	12.67	12.52	12.20	96291.73	0.32	2.62	12.520	12.520	12.520	96291.73	96291.73	96291.73	3.30

643 rows × 14 columns

# Convert the date-string index into a DatetimeIndex so date attributes become available.
time = pd.to_datetime(data.index)
time

DatetimeIndex(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22',
               '2018-02-14', '2018-02-13', '2018-02-12', '2018-02-09',
               '2018-02-08', '2018-02-07',
               ...
               '2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10',
               '2015-03-09', '2015-03-06', '2015-03-05', '2015-03-04',
               '2015-03-03', '2015-03-02'],
              dtype='datetime64[ns]', length=643, freq=None)

# Day of the week for every row (0 = Monday ... 6 = Sunday).
data['week'] = time.weekday
data.head()

	open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover	week
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39	1
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53	0
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32	4
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90	3
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58	2

# Binary label: 1 when p_change is positive, otherwise 0.
data['result'] = np.where(data['p_change']>0,1,0)
data.head()

	open	high	close	low	volume	price_change	p_change	ma5	ma10	ma20	v_ma5	v_ma10	v_ma20	turnover	week	result
2018-02-27	23.53	25.88	24.16	23.53	95578.03	0.63	2.68	22.942	22.142	22.875	53782.64	46738.65	55576.11	2.39	1	1
2018-02-26	22.80	23.78	23.53	22.80	60985.11	0.69	3.02	22.406	21.955	22.942	40827.52	42736.34	56007.50	1.53	0	1
2018-02-23	22.88	23.37	22.82	22.71	52914.01	0.54	2.42	21.938	21.929	23.022	35119.58	41871.97	56372.85	1.32	4	1
2018-02-22	22.25	22.76	22.28	22.02	36105.01	0.36	1.64	21.446	21.909	23.137	35397.58	39904.78	60149.60	0.90	3	1
2018-02-14	21.49	21.99	21.92	21.48	23331.04	0.44	2.05	21.366	21.923	23.253	33590.21	42935.74	61716.11	0.58	2	1

# Cross-tabulate weekday (rows) against the up/down label (columns): one count per cell.
result = pd.crosstab(data['week'], data['result'])

# Convert the counts into per-weekday proportions for plotting.
# The totals get their own name so the builtin `sum` is not shadowed.
row_totals = result.sum(axis=1)  # number of trading days for each weekday
# axis=0 aligns the totals with the row index (week), so every row sums to 1.
# Dividing with axis=1 matched the totals against the column labels instead,
# which gave wrong ratios and all-NaN columns 2-4.
result = result.div(row_totals, axis=0)
result

	0	1	2	3	4
week
0	0.504	0.473282	NaN	NaN	NaN
1	0.440	0.580153	NaN	NaN	NaN
2	0.488	0.541985	NaN	NaN	NaN
3	0.504	0.496183	NaN	NaN	NaN
4	0.472	0.519084	NaN	NaN	NaN

# Picture 1: grouped bars per weekday; Picture 2: the same data as stacked bars.
result.plot(kind='bar',title='Picture 1')
result.plot(kind='bar',title='Picture 2',stacked=True)

<Axes: title={'center': 'Picture 2'}, xlabel='week'>

在这里插入图片描述

# Pivot table: DataFrame.pivot_table(values, index). The default aggregation is
# the mean, so this gives the share of rows with result == 1 for each weekday.
data.pivot_table(['result'],['week'])

	result
week
0	0.496000
1	0.580153
2	0.537879
3	0.507812
4	0.535433

分组与聚合

import pandas as pd
# Read: pd.read_csv(file_path, sep=',', usecols=...)
#   file_path: path of the CSV file; sep: field delimiter; usecols: names of the columns to load.
data = pd.read_csv(
    "./data/参会与请假名单.csv",
    usecols=["序号", '年级', '姓名', '班级', '是否请假'],
)
data

	序号	年级	班级	姓名	是否请假
0	1	2020级	智管2001	徐顺明	是
1	2	2020级	智管2001	付芸	是
2	3	2020级	智管2001	晏程博	是
3	4	2020级	智管2001	李澎宣	是
4	5	2021级	智管2001	李雯	是
5	6	2021级	2021级电子信息专硕1班	李轩	是
6	7	2022级	2022级电子信息专硕1班	彭玉洁	是
7	8	2021级	2021级电子信息专硕1班	贾啸宇	是
8	9	2022级	2022级电子信息专硕1班	唐振瀚	是
9	10	2021级	2021级电子信息专硕1班	朱旭炜	是

# Group the rows by grade and count the non-missing entries of every other column per group.
data.groupby(['年级']).count()

	序号	班级	姓名	是否请假
年级
2020级	4	4	4	4
2021级	4	4	4	4
2022级	2	2	2	2

案例分析

星巴克零售店数据分析

目的：
按照国家、国内省份两种方式进行划分查看星巴克零售店数量分布

import pandas as pd
# Load the Starbucks store directory and preview the first five rows.
data = pd.read_csv("./data/starbucks/directory.csv")
data.head()

	Brand	Store Number	Store Name	Ownership Type	Street Address	City	State/Province	Country	Postcode	Phone Number	Timezone	Longitude	Latitude
0	Starbucks	47370-257954	Meritxell, 96	Licensed	Av. Meritxell, 96	Andorra la Vella	7	AD	AD500	376818720	GMT+1:00 Europe/Andorra	1.53	42.51
1	Starbucks	22331-212325	Ajman Drive Thru	Licensed	1 Street 69, Al Jarf	Ajman	AJ	AE	NaN	NaN	GMT+04:00 Asia/Dubai	55.47	25.42
2	Starbucks	47089-256771	Dana Mall	Licensed	Sheikh Khalifa Bin Zayed St.	Ajman	AJ	AE	NaN	NaN	GMT+04:00 Asia/Dubai	55.47	25.39
3	Starbucks	22126-218024	Twofour 54	Licensed	Al Salam Street	Abu Dhabi	AZ	AE	NaN	NaN	GMT+04:00 Asia/Dubai	54.38	24.48
4	Starbucks	17127-178586	Al Ain Tower	Licensed	Khaldiya Area, Abu Dhabi Island	Abu Dhabi	AZ	AE	NaN	NaN	GMT+04:00 Asia/Dubai	54.54	24.51

# By country: count the stores of each country (via the Brand column) and draw a bar chart.
count = data.groupby(['Country']).count()['Brand']
count.plot(kind='bar', figsize=(20, 8))

<Axes: xlabel='Country'>

在这里插入图片描述

# By country and state/province: count the stores of each (country, state) pair
# (via the Brand column) and draw a very wide bar chart to fit all the bars.
count = data.groupby(['Country', 'State/Province']).count()['Brand']
count.plot(kind='bar', figsize=(200, 8))

电影数据分析

目标：

电影的平均分与电影的导演人数
电影的rating与runtime的分布
统计电影的分类数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the IMDB movie data set for the case study and preview the first five rows.
data = pd.read_csv("./data/IMDB-Movie-Data.csv")
data.head()

	Rank	Title	Genre	Description	Director	Actors	Year	Runtime (Minutes)	Rating	Votes	Revenue (Millions)	Metascore
0	1	Guardians of the Galaxy	Action,Adventure,Sci-Fi	A group of intergalactic criminals are forced ...	James Gunn	Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...	2014	121	8.1	757074	333.13	76.0
1	2	Prometheus	Adventure,Mystery,Sci-Fi	Following clues to the origin of mankind, a te...	Ridley Scott	Noomi Rapace, Logan Marshall-Green, Michael Fa...	2012	124	7.0	485820	126.46	65.0
2	3	Split	Horror,Thriller	Three girls are kidnapped by a man with a diag...	M. Night Shyamalan	James McAvoy, Anya Taylor-Joy, Haley Lu Richar...	2016	117	7.3	157606	138.12	62.0
3	4	Sing	Animation,Comedy,Family	In a city of humanoid animals, a hustling thea...	Christophe Lourdelet	Matthew McConaughey,Reese Witherspoon, Seth Ma...	2016	108	7.2	60545	270.32	59.0
4	5	Suicide Squad	Action,Adventure,Fantasy	A secret government agency recruits some of th...	David Ayer	Will Smith, Jared Leto, Margot Robbie, Viola D...	2016	123	6.2	393727	325.02	40.0

# Average rating over all movies.
print("电影平均分",data['Rating'].mean())

电影平均分 6.723199999999999

# Number of distinct directors: np.unique removes duplicates, shape[0] is the count.
print("电影导演人数：(去重)",np.unique(data['Director']).shape[0])

电影导演人数：(去重) 644

# Distribution of ratings as a histogram.
data['Rating'].plot(kind='hist',figsize=(20,8),title='Score OF Film')

<Axes: title={'center': 'Score OF Film'}, ylabel='Frequency'>

在这里插入图片描述

# Distribution of runtimes (in minutes) as a histogram.
data['Runtime (Minutes)'].plot(kind='hist',figsize=(20,8),title='Runtime OF Film')

<Axes: title={'center': 'Runtime OF Film'}, ylabel='Frequency'>

在这里插入图片描述

# Genre analysis of the movies.
# Split every movie's comma-separated Genre string into a list of genres.
tempList = [genres.split(',') for genres in data['Genre']]
# Flatten all the per-movie lists and keep each genre once (sorted by np.unique).
all_genres = [genre for movie_genres in tempList for genre in movie_genres]
genreList = np.unique(all_genres)
# Zero-filled table with one row per movie and one column per genre.
zeros = np.zeros((data.shape[0], genreList.shape[0]))
tempDf = pd.DataFrame(zeros, columns=genreList)
tempDf

	Action	Adventure	Animation	Biography	Comedy	Crime	Drama	Family	Fantasy	History	Horror	Music	Musical	Mystery	Romance	Sci-Fi	Sport	Thriller	War	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
996	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
997	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
998	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
999	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

1000 rows × 20 columns

# Mark, for every movie, the genres it belongs to, then total each genre column.
# tempList is positional (one genre list per row, in file order) and tempDf has
# a default 0..n-1 index, so iterate by position. Using data's index labels
# would mis-address rows if `data` had been filtered or re-indexed.
for i, genres in enumerate(tempList):
    tempDf.loc[i, genres] = 1
print(tempDf)
print("\n分类结果\n",tempDf.sum(axis=0))

     Action  Adventure  Animation  Biography  Comedy  Crime  Drama  Family  \
0       1.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
1       0.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
2       0.0        0.0        0.0        0.0     0.0    0.0    0.0     0.0   
3       0.0        0.0        1.0        0.0     1.0    0.0    0.0     1.0   
4       1.0        1.0        0.0        0.0     0.0    0.0    0.0     0.0   
..      ...        ...        ...        ...     ...    ...    ...     ...   
995     0.0        0.0        0.0        0.0     0.0    1.0    1.0     0.0   
996     0.0        0.0        0.0        0.0     0.0    0.0    0.0     0.0   
997     0.0        0.0        0.0        0.0     0.0    0.0    1.0     0.0   
998     0.0        1.0        0.0        0.0     1.0    0.0    0.0     0.0   
999     0.0        0.0        0.0        0.0     1.0    0.0    0.0     1.0   

     Fantasy  History  Horror  Music  Musical  Mystery  Romance  Sci-Fi  \
0        0.0      0.0     0.0    0.0      0.0      0.0      0.0     1.0   
1        0.0      0.0     0.0    0.0      0.0      1.0      0.0     1.0   
2        0.0      0.0     1.0    0.0      0.0      0.0      0.0     0.0   
3        0.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
4        1.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
..       ...      ...     ...    ...      ...      ...      ...     ...   
995      0.0      0.0     0.0    0.0      0.0      1.0      0.0     0.0   
996      0.0      0.0     1.0    0.0      0.0      0.0      0.0     0.0   
997      0.0      0.0     0.0    1.0      0.0      0.0      1.0     0.0   
998      0.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   
999      1.0      0.0     0.0    0.0      0.0      0.0      0.0     0.0   

     Sport  Thriller  War  Western  
0      0.0       0.0  0.0      0.0  
1      0.0       0.0  0.0      0.0  
2      0.0       1.0  0.0      0.0  
3      0.0       0.0  0.0      0.0  
4      0.0       0.0  0.0      0.0  
..     ...       ...  ...      ...  
995    0.0       0.0  0.0      0.0  
996    0.0       0.0  0.0      0.0  
997    0.0       0.0  0.0      0.0  
998    0.0       0.0  0.0      0.0  
999    0.0       0.0  0.0      0.0  

[1000 rows x 20 columns]

分类结果
 Action       303.0
Adventure    259.0
Animation     49.0
Biography     81.0
Comedy       279.0
Crime        150.0
Drama        513.0
Family        51.0
Fantasy      101.0
History       29.0
Horror       119.0
Music         16.0
Musical        5.0
Mystery      106.0
Romance      141.0
Sci-Fi       120.0
Sport         18.0
Thriller     195.0
War           13.0
Western        7.0
dtype: float64

	Action	Adventure	Animation	Biography	Comedy	Crime	Drama	Family	Fantasy	History	Horror	Music	Musical	Mystery	Romance	Sci-Fi	Sport	Thriller	War	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
996	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
997	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
998	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
999	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Action	Adventure	Animation	Biography	Comedy	Crime	Drama	Family	Fantasy	History	Horror	Music	Musical	Mystery	Romance	Sci-Fi	Sport	Thriller	War	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
996	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
997	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
998	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
999	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Action	Adventure	Animation	Biography	Comedy	Crime	Drama	Family	Fantasy	History	Horror	Music	Musical	Mystery	Romance	Sci-Fi	Sport	Thriller	War	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
996	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
997	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
998	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
999	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0