Matplotlib
import matplotlib. pyplot as plt
import random
折线图绘制
单图表绘制
# Single line chart: 60 fake temperature samples in [10, 15), decorated and
# saved to disk before display.
import matplotlib. pyplot as plt
import random

x_data = range(60)
y_data = [random.uniform(10, 15) for _ in x_data]

plt.figure(figsize=(20, 8), dpi=200)
plt.plot(x_data, y_data, color='blue', linestyle='-', label='Legend Info')
# Thin out the ticks: every 5th x position, every 2nd y value.
plt.xticks(x_data[::5])
plt.yticks(range(10, 20)[::2])
plt.xlabel("Temp", fontsize=15)
plt.ylabel("Date", fontsize=15)
plt.grid(True, linestyle='--', alpha=0.5)
plt.title("Date of Temp", fontsize=20)
plt.legend()
# savefig must come before show(): show() clears the current figure.
plt.savefig("D://Desktop//test.png")
plt.show()
'''
【关于线段样式】
在linestyle中可以填写的参数有
1. - 表示实线
2. -- 表示虚线
3. -. 表示点划线
4. : 表示点线
5. 空 表示无线段 但在每个刻度有标记点
【关于线段颜色】
color参数可以使用green等常见颜色的英文或者使用RGB三原色表示
如color='blue' 或 color=(0, 0.87, 0.43)(注意:matplotlib的RGB分量取值范围为0~1的小数)
【关于图例】
可以在plt.legend()函数中传入位置参数loc=xx 默认位loc=0 此时会选择最适宜的位置添加图例
'''
# Show every supported linestyle on one axes, each curve shifted up so
# they do not overlap.
import matplotlib. pyplot as plt

x = [1, 2, 3, 4, 5]
y = [1, 4, 9, 16, 25]
plt.plot(x, y, linestyle='-', label='Solid line')
for shift, style, text in ((2, '--', 'Dashed line'),
                           (4, '-.', 'Dash-dot line'),
                           (6, ':', 'Dotted line')):
    plt.plot(x, [v + shift for v in y], linestyle=style, label=text)
# A blank linestyle suppresses the line; markers alone remain.
plt.plot(x, [v + 8 for v in y], linestyle=' ', marker='o',
         label='No line, only markers')
plt.legend()
plt.show()
多图表绘制
在单坐标轴下创建多图表
# Two curves on one axes, told apart by the legend.
import matplotlib. pyplot as plt

x = range(10)
y1 = [2 * n for n in x]
y2 = [3 * n for n in x]

plt.figure()
plt.plot(x, y1, label='x * 2')
plt.plot(x, y2, label='x * 3')
plt.legend()
plt.show()
在多坐标轴下创建多图表
# Two side-by-side subplots; both axes get the same decoration, applied
# in a loop instead of line-by-line repetition.
import matplotlib. pyplot as plt
import random

x = range(30)
y1 = [random.uniform(15, 25) for _ in x]
y2 = [random.uniform(15, 25) for _ in x]

fig, axes = plt.subplots(ncols=2, nrows=1)
axes[0].plot(x, y1, label='Data 1')
axes[1].plot(x, y2, label='Data 2')

for ax, title in zip(axes, ("Date of Temp 1", "Date of Temp 2")):
    ax.set_xticks(x[::5])
    ax.set_yticks(range(15, 25)[::2])
    ax.set_xlabel("Temp", fontsize=15)
    ax.set_ylabel("Date", fontsize=15)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.set_title(title, fontsize=15)
    ax.legend()

plt.savefig("D://Desktop//test.png")
plt.show()
其他图形绘制
'''
# 散点图
# 一般用于查看数据分布规律
plt.scatter(x,y) # 数据1,数据2
# 柱状图
# 用于统计数据与对比
plt.bar(x,width,align) # 数据,柱状高度,柱间对齐方式center默认
# 直方图
plt.hist(x,bins) # 数据,划分区间数
# 饼图
plt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色
'''
'\n# 散点图\n# 一般用于查看数据分布规律\nplt.scatter(x,y) # 数据1,数据2\n\n# 柱状图\n# 用于统计数据与对比\nplt.bar(x,width,align) # 数据,柱状高度,柱间对齐方式center默认\n\n# 直方图\nplt.hist(x,bins) # 数据,划分区间数\n\n# 饼图\nplt.pie(x,labels,autopct,colors) # 数据 每部分的标签 百分比显示形式 每部分的颜色\n'
# Scatter plot of 100 random points — a quick look at the distribution.
# (plt comes from the earlier `import matplotlib.pyplot as plt` cell.)
import random

x = [random.uniform(1, 200) for _ in range(100)]
y = [random.uniform(1, 200) for _ in range(100)]

plt.figure()
plt.scatter(x, y)
plt.show()
# Bar chart with Chinese labels. SimHei supplies the CJK glyphs;
# disabling unicode_minus keeps the minus sign renderable under that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

x = ['M4A1-雷神', 'M4A1-星象', 'M4A1-仲达', 'AK47-奉先']
y = [67, 89, 86, 80]

plt.figure()
plt.bar(x, y, width=0.5, color=['blue', 'purple', 'green', 'red'])
plt.title("部分武器得分榜")
plt.grid(alpha=0.2)
plt.show()
更多绘制方法,详见网址:https://matplotlib.org/index.html
Numpy
import numpy as np
ndarray
[注意] ndarray中所有元素为同一数据类型
创建
# Tour of the common ndarray constructors; each step prints a label,
# the result, and a separator.  (np comes from `import numpy as np` above.)
myArray = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]

myNDArray = np.array(myArray)
print("使用np.array创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.asarray(myArray)
print("使用np.asarray创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.zeros((2, 3))
print("使用np.zeros((2,3))创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.ones((2, 3))
print("使用np.ones((2,3))创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.eye(3)
# BUG FIX: this label previously said np.zeros(2,3); the call is np.eye(3).
print("使用np.eye(3)创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.diag([1, 2, 3, 4])
print("使用np.diag([1,2,3,4])创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.arange(1, 11, 1)
# BUG FIX: the label had an unbalanced ')' — "np.arange(1,11,1))".
print("使用np.arange(1,11,1)创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.linspace(1, 11, 5)
print("使用np.linspace(1,11,5)创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.logspace(0, 2, 10)
# BUG FIX: the label previously said logspace(0,2,105); the call uses 10.
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.random.random(10)
# BUG FIX: the label previously said "np.np.random.random(10)".
print("使用np.random.random(10)创建")
print(myNDArray)
print("--------------------------")

myNDArray = np.random.normal(1.75, 1, 100)
print("使用 np.random.normal(1.75,1,100)创建")
print(myNDArray)
print("--------------------------")
import matplotlib. pyplot as plt

# Histogram of the 100 normal samples left in myNDArray by the previous cell.
plt.figure()
plt.hist(myNDArray, 10)
plt.show()

# NOTE(review): uniform(low=1.10, high=1, ...) has low > high; numpy still
# draws values between the two bounds, but the arguments look swapped — confirm.
myNDArray = np.random.uniform(1.10, 1, (20, 30))
# BUG FIX: the label previously said shape (2,3); the array is (20,30).
print("使用 np.random.uniform(1.10,1,(20,30))创建")
print(myNDArray)
print("--------------------------")

plt.figure()
plt.hist(myNDArray, 10)
plt.show()
使用np.array创建
[[ 1 2 3 4 5]
[ 6 7 8 9 10]]
--------------------------
使用np.asarray创建
[[ 1 2 3 4 5]
[ 6 7 8 9 10]]
--------------------------
使用np.zeros((2,3))创建
[[0. 0. 0.]
[0. 0. 0.]]
--------------------------
使用np.ones((2,3))创建
[[1. 1. 1.]
[1. 1. 1.]]
--------------------------
使用np.zeros(2,3)创建
[[1. 0. 0.]
[0. 1. 0.]
[0. 0. 1.]]
--------------------------
使用np.diag([1,2,3,4])创建
[[1 0 0 0]
[0 2 0 0]
[0 0 3 0]
[0 0 0 4]]
--------------------------
使用np.arange(1,11,1))创建
[ 1 2 3 4 5 6 7 8 9 10]
--------------------------
使用np.linspace(1,11,5)创建
[ 1. 3.5 6. 8.5 11. ]
--------------------------
使用np.logspace(0,2,105)创建
[ 1. 1.66810054 2.7825594 4.64158883 7.74263683
12.91549665 21.5443469 35.93813664 59.94842503 100. ]
--------------------------
使用np.np.random.random(10)创建
[0.98694144 0.76442002 0.68214597 0.14857849 0.77693909 0.80805074
0.21922838 0.18226575 0.26151965 0.15434984]
--------------------------
使用 np.random.normal(1.75,1,100)创建
[ 1.67505971 0.40113656 2.48161824 -0.01845878 0.90496477 2.69802984
2.94877678 0.11782976 2.07285894 3.16597878 3.31993686 1.27704351
1.9537056 0.84782021 2.40737044 0.2981676 0.54453057 3.48002726
2.29480154 0.98207877 1.80277311 2.14887504 1.3656154 1.60483568
2.67246384 2.28251879 1.90566007 2.68032271 0.68770623 1.65787999
2.59625459 1.7460682 -0.39109142 3.1986754 2.02785007 1.76944876
2.45046819 1.31492081 2.62908256 2.2628327 1.43874272 1.31190384
2.30522717 -0.15768705 2.30893341 3.31765409 2.23322137 2.07029301
1.3442506 1.4111489 3.80380232 0.42765345 0.96485353 -0.28374991
3.73684752 1.378377 0.37245836 0.54526902 2.16979223 0.23042717
2.29261266 0.63283867 -0.4741276 1.26395204 1.42796786 -0.42374254
3.60312459 1.79117582 2.92906889 1.48240636 2.36196976 2.28228635
2.43635078 -0.38269342 2.09099631 1.86993982 0.74433223 1.20943986
1.55256999 1.46762698 0.83533645 1.33022822 1.05657318 1.77632771
0.43736867 1.09809372 0.47584479 1.99188425 3.88644217 2.40961493
1.06239141 2.79239086 2.09479233 1.01569475 2.71373938 1.75166344
2.65374284 2.19328214 1.4190753 1.90988844]
--------------------------
使用 np.random.uniform(1.10,1,(2,3))创建
[[1.0646373 1.04106273 1.0493372 1.01323874 1.02643403 1.08756943
1.02212617 1.06738376 1.02425757 1.03069668 1.07832839 1.06473457
1.07000729 1.07619312 1.02304455 1.05318189 1.02412238 1.01569068
1.00526404 1.0782949 1.04618623 1.06822866 1.00146653 1.01051038
1.00344965 1.01393925 1.05490811 1.00784 1.05518212 1.04311717]
[1.0161972 1.03336977 1.08515415 1.00538872 1.0048099 1.01892535
1.02187729 1.07759793 1.0044576 1.05871996 1.09279977 1.01613777
1.04812817 1.07807016 1.08747444 1.03193286 1.09220697 1.03715259
1.08607635 1.09940974 1.0531829 1.06049616 1.02344541 1.02135377
1.04657634 1.05591031 1.07599861 1.03680375 1.09862066 1.04289038]
[1.02593125 1.00965726 1.06444982 1.02401025 1.07185022 1.05705167
1.09746228 1.04192553 1.0069116 1.05229566 1.05951107 1.02913307
1.04204475 1.08807504 1.05549678 1.00606048 1.09828326 1.06536738
1.01245207 1.01919512 1.05593884 1.09891337 1.01178349 1.06798646
1.06798943 1.03214976 1.03997211 1.04422143 1.05580566 1.00940351]
[1.08527616 1.0407684 1.01089278 1.00474172 1.08707804 1.01449114
1.074704 1.09791494 1.07646629 1.03379115 1.03625898 1.06829178
1.03051412 1.04159155 1.02251988 1.08115572 1.08276213 1.09643571
1.0586965 1.0872508 1.04649539 1.08920002 1.00349078 1.07687868
1.01710053 1.08420061 1.08024906 1.09516048 1.09877514 1.04388992]
[1.0612709 1.05243644 1.08522038 1.00013121 1.03534703 1.08780921
1.0193435 1.00894882 1.00302732 1.01461643 1.0072089 1.03730753
1.05381665 1.09445759 1.04131616 1.06119876 1.04852764 1.00175366
1.08286298 1.00279791 1.05527854 1.0361621 1.09378048 1.03111487
1.00669541 1.09474825 1.01639442 1.09015377 1.02394172 1.090647 ]
[1.00015613 1.03282398 1.03716522 1.02517442 1.07579854 1.0482251
1.06349351 1.01553608 1.03999816 1.04254074 1.06792544 1.05475944
1.00114976 1.01739353 1.00824129 1.00200623 1.04649481 1.0051243
1.0740545 1.05106591 1.00720557 1.00800414 1.02753216 1.09822785
1.05727602 1.03426387 1.04683858 1.06691865 1.01377834 1.00705324]
[1.0067312 1.07247999 1.09593779 1.01641581 1.00291297 1.0114776
1.00105711 1.07605113 1.07579346 1.03026725 1.05393696 1.09963711
1.05859099 1.09947237 1.01763998 1.03029485 1.0197936 1.00983296
1.01802012 1.03844166 1.03516277 1.05402403 1.0927023 1.05876193
1.087854 1.02137064 1.07775551 1.04664903 1.04622491 1.00956262]
[1.02989735 1.08207482 1.04191933 1.09785752 1.05924577 1.09635408
1.033684 1.09391279 1.01870238 1.05519453 1.01011376 1.01483874
1.08141842 1.0880821 1.08154995 1.05923363 1.05106973 1.07919393
1.01963205 1.05369732 1.07702226 1.01450227 1.03855182 1.08305393
1.0236216 1.050381 1.0292057 1.01331392 1.08992239 1.00374043]
[1.05060161 1.08649405 1.07435916 1.01101315 1.06029571 1.0613883
1.070457 1.01786738 1.02438536 1.07860248 1.04866478 1.01978034
1.08169093 1.03963785 1.02493817 1.0066989 1.09030658 1.05992054
1.04863499 1.04604901 1.07653593 1.08996868 1.03754703 1.04983558
1.01738205 1.02245851 1.08774079 1.03978456 1.06811202 1.00394928]
[1.03458782 1.08135183 1.01569664 1.03430978 1.04429564 1.01601518
1.01097922 1.01675885 1.09723926 1.04780064 1.07696447 1.08148946
1.00054562 1.08183679 1.05303976 1.00145098 1.03689606 1.07795835
1.029287 1.06312322 1.0250803 1.00676038 1.0140419 1.04726964
1.0274964 1.02649829 1.00096425 1.01156186 1.01754458 1.05531346]
[1.09650902 1.0501937 1.06202465 1.00478206 1.00743854 1.0153268
1.03781729 1.08975352 1.04737951 1.05132598 1.08042364 1.02959953
1.09620665 1.07454291 1.03555717 1.00811896 1.07945615 1.04025569
1.03182324 1.02389092 1.05788885 1.07156594 1.08126521 1.01094597
1.05453652 1.03866728 1.02859898 1.07011385 1.0639741 1.04873718]
[1.01289679 1.06088697 1.01836315 1.09152328 1.04616927 1.05753735
1.0534251 1.06095748 1.03362683 1.09954528 1.04185414 1.07635396
1.03060254 1.05683366 1.06240245 1.09820495 1.04275747 1.03383959
1.04278926 1.05108132 1.07775571 1.07758766 1.06246876 1.04136862
1.0263422 1.09159908 1.02655434 1.04284162 1.09320124 1.02140362]
[1.03076926 1.08489914 1.05564373 1.05989604 1.0298435 1.07130194
1.08288339 1.04831859 1.06030481 1.09380775 1.06243543 1.02529793
1.0719787 1.036513 1.07617001 1.06622073 1.09859239 1.0777177
1.09110076 1.03473311 1.08305644 1.00216531 1.04208478 1.07066882
1.09204254 1.02274071 1.06660437 1.05474587 1.06650928 1.02337433]
[1.06936665 1.09303364 1.00260744 1.05244023 1.03261219 1.04621298
1.01556954 1.01877958 1.01521382 1.09869627 1.08669236 1.07016579
1.0952169 1.08934902 1.06328953 1.02382227 1.01876839 1.0009345
1.027934 1.09276243 1.04391487 1.09443389 1.01542218 1.0921251
1.08244328 1.01936138 1.00763194 1.08467298 1.02930919 1.01444137]
[1.03241116 1.00601585 1.06875779 1.08777646 1.00891861 1.02913552
1.06298963 1.05072194 1.06741635 1.03932451 1.04495906 1.0447338
1.04151223 1.06903579 1.0628366 1.08174608 1.03152726 1.06632495
1.09201743 1.03176641 1.032397 1.02919504 1.08260514 1.08569347
1.06081318 1.0317379 1.05953802 1.02522307 1.04075903 1.00608679]
[1.05553041 1.02101534 1.05290095 1.0641407 1.06111128 1.08564706
1.04200773 1.0997679 1.04809944 1.00981487 1.04163062 1.07267272
1.05242328 1.09129901 1.08763547 1.0178737 1.04900177 1.02927875
1.03146633 1.04696562 1.03558471 1.05745922 1.07103769 1.03829319
1.02809598 1.08237567 1.0357102 1.03505655 1.03042399 1.09207557]
[1.0611547 1.02720924 1.0083003 1.01197453 1.06908812 1.02726194
1.05819931 1.09970103 1.08127893 1.05669232 1.01912968 1.09735505
1.02897018 1.03846417 1.0599642 1.01318714 1.00821409 1.07682408
1.06714662 1.02513284 1.07503411 1.07189177 1.0973542 1.03331157
1.04732424 1.09255911 1.04441842 1.08784725 1.09638214 1.08449857]
[1.05152983 1.00141425 1.0890675 1.07067413 1.0472271 1.0992028
1.07063033 1.02805586 1.01499495 1.00119302 1.02332388 1.06995337
1.02036626 1.08210417 1.04553065 1.02409501 1.04450532 1.00192104
1.00793566 1.0414471 1.02430948 1.07021291 1.06904127 1.03358178
1.04097847 1.05477634 1.09583159 1.06618711 1.02113397 1.07613749]
[1.09239319 1.05635017 1.07690173 1.02985581 1.00288046 1.0082043
1.09042651 1.06242188 1.01469005 1.03368665 1.0700558 1.03243217
1.09059841 1.09951115 1.03634514 1.06982106 1.06986148 1.06205987
1.00002476 1.04247229 1.0447468 1.02022938 1.07420861 1.06889111
1.03558028 1.05827583 1.01506662 1.0296531 1.02743054 1.05307535]
[1.02543541 1.0190161 1.08180094 1.02548265 1.01674936 1.05028411
1.00746477 1.04199418 1.0908696 1.09962999 1.06596327 1.05225282
1.00740661 1.01910418 1.07142337 1.08127211 1.06233714 1.05716245
1.01916161 1.0900991 1.00976151 1.00663847 1.00053414 1.02582093
1.08658318 1.00868502 1.01652466 1.00617985 1.08167276 1.02615394]]
--------------------------
属性
# Inspect the basic ndarray attributes of a 10-element logspace array.
myNDArray = np.logspace(0, 2, 10)
# BUG FIX: the label previously said logspace(0,2,105); the call uses 10.
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")
print("维度:\t\t", myNDArray.ndim)          # number of dimensions
print("形状:\t\t", myNDArray.shape)         # size along each dimension
print("元素个数:\t", myNDArray.size)        # total element count
print("元素数据类型:\t", myNDArray.dtype)   # element dtype (float64 here)
print("元素存储空间:\t", myNDArray.itemsize)  # bytes per element
print("转置数组:\t\n", myNDArray.T)         # transpose (no-op for 1-D)
使用np.logspace(0,2,105)创建
[ 1. 1.66810054 2.7825594 4.64158883 7.74263683
12.91549665 21.5443469 35.93813664 59.94842503 100. ]
--------------------------
维度: 1
形状: (10,)
元素个数: 10
元素数据类型: float64
元素存储空间: 8
转置数组:
[ 1. 1.66810054 2.7825594 4.64158883 7.74263683
12.91549665 21.5443469 35.93813664 59.94842503 100. ]
形状修改
# Contrast the three shape-changing tools: reshape (returns new view),
# resize (in-place, returns None), and .T (returns transpose, original kept).
myNDArray = np.logspace(0, 2, 10)
# BUG FIX: the label previously said logspace(0,2,105); the call uses 10.
print("使用np.logspace(0,2,10)创建")
print(myNDArray)
print("--------------------------")
print("使用 ndarray.reshape()")
print("修改后的多维数组")
print(myNDArray.reshape([-1, 2]))  # -1 lets numpy infer the row count
print("修改后的原始数组")
print(myNDArray)
print("【结论】reshape不修改原始数据 且执行函数后返回修改的值")
print("--------------------------")
print("使用 ndarray.resize()")
print("修改后的多维数组")
print(myNDArray.resize([5, 2]))    # in-place: prints None
print("修改后的原始数组")
print(myNDArray)
print("【结论】resize修改原始数据 且执行函数后返回None")
print("--------------------------")
print("使用 ndarray.T")
print("修改前")
print(myNDArray)
print("修改后的多维数组")
print(myNDArray.T)
print("修改后的原始数组")
print(myNDArray)
print("【结论】T属性不修改原始数据 且执行函数后返回结果")
使用np.logspace(0,2,105)创建
[ 1. 1.66810054 2.7825594 4.64158883 7.74263683
12.91549665 21.5443469 35.93813664 59.94842503 100. ]
--------------------------
使用 ndarray.reshape()
修改后的多维数组
[[ 1. 1.66810054]
[ 2.7825594 4.64158883]
[ 7.74263683 12.91549665]
[ 21.5443469 35.93813664]
[ 59.94842503 100. ]]
修改后的原始数组
[ 1. 1.66810054 2.7825594 4.64158883 7.74263683
12.91549665 21.5443469 35.93813664 59.94842503 100. ]
【结论】reshape不修改原始数据 且执行函数后返回修改的值
--------------------------
使用 ndarray.resize()
修改后的多维数组
None
修改后的原始数组
[[ 1. 1.66810054]
[ 2.7825594 4.64158883]
[ 7.74263683 12.91549665]
[ 21.5443469 35.93813664]
[ 59.94842503 100. ]]
【结论】resize修改原始数据 且执行函数后返回None
--------------------------
使用 ndarray.T
修改前
[[ 1. 1.66810054]
[ 2.7825594 4.64158883]
[ 7.74263683 12.91549665]
[ 21.5443469 35.93813664]
[ 59.94842503 100. ]]
修改后的多维数组
[[ 1. 2.7825594 7.74263683 21.5443469 59.94842503]
[ 1.66810054 4.64158883 12.91549665 35.93813664 100. ]]
修改后的原始数组
[[ 1. 1.66810054]
[ 2.7825594 4.64158883]
[ 7.74263683 12.91549665]
[ 21.5443469 35.93813664]
[ 59.94842503 100. ]]
【结论】T属性不修改原始数据 且执行函数后返回结果
索引与切片
# Index a random 3-D array down to a single scalar element.
pointArray = np.random.uniform(10, 20, (3, 4, 5))
print(pointArray)
print(pointArray[2, 3, 4])  # same element as pointArray[2][3][4]
[[[16.64716349 18.17656348 13.04133624 16.55571974 11.32178617]
[17.79890946 16.34938001 13.85174534 16.56768269 11.5153287 ]
[19.45768797 12.15795393 17.32689244 10.38788172 10.78220753]
[16.2920733 18.05794596 12.96980799 18.02098106 13.50622008]]
[[11.95759017 14.98805551 19.64466193 13.59526949 19.53866727]
[11.15464832 15.46968762 12.76440722 12.41821119 16.24981854]
[11.44377375 17.23954935 19.12779827 11.0169205 17.4642939 ]
[11.38382855 19.36749907 13.84876092 12.8668164 10.55189115]]
[[17.5450068 14.95714515 18.77974534 18.32744931 18.16252158]
[19.16806758 18.05432418 16.60055023 11.384959 15.57648868]
[18.85300183 16.99346902 17.08702811 12.82609623 16.3777613 ]
[11.27290271 17.1267105 11.09846292 16.73752603 19.22497525]]]
19.224975245576598
运算
# Boolean masks, mask assignment, any/all reductions, and np.where variants.
nda = np.arange(1, 11, 2)
print(nda)

mask = nda > 5
print(mask)
nda[mask] = 99          # overwrite every element greater than 5
print(nda)

print(np.any(nda > 4))  # at least one element > 4?
print(np.all(nda > 4))  # every element > 4?
print(np.where(nda > 10, 1, 0))
print(np.where(np.logical_and(nda > 0, nda < 100), 1, 0))
print(np.where(np.logical_or(nda > 0, nda < -1), 1, 0))
[1 3 5 7 9]
[False False False True True]
[ 1 3 5 99 99]
True
False
[0 0 0 1 1]
[1 1 1 1 1]
[1 1 1 1 1]
matrix
import numpy as np

# Three equivalent ways to build a matrix: from a string spec, from nested
# lists, and from an existing ndarray.  (np.matrix/np.mat are legacy APIs.)
matrix1 = np.mat('1,2,3;4,5,6;7,8,9')
matrix2 = np.matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix1, matrix2)

array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix3 = np.mat(array)
matrix4 = np.matrix(array)
print(matrix3, matrix4)

ndarray = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix5 = np.mat(ndarray)
matrix6 = np.matrix(ndarray)

print("Matrix加法运算示例")
print(matrix4 + 2)                  # scalar add broadcasts
print(matrix3 + matrix4)            # element-wise matrix add
print("Matrix乘法运算示例")
print(matrix5 * 2)                  # scalar multiply
print(np.dot(matrix5, 5))           # dot with a scalar also scales
print(np.matmul(matrix5, matrix6))  # true matrix product
[[1 2 3]
[4 5 6]
[7 8 9]] [[1 2 3]
[4 5 6]
[7 8 9]]
[[1 2 3]
[4 5 6]
[7 8 9]] [[1 2 3]
[4 5 6]
[7 8 9]]
Matrix加法运算示例
[[ 3 4 5]
[ 6 7 8]
[ 9 10 11]]
[[ 2 4 6]
[ 8 10 12]
[14 16 18]]
Matrix乘法运算示例
[[ 2 4 6]
[ 8 10 12]
[14 16 18]]
[[ 5 10 15]
[20 25 30]
[35 40 45]]
[[ 30 36 42]
[ 66 81 96]
[102 126 150]]
Pandas
介绍
Pandas基于:
其主要应用领域为数据挖掘 具有独特的数据结构:
具有的独特优势:
Series
创建
import pandas as pd
import numpy as np

# Three ways to build a Series: default integer index, explicit label
# index, and from a dict (keys become the index).
s1 = pd.Series(np.arange(10))
print(s1)
print("------------------------------------------")

s2 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
print(s2)
print("------------------------------------------")

s3 = pd.Series({'red': 1, 'green': 2, 'blue': 3})
print(s3)
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
------------------------------------------
a 0
b 1
c 2
d 3
e 4
dtype: int32
------------------------------------------
red 1
green 2
blue 3
dtype: int64
属性
'''
index,values属性
'''
import pandas as pd

# A Series exposes its data through .values and its labels through .index.
s3 = pd.Series({'red': 1, 'green': 2, 'blue': 3})
print(s3)
print("------------------------------------------")
print("s3的值为:", s3.values)
print("s3的索引为:", s3.index)
red 1
green 2
blue 3
dtype: int64
------------------------------------------
s3的值为: [1 2 3]
s3的索引为: Index(['red', 'green', 'blue'], dtype='object')
DataFrame
创建
import pandas as pd
import numpy as np

# A DataFrame with the default integer axes.
df1 = pd.DataFrame(np.random.randn(2, 3))
print(df1)

# 10 students x 5 subjects of random scores, with labelled axes.
score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print(df2)

# Relabel the rows by assigning a whole new index list.
df2.index = ["STU_No." + str(i) for i in range(1, 11)]
print(df2)

# NOTE(review): reset_index/set_index return NEW frames; both results below
# are discarded, so df2 itself stays unchanged — confirm this is intentional.
df2.reset_index(drop=True)
print(df2)
df2.set_index('Chinese')
0 1 2
0 -0.157809 0.320192 0.231431
1 -0.761845 0.638310 0.539367
Chinese Math English Physical PE
STU_No1 59 58 83 52 74
STU_No2 86 97 78 56 55
STU_No3 57 75 75 43 96
STU_No4 91 81 78 55 96
STU_No5 72 82 54 68 40
STU_No6 75 64 96 91 41
STU_No7 42 50 92 91 47
STU_No8 64 51 67 98 71
STU_No9 95 49 68 45 81
STU_No10 84 56 47 45 43
Chinese Math English Physical PE
STU_No.1 59 58 83 52 74
STU_No.2 86 97 78 56 55
STU_No.3 57 75 75 43 96
STU_No.4 91 81 78 55 96
STU_No.5 72 82 54 68 40
STU_No.6 75 64 96 91 41
STU_No.7 42 50 92 91 47
STU_No.8 64 51 67 98 71
STU_No.9 95 49 68 45 81
STU_No.10 84 56 47 45 43
Chinese Math English Physical PE
STU_No.1 59 58 83 52 74
STU_No.2 86 97 78 56 55
STU_No.3 57 75 75 43 96
STU_No.4 91 81 78 55 96
STU_No.5 72 82 54 68 40
STU_No.6 75 64 96 91 41
STU_No.7 42 50 92 91 47
STU_No.8 64 51 67 98 71
STU_No.9 95 49 68 45 81
STU_No.10 84 56 47 45 43
Math English Physical PE Chinese 59 58 83 52 74 86 97 78 56 55 57 75 75 43 96 91 81 78 55 96 72 82 54 68 40 75 64 96 91 41 42 50 92 91 47 64 51 67 98 71 95 49 68 45 81 84 56 47 45 43
属性
import pandas as pd
# BUG FIX: numpy was imported as "mp" while the code below uses "np",
# which raises NameError when this cell runs on its own.
import numpy as np

# Build the labelled score frame, then walk through its basic attributes.
score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)

print("df2的形状:", df2.shape)
print("df2的行索引:\n", df2.index)
print("df2的列索引:\n", df2.columns)
print("df2的值:\n", df2.values)
print("df2的转置:\n", df2.T)
print("df2的前3行数据:\n", df2.head(3))
print("df2的后5行数据:\n", df2.tail())
df2的形状: (10, 5)
df2的行索引:
Index(['STU_No1', 'STU_No2', 'STU_No3', 'STU_No4', 'STU_No5', 'STU_No6',
'STU_No7', 'STU_No8', 'STU_No9', 'STU_No10'],
dtype='object')
df2的列索引:
Index(['Chinese', 'Math', 'English', 'Physical', 'PE'], dtype='object')
df2的值:
[[64 77 53 67 70]
[48 89 99 68 43]
[63 57 53 59 91]
[44 72 78 96 40]
[44 88 64 82 79]
[80 42 46 54 96]
[97 52 89 68 58]
[57 53 76 78 96]
[86 57 46 76 75]
[78 73 61 91 67]]
df2的转置:
STU_No1 STU_No2 STU_No3 STU_No4 STU_No5 STU_No6 STU_No7 \
Chinese 64 48 63 44 44 80 97
Math 77 89 57 72 88 42 52
English 53 99 53 78 64 46 89
Physical 67 68 59 96 82 54 68
PE 70 43 91 40 79 96 58
STU_No8 STU_No9 STU_No10
Chinese 57 86 78
Math 53 57 73
English 76 46 61
Physical 78 76 91
PE 96 75 67
df2的前3行数据:
Chinese Math English Physical PE
STU_No1 64 77 53 67 70
STU_No2 48 89 99 68 43
STU_No3 63 57 53 59 91
df2的后5行数据:
Chinese Math English Physical PE
STU_No6 80 42 46 54 96
STU_No7 97 52 89 68 58
STU_No8 57 53 76 78 96
STU_No9 86 57 46 76 75
STU_No10 78 73 61 91 67
Pandas的基本数据操作
索引与切片
import pandas as pd
# BUG FIX: numpy was imported as "mp" while the code below uses "np".
import numpy as np

score = np.random.randint(40, 100, (10, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 11)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print(df2)
print()
print("使用直接索引")
print(df2['Chinese']['STU_No1'])   # direct: column first, then row label
print()
print("使用loc函数")
# loc slices by label and is INCLUSIVE of both endpoints.
print(df2.loc["STU_No1":"STU_No5", "Chinese"])
print()
print("使用iloc函数")
# iloc slices by position and EXCLUDES the stop index.
print(df2.iloc[0:3, 0:2])
print()
print ( )
Chinese Math English Physical PE
STU_No1 48 62 52 55 94
STU_No2 75 94 79 84 85
STU_No3 81 76 51 88 88
STU_No4 85 79 60 49 62
STU_No5 43 90 84 56 48
STU_No6 70 95 71 71 55
STU_No7 97 95 50 77 50
STU_No8 68 51 54 45 79
STU_No9 49 82 53 58 63
STU_No10 86 91 69 98 86
使用直接索引
48
使用loc函数
STU_No1 48
STU_No2 75
STU_No3 81
STU_No4 85
STU_No5 43
Name: Chinese, dtype: int32
使用iloc函数
Chinese Math
STU_No1 48 62
STU_No2 75 94
STU_No3 81 76
赋值与排序
import pandas as pd
# BUG FIX: numpy was imported as "mp" while the code below uses "np".
import numpy as np

score = np.random.randint(40, 100, (5, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 6)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
print("原始数据")
print(df2)
print()

# Give every student full marks in PE (column 4) via positional assignment.
df2.iloc[0:5, 4] = 100
print("赋值后的数据")
print(df2, "\n")

print("多指标排序的结果")
# Sort by Chinese first; ties are broken by Math.
print(df2.sort_values(by=['Chinese', 'Math'], ascending=True))
print()
print("索引排序结果")
print(df2.sort_index(ascending=False))
原始数据
Chinese Math English Physical PE
STU_No1 47 55 48 85 54
STU_No2 66 91 47 50 62
STU_No3 72 72 86 82 84
STU_No4 70 73 55 78 43
STU_No5 93 87 85 88 93
赋值后的数据
Chinese Math English Physical PE
STU_No1 47 55 48 85 100
STU_No2 66 91 47 50 100
STU_No3 72 72 86 82 100
STU_No4 70 73 55 78 100
STU_No5 93 87 85 88 100
多指标排序的结果
Chinese Math English Physical PE
STU_No1 47 55 48 85 100
STU_No2 66 91 47 50 100
STU_No4 70 73 55 78 100
STU_No3 72 72 86 82 100
STU_No5 93 87 85 88 100
索引排序结果
Chinese Math English Physical PE
STU_No5 93 87 85 88 100
STU_No4 70 73 55 78 100
STU_No3 72 72 86 82 100
STU_No2 66 91 47 50 100
STU_No1 47 55 48 85 100
算数、逻辑、统计运算
import pandas as pd
# BUG FIX: numpy was imported as "mp" while the code below uses "np".
import numpy as np

# Score frame used by the arithmetic / logic / statistics cells below.
score = np.random.randint(40, 100, (5, 5))
rowIndex = ["STU_No" + str(i) for i in range(1, 6)]
columnIndex = ['Chinese', "Math", 'English', 'Physical', 'PE']
df2 = pd.DataFrame(score, index=rowIndex, columns=columnIndex)
df2
Chinese Math English Physical PE STU_No1 61 86 66 65 57 STU_No2 99 97 57 63 64 STU_No3 49 89 46 84 63 STU_No4 76 81 86 52 65 STU_No5 61 88 70 51 84
# Element-wise add: returns a new Series; df2 itself is unchanged.
df2[ "Chinese" ] . add( 10 )
STU_No1 71
STU_No2 109
STU_No3 59
STU_No4 86
STU_No5 71
Name: Chinese, dtype: int32
# Scalar arithmetic broadcasts over every cell of the frame.
df2* 10
Chinese Math English Physical PE STU_No1 610 860 660 650 570 STU_No2 990 970 570 630 640 STU_No3 490 890 460 840 630 STU_No4 760 810 860 520 650 STU_No5 610 880 700 510 840
# Boolean-mask filter: rows where 80 < Chinese < 90.
df2[ ( df2[ "Chinese" ] > 80 ) & ( df2[ "Chinese" ] < 90 ) ]
Chinese Math English Physical PE
# The same kind of filter written as a query expression string.
df2. query( "Chinese>80 & Chinese<95" )
Chinese Math English Physical PE
# Keep rows whose Chinese score is exactly 86 or 96.
df2[ df2[ "Chinese" ] . isin( [ 86 , 96 ] ) ]
Chinese Math English Physical PE
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df2. describe( )
Chinese Math English Physical PE count 5.000000 5.00000 5.000000 5.000000 5.000000 mean 69.200000 88.20000 65.000000 63.000000 66.600000 std 19.214578 5.80517 14.933185 13.322913 10.212737 min 49.000000 81.00000 46.000000 51.000000 57.000000 25% 61.000000 86.00000 57.000000 52.000000 63.000000 50% 61.000000 88.00000 66.000000 63.000000 64.000000 75% 76.000000 89.00000 70.000000 65.000000 65.000000 max 99.000000 97.00000 86.000000 84.000000 84.000000
# Select a subset of columns by passing a list of labels.
df2[ [ "Chinese" , "Math" ] ]
Chinese Math STU_No1 61 86 STU_No2 99 97 STU_No3 49 89 STU_No4 76 81 STU_No5 61 88
# Per-column range (max - min) computed with apply along axis 0.
df2[ [ "Chinese" , "Math" ] ] . apply ( lambda x: x. max ( ) - x. min ( ) , axis= 0 )
Chinese 50
Math 16
dtype: int64
Pandas的绘图
# Pandas' built-in matplotlib wrapper: line chart of the Chinese column.
df2[ [ "Chinese" ] ] . plot( kind= 'line' )
<Axes: >
Pandas的文件操作
CSV文件
import pandas as pd

# Read only the listed columns from the attendance CSV.
# NOTE(review): the order of usecols does not control column order in the result.
data = pd. read_csv( "./data/参会与请假名单.csv" , usecols= [ "序号" , '年级' , '姓名' , '班级' , '是否请假' ] )
data
序号 年级 班级 姓名 是否请假 0 1 2020级 智管2001 徐顺明 是 1 2 2020级 智管2001 付芸 是 2 3 2020级 智管2001 晏程博 是 3 4 2020级 智管2001 李澎宣 是 4 5 2021级 智管2001 李雯 是 5 6 2021级 2021级电子信息专硕1班 李轩 是 6 7 2022级 2022级电子信息专硕1班 彭玉洁 是 7 8 2021级 2021级电子信息专硕1班 贾啸宇 是 8 9 2022级 2022级电子信息专硕1班 唐振瀚 是 9 10 2021级 2021级电子信息专硕1班 朱旭炜 是
# Re-read the roster, write just name + leave-status (no index) to a new CSV,
# then read that file back to verify.
data = pd. read_csv( "./data/参会与请假名单.csv" , usecols= [ "序号" , '年级' , '姓名' , '班级' , '是否请假' ] )
data. to_csv( "./data/simpleList.csv" , columns= [ '姓名' , '是否请假' ] , index= False )
data = pd. read_csv( "./data/simpleList.csv" )
data
姓名 是否请假 0 徐顺明 是 1 付芸 是 2 晏程博 是 3 李澎宣 是 4 李雯 是 5 李轩 是 6 彭玉洁 是 7 贾啸宇 是 8 唐振瀚 是 9 朱旭炜 是
缺失值处理
处理思路
获取缺失值的标记方式(NaN 或者 ?等其他标记) 如果缺失值为NaN
判断数据中是否包含NaN
pd.isnull(df) 或 pd.notnull(df) 判断数据中是否存在缺失值
删除缺失值 dropna(axis)
替换缺失值 fillna(values,inplace = True)
value: 替换的值 inplace:是否修改原始数据 如果缺失值不为NaN - 先替换缺失标记为NaN,然后按照上述方法执行
案例说明
import pandas as pd

# Load the IMDB movie dataset; some numeric columns contain NaN.
data = pd. read_csv( "./data/IMDB-Movie-Data.csv" )
data. head( )
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore 0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0 1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0 2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0 3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0 4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
import numpy as np

# Report whether any cell anywhere in the frame is missing.
has_missing = np.any(pd.isna(data).to_numpy())
print("存在缺失值:", has_missing)
存在缺失值: True
# Remove every row containing at least one NaN, then verify none remain.
data = data.dropna(axis=0)
print("存在缺失值:", np.any(pd.isna(data)))
存在缺失值: False
# Impute remaining NaNs column-by-column with the column mean.
# Fixes two defects in the original:
#  * `data[i].fillna(..., inplace=True)` mutates a column *slice* — under
#    pandas copy-on-write this is silently a no-op, and the pattern is
#    removed in pandas 3.0; assign the result back instead.
#  * `.mean()` raises TypeError on non-numeric (object) columns, so only
#    numeric columns are imputed.
for col in data.columns:
    if data[col].isna().any() and pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(data[col].mean())
data.head()
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore 0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0 1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0 2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0 3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0 4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
数据离散化
说明: 数据离散化是指在连续属性的值域上,将值域分为若干个离散的区间,并用不同的符号表示落在该离散区间内的连续属性
# Load the daily stock quotes used for the discretisation examples.
data = pd.read_csv("./data/stock_day.csv")
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
# Keep only the daily percent-change column (a Series) for binning.
data = data["p_change"]
data.head()
2018-02-27 2.68
2018-02-26 3.02
2018-02-23 2.42
2018-02-22 1.64
2018-02-14 2.05
Name: p_change, dtype: float64
# Equal-frequency discretisation: split p_change into 10 quantile bins,
# then count how many observations land in each bin.
cut1 = pd.qcut(data, q=10)
cut1.value_counts()
p_change
(-10.030999999999999, -4.836] 65
(-0.462, 0.26] 65
(0.26, 0.94] 65
(5.27, 10.03] 65
(-4.836, -2.444] 64
(-2.444, -1.352] 64
(-1.352, -0.462] 64
(1.738, 2.938] 64
(2.938, 5.27] 64
(0.94, 1.738] 63
Name: count, dtype: int64
# Discretisation with hand-picked bin edges (fixed-width style).
bins = [-100, -75, -10.0, 10, 75, 100]
cut2 = pd.cut(data, bins=bins)
cut2.value_counts()
p_change
(-10.0, 10.0] 622
(-75.0, -10.0] 11
(10.0, 75.0] 10
(-100.0, -75.0] 0
(75.0, 100.0] 0
Name: count, dtype: int64
# Expand the categorical bins into 0/1 indicator columns (one-hot encoding).
dummies = pd.get_dummies(data=cut1, dtype=int)
dummies.head()
(-10.030999999999999, -4.836] (-4.836, -2.444] (-2.444, -1.352] (-1.352, -0.462] (-0.462, 0.26] (0.26, 0.94] (0.94, 1.738] (1.738, 2.938] (2.938, 5.27] (5.27, 10.03] 2018-02-27 0 0 0 0 0 0 0 1 0 0 2018-02-26 0 0 0 0 0 0 0 0 1 0 2018-02-23 0 0 0 0 0 0 0 1 0 0 2018-02-22 0 0 0 0 0 0 1 0 0 0 2018-02-14 0 0 0 0 0 0 0 1 0 0
数据表的合并
说明: 应用场景为模型需要的数据在多张表中,此时可以使用合并操作将多张表合为一张
cut1
2018-02-27 (1.738, 2.938]
2018-02-26 (2.938, 5.27]
2018-02-23 (1.738, 2.938]
2018-02-22 (0.94, 1.738]
2018-02-14 (1.738, 2.938]
...
2015-03-06 (5.27, 10.03]
2015-03-05 (1.738, 2.938]
2015-03-04 (0.94, 1.738]
2015-03-03 (0.94, 1.738]
2015-03-02 (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64, right]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
cut2
2018-02-27 (-10.0, 10.0]
2018-02-26 (-10.0, 10.0]
2018-02-23 (-10.0, 10.0]
2018-02-22 (-10.0, 10.0]
2018-02-14 (-10.0, 10.0]
...
2015-03-06 (-10.0, 10.0]
2015-03-05 (-10.0, 10.0]
2015-03-04 (-10.0, 10.0]
2015-03-03 (-10.0, 10.0]
2015-03-02 (-10.0, 10.0]
Name: p_change, Length: 643, dtype: category
Categories (5, interval[float64, right]): [(-100.0, -75.0] < (-75.0, -10.0] < (-10.0, 10.0] < (10.0, 75.0] < (75.0, 100.0]]
pd. concat( [ cut1, cut2] , axis= 1 )
p_change p_change 2018-02-27 (1.738, 2.938] (-10.0, 10.0] 2018-02-26 (2.938, 5.27] (-10.0, 10.0] 2018-02-23 (1.738, 2.938] (-10.0, 10.0] 2018-02-22 (0.94, 1.738] (-10.0, 10.0] 2018-02-14 (1.738, 2.938] (-10.0, 10.0] ... ... ... 2015-03-06 (5.27, 10.03] (-10.0, 10.0] 2015-03-05 (1.738, 2.938] (-10.0, 10.0] 2015-03-04 (0.94, 1.738] (-10.0, 10.0] 2015-03-03 (0.94, 1.738] (-10.0, 10.0] 2015-03-02 (1.738, 2.938] (-10.0, 10.0]
643 rows × 2 columns
# Two small frames sharing the composite key (key1, key2) for merge demos.
left_data = {
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
right_data = {
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
left = pd.DataFrame(left_data)
right = pd.DataFrame(right_data)
left
key1 key2 A B 0 K0 K0 A0 B0 1 K0 K1 A1 B1 2 K1 K0 A2 B2 3 K2 K1 A3 B3
right
key1 key2 C D 0 K0 K0 C0 D0 1 K1 K0 C1 D1 2 K1 K0 C2 D2 3 K2 K0 C3 D3
pd. merge( left, right, on= [ "key1" , "key2" ] )
key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2
交叉表与透视表
# Reload the daily stock quotes; show a preview and the full frame.
data = pd.read_csv("./data/stock_day.csv")
data.head()
data
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 115090.18 115090.18 115090.18 6.16 2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 98904.79 98904.79 98904.79 3.19 2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 100812.93 100812.93 100812.93 2.30 2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 117681.67 117681.67 117681.67 4.76 2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 96291.73 96291.73 96291.73 3.30
643 rows × 14 columns
# Parse the string date index into a DatetimeIndex.
time = pd.to_datetime(data.index)
time
DatetimeIndex(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22',
'2018-02-14', '2018-02-13', '2018-02-12', '2018-02-09',
'2018-02-08', '2018-02-07',
...
'2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10',
'2015-03-09', '2015-03-06', '2015-03-05', '2015-03-04',
'2015-03-03', '2015-03-02'],
dtype='datetime64[ns]', length=643, freq=None)
# Derive the weekday number (Monday=0 .. Sunday=6) for each trading day.
data['week'] = time.weekday
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover week 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 0 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 4 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 3 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 2
# Binary label: 1 when the day closed up (positive p_change), else 0.
data['result'] = (data['p_change'] > 0).astype(int)
data.head()
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover week result 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 1 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 0 1 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 4 1 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 3 1 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 2 1
# Per-weekday proportion of down (0) vs up (1) days.
result = pd.crosstab(data['week'], data['result'])
# Row totals: number of trading days observed on each weekday.
# (Named `totals` instead of shadowing the builtin `sum`.)
totals = result.sum(axis=1)
# BUG FIX: divide along axis=0 so `totals` aligns with the weekday row
# index. The original `axis=1` aligned it with the 0/1 column labels,
# which produced the misaligned NaN-filled table seen in the output
# (compare with the correct pivot_table proportions further below).
result = result.div(totals, axis=0)
result
0 1 2 3 4 week 0 0.504 0.473282 NaN NaN NaN 1 0.440 0.580153 NaN NaN NaN 2 0.488 0.541985 NaN NaN NaN 3 0.504 0.496183 NaN NaN NaN 4 0.472 0.519084 NaN NaN NaN
# Grouped bars vs. stacked bars of the weekday up/down proportions.
result.plot(kind='bar', title='Picture 1')
result.plot(kind='bar', title='Picture 2', stacked=True)
<Axes: title={'center': 'Picture 2'}, xlabel='week'>
data. pivot_table( [ 'result' ] , [ 'week' ] )
result week 0 0.496000 1 0.580153 2 0.537879 3 0.507812 4 0.535433
分组与聚合
# Reload the roster for the group-by / aggregation examples.
import pandas as pd
data = pd.read_csv("./data/参会与请假名单.csv", usecols=["序号", '年级', '姓名', '班级', '是否请假'])
data
序号 年级 班级 姓名 是否请假 0 1 2020级 智管2001 徐顺明 是 1 2 2020级 智管2001 付芸 是 2 3 2020级 智管2001 晏程博 是 3 4 2020级 智管2001 李澎宣 是 4 5 2021级 智管2001 李雯 是 5 6 2021级 2021级电子信息专硕1班 李轩 是 6 7 2022级 2022级电子信息专硕1班 彭玉洁 是 7 8 2021级 2021级电子信息专硕1班 贾啸宇 是 8 9 2022级 2022级电子信息专硕1班 唐振瀚 是 9 10 2021级 2021级电子信息专硕1班 朱旭炜 是
data. groupby( [ '年级' ] ) . count( )
序号 班级 姓名 是否请假 年级 2020级 4 4 4 4 2021级 4 4 4 4 2022级 2 2 2 2
案例分析
星巴克零售店数据分析
目的: 按照国家、国内省份两种方式进行划分 查看星巴克零售店数量分布
# Starbucks store directory: one row per store worldwide.
import pandas as pd
data = pd.read_csv("./data/starbucks/directory.csv")
data.head()
Brand Store Number Store Name Ownership Type Street Address City State/Province Country Postcode Phone Number Timezone Longitude Latitude 0 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, 96 Andorra la Vella 7 AD AD500 376818720 GMT+1:00 Europe/Andorra 1.53 42.51 1 Starbucks 22331-212325 Ajman Drive Thru Licensed 1 Street 69, Al Jarf Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.42 2 Starbucks 47089-256771 Dana Mall Licensed Sheikh Khalifa Bin Zayed St. Ajman AJ AE NaN NaN GMT+04:00 Asia/Dubai 55.47 25.39 3 Starbucks 22126-218024 Twofour 54 Licensed Al Salam Street Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.38 24.48 4 Starbucks 17127-178586 Al Ain Tower Licensed Khaldiya Area, Abu Dhabi Island Abu Dhabi AZ AE NaN NaN GMT+04:00 Asia/Dubai 54.54 24.51
# Store count per country. count() tallies non-null cells per column;
# any fully-populated column works as the counter, so use 'Brand'.
count = data.groupby(['Country']).count()['Brand']
count.plot(kind='bar', figsize=(20, 8))
<Axes: xlabel='Country'>
# Same tally, broken down by country AND state/province.
count = data.groupby(['Country', 'State/Province']).count()['Brand']
count.plot(kind='bar', figsize=(200, 8))
电影数据分析
目标:
电影的平均分与电影的导演人数 电影的rating与runtime的分布 统计电影的分类数据
# Movie analysis setup: load the IMDB dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("./data/IMDB-Movie-Data.csv")
data.head()
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore 0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0 1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0 2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0 3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0 4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
print ( "电影平均分" , data[ 'Rating' ] . mean( ) )
电影平均分 6.723199999999999
print ( "电影导演人数:(去重)" , np. unique( data[ 'Director' ] ) . shape[ 0 ] )
电影导演人数:(去重) 644
data[ 'Rating' ] . plot( kind= 'hist' , figsize= ( 20 , 8 ) , title= 'Score OF Film' )
<Axes: title={'center': 'Score OF Film'}, ylabel='Frequency'>
data[ 'Runtime (Minutes)' ] . plot( kind= 'hist' , figsize= ( 20 , 8 ) , title= 'Runtime OF Film' )
<Axes: title={'center': 'Runtime OF Film'}, ylabel='Frequency'>
# Split each film's comma-separated Genre string, collect the distinct
# genre names, and build an all-zero indicator frame (film x genre).
tempList = [genre_field.split(',') for genre_field in data['Genre']]
genreList = np.unique([g for genres in tempList for g in genres])
tempDf = pd.DataFrame(np.zeros((data.shape[0], genreList.shape[0])), columns=genreList)
tempDf
Action Adventure Animation Biography Comedy Crime Drama Family Fantasy History Horror Music Musical Mystery Romance Sci-Fi Sport Thriller War Western 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 996 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 997 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 998 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 999 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1000 rows × 20 columns
# Flag each film's genres with 1 in the indicator frame, then total per genre.
# (data.index is the default RangeIndex, so it walks tempList in order.)
for row_label, genres in zip(data.index, tempList):
    tempDf.loc[row_label, genres] = 1
print(tempDf)
print("\n分类结果\n", tempDf.sum(axis=0))
Action Adventure Animation Biography Comedy Crime Drama Family \
0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0
4 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
.. ... ... ... ... ... ... ... ...
995 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
996 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
997 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
998 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
999 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
Fantasy History Horror Music Musical Mystery Romance Sci-Fi \
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
.. ... ... ... ... ... ... ... ...
995 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
996 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
997 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
998 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
999 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Sport Thriller War Western
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 1.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
.. ... ... ... ...
995 0.0 0.0 0.0 0.0
996 0.0 0.0 0.0 0.0
997 0.0 0.0 0.0 0.0
998 0.0 0.0 0.0 0.0
999 0.0 0.0 0.0 0.0
[1000 rows x 20 columns]
分类结果
Action 303.0
Adventure 259.0
Animation 49.0
Biography 81.0
Comedy 279.0
Crime 150.0
Drama 513.0
Family 51.0
Fantasy 101.0
History 29.0
Horror 119.0
Music 16.0
Musical 5.0
Mystery 106.0
Romance 141.0
Sci-Fi 120.0
Sport 18.0
Thriller 195.0
War 13.0
Western 7.0
dtype: float64