# 一、复习:布尔序列在数据清洗中的应用(P150)
import numpy as np
import pandas as pd
# Build a boolean mask that marks values of the "counts" column lying more
# than three standard deviations away from the column mean.
detail = pd.read_csv(
    r"G:\大数据实验数据库\3.大数据实验数据\detail.csv",
    encoding="GB18030",
)
ser1 = detail["counts"]

# Compute the centre and spread once and reuse them for both bounds.
counts_mean = ser1.mean()
counts_std = ser1.std()

# x1: values below the lower bound; x2: values above the upper bound.
x1 = ser1 < counts_mean - 3 * counts_std
x2 = ser1 > counts_mean + 3 * counts_std

# A value is flagged when it falls outside either bound.
boolind = x1 | x2
boolind
# 二、P188 任务6.3:SVM模型的使用
import numpy as np
import pandas as pd
from sklearn. metrics import classification_report
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import StandardScaler
from sklearn. svm import SVC
# Train an SVM classifier on the abalone table and print a classification
# report for the held-out part of the data.
ablone = pd.read_csv(
    r"G:\大数据实验数据库\4.Python数据分析与应用\第6章\任务程序\data\abalone.data"
)

# The ninth column (position 8) is the label; the first eight are features.
# NOTE(review): presumably the file has a header row containing "sex" --
# confirm against the data file.
target = ablone.iloc[:, 8]

# Replace the categorical "sex" column by its one-hot indicator columns.
sex = pd.get_dummies(ablone["sex"])
data = pd.concat([ablone.iloc[:, :8], sex], axis=1).drop(columns="sex")

# 80 % of the rows go to training; the seed keeps the split reproducible.
x1, x2, y1, y2 = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both parts.
ssd = StandardScaler()
ssd.fit(x1)
x1_s, x2_s = (ssd.transform(part) for part in (x1, x2))

# SVC with default settings, evaluated on the scaled test rows.
mysvm = SVC()
mysvm.fit(x1_s, y1)
pred = mysvm.predict(x2_s)
print(classification_report(y2, pred))
# 三、处理航空数据
# Clean the airline table: drop rows with a missing SUM_YR_1 / SUM_YR_2 value,
# then keep only rows that satisfy the record filter built below.
airline_data = pd.read_csv(
    r"G:\大数据实验数据库\4.Python数据分析与应用\第7章\任务程序\data\air_data.csv",
    encoding="GB18030",
)

# Step 1: require both SUM_YR columns to be present.
# Fix: the second mask used to re-test "SUM_YR_1", so rows with a missing
# "SUM_YR_2" were never removed (and NaN != 0 is True, so they would also
# have passed the index2 test below).
exp1 = airline_data["SUM_YR_1"].notnull()
exp2 = airline_data["SUM_YR_2"].notnull()
exp = exp1 & exp2
airnotnull = airline_data.loc[exp, :]
airnotnull.shape

# Step 2: keep a row when at least one SUM_YR value is non-zero AND the
# distance is positive AND the average discount is non-zero.
index1 = airnotnull["SUM_YR_1"] != 0
index2 = airnotnull["SUM_YR_2"] != 0
# Fix: `&` binds tighter than `>`, so the unparenthesised form was parsed as
# SEG_KM_SUM > (0 & (...)) and the avg_discount test never constrained the
# result. Each comparison must be parenthesised before combining with `&`.
index3 = (airnotnull["SEG_KM_SUM"] > 0) & (airnotnull["avg_discount"] != 0)
airline = airnotnull[(index1 | index2) & index3]
airline.shape