import pandas as pd
import matplotlib. pyplot as plt
# Load the data set. The file has no header row, so pandas labels the
# columns 0..N-1; the last column is read below as the class label.
datingTest = pd.read_table('datingTestSet.txt', header=None)
datingTest.head()  # notebook-style preview; has no effect when run as a script

# Scatter-plot colour used for each class label.
LABEL_COLORS = {
    'didntLike': 'black',
    'smallDoses': 'orange',
    'largeDoses': 'red',
}

# One colour per row, taken from the label in the last column. Iterating the
# column directly avoids a per-row ``iloc`` lookup. Rows whose label is not in
# LABEL_COLORS are skipped (as before), which would leave Colors shorter than
# the data set.
Colors = [
    LABEL_COLORS[label]
    for label in datingTest.iloc[:, -1]
    if label in LABEL_COLORS
]
# Use the SimHei font so the Chinese axis labels below can be rendered.
plt.rcParams['font.sans-serif'] = ['Simhei']

# Axis caption for each feature column, keyed by column position:
#   0 -> frequent-flyer miles earned per year
#   1 -> share of time spent playing video games
#   2 -> litres of ice cream consumed per week
AXIS_LABELS = {
    0: '每年飞行常客里程',
    1: '玩游戏视频所占时间比',
    2: '每周消费冰淇淋公升数',
}

pl = plt.figure(figsize=(12, 8))


def _scatter_panel(position, x_col, y_col):
    """Add one subplot to ``pl`` and scatter feature ``x_col`` against ``y_col``.

    ``position`` is the three-digit subplot code passed to ``add_subplot``.
    Points are coloured with the module-level ``Colors`` list. Returns the
    axes object created for the panel.
    """
    axes = pl.add_subplot(position)
    # add_subplot makes the new axes current, so the pyplot calls draw on it.
    plt.scatter(datingTest.iloc[:, x_col], datingTest.iloc[:, y_col],
                marker='.', c=Colors)
    plt.xlabel(AXIS_LABELS[x_col])
    plt.ylabel(AXIS_LABELS[y_col])
    return axes


# Three pairwise views of the features, laid out on a 2x2 grid.
fig1 = _scatter_panel(221, 1, 2)
fig2 = _scatter_panel(222, 0, 1)
fig3 = _scatter_panel(223, 0, 2)
plt.show()
def minmax(dataSet):
    """Rescale *dataSet* to the range [0, 1] with min-max normalisation.

    For a DataFrame every column is scaled independently as
    ``(x - min) / (max - min)``; for a Series the single min/max is used.

    A column whose values are all equal has a range of zero. Dividing by it
    would turn the whole column into NaN, so the range is replaced by 1 and
    such a column maps to 0 instead.

    Returns a new object of the same shape; *dataSet* is not modified.
    """
    minDf = dataSet.min()
    span = dataSet.max() - minDf
    if isinstance(span, pd.Series):
        # DataFrame input: one range per column; patch only the zero ranges.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: min()/max() are scalars.
        span = 1
    return (dataSet - minDf) / span
# Column 3 is carried over unscaled (presumably the class label -- confirm
# against the data file); columns 0-2 are min-max scaled.
datingT2 = datingTest.iloc[:, 3]
datingT1 = minmax(datingTest.iloc[:, :3])

# Re-attach the unscaled column to the right of the scaled features.
datingT = pd.concat(objs=[datingT1, datingT2], axis='columns')
datingT.head()  # notebook-style preview; has no effect when run as a script
def randSplit(dataSet, rate=0.7):
    """Split *dataSet* row-wise into a training part and a test part.

    Despite the name, the split is sequential and not shuffled: the first
    ``int(n * rate)`` rows form the training set and the remaining rows the
    test set.

    Args:
        dataSet: DataFrame to split.
        rate: Fraction of rows assigned to the training set, within [0, 1].

    Returns:
        ``(train, test)``. ``train`` keeps its original row labels; ``test``
        is re-indexed from 0. Both are independent copies, so adding columns
        to them later neither touches *dataSet* nor triggers pandas'
        SettingWithCopyWarning.

    Raises:
        ValueError: If *rate* is outside [0, 1].
    """
    if not 0 <= rate <= 1:
        raise ValueError(f'rate must be within [0, 1], got {rate!r}')
    m = int(dataSet.shape[0] * rate)
    train = dataSet.iloc[:m, :].copy()
    # reset_index returns a new frame, so this is a copy as well.
    test = dataSet.iloc[m:, :].reset_index(drop=True)
    return train, test
# Split the normalised data into training and test partitions using
# randSplit's default rate of 0.7.
train1, test1 = randSplit( datingT)
"""
一步一步验证原理用的,可以忽略
n=train1.shape[1]-1#训练集数据的列数减一
m=test1.shape[0]#测试集数据行数
result=[]
for i in range(m):
dist = list((((train1.iloc[:, :n] - test1.iloc[i, :n]) ** 2).sum(1))**0.5)
dist_l = pd.DataFrame({'dist': dist, 'labels': (train1.iloc[:, n])})
dr = dist_l.sort_values(by = 'dist')[: 5]
re = dr.loc[:, 'labels'].value_counts()
result.append(re.index[0])
result2 = pd.Series(result)
test1['predict'] = result2
"""
def datingClass(train, test, k):
    """Classify each row of *test* by majority vote of its k nearest neighbours.

    Both frames are expected to share one layout: the feature columns first,
    followed by the label column at position ``train.shape[1] - 1``.
    Neighbours are ranked by Euclidean distance over the feature columns,
    matched by column position.

    Args:
        train: Training DataFrame (features + label column).
        test: Test DataFrame in the same layout. It is modified in place: a
            ``'predict'`` column holding the predicted labels is added (or
            overwritten).
        k: Number of nearest neighbours that vote; must be at least 1.

    Returns:
        *test* itself, with the ``'predict'`` column set. The accuracy
        (share of rows whose prediction equals the label) is printed.

    Raises:
        ValueError: If *k* is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k!r}')

    n = train.shape[1] - 1
    labels = train.iloc[:, n]
    # Convert the feature columns to float arrays once. Taking a single row
    # of a mixed-dtype frame with ``iloc[i, :n]`` yields an object Series
    # and forces slow object arithmetic for every test row.
    train_x = train.iloc[:, :n].to_numpy(dtype=float)
    test_x = test.iloc[:, :n].to_numpy(dtype=float)

    result = []
    for row in test_x:
        # Euclidean distance from this test row to every training row.
        dist = ((train_x - row) ** 2).sum(axis=1) ** 0.5
        dist_l = pd.DataFrame({'dist': dist, 'labels': labels})
        nearest = dist_l.sort_values(by='dist').head(k)
        # value_counts sorts by frequency, so the first entry is the winner.
        votes = nearest['labels'].value_counts()
        result.append(votes.index[0])

    # Assign as a plain list so predictions line up with the rows by
    # position, whatever index *test* carries.
    test['predict'] = result
    acc = (test['predict'] == test.iloc[:, n]).mean()
    print(f'模型预测准确率为{acc}')
    return test
# Classify every row of test1 against train1 using the 5 nearest neighbours.
# datingClass prints the accuracy and adds a 'predict' column to test1.
datingClass( train1, test1, 5 )
"""总结此代码
1.调用了pandas,matplotlib.pyplot模块。
2.给出了调入数据集的办法
3.给出了画散点图的方法
4.给出了划分训练集、测试集的办法
5.给出了k近邻算法的调用
6.给出了预测正确率的一种办法
"""