import csv
import random
from math import sqrt
def random_divide(data_sample, n):
    """Split samples into two parts, drawing the same fraction from every class.

    The class label of a sample is its last element. For every class,
    ``int(class_size * n)`` samples are drawn uniformly at random (without
    replacement) and moved to the second returned list.

    Args:
        data_sample: Iterable of samples (sequences ending with the label).
            It is not modified.
        n: Fraction of each class to draw. Values that would request fewer
            than zero or more than the available samples are clamped, so a
            fraction above 1 moves the whole class instead of failing inside
            ``random.randint``.

    Returns:
        A tuple ``(iris_train, iris_test)``: the samples that were not drawn,
        in their original order, and the drawn samples, class by class in the
        order they were picked.
    """
    samples = list(data_sample)

    # Remaining positions of each class, classes in first-seen order.
    positions_by_class = {}
    for position, sample in enumerate(samples):
        positions_by_class.setdefault(sample[-1], []).append(position)

    iris_test = []
    drawn = set()
    for positions in positions_by_class.values():
        class_size = len(positions)
        quota = max(0, min(class_size, int(class_size * n)))
        for _ in range(quota):
            # Pick the pick-th remaining member of this class; this is the
            # same draw the linear rescan of the training list used to make.
            pick = random.randint(0, len(positions) - 1)
            position = positions.pop(pick)
            drawn.add(position)
            iris_test.append(samples[position])

    iris_train = [
        sample for position, sample in enumerate(samples)
        if position not in drawn
    ]
    return iris_train, iris_test
def train_pretreatment(train):
    """Standardise the feature columns of a training set.

    Every sample is a sequence whose last element is the class label and
    whose other elements are numeric features. For each feature column the
    mean and the population standard deviation are computed (both rounded to
    3 decimals, the deviation being measured from the rounded mean), and each
    feature value is replaced by ``(value - mean) / std`` rounded to 3
    decimals.

    Args:
        train: Non-empty list of samples; the feature count is taken from the
            first sample.

    Returns:
        A tuple ``(train_p, average_variance)``. ``train_p`` holds the
        standardised samples with their labels kept as the last element.
        ``average_variance`` holds one ``[mean, std]`` pair per feature.
        Despite the name, the second entry is a standard deviation. A feature
        whose rounded deviation is 0 is stored with a scale of 1.0 so that
        this function and any later use of the statistics never divide by
        zero.
    """
    sample_count = len(train)
    feature_count = len(train[0]) - 1

    average_variance = []
    for col in range(feature_count):
        column = [sample[col] for sample in train]
        mean = round(sum(column) / sample_count, 3)
        squared_spread = sum((mean - value) ** 2 for value in column)
        std = round(sqrt(squared_spread / sample_count), 3)
        # A constant column has no spread; scale it by 1 instead of crashing.
        average_variance.append([mean, std if std else 1.0])

    train_p = []
    for sample in train:
        scaled = [
            round((sample[col] - mean) / scale, 3)
            for col, (mean, scale) in enumerate(average_variance)
        ]
        scaled.append(sample[-1])
        train_p.append(scaled)
    return train_p, average_variance
def test_pretreatment ( test, average_variance) :
test_p = [ ]
for i in range ( len ( test) ) :
p = [ ]
for j in range ( len ( test[ i] ) - 1 ) :
p. append( round ( ( test[ i] [ j] - average_variance[ j] [ 0 ] ) / average_variance[ j] [ 1 ] , 3 ) )
p. append( test[ i] [ - 1 ] )
test_p. append( p)
return test_p
def found_k(test_sample, train_p, k):
    """Find the k training samples closest to a test sample.

    Distance is the Euclidean distance over the feature values (every element
    except the trailing label), rounded to 3 decimals.

    Args:
        test_sample: Sequence of features followed by a label.
        train_p: List of training samples with the same layout.
        k: Maximum number of neighbours to keep.

    Returns:
        A dict mapping ``tuple(training_sample)`` to its distance. Because
        neighbours are keyed by their content, identical training samples
        occupy a single entry.
    """
    num_k = {}
    features = test_sample[:-1]
    for train_sample in train_p:
        key = tuple(train_sample)
        if key in num_k:
            # An identical sample is already kept at the same distance.
            # Re-inserting it used to evict the farthest neighbour without
            # adding an entry, shrinking the dict below k so that the next
            # sample was admitted however far away it was.
            continue

        squared = sum((a - b) ** 2 for a, b in zip(features, train_sample))
        distance = round(sqrt(squared), 3)

        if len(num_k) < k:
            num_k[key] = distance
        elif num_k:
            # max() returns the first entry holding the largest distance.
            farthest = max(num_k, key=num_k.get)
            if distance < num_k[farthest]:
                del num_k[farthest]
                num_k[key] = distance
    return num_k
def count_class(num_k):
    """Count how many of the given neighbours carry each class label.

    Args:
        num_k: Dict whose keys are sample tuples ending with the class label
            (the values are not used).

    Returns:
        A dict mapping each label to its number of occurrences, in the order
        the labels are first met.
    """
    tally = {}
    for neighbour in num_k:
        label = neighbour[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally
def class_p(count_class):
    """Return the label with the highest count.

    Args:
        count_class: Dict mapping a class label to its number of votes.

    Returns:
        The winning label. When several labels share the highest count, the
        one met last while iterating the dict wins. An empty dict yields
        ``''``.
    """
    winner = ''
    most_votes = 0
    for label, votes in count_class.items():
        if votes >= most_votes:
            winner, most_votes = label, votes
    return winner
def accomplish_p(test_p, train_p, k):
    """Classify every test sample with k-nearest-neighbours and report the error rate.

    Each test sample is classified by the majority label among the
    neighbours that ``found_k`` returns for it; the prediction is compared
    with the sample's own label (its last element).

    Args:
        test_p: List of preprocessed test samples (features then label).
        train_p: List of preprocessed training samples with the same layout.
        k: Number of neighbours passed to ``found_k``.

    Returns:
        The fraction of misclassified test samples, rounded to 5 decimals.
        An empty test set yields ``0.0`` rather than a division by zero.
    """
    if not test_p:
        return 0.0

    error_class = 0
    for sample in test_p:
        neighbours = found_k(sample, train_p, k)
        predicted = class_p(count_class(neighbours))
        if sample[-1] != predicted:
            error_class += 1
    return round(error_class / len(test_p), 5)
def chose_k(train_data, folds=10, max_k=9):
    """Select k for the kNN classifier by stratified cross-validation.

    The training data is cut into ``folds`` parts with ``random_divide``.
    Every part serves once as the validation set while the remaining parts
    are standardised with ``train_pretreatment`` / ``test_pretreatment`` and
    used as neighbours. For each k from 1 to ``max_k`` the mean and the
    spread of the per-fold error rates from ``accomplish_p`` are printed.
    The k with the lowest mean error wins; equal means are decided by the
    smaller spread, and on a complete tie the smaller k is kept.

    Args:
        train_data: List of samples (features followed by the class label).
            It is not modified.
        folds: Number of cross-validation parts (default 10).
        max_k: Largest k to evaluate (default 9).

    Returns:
        The selected k.

    Raises:
        ValueError: If ``folds`` is below 2, ``max_k`` is below 1, or the
            data is too small for every fold to receive at least one sample.
    """
    if folds < 2:
        raise ValueError("folds must be at least 2")
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    n = int(len(train_data) / folds)
    if n == 0:
        raise ValueError(
            "need at least %d samples for %d folds, got %d"
            % (folds, folds, len(train_data))
        )

    # Peel one fold after another off a working copy of the data.
    remaining = list(train_data)
    train_data_k = []
    for _ in range(folds):
        remaining, fold = random_divide(remaining, n / len(remaining))
        train_data_k.append(fold)
    if not all(train_data_k):
        raise ValueError(
            "every class needs enough samples to appear in each of the "
            "%d folds" % folds
        )

    # Standardisation does not depend on k, so do it once per fold.
    prepared = []
    for held_out_index, held_out in enumerate(train_data_k):
        fold_train = []
        for index, fold in enumerate(train_data_k):
            if index != held_out_index:
                fold_train.extend(fold)
        fold_train_p, fold_stats = train_pretreatment(fold_train)
        prepared.append((test_pretreatment(held_out, fold_stats), fold_train_p))

    good_k = -1
    err_a = float("inf")
    err_v = float("inf")
    for k in range(1, max_k + 1):
        errks = [
            accomplish_p(fold_test_p, fold_train_p, k)
            for fold_test_p, fold_train_p in prepared
        ]
        errk_a = round(sum(errks) / folds, 6)
        # Population standard deviation of the fold errors, matching the
        # spread measure used by train_pretreatment (divide before the root).
        squared_spread = sum((errk_a - fold_err) ** 2 for fold_err in errks)
        errk_v = round(sqrt(squared_spread / folds), 6)
        print("k=%d, 错误率平均:%f,方差:%f" % (k, errk_a, errk_v))
        # Lower mean error first, then lower spread; strict so ties keep
        # the smaller k.
        if (errk_a, errk_v) < (err_a, err_v):
            good_k = k
            err_a = errk_a
            err_v = errk_v
    print("k选取%d最合适,错误率平均:%f,方差:%f" % (good_k, err_a, err_v))
    return good_k
# Script: load the iris data, hold out a stratified test split, choose k by
# cross-validation on the training part, then report the test error rate.
print("---------------------获取样本集--------------------------")
iris_data = []
# newline="" is what the csv module asks for; the with-block closes the file
# that was previously left open.
with open("D:/iris.csv", "r", newline="") as csv_file:
    f = csv.reader(csv_file)
    for row in f:
        if not row:
            # A blank line would become an empty sample and break sample[-1].
            continue
        # Every column except the last is a numeric feature; the last one is
        # the class label and stays a string.
        for column in range(len(row) - 1):
            row[column] = float(row[column])
        iris_data.append(row)

# Distinct class labels in order of first appearance.
iris_class = []
for iris_sample in iris_data:
    if iris_sample[-1] not in iris_class:
        iris_class.append(iris_sample[-1])
print("iris的样本数量:", len(iris_data))
print("iris的特征维数:", len(iris_data[0]) - 1)
print("iris的类别情况:", iris_class)

print("---------------------随机划分----------------------------")
# 20% of every class goes to the test set.
iris_train, iris_test = random_divide(iris_data, 0.2)
print("len(iris_data) = ", len(iris_data))
print("len(iris_train) = ", len(iris_train))
print("len(iris_test) = ", len(iris_test))

print("----------------------选取k的值--------------------------")
k = chose_k(iris_train)

print("--------------------训练集预处理---------------------------")
iris_train_p, iris_average_variance = train_pretreatment(iris_train)
print("iris_train_p = ", iris_train_p)
print("len(iris_train_p) = ", len(iris_train_p))
print("iris_average_variance = ", iris_average_variance)

print("-------------------测试集预处理-----------------------------")
# The test set is scaled with the statistics of the training set.
iris_test_p = test_pretreatment(iris_test, iris_average_variance)
print("iris_test_p = ", iris_test_p)
print("len(iris_test_p) = ", len(iris_test_p))

print("-------------------对测试集进行预测--------------------------")
err = accomplish_p(iris_test_p, iris_train_p, k)
print("错误率 = ", err)