Quiz 2 (Q17-20)
>>> import math, random
>>> from pylab import *
>>>
>>> # Generates training examples: inputs drawn from the given P(x) over the input space, labels from the noisy target function (producing the y_n).
>>> def training_set(N, x_l, x_r, noise):
        x = []
        y = []
        for i in range(N): # Inputs of the training examples are generated under P(x).
            x_i = uniform(x_l, x_r)
            x.append(x_i)
        x.sort() # Sorted once: the dichotomy algorithm needs ordered inputs (for the theta, s of each 'h').
        for i in range(N):
            s = sign(x[i]) # 'sign()' is from numpy (via pylab).
            y_i = choice([s, -s], p=[1-noise, noise]) # numpy.random.choice(list, p=prob_list): the label is flipped with probability 'noise'.
            y.append(y_i)
        return [x, y]
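>>> ## For comparison, a minimal vectorized sketch of the same generator (hypothetical
>>> ## name 'training_set_np'; uses only numpy functions that pylab already imports):
>>> def training_set_np(N, x_l, x_r, noise):
        x = sort(uniform(x_l, x_r, N)) # N inputs drawn from P(x) = U[x_l, x_r], sorted once
        flip = choice([1, -1], size=N, p=[1-noise, noise]) # each label flipped with probability 'noise'
        y = sign(x)*flip # noisy target: y_n = flip_n * sign(x_n)
        return [list(x), list(y)]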
>>> def dichotomy(points, index): # The dichotomy() function is the key! The 'index' argument selects one specific dichotomy; the function returns its E_in and hypothesis parameters.
        x = points[0]
        y = points[1]
        N = len(x)
        if type(index) != int or index > 2*N-1 or index < 0:
            raise ValueError('Inappropriate dichotomy index!')
        E_in, theta, s = 0, 0, 0 # Initialize the 3 key values of the dichotomy. Beware of the 'a,b = 0' mistake!
        ## By symmetry, N dichotomies are positive rays and the other N are negative rays. The first dichotomy of each ray gets a special theta, since there is no interval to its left.
        if index < N: s = 1
        else: s = -1
        if index == 0 or index == N:
            theta = x[0] - 1
        else:
            if index < N:
                theta = 0.5*(x[index]-x[index-1]) + x[index-1]
            if index > N:
                theta = 0.5*(x[index-N]-x[index-N-1]) + x[index-N-1] # Locating theta (the midpoint of the interval) with the sorted x-list.
        y_dico = [] # Setting up the dichotomy's outputs!!!
        if index < N:
            for i in range(N):
                y_dico.append(1)
            if index > 0:
                for i in range(0, index):
                    y_dico[i] = -1
        elif index >= N:
            for i in range(N):
                y_dico.append(-1)
            if index > N:
                for i in range(0, index-N):
                    y_dico[i] = 1 # How cumbersome!!!
        error = 0 # Calculating E_D of the dichotomy.
        for i in range(N):
            if y_dico[i] != y[i]:
                error += 1
        E_in = error/N
        return [s, theta, E_in]
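>>> ## The branching above can be collapsed: index mod N counts the leftmost points that
>>> ## fall on the minus side of a positive ray (or the plus side of a negative one). An
>>> ## equivalent compact sketch (hypothetical name 'dichotomy_compact', same indexing):
>>> def dichotomy_compact(points, index):
        x, y = points
        N = len(x)
        s = 1 if index < N else -1 # indices 0..N-1: positive rays; N..2N-1: negative rays
        k = index % N # number of leftmost points on the other side of theta
        theta = x[0]-1 if k == 0 else 0.5*(x[k]+x[k-1]) # midpoint theta, or left of all points
        errors = sum(1 for i in range(N) if s*(1 if i >= k else -1) != y[i])
        return [s, theta, errors/N]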
>>> def DSA(n_exp, N, x_l, x_r, noise):
        E_D_mean = 0
        E_X_mean = 0
        for i in range(n_exp):
            t_data = training_set(N, x_l, x_r, noise)
            E_D_min = 1
            dico_min = 0
            for j in range(2*N): # Try all 2N dichotomies and keep the one with the lowest in-sample error.
                d = dichotomy(t_data, j)
                if d[2] < E_D_min:
                    E_D_min = d[2]
                    dico_min = j
            E_D_mean += E_D_min
            d_min = dichotomy(t_data, dico_min)
            s = d_min[0]
            theta = d_min[1]
            E_X_g = 0.5 + 0.3*s*(abs(theta)-1) # Closed-form E_out of the chosen g; derivation below.
            E_X_mean += E_X_g
        E_D_mean = E_D_mean/n_exp
        E_X_mean = E_X_mean/n_exp
        return E_D_mean, E_X_mean
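>>> ## Where 'E_X_g = 0.5+0.3*s*(abs(theta)-1)' comes from: with x ~ U[-1,1], target
>>> ## f(x) = sign(x) and noise rate eta = 0.2, the noiseless disagreement of
>>> ## h(x) = s*sign(x-theta) is mu = |theta|/2 for s = +1 and mu = 1-|theta|/2 for
>>> ## s = -1 (valid for theta in [-1,1]), so E_out = eta + (1-2*eta)*mu = 0.2 + 0.6*mu
>>> ## = 0.5 + 0.3*s*(|theta|-1). Caveat: the index-0/N dichotomies use theta = x[0]-1 < -1,
>>> ## where the exact E_out is 0.5 and the closed form only approximates it; those two
>>> ## dichotomies are rarely the E_in minimizers, so the averages below are barely affected.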
>>> DSA(5000,20, -1, 1, 0.2)
(0.1681100000000005, 0.25715611716437825)
>>> DSA(5000,20, -1, 1, 0.2)
(0.16749000000000003, 0.25488045851543445)
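>>> ## The two runs agree to two decimal places. Note the optimism of in-sample error:
>>> ## E_D (~0.17) dips below the best achievable E_out of 0.2 (s = +1, theta = 0),
>>> ## while the E_out of the learned g averages ~0.26 with only N = 20 noisy points.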
>>>
>>> ## Multidimensional Decision Stump: Q19-20
>>> # Run the decision-stump algorithm (positive/negative rays H) on each of the 9 dimensions of the input x, then choose the hypothesis h (s, theta) with the lowest E_D across all 9 dimensions.
>>> def multi_d_examples(fname): # Returns the sorted x_d--y lists for running DSA/dichotomy() on each dimension.
        F = open(fname)
        L_strings = F.readlines() # Re-uses the file-reading code from the PLA exercise.
        F.close()
        L_float_lists = []
        for l in L_strings:
            t1 = l.strip()
            t2 = t1.split()
            for i in range(len(t2)):
                t2[i] = float(t2[i])
            L_float_lists.append(t2)
        all_d_data = [] # Each element is an [x, y] list with sorted x, like the output of training_set().
        dimension = len(L_float_lists[0]) - 1
        N = len(L_float_lists)
        def takeFirst(L): # Plays the role of a lambda: the 'key' for sorted().
            return L[0]
        for i in range(dimension): # Collect the x-y pairs of every dimension and convert the data to the sorted [x, y] format.
            x = []
            y = []
            unsorted = []
            for j in range(N):
                x_y = [L_float_lists[j][i], L_float_lists[j][-1]]
                unsorted.append(x_y)
            ascend = sorted(unsorted, key=takeFirst) # Sorts on x while carrying each label along.
            for l in ascend:
                x.append(l[0])
                y.append(l[1])
            d_data = [x, y]
            all_d_data.append(d_data)
        return all_d_data # A list of 'dimension' [x, y] lists, i.e. [[x_0, y], [x_1, y], ...]
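>>> ## A shorter equivalent loader (a sketch with the hypothetical name
>>> ## 'multi_d_examples_np'; assumes the same whitespace-separated "x_1 ... x_d y" rows
>>> ## and uses argsort to sort each dimension while carrying the labels along):
>>> def multi_d_examples_np(fname):
        data = loadtxt(fname) # one row per example, last column is the label y
        all_d_data = []
        for i in range(data.shape[1]-1):
            order = argsort(data[:,i]) # indices that sort dimension i
            all_d_data.append([list(data[order,i]), list(data[order,-1])])
        return all_d_data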
>>> def DSA_multi_d(train, test): # One function that covers both Q19 and Q20.
        training_data = multi_d_examples(train)
        optimal_dim = 0 # Records the dimension with the lowest E_D, the 'classification dimension'.
        E_D_d_min = 1 # Records the lowest E_D over all dimensions; reported for Q19.
        dico_best = [] # Records the best h over all dimensions.
        dim = len(training_data) # The number of dimensions.
        N = len(training_data[0][0]) # The size of the training set in any dimension, i.e. the number of inputs/labels.
        for i in range(dim): # Run DSA on each dimension, recording the dimension with the lowest E_D and its parameters: dimension #, h's theta/s.
            t_d = training_data[i]
            E_D_min = 1
            dico_min = 0
            for j in range(2*N):
                d = dichotomy(t_d, j)
                if d[2] < E_D_min:
                    E_D_min = d[2]
                    dico_min = j
            if E_D_min < E_D_d_min: # Final step!
                E_D_d_min = E_D_min
                optimal_dim = i
                dico_best = dichotomy(t_d, dico_min) # Key elements (E_D_d_min, optimal_dim, dico_best) all attained!
        test_data = multi_d_examples(test)
        error_test = 0
        test_d_x = test_data[optimal_dim][0] # The inputs x and outputs y of the classification dimension.
        test_d_y = test_data[optimal_dim][1]
        s = dico_best[0]
        theta = dico_best[1]
        for i in range(len(test_d_y)):
            h = s*sign(test_d_x[i]-theta) # Evaluate g(x) = s*sign(x-theta) on the test points of the chosen dimension.
            if h != test_d_y[i]:
                error_test += 1
        E_test = error_test/len(test_d_y)
        print('Minimal E_in is: ',E_D_d_min,'; E_test is: ',E_test)
>>> DSA_multi_d('C:/Users/logic/Desktop/train.txt', 'C:/Users/logic/Desktop/test.txt')
Minimal E_in is: 0.25 ; E_test is: 0.355