# import math
# import sklearn
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn import datasets
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_predict
# from collections import Counter
#
#
# def dist(x1, x2, p=2): # p=1: Manhattan distance, p=2: Euclidean distance, p=3, ...; default is 2
# if len(x1) == len(x2) and len(x1) > 1:
# my_sum = 0
# for i in range(0, len(x1), 1):
# my_sum += math.pow(abs(x1[i] - x2[i]), p) # note the absolute value here; omitting it leads to an error
# ret = math.pow(my_sum, 1/p)
# return ret
# return 0
#
#
# def homework():
# a = [1, 1]
# b = [5, 1]
# c = [4, 4]
#
# for p in range(1, 6, 1):
# myresu = [dist(a, b, p), dist(a, c, p)]
# print("p为:" + str(p) + ", a到b, c的距离是:%f %f" % (myresu[0], myresu[1]))
#
#
# def feature_program():
# iris = sklearn.datasets.load_iris()
# df = pd.DataFrame(iris.data, columns=iris.feature_names)
# df['label'] = iris.target
# df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
# # print(df.head(10))
# # plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0')
# # plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1')
# # plt.xlabel('sepal length')
# # plt.ylabel('sepal width')
# # plt.show()
# # plt.legend()
#
# data = np.array(df.iloc[:100, [0, 1, -1]])
# x, y = data[:, :-1], data[:, -1]
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
#
#
#
# class KNN:
# def __init__(self, X_train, y_train, n_neighbors=3, p=2):
# """
# parameter: n_neighbors number of neighbouring points
# parameter: p distance metric
# """
# self.n = n_neighbors
# self.p = p
# self.X_train = X_train
# self.y_train = y_train
#
# def predict(self, X):
# # take out n points
# knn_list = []
#
# for i in range(self.n):
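# # Call the library function to compute the p-norm of a vector: the distance from the given vector to each point in the training set
# # Stored as a tuple: (distance, class)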
# dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
# knn_list.append((dist, self.y_train[i]))
#
# for i in range(self.n, len(self.X_train)):
# #
# max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
# dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
# if knn_list[max_index][0] > dist:
# knn_list[max_index] = (dist, self.y_train[i])
#
# # tally the labels
# knn = [k[-1] for k in knn_list]
# count_pairs = Counter(knn)
# # max_count = sorted(count_pairs, key=lambda x: x)[-1]
# max_count = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
# return max_count
#
# def score(self, X_test, y_test):
# right_count = 0
# n = 10
# for X, y in zip(X_test, y_test):
# label = self.predict(X)
# if label == y:
# right_count += 1
# return right_count / len(X_test)
#
#
#
#
#
import math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
def dist(x, y, p=2):
    """Return the Minkowski (L_p) distance between two equal-length vectors.

    Args:
        x: Sized sequence of numbers.
        y: Sized sequence of numbers with the same length as ``x``.
        p: Order of the distance. ``p=1`` is the Manhattan distance and
            ``p=2`` (the default) the Euclidean distance. Must be positive.

    Returns:
        ``(sum(|x[i] - y[i]| ** p)) ** (1 / p)`` as a float; ``0.0`` for two
        empty vectors.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, or ``p`` is not
            positive.
    """
    # Reject mismatched vectors up front: otherwise a longer ``y`` would be
    # silently truncated and a shorter one would fail with a bare IndexError.
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (len(x), len(y))
        )
    # ``1 / p`` below is undefined for p == 0 and meaningless for p < 0.
    if p <= 0:
        raise ValueError("p must be positive, got %r" % (p,))

    total = 0.0
    for a, b in zip(x, y):
        # abs() keeps the accumulated sum non-negative, so the final
        # fractional power is always defined.
        total += math.pow(abs(a - b), p)
    return math.pow(total, 1 / p)
def exanple_3_1():
    """Print L_p distances from (1, 1) to (5, 1) and to (4, 4) for p = 2..5.

    For each order the output is the order itself, the two distances (one
    per line, in that sequence) and then an empty line.
    """
    origin = [1, 1]
    targets = ([5, 1], [4, 4])
    for order in range(2, 6):
        print(order)
        for target in targets:
            print(dist(origin, target, order))
        # Empty line separates the output of consecutive orders.
        print()
# Load the iris data into a DataFrame: one column per feature plus the class
# label in a trailing 'target' column.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
df['target'] = iris['target']

# Scatter sepal length against sepal width, one series per block of 50 rows.
# NOTE(review): the legend labels come from row position, not from
# df['target']; this presumes the rows are grouped by class in blocks of 50.
for _first_row, _legend in ((0, '0'), (50, '1'), (100, '2')):
    _block = df[_first_row:_first_row + 50]
    plt.scatter(_block['sepal length (cm)'], _block['sepal width (cm)'], label=_legend)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.legend()
plt.show()

# Features are the first two columns; labels are the last column.
x = np.array(df.iloc[:, :2])
y = np.array(df.iloc[:, -1])
# Hold out 30% of the samples for testing. No random_state is passed, so the
# split is presumably different on every run -- see the train_test_split docs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
class KNN:
    """Brute-force k-nearest-neighbours classifier using an L_p distance.

    Every prediction scans all stored training samples; there is no separate
    training step.

    Attributes:
        x_train: Training samples exactly as passed to the constructor.
        y_train: Training labels exactly as passed to the constructor.
        neighbor: Number of nearest neighbours that vote on a prediction.
        p: Order of the norm passed to ``np.linalg.norm`` as ``ord``.
        diction: List of ``(sample, label)`` pairs built from the training
            data.
    """

    def __init__(self, x_train, y_train, neighbor=5, p=2):
        """Store the training data and the classifier settings.

        Args:
            x_train: Sized sequence of training samples (numeric vectors).
            y_train: Sized sequence of hashable labels, one per sample.
            neighbor: Number of nearest neighbours that vote (at least 1).
            p: Order of the norm used as the distance.

        Raises:
            ValueError: If ``x_train`` and ``y_train`` differ in length or
                ``neighbor`` is smaller than 1.
        """
        if len(x_train) != len(y_train):
            raise ValueError(
                "x_train and y_train must have the same length, got %d and %d"
                % (len(x_train), len(y_train))
            )
        if neighbor < 1:
            raise ValueError("neighbor must be at least 1, got %r" % (neighbor,))
        self.x_train = x_train
        self.y_train = y_train
        self.neighbor = neighbor
        self.p = p
        # [(sample, label), ...] -- the pairs scanned by every prediction.
        self.diction = list(zip(x_train, y_train))

    def fit(self, x):
        """Predict the label of one sample by majority vote of its neighbours.

        Despite the name, nothing is trained here: ``x`` is the point to
        classify. Its distance to every stored training sample is computed
        and the ``neighbor`` closest samples vote with their labels.

        Args:
            x: The sample to classify (numeric vector).

        Returns:
            The most frequent label among the nearest neighbours. When
            several labels share the highest count, the one whose first
            vote comes from the closest sample is returned.

        Raises:
            ValueError: If there are no training samples.
        """
        if not self.diction:
            raise ValueError("cannot classify without training samples")

        query = np.asarray(x)
        # [(distance, label), ...] for every training sample.
        ranked = [
            (np.linalg.norm(query - np.asarray(sample), ord=self.p), label)
            for sample, label in self.diction
        ]
        # Sort on the distance only so that labels never have to be ordered;
        # the sort is stable, so equidistant samples keep training order.
        ranked.sort(key=lambda pair: pair[0])

        # The voters are the ``neighbor`` closest samples (not ``p`` of
        # them); slicing quietly caps this at the training-set size.
        votes = Counter(label for _, label in ranked[:self.neighbor])
        return votes.most_common(1)[0][0]

    def test(self, x_test, y_test):
        """Return the fraction of test samples that are classified correctly.

        Args:
            x_test: Sized sequence of samples to classify.
            y_test: Sized sequence of expected labels, one per sample.

        Returns:
            Accuracy as a float between 0 and 1.

        Raises:
            ValueError: If ``x_test`` is empty or the two sequences differ
                in length.
        """
        if len(x_test) != len(y_test):
            raise ValueError(
                "x_test and y_test must have the same length, got %d and %d"
                % (len(x_test), len(y_test))
            )
        if len(x_test) == 0:
            raise ValueError("cannot compute accuracy on an empty test set")
        hits = sum(
            1 for point, label in zip(x_test, y_test) if self.fit(point) == label
        )
        return hits / len(x_test)
# Overlay the training and test samples: column 0 on the x axis, column 1 on
# the y axis, one series per split.
df1 = pd.DataFrame(data=x_train)
df2 = pd.DataFrame(data=x_test)
for _frame, _legend in ((df1, 'train'), (df2, 'test')):
    plt.scatter(_frame[0], _frame[1], label=_legend)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.legend()
plt.show()

# Build the classifier on the training split and print the value returned by
# its test() method for the held-out split.
clf = KNN(x_train, y_train, neighbor=5, p=2)
print(clf.test(x_test, y_test))