Linear Regression
Univariate Linear Regression: Gradient Descent
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Set the random seed so results are reproducible
np.random.seed(0)
# Generate the independent variable x: 100 points uniformly distributed in [0, 100]
x = np.random.uniform(0, 100, 100)
# Assume a linear relationship between y and x, plus some noise:
# y = 3 * x + 5 + noise, where the noise is Gaussian with mean 0 and std 10
noise = np.random.normal(0, 10, 100)
y = 3 * x + 5 + noise
# Store the data in a DataFrame
data = pd.DataFrame({'X': x, 'Y': y})
# Save the data to a CSV file
data.to_csv('linear_regression_data.csv', index=False)
data.insert(0, 'ones', 1)  # prepend a bias column x0 = 1 (after saving, so the CSV keeps only X and Y)
# Scatter plot of the data
plt.scatter(x, y)
plt.title('Scatter Plot of Linear Regression Data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(True)
plt.show()
# Data preparation
X = data.iloc[:,0:-1]
X.head()
X = X.values
X.shape
y = data.iloc[:,-1]
y.head()
y = y.values
y.shape
y = y.reshape(100,1)
y.shape
# Compute the value of the cost J(theta)
def cost_func(X, y, theta):
    inner = np.power(X @ theta - y, 2)
    return np.sum(inner) / (2 * len(X))
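# For reference, cost_func implements the standard least-squares objective
#   J(theta) = 1/(2m) * sum_i (theta^T x_i - y_i)^2,  with m = len(X)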
# Initialize the parameters (zeros)
theta = np.zeros((2, 1))
print(theta)
cost0 = cost_func(X, y, theta)
print("cost0:")
print(cost0)
# Learning rate and number of iterations
alpha = 0.000001
count = 1000000
# Gradient descent
def gradient_descent(X, y, alpha, count):
    global theta
    costs = []
    for i in range(count):
        theta = theta - (X.T @ (X @ theta - y)) * alpha / len(X)
        nowcost = cost_func(X, y, theta)
        costs.append(nowcost)
        if i % 100 == 0:
            print(nowcost)
    return theta, costs
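# The vectorized update in the loop above is batch gradient descent on J(theta):
#   theta := theta - (alpha/m) * X^T (X theta - y)
# which updates every parameter simultaneously.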
theta_ans, cost_ans = gradient_descent(X, y, alpha, count)
# Plot the cost curve
fig, ax = plt.subplots()
ax.plot(np.arange(count), cost_ans)
ax.set(xlabel='count', ylabel='cost')
plt.show()
# Plot the fitted line
x = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)  # grid over the range of the x feature
y_ = theta_ans[0, 0] + theta_ans[1, 0] * x  # intercept is theta[0, 0], slope is theta[1, 0]
print("b:")
print(theta_ans[0, 0])
print("k:")
print(theta_ans[1, 0])
fig, ax = plt.subplots()
ax.scatter(X[:, 1], y, label='training')  # scatter of the training data (column 1 of X)
ax.plot(x, y_, 'r', label='predict')  # fitted regression line
ax.legend()
ax.set(xlabel='X', ylabel='Y')
plt.show()
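As a quick sanity check (an addition, not part of the original script), the parameters found by gradient descent can be compared with the closed-form least-squares solution on the same design matrix; with enough iterations the two should roughly agree:

theta_ls, *_ = np.linalg.lstsq(X, y, rcond=None)  # closed-form least-squares fit
print("gradient descent:", theta_ans.ravel())
print("least squares:   ", theta_ls.ravel())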
Multivariate Linear Regression: Gradient Descent
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
np.random.seed(0)
# Dataset size
num_samples = 1000
# Generate the feature data
area = np.random.normal(loc=1500, scale=300, size=num_samples)  # floor area, mean 1500, std 300
year = np.random.randint(1950, 2023, size=num_samples)  # build year, between 1950 and 2022
num_rooms = np.random.randint(2, 6, size=num_samples)  # number of rooms, between 2 and 5
# Generate the target (house price), assuming the linear relationship
# price = 100 * area + 500 * (2022 - year) - 300 * num_rooms + noise
noise = np.random.normal(loc=0, scale=10000, size=num_samples)  # additive noise
price = 100 * area + 500 * (2022 - year) - 300 * num_rooms + noise
# Z-score normalization
def z_score_normalization(feature):
    mean = np.mean(feature)
    std = np.std(feature)
    normalized_feature = (feature - mean) / std
    return normalized_feature
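# This is the standard standardization z = (x - mu) / sigma: each feature ends up
# with zero mean and unit variance, which keeps the gradient steps comparable
# across features of very different scales (area vs. number of rooms).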
# Apply Z-score normalization to each feature
area_normalized = z_score_normalization(area)
year_normalized = z_score_normalization(year)
num_rooms_normalized = z_score_normalization(num_rooms)
# Print the normalized feature data
print("Normalized Area:", area_normalized)
print("Normalized Year:", year_normalized)
print("Normalized Number of Rooms:", num_rooms_normalized)
# Build the DataFrame
data = pd.DataFrame({
    'Area': area_normalized,
    'Year': year_normalized,
    'NumRooms': num_rooms_normalized,
    'Price': price
})
# Save the dataset to a file
data.to_csv('linear_regression_data1.csv', index=False)
data.insert(0, 'ones', 1)  # bias column x0 = 1
# Data preparation
X = data.iloc[:, :-1].values  # feature matrix
y = data.iloc[:, -1].values.reshape(-1, 1)  # target column
# Split into training (70%) and test (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Compute the value of the cost J(theta)
def cost_func(X, y, theta):
    inner = (X @ theta - y) ** 2
    return np.sum(inner) / (2 * len(X))
# Initialize the parameters (zeros)
theta = np.zeros((X.shape[1], 1))
# Learning rate and number of iterations
alpha = 0.0001
count = 1000000
# Gradient descent
def gradient_descent(X, y, alpha, count):
    global theta
    costs = []
    for i in range(count):
        theta = theta - (X.T @ (X @ theta - y)) * alpha / len(X)
        nowcost = cost_func(X, y, theta)
        costs.append(nowcost)
        if i % 100 == 0:
            print(nowcost)
    return theta, costs
theta_ans, cost_ans = gradient_descent(X_train, y_train, alpha, count)
# Plot the cost curve
fig, ax = plt.subplots()
ax.plot(np.arange(count), cost_ans)
ax.set(xlabel='count', ylabel='cost')
plt.show()
print("θ0:")
print(theta_ans[0, 0])
print("θ1:")
print(theta_ans[1, 0])
print("θ2:")
print(theta_ans[2, 0])
print("θ3:")
print(theta_ans[3, 0])
# Evaluate the model on the test set
test_cost = cost_func(X_test, y_test, theta_ans)
print("Cost on the test set:", test_cost)
Logistic Regression
Binary Classification
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def bce_loss(pred, target):
    """
    Compute the binary cross-entropy loss
    :param pred: predictions
    :param target: ground truth
    :return: mean loss over the batch
    """
    return -np.mean(target * np.log(pred) + (1 - target) * np.log(1 - pred))
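# bce_loss is the binary cross-entropy averaged over the batch:
#   L = -(1/n) * sum_i [ y_i log(p_i) + (1 - y_i) log(1 - p_i) ]
# It is undefined when a prediction hits exactly 0 or 1; clipping pred into
# (eps, 1 - eps) before taking the log is a common safeguard.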
class LogisticRegression:
    """
    Logistic regression classifier
    """
    def __init__(self, x, y, val_x, val_y, epoch=100, lr=0.1, normalize=True, regularize=None, scale=0, show=True):
        """
        Initialization
        :param x: samples, (sample_number, dimension)
        :param y: labels, (sample_number, 1)
        :param epoch: number of training iterations
        :param lr: learning rate
        """
        self.theta = None
        self.loss = []
        self.val_loss = []
        self.n = x.shape[0]
        self.d = x.shape[1]
        self.epoch = epoch
        self.lr = lr
        t = np.ones(shape=(self.n, 1))
        self.normalize = normalize
        if self.normalize:
            self.x_std = x.std(axis=0)
            self.x_mean = x.mean(axis=0)
            self.y_mean = y.mean(axis=0)
            self.y_std = y.std(axis=0)
            x = (x - self.x_mean) / self.x_std
        self.y = y
        self.x = np.concatenate((t, x), axis=1)
        self.val_x = val_x
        self.val_y = val_y
        self.regularize = regularize
        self.scale = scale
        self.show = show
    def init_theta(self):
        """
        Initialize the parameters
        :return: theta (1, d+1)
        """
        self.theta = np.zeros(shape=(1, self.d + 1))
    def gradient_descent(self, pred):
        """
        One gradient descent update step
        """
        # error (n, 1)
        error = pred - self.y
        # term (d+1, 1)
        term = np.matmul(self.x.T, error)
        # term (1, d+1)
        term = term.T
        if self.regularize == "L2":
            re = self.scale / self.n * self.theta[0, 1:]
            re = np.expand_dims(np.array(re), axis=0)
            re = np.concatenate((np.array([[0]]), re), axis=1)
            # re = [0, ...] (1, d+1); the bias term is not regularized
            self.theta = self.theta - self.lr * (term / self.n + re)
        else:
            self.theta = self.theta - self.lr * (term / self.n)
    def validation(self, x, y):
        if self.normalize:
            # standardize with the statistics of the training set, not of the validation set
            x = (x - self.x_mean) / self.x_std
        outputs = self.get_prob(x)
        curr_loss = bce_loss(outputs, y)
        if self.regularize == "L2":
            curr_loss += self.scale / self.n * np.sum(self.theta[0, 1:] ** 2)
        self.val_loss.append(curr_loss)
        predicted = np.expand_dims(np.where(outputs[:, 0] > 0.5, 1, 0), axis=1)
        count = np.sum(predicted == y)
        if self.show:
            print("Accuracy on Val set: {:.2f}%\tLoss on Val set: {:.4f}".format(count / y.shape[0] * 100, curr_loss))
    def test(self, x, y):
        outputs = self.get_prob(x)
        predicted = np.expand_dims(np.where(outputs[:, 0] > 0.5, 1, 0), axis=1)
        count = np.sum(predicted == y)
        return count / y.shape[0]
    def train(self):
        """
        Train the logistic regression model
        :return: parameter matrix theta (1, d+1); training losses; validation losses
        """
        self.init_theta()
        for i in range(self.epoch):
            # z (n, 1); theta (1, d+1); self.x.T (d+1, n)
            z = np.matmul(self.theta, self.x.T).T
            # pred (n, 1)
            pred = sigmoid(z)
            curr_loss = bce_loss(pred, self.y)
            if self.regularize == "L2":
                curr_loss += self.scale / self.n * np.sum(self.theta[0, 1:] ** 2)
            self.loss.append(curr_loss)
            self.gradient_descent(pred)
            if self.show:
                print("Epoch: {}/{}, Train Loss: {:.4f}".format(i + 1, self.epoch, curr_loss))
            self.validation(self.val_x, self.val_y)
        if self.normalize:
            # rescale theta back to the original feature scale; since the standardized
            # features have zero mean, the mean of z approximates the intercept
            y_mean = np.mean(z, axis=0)
            self.theta[0, 1:] = self.theta[0, 1:] / self.x_std.T
            self.theta[0, 0] = y_mean - np.dot(self.theta[0, 1:], self.x_mean.T)
        return self.theta, self.loss, self.val_loss
    def get_prob(self, x):
        """
        Predict probabilities
        :param x: input samples (n, d)
        :return: predicted probabilities (n, 1)
        """
        t = np.ones(shape=(x.shape[0], 1))
        x = np.concatenate((t, x), axis=1)
        pred = sigmoid(np.matmul(self.theta, x.T))
        return pred.T

    def get_inner_product(self, x):
        t = np.ones(shape=(x.shape[0], 1))
        x = np.concatenate((t, x), axis=1)
        return np.matmul(self.theta, x.T)

    def predict(self, x):
        prob = self.get_prob(x)
        return np.expand_dims(np.where(prob[:, 0] > 0.5, 1, 0), axis=1)
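A minimal usage sketch of the class above (an illustration added here, using hypothetical synthetic data rather than anything from the original notes):

np.random.seed(0)
# two Gaussian blobs as a toy binary classification problem
x0 = np.random.randn(100, 2) + np.array([2, 2])
x1 = np.random.randn(100, 2) - np.array([2, 2])
x = np.vstack((x0, x1))
y = np.vstack((np.ones((100, 1)), np.zeros((100, 1))))
model = LogisticRegression(x, y, val_x=x, val_y=y, epoch=200, lr=0.1, show=False)
theta, loss, val_loss = model.train()
print("train accuracy:", model.test(x, y))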
Neural Networks
Converting multi-class classification into multiple binary classification problems, using binary cross-entropy as the loss function:
The formulas and the derivation of the approach are shown in the figure below:
The code is as follows:
import copy
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
"""
mse:使用均方误差
ce:将多分类问题转化为多个二分类问题
soft:多分类交叉熵,最后一层输出后再使用softmax函数
"""
mse_list = []
ce_list = []
soft_list = []
# Activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of the activation; expects the activation value a = sigmoid(z), since sigmoid'(z) = a * (1 - a)
def sigmoid_derivative(x):
    return x * (1 - x)

# Mean squared error
def mse_function(y, pred_y):
    mse = (np.sum(pow((pred_y - y), 2)) / len(pred_y)) / 2
    return mse

# Cross-entropy when the multi-class problem is treated as multiple binary problems
def ce_function(y, pred_y):
    cross_entropy = -np.sum(y * np.log(pred_y) + (1 - y) * np.log(1 - pred_y))
    return cross_entropy

# Multi-class cross-entropy (softmax is applied to the raw outputs first)
def soft_function(y, pred_y):
    pred_y = softmax(pred_y)
    soft_y = -np.sum(y * np.log(pred_y))
    return soft_y

def softmax(pred_y):
    denominator = np.sum(np.exp(pred_y))
    pred_y = np.exp(pred_y) / denominator
    return pred_y
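# The softmax above exponentiates its inputs directly, which can overflow for
# large values. A numerically safer variant (a standard trick, not part of the
# original listing) subtracts the maximum first; the result is mathematically
# identical:
def softmax_stable(pred_y):
    shifted = pred_y - np.max(pred_y)  # shifting the inputs leaves softmax unchanged
    exp_y = np.exp(shifted)
    return exp_y / np.sum(exp_y)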
class NeuralNetwork:
    def __init__(self, layer, times, alpha, epsilon):
        self.layer = layer
        self.times = times
        self.alpha = alpha
        # Initialize the weights of the hidden and output layers;
        # shapes (100, 65) and (10, 101) for layer = [64, 100, 10]
        self.ce_weights = []
        for tier in range(len(layer) - 1):
            self.ce_weights.append(np.random.rand(layer[tier + 1], layer[tier] + 1) * 2 * epsilon - epsilon)
        self.mse_weights = copy.deepcopy(self.ce_weights)
        self.soft_weights = copy.deepcopy(self.ce_weights)
    # Forward propagation: returns the activations of every layer
    def forward_propagation(self, feature_one, for_weights):
        activators = [feature_one.reshape(1, -1)]  # the activation of the input layer is X itself (1, 65)
        for forward_layer in range(len(for_weights)):
            activator = sigmoid(np.dot(activators[forward_layer], for_weights[forward_layer].T))  # e.g. a3 = sigmoid(z3) = sigmoid(a2 @ theta2.T)
            if forward_layer < len(for_weights) - 1:
                # for non-output layers, prepend a bias unit whose output is fixed to 1
                activator = np.append(np.array([1]), activator)  # (1, 101)
            # activators: (1, 65) (1, 101) (1, 10)
            activators.append(activator.reshape(1, -1))
        return activators
    # Back propagation using the cross-entropy loss
    def ce_back_propagation(self, activators, target_one, back_weights):
        # Compute the error delta of every layer for this sample, back to front
        deltas_error = [0 for _ in range(len(back_weights))]
        # delta = aL - y; the sign is flipped here (target - output), which is
        # compensated by using + instead of - in the final gradient update
        error = target_one - activators[-1]
        deltas = [error]
        for j in range(len(back_weights) - 1, 0, -1):
            delta = np.dot(back_weights[j].T, deltas[-1].T).T * sigmoid_derivative(activators[j])
            deltas.append(delta)
        deltas.reverse()  # deltas: (1, 101) (1, 10)
        # Accumulate the gradient matrices (capital delta)
        for j in range(len(back_weights)):
            de_error = np.dot(deltas[j].reshape(-1, 1), activators[j])
            if j < len(back_weights) - 1:
                # drop the row that corresponds to the bias unit of the next layer
                de_err = de_error[1:]
            else:
                de_err = de_error
            deltas_error[j] = de_err
        return deltas_error
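    # In formulas: with sigmoid activations, the quantities computed above are
    #   delta_L = a_L - y                      (stored negated here),
    #   delta_l = (Theta_l^T delta_{l+1}) * a_l * (1 - a_l),
    #   Delta_l = delta_{l+1} a_l^T            (the accumulated gradient)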
    # Update the parameters
    def update_parameters(self, deltas_error_list, up_weights, data_num, gamma):
        for ly in range(len(up_weights)):
            # Regularize every weight except those of the bias units: the first
            # column (bias gradient / bias weight) is excluded from the penalty
            deltas_error_list_regular = deltas_error_list[ly][:, 1:]
            weights_regular = up_weights[ly][:, 1:]
            # gradient = accumulated errors / sample count + gamma * weights
            d_part = deltas_error_list_regular / data_num + gamma * weights_regular
            der = np.hstack(
                (((deltas_error_list[ly][:, 0].reshape(-1, 1)) / data_num),
                 d_part))
            # normally theta = theta - gradient, but the deltas were computed with
            # the opposite sign, so the update adds here
            up_weights[ly] = up_weights[ly] + self.alpha * der
        return up_weights
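    # compute_error is referenced in fit() below but not defined in these notes;
    # a minimal sketch (an assumption): recompute the regularized cross-entropy
    # over all samples with the current weights and record it in the global
    # ce_list, mirroring how the first-pass error is recorded in fit()
    def compute_error(self, feature_x, target, m, gamma):
        ce_error = 0
        for i in range(m):
            activators = self.forward_propagation(feature_x[i], self.ce_weights)
            ce_error += ce_function(target[i], activators[-1])
        ce_regu = [np.sum(np.power(w, 2)) for w in self.ce_weights]
        ce_list.append((ce_error + gamma * np.sum(ce_regu) / 2) / m)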
    def fit(self, feature, target, gamma, batch_num):
        feature_x0 = np.ones((np.shape(feature)[0], 1))
        feature_x = np.hstack((feature_x0, feature))  # shape (1257, 65)
        m = len(feature_x)
        iters = 0
        # First pass: forward and backward propagation over all samples, then one update
        ce_error = 0
        ce_deltas_error_list = [np.zeros_like(w) for w in self.ce_weights]
        for i in range(m):
            # Forward propagation gives the activations of every layer and the initial ce value
            activators = self.forward_propagation(feature_x[i], self.ce_weights)
            ce = ce_function(target[i], activators[-1])
            ce_error = ce_error + ce  # accumulate J(theta)
            # Backward propagation with the cross-entropy loss: accumulate the gradients (capital delta)
            ce_deltas_error = self.ce_back_propagation(activators, target[i], self.ce_weights)
            ce_deltas_error_list = [acc + d for acc, d in zip(ce_deltas_error_list, ce_deltas_error)]
        # Record the regularized ce error after the first forward pass, before the weights are updated
        ce_regu = []
        for w in range(len(self.ce_weights)):
            ce_weights_re = np.sum(np.power(self.ce_weights[w], 2))
            ce_regu.append(ce_weights_re)
        ce_regular = (ce_error + gamma * np.sum(ce_regu) / 2) / m
        ce_list.append(ce_regular)
        # Update the parameters of every layer from the back-propagation results
        ce_weights = self.update_parameters(ce_deltas_error_list, self.ce_weights, m, gamma)
        self.ce_weights = ce_weights
        # Record the regularized ce error over all samples after the first update
        self.compute_error(feature_x, target, m, gamma)
        # Mini-batch updates
        print("batch----------------------------------------------------------")
        while iters < self.times:
            rand_index = np.random.randint(0, m, size=batch_num)
            feature_batch = feature_x[rand_index]
            target_batch = target[rand_index]
            ce_error = 0
            ce_deltas_error_list = [np.zeros_like(w) for w in self.ce_weights]
            for i in range(batch_num):
                # Forward propagation gives the activations of every layer and the ce value
                ce_activators = self.forward_propagation(feature_batch[i], self.ce_weights)
                ce = ce_function(target_batch[i], ce_activators[-1])
                ce_error = ce_error + ce
                # Backward propagation with the cross-entropy loss: accumulate the gradients
                ce_deltas_error = self.ce_back_propagation(ce_activators, target_batch[i], self.ce_weights)
                ce_deltas_error_list = [acc + d for acc, d in zip(ce_deltas_error_list, ce_deltas_error)]
            # Update the parameters of every layer from the back-propagation results
            ce_weights = self.update_parameters(ce_deltas_error_list, self.ce_weights, batch_num, gamma)
            self.ce_weights = ce_weights
            # Record the regularized ce error over all samples after this update
            self.compute_error(feature_x, target, m, gamma)
            iters += 1
            if iters % 500 == 0:
                print(iters)
        return self.ce_weights
# Prediction
def predict(feature, target, target_lb, ce_w, gamma):
    feature_x0 = np.ones((np.shape(feature)[0], 1))
    feature_x = np.hstack((feature_x0, feature))  # shape (540, 65)
    data_num = len(feature_x)
    ce_error = 0
    ce_predict_value_list = []
    for i in range(data_num):
        ce_activators = nn.forward_propagation(feature_x[i], ce_w)
        # Digit-valued prediction, used later to compute the accuracy
        ce_pred = ce_activators[-1]
        ce_index_value = np.argmax(ce_pred)
        ce_predict_value_list.append(ce_index_value)
        # Accumulate the cross-entropy
        ce = ce_function(target_lb[i], ce_activators[-1])
        ce_error = ce_error + ce
    # Regularized ce error
    ce_regu = []
    for w in range(len(ce_w)):
        ce_weights_re = np.sum(np.power(ce_w[w], 2))
        ce_regu.append(ce_weights_re)
    ce_regular = (ce_error + gamma * np.sum(ce_regu) / 2) / data_num
    # Accuracy
    ce_judge = np.array(ce_predict_value_list) == np.array(target)
    ce_prec = np.sum(ce_judge) / len(ce_judge)
    print(ce_judge)
    return ce_regular, ce_prec
def plot(ce):
    plt.figure()
    # Plot every 50th error value
    x_data = list(range(0, len(ce), 50))
    ce_re_part = [ce[i] for i in x_data]
    plt.plot(x_data, ce_re_part, lw=1, c='green',
             marker='o', ms=4, label="binary cross-entropy")
    plt.xlabel("iterations")
    plt.ylabel("error")
    plt.title("Handwritten Digit Prediction - Neural Network")
    plt.legend()
    plt.show()
if __name__ == "__main__":
    # Load the handwritten digits dataset
    digits = datasets.load_digits()
    # Normalize the data to [0, 1]
    range_value = np.max(digits.data) - np.min(digits.data)
    data = (digits.data - np.min(digits.data)) / range_value
    # Split into training and test sets
    train_feature, test_feature, train_target, test_target = train_test_split(data, digits.target, test_size=0.3)
    train_target_lb = LabelBinarizer().fit_transform(train_target)
    test_target_lb = LabelBinarizer().fit_transform(test_target)
    layer = [64, 100, 10]
    times = 8000  # number of iterations
    alphas = 0.02  # learning rate
    epsilon = 1  # weights are initialized in [-epsilon, epsilon]
    nn = NeuralNetwork(layer, times, alphas, epsilon)  # a three-layer network
    gamma = 0.0001  # regularization coefficient
    batch_num = 20
    ce_weights = nn.fit(train_feature, train_target_lb, gamma, batch_num)
    ce_re, ce_precision = predict(test_feature, test_target, test_target_lb, ce_weights, gamma)
    print("ce_re:{0}".format(ce_re))
    print("ce_precision:{0}".format(ce_precision))
    plot(ce_list)
Note: there are still a few small bugs, most likely matrix shapes that do not line up.
When using multi-class cross-entropy as the loss function:
To be completed.
When using mean squared error as the loss function:
To be completed.
Unsupervised Learning
Clustering: the K-means Algorithm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
dataset = pd.read_csv('watermelon.csv', delimiter=",")
data = dataset.values
print(dataset)
def distance(x1, x2):  # squared Euclidean distance
    return np.sum((x1 - x2) ** 2)
def Kmeans(D, K, maxIter):
    m, n = np.shape(D)
    if K >= m:
        # degenerate case: every sample is its own centroid
        return D, np.arange(m), 0
    initSet = set()
    curK = K
    while curK > 0:  # randomly pick K distinct samples as the initial centroids
        randomInt = random.randint(0, m - 1)
        if randomInt not in initSet:
            curK -= 1
            initSet.add(randomInt)
    U = D[list(initSet), :]  # mean vectors, i.e. the centroids
    C = np.zeros(m)  # cluster assignment of each sample
    curIter = maxIter  # maximum number of iterations
    while curIter > 0:
        curIter -= 1
        # assign every sample to its nearest centroid
        for i in range(m):
            p = 0
            minDistance = distance(D[i], U[0])
            for j in range(1, K):
                if distance(D[i], U[j]) < minDistance:
                    p = j
                    minDistance = distance(D[i], U[j])
            C[i] = p
        # recompute the centroids as the means of their clusters
        newU = np.zeros((K, n))
        cnt = np.zeros(K)
        for i in range(m):
            newU[int(C[i])] += D[i]
            cnt[int(C[i])] += 1
        changed = 0
        # if no centroid moved, the algorithm has converged; otherwise keep iterating
        for i in range(K):
            newU[i] /= cnt[i]
            for j in range(n):
                if U[i, j] != newU[i, j]:
                    changed = 1
                    U[i, j] = newU[i, j]
        if changed == 0:
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter
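# The loop above alternates the two standard K-means steps (assign each point to
# its nearest centroid, then recompute each centroid as its cluster mean), which
# monotonically decreases the objective
#   J = sum_k sum_{x_i in C_k} ||x_i - mu_k||^2
# Caveat: an empty cluster would make cnt[i] zero and newU[i] /= cnt[i] yield
# NaN; initializing from distinct samples makes this unlikely but not impossible.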
U, C, iter = Kmeans(data, 3, 20)
f1 = plt.figure(1)
plt.title('watermelon')
plt.xlabel('density')
plt.ylabel('ratio')
plt.scatter(data[:, 0], data[:, 1], marker='o', color='g', s=50)
plt.scatter(U[:, 0], U[:, 1], marker='o', color='r', s=100)
m, n = np.shape(data)
for i in range(m):
    plt.plot([data[i, 0], U[int(C[i]), 0]], [data[i, 1], U[int(C[i]), 1]], "c--", linewidth=0.3)
plt.show()