机器学习实战：利用K-均值聚类算法对未标注数据分组_机器学习实战:利用k-均值聚类算法对未标注数据分组-CSDN博客

本文链接：https://blog.csdn.net/qq_26094481/article/details/107668242
本文深入探讨了如何使用K-均值聚类算法对未标注数据进行有效分组。通过Python实现，详细解释了算法步骤，包括数据预处理、选择合适的K值、迭代过程以及结果评估。该方法在数据挖掘和无监督学习中具有广泛应用。
摘要由CSDN通过智能技术生成
from numpy import *

def loadDataSet(fileName):
	"""Parse a tab-delimited text file of numbers into a list of float rows.

	Each line is stripped of leading/trailing whitespace, split on tab
	characters, and every field is converted with float().

	Args:
		fileName: path of the text file to read.

	Returns:
		A list with one entry per line; each entry is a list of floats.

	Raises:
		ValueError: if a field cannot be converted by float() (this includes
			a blank line, which yields a single empty field).
	"""
	dataMat = []
	# The with-block closes the file even if a conversion fails part-way.
	with open(fileName) as fr:
		for line in fr:
			curLine = line.strip().split('\t')
			# list(...) materialises the row: on Python 3 map() returns a lazy
			# iterator, which would be stored instead of the numbers.
			fltLine = list(map(float, curLine))
			dataMat.append(fltLine)
	return dataMat

def distEclud(vecA, vecB):
	"""Return the Euclidean distance between vecA and vecB.

	Both arguments must support element-wise subtraction (numpy arrays or
	matrices of matching shape). The result is the square root of the sum
	of the squared element-wise differences.
	"""
	difference = vecA - vecB
	squared = power(difference, 2)
	return sqrt(sum(squared))

def randCent(dataSet, k):
	"""Create k random centroids inside the bounding box of dataSet.

	Args:
		dataSet: 2-D numpy matrix or array, indexed as dataSet[:,j]; its
			second dimension gives the number of features n.
		k: number of centroids to generate.

	Returns:
		A (k, n) numpy matrix. Column j holds k values computed as
		min_j + range_j * r, with r drawn by random.rand from [0, 1), where
		min_j and range_j are the minimum and the max-min spread of column j.
	"""
	n = shape(dataSet)[1]
	# asmatrix is used instead of the mat alias, which NumPy 2.0 removed.
	centroids = asmatrix(zeros((k,n)))
	for j in range(n):
		column = dataSet[:,j]
		# The .min()/.max() methods return plain scalars, so no 1x1 matrix
		# has to be converted by float(), and the result does not depend on
		# whether min/max currently name the builtins or numpy functions.
		minJ = column.min()
		rangeJ = float(column.max() - minJ)
		centroids[:,j] = asmatrix(minJ + rangeJ * random.rand(k,1))
	return centroids
	
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
	"""Cluster the rows of dataSet into k groups by iterative reassignment.

	Every pass assigns each row to its nearest centroid, then moves each
	centroid to the mean of the rows assigned to it. The loop stops after a
	pass in which no row changed cluster. The current centroids are printed
	once per pass.

	Args:
		dataSet: 2-D numpy matrix, one sample per row (rows are read as
			dataSet[i,:]).
		k: number of clusters.
		distMeas: callable(vecA, vecB) returning the distance between a
			centroid row and a data row.
		createCent: callable(dataSet, k) returning the initial centroids as a
			matrix with k rows.

	Returns:
		(centroids, clusterAssment): centroids is the matrix returned by
		createCent, updated in place; clusterAssment is an (m, 2) matrix whose
		first column is the assigned cluster index and whose second column is
		the squared distance to that centroid.
	"""
	m = shape(dataSet)[0]
	# asmatrix is used instead of the mat alias, which NumPy 2.0 removed.
	clusterAssment = asmatrix(zeros((m,2)))
	# Start from an index no cluster can have. With the zero-filled default,
	# a first pass that puts every row in cluster 0 would look unchanged and
	# end the loop before the rows are re-checked against the moved centroid.
	clusterAssment[:,0] = -1
	centroids = createCent(dataSet, k)
	clusterChanged = True
	while clusterChanged:
		clusterChanged = False
		for i in range(m):
			# find the centroid closest to row i
			minDist = inf; minIndex = -1
			for j in range(k):
				distJI = distMeas(centroids[j,:],dataSet[i,:])
				if distJI < minDist:
					minDist = distJI; minIndex = j
			# any row switching cluster forces another pass
			if clusterAssment[i,0] != minIndex: clusterChanged = True
			clusterAssment[i,:] = minIndex,minDist**2
		print (centroids)
		for cent in range(k):
			# rows whose assigned index equals cent (.A gives the ndarray view)
			ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]
			# An empty cluster keeps its previous centroid: the mean of zero
			# rows is NaN, which would make the centroid unusable.
			if shape(ptsInClust)[0] > 0:
				centroids[cent,:] = mean(ptsInClust, axis=0)
	return centroids, clusterAssment

def biKmeans(dataSet, k, distMeas=distEclud):
	"""Cluster dataSet into k groups by repeatedly bisecting one cluster.

	All rows start in a single cluster centred on the column means. While
	fewer than k centroids exist, every current cluster is tentatively split
	in two with kMeans, and the split giving the lowest total squared error
	(split cluster plus all untouched clusters) is kept. Progress information
	is printed along the way.

	Args:
		dataSet: 2-D numpy matrix, one sample per row (the code relies on
			mean(dataSet, axis=0).tolist()[0] being a full row).
		k: number of clusters to produce.
		distMeas: callable(vecA, vecB) returning a distance; it is also
			passed on to kMeans.

	Returns:
		(centroids, clusterAssment): a matrix built from the list of
		centroids, and an (m, 2) matrix holding the cluster index and squared
		distance for every row.
	"""
	m = shape(dataSet)[0]
	# asmatrix is used instead of the mat alias, which NumPy 2.0 removed.
	clusterAssment = asmatrix(zeros((m,2)))
	centroid0 = mean(dataSet, axis=0).tolist()[0]
	centList =[centroid0] # start with a single centroid
	for j in range(m): # initial squared error of every row
		clusterAssment[j,1] = distMeas(asmatrix(centroid0), dataSet[j,:])**2
	while (len(centList) < k):
		lowestSSE = inf
		for i in range(len(centList)):
			# rows currently assigned to cluster i
			ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
			# A cluster without rows cannot be split; handing it to kMeans
			# would fail while computing the initial centroids.
			if shape(ptsInCurrCluster)[0] == 0:
				continue
			centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
			print(i)
			sseSplit = sum(splitClustAss[:,1]) # error inside the two halves
			sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
			print ("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
			# keep the candidate split with the lowest total error
			if (sseSplit + sseNotSplit) < lowestSSE:
				bestCentToSplit = i
				bestNewCents = centroidMat
				bestClustAss = splitClustAss.copy()
				lowestSSE = sseSplit + sseNotSplit
		# Relabel the two halves: half 1 gets the next unused index, half 0
		# keeps the index of the cluster that was split.
		bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
		bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
		print ('the bestCentToSplit is: ',bestCentToSplit)
		print ('the len of bestClustAss is: ', len(bestClustAss))
		# replace the split centroid with the first half and append the second
		centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
		centList.append(bestNewCents[1,:].tolist()[0])
		# write the new labels and errors back for the rows of the split cluster
		clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss
	return asmatrix(centList), clusterAssment

# 2.1 urllib.request 是请求模块；urllib.response 是响应处理模块，它包含在 urllib.request 模块中，用于处理响应，不需要我们单独处理。
# 2.2 urllib.parse 是url解析模块
# 2.3 urllib.error 是异常处理模块
# 2.4 urllib.robotparser 是 robots.txt 解析模块，用于通用爬虫。
import urllib
import json
def geoGrab(stAddress, city):
	"""Query the geocoding web service for an address and return the decoded JSON.

	The request URL is the service stem followed by the URL-encoded
	parameters 'flags', 'appid' and 'location' (the street address and city
	joined by a space). The URL is printed before the request is made.

	Args:
		stAddress: street address text.
		city: city text.

	Returns:
		The object produced by json.loads on the response body.
	"""
	# urlencode/urlopen live in urllib.parse and urllib.request on Python 3
	# and directly in urllib on Python 2; a bare "import urllib" exposes
	# neither urllib.urlencode nor urllib.urlopen on Python 3.
	try:
		from urllib.parse import urlencode
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlencode, urlopen
	# NOTE(review): confirm that this endpoint and appid are still valid.
	apiStem = 'http://where.yahooapis.com/geocode?'
	params = {}
	params['flags'] = 'J' # JSON return type
	params['appid'] = 'aaa0VN6k'
	params['location'] = '%s %s' % (stAddress, city)
	# turn the dict into a query string that can be appended to the URL
	url_params = urlencode(params)
	yahooApi = apiStem + url_params
	print (yahooApi)
	c = urlopen(yahooApi)
	try:
		# decode the JSON body returned by the service
		return json.loads(c.read())
	finally:
		# release the connection whether or not decoding succeeded
		c.close()

from time import sleep
def massPlaceFind(fileName):
	"""Geocode every line of a tab-delimited file and write results to places.txt.

	For each input line, fields 1 and 2 are passed to geoGrab. When the
	reply's ['ResultSet']['Error'] equals 0, the latitude and longitude of
	the first result are printed together with field 0, and the stripped
	input line followed by the two coordinates is appended to 'places.txt'.
	Otherwise "error fetching" is printed. The function sleeps one second
	after every line.

	Args:
		fileName: path of the tab-delimited input file; each line needs at
			least three fields.
	"""
	# The output file is opened first, as before; the with-block closes both
	# files even when a lookup or a conversion raises.
	with open('places.txt', 'w') as fw, open(fileName) as fr:
		for line in fr:
			line = line.strip()
			lineArr = line.split('\t')
			retDict = geoGrab(lineArr[1], lineArr[2])
			if retDict['ResultSet']['Error'] == 0:
				lat = float(retDict['ResultSet']['Results'][0]['latitude'])
				lng = float(retDict['ResultSet']['Results'][0]['longitude'])
				# The % formatting must be inside the print call: on Python 3
				# print(...) returns None and "None % tuple" raises TypeError.
				print ("%s\t%f\t%f" % (lineArr[0], lat, lng))
				fw.write('%s\t%f\t%f\n' % (line, lat, lng))
			else: print ("error fetching")
			# pause between requests
			sleep(1)
	
def distSLC(vecA, vecB):#Spherical Law of Cosines
	"""Return the great-circle distance between two points (spherical law of cosines).

	Args:
		vecA, vecB: 2-D numpy matrices/arrays read as vec[0,0] and vec[0,1],
			both in degrees. vec[0,1] feeds the sin/cos product terms and
			vec[0,0] the difference term, i.e. presumably (longitude,
			latitude) — confirm against callers.

	Returns:
		The central angle in radians multiplied by 6371.0 (presumably the
		Earth's mean radius in kilometres).
	"""
	a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180)
	b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * \
					  cos(pi * (vecB[0,0]-vecA[0,0]) /180)
	# Floating-point rounding can push a + b marginally outside [-1, 1]
	# (e.g. for two identical points), where arccos returns NaN; clamp first.
	return arccos(clip(a + b, -1.0, 1.0))*6371.0 #pi is imported with numpy

import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(numClust=5):
	"""Cluster the coordinates stored in places.txt and plot them over a map image.

	Reads 'places.txt', clusters the points with biKmeans using distSLC as
	the distance measure, then draws 'Portland.png' with the clustered points
	and the centroids (marker '+') on a second, frameless axes, and shows the
	figure.

	Args:
		numClust: number of clusters requested from biKmeans.
	"""
	datList = []
	# The with-block closes the file once all lines are read.
	with open('places.txt') as fr:
		for line in fr:
			lineArr = line.split('\t')
			# Tab fields 4 and 3, in that order — presumably longitude then
			# latitude as written by massPlaceFind for a three-field input
			# line; verify against the data file.
			datList.append([float(lineArr[4]), float(lineArr[3])])
	# asmatrix is used instead of the mat alias, which NumPy 2.0 removed.
	datMat = asmatrix(datList)
	myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
	fig = plt.figure()
	rect=[0.1,0.1,0.8,0.8]
	scatterMarkers=['s', 'o', '^', '8', 'p', \
					'd', 'v', 'h', '>', '<']
	axprops = dict(xticks=[], yticks=[])
	# first axes: the background image, without ticks
	ax0=fig.add_axes(rect, label='ax0', **axprops)
	imgP = plt.imread('Portland.png')
	ax0.imshow(imgP)
	# second axes on the same rectangle, frameless, for the scatter plots
	ax1=fig.add_axes(rect, label='ax1', frameon=False)
	for i in range(numClust):
		ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
		# markers are reused cyclically when there are more clusters than styles
		markerStyle = scatterMarkers[i % len(scatterMarkers)]
		ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
	ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
	plt.show()