import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans # 引入大哥
import pickle
import csv
# Full list of column names used for the bank dataset.
title = [
    'age', 'job', 'marital', 'education', 'default', 'balance',
    'housing', 'loan', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
]

# Columns kept for analysis: everything in `title` except the two date
# parts ('day' and 'month'), in the same order.
selectTitle = [name for name in title if name not in ('day', 'month')]

# Module-level list, initially empty.
jobItems = []
class BankData():
    """Staged preparation and k-means clustering of the bank dataset.

    Each public method is one stage. Stages communicate through files
    written under ``data/``, so they can be run one at a time:
    detectData -> cleanData -> chooseData -> transformData ->
    standardData -> classifyData.

    NOTE(review): several stages write legacy ``.xls`` workbooks. Recent
    pandas releases may no longer ship a writer for that format; confirm
    against the pandas version in use before relying on these outputs.
    """

    # Text-valued columns that transformData replaces with integer codes.
    _categoricalColumns = (
        'job', 'marital', 'education', 'default', 'housing',
        'loan', 'contact', 'poutcome', 'deposit',
    )

    def detectData(self, filePath):
        """Dump the raw CSV and its summary statistics to Excel workbooks.

        Args:
            filePath: path of the raw CSV file.

        Writes ``data/bank_data.xls`` (the data as read) and
        ``data/bank_describe.xls`` (transposed ``describe(include='all')``).
        """
        df = pd.read_csv(filePath)
        describe = df.describe(include='all')
        df.to_excel('data/bank_data.xls')
        describe.T.to_excel('data/bank_describe.xls')

    def cleanData(self, filePath):
        """Placeholder cleaning stage; intentionally does nothing.

        The original author inspected the data and found nothing that
        needed cleaning, so the method is kept only to preserve the
        pipeline's shape.
        """

    def chooseData(self, cleanedFilePath):
        """Keep only the analysis columns and save them as CSV.

        Args:
            cleanedFilePath: path of the Excel workbook to read.

        Writes ``data/bank_coredata.csv`` containing the columns listed in
        the module-level ``selectTitle`` (everything except the date parts).
        """
        df = pd.read_excel(cleanedFilePath)
        # Use the shared column list instead of a second hand-written copy
        # so that the selection cannot drift from what classifyData expects.
        df = df[list(selectTitle)]
        df.to_csv('data/bank_coredata.csv')

    def transformData(self, filePath):
        """Replace text-valued columns with integer codes.

        Args:
            filePath: path of the CSV file produced by chooseData.

        Writes ``data/bank_coreTransformData.xls``.

        The frame is loaded from ``filePath`` itself. Previously it was read
        from a hard-coded ``.xls`` path that no stage produces, while the
        category lists were taken from ``filePath``.
        """
        df = pd.read_csv(filePath)
        df = self._encodeCategoricals(df, self._categoricalColumns)
        df.to_excel('data/bank_coreTransformData.xls')

    @staticmethod
    def _encodeCategoricals(df, columns):
        """Return a copy of ``df`` with ``columns`` mapped to integer codes.

        Codes are positions in the sorted list of each column's distinct
        values, so the mapping is identical on every run (iteration order
        of a ``set`` of strings is not). Missing values get the code -1.
        """
        encoded = df.copy()
        for column in columns:
            codes, _ = pd.factorize(encoded[column], sort=True)
            encoded[column] = codes
        return encoded

    def standardData(self, filePath):
        """Standardise every column as (value - mean) / standard deviation.

        Args:
            filePath: path of the Excel workbook produced by transformData.

        Writes ``data/bankStdCoreData.xls``.
        """
        df = pd.read_excel(filePath)
        df = (df - np.mean(df, axis=0)) / np.std(df, axis=0)
        df.to_excel('data/bankStdCoreData.xls')

    def classifyData(self, filePath, k=15):
        """Cluster the standardised data and draw the centres as a radar chart.

        Args:
            filePath: path of the Excel workbook produced by standardData.
            k: number of k-means clusters. Any value accepted by KMeans
                works; the chart draws one line per cluster.

        Prints the array of cluster centres and opens a matplotlib window.
        """
        df = pd.read_excel(filePath)
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df[selectTitle])
        # Per-row cluster label; kept on the frame as in the original code.
        df['label'] = kmeans.labels_
        # One row per cluster, one column per feature in selectTitle.
        coreData = np.array(kmeans.cluster_centers_)
        print(coreData)
        self._plotRadar(coreData, selectTitle)

    @staticmethod
    def _closedRadarAngles(axisCount):
        """Return ``axisCount`` evenly spaced angles plus the first repeated.

        Repeating the first angle lets a plotted line return to its start
        and form a closed polygon.
        """
        angles = np.linspace(0, 2 * np.pi, axisCount, endpoint=False)
        return np.concatenate((angles, angles[:1]))

    def _plotRadar(self, centers, axisLabels):
        """Draw one closed dashed polygon per cluster centre on a polar axis.

        The number of spokes follows the number of features, not the number
        of clusters, and the number of lines follows ``len(centers)``.
        """
        angles = self._closedRadarAngles(len(axisLabels))
        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        # A qualitative palette gives each cluster a visible colour; the
        # old format strings repeated colours and included white.
        palette = plt.get_cmap('tab20')
        for index, center in enumerate(centers):
            closed = np.concatenate((center, center[:1]))
            ax.plot(
                angles, closed, '--', linewidth=1,
                color=palette(index % palette.N),
                label='customer{}'.format(index + 1),
            )
        # Label each spoke once; the duplicated closing angle is dropped.
        ax.set_thetagrids(np.degrees(angles[:-1]), list(axisLabels))
        plt.legend(loc='best')
        plt.show()
if __name__ == '__main__':
    bank = BankData()
    # Preceding stages are currently disabled; uncomment to rerun them
    # in this order.
    # bank.detectData('data/bank.csv')
    # bank.cleanData('data/bank.csv')
    # bank.chooseData('data/bank_data.xls')
    # bank.transformData('data/bank_coredata.csv')
    # bank.standardData('data/bank_coreTransformData.xls')
    bank.classifyData(filePath='data/bankStdCoreData.xls')
运行结果如下: