数据分析实战

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans  # 引入大哥
import pickle
import csv
# Full list of bank-dataset column names (presumably the raw CSV header order;
# nothing in the visible code reads this list).
title = [
    'age', 'job', 'marital', 'education', 'default', 'balance',
    'housing', 'loan', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
]
# Columns used for selection and clustering: everything except the two
# date fields ('day' and 'month').
selectTitle = [column for column in title if column not in ('day', 'month')]
# Module-level placeholder list; it starts empty.
jobItems = list()


class BankData():
    """File-based pipeline that clusters bank customers with K-Means.

    Each public method is one stage: it reads the file it is given and writes
    its result to a fixed path under ``data/`` for the next stage to consume
    (detect -> clean -> choose -> transform -> standardize -> classify).

    NOTE(review): several stages write ``.xls`` files. That needs an Excel
    writer engine supporting the legacy format, which recent pandas releases
    may no longer provide - confirm against the target environment.
    """

    # Text-valued columns that transformData replaces with integer codes.
    _categoricalColumns = (
        'job', 'marital', 'education', 'default', 'housing',
        'loan', 'contact', 'poutcome', 'deposit',
    )

    def detectData(self, filePath):
        """Dump the raw data and its summary statistics to Excel for inspection.

        Args:
            filePath: Path of the raw CSV file.

        Writes ``data/bank_data.xls`` (the data as read) and
        ``data/bank_describe.xls`` (transposed ``describe(include='all')``).
        """
        df = pd.read_csv(filePath)
        describe = df.describe(include='all')
        df.to_excel('data/bank_data.xls')
        describe.T.to_excel('data/bank_describe.xls')

    def cleanData(self, filePath):
        """Cleaning stage; intentionally a no-op.

        The original author inspected the data and found nothing that needed
        cleaning, so this stage reads and writes nothing.

        Args:
            filePath: Path of the raw CSV file (unused).
        """
        pass

    def chooseData(self, cleanedFilePath):
        """Keep only the columns listed in ``selectTitle`` (drops the date fields).

        Args:
            cleanedFilePath: Path of the Excel file holding the cleaned data.

        Writes the selected columns to ``data/bank_coredata.csv``.
        """
        df = pd.read_excel(cleanedFilePath)
        # Use the shared column list instead of a second hand-typed copy.
        df = df[selectTitle]
        df.to_csv('data/bank_coredata.csv')

    def transformData(self, filePath):
        """Replace every text-valued column with integer codes.

        Args:
            filePath: Path of the CSV written by ``chooseData``.

        Writes the encoded table to ``data/bank_coreTransformData.xls``.
        """
        # Load the frame from the file we were given. The previous code read
        # 'data/bank_coredata.xls', which no stage ever writes.
        df = pd.read_csv(filePath)
        df = self._encodeCategories(df, self._categoricalColumns)
        df.to_excel('data/bank_coreTransformData.xls')

    @staticmethod
    def _encodeCategories(df, columns):
        """Return a copy of ``df`` with ``columns`` mapped to integer codes.

        Codes are the positions of each column's distinct values (compared as
        strings) in sorted order, so the encoding is identical on every run;
        ordering taken from a ``set`` varies with string hash randomisation.
        """
        encoded = df.copy()
        for column in columns:
            values = encoded[column].astype(str)
            codes = {
                value: code
                for code, value in enumerate(sorted(values.unique()))
            }
            encoded[column] = values.map(codes)
        return encoded

    def standardData(self, filePath):
        """Standardize every column to zero mean and unit variance.

        Args:
            filePath: Path of the Excel file written by ``transformData``.

        Writes the standardized table to ``data/bankStdCoreData.xls``.
        """
        df = pd.read_excel(filePath)
        df = self._standardize(df)
        df.to_excel('data/bankStdCoreData.xls')

    @staticmethod
    def _standardize(df):
        """Return ``(df - mean) / std`` per column, using the population std."""
        mean = df.mean(axis=0)
        std = df.std(axis=0, ddof=0)
        # A constant column has std 0; divide by 1 instead so it becomes all
        # zeros rather than NaN, which K-Means cannot consume.
        std = std.replace(0, 1)
        return (df - mean) / std

    @staticmethod
    def _radarAngles(featureCount):
        """Return ``featureCount`` evenly spaced angles plus the first repeated.

        Repeating the first angle closes the polygon drawn on the polar axes.
        """
        angles = np.linspace(0, 2 * np.pi, featureCount, endpoint=False)
        return np.concatenate((angles, angles[:1]))

    def classifyData(self, filePath, k=15):
        """Cluster the standardized data and draw the centres as a radar chart.

        Args:
            filePath: Path of the Excel file written by ``standardData``.
            k: Number of K-Means clusters. Any positive value accepted by
                ``KMeans`` works; the plot is no longer tied to 15.

        Prints the cluster centres and opens a matplotlib window.
        """
        df = pd.read_excel(filePath)
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df[selectTitle])
        # Per-row cluster label.
        df['label'] = kmeans.labels_
        # Cluster centres: one row per cluster, one column per feature.
        coreData = np.array(kmeans.cluster_centers_)
        print(coreData)

        # One spoke per feature (not per cluster): every centre has
        # coreData.shape[1] values to place around the circle.
        angles = self._radarAngles(coreData.shape[1])

        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        # A qualitative colormap gives each cluster its own visible colour.
        palette = plt.get_cmap('tab20')
        for index, center in enumerate(coreData):
            closed = np.concatenate((center, center[:1]))
            ax.plot(
                angles,
                closed,
                linestyle='--',
                linewidth=1,
                color=palette(index % palette.N),
                label='customer%d' % (index + 1),
            )
        # Label each spoke with its feature name; the closing duplicate angle
        # is skipped so angles and labels have the same length.
        ax.set_thetagrids(angles[:-1] * 180 / np.pi, selectTitle)
        plt.legend(loc='best')
        plt.show()

if __name__ == '__main__':
    # The pipeline is run one stage at a time. The earlier stages are kept
    # below, disabled, in the order they were executed:
    #   BankData().detectData('data/bank.csv')
    #   BankData().cleanData('data/bank.csv')
    #   BankData().chooseData('data/bank_data.xls')
    #   BankData().transformData('data/bank_coredata.csv')
    #   BankData().standardData('data/bank_coreTransformData.xls')
    # Only the final clustering stage is currently enabled.
    BankData().classifyData('data/bankStdCoreData.xls')


运行结果如下:

(此处原为运行结果截图,图片未能保留)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值