python-pandas初识

最新推荐文章于 2024-03-25 17:24:13 发布

g1x2w3

最新推荐文章于 2024-03-25 17:24:13 发布

阅读量153

点赞数

文章标签： pandas

本文链接：https://blog.csdn.net/g1x2w3/article/details/100852276

版权

python-pandas初识

初学 pandas 库：对人力资源数据集进行清洗和标准化后，用 KMeans 聚类算法将数据分为 6 类，最后用雷达图展示各类的聚类中心。以下为代码实现。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Use the SimHei font so Chinese labels render correctly, and turn off the
# Unicode minus glyph so negative signs display properly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
class PersonData():
    """Prepare, cluster and plot a seven-feature HR data set.

    Each stage reads a file and writes its result to a fixed path under
    ``data1/``: ``explorData`` -> ``per_data.xls``, ``cleanData`` ->
    ``per_coredata.xls``, ``standarData`` -> ``per_standata.xls`` and
    ``classifyData`` -> ``per_findata.xls``.
    """

    # Short column names shared by every stage.
    _FEATURES = ['S', 'L', 'N', 'A', 'T', 'G', 'E']

    # One Matplotlib format string per cluster line, reused cyclically when
    # there are more clusters than entries.  'o' and 'p' are marker codes
    # (circle, pentagon), not colours, so those lines take their colour from
    # the default colour cycle.
    _LINE_STYLES = ['b--', 'r--', 'g--', 'o--', 'y--', 'p--']

    def explorData(self, filePath):
        """Pull the columns to be analysed out of the two raw CSV files.

        :param filePath: sequence of two CSV paths.  ``filePath[0]`` provides
            satisfaction_level, last_evaluation, number_project,
            average_montly_hours and time_spend_company; ``filePath[1]``
            (decoded as gb2312) provides age and Educational_time.
        :return: None; the selected columns are written to
            ``data1/per_data.xls``.
        """
        hr = pd.read_csv(filePath[0])
        # The second file contains Chinese text, so it is decoded as gb2312.
        personal = pd.read_csv(filePath[1], encoding='gb2312')

        short_names = {
            'satisfaction_level': 'S',
            'last_evaluation': 'L',
            'number_project': 'N',
            'average_montly_hours': 'A',
            'time_spend_company': 'T',
        }
        features = hr[list(short_names)].rename(columns=short_names)
        # Only the first 15000 rows of the second file are used; the
        # assignment aligns them with the first file by row index.
        features['G'] = personal['age'][:15000]
        features['E'] = personal['Educational_time'][:15000]
        features[self._FEATURES].to_excel('data1/per_data.xls')

    @staticmethod
    def _filter_valid(df):
        """Return the rows of ``df`` with 18 <= G <= 60 and E >= 6."""
        # Comparisons against missing values are False, so rows with a
        # missing G or E are dropped as well.
        valid = (df['G'] >= 18) & (df['G'] <= 60) & (df['E'] >= 6)
        return df[valid]

    def cleanData(self, filePath):
        """Drop rows whose age or education years are out of range.

        :param filePath: Excel file holding at least the columns G (age) and
            E (years of education).
        :return: None; the remaining rows are written to
            ``data1/per_coredata.xls``.
        """
        df = pd.read_excel(filePath)
        self._filter_valid(df).to_excel('data1/per_coredata.xls')

    @classmethod
    def _standardize(cls, df):
        """Return the feature columns of ``df`` as z-scores."""
        # Select the features first so helper columns (such as the index
        # column that read_excel brings back) are never standardised.
        features = df[cls._FEATURES]
        # ddof=0: population standard deviation, matching np.std.
        return (features - features.mean()) / features.std(ddof=0)

    def standarData(self, filePath):
        """Standardise the features: (value - mean) / standard deviation.

        :param filePath: Excel file holding the seven feature columns.
        :return: None; the standardised features are written to
            ``data1/per_standata.xls``.
        """
        df = pd.read_excel(filePath)
        self._standardize(df).to_excel('data1/per_standata.xls')

    def classifyData(self, filePath, k=6):
        """Cluster the rows with k-means and save the labelled table.

        :param filePath: Excel file holding the seven feature columns.
        :param k: number of clusters (six by default).
        :return: array of cluster centres, one row per cluster and one
            column per feature.
        """
        df = pd.read_excel(filePath)
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df[self._FEATURES])
        # Store each row's cluster label next to its data.
        df['label'] = kmeans.labels_
        df.to_excel('data1/per_findata.xls')
        return np.array(kmeans.cluster_centers_)

    def drawData(self, filePath, f):
        """Draw a radar chart with one line per cluster centre.

        :param filePath: Excel file passed on to ``classifyData``.
        :param f: number of radar axes; must equal the number of features
            in each cluster centre.
        :return: None; the figure is shown with ``plt.show()``.
        :raises ValueError: if ``f`` differs from the number of features.
        """
        centers = self.classifyData(filePath)
        if centers.shape[1] != f:
            raise ValueError(
                'f must equal the number of features (%d), got %r'
                % (centers.shape[1], f))

        # One angle per feature; the first angle is repeated so every
        # plotted line closes back on its starting point.
        angles = np.linspace(0, 2 * np.pi, f, endpoint=False)
        closed_angles = np.concatenate((angles, [angles[0]]))

        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)

        for idx, center in enumerate(centers):
            closed_values = np.concatenate((center, [center[0]]))
            style = self._LINE_STYLES[idx % len(self._LINE_STYLES)]
            ax.plot(closed_angles, closed_values, style, linewidth=1,
                    label='Person%d' % (idx + 1))

        # Tick positions use the open angle list so there is exactly one
        # tick per label; the closed list has one extra entry.
        # NOTE(review): the 'T' label reads "number of companies" while the
        # column comes from time_spend_company -- confirm the wording.
        ax.set_thetagrids(angles * 180 / np.pi, ['S：满意度评分', 'L：能力评分', 'N：参与项目数', 'A：月工作时间', 'T：公司数', 'G：年龄', 'E：受教育年限'])
        # set_rlim only takes the lower and upper limits positionally.
        ax.set_rlim(-2, 2)
        plt.legend(loc='best')
        plt.show()


if __name__ == "__main__":
    # The preparatory stages are left disabled here; their arguments show
    # which file each stage reads:
    #   PersonData().explorData(['data1/HR_comma_sep.csv','data1/personal_income.csv'])
    #   PersonData().cleanData('data1/per_data.xls')
    #   PersonData().standarData('data1/per_coredata.xls')
    #   PersonData().classifyData('data1/per_standata.xls')
    # Only the radar chart of the standardised data is drawn, with 7 axes.
    PersonData().drawData('data1/per_standata.xls', 7)