pandas数据分析0723

最新推荐文章于 2021-08-21 10:00:41 发布

单明火

最新推荐文章于 2021-08-21 10:00:41 发布

阅读量316

点赞数

分类专栏： python数据分析

python数据分析专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import pandas as pd
import numpy as np

# Load the raw job-posting data; the file is read with the GB2312 encoding.
df = pd.read_csv(r"C:\Users\Administrator\Desktop\python 0712\DataAnalyst.csv", encoding='gb2312')
df.head()
# Number of distinct position ids, to compare against the total row count.
len(df.positionId.unique())
# Keep the first row for every position id. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset='positionId', keep='first').copy()
df_duplicates.head()

def _salary_amount(part):
    """Return the text of one salary bound without its unit, e.g. '7k' -> '7'."""
    unit_at = part.upper().find('K')
    # Without a 'k' unit the whole fragment is the amount; slicing with the
    # -1 that find() returns would silently drop the last character.
    amount = part if unit_at == -1 else part[:unit_at]
    return amount.strip()


def cut_word(word, method):
    """Extract the lower or upper bound from a salary string.

    Handles ranges such as ``'7k-9k'`` as well as single values such as
    ``'10k以上'``, where both bounds are the same number.

    Args:
        word: Raw salary text.
        method: ``'bottom'`` returns the lower bound; any other value
            returns the upper bound.

    Returns:
        The selected bound as a string of digits (not yet converted to int).
    """
    low, dash, high = word.partition('-')
    bottomsalary = _salary_amount(low)
    # A value without '-' has no separate upper bound, so reuse the lower one.
    topsalary = _salary_amount(high) if dash else bottomsalary
    if method == 'bottom':
        return bottomsalary
    return topsalary

# Run cut_word over every salary string to pull out each bound, then convert
# the extracted digit strings to integers so they can be used in arithmetic.
upper_bound = df_duplicates['salary'].apply(cut_word, method='top')
df_duplicates['topsalary'] = upper_bound.astype('int')
lower_bound = df_duplicates['salary'].apply(cut_word, method='bottom')
df_duplicates['bottomsalary'] = lower_bound.astype('int')
# Average salary is the midpoint of the two bounds.
df_duplicates['avgsalary'] = df_duplicates['topsalary'].add(df_duplicates['bottomsalary']).div(2)
df_duplicates['avgsalary']

# Keep only the columns used by the analysis below.
df_clean = df_duplicates[['city','companyShortName','companySize','education','positionName','positionLables','workYear','avgsalary']]
df_clean.head()

# Number of rows per city. value_counts is a method and must be called;
# without the parentheses the expression only shows the bound method.
df_clean.city.value_counts()
# Summary statistics for the numeric columns.
df_clean.describe()

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Histogram of the average salary; bins sets how many bars are drawn.
df_clean['avgsalary'].hist(bins=15)

from matplotlib.font_manager import FontProperties
# One box plot of average salary per grouping column: city, education level
# and years of work experience. figsize is the figure size.
for group_column in ('city', 'education', 'workYear'):
    ax = df_clean.boxplot(column='avgsalary', by=group_column, figsize=(9, 7))
    # Set the SimHei font on every x tick label, which holds the Chinese
    # category names.
    for tick_label in ax.get_xticklabels():
        tick_label.set_fontproperties('SimHei')
	
# Mean of the numeric columns per (city, education) pair. numeric_only=True
# is required because df_clean also holds text columns, and recent pandas
# versions raise TypeError instead of silently skipping them.
df_clean.groupby(['city','education']).mean(numeric_only=True)
# unstack() moves the inner index level (education) into the columns, giving
# a city-by-education table; it is a pivot, not a transpose.
df_clean.groupby(['city','education']).mean(numeric_only=True).unstack()
# Posting count and mean salary per company, most postings first.
df_clean.groupby('companyShortName').avgsalary.agg(['count','mean']).sort_values(by='count',ascending=False)

# Bar chart of the mean salary per city. groupby must be called as a method:
# the previous `groupby=('city')` form was an assignment that failed on
# 'city'.mean(). numeric_only=True skips the text columns, which recent
# pandas versions refuse to average.
ax = df_clean.groupby('city').mean(numeric_only=True).plot.bar()
# Set the SimHei font on every x tick label, which holds the city names.
for label in ax.get_xticklabels():
    label.set_fontproperties('SimHei')