pandas数据分析0723

import pandas as pd
import numpy as np

# Load the source CSV, decoding it as GB2312.
df = pd.read_csv(r"C:\Users\Administrator\Desktop\python 0712\DataAnalyst.csv", encoding='gb2312')
df.head()
# Number of distinct positionId values, to compare against the raw row count.
len(df.positionId.unique())
# Keep the first row for each positionId. New columns are assigned to
# df_duplicates further down, so take an explicit copy here: otherwise pandas
# may treat the result as derived from df and emit SettingWithCopyWarning
# on those assignments.
df_duplicates = df.drop_duplicates(subset='positionId', keep='first').copy()
df_duplicates.head()

def _leading_digits(text):
    """Return the run of ASCII digits at the start of *text*, ignoring surrounding whitespace."""
    text = text.strip()
    end = 0
    while end < len(text) and text[end] in '0123456789':
        end += 1
    return text[:end]


def cut_word(word, method):
    """Extract one bound of a salary string such as ``'7k-9k'``.

    The text is split at the first ``'-'``; the digits that start each half
    are the bottom and top bounds. When there is no ``'-'`` (for example
    ``'100k以上'``) the single number is used for both bounds.

    Digits are read directly instead of slicing at fixed offsets, so the
    result does not depend on the unit suffix being exactly one character
    or on the absence of spaces around the dash.

    Args:
        word: The raw salary text.
        method: ``'bottom'`` to get the lower bound; any other value
            returns the upper bound.

    Returns:
        The selected bound as a string of digits (empty if the relevant
        part of *word* does not start with a digit).
    """
    low_part, dash, high_part = word.partition('-')
    if not dash:
        # Open-ended salary: both bounds come from the same number.
        high_part = low_part
    bottomsalary = _leading_digits(low_part)
    topsalary = _leading_digits(high_part)
    if method == 'bottom':
        return bottomsalary
    return topsalary

# Pull each salary bound out of the raw salary text with cut_word, which is
# applied element-wise; it returns strings, so cast to int straight away.
df_duplicates['topsalary'] = df_duplicates['salary'].apply(cut_word, method='top').astype('int')
df_duplicates['bottomsalary'] = df_duplicates['salary'].apply(cut_word, method='bottom').astype('int')
# Average salary is the midpoint of the two integer bounds.
salary_sum = df_duplicates['topsalary'] + df_duplicates['bottomsalary']
df_duplicates['avgsalary'] = salary_sum / 2
df_duplicates.avgsalary

# Keep only the columns used by the analysis below.
df_clean = df_duplicates[['city','companyShortName','companySize','education','positionName','positionLables','workYear','avgsalary']]
df_clean.head()

# Number of rows for each distinct city. value_counts is a method: without
# the call parentheses the expression only references it and counts nothing.
df_clean.city.value_counts()
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_clean.describe()

import matplotlib.pyplot as plt
%matplotlib inline
# Switch matplotlib to the ggplot style sheet for the figures that follow.
plt.style.use('ggplot')

# Histogram of the average salary; bins sets how many intervals the range is split into.
avg_salary = df_clean['avgsalary']
avg_salary.hist(bins=15)

from matplotlib.font_manager import FontProperties
# One box plot of avgsalary per grouping column: by city, by education,
# then by years of work experience. figsize is the figure size.
for group_column in ('city', 'education', 'workYear'):
    ax = df_clean.boxplot(column='avgsalary', by=group_column, figsize=(9, 7))
    # The x tick labels are Chinese text; switch each one to the SimHei
    # font so it renders instead of showing placeholder boxes.
    for label in ax.get_xticklabels():
        label.set_fontproperties('SimHei')
	
# Mean avgsalary for every (city, education) pair. avgsalary is selected
# explicitly because the remaining columns are not guaranteed to be numeric,
# and pandas 2.x raises TypeError when asked to average non-numeric columns
# instead of silently dropping them.
df_clean.groupby(['city','education'])[['avgsalary']].mean()
# Same table with the inner index level (education) pivoted into columns.
# unstack is a pivot of one index level, not a full transpose.
df_clean.groupby(['city','education'])[['avgsalary']].mean().unstack()
# Row count and mean avgsalary per company, largest count first.
df_clean.groupby('companyShortName').avgsalary.agg(['count','mean']).sort_values(by='count',ascending=False)

# Bar chart of the mean avgsalary per city. groupby must be called with the
# column name; writing `groupby=('city')` instead calls .mean() on a plain
# string and fails. avgsalary is selected explicitly so that only the numeric
# column is averaged.
ax = df_clean.groupby('city')[['avgsalary']].mean().plot.bar()
# The x tick labels are Chinese text; use the SimHei font so they render.
for label in ax.get_xticklabels():
    label.set_fontproperties('SimHei')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值