pandas实战进阶

1.读取文件和查看数据
import pandas as pd

# Load the raw dataset; the CSV is read with the GB2312 encoding.
df = pd.read_csv("DataAnalyst.csv", encoding="gb2312")
# Print column names, dtypes and non-null counts.
df.info()
2.清洗数据
# Number of distinct positionId values (compare with the total row count).
len(df.positionId.unique())
# De-duplicate on positionId, keeping the first occurrence of each id.
# .copy() yields an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset="positionId", keep="first").copy()
df_duplicates.head()


# Custom function used to clean the salary column
def cut_word(word, method):
    """Extract one bound of a salary string as a string of digits.

    Handles ranges such as ``"7k-9k"`` and single values such as
    ``"15k以上"``. The unit marker "k" is matched case-insensitively and
    everything from the marker onwards is discarded.

    Args:
        word: Raw salary text.
        method: ``"bottom"`` selects the lower bound; any other value
            selects the upper bound.

    Returns:
        The selected bound without its unit marker. For a single-value
        salary the lower and upper bounds are the same.
    """
    def _strip_unit(part):
        # Keep the text before the first "k"/"K". When there is no unit
        # marker, keep the whole part instead of slicing with find()'s -1,
        # which would silently drop the last character.
        part = part.strip()
        unit_at = part.upper().find("K")
        return part if unit_at == -1 else part[:unit_at]

    low, sep, high = word.partition("-")
    bottom_salary = _strip_unit(low)
    # No "-" separator means a single value: both bounds coincide.
    top_salary = _strip_unit(high) if sep else bottom_salary
    return bottom_salary if method == "bottom" else top_salary
 df_duplicates["bottomSalary"] = df_duplicates.salary.apply(cut_word, method = "bottom")
df_duplicates["topSalary"] = df_duplicates.salary.apply(cut_word, method = "top")

df_duplicates.bottomSalary = df_duplicates.bottomSalary.astype("int")
df_duplicates.topSalary =  df_duplicates.topSalary.astype("int")
df_duplicates["avgSalary"] = df_duplicates.apply(lambda x: (x.bottomSalary+x.topSalary)/2, axis=1 )
df_duplicates.head()
#取出需要的值
df_clean = df_duplicates[["city", "companyShortName","companySize",
                         "education","positionName","positionLables",
                         "workYear","avgSalary"]]
df_clean.head()
3.数据分析
# Row count for each city.
df_clean["city"].value_counts()
# Summary statistics of the numeric columns.
df_clean.describe()
4.数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

# Histograms of the average salary: default binning first, then 15 bins.
df_clean["avgSalary"].hist()
df_clean["avgSalary"].hist(bins=15)

# Box plot of the average salary for each city.
df_clean.boxplot(by="city", column="avgSalary", figsize=(9, 9))

# Keep only the Shanghai and Beijing rows, then box-plot the average salary
# grouped by education and city.
df_sh_bj = df_clean.loc[df_clean["city"].isin(["上海", "北京"])]
ax = df_sh_bj.boxplot(
    by=["education", "city"],
    column="avgSalary",
    figsize=(14, 6),
)

# groupby: mean of the numeric columns for every (city, education) pair.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises a
# TypeError on text columns instead of silently dropping them.
df_clean.groupby(["city", "education"]).mean(numeric_only=True)
df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack()
# agg: row count and mean avgSalary per company, largest count first.
df_clean.groupby("companyShortName").avgSalary.agg(["count", "mean"]).sort_values(by="count", ascending=False)
#apply
def topN(df, n=5):
    """Return the ``n`` most frequent values of ``df`` with their counts.

    ``df`` is whatever ``groupby(...).apply`` hands in; only its
    ``value_counts`` method is used. The result is ordered by count,
    largest first.
    """
    ranked = df.value_counts().sort_values(ascending=False)
    return ranked[:n]
# For each city, the most frequent company names (topN's default n applies).
df_clean.groupby("city")["companyShortName"].apply(topN)



# Bar chart of the mean of the numeric columns per (city, education) pair.
# numeric_only=True keeps mean() from raising TypeError on the text columns
# under pandas >= 2.0.
ax = df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack().plot.bar()
5.数据精细化加工
# Bin edges and the label for each (left, right] interval of avgSalary.
bins = [0, 3, 5, 10, 15, 20, 30, 100]
level = ["0-3", "3-5", "5-10", "10-15", "15-20", "20-30", "30+"]
# assign() rebinds df_clean to a new frame, so the added column never
# triggers SettingWithCopyWarning.
df_clean = df_clean.assign(level=pd.cut(df_clean["avgSalary"], bins=bins, labels=level))
df_clean[["avgSalary", "level"]]
# df_level was used without ever being defined. It is built here as the
# number of rows per city (rows) and salary level (columns).
# NOTE(review): reconstructed from how df_level is used below - confirm.
# observed=False keeps every level as a column even when a level is empty.
df_level = (
    df_clean.groupby(["city", "level"], observed=False)["avgSalary"]
    .count()
    .unstack()
)
# Turn the counts into each level's share within its city (rows sum to 1).
df_level_prop = df_level.div(df_level.sum(axis=1), axis=0)
df_level_prop.head(5)
ax = df_level_prop.plot.bar(stacked=True, figsize=(14, 6))

知乎参考原文链接

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值