pandas实战进阶

最新推荐文章于 2024-08-14 18:22:26 发布

DropJing

最新推荐文章于 2024-08-14 18:22:26 发布

阅读量297

点赞数

本文链接：https://blog.csdn.net/DropJing/article/details/104256700

版权

1.读取文件和查看数据

import pandas as pd

# Load the job-posting dataset. The file is decoded as GB2312 rather than
# pandas' default UTF-8.
df = pd.read_csv("DataAnalyst.csv", encoding="gb2312")
# Print column names, non-null counts and dtypes as a first sanity check.
df.info()

2.清洗数据

# Number of distinct position IDs; compare with the row count reported by
# df.info() to see whether duplicate postings exist.
len(df.positionId.unique())
# De-duplicate on positionId, keeping the first occurrence. The explicit
# .copy() makes df_duplicates an independent frame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset="positionId", keep="first").copy()
df_duplicates.head()

#自定义函数清洗数据
def cut_word(word, method):
    """Extract one bound of a salary string such as ``"7k-9k"`` or ``"15k以上"``.

    Args:
        word: Raw salary text. Either a range ``"<low>k-<high>k"`` or a
            single open-ended value such as ``"15k以上"``.
        method: ``"bottom"`` returns the lower bound; any other value
            returns the upper bound.

    Returns:
        The requested bound as a string of digits (the caller converts it
        to a number). For a single-value salary both bounds are the same.
    """
    text = word.strip()
    low, dash, high = text.partition("-")
    if dash:
        # Range form: drop the trailing unit marker from each side instead
        # of assuming it is exactly one character wide.
        low = low.strip().rstrip("kK")
        high = high.strip().rstrip("kK")
    else:
        # Single-value form: keep what precedes the first "k"/"K". When no
        # unit marker is present, keep the whole text rather than silently
        # chopping off its last character (find() returns -1 in that case).
        unit_pos = low.upper().find("K")
        if unit_pos != -1:
            low = low[:unit_pos]
        high = low
    return low if method == "bottom" else high
 df_duplicates["bottomSalary"] = df_duplicates.salary.apply(cut_word, method = "bottom")
df_duplicates["topSalary"] = df_duplicates.salary.apply(cut_word, method = "top")

df_duplicates.bottomSalary = df_duplicates.bottomSalary.astype("int")
df_duplicates.topSalary =  df_duplicates.topSalary.astype("int")
df_duplicates["avgSalary"] = df_duplicates.apply(lambda x: (x.bottomSalary+x.topSalary)/2, axis=1 )
df_duplicates.head()

# Keep only the columns needed for the analysis. The .copy() makes df_clean
# an independent frame, so the "level" column added to it later does not
# trigger pandas' SettingWithCopyWarning.
df_clean = df_duplicates[["city", "companyShortName", "companySize",
                          "education", "positionName", "positionLables",
                          "workYear", "avgSalary"]].copy()
df_clean.head()

3.数据分析

# Number of postings per city.
df_clean["city"].value_counts()
# Summary statistics of the cleaned frame.
df_clean.describe()

4.数据可视化

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
# Histograms of the average salary: default binning first, then 15 bins.
df_clean["avgSalary"].hist()
df_clean["avgSalary"].hist(bins=15)

# Box plots: average salary per city.
df_clean.boxplot(column="avgSalary", by="city", figsize=(9, 9))
# Restrict to Shanghai and Beijing, then break the salary down by
# education level within each of the two cities.
df_sh_bj = df_clean.loc[df_clean.city.isin(["上海", "北京"])]
ax = df_sh_bj.boxplot(
    column="avgSalary",
    by=["education", "city"],
    figsize=(14, 6),
)

# groupby: mean of the numeric columns per (city, education).
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# the text columns of df_clean silently and raises TypeError instead.
df_clean.groupby(["city", "education"]).mean(numeric_only=True)
# Same table with education levels pivoted into columns.
df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack()
# agg: posting count and mean salary per company, largest count first.
df_clean.groupby("companyShortName").avgSalary.agg(["count", "mean"]).sort_values(by="count", ascending=False)
#apply
def topN(df, n=5):
    """Return the ``n`` most frequent values of ``df`` with their counts.

    ``df`` is whatever object is passed in that offers ``value_counts()``;
    the result is ordered from the highest count to the lowest.
    """
    ranked = df.value_counts().sort_values(ascending=False)
    return ranked.iloc[:n]
# Most frequent companies within each city.
df_clean.groupby("city").companyShortName.apply(topN)

# Bar chart of the mean numeric columns per city, one bar per education level.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises TypeError
# on the text columns of df_clean instead of dropping them.
ax = df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack().plot.bar()

5.数据精细化加工

# Bucket the average salary into labelled bands. Each band is the half-open
# interval (left, right] between two consecutive bin edges.
bins = [0, 3, 5, 10, 15, 20, 30, 100]
level = ["0-3", "3-5", "5-10", "10-15", "15-20", "20-30", "30+"]
# assign() returns a new frame, so no column is written into a possible slice.
df_clean = df_clean.assign(level=pd.cut(df_clean["avgSalary"], bins=bins, labels=level))
df_clean[["avgSalary", "level"]]
# Posting counts per city (rows) and salary band (columns). This table was
# referenced below but never defined, which raised NameError. observed=False
# keeps every band as a column even when a city has no posting in it.
df_level = df_clean.groupby(["city", "level"], observed=False).avgSalary.count().unstack()
# Turn each city's counts into proportions that sum to 1 across the bands.
df_level_prop = df_level.apply(lambda x: x / x.sum(), axis=1)
df_level_prop.head(5)
ax = df_level_prop.plot.bar(stacked=True, figsize=(14, 6))