# 1. 读取文件和查看数据 (Read the file and inspect the data)
# pandas is referenced as `pd` throughout this script; import it here so the
# script does not fail with NameError on the first statement.
import pandas as pd

# Load DataAnalyst.csv, decoding the file as GB2312.
df = pd.read_csv("DataAnalyst.csv", encoding="gb2312")
# Print column names, dtypes and non-null counts.
df.info()
# 2. 清洗数据 (Clean the data)
# Number of distinct positionId values; fewer than the row count means the
# frame contains duplicated postings.
len(df.positionId.unique())
# Keep the first row for each positionId. The explicit copy makes
# df_duplicates an independent frame, so adding columns to it later cannot
# trigger SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset="positionId", keep="first").copy()
df_duplicates.head()
def _salary_number(text):
    """Return the part of *text* before its first "k"/"K", without whitespace.

    When *text* contains no "k"/"K" the whole stripped text is returned, so a
    bare number such as "8" is not truncated.
    """
    k_at = text.upper().find("K")
    number = text if k_at == -1 else text[:k_at]
    return number.strip()


def cut_word(word, method):
    """Extract one bound of a salary string such as "7k-9k" or "15k以上".

    Args:
        word: Salary text. A "-" separates the lower and upper bound; text
            without a "-" is treated as a single value used for both bounds.
        method: "bottom" returns the lower bound; any other value returns the
            upper bound.

    Returns:
        The requested bound as a string with the "k"/"K" unit and anything
        after it removed (e.g. "7"); the caller converts it to a number.
    """
    # Split on the first "-" only, matching the original str.find("-") lookup.
    low_text, dash, high_text = word.partition("-")
    bottom = _salary_number(low_text)
    # Without a "-" there is only one figure, so both bounds are the same.
    top = _salary_number(high_text) if dash else bottom
    return bottom if method == "bottom" else top
# Split every salary string into its lower and upper bound; cut_word returns
# text, so both columns are converted to integers afterwards.
df_duplicates["bottomSalary"] = df_duplicates["salary"].apply(cut_word, method="bottom")
df_duplicates["topSalary"] = df_duplicates["salary"].apply(cut_word, method="top")
df_duplicates["bottomSalary"] = df_duplicates["bottomSalary"].astype("int")
df_duplicates["topSalary"] = df_duplicates["topSalary"].astype("int")
# Midpoint of the two bounds, computed column-wise instead of with a per-row
# Python lambda; the result is the same float column.
df_duplicates["avgSalary"] = (df_duplicates["bottomSalary"] + df_duplicates["topSalary"]) / 2
df_duplicates.head()
# Keep only the columns used by the analysis. The copy makes df_clean an
# independent frame so a column can be added to it later without
# SettingWithCopyWarning.
df_clean = df_duplicates[
    [
        "city",
        "companyShortName",
        "companySize",
        "education",
        "positionName",
        "positionLables",
        "workYear",
        "avgSalary",
    ]
].copy()
df_clean.head()
# 3. 数据分析 (Analyze the data)
# Row count per city, most frequent city first.
df_clean["city"].value_counts()
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_clean.describe()
# 4. 数据可视化 (Visualize the data)
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
df_clean.avgSalary.hist()
df_clean.avgSalary.hist(bins=15)
df_clean.boxplot(column="avgSalary", by="city", figsize=(9, 9))
df_sh_bj = df_clean[df_clean['city'].isin(["上海", "北京"])]
ax = df_sh_bj.boxplot(column="avgSalary", by=["education", "city"], figsize=(14, 6))
# Mean of the numeric columns per (city, education) pair. numeric_only=True
# is required because df_clean also holds text columns, and recent pandas
# versions raise TypeError when asked to average them.
df_clean.groupby(["city", "education"]).mean(numeric_only=True)
# Same table with education pivoted from the row index into the columns.
df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack()
# Number of rows and mean salary per company, ordered by row count descending.
df_clean.groupby("companyShortName").avgSalary.agg(["count", "mean"]).sort_values(by="count", ascending=False)
def topN(df, n=5):
    """Return the *n* most frequent values of *df* together with their counts.

    Args:
        df: Object exposing ``value_counts()`` (it is called on a column of
            each group by ``groupby(...).apply``).
        n: Number of leading entries to keep.

    Returns:
        The value counts ordered from most to least frequent, cut to the first
        *n* entries.
    """
    return df.value_counts().sort_values(ascending=False)[:n]
# For every city, the most frequent company names (topN keeps 5 by default).
df_clean.groupby("city").companyShortName.apply(topN)
# Bar chart of the mean numeric columns per city, one bar per education
# level. numeric_only=True is required because df_clean also holds text
# columns, and recent pandas versions raise TypeError when asked to average
# them.
ax = df_clean.groupby(["city", "education"]).mean(numeric_only=True).unstack().plot.bar()
# 5. 数据精细化加工 (Refine the data further)
# Bin edges and one label per interval. pd.cut builds right-closed intervals
# by default, i.e. (0, 3], (3, 5], ... (30, 100]; a salary outside (0, 100]
# falls in no bin and gets NaN as its level.
bins = [0, 3, 5, 10, 15, 20, 30, 100]
level = ["0-3", "3-5", "5-10", "10-15", "15-20", "20-30", "30+"]
df_clean["level"] = pd.cut(df_clean["avgSalary"], bins=bins, labels=level)
df_clean[["avgSalary", "level"]]
# df_level was used below without ever being defined (NameError). It is
# rebuilt here as the number of rows per city (rows) and salary level
# (columns), which is what the row-wise normalisation below expects.
# NOTE(review): reconstructed from how df_level is used; confirm this matches
# the intended definition.
# observed=False keeps every salary level as a column, with a count of 0
# where a city has no rows at that level.
df_level = df_clean.groupby(["city", "level"], observed=False).avgSalary.count().unstack()
# Convert each city's counts into proportions that sum to 1 across the levels.
df_level_prop = df_level.apply(lambda x: x / x.sum(), axis=1)
df_level_prop.head(5)
# Stacked bars: one bar per city, segmented by salary level.
ax = df_level_prop.plot.bar(stacked=True, figsize=(14, 6))
# 知乎参考原文链接 (Reference: original Zhihu article; the link itself is not included in this file)