Dataquest学习总结[4]

最新推荐文章于 2022-07-18 10:24:51 发布

sodleave

最新推荐文章于 2022-07-18 10:24:51 发布

阅读量1k

点赞数

分类专栏： python数据分析

本文链接：https://blog.csdn.net/sodleave/article/details/71600444

版权

python数据分析专栏收录该内容

12 篇文章 0 订阅

订阅专栏

继续Intermediate Python and Pandas / Data Analysis with Pandas: Intermediate /

Guided Project: Analyzing Thanksgiving Dinner：数据集地址：here

用到的方法： pandas.Series.value_counts()

import pandas as pd

# Load the survey responses, decoding the file as Latin-1, and peek at the
# first rows and the column names.
data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")
print(data.head(3))

print(data.columns)

# Keep only the respondents who celebrate Thanksgiving.
# .copy() makes `data` an independent frame: new columns are assigned to it
# further down, and assigning into a boolean-filtered selection is what
# triggers pandas' SettingWithCopyWarning.
print(data["Do you celebrate Thanksgiving?"].value_counts())
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"].copy()
print(len(data))

# Distribution of main dishes, then the gravy answers of the rows whose main
# dish is "Tofurkey".
print(data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())
data_1 = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
print(data_1["Do you typically have gravy?"])

# One boolean mask per pie column: True where the cell is null.
apple_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
Pumpkin_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
Pecan_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])
# NOTE: despite its name, `ate_pies` is True for rows where ALL three pie
# columns are null (presumably: none of apple/pumpkin/pecan was selected).
ate_pies = apple_isnull & Pumpkin_isnull & Pecan_isnull
print(ate_pies.value_counts())

def str_2_int(str_1):
    """Return the leading number of an age label such as "18 - 29" or "60+".

    Only the text before the first space is used, and a trailing '+' on it
    is dropped before conversion.  Missing values (per ``pd.isnull``) give
    ``None``.
    """
    if pd.isnull(str_1):
        return None
    first_token = str_1.split(' ')[0]
    if first_token[-1] == '+':
        # "60+" -> "60"
        first_token = first_token[:-1]
    return int(first_token)
# Store the numeric age derived from each "Age" label in a new column, then
# show both columns side by side for index labels 5 through 20
# (.loc slices by label and includes both ends).
age_numbers = data["Age"].apply(str_2_int)
data["int_age"] = age_numbers
print(data.loc[5:20, ["Age", "int_age"]])

def str_2_int_2(stra):
    """Return the leading dollar amount of an income label as an int.

    Only the text before the first space is parsed, e.g.
    "$25,000 to $49,999" -> 25000 and "$200,000 and up" -> 200000.

    Returns ``None`` for missing values (per ``pd.isnull``) and for labels
    whose first word is "Prefer" (e.g. "Prefer not to answer").

    The amount is parsed by removing the '$' sign and every ',' separator
    instead of slicing at fixed offsets, so amounts with no comma ("$5000")
    or with several commas ("$1,000,000") are handled correctly as well.
    """
    if pd.isnull(stra):
        return None
    first_token = stra.split(' ')[0]
    if first_token == "Prefer":
        return None
    return int(first_token.replace('$', '').replace(',', ''))
    
# Numeric version of the household-income answers, followed by its summary
# statistics.
income_answers = data["How much total combined money did all members of your HOUSEHOLD earn last year?"]
data['int_come'] = income_answers.apply(str_2_int_2)
print(data["int_come"].describe())

# Travel-distance answers, split at an income of 150000: first the rows at or
# below it, then the rows above it.
travel_question = "How far will you travel for Thanksgiving?"
data_far = data.loc[data["int_come"] <= 150000, travel_question]
print(data_far.value_counts())
data_far_1 = data.loc[data["int_come"] > 150000, travel_question]
print(data_far_1.value_counts())

# Two pivot tables over the same pair of questions (rows: hometown-friends
# meetup, columns: Friendsgiving), one for age and one for income.  No aggfunc
# is passed, so pivot_table uses its default aggregation (the mean).
meetup_question = "Have you ever tried to meet up with hometown friends on Thanksgiving night?"
friendsgiving_question = "Have you ever attended a \"Friendsgiving?\""

ave_age = data.pivot_table(
    index=meetup_question,
    columns=friendsgiving_question,
    values="int_age",
)
print(ave_age)

ave_income = data.pivot_table(
    index=meetup_question,
    columns=friendsgiving_question,
    values="int_come",
)
print(ave_income)

接下来是关于可视化部分 Intermediate Python and Pandas / Exploratory Data Visualization

DATE	VALUE
1948-01-01	3.4

>>对于上面这样的数据，pandas默认会把DATE列读成字符串（dtype为object），需要用pd.to_datetime()转换成日期类型：

# Load the unemployment data, then replace the DATE column with its
# datetime-converted version.
unrate = pd.read_csv("unrate.csv")
date_column = unrate["DATE"]
unrate["DATE"] = pd.to_datetime(date_column)

>>关于画图

# Quick reference of basic pyplot calls (a listing of API calls, not a
# complete program).
import matplotlib.pyplot as plt

# plot() is called here without any data.
plt.plot()

# Display the current figure.
plt.show()

# API signature for reference, not a call to run as written.
matplotlib.pyplot.xticks(*args, **kwargs) # adjusts the x-axis tick labels, e.g. rotate them by 90 degrees with plt.xticks(rotation=90)

plt.xlabel("Month") # add a label to the x-axis
plt.ylabel("Unemployment Rate") # add a label to the y-axis
plt.title("Monthly Unemployment Trends, 1948") # set the plot title
>>构建figure对象，构建子图：

# Create the figure; fig = plt.figure(figsize=(width, height)) would also set
# its width and height.
fig = plt.figure()
# 2-row, 1-column grid: ax1 is the top subplot, ax2 the bottom one.
ax1, ax2 = fig.add_subplot(2, 1, 1), fig.add_subplot(2, 1, 2)
# Rows 0-11 go to the top plot, rows 12-23 to the bottom plot.
first_rows = slice(None, 12)
next_rows = slice(12, 24)
ax1.plot(unrate["DATE"][first_rows], unrate["VALUE"][first_rows])
ax2.plot(unrate["DATE"][next_rows], unrate["VALUE"][next_rows])
plt.show()

>>在同一图中画多条线：

# Month number of every date, used as the shared x-axis so that two 12-row
# ranges can be drawn on top of each other.
unrate['MONTH'] = unrate['DATE'].dt.month
fig = plt.figure(figsize=(6, 3))
# Rows 0-11 in red, rows 12-23 in blue, both on the current axes.
for row_range, line_colour in ((slice(None, 12), "red"), (slice(12, 24), "blue")):
    plt.plot(unrate["MONTH"][row_range], unrate["VALUE"][row_range], c=line_colour)
plt.show()

>>还可以给每条线增加标签和指定标签的位置：

# One labelled line per 12-row range; the labels are what the legend shows.
for first_row, line_colour, year_label in ((0, 'red', '1948'), (12, 'blue', '1949')):
    year_rows = unrate[first_row:first_row + 12]
    plt.plot(year_rows['MONTH'], year_rows['VALUE'], c=line_colour, label=year_label)

# Place the legend in the upper-left corner of the plot.
plt.legend(loc='upper left')

>>画条形图（柱状图）：pyplot.bar() Axes.bar()

>>条形图的其他操作：

# arange() is used below; import it here so this snippet runs on its own.
from numpy import arange

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# Ratings of the row labelled 0, one value per rating column.
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
# assumes norm_reviews keeps its default integer index, so label 0 is the
# first row — TODO confirm
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)
fig, ax = plt.subplots()
# align='edge' makes every 0.5-wide bar START at its position (0.75, 1.75, ...)
# so that it is centred on ticks 1..5.  Matplotlib >= 2.0 centres bars on
# their x position by default, which would shift the bars off the ticks.
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=90)
plt.xlabel("Rating Source")
plt.ylabel("Average Rating")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()

>>水平条形图

import matplotlib.pyplot as plt
from numpy import arange

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# Ratings of the row labelled 0, one value per rating column.
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
# assumes norm_reviews keeps its default integer index, so label 0 is the
# first row — TODO confirm
bar_widths = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)
fig, ax = plt.subplots()
# align='edge' makes every 0.5-high bar START at its y position (0.75, 1.75,
# ...) so that it is centred on ticks 1..5.  Matplotlib >= 2.0 centres bars on
# their y position by default, which would shift the bars off the ticks.
ax.barh(bar_positions, bar_widths, 0.5, align='edge')
ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
plt.xlabel("Average Rating")
plt.ylabel("Rating Source")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()

>>绘制散点图：

# Scatter of the Fandango rating (x) against the Rotten Tomatoes user score (y).
fig, ax = plt.subplots()
fandango_scores = norm_reviews["Fandango_Ratingvalue"]
rt_user_scores = norm_reviews["RT_user_norm"]
ax.scatter(fandango_scores, rt_user_scores)
# `ax` is the only axes of the figure, so labelling it directly is the same as
# labelling the current axes through pyplot.
ax.set_xlabel("Fandango")
ax.set_ylabel("Rotten Tomatoes")
plt.show()

>>多个子图

# Two stacked scatter plots of the same pair of columns: the top one has
# Fandango on the x-axis, the bottom one swaps the axes.
fig = plt.figure(figsize=(5, 10))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

fandango_scores = norm_reviews["Fandango_Ratingvalue"]
rt_user_scores = norm_reviews["RT_user_norm"]

ax1.scatter(fandango_scores, rt_user_scores)
ax1.set_xlabel("Fandango")
ax1.set_ylabel("Rotten Tomatoes")

ax2.scatter(rt_user_scores, fandango_scores)
ax2.set_xlabel("Rotten Tomatoes")
ax2.set_ylabel("Fandango")

plt.show()

>>设置横纵坐标的范围：Axes.set_xlim() Axes.set_ylim()

这一部分暂告一段落，关于可视化操作还有下一步的表格显示以及Storytelling Through Data Visualization部分没有学习，先跳过学习Data Cleaning