继续Intermediate Python and Pandas / Data Analysis with Pandas: Intermediate /
Guided Project: Analyzing Thanksgiving Dinner: 数据集地址:here
用到的方法: pandas.Series.value_counts()
import pandas as pd
# Load the Thanksgiving survey responses; the file is decoded as Latin-1.
data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")
print(data.head(3))
print(data.columns)

# Tally the answers to "Do you celebrate Thanksgiving?".
print(data["Do you celebrate Thanksgiving?"].value_counts())

# Keep only the "Yes" rows. The explicit .copy() makes `data` an independent
# DataFrame, so assigning new columns to it later in the file does not trigger
# pandas' SettingWithCopyWarning (writing into a filtered slice of another frame).
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"].copy()
print(len(data))

# Distribution of main dishes among the remaining respondents.
print(data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())

# For respondents whose main dish is Tofurkey, show their gravy answers.
data_1 = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
print(data_1["Do you typically have gravy?"])

# One boolean Series per pie column: True where the respondent left that pie blank.
apple_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
Pumpkin_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
Pecan_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])

# NOTE: despite its name, `ate_pies` is True when ALL three pie columns are
# null, i.e. for respondents who selected none of apple / pumpkin / pecan.
# In the value_counts() output, False is the count of those who selected at
# least one of the three.
ate_pies = apple_isnull & Pumpkin_isnull & Pecan_isnull
print(ate_pies.value_counts())
def str_2_int(str_1):
    """Convert an age-bracket label into the integer that starts it.

    The first whitespace-separated token of the label is taken, any trailing
    "+" is removed, and the remainder is parsed as an integer, e.g.
    "18 - 29" -> 18 and "60+" -> 60.

    Args:
        str_1: The label string, or a missing value (None / NaN).

    Returns:
        The parsed integer, or None when the input is missing, empty, or
        contains only whitespace.

    Raises:
        ValueError: If the first token is not an integer after removing "+".
    """
    if pd.isnull(str_1):
        return None
    # split() with no argument ignores leading/repeated whitespace and yields
    # an empty list for blank input, so indexing cannot fail below.
    tokens = str_1.split()
    if not tokens:
        return None
    return int(tokens[0].rstrip("+"))
# Add a numeric age column by running every "Age" label through str_2_int.
age_labels = data["Age"]
data["int_age"] = age_labels.apply(str_2_int)

# Spot-check: original label next to the parsed value for index labels 5..20.
preview_columns = ["Age", "int_age"]
print(data.loc[5:20, preview_columns])
def str_2_int_2(stra):
    """Convert an income-bracket label into the integer amount that starts it.

    The first whitespace-separated token of the label is taken, the "$" sign
    and any thousands separators are removed, and the remainder is parsed as
    an integer, e.g. "$25,000 to $49,999" -> 25000. Labels whose first token
    is "Prefer" (e.g. "Prefer not to answer") carry no amount.

    Args:
        stra: The label string, or a missing value (None / NaN).

    Returns:
        The parsed integer, or None when the input is missing, blank, or
        starts with "Prefer".

    Raises:
        ValueError: If the first token is not an integer once "$" and ","
            have been removed.
    """
    if pd.isnull(stra):
        return None
    tokens = stra.split()
    if not tokens or tokens[0] == "Prefer":
        return None
    # Remove the symbols instead of slicing at fixed offsets, so amounts with
    # any number of digits or commas ("$10", "$1,000,000") parse correctly.
    digits = tokens[0].replace("$", "").replace(",", "")
    return int(digits)
# The column labels are full survey questions; name them once for readability.
income_question = "How much total combined money did all members of your HOUSEHOLD earn last year?"
travel_question = "How far will you travel for Thanksgiving?"
hometown_question = "Have you ever tried to meet up with hometown friends on Thanksgiving night?"
friendsgiving_question = "Have you ever attended a \"Friendsgiving?\""

# Numeric income column derived from the income-bracket labels.
data['int_come'] = data[income_question].apply(str_2_int_2)
print(data["int_come"].describe())

# Travel answers for respondents with int_come <= 150000 ...
at_most_150k = data["int_come"] <= 150000
data_far = data.loc[at_most_150k, travel_question]
print(data_far.value_counts())

# ... and for respondents with int_come > 150000.
above_150k = data["int_come"] > 150000
data_far_1 = data.loc[above_150k, travel_question]
print(data_far_1.value_counts())

# Pivot tables over the two "friends" questions. No aggfunc is passed, so
# pivot_table applies its default aggregation (the mean) to the value column.
ave_age = data.pivot_table(
    index=hometown_question,
    columns=friendsgiving_question,
    values="int_age",
)
print(ave_age)

ave_income = data.pivot_table(
    index=hometown_question,
    columns=friendsgiving_question,
    values="int_come",
)
print(ave_income)
unrate.csv 的数据格式:

| DATE | VALUE |
|---|---|
| 1948-01-01 | 3.4 |
# Read the unemployment-rate table, then replace the DATE column with its
# datetime-parsed form.
unrate = pd.read_csv("unrate.csv")
parsed_dates = pd.to_datetime(unrate["DATE"])
unrate["DATE"] = parsed_dates
>>关于画图
import matplotlib.pyplot as plt
# Line chart of the first 12 rows of `unrate` (the 1948 months named in the
# title below). Calling plt.plot() with no data would draw an empty chart.
plt.plot(unrate["DATE"][:12], unrate["VALUE"][:12])
# plt.xticks(*args, **kwargs) adjusts the x-axis tick labels; here they are
# rotated by 90 degrees.
plt.xticks(rotation=90)
plt.xlabel("Month")  # label for the x axis
plt.ylabel("Unemployment Rate")  # label for the y axis
plt.title("Monthly Unemployment Trends, 1948")
# show() is called last: labels or titles set after it would not appear on
# the figure that was displayed.
plt.show()
>>构建figure对象,构建子图:
# Create a Figure and place two subplots on a 2-row x 1-column grid.
# plt.figure(figsize=(width, height)) can be used instead to set the figure size.
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

# Rows 0-11 go on the upper axes, rows 12-23 on the lower axes.
for axis, rows in ((ax1, slice(0, 12)), (ax2, slice(12, 24))):
    axis.plot(unrate["DATE"][rows], unrate["VALUE"][rows])

plt.show()
>>在同一图中画多条线:
# Month number taken from DATE, used as a shared x axis for both lines.
unrate['MONTH'] = unrate['DATE'].dt.month
fig = plt.figure(figsize=(6, 3))

# Draw rows 0-11 in red and rows 12-23 in blue on the same axes.
for start, colour in ((0, "red"), (12, "blue")):
    stop = start + 12
    plt.plot(unrate["MONTH"][start:stop], unrate["VALUE"][start:stop], c=colour)

plt.show()
>>还可以给每条线增加标签和指定标签的位置:
# Slice the two 12-row ranges once, then plot each with its own legend label.
first_rows = unrate[0:12]
second_rows = unrate[12:24]
plt.plot(first_rows['MONTH'], first_rows['VALUE'], c='red', label='1948')
plt.plot(second_rows['MONTH'], second_rows['VALUE'], c='blue', label='1949')
# The legend is built from the label= values and placed in the upper-left corner.
plt.legend(loc='upper left')
>>画条形图(bar chart,注意不是直方图 histogram):pyplot.bar() Axes.bar()
import matplotlib.pyplot as plt
from numpy import arange
# Rating columns to plot for one film.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
# NOTE(review): assumes norm_reviews has the default integer index, so label 0
# is its first row -- confirm where norm_reviews is loaded.
bar_heights = norm_reviews.loc[0, num_cols].values
# One position per column: 0.75, 1.75, ..., 4.75.
bar_positions = arange(5) + 0.75
fig, ax = plt.subplots()  # returns a (Figure, Axes) pair
# The third positional argument of Axes.bar is the bar width.
ax.bar(bar_positions, bar_heights, 0.5)
plt.show()
>>条形图其他操作:
# Rating columns to plot for one film.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
# NOTE(review): assumes norm_reviews has the default integer index, so label 0
# is its first row -- confirm where norm_reviews is loaded.
bar_heights = norm_reviews.loc[0, num_cols].values
# One position per column: 0.75, 1.75, ..., 4.75.
bar_positions = arange(5) + 0.75
# Tick marks at 1..5, one per bar.
tick_positions = range(1, 6)
fig, ax = plt.subplots()
# The third positional argument of Axes.bar is the bar width.
ax.bar(bar_positions, bar_heights, 0.5)
ax.set_xticks(tick_positions)
# Use the column names as tick labels, rotated by 90 degrees.
ax.set_xticklabels(num_cols, rotation=90)
plt.xlabel("Rating Source")
plt.ylabel("Average Rating")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()
>>水平条形图
import matplotlib.pyplot as plt
from numpy import arange
# Rating columns to plot for one film.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
# NOTE(review): assumes norm_reviews has the default integer index, so label 0
# is its first row -- confirm where norm_reviews is loaded.
bar_widths = norm_reviews.loc[0, num_cols].values
# One position per column along the y axis: 0.75, 1.75, ..., 4.75.
bar_positions = arange(5) + 0.75
# Tick marks at 1..5, one per bar.
tick_positions = range(1, 6)
fig, ax = plt.subplots()
# For Axes.barh the second argument is each bar's length (width) and the
# third positional argument is the bar thickness (height).
ax.barh(bar_positions, bar_widths, 0.5)
ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
plt.xlabel("Average Rating")
plt.ylabel("Rating Source")
plt.title("Average User Rating For Avengers: Age of Ultron (2015)")
plt.show()
>>绘制散点图:
# Scatter plot: Fandango_Ratingvalue on the x axis against RT_user_norm on the y axis.
fig, ax = plt.subplots()
x_values = norm_reviews["Fandango_Ratingvalue"]
y_values = norm_reviews["RT_user_norm"]
ax.scatter(x_values, y_values)
plt.xlabel("Fandango")
plt.ylabel("Rotten Tomatoes")
plt.show()
>>多个子图
# Two stacked scatter plots of the same pair of columns with the axes swapped.
fig = plt.figure(figsize=(5, 10))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

fandango_scores = norm_reviews["Fandango_Ratingvalue"]
rt_scores = norm_reviews["RT_user_norm"]

# Each entry: (axes, x data, x label, y data, y label).
panels = (
    (ax1, fandango_scores, "Fandango", rt_scores, "Rotten Tomatoes"),
    (ax2, rt_scores, "Rotten Tomatoes", fandango_scores, "Fandango"),
)
for axis, xs, x_name, ys, y_name in panels:
    axis.scatter(xs, ys)
    axis.set_xlabel(x_name)
    axis.set_ylabel(y_name)

plt.show()
>>设置横纵坐标的范围:Axes.set_xlim() Axes.set_ylim()
这一部分暂告一段落,关于可视化操作还有下一步的表格显示以及Storytelling Through Data Visualization部分没有学习,先跳过学习Data Cleaning