# 数据科学之第三章中lesson4的探索两个变量 (R语言)

Lesson 4
========================================================

***

### Scatterplots and Perceived Audience Size
Notes:

***

### Scatterplots
Notes:

{r Scatterplots}
library(ggplot2)
# Scatterplot of friend_count against age, one point per row of pf.
# NOTE(review): pf is not created anywhere in the visible part of this file;
# presumably it is loaded beforehand. Confirm.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()


***

#### What are some things that you notice right away?
Response:

***

### ggplot Syntax
Notes:

{r ggplot Syntax}
# The same scatterplot written with the qplot() shorthand ...
qplot(age, friend_count, data = pf)

# ... and with the layered ggplot() syntax, x axis limited to 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
summary(pf$age)

***

### Overplotting
Notes: the relation between friend_count and age

{r Overplotting}
library(gridExtra)

# Using alpha: draw every point at 1/20 opacity.
px <- ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)

# Using jitter: same plot, but with geom_jitter() instead of geom_point().
py <- ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)

# Stack the two versions in a single column for comparison.
grid.arrange(px, py, ncol = 1)

#### What do you notice in the plot?
Response:

***

### Coord_trans()
Notes:

{r Coord_trans()}
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
# try : coord_trans(x='log10',y='log10')

#### Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

{r}
# Practice using alpha and jitter to reduce overplotting.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)

#### What do you notice?

***

### Alpha and Jitter
Notes:

{r Alpha and Jitter}
# build up in layers
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)

***

### Overplotting and Domain Knowledge
Notes:

***

### Conditional Means
Notes:

{r Conditional Means}
# dplyr lets us split a data frame and apply a function to parts of the data.
# install.packages("dplyr")  # one-time setup; run manually if dplyr is missing
library(dplyr)

age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
# Sort by age; arrange() without a sort key leaves the row order unchanged.
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)

{r Conditional Alternate Means}
# Same summary as above, written as a single pipeline.
# install.packages("dplyr")  # one-time setup; run manually if dplyr is missing
library(dplyr)

pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age, 20)

Create your plot!

{r Conditional Means Plot}

***

### Overlaying Summaries with Raw Data
Notes:

{r Overlaying Summaries with Raw Data}
# The x range is set with xlim() rather than coord_cartesian(): a plot has a
# single coordinate system, and coord_trans() below would replace an earlier
# coord_cartesian().
# Line styling (linetype/size/color) is passed to geom_line() itself;
# fun.args carries only the arguments meant for quantile().
ggplot(aes(x = age, y = friend_count), data = pf) +
  xlim(13, 90) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  # mean
  geom_line(stat = "summary", fun.y = mean) +
  # 10th percentile
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .1),
    linetype = "dashed", size = 1.5, color = "red"
  ) +
  # 50th percentile (median)
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .5),
    linetype = "dashed", size = 1.5, color = "red"
  ) +
  # 90th percentile
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .9),
    linetype = "dashed", size = 1.5, color = "red"
  )

#### What are some of your observations of the plot?
Response:

***

### Moira: Histogram Summary and Scatterplot
See the Instructor Notes of this video to download Moira's paper on perceived audience size and to see the final plot.

Notes:

***

### Correlation
Notes:

{r Correlation}
# Use the Pearson product-moment correlation (lowercase r) to measure the
# linear relationship between age and friend count.
cor.test(pf$age, pf$friend_count, method = "pearson")

Look up the documentation for the cor.test function.

### this function use pearson theory

What's the correlation between age and friend count? Round to three decimal places.
Response:

***

### Correlation on Subsets
从上面的图可以看出，其相关性并不是简单的单调的。
Notes:

{r Correlation on Subsets}
# Compute the correlation using only users whose reported age is 70 or below.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "pearson"))

***

### Correlation Methods
Notes:

{r Correlation Methods}
# Same subset (reported age 70 or below), using Spearman's method instead.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "spearman"))

***

## Create Scatterplots
Notes:

{r}
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  geom_point()

***

### Strong Correlations
Notes:

{r Strong Correlations}
# Limit both axes to the 95th percentile and add a linear smoother.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")

What's the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

{r Correlation Calcuation}
cor.test(pf$www_likes_received, pf$likes_received)
# The result shows that the two variables are strongly correlated.

Response:

***

### Moira on Correlation
Notes:

***

### More Caution with Correlation
Notes:

{r More Caution With Correlation}
# install.packages("alr3")  # one-time setup; run manually if alr3 is missing
library(alr3)
data(Mitchell)
?Mitchell
# This dataset contains soil temperatures for Mitchell, Nebraska.

Create your plot!

{r Temp vs Month}
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point()

***

### Noisy Scatterplots
a. Take a guess for the correlation coefficient for the scatterplot.

b. What is the actual correlation of the two variables?
(Round to the thousandths place)

{r Noisy Scatterplots}
cor.test(Mitchell$Month, Mitchell$Temp)

***

### Making Sense of Data
Notes:

{r Making Sense of Data}
# Place an x-axis break every 12 months over the 0-203 month range.
# Month is numeric, so a continuous scale is used.
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))

{r Making Sense of Data Modulo 12}
# Overlay all years by plotting the month modulo 12.
ggplot(aes(x = (Month %% 12), y = Temp), data = Mitchell) +
  geom_point()

***

### A New Perspective
What do you notice?
Response:

Watch the solution video and check out the Instructor Notes!
Notes:

***

### Understanding Noise: Age to Age Months
Notes:

{r Understanding Noise: Age to Age Months}
ggplot(aes(x = age, y = friend_count_mean), data = pf.fc_by_age) +
  geom_line()
head(pf.fc_by_age, 10)
pf.fc_by_age[17:19, ]

***

### Age with Months Means

{r Age with Months Means}
# Age refined by birth month: age plus the fraction (1 - dob_month / 12).
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)


Programming Assignment
{r Programming Assignment: Age with months means solutions}
library(dplyr)

# Mean and median friend count, plus group size, for every age_with_months
# value, ordered by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

{r Age with months means Alternate solutions}
# Alternate solution: group first, then summarise and sort the grouped data.
age_with_months_groups <- group_by(pf, age_with_months)

pf.fc_by_age_months2 <- age_with_months_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)



***

### Noise in Conditional Means

{r Noise in Conditional Means}
# Line plot of mean friend count against age_with_months, restricted to
# rows with age_with_months <= 71.
ggplot(
  subset(pf.fc_by_age_months2, age_with_months <= 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()


***

### Smoothing Conditional Means
Notes:

{r Smoothing Conditional Means}
# Three views of mean friend count for ages up to 71, at different bin widths.

# 1) One mean per year of age.
pscm1 <- ggplot(
  subset(pf.fc_by_age, age <= 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()

# 2) One mean per age_with_months value.
pscm2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months <= 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# 3) Ages rounded to the nearest multiple of 5; the mean is computed by the
#    summary stat from the raw rows of pf.
pscm3 <- ggplot(
  subset(pf, age <= 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the three plots in one column.
grid.arrange(pscm1, pscm2, pscm3, ncol = 1)


***

### Which Plot to Choose?
Notes:

***

### Analyzing Two Variables
Reflection:

***

Click **KnitHTML** to see all of your hard work and to have an HTML page of this lesson, your answers, and your notes!

#### conclusions
## 1: 学习了如何探索（可视化）两个变量之间的关系 - 散点图，增强手段：条件汇总+平均值
## 2：使用相关性的优势和局限，要了解两个变量的关系，以及关联度如何影响，
## 3：学习通过调整可视化来更好的理解数据的含义
## 4：如何使用抖动和透明度（jitter and transparency） 来减少图形重叠


library(ggplot2)
library(dplyr)
library(GGally)
library(scales)
library(memisc)
library(reshape)
library(gridExtra)

# Diamond variables: carat, cut, color, clarity, depth, table, price,
# x, y, z, and volume (= x * y * z).

# Scatterplot of price against x.
# Observation: as price increases, x gets larger as well.
ggplot(diamondsinfo, aes(x = price, y = x)) +
  geom_point()

# Correlation between price and x. Result: 0.8844352
cor.test(diamondsinfo$price, diamondsinfo$x)

# Correlation between price and y. Result: 0.8654209
cor.test(diamondsinfo$price, diamondsinfo$y)

# Correlation between price and z. Result: 0.8612494
cor.test(diamondsinfo$price, diamondsinfo$z)

# Simple scatterplot of price vs depth.
ggplot(diamondsinfo, aes(x = price, y = depth)) +
  geom_point()

# Correlation between price and depth. Result: -0.0106474
cor.test(diamondsinfo$price, diamondsinfo$depth)

# Exercise: make the points 1/100 as opaque as they are now and mark the
# x axis every 2 units.
# NOTE(review): only the transparency part is implemented here; no axis
# breaks are set. With price on the x axis a break every 2 units would not be
# readable — presumably the exercise expects depth on x. Confirm.
ggplot(diamondsinfo, aes(x = price, y = depth)) +
  geom_point(alpha = 1 / 100)

# most diamonds are between what values of depth:0-1000

# Scatterplot of price vs carat, omitting the top 1% of price and carat
# values: both axes are capped at the 99th percentile of their variable, so
# rows above either cap are not drawn.
ggplot(aes(x = price, y = carat), data = diamondsinfo) +
  geom_point() +
  xlim(0, quantile(diamondsinfo$price, 0.99)) +
  ylim(0, quantile(diamondsinfo$carat, 0.99))

# Scatterplot of price vs volume, where volume is approximated as x * y * z
# (a very rough approximation of a diamond's volume).
ggplot(diamondsinfo, aes(x = price, y = (x * y * z))) +
  geom_point()

# Correlation between price and the approximate volume. Result: 0.9023845
cor.test(
  diamondsinfo$price,
  (diamondsinfo$x * diamondsinfo$y * diamondsinfo$z)
)
# Conclusion from the results above: price is strongly correlated with x, y,
# z and x * y * z, but only weakly correlated with depth.

# Requirement: correlation between price and volume for the subset of
# diamonds whose volume is greater than 0 and below 800.
diamondsinfo$volume <- diamondsinfo$x * diamondsinfo$y * diamondsinfo$z

# Filter the rows first, then run the test on the columns of that subset.
with(
  subset(diamondsinfo, volume > 0 & volume < 800),
  cor.test(price, volume)
)

# Adjust the transparency of the points and add a linear model to the plot,
# using only diamonds with 0 < volume <= 800.
# Both bounds are combined with `&` into subset()'s row condition; a second
# positional argument would be taken as `select` (the column selection)
# instead of filtering rows.
ggplot(
  aes(x = volume, y = price),
  data = subset(diamondsinfo, volume > 0 & volume <= 800)
) +
  geom_point(alpha = 1 / 5) +
  xlim(0, 850) +
  geom_smooth(method = "lm", color = "red") # add a linear smoother

# Use dplyr to build a new data frame with price information per clarity:
# 1) mean price 2) median price 3) min price 4) max price 5) n (group size).
# clarity_groups is kept as its own object because later code reuses it.
clarity_groups <- group_by(diamondsinfo, clarity)

diamondsinfo.clarity_group <- clarity_groups %>%
  summarise(
    price_mean = mean(price),
    price_median = median(price),
    price_min = min(price),
    price_max = max(price),
    n = n()
  ) %>%
  arrange(clarity)

diamondsinfo.clarity_group

# 2: the same price summary, grouped by color.
# color_groups is kept as its own object because later code reuses it.
color_groups <- group_by(diamondsinfo, color)

diamondsinfo.color_group <- color_groups %>%
  summarise(
    price_mean = mean(price),
    price_median = median(price),
    price_min = min(price),
    price_max = max(price),
    n = n()
  ) %>%
  arrange(color)

diamondsinfo.color_group

# Bar charts of mean price, by clarity and by color.
diamonds_mp_by_clarity <- summarise(clarity_groups, mean_price = mean(price))
diamonds_mp_by_color <- summarise(color_groups, mean_price = mean(price))

# The bar heights are the precomputed mean_price values, so the bars need
# stat = "identity"; geom_bar()'s default stat counts rows and does not
# accept a y aesthetic.
pclarity <- ggplot(
  aes(x = clarity, y = mean_price),
  data = diamonds_mp_by_clarity
) +
  geom_bar(stat = "identity")

pcolor <- ggplot(
  aes(x = color, y = mean_price),
  data = diamonds_mp_by_color
) +
  geom_bar(stat = "identity")

# Stack the two charts in one column.
grid.arrange(pclarity, pcolor, ncol = 1)

# 重访Gapminder
##  Gapminder website contains over 500 datasets with information about the world's population. your work is to continue the investigation you did at the end of problem set 3
##  or you can start fresh and choose a different data set from Gapminder.