R分析钻石数据集

 

dm <- diamonds
qplot(data = dm, x = carat, y = price,
      xlim = c(0, quantile(dm$carat, 0.99)),
      ylim = c(0, quantile(dm$price, 0.99)))+
  geom_point(fill = I('#F79420'), color = I('black'), shape = 21)

 

ggplot(dm, aes(x = carat, y = price)) +
  scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
  scale_y_continuous(lim = c(0, quantile(dm$price, 0.99))) +
  geom_point(fill = I('#F79420'), color = I('black'), shape = 21)

 

使用统计平滑曲线

ggplot(dm, aes(x = carat, y = price)) +
  geom_point(color = I('#F79420'),alpha= 1/4 ) +
  stat_smooth(method = 'lm')
  scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
  scale_y_continuous(lim = c(0, quantile(dm$price, 0.99))) 

  install.packages('GGally')
  install.packages('scales')
  install.packages('memisc')
  install.packages('lattice')
  install.packages('MASS')
  install.packages('car')
  install.packages('reshape')
  install.packages('plyr')
  
  library(GGally)
  library(scales)
  library(memisc)
  library(lattice)
  library(MASS)
  library(car)
  library(reshape)
  library(plyr)

 

 

set.seed(20022012)
diamonds_samp <- dm[sample(1:length(dm$price), 10000 ) ,]

ggpairs(diamonds_samp, 
        lower = list(continuous = wrap("points", shape = I('.'))), 
        upper = list(combo = wrap("box", outlier.shape = I('.'))))

 

 

library(gridExtra)
library(grid)
plot1 <- qplot(data = dm,x = price,binwidth = 100,
               fill = I('#099DD9')) + 
  ggtitle('Price')


plot2 <- qplot(data = dm,x = price,binwidth = 0.01,
               fill = I('#F79420')) +
  ggtitle('Price (log10)') +
  scale_x_log10()

grid.arrange(plot1,plot2,ncol = 2)

 

qplot(carat,price,data=dm)+
  scale_y_continuous(trans = log10_trans())+
  ggtitle('Price (log10) by Carat')

 

cuberoot_trans = function() trans_new('cuberoot',
                      transform = function(x) x^(1/3),
                      inverse = function(x) x^3)

ggplot(aes(carat,price), data = dm) +
  geom_point() +
  scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                     breaks = c(0.2,0.5,1,2,3)) +
  scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                     breaks = c(350,1000,5000,10000,15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

 

head(sort(table(dm$carat),decreasing = T))
head(sort(table(dm$price),decreasing = T))
 

>  head(sort(table(dm$price),decreasing = T))

605 802 625 828 776 698 
132 127 126 125 124 121 
> head(sort(table(dm$carat),decreasing = T))

 0.3 0.31 1.01  0.7 0.32    1 
2604 2249 2242 1981 1840 1558

 

 

函数测试  

set.seed(100)
d <- rpois(25,8)
GetMeanAndSE <- function(x) {
  m <- mean(x)
  n <- length(x)
  SE <- sd(x) / sqrt(n)
  return(c(m, SE))
}
GetMeanAndSE(d)

 

 

 

ggplot(aes(carat,price), data = dm) +
  geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
  scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                     breaks = c(0.2,0.5,1,2,3)) +
  scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                     breaks = c(350,1000,5000,10000,15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

 

library(RColorBrewer)
ggplot(aes(carat,price,colour= clarity), data = dm) +
  geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
  scale_color_brewer( type= 'div',
    guide = guide_legend(title = 'Clarity', reverse = TRUE,
              override.aes = list(alpha = 1 ,size =2))) +
  scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                     breaks = c(0.2,0.5,1,2,3)) +
  scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                     breaks = c(350,1000,5000,10000,15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Clarity', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')

ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
        guide = guide_legend(title = 'Color', reverse = F,
              override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')

 

构建线性模型

参考

https://data.princeton.edu/r/linearmodels

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data=dm)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1,m2,m3,m4,m5)

 

 

 

获取更多数据集

从 https://github.com/solomonm/diamonds-data 下载数据集。点击 BigDiamonds.Rda 链接,然后点击“原始数据”按钮开始下载。下载完成后,你就可以通过命令 load("BigDiamonds.rda") 加载数据

 

install.packages('RCurl')
install.packages('bitops')
library(RCurl)
library(bitops)
diamondsurl= getBinaryURL("https://raw.github.com/SolomonMg/diamonds-data/blob/master/BigDiamonds.Rda")
load(rawConnection(diamondsurl))

load("BigDiamonds.rda")    #如果手动下载
dmb <- diamondsbig
dmb$logprice = log(dmb$price)
m1 <- lm(logprice ~ I(carat^(1/3)),
    data=dmb[dmb$price < 10000 &
               dmb$cert == "GIA",])
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
suppressMessages(library(lattice))
suppressMessages(library(MASS))
suppressMessages(library(memisc))
mtable(m1, m2, m3, m4, m5)
models <- mtable(m1, m2, m3, m4, m5)

 

 

thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
                         color = "I", clarity="VS1")
modelEstimate =predict(m5,newdata = thisDiamond,
                       interval = "prediction", level = .95)
exp(modelEstimate)

dat = data.frame(m4$model, m4$residuals) 

with(dat, sd(m4.residuals)) 

with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals)) 

dat$resid <- as.numeric(dat$m4.residuals)
ggplot(aes(y = resid, x = round(carat, 2)), data = dat) + 
  geom_line(stat = "summary", fun.y = sd) 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 2
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值