dm <- diamonds
qplot(data = dm, x = carat, y = price,
xlim = c(0, quantile(dm$carat, 0.99)),
ylim = c(0, quantile(dm$price, 0.99)))+
geom_point(fill = I('#F79420'), color = I('black'), shape = 21)
ggplot(dm, aes(x = carat, y = price)) +
scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
scale_y_continuous(lim = c(0, quantile(dm$price, 0.99))) +
geom_point(fill = I('#F79420'), color = I('black'), shape = 21)
使用统计平滑曲线
ggplot(dm, aes(x = carat, y = price)) +
geom_point(color = I('#F79420'),alpha= 1/4 ) +
stat_smooth(method = 'lm')
scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
scale_y_continuous(lim = c(0, quantile(dm$price, 0.99)))
install.packages('GGally')
install.packages('scales')
install.packages('memisc')
install.packages('lattice')
install.packages('MASS')
install.packages('car')
install.packages('reshape')
install.packages('plyr')
library(GGally)
library(scales)
library(memisc)
library(lattice)
library(MASS)
library(car)
library(reshape)
library(plyr)
set.seed(20022012)
diamonds_samp <- dm[sample(1:length(dm$price), 10000 ) ,]
ggpairs(diamonds_samp,
lower = list(continuous = wrap("points", shape = I('.'))),
upper = list(combo = wrap("box", outlier.shape = I('.'))))
library(gridExtra)
library(grid)
plot1 <- qplot(data = dm,x = price,binwidth = 100,
fill = I('#099DD9')) +
ggtitle('Price')
plot2 <- qplot(data = dm,x = price,binwidth = 0.01,
fill = I('#F79420')) +
ggtitle('Price (log10)') +
scale_x_log10()
grid.arrange(plot1,plot2,ncol = 2)
qplot(carat,price,data=dm)+
scale_y_continuous(trans = log10_trans())+
ggtitle('Price (log10) by Carat')
cuberoot_trans = function() trans_new('cuberoot',
transform = function(x) x^(1/3),
inverse = function(x) x^3)
ggplot(aes(carat,price), data = dm) +
geom_point() +
scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
breaks = c(0.2,0.5,1,2,3)) +
scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
breaks = c(350,1000,5000,10000,15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
head(sort(table(dm$carat),decreasing = T))
head(sort(table(dm$price),decreasing = T))
> head(sort(table(dm$price),decreasing = T))
605 802 625 828 776 698
132 127 126 125 124 121
> head(sort(table(dm$carat),decreasing = T))
0.3 0.31 1.01 0.7 0.32 1
2604 2249 2242 1981 1840 1558
函数测试
set.seed(100)
d <- rpois(25,8)
GetMeanAndSE <- function(x) {
m <- mean(x)
n <- length(x)
SE <- sd(x) / sqrt(n)
return(c(m, SE))
}
GetMeanAndSE(d)
ggplot(aes(carat,price), data = dm) +
geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
breaks = c(0.2,0.5,1,2,3)) +
scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
breaks = c(350,1000,5000,10000,15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
library(RColorBrewer)
ggplot(aes(carat,price,colour= clarity), data = dm) +
geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
scale_color_brewer( type= 'div',
guide = guide_legend(title = 'Clarity', reverse = TRUE,
override.aes = list(alpha = 1 ,size =2))) +
scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
breaks = c(0.2,0.5,1,2,3)) +
scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
breaks = c(350,1000,5000,10000,15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
ggplot(aes(x = carat, y = price, color = cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')
ggplot(aes(x = carat, y = price, color = color), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Color', reverse = F,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')
构建线性模型
参考
https://data.princeton.edu/r/linearmodels
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data=dm)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1,m2,m3,m4,m5)
获取更多数据集
从 https://github.com/solomonm/diamonds-data 下载数据集。点击 BigDiamonds.Rda 链接,然后点击“原始数据”按钮开始下载。下载完成后,你就可以通过命令 load("BigDiamonds.rda")
加载数据
install.packages('RCurl')
install.packages('bitops')
library(RCurl)
library(bitops)
diamondsurl= getBinaryURL("https://raw.github.com/SolomonMg/diamonds-data/blob/master/BigDiamonds.Rda")
load(rawConnection(diamondsurl))
load("BigDiamonds.rda") #如果手动下载
dmb <- diamondsbig
dmb$logprice = log(dmb$price)
m1 <- lm(logprice ~ I(carat^(1/3)),
data=dmb[dmb$price < 10000 &
dmb$cert == "GIA",])
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
suppressMessages(library(lattice))
suppressMessages(library(MASS))
suppressMessages(library(memisc))
mtable(m1, m2, m3, m4, m5)
models <- mtable(m1, m2, m3, m4, m5)
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
color = "I", clarity="VS1")
modelEstimate =predict(m5,newdata = thisDiamond,
interval = "prediction", level = .95)
exp(modelEstimate)
dat = data.frame(m4$model, m4$residuals)
with(dat, sd(m4.residuals))
with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals))
dat$resid <- as.numeric(dat$m4.residuals)
ggplot(aes(y = resid, x = round(carat, 2)), data = dat) +
geom_line(stat = "summary", fun.y = sd)