## 随机数:从1到7的7个随机数
>sample(7)[1]5274361
## 随机数:从1到7的5个随机数
>sample(7,5)[1]72634>sample(7,10, replace = TRUE)[1]7352752345>sample(.leap.seconds,4)[1]"1991-01-01 08:00:00 CST"[2]"1999-01-01 08:00:00 CST"[3]"1992-07-01 08:00:00 CST"[4]"1988-01-01 08:00:00 CST">sample(colors(),4)[1]"darkgoldenrod3""mediumorchid"[3]"indianred2""gold2"
## 随机数并且限定暑假的概论(13,21)最大
> weights <-c(1,1,2,3,5,8,13,21,8,3,1,1)>sample(month.abb,1, prob = weights)[1]"Jun">sample(month.abb,1, prob = weights)[1]"Dec">sample(month.abb,1, prob = weights)[1]"Jul"
## 5 uniform random numbers between 0 and 1runif(5)
## 5 uniform random numbers between 1 and 10runif(5,1,10)
## 5 normal random numbers with mean 0 and std dev 1rnorm(5)
## 5 normal random numbers with mean 3 and std dev 7rnorm(5,3,7)
## 随机数的算法
>?RNG
>RNGkind()[1]"Mersenne-Twister"[2]"Inversion"
## 设定seed,确定每次的随机数字
> set.seed(105)>runif(5)[1]0.097914950.99563352[3]0.320552520.39772641[5]0.55322525> set.seed(105)>runif(5)[1]0.097914950.99563352[3]0.320552520.39772641[5]0.55322525
## You can also specify different generation algorithms,
though this is very advanced usage,so don’t do that
unless you know what you are doing.
2. A First Model: Linear Regressions(线性回归)
>library(learningr)> gonorrhoea
## The function for calculating them is concisely if not clearly named:lm,shortfor “linear model.”
## 查看分组变量的levels
>lapply(Filter(is.factor, gonorrhoea), levels)
$`Age.Group`
[1]"0 to 4""5 to 9""10 to 14""15 to 19"[5]"20 to 24""25 to 29""30 to 34""35 to 39"[9]"40 to 44""45 to 54""55 to 64""65 or more"
$Ethnicity
[1]"American Indians & Alaskan Natives"[2]"Asians & Pacific Islanders"[3]"Hispanics"[4]"Non-Hispanic Blacks"[5]"Non-Hispanic Whites"
$Gender
[1]"Female""Male"> model1 <-lm(Rate ~ Year + Age.Group + Ethnicity + Gender, gonorrhoea)> model1
Call:lm(formula = Rate ~ Year + Age.Group + Ethnicity + Gender, data = gonorrhoea)
Coefficients:(Intercept)5540.496
Year
-2.770
Age.Group5 to 9-0.614
Age.Group10 to 1415.268
Age.Group15 to 19415.698
Age.Group20 to 24546.820
Age.Group25 to 29291.098
Age.Group30 to 34155.872
Age.Group35 to 3984.612
Age.Group40 to 4449.506
Age.Group45 to 5427.364
Age.Group55 to 648.684
Age.Group65 or more
1.178
EthnicityAsians & Pacific Islanders
-82.923
EthnicityHispanics
-49.000
EthnicityNon-Hispanic Blacks
376.204
EthnicityNon-Hispanic Whites
-68.263
GenderMale
-17.892
## 数据解读:For example, the "0 to 4 age group" is missing,
and the "American Indians & AlaskanNatives ethnicity" is also
missing.Why?
## 原因:These “missing” categories are included in the "intercept".
In the following output, the intercept value of 5,540 people infected with
gonorrhoea per 100,000 people
applies to "0- to 4-year-old""female""American Indians and Alaskan Natives"
in the "year 0".
## The "0- to 4-year-old""female""American Indians and Alaskan Natives"
group was chosen because it consists of the first level of each of the factor variables.>summary(model1)
Call:lm(formula = Rate ~ Year + Age.Group + Ethnicity + Gender, data = gonorrhoea)
Residuals:
Min 1Q Median 3Q Max
-376.74-130.6237.1290.691467.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)(Intercept)5540.49614866.4060.3730.70952
Year -2.7707.400-0.3740.70826
Age.Group5 to 9-0.61451.268-0.0120.99045
Age.Group10 to 1415.26851.2680.2980.76596
Age.Group15 to 19415.69851.2688.1083.05e-15***
Age.Group20 to 24546.82051.26810.666<2e-16***
Age.Group25 to 29291.09851.2685.6782.15e-08***
Age.Group30 to 34155.87251.2683.0400.00247**
Age.Group35 to 3984.61251.2681.6500.09940.
Age.Group40 to 4449.50651.2680.9660.33463
Age.Group45 to 5427.36451.2680.5340.59372
Age.Group55 to 648.68451.2680.1690.86555
Age.Group65 or more 1.17851.2680.0230.98168
EthnicityAsians & Pacific Islanders -82.92233.093-2.5060.01249*
EthnicityHispanics -49.00033.093-1.4810.13924
EthnicityNon-Hispanic Blacks 376.20433.09311.368<2e-16***
EthnicityNon-Hispanic Whites -68.26333.093-2.0630.03958*
GenderMale -17.89220.930-0.8550.39298---
Signif. codes:0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error:256.3 on 582 degrees of freedom
Multiple R-squared:0.4913, Adjusted R-squared:0.4765
F-statistic:33.07 on 17 and 582 DF, p-value:<2.2e-16
## 考虑Year没用太多意义,予以删除
## In the next example,"." means the terms that were already in the formula,
and "–"(minus) means remove this next term”:> model2 <-update(model1,~.- Year)>summary(model2)
Call:lm(formula = Rate ~ Age.Group + Ethnicity + Gender, data = gonorrhoea)
Residuals:
Min 1Q Median 3Q Max
-377.59-128.4534.6492.221472.60
Coefficients:
Estimate Std. Error t value
(Intercept)-25.10443.117-0.582
Age.Group5 to 9-0.61451.230-0.012
Age.Group10 to 1415.26851.2300.298
Age.Group15 to 19415.69851.2308.114
Age.Group20 to 24546.82051.23010.674
Age.Group25 to 29291.09851.2305.682
Age.Group30 to 34155.87251.2303.043
Age.Group35 to 3984.61251.2301.652
Age.Group40 to 4449.50651.2300.966
Age.Group45 to 5427.36451.2300.534
Age.Group55 to 648.68451.2300.170
Age.Group65 or more 1.17851.2300.023
EthnicityAsians & Pacific Islanders -82.92233.069-2.508
EthnicityHispanics -49.00033.069-1.482
EthnicityNon-Hispanic Blacks 376.20433.06911.376
EthnicityNon-Hispanic Whites -68.26333.069-2.064
GenderMale -17.89220.915-0.855Pr(>|t|)(Intercept)0.56064
Age.Group5 to 90.99044
Age.Group10 to 140.76579
Age.Group15 to 192.91e-15***
Age.Group20 to 24<2e-16***
Age.Group25 to 292.10e-08***
Age.Group30 to 340.00245**
Age.Group35 to 390.09915.
Age.Group40 to 440.33427
Age.Group45 to 540.59345
Age.Group55 to 640.86545
Age.Group65 or more 0.98166
EthnicityAsians & Pacific Islanders 0.01243*
EthnicityHispanics 0.13894
EthnicityNon-Hispanic Blacks <2e-16***
EthnicityNon-Hispanic Whites 0.03943*
GenderMale 0.39263---
Signif. codes:0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error:256.1 on 583 degrees of freedom
Multiple R-squared:0.4912, Adjusted R-squared:0.4772
F-statistic:35.18 on 16 and 583 DF, p-value:<2.2e-16
3. 比较模型
>anova(model1, model2)
Analysis of Variance Table
Model 1: Rate ~ Year + Age.Group + Ethnicity + Gender
Model 2: Rate ~ Age.Group + Ethnicity + Gender
Res.Df RSS Df Sum of Sq F Pr(>F)158238243062258338252272-1-9209.70.14020.7083
## 结果分析:The p-value in the righthand column is 0.71,
so removing the year term didn’t signif‐icantly affect the model’s fit to
the data.
## The Akaike and Bayesian information criteria provide alternate methods of comparingmodels, via the AIC and BIC functions.>AIC(model1, model2)
df AIC
model1 198378.252
model2 188376.397>BIC(model1, model2)
df BIC
model1 198461.794
model2 188455.541
## 结果分析:Smaller numbers roughly correspond to “better” models:
4. Plotting and Inspecting Models
## "lm" models have a plot method that lets you check the goodness of fit
in "six different ways". A slightly better approach is to use the "layout"
function to see all the plots together. Figure 15-2> plot_numbers <-1:6>layout(matrix(plot_numbers, ncol =2, byrow = TRUE))>plot(model1, plot_numbers)
## you can explore the structure of these model variables in
the usual way:>unclass(model1)>str(model1)
## There are many convenience functions for accessing the various components
of the model, such as formula, nobs, residuals, fitted, and coefficients:>formula(model1)
Rate ~ Year + Age.Group + Ethnicity + Gender
>nobs(model1)[1]600>head(residuals(model1))123456106.0185106.332590.6505-278.9795-364.2015-121.9795>head(fitted(model1))123456-105.7185-106.3325-90.4505309.9795441.1015185.3795>head(coefficients(model1))(Intercept) Year Age.Group5 to 9 Age.Group10 to 145540.496167-2.770333-0.61400015.268000
Age.Group15 to 19 Age.Group20 to 24415.698000546.820000
## The real beauty of using models is that you can predict outcomes based on whatever model inputs interest you. It’s more fun when you have a time variable in the model so that you can predict the future, but in this case we can take a look at infection rates for specific demographics. In a completely self-interested fashion, I’m going to look at the infection rates for 30- to 34-year-old non-Hispanic white people:
> new_data <- data.frame(+ Age.Group ="30 to 34",+ Ethnicity ="Non-Hispanic Whites"+)>predict(model4, new_data)153.559
## The model predicts an infection rate of 54 people per 100,000.
## Let’s compare it to the data for that group:>subset(+ gonorrhoea,+ Age.Group =="30 to 34"& Ethnicity =="Non-Hispanic Whites"+)
Year Age.Group Ethnicity Gender Rate
7200730 to 34 Non-Hispanic Whites Male 41.019200730 to 34 Non-Hispanic Whites Female 45.6127200830 to 34 Non-Hispanic Whites Male 35.1139200830 to 34 Non-Hispanic Whites Female 40.8247200930 to 34 Non-Hispanic Whites Male 34.6259200930 to 34 Non-Hispanic Whites Female 33.8367201030 to 34 Non-Hispanic Whites Male 40.8379201030 to 34 Non-Hispanic Whites Female 38.5487201130 to 34 Non-Hispanic Whites Male 48.0499201130 to 34 Non-Hispanic Whites Female 40.7
## 数据解读:The data points range from 34 to 48, so the prediction is a little bit high