R语言入门(Grassroots)

Data wrangling 

 

 

Reshaping data

第一步 transform the data from wide format to long format

# pivot_longer() takes multiple columns and collapses them so that each unique variable has it’s own column and has four main arguments:
#1 data is the name of the object you want to transform
#2 names_to is the name of the new column that you will create that will contain the names of the original wide format columns
#3 values_to is the name of the column that will contain the existing values.
#4 cols are the original columns you want to collapse.

rlong <- pivot_longer(data = responses, 
                      names_to = "Question",  
                      values_to = "Response",
                      cols = Q1:Q10)

第二步 recreate the example from the Data and only use one participant

# Create a new object called rlong_16 that uses filter() to keep only the data from participant Id 16.
rlong_16 <- filter(rlong, Id == 16) 

 第三步 match data

# Create a new object called rlong_16_join that uses inner_join() to join together rlong_16 and qformatsby their common column.
rlong_16_join <- inner_join(rlong_16, qformats, "Question")

# Create a new object named scores_16 that joins together rlong_16_join with scoring
scores_16 <- inner_join(rlong_16_join, scoring, c("QFormat", "Response")) # 用c()写出所有要匹配的column;inner_join一次只能合并两张表

第四步 Calculating the AQ score

# Create a new object called AQ_16. Use summarise() and sum()to add up the numbers in the column Score from scores_16 and call the result of this calculation AQ_score.
AQ_16 <- summarise(scores_16, AQ_score = sum(Score))

第五步 Calculating all scores (do the same thing but for all participants,逻辑同第三步和第四步)

rlong_join <- inner_join(rlong, qformats, "Question")
scores <- inner_join(rlong_join, scoring, c("QFormat", "Response"))

scores_grouped <- group_by(scores, Id, gender) # calculate a score for each participant’s gender
AQ_all <- summarise(scores_grouped, total_score = sum(Score))

 

此外,可以利用pipe连接代码使其更简洁。

rlong <- pivot_longer(data = responses, 
                      names_to = "Question",  
                      values_to = "Response",
                      cols = Q1:Q10)

rlong_join <- inner_join(rlong, qformats, by = "Question")

scores <- inner_join(rlong_join, scoring, by = c("QFormat", "Response"))

scores_grouped <- group_by(scores, Id, gender)

AQ_all <- summarise(scores_grouped, total_score = sum(Score))

# pipes send the result to the first argument of the next function.
AQ_all <- pivot_longer(data = responses, 
                      names_to = "Question",  
                      values_to = "Response",
                      cols = Q1:Q10) %>%
  inner_join(qformats, by = "Question") %>%
  inner_join(scoring, by = c("QFormat", "Response")) %>%
  group_by(Id, gender) %>%
  summarise(total_score = sum(Score))

# 作图时使用pipe
AQ_all %>%
  filter(gender == "female") %>%
  ggplot(aes(x = total_score)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "grey") +
  theme_minimal()+
  scale_x_continuous(name = "Total AQ Score (female participants)", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) +
  scale_y_continuous(name = "Count")

 

Data visualisation

作图之前先检查数据类型:

str(summarydata) #作图前观察数据类型

# 将num数据变更为factor数据
summarydata <- summarydata %>%
  mutate(sex = as.factor(sex),
         educ = as.factor(educ),
         income = as.factor(income)) #overwrite the data that is in the column sex with sex as a factor

 

Bar plot为例,构建ggplot():

ggplot(summarydata, aes(x = sex)) # aes() can take both an x and y argument
ggplot(summarydata, aes(x = sex)) +
  geom_bar() # use geom_bar() as we want to draw a bar plot
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar() # Adding fill to the first layer will separate the data into each level of the grouping variable and give it a different colour
             # fill() has also produced a plot legend to the right of the graph. When you have multiple grouping variables you need this to know which groups each bit of the plot is referring to.

# If not, you can get rid of it by adding show.legend = FALSE to the geom_bar() code.
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE)

tidy up our plot to make it look a bit nicer:

# change the axis labels and change the numeric sex codes into words
# name which controls the name of each axis
# labels which controls the names of the break points on the axis
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_x_discrete(name = "Participant Sex", 
                   labels = c("Female", "Male")) +
  scale_y_continuous(name = "Number of participants")
# adjust the colours and the visual style of the plot using theme_minimal()
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_x_discrete(name = "Participant Sex", 
                   labels = c("Female", "Male")) +
  scale_y_continuous(name = "Number of participants") +
  theme_minimal() # try typing theme_ into a code chunk and try all the options

# to use a colour-blind friendly palette that can also be read if printed in black-and-white by adding on the function scale_fill_viridis_d()
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_x_discrete(name = "Participant Sex", 
                   labels = c("Female", "Male")) +
  scale_y_continuous(name = "Number of participants") +
  theme_minimal() +
  scale_fill_viridis_d(option = "E") # This function has 5 colour options, A, B, C, D, and E.
# adjust the transparency of the bars by adding alpha to geom_bar()
ggplot(summarydata, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE, alpha = .8) +
  scale_x_discrete(name = "Participant Sex", 
                   labels = c("Female", "Male")) +
  scale_y_continuous(name = "Number of participants") +
  theme_minimal() +
  scale_fill_viridis_d(option = "E")

 

其他plot类型:

# violin plot and box plot
ggplot(summarydata, aes(x = income, y = ahiTotal, fill = income)) +
  geom_violin(trim = FALSE, show.legend = FALSE, alpha = .4) + # trim表示?
  geom_boxplot(width = .2, show.legend = FALSE, alpha = .7)+  # 两层geom_代码的顺序决定谁显示在上,后写的层被叠加在先写的层上面
                                                              # width表示箱图的横向宽度
  scale_x_discrete(name = "Income",
                   labels = c("Below Average", "Average", "Above Average")) +
  scale_y_continuous(name = "Authentic Happiness Inventory Score")+
  theme_minimal() +
  scale_fill_viridis_d()

# scatter plot
ggplot(age_65max, aes(x = ahiTotal , y = cesdTotal)) + 
  geom_point(colour = "red") +                         # add the argument colour = "red" to geom_point  to change the colour
  scale_x_continuous(name = "Happiness Score") +
  scale_y_continuous(name = "Depression Score") +
  theme_minimal()

# histogram 直方图
ggplot(AQ_all, aes(x = total_score)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "grey") +  # binwidth表示一个直方块的宽度;colour表示直方块边缘的颜色;fill是直方块的填充色
  theme_minimal()+
  scale_x_continuous(name = "Total AQ Score", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) +
  scale_y_continuous(name = "Count")

# density plot 密度图
ggplot(AQ_all, aes(x = total_score, fill = gender)) + # produce different coloured geoms for each level of gender
  geom_density(alpha = .3) + # draw the density curve. The argument alpha controls the transparency of the colours
  theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_x_continuous(name = "Total AQ Score", breaks = c(1,2,3,4,5,6,7,8,9,10)) +
  scale_y_continuous(name = "Density")
# the proportion of the data points fall at each point on the x-axis is the proportion multiply by 100 or move the decimal place two places to the right.
# so a proportion of .5 = 50%, a proportion of .03 = 3% and so on.

# bad bar plot
# show the true distribution of the data
AQ_all %>%
  ggplot(aes(x = gender, y = total_score)) +
  stat_summary(geom = "bar", fun = "mean") +  # draw a summary of the data, use a bar chart to do so and the function to display on the y-axis should be the mean.
  stat_summary(geom = "errorbar", fun.data = "mean_se", width = .2) + # draw another summary but this time use an errorbar and the function to apply to the data is standard error of the mean
  theme_minimal()




 

保存做出的图

ggsave("density.png") # by default it will save a copy of the last plot you created. The default size for an image is 7 x 7.
ggsave("density.png", width = 10, height = 8) # overwrite the image file with new dimensions

# save your plot as an object and then tell it which object to write to a file
# saves the pipe plot from Activity 3 into an object named AQ_histogram and then saves it to an image file “AQ_histogram.png
AQ_histogram <- AQ_all %>%
  filter(gender == "female") %>%
  ggplot(aes(x = total_score)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "grey") +
  theme_minimal()+
  scale_x_continuous(name = "Total AQ Score (female participants)", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) +
  scale_y_continuous(name = "Count")

ggsave("AQ_histogram.png", plot = AQ_histogram)
# When you save a plot to an object, it won’t display in R Studio. To get it to display you need to type the object name in the console (i.e., AQ_histogram). The benefit of doing it this way is that if you are making lots of plots, you can’t accidentally save the wrong one because you are explicitly specifying which plot to save rather than just saving the last one.

 

Intro to Probability

Types of data

Discrete: nominal(categorical)  ordinal

Continuous: interval  ratio

 

Probability distributions

The uniform distribution,each outcome has the same chance of occurring.

The binomial distribution,Binomial distributions represent discrete data.

 

dbinom(x = 5, size = 10, prob = 0.5) #the probability of getting 5 heads out of 10 coin flips to 2 decimal places

pbinom(q = 2, size = 10, prob = 0.5) #the probability of getting a maximum of 2 heads on 10 coin flips to 2 decimal places
pbinom(q = 7, size = 10, prob = .5) #the wrong way to calculate the probability of getting 7 or more heads on 10 flips
pbinom(q = 6, size = 10, prob = .5, lower.tail = FALSE) #the wright way to calculate the probability of getting 7 or more heads on 10 flips

qbinom(p = .05, size = 10, prob = .5) #the minimum number of successes (e.g. heads) to maintain an overall probability of .05, in 10 flips, when the probability of a success on any one flip is .5
qbinom(p = .05, size = c(100, 1000, 10000), prob = .5) #the cut-off if you ran 100, 1000 or 10000 trials

The normal distribution,reflects the probability of any value occurring for a continuous variable.

 

##The average height of a 16-24 year old Scottish man is 176.2 centimetres, with a standard deviation of 6.748.
##The average height of a 16-24 year old Scottish woman is 163.8 cm, with a standard deviation of 6.931.

pnorm(q = 176.2, mean = 163.8, sd = 6.931, lower.tail = FALSE) #the probability of meeting a 16-24 y.o. Scottish woman who is taller than the average 16-24 y.o. Scottish man
pnorm(q = 181.12, mean = 176.2, sd = 6.748, lower.tail = FALSE) #the proportion of Scottish men Fiona would be willing to date to 2 decimal places
pnorm(q = 181.12, mean = 163.8, sd = 6.931, lower.tail = TRUE) #the proportion of Scottish women would Fiona be willing to date to 2 decimal places

qnorm(p = 0.05, mean = 176.2, sd = 6.748, lower.tail = FALSE) #how tall a 16-24 y.o. Scottish man would have to be in order to be in the top 5% (i.e., p = .05) of the height distribution for Scottish men in his age group

Simulation

## sample() is used when we want to simulate discrete data, i.e., (nominal or ordinal data).
# Notice that because our event labels are strings (text), we need to enter them into the function in "quotes" 
library(tidyverse)
sample(x = c("HEADS", "TAILS"), size = 4, replace = TRUE) 
## [1] "TAILS" "HEADS" "TAILS" "HEADS"
sample(x = 0:1, size = 4, replace = TRUE)
## [1] 0 1 0 0

## using ones and zeroes we can count the number of heads by summing the values of the outcomes.
# This code pipes the output of sample() into sum() which counts up the number of heads/1s.
sample(x = 0:1, size = 4, replace = TRUE) %>% sum()
## [1] 2

## repeat the experiment a whole bunch more times.
replicate(n = 20, expr = sample(0:1, 4, TRUE) %>% sum())
##  [1] 0 1 2 1 1 4 3 2 2 2 3 1 3 2 1 2 3 2 2 3

Monte Carlo simulation

Every year, the city of Monte Carlo is the site of innumerable games of chance played in its casinos by people from all over the world. This notoriety is reflected in the use of the term “Monte Carlo simulation” among statisticians to refer to using a computer simulation to estimate statistical properties of a random process. In a Monte Carlo simulations, the random process is repeated over and over again in order to assess its performance over a very large number of trials. It is usually used in situations where mathematical solutions are unknown or hard to compute. Now we are ready to use Monte Carlo simulation to demonstrate the probability of various outcomes.

heads50 <- replicate(50, sample(0:1, 4, TRUE) %>% sum())   
heads50
##  [1] 2 2 2 2 3 3 3 3 1 3 3 1 3 1 2 2 3 1 2 3 3 3 2 3 1 2 2 3 2 3 2 1 3 2 2 3 4 3
## [39] 2 3 3 2 1 2 4 1 1 2 2 1

## estimate the probability of each of the outcomes (0, 1, 2, 3, 4 heads after 4 coin tosses) by counting them up and dividing by the number of experiments.
data50 <- tibble(heads = heads50) %>%   # convert to a table
                group_by(heads) %>%   # group by number of possibilities (0,1,2,3,4)
                summarise(n = n(), # count occurances of each possibility,
                          p=n/50) # & calculate probability (p) of each

## plot a histogram of the outcomes
# Note: stat = "identity" tells  ggplot to use the values of the y-axis variable (p) as the height of the bars in our histogram (as opposed to counting the number of occurances of those values)

ggplot(data50, aes(x = heads,y = p)) + 
  geom_bar(stat = "identity", fill = "purple") + 
  labs(x = "Number of Heads", y = "Probability of Heads in 4 flips (p)") +
  theme_minimal()

If you want reliable estimates, you need a bigger sample to minimise the probability that a possible outcome won’t occur.

heads10K <- replicate(n = 10000, expr = sample(0:1, 4, TRUE) %>% sum())   
data10K <- tibble(heads = heads10K) %>%   
                group_by(heads) %>%           
                summarise(n = n(), p=n/10000) 
ggplot(data10K, aes(heads,p)) + 
  geom_bar(stat = "identity", fill = "purple") + 
  labs(x = "Number of Heads", y = "Probability of Heads in 4 flips (p)") +
  theme_minimal()

## what is the probability of getting two or more heads in four throws? 
data10K %>%
  filter(heads >= 2) %>%
  summarise(p2 = sum(p))
## # A tibble: 1 x 1
##      p2
##   <dbl>
## 1 0.688
## You can add probabilities for various outcomes together as long as the outcomes are mutually exclusive, that is, when one outcome occurs, the others can’t occur. 

Continuous data simulation

normal <- rnorm(n = 50, mean = 10, sd = 2)
mean(normal)
sd(normal)
tibble(normal = normal) %>% #turn the variable normal into a table and then
  ggplot(aes(normal)) + # create a density plot
  geom_density(fill = "red") +
  theme_minimal()

normal <- rnorm(n = 500, mean = 10, sd = 2)
tibble(normal = normal) %>% #turn the variable normal into a table and then
  ggplot(aes(normal)) + # create a density plot
  geom_density(fill = "red") +
  theme_minimal()

normal <- rnorm(n = 5000, mean = 10, sd = 2)
tibble(normal = normal) %>% #turn the variable normal into a table and then
  ggplot(aes(normal)) + # create a density plot
  geom_density(fill = "red") +
  theme_minimal()

A dataset simulation

Let’s imagine that we’re going to run an experiment to see whether 120 people will roll a higher number on a die if their IQ is higher. This is obviously a stupid experiment but psychology does occasionally do stupid things.

subject_id <- 1:120 # create a variable called subject_id that has the numbers 1 to 120 in it
gender <- rep(x = c("man", "woman", "nonbinary"), each = 40) # create a variable that repeats “man” 40 times, then “women” 40 times, then “non-binary” 40 times
rolls <- sample(x = 1:6, size = 120, replace = TRUE) # simulate them all rolling a dice once
iq <- rnorm(n = 120, mean = 100, sd = 15) # simulate their IQ scores. IQ scores are standardised so that overall, the population has an average IQ of 100 and a SD of 15 so we can use this information to simulate the data with rnorm()
sim_data <- tibble(subject_id, gender, rolls, iq) # stitch all these variables together into a table

sim_data %>%
  mutate(rolls = as.factor(rolls)) %>%
  ggplot(aes(x = rolls, y = iq, fill = rolls)) +
  geom_violin(trim = FALSE, alpha = .6, show.legend = FALSE) +
  geom_boxplot(width = .2, show.legend = FALSE) +
  scale_fill_viridis_d(option = "E") +
  theme_minimal() +
  labs(x = "Outcome of dice roll")

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值