Data wrangling
Reshaping data
第一步 transform the data from wide format to long format
# pivot_longer() takes multiple columns and collapses them so that each unique variable has it’s own column and has four main arguments:
#1 data is the name of the object you want to transform
#2 names_to is the name of the new column that you will create that will contain the names of the original wide format columns
#3 values_to is the name of the column that will contain the existing values.
#4 cols are the original columns you want to collapse.
rlong <- pivot_longer(data = responses,
names_to = "Question",
values_to = "Response",
cols = Q1:Q10)
第二步 recreate the example from the Data and only use one participant
# Create a new object called rlong_16 that uses filter() to keep only the data from participant Id 16.
rlong_16 <- filter(rlong, Id == 16)
第三步 match data
# Create a new object called rlong_16_join that uses inner_join() to join together rlong_16 and qformatsby their common column.
rlong_16_join <- inner_join(rlong_16, qformats, "Question")
# Create a new object named scores_16 that joins together rlong_16_join with scoring
scores_16 <- inner_join(rlong_16_join, scoring, c("QFormat", "Response")) # 用c()写出所有要匹配的column;inner_join一次只能合并两张表
第四步 Calculating the AQ score
# Create a new object called AQ_16. Use summarise() and sum()to add up the numbers in the column Score from scores_16 and call the result of this calculation AQ_score.
AQ_16 <- summarise(scores_16, AQ_score = sum(Score))
第五步 Calculating all scores (do the same thing but for all participants,逻辑同第三步和第四步)
rlong_join <- inner_join(rlong, qformats, "Question")
scores <- inner_join(rlong_join, scoring, c("QFormat", "Response"))
scores_grouped <- group_by(scores, Id, gender) # calculate a score for each participant’s gender
AQ_all <- summarise(scores_grouped, total_score = sum(Score))
此外,可以利用pipe连接代码使其更简洁。
rlong <- pivot_longer(data = responses,
names_to = "Question",
values_to = "Response",
cols = Q1:Q10)
rlong_join <- inner_join(rlong, qformats, by = "Question")
scores <- inner_join(rlong_join, scoring, by = c("QFormat", "Response"))
scores_grouped <- group_by(scores, Id, gender)
AQ_all <- summarise(scores_grouped, total_score = sum(Score))
# pipes send the result to the first argument of the next function.
AQ_all <- pivot_longer(data = responses,
names_to = "Question",
values_to = "Response",
cols = Q1:Q10) %>%
inner_join(qformats, by = "Question") %>%
inner_join(scoring, by = c("QFormat", "Response")) %>%
group_by(Id, gender) %>%
summarise(total_score = sum(Score))
# 作图时使用pipe
AQ_all %>%
filter(gender == "female") %>%
ggplot(aes(x = total_score)) +
geom_histogram(binwidth = 1, colour = "black", fill = "grey") +
theme_minimal()+
scale_x_continuous(name = "Total AQ Score (female participants)", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) +
scale_y_continuous(name = "Count")
Data visualisation
作图之前先检查数据类型:
str(summarydata) #作图前观察数据类型
# 将num数据变更为factor数据
summarydata <- summarydata %>%
mutate(sex = as.factor(sex),
educ = as.factor(educ),
income = as.factor(income)) #overwrite the data that is in the column sex with sex as a factor
Bar plot为例,构建ggplot():
ggplot(summarydata, aes(x = sex)) # aes() can take both an x and y argument
ggplot(summarydata, aes(x = sex)) +
geom_bar() # use geom_bar() as we want to draw a bar plot
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar() # Adding fill to the first layer will separate the data into each level of the grouping variable and give it a different colour
# fill() has also produced a plot legend to the right of the graph. When you have multiple grouping variables you need this to know which groups each bit of the plot is referring to.
# If not, you can get rid of it by adding show.legend = FALSE to the geom_bar() code.
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar(show.legend = FALSE)
tidy up our plot to make it look a bit nicer:
# change the axis labels and change the numeric sex codes into words
# name which controls the name of each axis
# labels which controls the names of the break points on the axis
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar(show.legend = FALSE) +
scale_x_discrete(name = "Participant Sex",
labels = c("Female", "Male")) +
scale_y_continuous(name = "Number of participants")
# adjust the colours and the visual style of the plot using theme_minimal()
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar(show.legend = FALSE) +
scale_x_discrete(name = "Participant Sex",
labels = c("Female", "Male")) +
scale_y_continuous(name = "Number of participants") +
theme_minimal() # try typing theme_ into a code chunk and try all the options
# to use a colour-blind friendly palette that can also be read if printed in black-and-white by adding on the function scale_fill_viridis_d()
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar(show.legend = FALSE) +
scale_x_discrete(name = "Participant Sex",
labels = c("Female", "Male")) +
scale_y_continuous(name = "Number of participants") +
theme_minimal() +
scale_fill_viridis_d(option = "E") # This function has 5 colour options, A, B, C, D, and E.
# adjust the transparency of the bars by adding alpha to geom_bar()
ggplot(summarydata, aes(x = sex, fill = sex)) +
geom_bar(show.legend = FALSE, alpha = .8) +
scale_x_discrete(name = "Participant Sex",
labels = c("Female", "Male")) +
scale_y_continuous(name = "Number of participants") +
theme_minimal() +
scale_fill_viridis_d(option = "E")
其他plot类型:
# violin plot and box plot ggplot(summarydata, aes(x = income, y = ahiTotal, fill = income)) + geom_violin(trim = FALSE, show.legend = FALSE, alpha = .4) + # trim表示? geom_boxplot(width = .2, show.legend = FALSE, alpha = .7)+ # 两层geom_代码的顺序决定谁显示在上,后写的层被叠加在先写的层上面 # width表示箱图的横向宽度 scale_x_discrete(name = "Income", labels = c("Below Average", "Average", "Above Average")) + scale_y_continuous(name = "Authentic Happiness Inventory Score")+ theme_minimal() + scale_fill_viridis_d() # scatter plot ggplot(age_65max, aes(x = ahiTotal , y = cesdTotal)) + geom_point(colour = "red") + # add the argument colour = "red" to geom_point to change the colour scale_x_continuous(name = "Happiness Score") + scale_y_continuous(name = "Depression Score") + theme_minimal() # histogram 直方图 ggplot(AQ_all, aes(x = total_score)) + geom_histogram(binwidth = 1, colour = "black", fill = "grey") + # binwidth表示一个直方块的宽度;colour表示直方块边缘的颜色;fill是直方块的填充色 theme_minimal()+ scale_x_continuous(name = "Total AQ Score", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) + scale_y_continuous(name = "Count") # density plot 密度图 ggplot(AQ_all, aes(x = total_score, fill = gender)) + # produce different coloured geoms for each level of gender geom_density(alpha = .3) + # draw the density curve. The argument alpha controls the transparency of the colours theme_minimal() + scale_fill_viridis_d(option = "D") + scale_x_continuous(name = "Total AQ Score", breaks = c(1,2,3,4,5,6,7,8,9,10)) + scale_y_continuous(name = "Density") # the proportion of the data points fall at each point on the x-axis is the proportion multiply by 100 or move the decimal place two places to the right. # so a proportion of .5 = 50%, a proportion of .03 = 3% and so on. # bad bar plot # show the true distribution of the data AQ_all %>% ggplot(aes(x = gender, y = total_score)) + stat_summary(geom = "bar", fun = "mean") + # draw a summary of the data, use a bar chart to do so and the function to display on the y-axis should be the mean. stat_summary(geom = "errorbar", fun.data = "mean_se", width = .2) + # draw another summary but this time use an errorbar and the function to apply to the data is standard error of the mean theme_minimal()
保存做出的图
ggsave("density.png") # by default it will save a copy of the last plot you created. The default size for an image is 7 x 7.
ggsave("density.png", width = 10, height = 8) # overwrite the image file with new dimensions
# save your plot as an object and then tell it which object to write to a file
# saves the pipe plot from Activity 3 into an object named AQ_histogram and then saves it to an image file “AQ_histogram.png
AQ_histogram <- AQ_all %>%
filter(gender == "female") %>%
ggplot(aes(x = total_score)) +
geom_histogram(binwidth = 1, colour = "black", fill = "grey") +
theme_minimal()+
scale_x_continuous(name = "Total AQ Score (female participants)", breaks = c(0,1,2,3,4,5,6,7,8,9,10)) +
scale_y_continuous(name = "Count")
ggsave("AQ_histogram.png", plot = AQ_histogram)
# When you save a plot to an object, it won’t display in R Studio. To get it to display you need to type the object name in the console (i.e., AQ_histogram). The benefit of doing it this way is that if you are making lots of plots, you can’t accidentally save the wrong one because you are explicitly specifying which plot to save rather than just saving the last one.
Intro to Probability
Types of data
Discrete: nominal(categorical) ordinal
Continuous: interval ratio
Probability distributions
The uniform distribution,each outcome has the same chance of occurring.
The binomial distribution,Binomial distributions represent discrete data.
dbinom(x = 5, size = 10, prob = 0.5) #the probability of getting 5 heads out of 10 coin flips to 2 decimal places pbinom(q = 2, size = 10, prob = 0.5) #the probability of getting a maximum of 2 heads on 10 coin flips to 2 decimal places pbinom(q = 7, size = 10, prob = .5) #the wrong way to calculate the probability of getting 7 or more heads on 10 flips pbinom(q = 6, size = 10, prob = .5, lower.tail = FALSE) #the wright way to calculate the probability of getting 7 or more heads on 10 flips qbinom(p = .05, size = 10, prob = .5) #the minimum number of successes (e.g. heads) to maintain an overall probability of .05, in 10 flips, when the probability of a success on any one flip is .5 qbinom(p = .05, size = c(100, 1000, 10000), prob = .5) #the cut-off if you ran 100, 1000 or 10000 trials
The normal distribution,reflects the probability of any value occurring for a continuous variable.
##The average height of a 16-24 year old Scottish man is 176.2 centimetres, with a standard deviation of 6.748. ##The average height of a 16-24 year old Scottish woman is 163.8 cm, with a standard deviation of 6.931. pnorm(q = 176.2, mean = 163.8, sd = 6.931, lower.tail = FALSE) #the probability of meeting a 16-24 y.o. Scottish woman who is taller than the average 16-24 y.o. Scottish man pnorm(q = 181.12, mean = 176.2, sd = 6.748, lower.tail = FALSE) #the proportion of Scottish men Fiona would be willing to date to 2 decimal places pnorm(q = 181.12, mean = 163.8, sd = 6.931, lower.tail = TRUE) #the proportion of Scottish women would Fiona be willing to date to 2 decimal places qnorm(p = 0.05, mean = 176.2, sd = 6.748, lower.tail = FALSE) #how tall a 16-24 y.o. Scottish man would have to be in order to be in the top 5% (i.e., p = .05) of the height distribution for Scottish men in his age group
Simulation
## sample() is used when we want to simulate discrete data, i.e., (nominal or ordinal data).
# Notice that because our event labels are strings (text), we need to enter them into the function in "quotes"
library(tidyverse)
sample(x = c("HEADS", "TAILS"), size = 4, replace = TRUE)
## [1] "TAILS" "HEADS" "TAILS" "HEADS"
sample(x = 0:1, size = 4, replace = TRUE)
## [1] 0 1 0 0
## using ones and zeroes we can count the number of heads by summing the values of the outcomes.
# This code pipes the output of sample() into sum() which counts up the number of heads/1s.
sample(x = 0:1, size = 4, replace = TRUE) %>% sum()
## [1] 2
## repeat the experiment a whole bunch more times.
replicate(n = 20, expr = sample(0:1, 4, TRUE) %>% sum())
## [1] 0 1 2 1 1 4 3 2 2 2 3 1 3 2 1 2 3 2 2 3
Monte Carlo simulation
Every year, the city of Monte Carlo is the site of innumerable games of chance played in its casinos by people from all over the world. This notoriety is reflected in the use of the term “Monte Carlo simulation” among statisticians to refer to using a computer simulation to estimate statistical properties of a random process. In a Monte Carlo simulations, the random process is repeated over and over again in order to assess its performance over a very large number of trials. It is usually used in situations where mathematical solutions are unknown or hard to compute. Now we are ready to use Monte Carlo simulation to demonstrate the probability of various outcomes.
heads50 <- replicate(50, sample(0:1, 4, TRUE) %>% sum())
heads50
## [1] 2 2 2 2 3 3 3 3 1 3 3 1 3 1 2 2 3 1 2 3 3 3 2 3 1 2 2 3 2 3 2 1 3 2 2 3 4 3
## [39] 2 3 3 2 1 2 4 1 1 2 2 1
## estimate the probability of each of the outcomes (0, 1, 2, 3, 4 heads after 4 coin tosses) by counting them up and dividing by the number of experiments.
data50 <- tibble(heads = heads50) %>% # convert to a table
group_by(heads) %>% # group by number of possibilities (0,1,2,3,4)
summarise(n = n(), # count occurances of each possibility,
p=n/50) # & calculate probability (p) of each
## plot a histogram of the outcomes
# Note: stat = "identity" tells ggplot to use the values of the y-axis variable (p) as the height of the bars in our histogram (as opposed to counting the number of occurances of those values)
ggplot(data50, aes(x = heads,y = p)) +
geom_bar(stat = "identity", fill = "purple") +
labs(x = "Number of Heads", y = "Probability of Heads in 4 flips (p)") +
theme_minimal()
If you want reliable estimates, you need a bigger sample to minimise the probability that a possible outcome won’t occur.
heads10K <- replicate(n = 10000, expr = sample(0:1, 4, TRUE) %>% sum())
data10K <- tibble(heads = heads10K) %>%
group_by(heads) %>%
summarise(n = n(), p=n/10000)
ggplot(data10K, aes(heads,p)) +
geom_bar(stat = "identity", fill = "purple") +
labs(x = "Number of Heads", y = "Probability of Heads in 4 flips (p)") +
theme_minimal()
## what is the probability of getting two or more heads in four throws?
data10K %>%
filter(heads >= 2) %>%
summarise(p2 = sum(p))
## # A tibble: 1 x 1
## p2
## <dbl>
## 1 0.688
## You can add probabilities for various outcomes together as long as the outcomes are mutually exclusive, that is, when one outcome occurs, the others can’t occur.
Continuous data simulation
normal <- rnorm(n = 50, mean = 10, sd = 2)
mean(normal)
sd(normal)
tibble(normal = normal) %>% #turn the variable normal into a table and then
ggplot(aes(normal)) + # create a density plot
geom_density(fill = "red") +
theme_minimal()
normal <- rnorm(n = 500, mean = 10, sd = 2)
tibble(normal = normal) %>% #turn the variable normal into a table and then
ggplot(aes(normal)) + # create a density plot
geom_density(fill = "red") +
theme_minimal()
normal <- rnorm(n = 5000, mean = 10, sd = 2)
tibble(normal = normal) %>% #turn the variable normal into a table and then
ggplot(aes(normal)) + # create a density plot
geom_density(fill = "red") +
theme_minimal()
A dataset simulation
Let’s imagine that we’re going to run an experiment to see whether 120 people will roll a higher number on a die if their IQ is higher. This is obviously a stupid experiment but psychology does occasionally do stupid things.
subject_id <- 1:120 # create a variable called subject_id that has the numbers 1 to 120 in it
gender <- rep(x = c("man", "woman", "nonbinary"), each = 40) # create a variable that repeats “man” 40 times, then “women” 40 times, then “non-binary” 40 times
rolls <- sample(x = 1:6, size = 120, replace = TRUE) # simulate them all rolling a dice once
iq <- rnorm(n = 120, mean = 100, sd = 15) # simulate their IQ scores. IQ scores are standardised so that overall, the population has an average IQ of 100 and a SD of 15 so we can use this information to simulate the data with rnorm()
sim_data <- tibble(subject_id, gender, rolls, iq) # stitch all these variables together into a table
sim_data %>%
mutate(rolls = as.factor(rolls)) %>%
ggplot(aes(x = rolls, y = iq, fill = rolls)) +
geom_violin(trim = FALSE, alpha = .6, show.legend = FALSE) +
geom_boxplot(width = .2, show.legend = FALSE) +
scale_fill_viridis_d(option = "E") +
theme_minimal() +
labs(x = "Outcome of dice roll")