17. Exploratory Data Analysis in R Case Study

文章目录

1. Data Cleaning and Summarizing with dplyr

1.1 The United Nations Voting Dataset (video)

1.2 Filtering rows

Instruction :

# Attach dplyr for the data-manipulation verbs used below
library(dplyr)

# Show the raw votes table
votes

# Keep only the rows whose vote code is 1, 2 or 3
# ("yes", "abstain" or "no")
filter(votes, vote %in% c(1, 2, 3))

1.3 Adding a year column

Instruction :

# Drop vote codes above 3, then derive a `year` column from `session`
mutate(
  filter(votes, vote <= 3),
  year = 1945 + session
)

1.4 Adding a country column

Instruction :

# Attach countrycode for translating country identifiers
library(countrycode)

# Translate code 100 from the "cown" scheme into a country name
countrycode(100, origin = "cown", destination = "country.name")

# votes_processed: valid votes only, with `year` and `country` columns added
votes_processed <- votes %>%
  filter(vote <= 3) %>%
  mutate(
    year = session + 1945,
    country = countrycode(ccode, origin = "cown", destination = "country.name")
  )

1.5 Grouping and summarizing (video)

1.6 Summarizing the full dataset

Instruction :

# Show the processed votes
votes_processed

# One-row summary: number of votes and the fraction that were "yes" (code 1)
summarise(
  votes_processed,
  total = n(),
  percent_yes = mean(vote == 1)
)

1.7 Summarizing by year

Instruction :

# Same summary as before, computed separately for each year
votes_processed %>%
  group_by(year) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

1.8 Summarizing by country

Instruction :

# by_country: vote count and fraction of "yes" votes for each country
by_country <- summarise(
  group_by(votes_processed, country),
  total = n(),
  percent_yes = mean(vote == 1)
)

1.9 Sorting and filtering summarized data (video)

1.10 Sorting by percentage of "yes" votes

Instruction :

# Per-country totals and fraction of "yes" votes
by_country <- summarise(
  group_by(votes_processed, country),
  total = n(),
  percent_yes = mean(vote == 1)
)

# Show the per-country summary
by_country

# Lowest fraction of "yes" votes first
arrange(by_country, percent_yes)

# Highest fraction of "yes" votes first
arrange(by_country, desc(percent_yes))

1.11 Filtering summarized output

Instruction :

# Sort by fraction of "yes" votes, then keep countries with at least 100 votes
filter(
  arrange(by_country, percent_yes),
  total >= 100
)

2. Data Visualization with ggplot2

2.1 Visualization with ggplot2 (video)

2.2 Choosing an aesthetic

2.3 Plotting a line over time

Instruction :

# by_year: vote count and fraction of "yes" votes for each year
by_year <- summarise(
  group_by(votes_processed, year),
  total = n(),
  percent_yes = mean(vote == 1)
)

# Attach ggplot2 for plotting
library(ggplot2)

# Line chart of the yearly fraction of "yes" votes
ggplot(data = by_year, mapping = aes(year, percent_yes)) +
  geom_line()

2.4 Other ggplot2 layers

Instruction :

# Yearly fraction of "yes" votes as points, with a smoothed trend on top
ggplot(data = by_year, mapping = aes(x = year, y = percent_yes)) +
  geom_point() +
  geom_smooth()

2.5 Visualizing by country (video)

2.6 Summarizing by year and country

Instruction :

# by_year_country: vote count and fraction of "yes" votes for every
# year/country pair.
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1)) %>%
  # summarize() only peels off the last grouping variable, so the result
  # would still be grouped by year; drop that so later filtering, nesting
  # and modelling steps work on a plain tibble.
  ungroup()

2.7 Plotting just the UK over time

Instruction :

# by_year_country: vote count and fraction of "yes" votes for every
# year/country pair.
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1)) %>%
  # summarize() only peels off the last grouping variable, so the result
  # would still be grouped by year; drop that leftover grouping.
  ungroup()

# Print by_year_country
by_year_country

# UK_by_year: the rows for the United Kingdom only
UK_by_year <- by_year_country %>%
  filter(country == "United Kingdom")

# Line plot of the fraction of "yes" votes over time for the UK only
ggplot(UK_by_year, aes(x = year, y = percent_yes)) +
  geom_line()

2.8 Plotting multiple countries

Instruction :

# The four countries to compare
countries <- c(
  "United States",
  "United Kingdom",
  "France",
  "India"
)

# filtered_4_countries: year/country rows for those countries only
filtered_4_countries <- filter(by_year_country, country %in% countries)

# One coloured line per country showing the fraction of "yes" votes over time
ggplot(
  data = filtered_4_countries,
  mapping = aes(year, percent_yes, color = country)
) +
  geom_line()

2.9 Faceting by country (video)

2.10 Faceting the time series

Instruction :

# The six countries to compare
countries <- c(
  "United States",
  "United Kingdom",
  "France",
  "Japan",
  "Brazil",
  "India"
)

# filtered_6_countries: year/country rows for those countries only
filtered_6_countries <- filter(by_year_country, country %in% countries)

# Fraction of "yes" votes over time, one panel per country
ggplot(data = filtered_6_countries, mapping = aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~country)

2.11 Faceting with free y-axis

Instruction :

# The six countries to compare
countries <- c(
  "United States",
  "United Kingdom",
  "France",
  "Japan",
  "Brazil",
  "India"
)

# filtered_6_countries: year/country rows for those countries only
filtered_6_countries <- filter(by_year_country, country %in% countries)

# One panel per country; each panel gets its own y-axis range
ggplot(
  data = filtered_6_countries,
  mapping = aes(x = year, y = percent_yes)
) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")

2.12 Choose your own countries

Instruction :

# The original six countries plus three additional ones
countries <- c(
  "United States", "United Kingdom", "France",
  "Japan", "Brazil", "India",
  "Chile", "China", "Colombia"
)

# filtered_countries: year/country rows for the chosen countries only
filtered_countries <- filter(by_year_country, country %in% countries)

# One panel per country; each panel gets its own y-axis range
ggplot(
  data = filtered_countries,
  mapping = aes(x = year, y = percent_yes)
) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")

3. Tidy Modeling with Broom

3.1 Linear regression (video)

3.2 Linear regression on the United States

Instruction :

# US_by_year: yearly summary rows for the United States
US_by_year <- filter(by_year_country, country == "United States")

# Show the US rows
US_by_year

# US_fit: linear model of the fraction of "yes" votes against year
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# Full model summary (coefficients, standard errors, p-values, fit statistics)
summary(US_fit)

3.3 Finding the slope of a linear regression

3.4 Finding the p-value of a linear regression

3.5 Tidying models with broom (video)

3.7 Tidying a linear regression model

Instruction :

# Attach broom for turning model objects into data frames
library(broom)

# Coefficient table of the US model as a tidy data frame
tidy(x = US_fit)

3.8 Combining models for multiple countries

Instruction :

# United States: subset the yearly summary, then regress percent_yes on year
US_by_year <- filter(by_year_country, country == "United States")
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# United Kingdom: same subset-and-fit steps
UK_by_year <- filter(by_year_country, country == "United Kingdom")
UK_fit <- lm(percent_yes ~ year, data = UK_by_year)

# Tidy coefficient tables for each model
US_tidied <- tidy(x = US_fit)
UK_tidied <- tidy(x = UK_fit)

# Stack the two coefficient tables into one data frame
bind_rows(US_tidied, UK_tidied)

3.9 Nesting for multiple models (video)

3.10 Nesting a data frame

Instruction :

# Load the tidyr package
library(tidyr)

# Nest all columns besides country into a list-column named `data`.
# The column selection is passed as a named argument: the unnamed
# `nest(-country)` form is deprecated in tidyr >= 1.0 and emits a warning.
by_year_country %>%
  nest(data = -country)

3.11 List columns

Instruction :

# nested: one row per country, with all other columns packed into the
# list-column `data`. The selection is named because the unnamed
# `nest(-country)` form is deprecated in tidyr >= 1.0.
nested <- by_year_country %>%
  nest(data = -country)

# Print the nested data for Brazil. The row is looked up by country name
# rather than by a hard-coded position, so it does not depend on row order.
nested$data[[match("Brazil", nested$country)]]

3.12 Unnesting

Instruction :

# nested: one row per country, with all other columns packed into the
# list-column `data`. The selection is named because the unnamed
# `nest(-country)` form is deprecated in tidyr >= 1.0.
nested <- by_year_country %>%
  nest(data = -country)

# Unnest the data column to return the table to its original form.
# The column is named explicitly: calling unnest() without columns is
# deprecated in tidyr >= 1.0 and emits a warning.
nested %>%
  unnest(data)

3.13 Fitting multiple models (video)

3.14 Performing linear regression on each nested dataset

Instruction :

# Load tidyr and purrr
library(tidyr)
library(purrr)

# Nest everything except country, then fit a linear regression of
# percent_yes on year to each country's nested data frame. The nest()
# selection is named because the unnamed `nest(-country)` form is
# deprecated in tidyr >= 1.0.
by_year_country %>%
  nest(data = -country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)))
3.15 Tidy each linear regression model

Instruction :

# Load the broom package
library(broom)

# Fit one linear model per country, then apply tidy() to each model so
# that every row also carries its coefficient table. The nest() selection
# is named because the unnamed `nest(-country)` form is deprecated in
# tidyr >= 1.0.
by_year_country %>%
  nest(data = -country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .))) %>%
  mutate(tidied = map(model, tidy))
3.16 Unnesting a data frame

Instruction :

# country_coefficients: one row per country and model term.
# Steps: nest per country, fit percent_yes ~ year on each nested data
# frame, tidy each model, then unnest the tidied coefficient tables.
# The nest() selection is named because the unnamed `nest(-country)` form
# is deprecated in tidyr >= 1.0.
country_coefficients <- by_year_country %>%
  nest(data = -country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)),
         tidied = map(model, tidy)) %>%
  unnest(tidied)


# Print the resulting country_coefficients variable
country_coefficients
3.17 Working with many tidy models (video)
3.18 Filtering model terms

Instruction :

# Show every coefficient row (intercepts and slopes)
country_coefficients

# Keep only the rows for the `year` term, i.e. the slopes
filter(country_coefficients, term == "year")
3.19 Filtering for significant countries

Instruction :

# slope_terms: the `year` coefficient of every country model
slope_terms <- filter(country_coefficients, term == "year")

# Adjust the p-values for multiple testing, then keep slopes whose
# adjusted p-value is below 0.05
filter(
  mutate(slope_terms, p.adjusted = p.adjust(p.value)),
  p.adjusted < .05
)
3.20 Sorting by slope

Instruction :

# filtered_countries: slope terms whose multiple-testing-adjusted p-value
# is below 0.05
filtered_countries <- country_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)

# Sort for the countries increasing most quickly:
# descending order puts the largest positive slopes first
filtered_countries %>%
  arrange(desc(estimate))


# Sort for the countries decreasing most quickly:
# ascending order puts the most negative slopes first
filtered_countries %>%
  arrange(estimate)

4. Joining and Tidying

4.1 Joining datasets (video)
4.2 Joining datasets with inner_join

Instruction :

# Attach dplyr for the join verbs
library(dplyr)

# Show the processed votes
votes_processed

# Show the descriptions table
descriptions

# votes_joined: votes matched to their descriptions; only rows whose
# "rcid" and "session" values appear in both tables are kept
votes_joined <- inner_join(
  votes_processed,
  descriptions,
  by = c("rcid", "session")
)
4.3 Filtering the joined dataset

Instruction :

# Keep only the votes flagged as colonialism-related (co equals 1)
filter(votes_joined, co == 1)
4.4 Visualizing colonialism votes

Instruction :

# Attach ggplot2 for plotting
library(ggplot2)

# US_co_by_year: yearly fraction of "yes" votes cast by the United States
# on colonialism-related votes
US_co_by_year <- votes_joined %>%
  filter(country == "United States", co == 1) %>%
  group_by(year) %>%
  summarise(percent_yes = mean(vote == 1))

# Line chart of that fraction over time
ggplot(data = US_co_by_year, mapping = aes(year, percent_yes)) +
  geom_line()
4.5 Tidy data (video)
4.6 Tidy data observations
4.7 Using gather to tidy a dataset

Instruction :

# Attach tidyr for reshaping
library(tidyr)

# Stack the six topic columns (me through ec) into a `topic` key column
# and a `has_topic` value column
gather(votes_joined, key = topic, value = has_topic, me:ec)


# votes_gathered: the same reshape, keeping only rows where the topic applies
votes_gathered <- filter(
  gather(votes_joined, key = topic, value = has_topic, me:ec),
  has_topic == 1
)
4.8 Recoding the topics

Instruction :

# votes_tidied: votes_gathered with each two-letter topic code swapped for
# its descriptive label
votes_tidied <- mutate(
  votes_gathered,
  topic = recode(
    topic,
    me = "Palestinian conflict",
    nu = "Nuclear weapons and nuclear material",
    di = "Arms control and disarmament",
    hr = "Human rights",
    co = "Colonialism",
    ec = "Economic development"
  )
)
4.9 Summarize by country, year, and topic

Instruction :

# Show the tidied votes
votes_tidied

# by_country_year_topic: vote count and fraction of "yes" votes for every
# country/year/topic combination, returned without any grouping
by_country_year_topic <- votes_tidied %>%
  group_by(country, year, topic) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  ) %>%
  ungroup()

# Show the summary
by_country_year_topic
4.10 Visualizing trends in topics for one country

Instruction :

# Attach ggplot2 for plotting
library(ggplot2)

# US_by_country_year_topic: the summary rows for the United States only
US_by_country_year_topic <- filter(
  by_country_year_topic,
  country == "United States"
)

# US fraction of "yes" votes over time, one panel per topic
ggplot(
  data = US_by_country_year_topic,
  mapping = aes(year, percent_yes)
) +
  geom_line() +
  facet_wrap(~topic)
4.11 Tidy modeling by topic and country (video)
4.12 Nesting by topic and country

Instruction :

# Load purrr, tidyr, and broom
library(purrr)
library(tidyr)
library(broom)

# Print by_country_year_topic
by_country_year_topic

# country_topic_coefficients: one row per country, topic and model term.
# Steps: nest everything except country and topic, fit percent_yes ~ year
# on each nested data frame, tidy each model, then unnest the tidied
# coefficient tables. The nest() selection is named because the unnamed
# `nest(-country, -topic)` form is deprecated in tidyr >= 1.0.
country_topic_coefficients <- by_country_year_topic %>%
  nest(data = -c(country, topic)) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)),
         tidied = map(model, tidy)) %>%
  unnest(tidied)

# Print country_topic_coefficients
country_topic_coefficients
4.13 Interpreting tidy models

Instruction :

# country_topic_filtered: slope (`year`) terms whose adjusted p-value is
# below 0.05
country_topic_filtered <- filter(
  mutate(
    filter(country_topic_coefficients, term == "year"),
    p.adjusted = p.adjust(p.value)
  ),
  p.adjusted < .05
)
4.14 Steepest trends by topic
4.15 Checking models visually

Instruction :

# vanuatu_by_country_year_topic: the summary rows for Vanuatu only.
# The filtered result must be assigned, because the plot below reads it
# by this name.
vanuatu_by_country_year_topic <- by_country_year_topic %>%
  filter(country == "Vanuatu")

# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~topic)
4.16 Conclusion
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值