Setting up / loading packages

knitr::opts_chunk$set(echo=TRUE,  message=FALSE, warning=FALSE)
library(tidyverse)
library(tidymodels)
library(ggrepel)

Reading files

# Directory that holds the input files -- change it to wherever you keep yours.
# Keep the trailing slash: file names are appended to it with paste0().
my_home <- "~/Google Drive/My Drive/extracurricular/Presentations/"

# Data sources:
#   state area measurements: https://www.census.gov/geographies/reference-files/2010/geo/state-area.html
#   state abbreviations:     https://about.usps.com/who-we-are/postal-history/state-abbreviations.htm
# Both are read as tab-separated files whose first row holds the column names.
state_areas <- my_home %>%
  paste0("state_areas.tsv") %>%
  read_tsv(col_names = TRUE, show_col_types = FALSE)
state_abbrev <- my_home %>%
  paste0("state_abbreviations.tsv") %>%
  read_tsv(col_names = TRUE, show_col_types = FALSE)

Using tidyverse packages to wrangle and display data

1. What does our data look like?

Check the shape, display rows in a chosen order, and use specialized functions like glimpse()
# dimensions (rows, columns) of each table
state_areas %>% dim()
## [1] 57  7
state_abbrev %>% dim()
## [1] 52  6
# peek at the first rows
state_abbrev %>% head()
# sort the rows by Name in descending order, then show the first rows
head(arrange(state_areas, desc(Name)))
# glimpse is another way to see the column types
glimpse(state_areas)
## Rows: 57
## Columns: 7
## $ Name              <chr> "United States", "Alabama", "Alaska", "Arizona", "Ar…
## $ Total_Sq_Mi       <dbl> 3796742, 52420, 665384, 113990, 53179, 163695, 10409…
## $ Total_Sq_Km       <dbl> 9833517, 135767, 1723337, 295234, 137732, 423967, 26…
## $ Land_Sq_Mi        <dbl> 3531905, 50645, 570641, 113594, 52035, 155779, 10364…
## $ Land_Sq_Km        <dbl> 9147593, 131171, 1477953, 294207, 134771, 403466, 26…
## $ Water_Total_Sq_Mi <dbl> 264837, 1775, 94743, 396, 1143, 7916, 452, 701, 540,…
## $ Water_Total_Sq_Km <dbl> 685924, 4597, 245383, 1026, 2961, 20501, 1170, 1816,…
Manipulate dataframe using filter(), select(), rename(); add the variables we want with left_join() and mutate()
# Build a made-up category -- "water state" or "dry state" -- from the share
# of each entry's total area that is water (total water area / total area).
states_pct_water <- state_areas %>%
  # drop the "United States" and "District of Columbia" rows
  filter((Name != "United States") & (Name != "District of Columbia")) %>%
  select(Name, ends_with("Mi")) %>% # keep the area data in miles
  mutate(
    pct_water = 100 * (Water_Total_Sq_Mi / Total_Sq_Mi),
    # pct_water is already numeric, so no as.numeric() coercion is needed;
    # if_else() additionally checks that both outcomes share one type
    category = if_else(pct_water > 20, "water state", "dry state")
  )

# Clean up the abbreviations table: keep Name plus the column whose name ends
# in "present" (the present abbreviation), renaming it in the same select()
# call (new name = old name).
state_abbrev <- state_abbrev %>%
  select(Name, abbreviation = ends_with("present")) %>%
  mutate(abbreviation = str_sub(abbreviation, 1, 2)) # keep the first two characters only

# Join the abbreviations onto the water table; left_join keeps every row of
# states_pct_water, leaving abbreviation as NA where no Name matches.
states_pct_water <- states_pct_water %>%
  left_join(state_abbrev, by = "Name")
head(states_pct_water)
# Sanity-check the percentage calculation and the join: are there any NAs,
# or did we join on the wrong key?
# vis_miss() (from naniar) plots the missing values of the data frame.
library(naniar)
vis_miss(states_pct_water)

# In addition to glimpse, another overview method from outside the tidyverse
# is skim() (from skimr); its summary is shown below.
library(skimr)
skim(states_pct_water) 
Data summary
Name states_pct_water
Number of rows 55
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1.00 4 24 0 55 0
category 0 1.00 9 11 0 2 0
abbreviation 4 0.93 2 2 0 51 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Total_Sq_Mi 0 1 69197.49 95173.42 571.00 18318.00 53819.00 82923.5 665384.00 ▇▁▁▁▁
Land_Sq_Mi 0 1 64288.58 84220.46 76.00 16872.50 50645.00 78225.5 570641.00 ▇▁▁▁▁
Water_Total_Sq_Mi 0 1 4908.96 13637.82 192.00 596.00 1494.00 3625.0 94743.00 ▇▁▁▁▁
pct_water 0 1 14.09 21.70 0.24 1.78 4.16 16.5 90.74 ▇▁▁▁▁
# keep only the rows whose abbreviation is not NA, i.e. those that found a
# match in the abbreviations table
states_pct_water <- states_pct_water %>%
  filter(!is.na(abbreviation))
states_pct_water %>% head()

2. Plotting associations between variables

Starting to ask more pointed questions of our data, use group_by() and summarize() to get summary statistics
# one row per category, holding the mean of Total_Sq_Mi for that category
states_category_mean_area <- summarize(
  group_by(states_pct_water, category),
  mean_total_area = mean(Total_Sq_Mi)
)

states_category_mean_area
Changing the shape of our dataframes with pivot_longer() and pivot_wider() (formerly gather() and spread())
# pivot_longer (formerly "gather"): stack every column ending in "Mi" into
# name/value pairs -- the column name goes to area_type, its value to area_sq_mi.
# Having the area type as a single column makes it easier to use in plot legends.
states_pct_water_long <- pivot_longer(
  states_pct_water,
  cols = ends_with("Mi"),
  names_to = "area_type",
  values_to = "area_sq_mi"
)
states_pct_water_long %>% head()
A sampling of what we can visualize with ggplot2
# Histogram of our made-up metric, "pct_water"
ggplot(states_pct_water, aes(x = pct_water)) +
  geom_histogram()

# What is the overall distribution of areas? Split it by area type.
# facet_wrap draws one panel per area_type; scales = "free" lets every panel
# pick its own axis ranges.
states_pct_water_long %>%
  ggplot(aes(x = area_sq_mi)) +
  geom_histogram() +
  facet_wrap(vars(area_type), scales = "free")

# Same panels, drawn as density curves instead of histograms
states_pct_water_long %>%
  ggplot(aes(x = area_sq_mi)) +
  geom_density() +
  facet_wrap(vars(area_type), scales = "free")