knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)
library(tidyverse)
library(tidymodels)
library(ggrepel)
# Substitute this for the path where you keep your files
my_home <- "~/Google Drive/My Drive/extracurricular/Presentations/"
# State area measurements from https://www.census.gov/geographies/reference-files/2010/geo/state-area.html
# State abbreviations from https://about.usps.com/who-we-are/postal-history/state-abbreviations.htm
state_areas <- read_tsv(paste0(my_home, "state_areas.tsv"), col_names = TRUE, show_col_types = FALSE)
state_abbrev <- read_tsv(paste0(my_home, "state_abbreviations.tsv"), col_names = TRUE, show_col_types = FALSE)
glimpse()
# check shape
dim(state_areas)
## [1] 57 7
dim(state_abbrev)
## [1] 52 6
# displaying
head(state_abbrev)
# changing the order of the rows & displaying
state_areas %>% arrange(desc(Name)) %>% head()
# glimpse is another way to see column types
glimpse(state_areas)
## Rows: 57
## Columns: 7
## $ Name <chr> "United States", "Alabama", "Alaska", "Arizona", "Ar…
## $ Total_Sq_Mi <dbl> 3796742, 52420, 665384, 113990, 53179, 163695, 10409…
## $ Total_Sq_Km <dbl> 9833517, 135767, 1723337, 295234, 137732, 423967, 26…
## $ Land_Sq_Mi <dbl> 3531905, 50645, 570641, 113594, 52035, 155779, 10364…
## $ Land_Sq_Km <dbl> 9147593, 131171, 1477953, 294207, 134771, 403466, 26…
## $ Water_Total_Sq_Mi <dbl> 264837, 1775, 94743, 396, 1143, 7916, 452, 701, 540,…
## $ Water_Total_Sq_Km <dbl> 685924, 4597, 245383, 1026, 2961, 20501, 1170, 1816,…
filter()
, select()
, rename()
; add the variables we want with left_join()
and mutate()
# we'll come up with a made-up category -- "dry" or "water" states
# based on the total water area / total area
states_pct_water <- state_areas %>%
filter((Name != "United States") & (Name != "District of Columbia")) %>%
select(Name, ends_with("Mi")) %>% # keep the area data in miles
mutate(pct_water = 100 * (Water_Total_Sq_Mi / Total_Sq_Mi),
category = ifelse(as.numeric(pct_water) > 20, "water state", "dry state"))
# cleaning up the abbreviations dataframe, renaming the column we're interested in (present abbreviation)
state_abbrev <- state_abbrev %>%
select(Name, ends_with("present")) %>%
rename(abbreviation = ends_with("present")) %>% #new name = old name
mutate(abbreviation = str_sub(abbreviation,1,2)) #make sure abbreviation is two letters
# joining to our water table
states_pct_water <- left_join(states_pct_water, state_abbrev, by = "Name")
head(states_pct_water)
# see how our percentage calculation & join went -- any NA's or join by wrong key?
library(naniar)
vis_miss(states_pct_water)
# in addition to glimpse, another method from outside the tidyverse is skim
library(skimr)
skim(states_pct_water)
Name | states_pct_water |
Number of rows | 55 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Name | 0 | 1.00 | 4 | 24 | 0 | 55 | 0 |
category | 0 | 1.00 | 9 | 11 | 0 | 2 | 0 |
abbreviation | 4 | 0.93 | 2 | 2 | 0 | 51 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Total_Sq_Mi | 0 | 1 | 69197.49 | 95173.42 | 571.00 | 18318.00 | 53819.00 | 82923.5 | 665384.00 | ▇▁▁▁▁ |
Land_Sq_Mi | 0 | 1 | 64288.58 | 84220.46 | 76.00 | 16872.50 | 50645.00 | 78225.5 | 570641.00 | ▇▁▁▁▁ |
Water_Total_Sq_Mi | 0 | 1 | 4908.96 | 13637.82 | 192.00 | 596.00 | 1494.00 | 3625.0 | 94743.00 | ▇▁▁▁▁ |
pct_water | 0 | 1 | 14.09 | 21.70 | 0.24 | 1.78 | 4.16 | 16.5 | 90.74 | ▇▁▁▁▁ |
# exclude entries that do not have a matching abbreviation
states_pct_water <- filter(states_pct_water, !is.na(abbreviation))
head(states_pct_water)
group_by()
and summarize()
to get summary statistics# grouping and summarizing
states_category_mean_area <- states_pct_water %>%
group_by(category) %>%
summarize(mean_total_area = mean(Total_Sq_Mi))
states_category_mean_area
pivot_longer()
and pivot_wider()
(formerly spread()
and gather()
)# pivot_longer (formerly "gather")
# this lets us work with the legend of different types of areas more easily
states_pct_water_long <- states_pct_water %>%
pivot_longer(cols = ends_with("Mi"), names_to = "area_type", values_to = "area_sq_mi")
head(states_pct_water_long)
ggplot2
# A simple histogram of our made-up metric, "pct_water"
ggplot(data = states_pct_water) +
geom_histogram(mapping = aes(x = pct_water))
# what is the overall distribution of areas? differentiate by water/land
# facet_wrap -- somewhat independent plots by category
ggplot(data = states_pct_water_long, mapping = aes(x = area_sq_mi)) +
geom_histogram() +
facet_wrap(~area_type, scales = "free")
ggplot(data = states_pct_water_long, mapping = aes(x = area_sq_mi)) +
geom_density() +
facet_wrap(~area_type, scales = "free")
# do our made up "dry states" have less water after all?
# plot together but split by "fill" as opposed to facet_wrap; can't set independent axes in this
ggplot(data = states_pct_water_long, mapping = aes(x = category, y = area_sq_mi, fill = area_type)) +
geom_violin() +
scale_y_log10()
ggplot(data = states_pct_water_long, mapping = aes(x = category, y = area_sq_mi)) +
geom_violin(mapping = aes(fill = area_type)) +
geom_boxplot() + # because the fill is not inherited from ggplot() call, boxplots are of overall areas
geom_jitter(width = 0.05) +
scale_y_log10()
ggplot(data = states_pct_water_long) +
geom_bar(mapping = aes(x = category, y = area_sq_mi, fill = area_type),
position = "fill", stat = "identity")
sp <- ggplot(states_pct_water) +
geom_point(mapping = aes(x = Total_Sq_Mi, y = pct_water))
sp
# adding a scale to the dots
get_closest_quantile <- function(my_state_area){
# generate quantiles by 10%
q <- quantile(states_pct_water$Total_Sq_Mi, probs = seq(0,1,by=0.1))
# get the index of the closest quantile to my specific state area
q_idx <- which.min(abs(q - my_state_area))
# return a number 1-10 (divided by 10 to make a decimal)
return(q_idx/10)
}
# some baseline dot size + an added scaling factor according to the closest percentile
states_pct_water <- states_pct_water %>%
mutate(dot_size = 3 + as.numeric(unlist(lapply(Total_Sq_Mi, get_closest_quantile))))
# add title, caption
# in addition to sizing by our new scale, add transparency (alpha)
sp <- ggplot(states_pct_water, aes(x = Total_Sq_Mi, y = pct_water)) +
geom_point(mapping = aes(color = category, size = dot_size), alpha = 0.5) +
labs(title = "% of Water Area vs. Total Area of State",
caption = "Source: US Census Bureau \n census.gov/geographies/reference-files/2010/geo/state-area.html")
sp
clean_theme <- theme_classic() + # layer our modifications on top of a default ggplot theme
theme(plot.title = element_text(family = "serif", size = 15, face = 'italic', hjust = 0.5, vjust = 0.5),
plot.caption = element_text(family = "serif", size = 8, face = 'bold', hjust = 1, vjust = 0.5),
axis.title = element_text(family = "serif", size = 10),
axis.text = element_text(family = "serif", size = 10),
axis.text.x = element_text(family = "serif", size = 10),
legend.title = element_text(family = "serif", size = 10),
legend.text = element_text(family = "serif", size = 10),
legend.position = "bottom",
plot.margin = margin(1,2,1,2, "cm")) # top, right, bottom, left margins
sp <- sp +
scale_color_manual(values = c("firebrick4", "deepskyblue3")) + # adjust the color legend
scale_size(guide="none") + # hide the size legend
xlab("Total Area (sq. mi)") +
ylab("Water % of State Area") +
scale_x_log10() +
clean_theme
sp +
geom_text(data = filter(states_pct_water, pct_water > 5), #subsetting to avoid crowding
mapping = aes(label = abbreviation), hjust = 0, vjust = -0.5, size = 3)
# alternative to geom_text, ggrepel deals with crowding labels
sp +
ylim(-12,50) +
geom_text_repel(mapping = aes(label = Name),
size = 2.5, max.overlaps = 50, force_pull = 0.01)
skim()
# we'll bring in one of the default R datasets -- us_rent_income; see data()
skim(us_rent_income)
Name | us_rent_income |
Number of rows | 104 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
GEOID | 0 | 1 | 2 | 2 | 0 | 52 | 0 |
NAME | 0 | 1 | 4 | 20 | 0 | 52 | 0 |
variable | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
estimate | 1 | 0.99 | 14923.08 | 14459.89 | 464 | 864.5 | 1507 | 28872.0 | 43198 | ▇▁▂▆▁ |
moe | 1 | 0.99 | 95.50 | 119.74 | 2 | 4.5 | 18 | 153.5 | 681 | ▇▃▁▁▁ |
# merge our state water data with rent/income data
all_state_data <- us_rent_income %>%
rename(Name = NAME) %>%
select(-moe, -GEOID) %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
left_join(states_pct_water, by = "Name") %>%
na.omit()
head(all_state_data)
library(GGally)
# can also get nice plots using ggpairs
all_num_state_data <- all_state_data %>% select(-dot_size) %>% select_if(is.double)
ggpairs(all_num_state_data)
library(corrr)
# can easily get correlation tables and plots using corrr from tidymodels
all_num_state_data_cor <- all_num_state_data %>% correlate() %>% rearrange() %>% shave()
fashion(all_num_state_data_cor) # text display in a nice format
rplot(all_num_state_data_cor) # circle plot with correlation magnitude
# can also make other fancier correlation plots
all_num_state_data %>%
correlate() %>%
network_plot(min_cor = .1)
unsurprisingly, rent/income are correlated to each other; land/total square miles are in turn correlated to each other
pct_water seems to be behaving weirdly: correlates better with rent than with total water sq. miles
we can use our somewhat orthogonal variables to reduce dimensions using PCA– see which states are more similar to each other
# first make a matrix with states as row names
states_mat <- as.data.frame(all_num_state_data)
rownames(states_mat) <- all_state_data$Name
# run PCA
states_pca <- prcomp(states_mat, center = TRUE, scale = TRUE)
# what is the variance explained by each PC? individual + cumulative
states_pca_var <- data.frame(
PC = seq(1, length(states_pca$sdev)),
stdev = states_pca$sdev,
var = (states_pca$sdev)^2,
var_pct = cumsum((states_pca$sdev)^2 / sum((states_pca$sdev)^2))
)
ggplot(states_pca_var) +
geom_line(mapping = aes(x = PC, y = var)) +
geom_point(mapping = aes(x = PC, y = var), size = 3, shape = 23, fill = "goldenrod") +
scale_x_continuous(labels = min(states_pca_var$PC):max(states_pca_var$PC),
breaks = min(states_pca_var$PC):max(states_pca_var$PC)) +
xlab("Principal Component") +
ylab("Variance explained") +
clean_theme
ggplot(states_pca_var) +
geom_line(mapping = aes(x = PC, y = var_pct)) +
geom_point(mapping = aes(x = PC, y = var_pct), size = 3, shape = 21, fill = "lawngreen") +
scale_x_continuous(labels = min(states_pca_var$PC):max(states_pca_var$PC),
breaks = min(states_pca_var$PC):max(states_pca_var$PC)) +
xlab("Principal Component") +
ylab("Cumulative % variance explained") +
clean_theme
states_pca_df <- as.data.frame(states_pca$x) %>%
mutate(Name = rownames(states_mat)) %>%
select(Name, PC1, PC2) %>%
left_join(states_pct_water, by = "Name")
ggplot(states_pca_df, mapping = aes(x = PC1, y = PC2)) +
geom_point(mapping = aes(color = category)) +
geom_text_repel(mapping = aes(label = abbreviation), max.overlaps = 50) +
theme_bw()
library(pheatmap)
# pheatmap is another quick way to visualize your numerical tables
pheatmap(states_pca$rotation, cluster_cols = FALSE, cluster_rows = FALSE)
state_data_pca_mat <- states_pca_df %>% select(PC1, PC2)
rownames(state_data_pca_mat) <- states_pca_df$Name
state_data_kclust2 <- kmeans(state_data_pca_mat, centers = 2)
summary(state_data_kclust2)
## Length Class Mode
## cluster 50 -none- numeric
## centers 4 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
state_data_kclust2_aug <- augment(state_data_kclust2, state_data_pca_mat)
state_data_kclust2_aug
state_data_kclust2_aug$.cluster
## Alabama Alaska Arizona Arkansas California
## 2 1 2 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 1 1 1 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 2 2 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 2 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 1 1 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 2 1 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 1 2 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 1 2
## South Dakota Tennessee Texas Utah Vermont
## 2 2 2 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 2 2 2
## Levels: 1 2
# not incredibly informative, let's try iterating through possible values of k
state_data_kclust_iter <-
tibble(k = 1:6) %>%
mutate(
kclust = map(k, ~kmeans(state_data_pca_mat, .x)),
tidied = map(kclust, tidy), # turn into a tibble
glanced = map(kclust, glance), # summary "glance"
augmented = map(kclust, augment, state_data_pca_mat) # augment our original data with k-means clusters
)
# we have lists of tibbles, each corresponding to a k
state_data_kclust_iter
# the augmented list contains the data we want, again-- one tibble per value of k
state_data_kclust_iter$augmented
## [[1]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 1
## 2 Alaska 10.6 1.07 1
## 3 Arizona 0.209 0.709 1
## 4 Arkansas -0.957 1.69 1
## 5 California 1.62 -0.728 1
## 6 Colorado 0.489 -0.585 1
## 7 Connecticut -0.451 -2.16 1
## 8 Delaware -0.632 -1.89 1
## 9 Florida 0.219 -0.521 1
## 10 Georgia -0.452 0.499 1
## # … with 40 more rows
##
## [[2]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 2
## 2 Alaska 10.6 1.07 1
## 3 Arizona 0.209 0.709 2
## 4 Arkansas -0.957 1.69 2
## 5 California 1.62 -0.728 1
## 6 Colorado 0.489 -0.585 1
## 7 Connecticut -0.451 -2.16 1
## 8 Delaware -0.632 -1.89 1
## 9 Florida 0.219 -0.521 1
## 10 Georgia -0.452 0.499 2
## # … with 40 more rows
##
## [[3]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 3
## 2 Alaska 10.6 1.07 1
## 3 Arizona 0.209 0.709 3
## 4 Arkansas -0.957 1.69 3
## 5 California 1.62 -0.728 2
## 6 Colorado 0.489 -0.585 2
## 7 Connecticut -0.451 -2.16 2
## 8 Delaware -0.632 -1.89 2
## 9 Florida 0.219 -0.521 2
## 10 Georgia -0.452 0.499 3
## # … with 40 more rows
##
## [[4]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 4
## 2 Alaska 10.6 1.07 2
## 3 Arizona 0.209 0.709 1
## 4 Arkansas -0.957 1.69 4
## 5 California 1.62 -0.728 1
## 6 Colorado 0.489 -0.585 1
## 7 Connecticut -0.451 -2.16 3
## 8 Delaware -0.632 -1.89 3
## 9 Florida 0.219 -0.521 1
## 10 Georgia -0.452 0.499 4
## # … with 40 more rows
##
## [[5]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 1
## 2 Alaska 10.6 1.07 2
## 3 Arizona 0.209 0.709 3
## 4 Arkansas -0.957 1.69 1
## 5 California 1.62 -0.728 5
## 6 Colorado 0.489 -0.585 5
## 7 Connecticut -0.451 -2.16 4
## 8 Delaware -0.632 -1.89 4
## 9 Florida 0.219 -0.521 5
## 10 Georgia -0.452 0.499 3
## # … with 40 more rows
##
## [[6]]
## # A tibble: 50 × 4
## .rownames PC1 PC2 .cluster
## <chr> <dbl> <dbl> <fct>
## 1 Alabama -0.859 1.40 6
## 2 Alaska 10.6 1.07 4
## 3 Arizona 0.209 0.709 5
## 4 Arkansas -0.957 1.69 6
## 5 California 1.62 -0.728 3
## 6 Colorado 0.489 -0.585 3
## 7 Connecticut -0.451 -2.16 2
## 8 Delaware -0.632 -1.89 2
## 9 Florida 0.219 -0.521 3
## 10 Georgia -0.452 0.499 5
## # … with 40 more rows
# now plot each tibble per k, with PC1 and PC2 as the coordinates
state_data_kclust_iter_df <- unnest(state_data_kclust_iter, cols = c(augmented))
state_data_kclust_iter_df$.rownames <- state_abbrev$abbreviation[match(state_data_kclust_iter_df$.rownames, state_abbrev$Name)]
ggplot(data = state_data_kclust_iter_df,
mapping = aes(x = PC1, y = PC2)) +
geom_point(mapping = aes(color = .cluster), alpha = 0.6, size = 4, shape = 21) +
geom_text(mapping = aes(label = .rownames), alpha = 0.8, size = 2) +
facet_wrap(~k) + theme_minimal()
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pheatmap_1.0.12 corrr_0.4.3 GGally_2.1.2 skimr_2.1.3
## [5] naniar_0.6.1 ggrepel_0.9.1 yardstick_0.0.9 workflowsets_0.1.0
## [9] workflows_0.2.4 tune_0.1.6 rsample_0.1.1 recipes_0.1.17
## [13] parsnip_0.1.7 modeldata_0.1.1 infer_1.0.0 dials_0.1.0
## [17] scales_1.1.1 broom_0.7.9 tidymodels_0.1.4 forcats_0.5.1
## [21] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.0.1
## [25] tidyr_1.2.0 tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 class_7.3-19 visdat_0.5.3
## [5] base64enc_0.1-3 fs_1.5.0 rstudioapi_0.13 farver_2.1.0
## [9] listenv_0.8.0 furrr_0.2.3 bit64_4.0.5 prodlim_2019.11.13
## [13] fansi_0.5.0 lubridate_1.7.10 xml2_1.3.2 codetools_0.2-18
## [17] splines_4.1.1 knitr_1.34 jsonlite_1.7.2 pROC_1.18.0
## [21] dbplyr_2.1.1 compiler_4.1.1 httr_1.4.2 backports_1.2.1
## [25] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0 cli_3.0.1
## [29] htmltools_0.5.2 tools_4.1.1 gtable_0.3.0 glue_1.4.2
## [33] Rcpp_1.0.7 cellranger_1.1.0 jquerylib_0.1.4 DiceDesign_1.9
## [37] vctrs_0.3.8 iterators_1.0.14 timeDate_3043.102 gower_1.0.0
## [41] xfun_0.26 globals_0.14.0 rvest_1.0.1 lifecycle_1.0.1
## [45] future_1.23.0 MASS_7.3-54 TSP_1.1-11 ipred_0.9-12
## [49] vroom_1.5.5 hms_1.1.0 parallel_4.1.1 RColorBrewer_1.1-2
## [53] yaml_2.2.1 rpart_4.1-15 reshape_0.8.8 stringi_1.7.4
## [57] highr_0.9 foreach_1.5.2 seriation_1.3.1 lhs_1.1.3
## [61] hardhat_0.2.0 lava_1.6.10 repr_1.1.4 rlang_0.4.11
## [65] pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-44 labeling_0.4.2
## [69] bit_4.0.4 tidyselect_1.1.1 parallelly_1.30.0 plyr_1.8.6
## [73] magrittr_2.0.1 R6_2.5.1 generics_0.1.0 DBI_1.1.1
## [77] pillar_1.6.2 haven_2.4.3 withr_2.4.2 survival_3.2-11
## [81] nnet_7.3-16 future.apply_1.8.1 modelr_0.1.8 crayon_1.4.1
## [85] utf8_1.2.2 tzdb_0.1.2 rmarkdown_2.11 grid_4.1.1
## [89] readxl_1.3.1 reprex_2.0.1 digest_0.6.28 munsell_0.5.0
## [93] GPfit_1.0-8 registry_0.5-1
save.image(file = paste0(my_home, "R_tidyverse_data_science.RData"))