Check the shape, display rows in a particular order, and use specialized functions like glimpse()
# Shape of each data frame, reported as (rows, columns)
dim(state_areas)
## [1] 57 7
dim(state_abbrev)
## [1] 52 6
# Preview the first rows of the abbreviations table
head(state_abbrev)
# Sort the rows by Name in descending order, then preview the first rows
head(arrange(state_areas, desc(Name)))
# glimpse() is another way to see every column together with its type
glimpse(state_areas)
## Rows: 57
## Columns: 7
## $ Name <chr> "United States", "Alabama", "Alaska", "Arizona", "Ar…
## $ Total_Sq_Mi <dbl> 3796742, 52420, 665384, 113990, 53179, 163695, 10409…
## $ Total_Sq_Km <dbl> 9833517, 135767, 1723337, 295234, 137732, 423967, 26…
## $ Land_Sq_Mi <dbl> 3531905, 50645, 570641, 113594, 52035, 155779, 10364…
## $ Land_Sq_Km <dbl> 9147593, 131171, 1477953, 294207, 134771, 403466, 26…
## $ Water_Total_Sq_Mi <dbl> 264837, 1775, 94743, 396, 1143, 7916, 452, 701, 540,…
## $ Water_Total_Sq_Km <dbl> 685924, 4597, 245383, 1026, 2961, 20501, 1170, 1816,…
Manipulate the data frame using filter(), select(), and rename(); add the variables we want with left_join() and mutate()
# Build a made-up category for each state -- "water state" or "dry state" --
# from the share of its total area that is water (water area / total area).
states_pct_water <- state_areas %>%
  # drop the national total and D.C. rows before computing per-state shares
  filter(Name != "United States", Name != "District of Columbia") %>%
  # keep Name plus the area columns measured in square miles
  select(Name, ends_with("Mi")) %>%
  mutate(
    # percentage of the total area that is water
    pct_water = 100 * (Water_Total_Sq_Mi / Total_Sq_Mi),
    # pct_water is already numeric, so no coercion is needed; if_else() is
    # type-strict and leaves NA inputs as NA
    category = if_else(pct_water > 20, "water state", "dry state")
  )
# Tidy the abbreviations table: keep the state name and the column holding the
# present abbreviation, and give that column a shorter name.
state_abbrev <- state_abbrev %>%
  select(Name, ends_with("present")) %>%
  # rename() takes new_name = old_name
  rename(abbreviation = ends_with("present")) %>%
  # keep only the first two characters so every abbreviation is two letters
  mutate(abbreviation = str_sub(abbreviation, start = 1, end = 2))
# Attach each state's abbreviation to the water table, matching rows on Name
states_pct_water <- states_pct_water %>%
  left_join(state_abbrev, by = "Name")
head(states_pct_water)
# Check how the percentage calculation and the join went: are there any NAs,
# or signs that the join used the wrong key?
library(naniar)
vis_miss(states_pct_water)

# Besides glimpse(), another way to get an overview of a data frame, this time
# from outside the tidyverse, is skim() from the skimr package.
library(skimr)
skim(states_pct_water)
Data summary

|                          |                  |
|:-------------------------|:-----------------|
| Name                     | states_pct_water |
| Number of rows           | 55               |
| Number of columns        | 7                |
| _______________________  |                  |
| Column type frequency:   |                  |
| character                | 3                |
| numeric                  | 4                |
| ________________________ |                  |
| Group variables          | None             |

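Variable type: character

| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:--------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| Name          |         0 |          1.00 |   4 |  24 |     0 |       55 |          0 |
| category      |         0 |          1.00 |   9 |  11 |     0 |        2 |          0 |
| abbreviation  |         4 |          0.93 |   2 |   2 |     0 |       51 |          0 |
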
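Variable type: numeric

| skim_variable     | n_missing | complete_rate |     mean |       sd |     p0 |      p25 |      p50 |     p75 |      p100 | hist  |
|:------------------|----------:|--------------:|---------:|---------:|-------:|---------:|---------:|--------:|----------:|:------|
| Total_Sq_Mi       |         0 |             1 | 69197.49 | 95173.42 | 571.00 | 18318.00 | 53819.00 | 82923.5 | 665384.00 | ▇▁▁▁▁ |
| Land_Sq_Mi        |         0 |             1 | 64288.58 | 84220.46 |  76.00 | 16872.50 | 50645.00 | 78225.5 | 570641.00 | ▇▁▁▁▁ |
| Water_Total_Sq_Mi |         0 |             1 |  4908.96 | 13637.82 | 192.00 |   596.00 |  1494.00 |  3625.0 |  94743.00 | ▇▁▁▁▁ |
| pct_water         |         0 |             1 |    14.09 |    21.70 |   0.24 |     1.78 |     4.16 |    16.5 |     90.74 | ▇▁▁▁▁ |
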
# Keep only the entries that have a matching abbreviation
states_pct_water <- states_pct_water %>%
  filter(!is.na(abbreviation))
head(states_pct_water)