Skip to content

Commit

Permalink
Merge pull request #13 from kuriwaki/fix_iss12
Browse files Browse the repository at this point in the history
Redo education codes
  • Loading branch information
kuriwaki committed Jul 8, 2023
2 parents bbff305 + a6a1cbb commit 66cf538
Show file tree
Hide file tree
Showing 22 changed files with 208 additions and 67 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ccesMRPprep
Type: Package
Title: Functions and Data to Prepare CCES data for MRP
Version: 0.1.11
Version: 0.1.11.9999
Authors@R:
c(person(given = "Shiro",
family = "Kuriwaki",
Expand All @@ -24,7 +24,7 @@ Imports:
fs,
rlang,
Formula,
dplyr (>= 1.0.0),
dplyr (>= 1.1.0),
tidyr,
haven (>= 2.1.0),
stringr,
Expand Down
20 changes: 16 additions & 4 deletions R/cces_std-for-acs.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@
#' * \code{age} is coded to match up with the ACS bins and the recoding occurs
#' in a separate function, \code{ccc_bin_age}. The unbinned age is left instead to
#' \code{age_orig}.
#' * \code{educ} is recoded (coarsened and relabelled) to match up with the ACS.
#' * \code{educ} is coarsened and relabelled with 4 categories to match up with the ACS.
#' (the original version is left as \code{educ_cces_chr}). Recoding is governed by
#' the key-value pairs \link{educ_key}
#' the key-value pairs \link{educ_key}.
#' * \code{educ_3} is further coarsened to 3 categories, grouping together a BA
#' and a higher degree into one category. This is necessary for some ACS tables
#' that do not make the distinction. Make sure to decide which type of education
#' variable to use beforehand after looking at the ACS codes
#' * the same goes for \code{race}. These recodings are governed by the
#' key-value pair \link{race_key}.
#' * \code{cd} is standardized so that at large districts are given "01" and
Expand Down Expand Up @@ -83,6 +87,7 @@ ccc_std_demographics <- function(tbl,

race_cces_to_acs <- ccesMRPprep::race_key %>% distinct(.data$race_cces_chr, .data$race)
educ_cces_to_acs <- ccesMRPprep::educ_key %>% distinct(.data$educ_cces_chr, .data$educ)
educ3_cces_to_acs <- ccesMRPprep::educ3_key %>% distinct(.data$educ_cces_chr, .data$educ_3)

# districts
if (inherits(tbl$st, "haven_labelled"))
Expand Down Expand Up @@ -116,6 +121,7 @@ ccc_std_demographics <- function(tbl,
# demographics
age_vec <- tbl$age # to check

# recode
tbl_mod <- tbl %>%
# age
mutate(age_orig = .data$age,
Expand All @@ -125,11 +131,17 @@ ccc_std_demographics <- function(tbl,
# race
rename(race_cces_chr = .data$race) %>%
mutate(race_cces_chr = as.character(as_factor(.data$race_cces_chr))) %>%
left_join(race_cces_to_acs, by = "race_cces_chr") %>%
left_join(race_cces_to_acs, by = "race_cces_chr", relationship = "many-to-one") %>%
# education
rename(educ_cces_chr = .data$educ) %>%
mutate(educ_cces_chr = as.character(as_factor(.data$educ_cces_chr))) %>%
left_join(educ_cces_to_acs, by = "educ_cces_chr")
left_join(educ_cces_to_acs, by = "educ_cces_chr", relationship = "many-to-one") %>%
select(-educ_cces_chr) %>%
# educ 3
left_join(ed_ed3_cces, by = "educ", relationship = "many-to-one") %>%
rename(educ_cces_chr = .data$educ_3) %>%
mutate(educ_cces_chr = as.character(as_factor(.data$educ_cces_chr))) %>%
left_join(educ3_cces_to_acs, by = "educ_cces_chr", relationship = "many-to-one")

# hispanic conversion
if (wh_as_hisp && ("hispanic" %in% colnames(tbl_mod))) {
Expand Down
14 changes: 8 additions & 6 deletions R/datadoc_acscodes.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ NULL
#' @format ### acscodes_age_sex_race
#'
#' There are `r length(acscodes_age_sex_race)` codes in \code{acscodes_age_sex_race} because they specify cells
#' interacting age (10 bins), sex (2 bins), and race/ethnicity (8 bins).
#' interacting age (10 bins, later coarsened to 5), sex (2 bins), and race/ethnicity (8 bins).
#'
"acscodes_age_sex_race"

Expand All @@ -44,9 +44,9 @@ NULL
#' @format ### acscodes_sex_educ_race
#'
#' There are `r length(acscodes_sex_educ_race)` codes in \code{acscodes_sex_educ_race} because they specify cells
#' interacting sex (2 bins), education (3 bins), and race (8 bins). The entire partition
#' is not actually exhaustive; it appears to only limit to 25 years and above
#' and not include postgraduate degrees. Cross-check with \link{acscodes_df} to verify.
#' interacting sex (2 bins), education (4 bins, later recoded to 3), and race (8 bins).
#' The entire partition is not actually exhaustive; it appears to only limit to 25 years and above.
#' Cross-check with \link{acscodes_df} to verify.
#'
"acscodes_sex_educ_race"

Expand All @@ -61,13 +61,15 @@ NULL
#' 18-24 year olds who identify as Hispanic).
#'
#' \describe{
#' \item{variable}{the ACS code (2016)}
#' \item{variable}{the ACS code for the variable (2016)}
#' \item{table}{the ACS table the variable is in (2016)}
#' \item{gender}{A labelled variable for gender. 1 is Male, 2 is Female. Use
#' the \code{labelled} or \code{haven} package to see labels.}
#' \item{female}{A numeric, binary version of gender}
#' \item{age_5}{A labelled variable specifying which 5-way age bin the variable specifies}
#' \item{age_10}{A labelled variable specifying which 10-way age bin the variable specifies}
#' \item{educ}{A labelled variable specifying which race bin the variable specifies}
#' \item{educ}{A labelled variable specifying which education (four-way) bin the variable specifies}
#' \item{educ_3}{A labelled variable specifying which education (three-way) bin the variable specifies}
#' \item{race}{A labelled variable specifying which education bin the variable specifies}
#' }
#'
Expand Down
18 changes: 17 additions & 1 deletion R/datadoc_namevalues.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' library(ccesMRPprep)
#' race_key
#' educ_key
#' educ3_key
#' gender_key
#' age5_key
#' age10_key
Expand Down Expand Up @@ -51,14 +52,29 @@ NULL

#' @rdname namevalue
#' @format ### \code{educ_key}
#' For mapping ACS data values for education e.g. in \link{get_acs_cces}:
#' For mapping ACS data values for four-way education e.g. in \link{get_acs_cces}:
#' \describe{
#' \item{educ_cces_chr}{Character to recode from, in CCES}
#' \item{educ_chr}{Character to recode from, in ACS.}
#' \item{educ}{An labelled integer of class haven::labelled. Target variable}
#' }
"educ_key"

#' @rdname namevalue
#' @format ### \code{educ3_key}
#' For mapping ACS data values for three-way education e.g. in \link{get_acs_cces}:
#' \describe{
#' \item{educ_cces_chr}{Character to recode from, in CCES}
#' \item{educ_chr}{Character to recode from, in ACS.}
#' \item{educ3}{An labelled integer of class haven::labelled. Target variable}
#' }
"educ3_key"

#' @rdname namevalue
#' @format ### \code{ed_ed3_cces}
#' A key to link educ (4-way) and educ3
"ed_ed3_cces"


#' @rdname namevalue
#' @format ### \code{age5_key}
Expand Down
3 changes: 2 additions & 1 deletion R/get_acs-poststrat.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ get_acs_cces <- function(varlist,
.data$year,
.data$cd,
matches("(gender|female|age|educ|race)"), .data$count, .data$count_moe) %>%
select(-matches("age_(5|10)"))
select(-matches("age_(5|10)")) |>
select_if(~ any(!is.na(.x)))

acs_lbl
}
Expand Down
112 changes: 77 additions & 35 deletions data-raw/create_namevalue-pairs.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,36 +34,74 @@ age10_key <- tibble(age_chr = ages10) %>%

# Education ----
educ_cces_lbl <- setNames(1L:7L,
c("Less than 9",
"No HS",
"High School Graduate",
"Some College",
"2-Year",
"4-Year",
"Post-Grad"))
c("Less than 9",
"No HS",
"High School Graduate",
"Some College",
"2-Year",
"4-Year",
"Post-Grad"))
educ_lbl_clps <- setNames(1L:4L,
c("HS or Less", "Some College", "4-Year", "Post-Grad"))
educ3_lbl_clps <- setNames(1L:3L,
c("HS or Less", "Some College", "4-Year or Post-Grad"))


## CCES lumps the first two, and let's also lump the 2-year
cces_edlbl <- tibble(educ_cces_chr = names(educ_cces_lbl)[2:7],
educ = labelled(c(1, 1, 2, 2, 3, 4), educ_lbl_clps))

cces_ed3lbl <- tibble(educ_cces_chr = c("HS or Less", "Some College", "4-Year or Post-Grad"),
educ_3 = labelled(c(1, 2, 3), educ3_lbl_clps))

# link in CCES between the two measures
ed_ed3_cces <- tibble(
educ = unique(cces_edlbl$educ),
educ_3 = labelled(c(1, 2, 3, 3), educ3_lbl_clps)
)

educ_key <- tribble(
~educ_chr, ~educ_cces_chr, ~doc_note,
"Less than 9th grade", "No HS", NA,
"9th to 12th grade no diploma", "No HS", "ACS spelling varies",
"9th to 12th grade, no diploma", "No HS", "ACS spelling varies",
"High school graduate (includes equivalency)", "High School Graduate", NA,
"High school graduate, GED, or alternative", "High School Graduate", NA,
"Some college no degree", "Some College", "ACS spelling varies",
"Some college, no degree", "Some College", "ACS spelling varies",
"Associate's degree", "2-Year", "Lumped into Some college",
"Bachelor's degree", "4-Year", NA,
"Graduate or professional degree", "Post-Grad", NA
"Nursery to 4th grade", "No HS", NA,
"No schooling completed", "No HS", NA,
"Less than 9th grade", "No HS", NA,
"Less than high school graduate", "No HS", NA,
"Less than high school diploma", "No HS", NA,
"5th and 6th grade", "No HS", NA,
"7th and 8th grade", "No HS", NA,
"9th grade", "No HS", NA,
"10th grade", "No HS", NA,
"11th grade", "No HS", NA,
"12th grade, no diploma", "No HS", NA,
"9th to 12th grade no diploma", "No HS", "ACS spelling varies",
"9th to 12th grade, no diploma", "No HS", "ACS spelling varies",
"High school graduate (includes equivalency)", "High School Graduate", NA,
"High school graduate, GED, or alternative", "High School Graduate", NA,
"Some college", "Some College", "ACS spelling varies",
"Some college no degree", "Some College", "ACS spelling varies",
"Some college, no degree", "Some College", "ACS spelling varies",
"Some college, 1 or more years, no degree", "Some College", NA,
"Some college, less than 1 year", "Some College", NA,
"Associate's degree", "2-Year", "Lumped into Some college",
"Bachelor's degree", "4-Year", NA,
"Master's degree", "4-Year", NA,
"Professional school degree", "4-Year", NA,
"Doctorate degree", "4-Year", NA,
"Graduate or professional degree", "Post-Grad", NA,
) %>%
left_join(cces_edlbl, by = "educ_cces_chr")

educ3_key <- tribble(
~educ_chr, ~educ_cces_chr,
"Less than high school graduate", "HS or Less",
"Less than high school diploma", "HS or Less",
"High school graduate (includes equivalency)", "HS or Less",
"Some college", "Some College",
"Some college or associate's degree", "Some College",
"Bachelor's degree or higher", "4-Year or Post-Grad"
) |>
left_join(cces_ed3lbl, by = "educ_cces_chr")

# Gender ----
gender_key <- tibble(gender_chr = c("Male", "Female"),
gender = labelled(1:2L, c(Male = 1, Female = 2)))
Expand Down Expand Up @@ -101,7 +139,9 @@ race_key <- tribble(
left_join(race_cces_key, by = "race_cces_chr") %>%
mutate(race = labelled(race_num, my_racelbl))

usethis::use_data(age5_key, age10_key, gender_key, educ_key, race_key,
usethis::use_data(age5_key, age10_key, gender_key,
educ_key, educ3_key, ed_ed3_cces,
race_key,
overwrite = TRUE)


Expand All @@ -113,26 +153,28 @@ if (recreate_sheet) {



#' transform int+labelled to csv
# flatten_csv <- function(rd_tbl) {
# rd_tbl %>%
# mutate(race_cces_value = zap_labels(race_cces),
# race_cces_name = as_factor(race_cces),
# race_value = zap_labels(race),
# race_name = as_factor(race)) %>%
# select(-race_cces, -race)
# }
#' transform int+labelled to csv
# flatten_csv <- function(rd_tbl) {
# rd_tbl %>%
# mutate(race_cces_value = zap_labels(race_cces),
# race_cces_name = as_factor(race_cces),
# race_value = zap_labels(race),
# race_name = as_factor(race)) %>%
# select(-race_cces, -race)
# }

educ_key_csv <- educ_key %>%
rename(educ_acs_name = educ_chr,
educ_cces_name = educ_cces_chr) %>% # shouldn't this be called ACS to begin with
mutate(educ_value = zap_labels(educ),
educ_name = as_factor(educ),
educ = NULL) %>%
relocate(doc_note, .after = -1)
educ_key_csv <- educ_key %>%
rename(educ_acs_name = educ_chr,
educ_cces_name = educ_cces_chr) %>% # shouldn't this be called ACS to begin with
mutate(educ_value = zap_labels(educ),
educ_name = as_factor(educ),
educ3_value = zap_labels(educ3),
educ3_name = as_factor(educ3),
educ = NULL) %>%
relocate(doc_note, .after = -1)


sheet_write(educ_key_csv, ss = gs_cces_acs, sheet = "educ")
sheet_write(educ_key_csv, ss = gs_cces_acs, sheet = "educ")


}
56 changes: 48 additions & 8 deletions data-raw/specify_ACS-codes.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,30 @@ ages <- c("18 to 24 years",
"65 to 74 years",
"75 to 84 years",
"85 years and over")
education <- c("Less than 9th grade",
education <- c("Less than high school diploma",
"Nursery to 4th grade",
"No schooling completed",
"Less than 9th grade",
"5th and 6th grade",
"7th and 8th grade",
"9th grade",
"10th grade",
"11th grade",
"12th grade, no diploma",
"Less than 9th grade",
"Less than high school graduate",
"9th to 12th grade,? no diploma",
"High school graduate \\(includes equivalency\\)",
"High school graduate, GED, or alternative",
"Some college,? no degree",
"Some college",
"Some college or associate's degree",
"Associate's degree",
"Bachelor's degree",
"Graduate or professional degree")
"Bachelor's degree$",
"Doctorate degree",
"Master's degree",
"Professional school degree",
"Graduate or professional degree",
"Bachelor's degree or higher")
races <- c("White alone, not Hispanic or Latino",
"Hispanic or Latino",
"Black or African American alone",
Expand Down Expand Up @@ -84,12 +99,37 @@ acscodes_df <- vars %>%
rename(gender_chr = gender, age_chr = age, educ_chr = educ, race_acs = race) %>%
left_join(gender_key, by = "gender_chr") %>%
# age 10 if using race interactions consider binding while keeping the label
left_join(age5_key, by = "age_chr") %>%
left_join(age5_key, by = "age_chr", relationship = "many-to-one") %>%
left_join(age10_key, by = "age_chr", suffix = c("_5", "_10")) %>%
left_join(educ_key, by = "educ_chr") %>%
left_join(filter(race_key, !is.na(race_acs)), by = "race_acs") %>%
left_join(educ_key, by = "educ_chr", relationship = "many-to-one") %>%
left_join(educ3_key, by = "educ_chr", relationship = "many-to-one", suffix = c("", "_3")) %>%
left_join(filter(race_key, !is.na(race_acs)), by = "race_acs", relationship = "many-to-one") %>%
mutate(female = as.integer(gender == 2)) %>%
select(variable, gender, female, matches("age_(5|10)"), educ, race)
select(variable, gender, female, matches("age_(5|10)"), matches("educ($|_3)"), race) |>
mutate(table = str_sub(variable, 1, 6), .after = variable)

# Distinguish between two types of educ
educ_type <- acscodes_df |>
summarize(
use_educ3 = all(1:3 %in% educ_3, na.rm = TRUE) & all(educ != 4, na.rm = TRUE),
use_educ = all(1:4 %in% educ, na.rm = TRUE),
.by = table)

age_type <- acscodes_df |>
summarize(
use_age5 = all(1:5 %in% age_5, na.rm = TRUE) & all(age_10 != 5, na.rm = TRUE),
use_age10 = all(1:5 %in% age_10, na.rm = TRUE),
.by = table)

acscodes_df <- acscodes_df |>
left_join(educ_type, by = "table") |>
mutate(educ = replace(educ, use_educ3, NA),
educ_3 = replace(educ_3, use_educ & !use_educ3, NA)) |>
left_join(age_type, by = "table") |>
mutate(age_10 = replace(age_10, use_age5, NA),
age_5 = replace(age_5, use_age10 & !use_age5, NA)) |>
select(-starts_with("use_"))


# Write ----
usethis::use_data(acscodes_age_sex_educ, overwrite = TRUE)
Expand Down
Binary file modified data/acscodes_age_sex_educ.rda
Binary file not shown.
Binary file modified data/acscodes_age_sex_race.rda
Binary file not shown.
Binary file modified data/acscodes_df.rda
Binary file not shown.
Binary file modified data/acscodes_sex_educ_race.rda
Binary file not shown.
Binary file modified data/age10_key.rda
Binary file not shown.
Binary file modified data/age5_key.rda
Binary file not shown.
Binary file added data/ed_ed3_cces.rda
Binary file not shown.
Binary file added data/educ3_key.rda
Binary file not shown.
Binary file modified data/educ_key.rda
Binary file not shown.
Binary file modified data/gender_key.rda
Binary file not shown.
Binary file modified data/race_key.rda
Binary file not shown.
Loading

0 comments on commit 66cf538

Please sign in to comment.