Merge pull request #13 from kuriwaki/fix_iss12

Redo education codes
kuriwaki · Jul 8, 2023 · 66cf538 · 66cf538
2 parents bbff305 + a6a1cbb
commit 66cf538
Show file tree

Hide file tree

Showing 22 changed files with 208 additions and 67 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: ccesMRPprep
 Type: Package
 Title: Functions and Data to Prepare CCES data for MRP
-Version: 0.1.11
+Version: 0.1.11.9999
 Authors@R: 
    c(person(given = "Shiro",
              family = "Kuriwaki",
@@ -24,7 +24,7 @@ Imports:
     fs,
     rlang,
     Formula,
-    dplyr (>= 1.0.0),
+    dplyr (>= 1.1.0),
     tidyr,
     haven (>= 2.1.0),
     stringr,

diff --git a/R/cces_std-for-acs.R b/R/cces_std-for-acs.R
@@ -31,9 +31,13 @@
 #' * \code{age} is coded to match up with the ACS bins and the recoding occurs
 #'  in a separate function, \code{ccc_bin_age}. The unbinned age is left instead to
 #'  \code{age_orig}.
-#' * \code{educ} is recoded (coarsened and relabelled) to match up with the ACS.
+#' * \code{educ} is coarsened and relabelled with 4 categories to match up with the ACS.
 #'  (the original version is left as \code{educ_cces_chr}). Recoding is governed by
-#'  the key-value pairs \link{educ_key}
+#'  the key-value pairs \link{educ_key}.
+#' * \code{educ_3} is further coarsened to 3 categories, grouping together a BA
+#'  and a higher degree into one category. This is necessary for some ACS tables
+#'  that do not make the distinction. Make sure to decide which type of education
+#'  variable to use beforehand after looking at the ACS codes
 #' * the same goes for \code{race}. These recodings are governed by the
 #'  key-value pair \link{race_key}.
 #' * \code{cd} is standardized so that at large districts are given "01" and
@@ -83,6 +87,7 @@ ccc_std_demographics <- function(tbl,
 
   race_cces_to_acs <- ccesMRPprep::race_key %>% distinct(.data$race_cces_chr, .data$race)
   educ_cces_to_acs <- ccesMRPprep::educ_key %>% distinct(.data$educ_cces_chr, .data$educ)
+  educ3_cces_to_acs <- ccesMRPprep::educ3_key %>% distinct(.data$educ_cces_chr, .data$educ_3)
 
   # districts
   if (inherits(tbl$st, "haven_labelled"))
@@ -116,6 +121,7 @@ ccc_std_demographics <- function(tbl,
   # demographics
   age_vec <-  tbl$age # to check
 
+  # recode
   tbl_mod <- tbl %>%
     # age
     mutate(age_orig = .data$age,
@@ -125,11 +131,17 @@ ccc_std_demographics <- function(tbl,
     # race
     rename(race_cces_chr = .data$race) %>%
     mutate(race_cces_chr = as.character(as_factor(.data$race_cces_chr))) %>%
-    left_join(race_cces_to_acs, by = "race_cces_chr") %>%
+    left_join(race_cces_to_acs, by = "race_cces_chr", relationship = "many-to-one") %>%
     # education
     rename(educ_cces_chr = .data$educ) %>%
     mutate(educ_cces_chr = as.character(as_factor(.data$educ_cces_chr))) %>%
-    left_join(educ_cces_to_acs, by = "educ_cces_chr")
+    left_join(educ_cces_to_acs, by = "educ_cces_chr", relationship = "many-to-one") %>%
+    select(-educ_cces_chr) %>%
+    # educ 3
+    left_join(ed_ed3_cces, by = "educ", relationship = "many-to-one") %>%
+    rename(educ_cces_chr = .data$educ_3) %>%
+    mutate(educ_cces_chr = as.character(as_factor(.data$educ_cces_chr))) %>%
+    left_join(educ3_cces_to_acs, by = "educ_cces_chr", relationship = "many-to-one")
 
   # hispanic conversion
   if (wh_as_hisp && ("hispanic" %in% colnames(tbl_mod))) {

diff --git a/R/datadoc_acscodes.R b/R/datadoc_acscodes.R
@@ -23,7 +23,7 @@ NULL
 #' @format ### acscodes_age_sex_race
 #'
 #' There are `r length(acscodes_age_sex_race)` codes in \code{acscodes_age_sex_race} because they specify cells
-#' interacting age (10 bins), sex (2 bins), and race/ethnicity (8 bins).
+#' interacting age (10 bins, later coarsened to 5), sex (2 bins), and race/ethnicity (8 bins).
 #'
 "acscodes_age_sex_race"
 
@@ -44,9 +44,9 @@ NULL
 #' @format ### acscodes_sex_educ_race
 #'
 #' There are `r length(acscodes_sex_educ_race)` codes in \code{acscodes_sex_educ_race} because they specify cells
-#' interacting sex (2 bins), education (3 bins), and race (8 bins). The entire partition
-#' is not actually exhaustive; it appears to only limit to 25 years and above
-#' and not include postgraduate degrees. Cross-check with \link{acscodes_df} to verify.
+#' interacting sex (2 bins), education (4 bins, later recoded to 3), and race (8 bins).
+#' The entire partition is not actually exhaustive; it appears to only limit to 25 years and above.
+#' Cross-check with \link{acscodes_df} to verify.
 #'
 "acscodes_sex_educ_race"
 
@@ -61,13 +61,15 @@ NULL
 #' 18-24 year olds who identify as Hispanic).
 #'
 #' \describe{
-#'  \item{variable}{the ACS code (2016)}
+#'  \item{variable}{the ACS code for the variable (2016)}
+#'  \item{table}{the ACS table the variable is in (2016)}
 #'  \item{gender}{A labelled variable for gender. 1 is Male, 2 is Female. Use
 #'    the \code{labelled} or \code{haven} package to see labels.}
 #'  \item{female}{A numeric, binary version of gender}
 #'  \item{age_5}{A labelled variable specifying which 5-way age bin the variable specifies}
 #'  \item{age_10}{A labelled variable specifying which 10-way age bin the variable specifies}
-#'  \item{educ}{A labelled variable specifying which race bin the variable specifies}
+#'  \item{educ}{A labelled variable specifying which education (four-way) bin the variable specifies}
+#'  \item{educ_3}{A labelled variable specifying which education (three-way) bin the variable specifies}
 #'  \item{race}{A labelled variable specifying which education bin the variable specifies}
 #'  }
 #'

diff --git a/R/datadoc_namevalues.R b/R/datadoc_namevalues.R
@@ -20,6 +20,7 @@
 #'  library(ccesMRPprep)
 #'  race_key
 #'  educ_key
+#'  educ3_key
 #'  gender_key
 #'  age5_key
 #'  age10_key
@@ -51,14 +52,29 @@ NULL
 
 #' @rdname  namevalue
 #' @format ### \code{educ_key}
-#'  For mapping ACS data values for education e.g. in \link{get_acs_cces}:
+#'  For mapping ACS data values for four-way education e.g. in \link{get_acs_cces}:
 #'  \describe{
 #'  \item{educ_cces_chr}{Character to recode from, in CCES}
 #'  \item{educ_chr}{Character to recode from, in ACS.}
 #'  \item{educ}{An labelled integer of class haven::labelled. Target variable}
 #'  }
 "educ_key"
 
+#' @rdname  namevalue
+#' @format ### \code{educ3_key}
+#'  For mapping ACS data values for three-way education e.g. in \link{get_acs_cces}:
+#'  \describe{
+#'  \item{educ_cces_chr}{Character to recode from, in CCES}
+#'  \item{educ_chr}{Character to recode from, in ACS.}
+#'  \item{educ3}{An labelled integer of class haven::labelled. Target variable}
+#'  }
+"educ3_key"
+
+#' @rdname  namevalue
+#' @format ### \code{ed_ed3_cces}
+#'  A key to link educ (4-way) and educ3
+"ed_ed3_cces"
+
 
 #' @rdname  namevalue
 #' @format ### \code{age5_key}

diff --git a/R/get_acs-poststrat.R b/R/get_acs-poststrat.R
@@ -81,7 +81,8 @@ get_acs_cces <- function(varlist,
            .data$year,
            .data$cd,
            matches("(gender|female|age|educ|race)"), .data$count, .data$count_moe) %>%
-      select(-matches("age_(5|10)"))
+      select(-matches("age_(5|10)")) |>
+    select_if(~ any(!is.na(.x)))
 
   acs_lbl
 }

diff --git a/data-raw/create_namevalue-pairs.R b/data-raw/create_namevalue-pairs.R
@@ -34,36 +34,74 @@ age10_key <- tibble(age_chr = ages10) %>%
 
 # Education ----
 educ_cces_lbl <- setNames(1L:7L,
-                     c("Less than 9",
-                       "No HS",
-                       "High School Graduate",
-                       "Some College",
-                       "2-Year",
-                       "4-Year",
-                       "Post-Grad"))
+                          c("Less than 9",
+                            "No HS",
+                            "High School Graduate",
+                            "Some College",
+                            "2-Year",
+                            "4-Year",
+                            "Post-Grad"))
 educ_lbl_clps <- setNames(1L:4L,
                           c("HS or Less", "Some College", "4-Year", "Post-Grad"))
+educ3_lbl_clps <- setNames(1L:3L,
+                           c("HS or Less", "Some College", "4-Year or Post-Grad"))
 
 
 ## CCES lumps the first two, and let's also lump the 2-year
 cces_edlbl <- tibble(educ_cces_chr = names(educ_cces_lbl)[2:7],
                      educ = labelled(c(1, 1, 2, 2, 3, 4), educ_lbl_clps))
 
+cces_ed3lbl <- tibble(educ_cces_chr = c("HS or Less", "Some College", "4-Year or Post-Grad"),
+                     educ_3 = labelled(c(1, 2, 3), educ3_lbl_clps))
+
+# link in CCES between the two measures
+ed_ed3_cces <- tibble(
+  educ = unique(cces_edlbl$educ),
+  educ_3 = labelled(c(1, 2, 3, 3), educ3_lbl_clps)
+)
+
 educ_key  <- tribble(
   ~educ_chr, ~educ_cces_chr, ~doc_note,
-   "Less than 9th grade", "No HS", NA,
-   "9th to 12th grade no diploma", "No HS", "ACS spelling varies",
-   "9th to 12th grade, no diploma", "No HS", "ACS spelling varies",
-   "High school graduate (includes equivalency)", "High School Graduate", NA,
-   "High school graduate, GED, or alternative", "High School Graduate", NA,
-   "Some college no degree", "Some College", "ACS spelling varies",
-   "Some college, no degree", "Some College", "ACS spelling varies",
-   "Associate's degree", "2-Year", "Lumped into Some college",
-   "Bachelor's degree", "4-Year", NA,
-   "Graduate or professional degree", "Post-Grad", NA
+  "Nursery to 4th grade", "No HS", NA,
+  "No schooling completed", "No HS", NA,
+  "Less than 9th grade", "No HS", NA,
+  "Less than high school graduate", "No HS", NA,
+  "Less than high school diploma", "No HS", NA,
+  "5th and 6th grade", "No HS", NA,
+  "7th and 8th grade", "No HS", NA,
+  "9th grade", "No HS", NA,
+  "10th grade", "No HS", NA,
+  "11th grade", "No HS", NA,
+  "12th grade, no diploma", "No HS", NA,
+  "9th to 12th grade no diploma", "No HS", "ACS spelling varies",
+  "9th to 12th grade, no diploma", "No HS", "ACS spelling varies",
+  "High school graduate (includes equivalency)", "High School Graduate", NA,
+  "High school graduate, GED, or alternative", "High School Graduate", NA,
+  "Some college", "Some College", "ACS spelling varies",
+  "Some college no degree", "Some College", "ACS spelling varies",
+  "Some college, no degree", "Some College", "ACS spelling varies",
+  "Some college, 1 or more years, no degree", "Some College", NA,
+  "Some college, less than 1 year", "Some College", NA,
+  "Associate's degree", "2-Year", "Lumped into Some college",
+  "Bachelor's degree", "4-Year", NA,
+  "Master's degree", "4-Year", NA,
+  "Professional school degree", "4-Year", NA,
+  "Doctorate degree", "4-Year", NA,
+  "Graduate or professional degree", "Post-Grad", NA,
 ) %>%
   left_join(cces_edlbl, by = "educ_cces_chr")
 
+educ3_key <- tribble(
+  ~educ_chr, ~educ_cces_chr,
+  "Less than high school graduate", "HS or Less",
+  "Less than high school diploma", "HS or Less",
+  "High school graduate (includes equivalency)", "HS or Less",
+  "Some college", "Some College",
+  "Some college or associate's degree", "Some College",
+  "Bachelor's degree or higher", "4-Year or Post-Grad"
+) |>
+  left_join(cces_ed3lbl, by = "educ_cces_chr")
+
 # Gender ----
 gender_key <- tibble(gender_chr = c("Male", "Female"),
                      gender = labelled(1:2L, c(Male = 1, Female = 2)))
@@ -101,7 +139,9 @@ race_key <- tribble(
   left_join(race_cces_key, by = "race_cces_chr") %>%
   mutate(race = labelled(race_num, my_racelbl))
 
-usethis::use_data(age5_key, age10_key, gender_key, educ_key, race_key,
+usethis::use_data(age5_key, age10_key, gender_key,
+                  educ_key, educ3_key, ed_ed3_cces,
+                  race_key,
                   overwrite = TRUE)
 
 
@@ -113,26 +153,28 @@ if (recreate_sheet) {
 
 
 
-#' transform int+labelled to csv
-# flatten_csv <- function(rd_tbl) {
-#   rd_tbl %>%
-#     mutate(race_cces_value = zap_labels(race_cces),
-#            race_cces_name = as_factor(race_cces),
-#            race_value = zap_labels(race),
-#            race_name = as_factor(race)) %>%
-#     select(-race_cces, -race)
-# }
+  #' transform int+labelled to csv
+  # flatten_csv <- function(rd_tbl) {
+  #   rd_tbl %>%
+  #     mutate(race_cces_value = zap_labels(race_cces),
+  #            race_cces_name = as_factor(race_cces),
+  #            race_value = zap_labels(race),
+  #            race_name = as_factor(race)) %>%
+  #     select(-race_cces, -race)
+  # }
 
-educ_key_csv <- educ_key %>%
-  rename(educ_acs_name = educ_chr,
-         educ_cces_name = educ_cces_chr) %>% # shouldn't this be called ACS to begin with
-  mutate(educ_value = zap_labels(educ),
-         educ_name = as_factor(educ),
-         educ = NULL) %>%
-  relocate(doc_note, .after = -1)
+  educ_key_csv <- educ_key %>%
+    rename(educ_acs_name = educ_chr,
+           educ_cces_name = educ_cces_chr) %>% # shouldn't this be called ACS to begin with
+    mutate(educ_value = zap_labels(educ),
+           educ_name = as_factor(educ),
+           educ3_value = zap_labels(educ3),
+           educ3_name = as_factor(educ3),
+           educ = NULL) %>%
+    relocate(doc_note, .after = -1)
 
 
-sheet_write(educ_key_csv, ss = gs_cces_acs, sheet = "educ")
+  sheet_write(educ_key_csv, ss = gs_cces_acs, sheet = "educ")
 
 
 }
diff --git a/data-raw/specify_ACS-codes.R b/data-raw/specify_ACS-codes.R
@@ -19,15 +19,30 @@ ages  <- c("18 to 24 years",
            "65 to 74 years",
            "75 to 84 years",
            "85 years and over")
-education <- c("Less than 9th grade",
+education <- c("Less than high school diploma",
+               "Nursery to 4th grade",
+               "No schooling completed",
+               "Less than 9th grade",
+               "5th and 6th grade",
+               "7th and 8th grade",
+               "9th grade",
+               "10th grade",
+               "11th grade",
+               "12th grade, no diploma",
+               "Less than 9th grade",
+               "Less than high school graduate",
                "9th to 12th grade,? no diploma",
                "High school graduate \\(includes equivalency\\)",
                "High school graduate, GED, or alternative",
-               "Some college,? no degree",
+               "Some college",
                "Some college or associate's degree",
                "Associate's degree",
-               "Bachelor's degree",
-               "Graduate or professional degree")
+               "Bachelor's degree$",
+               "Doctorate degree",
+               "Master's degree",
+               "Professional school degree",
+               "Graduate or professional degree",
+               "Bachelor's degree or higher")
 races <- c("White alone, not Hispanic or Latino",
            "Hispanic or Latino",
            "Black or African American alone",
@@ -84,12 +99,37 @@ acscodes_df <- vars %>%
   rename(gender_chr = gender, age_chr = age, educ_chr = educ, race_acs = race) %>%
   left_join(gender_key, by = "gender_chr") %>%
   # age 10 if using race interactions consider binding while keeping the label
-  left_join(age5_key, by = "age_chr") %>%
+  left_join(age5_key, by = "age_chr", relationship = "many-to-one") %>%
   left_join(age10_key, by = "age_chr", suffix = c("_5", "_10")) %>%
-  left_join(educ_key, by = "educ_chr") %>%
-  left_join(filter(race_key, !is.na(race_acs)), by = "race_acs") %>%
+  left_join(educ_key, by = "educ_chr", relationship = "many-to-one") %>%
+  left_join(educ3_key, by = "educ_chr", relationship = "many-to-one", suffix = c("", "_3")) %>%
+  left_join(filter(race_key, !is.na(race_acs)), by = "race_acs", relationship = "many-to-one") %>%
   mutate(female = as.integer(gender == 2)) %>%
-  select(variable, gender, female, matches("age_(5|10)"), educ, race)
+  select(variable, gender, female, matches("age_(5|10)"), matches("educ($|_3)"), race) |>
+  mutate(table = str_sub(variable, 1, 6), .after = variable)
+
+# Distinguish between two types of educ
+educ_type <- acscodes_df |>
+  summarize(
+    use_educ3 = all(1:3 %in% educ_3, na.rm = TRUE) & all(educ != 4, na.rm = TRUE),
+    use_educ = all(1:4 %in% educ, na.rm = TRUE),
+    .by = table)
+
+age_type <- acscodes_df |>
+  summarize(
+    use_age5 = all(1:5 %in% age_5, na.rm = TRUE) & all(age_10 != 5, na.rm = TRUE),
+    use_age10 = all(1:5 %in% age_10, na.rm = TRUE),
+    .by = table)
+
+acscodes_df <- acscodes_df |>
+  left_join(educ_type, by = "table") |>
+  mutate(educ = replace(educ, use_educ3, NA),
+         educ_3 = replace(educ_3, use_educ & !use_educ3, NA)) |>
+  left_join(age_type, by = "table") |>
+  mutate(age_10 = replace(age_10, use_age5, NA),
+         age_5 = replace(age_5, use_age10 & !use_age5, NA)) |>
+  select(-starts_with("use_"))
+
 
 # Write ----
 usethis::use_data(acscodes_age_sex_educ, overwrite = TRUE)

diff --git a/data/acscodes_age_sex_educ.rda b/data/acscodes_age_sex_educ.rda
diff --git a/data/acscodes_age_sex_race.rda b/data/acscodes_age_sex_race.rda
diff --git a/data/acscodes_df.rda b/data/acscodes_df.rda
diff --git a/data/acscodes_sex_educ_race.rda b/data/acscodes_sex_educ_race.rda
diff --git a/data/age10_key.rda b/data/age10_key.rda
diff --git a/data/age5_key.rda b/data/age5_key.rda
diff --git a/data/ed_ed3_cces.rda b/data/ed_ed3_cces.rda
diff --git a/data/educ3_key.rda b/data/educ3_key.rda
diff --git a/data/educ_key.rda b/data/educ_key.rda
diff --git a/data/gender_key.rda b/data/gender_key.rda
diff --git a/data/race_key.rda b/data/race_key.rda