Fully implement word-level preprocessing

NDCLab · Jan 26, 2024 · 8031a3d · 8031a3d
1 parent a426cfb
commit 8031a3d
Showing 1 changed file with 63 additions and 72 deletions.
diff --git a/code/preprocDisfluenciesWordLevel.R b/code/preprocDisfluenciesWordLevel.R
@@ -1,7 +1,7 @@
 # Reading in reconciled Excels, summarizing the errors of each, and writing that
 # to a new CSV
 # Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University
-# last updated 2024-01-25
+# last updated 2024-01-26
 # This version is intended for word-level operations
 
 # NB passages "sun" and "broccoli" as coded contain errors. Namely, broccoli had
@@ -47,6 +47,8 @@
 #                 label -- the string identifying a file -- rather than the file
 #                 itself or its contents
 #       - df: dataframe, the R structure
+#       - passage_df: a dataframe for a participant's reading of a given
+#                 passage, with that passage's scaffolding included
 #       - DEBUG_MODE: a 'flag' which when set to TRUE enables features including
 #                 increased verbosity and incremental outputs (in case the
 #                 program fails before creating the intended file per below)
@@ -109,9 +111,9 @@ error_types_idiomatic = c(
   "filled_pause", "hesitation", "elongation", "corrected"
 )
 
-base = "~/Documents/ndclab/rwe-analysis-sandbox/github-structure-mirror/readAloud-valence-dataset"
-# base = "/home/data/NDClab/datasets/readAloud-valence-dataset"
-passage_dir = "~/Documents/ndclab/rwe-analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed/error-coding/test"
+# base = "~/Documents/ndclab/rwe-analysis-sandbox/github-structure-mirror/readAloud-valence-dataset"
+base = "/home/data/NDClab/datasets/readAloud-valence-dataset"
+# passage_dir = "~/Documents/ndclab/rwe-analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed/error-coding/test"
 
 # if we're in debug mode, write output dataframes to disk as they are made
 incremental_writeout = if(DEBUG_MODE)
@@ -128,9 +130,6 @@ filler = data.frame( # what we'll use when data is empty or invalid, until the f
   row.names = error_types_idiomatic[1:7] # misprod...elongation
 ) %>% t %>% as.data.frame
 
-dummy <- function(.) NA # ignore argument, just return dummy value
-fill_dummy <- function(df, cols) cbind(df, setNames(lapply(cols, dummy), cols)) # fill dummy value into all listed cols
-
 ## Calculations about the passages themselves, for things like word ratios
 timestamp = now("America/New_York") %>% format("%Y%m%d_%I%M%P")
 scaffolds_path = paste(base, "code/scaffolds.xlsx", sep = '/')
@@ -175,7 +174,7 @@ build_full_passage_path <- function(passage_nickname, dir_root, participant_id)
     passage_nickname,
     '.xlsx')
 
-passage_path_to_df <- function(passage_path) {
+read_error_data_from_path <- function(passage_path) {
   df = data.frame(
     read_xlsx(passage_path)[2:9,], # get only the rows misprod ... corrected
     row.names = error_types_idiomatic
@@ -189,12 +188,12 @@ passage_name_to_df <- function(passage_nickname, participant_id, dir_root)
   cbind(scaffolds[[passage_nickname]], # include our scaffolds
         passage_nickname %>%
           build_full_passage_path(dir_root, participant_id) %>% # path
-          passage_path_to_df
+          read_error_data_from_path
   )
 
 ## Counting up totals for a given passage
-count_rows_with_a_one <- function(df)
-  filter(df, if_any(everything(), ~ . == 1)) %>% nrow
+# count_rows_with_a_one <- function(df)
+#   filter(df, if_any(everything(), ~ . == 1)) %>% nrow
 
 complain_when_invalid <- function(passage_df, participant_id, passage_nickname) {
   report = paste("\n\t\t<< ERROR REPORT", participant_id, "-", passage_nickname, ">>")
@@ -215,8 +214,7 @@ complain_when_invalid <- function(passage_df, participant_id, passage_nickname)
 }
 
 
-
-collapse_by_word <- function(scaffolded_df) {
+collapse_by_word <- function(passage_df) {
   # for each word in the passage, consider it an error when any of its syllables
   # is marked as such
 
@@ -225,15 +223,14 @@ collapse_by_word <- function(scaffolded_df) {
   # if yes TRUE
   # else FALSE
 
-  scaffolded_df %>%
+  passage_df %>%
     mutate(across(misprod:corrected,
-                  ~ as.logical(as.integer(.)))) %>%
-    group_by(word_id) %>%
-    summarize(across(misprod:corrected, ~ any(.)))
+                  ~ as.logical(as.integer(.)))) %>% # make booleans
+    group_by(word_id) %>%                           # collapse as words
+    summarize(across(misprod:corrected, ~ any(.)))  # error on this word or not?
 }
 
 
-
 get_frequency_for_word <- function(word) {
   # caution: if a lemma is desired, it must be derived elsewhere (this function
   # does not contain that logic)
@@ -249,10 +246,10 @@ get_frequencies_for_passage <- function(nickname)
 
 passage_frequencies = into_dict(titles, get_frequencies_for_passage)
 
-append_words_and_frequencies <- function(collapsed_scaffolded_df, nickname)
+append_words_and_frequencies <- function(collapsed_df, nickname)
   cbind(word           = word_lists[[nickname]],
         log10frequency = passage_frequencies[[nickname]],
-        collapsed_scaffolded_df)
+        collapsed_df)
 
 
 
@@ -279,7 +276,6 @@ append_words_and_frequencies <- function(collapsed_scaffolded_df, nickname)
 colnames_from_range <- function(df, colrange)
   colnames(select(df, {{colrange}}))
 
-
 append_lookback <- function(df, col, lookback_index)
   # Add a new column (e.g. prev_misprod4) representing the value of e.g. misprod, four rows prior.
   # This is useful for hunting for patterns of errors in a particular sequence
@@ -315,16 +311,12 @@ a_b_sequence_lookback <- function(df, errtypes_a, errtypes_b, prior_context = 1)
   df_with_lhs_lookbacks = append_lookbacks_multicol(df, {{errtypes_a}}, prior_context)
 
 
-  filter(df_with_lhs_lookbacks,
-         if_any(rhs_cols, ~ . == 1) & if_any(matches(lookbacks_regex), ~ . == 1))
+  mutate(df_with_lhs_lookbacks,
+         "any_prior_{{errtypes_a}}" := if_any(matches(lookbacks_regex), ~ . == 1),
+         "{{errtypes_b}}_with_any_prior_{{errtypes_a}}" := if_any(rhs_cols, ~ . == 1) & if_any(matches(lookbacks_regex), ~ . == 1))
 }
 
-count_a_b_sequences_lookback <- function(df, errtypes_a, errtypes_b, prior_context = 1)
-  a_b_sequence_lookback(df, {{errtypes_a}}, {{errtypes_b}}, prior_context) %>% nrow
-
-
 # All the above, with lookaheads
-# seems logically equivalent, but in case
 append_lookahead <- function(df, col, lookahead_index)
   # Add a new column (e.g. next_misprod4) representing the value of e.g. misprod, four rows ahead.
   # This is useful for hunting for patterns of errors in a particular sequence
@@ -359,63 +351,63 @@ a_b_sequence_lookahead <- function(df, errtypes_a, errtypes_b, forward_context =
 
   df_with_rhs_lookaheads = append_lookaheads_multicol(df, {{errtypes_b}}, forward_context)
 
-
-  filter(df_with_rhs_lookaheads,
-         if_any(lhs_cols, ~ . == 1) & if_any(matches(lookaheads_regex), ~ . == 1))
+  mutate(df_with_rhs_lookaheads,
+         "any_upcoming_{{errtypes_b}}" := if_any(matches(lookaheads_regex), ~ . == 1),
+         "{{errtypes_a}}_with_any_upcoming_{{errtypes_b}}" := if_any(lhs_cols, ~ . == 1) & if_any(matches(lookaheads_regex), ~ . == 1)
+  )
 }
 
-count_a_b_sequences_lookahead <- function(df, errtypes_a, errtypes_b, forward_context = 1)
-  a_b_sequence_lookahead(df, {{errtypes_a}}, {{errtypes_b}}, forward_context) %>% nrow
+# we just want from lookaheads and lookbacks:
+# was there ANY in previous 5
+# psuedocode: any(prev_*)
+# likewise for next
 
+# & all of this for misprod->hes and hes->misprod
 
+# so we add four columns:
+# is it a misprod with preceding hes?
+# is it a misprod with following hes?
+# is it a hes with preceding misprod?
+# is it a hes with following misprod?
 
+append_pes_annotation_cols <- function(df, errtypes_a, errtypes_b, forward_context = 1, prior_context = 1) {
+  # a: LHS errors; b: RHS errors; context: how many rows forward we can look for RHS errors
 
-error_summary <- function(passage_df, passage_name) {
-  # a quick repair instead instead of an error: so we don't have to halt everything and start over
-  if (any(passage_df == FALSE)) {
-    return(passage_df %>% fill_dummy(c("errors", "corrections", "uncorrected_errors", "skipped_end")))
-  }
+  lhs_cols = colnames_from_range(df, {{errtypes_a}})
+  rhs_cols = colnames_from_range(df, {{errtypes_b}})
 
-  summary <- count_errors_by_type(passage_df)
-  # passage_x_scaffold <- cbind(scaffolds[[passage_name]], passage_df)
-  return(summary %>% cbind(
-    words_with_misprod   = count_errors_by_word(passage_df, misprod),
-    words_with_hes       = count_errors_by_word(passage_df, hesitation),
-    hes_with_misprod_in_previous_syllables = count_a_b_sequences_lookback(passage_df, misprod, hesitation, prior_context = 5),
-    hes_with_misprod_in_next_syllables     = count_a_b_sequences_lookahead(passage_df, hesitation, misprod, forward_context = 5),
-    misprod_with_hes_in_previous_syllables = count_a_b_sequences_lookback(passage_df, hesitation, misprod, prior_context = 5),
-    misprod_with_hes_in_next_syllables     = count_a_b_sequences_lookahead(passage_df, misprod, hesitation, forward_context = 5),
-    errors               = count_error_syllables_any_type(passage_df),
-    corrections          = count_corrected_error_syllables(passage_df),
-    uncorrected_errors   = count_uncorrected_error_syllables(passage_df),
-    skipped_end          = all(passage_df$omit %>% tail(10) == 1)
-  ))
-}
+  lookaheads_regex = rhs_cols %>% paste(collapse = "|") %>% paste("next_(", ., ").*", sep = "") # as in next_(misprod|hesitation).*
+  df_with_rhs_lookaheads = append_lookaheads_multicol(df, {{errtypes_b}}, forward_context)
+
+  lookbacks_regex = lhs_cols %>% paste(collapse = "|") %>% paste("prev_(", ., ").*", sep = "") # as in prev_(misprod|hesitation).*
+  df_with_lhs_lookbacks = append_lookbacks_multicol(df, {{errtypes_a}}, prior_context)
 
+  df_with_rhs_lookaheads_and_lhs_lookbacks =
+    df %>%
+    a_b_sequence_lookahead({{errtypes_a}}, {{errtypes_b}}, forward_context) %>%
+    a_b_sequence_lookback({{errtypes_a}}, {{errtypes_b}}, prior_context)
+
+  return(df_with_rhs_lookaheads_and_lhs_lookbacks)
+}
 
 status_message <- function(passage_name, participant_id) {
-  status = paste("Generating summary from participant ", participant_id, "'s ",
+  status = paste("Generating word errors from participant ", participant_id, "'s ",
                  passage_name, " passage...",
                  sep = '')
   message(status)
 }
 
-append_rates <- function(error_df, colrange, count) {
-  mutate(error_df, across({{colrange}}, \(x) x / count, .names = "{.col}_rate"))
-}
-
 error_summary_with_metadata <- function(passage_nickname, participant_id, dir_root) {
-  #passage_nickname = fs::path_ext_remove(passage_name) # chomp 'bees.xlsx' to 'bees', e.g.
   if(DEBUG_MODE) status_message(passage_nickname, participant_id)
 
   summary =
     passage_name_to_df(passage_nickname, participant_id, dir_root) %>%
-    complain_when_invalid(participant_id, passage_nickname) %>%
-    error_summary(passage_nickname) %>%
-    append_rates(misprod:elongation, syllable_counts[[passage_nickname]]) %>%    # then errors per syllable
-    append_rates(starts_with("words_with"), word_counts[[passage_nickname]]) %>% # then errors per word, where applicable
-    append_rates(starts_with("hes_with"), syllable_counts[[passage_nickname]]) %>% # rates of misproductions relative to a central hesitation (by syllables)
-    append_rates(starts_with("misprod_with"), syllable_counts[[passage_nickname]]) # rates of hesitations relative to a central misproduction (by syllables)
+    # complain_when_invalid(participant_id, passage_nickname) %>%
+    collapse_by_word %>%
+    append_words_and_frequencies(passage_nickname) %>%
+    append_pes_annotation_cols(misprod, hesitation, forward_context = 5, prior_context =  5) %>%
+    append_pes_annotation_cols(hesitation, misprod, forward_context = 5, prior_context =  5) %>%
+    select(word:corrected, matches("(^|.*_)any.*[_$]")) # any_prior_misprod, any_following_hesitation, any_prior_hesitation, any_following_misprod)
 
   return(cbind(
     id = participant_id, # pre-pose an id column
@@ -424,11 +416,13 @@ error_summary_with_metadata <- function(passage_nickname, participant_id, dir_ro
   ))
 }
 
-
 # All passages for a participant
 generate_summary_for_each_passage_with_metadata <- function(dir_root, participant_id, write_to = incremental_writeout) {
-  df = build_participant_dirname(dir_root, participant_id) %>% dir %>% # passage name _with_ the extension
-      map_df(error_summary_with_metadata, participant_id, dir_root)
+  df =
+    build_participant_dirname(dir_root, participant_id) %>% # identify their folder
+    dir %>% # pick out: which passages did they actually read?
+    fs::path_ext_remove() %>% # take them _without_ the extension
+    map_df(error_summary_with_metadata, participant_id, dir_root) # glue that together into a df
 
   if (!is.null(write_to) && fs::is_dir(write_to)) { # incremental CSV: just this participant, with all their passages
     outfile_debug = paste(write_to, "/", participant_id, "_", timestamp, '.csv', sep = "")
@@ -438,9 +432,6 @@ generate_summary_for_each_passage_with_metadata <- function(dir_root, participan
   return(df)
 }
 
-summarize_numeric_cols <- function(df, f, label = as.character(substitute(f))) # currently unused- previously in function below with mean and sd
-  cbind(id = label, reframe(df, across(where(is.numeric), f)))
-
 # Now, for each participant under a directory, each identified by the form sub_XXXXXX_reconciled,
 # call generate_summary_for_each_passage_with_metadata(the_parentdir_of_all_those, that_id)
 summarize_errors_in_subdirectories <- function(dir_root, subfolder_match)
@@ -461,7 +452,7 @@ label = "disfluencies_subject-x-passage_"
 outpath <- paste(sep = "", annotations_base, '/', label, timestamp, ".csv")
 # e.g. "./some/path/disfluencies_20230520_1240pm.csv"
 
-github_root %>%
+github_root %>% # passage_dir %>%
   summarize_errors_in_subdirectories("^sub-\\d{6}_reconciled$") %>%
   write_csv(outpath)