library(Hmisc) library(assertthat) library(dplyr) # Define functions -------------------------------------------------------- wtd_quantile <- Hmisc::wtd.quantile is.all.na.or.zero <- function(x){ return(length(x[!is.na(x) & x != 0]) == 0) } #' Apply top and bottom coding with log IQR to a level-defined file #' #' Lower-level functions used within 'implement_top_code_with_iqr()' and #' 'implement_bottom_code_with_iqr()' of the 'lissyrtools' package. #' #' @param file A tibble or data.frame with a LIS or LWS file. #' @param file_name A string with the name of the LIS or LWS file. #' @param variable A string with the variable to which top coding should be applied. #' @param times A numeric indicating the scale factor for IQR. Defaults to 3. #' @param variable_level A string with the level of the variable. Should be either 'household' or 'person'. #' If NULL (default), the function will try to guess the level of the variable. #' This is done by comparing the value in 'variable' with pre-set lists of variables. #' @param weight A string with the name of the variable in 'file' that should be #' used as sample weights. #' #' @return A tibble containing the file with the recoded variable. #' #' @keywords internal implement_top_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr), true = exp(log_third_quartile + log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' @rdname implement_top_code_with_iqr_hfile implement_bottom_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr), true = exp(log_first_quartile - log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' Read and process a single file for the family IKF estimates #' #' @param dataset_name A character vector of length one with the name of the dataset #' to read and process in 'ccyy' format. E.g. 'lu10'. #' #' @return A tibble with 6 columns: 'hid', 'dhi', 'did', 'nhhmem', 'person_weight', 'children_weight'. read_and_process_family <- function(dataset_name) { hfile_name <- paste(dataset_name,'h',sep='') pfile_name <- paste(dataset_name,'p',sep='') h_file <- read.LIS(hfile_name, vars = c("did", "hid", "hpartner", "nhhmem", "nhhmem17", "dhi", "hwgt"), subset = "!is.na(dhi) & !is.na(hwgt)", labels=FALSE) p_file <- read.LIS(pfile_name, vars = c("did", "hid", "relation", "sex"), labels=FALSE) %>% dplyr::filter(relation == 1000) h_file <- h_file %>% dplyr::left_join(p_file, by = c("did", "hid")) h_file[["dhi"]] <- dplyr::if_else((h_file[["dhi"]]< 0) & !is.na(h_file[["dhi"]]), true = 0, false = as.numeric(h_file[["dhi"]])) h_file$dhi <- h_file$dhi / sqrt(h_file$nhhmem) h_file <- implement_top_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- implement_bottom_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- h_file %>% dplyr::mutate(person_weight = hwgt * nhhmem, children_weight = hwgt * nhhmem17) %>% dplyr::select(-hwgt) return(h_file) } #' Compute the children poverty estimates for a single file #' #' @description #' Computes the children poverty estimates estimates for a single file. #' #' To be used within 'compute_ikf_estimates()'. #' #' @param file A tibble with the merged household and person-level files. #' #' @return A list with 3 elements. #' @keywords internal compute_ikf_children_poverty_rates <- function(file){ output <- list(poverty_two_parents = NA_real_, poverty_single_mother = NA_real_, prop_children_single_mother_family = NA_real_) if(is.all.na.or.zero(file[["dhi"]])){ return(output) } median_dhi <- unname(wtd_quantile(x = file[["dhi"]], weights = file[["person_weight"]], probs = 0.5)) poverty_50 <- 0.5 * median_dhi file <- file %>% dplyr::mutate(two_parents = dplyr::if_else(hpartner == 1 & nhhmem > 2 & nhhmem17 > 0, true = TRUE, false = FALSE), single_mother = dplyr::if_else(sex == 2 & hpartner == 0 & nhhmem17 > 0, true = TRUE, false = FALSE), poor_50 = dplyr::if_else(dhi < poverty_50, true = 1, false = 0)) output[["poverty_two_parents"]] <- dplyr::filter(file, two_parents) %>% dplyr::summarise(poverty_two_parents = sum(poor_50 * children_weight, na.rm = TRUE)/sum(children_weight, na.rm = TRUE)) %>% dplyr::pull(poverty_two_parents) output[["poverty_single_mother"]] <- dplyr::filter(file, single_mother) %>% dplyr::summarise(poverty_single_mother = sum(poor_50 * children_weight, na.rm = TRUE)/sum(children_weight, na.rm = TRUE)) %>% dplyr::pull(poverty_single_mother) output[["prop_children_single_mother_family"]] <- dplyr::summarise(file, prop_children = sum(single_mother * children_weight/sum(children_weight, na.rm = TRUE), na.rm = TRUE)) %>% dplyr::pull(prop_children) return(output) } # Implement functions ----------------------------------------------------- h_file <- read_and_process_family("lu10") # <- Insert dataset in 'ccyy' format (e.g. 'lu10') compute_ikf_children_poverty_rates(h_file)