library(Hmisc) library(assertthat) library(dplyr) # Define functions -------------------------------------------------------- gini <- function(x,weight=rep(1,length(x))) { ox <- order(x) x <- x[ox] weight <- weight[ox]/sum(weight) p <- cumsum(weight) nu <- cumsum(weight*x) n <- length(nu) nu <- nu/nu[n] res <- sum(nu[-1]*p[-n])-sum(nu[-n]*p[-1]) return(res) } atkinson <- function(x,weight=rep(1,length(x)),epsilon=1) { keep <- which(!is.na(x)) x <- x[keep] weight <- weight[keep] x <- x/mean(x) weight <- weight/sum(weight) res <- ifelse(epsilon==1,1-(prod(exp(weight*log(x)))/sum(x*weight/sum(weight))), 1-(sum(((x/sum(x*weight/sum(weight)))^(1-epsilon))*weight/sum(weight)))^(1/(1-epsilon))) return(res) } wtd_quantile <- Hmisc::wtd.quantile is.all.na.or.zero <- function(x){ return(length(x[!is.na(x) & x != 0]) == 0) } #' Apply top and bottom coding with log IQR to a level-defined file #' #' Lower-level functions used within 'implement_top_code_with_iqr()' and #' 'implement_bottom_code_with_iqr()' of the 'lissyrtools' package. #' #' @param file A tibble or data.frame with a LIS or LWS file. #' @param file_name A string with the name of the LIS or LWS file. #' @param variable A string with the variable to which top coding should be applied. #' @param times A numeric indicating the scale factor for IQR. Defaults to 3. #' @param variable_level A string with the level of the variable. Should be either 'household' or 'person'. #' If NULL (default), the function will try to guess the level of the variable. #' This is done by comparing the value in 'variable' with pre-set lists of variables. #' @param weight A string with the name of the variable in 'file' that should be #' used as sample weights. #' #' @return A tibble containing the file with the recoded variable. #' #' @keywords internal implement_top_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr), true = exp(log_third_quartile + log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' @rdname implement_top_code_with_iqr_hfile implement_bottom_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr), true = exp(log_first_quartile - log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' Read and process a single file for the inequality IKF estimates #' #' @param dataset_name A character vector of length one with the name of the dataset #' to read and process in 'ccyy' format. E.g. 'lu10'. #' #' @return A tibble with 7 columns: 'hid', 'dhi', 'did', 'nhhmem', 'hwgt'. read_and_process_inequality <- function(dataset_name) { hfile_name <- paste(dataset_name,'h',sep='') h_file <- read.LIS(hfile_name, vars = c("did","hid","nhhmem","dhi","hwgt"), subset = "!is.na(dhi) & !is.na(hwgt)", labels=FALSE) h_file[["dhi"]] <- dplyr::if_else((h_file[["dhi"]]< 0) & !is.na(h_file[["dhi"]]), true = 0, false = as.numeric(h_file[["dhi"]])) h_file$dhi <- h_file$dhi / sqrt(h_file$nhhmem) h_file$hwgt <- h_file$hwgt * h_file$nhhmem h_file <- implement_top_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- implement_top_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- implement_bottom_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- h_file %>% dplyr::mutate(person_weight = hwgt * nhhmem) %>% dplyr::select(-hwgt) return(h_file) } compute_ikf_inequality_estimates <- function(hfile){ dhi <- h_file[["dhi"]] weight <- h_file[["person_weight"]] return(list(`gini` = gini(x = dhi, weight = weight), `atkinson_0.5` = atkinson(x = dhi, weight = weight, epsilon = 0.5), `atkinson_1` = atkinson(x = dhi, weight = weight, epsilon = 1), `ratio_90/10` = unname(wtd_quantile(x = dhi, weights = weight, probs = 0.9)/wtd_quantile(x = dhi, weights = weight, probs = 0.1)), `ratio_90/50` = unname(wtd_quantile(x = dhi, weights = weight, probs = 0.9)/wtd_quantile(x = dhi, weights = weight, probs = 0.5)), `ratio_80/20` = unname(wtd_quantile(x = dhi, weights = weight, probs = 0.8)/wtd_quantile(x = dhi, weights = weight, probs = 0.2)) )) } # Implement functions ----------------------------------------------------- h_file <- read_and_process_inequality("lu10") # <- Insert dataset in 'ccyy' format (e.g. 'lu10') compute_ikf_inequality_estimates(h_file)