R语言高效数据处理-变量批量统计检验
A:以下的4个检验都会自动判断数据里面的数据类型,然后选择对应的变量作统计检验;
B:检验结果中有的是捕获了检验警告信息的,帮助选择检验结果数据;
C:所有检验均只返回统计检验结果数据,不判断使用哪种检验结果作为最终需要的数据,有需要这一层判断自行添加逻辑
1、单样本检验
#date_base为需要检验变量的数据,虽然单样本检验为连续型变量,
#但这里不用去选择变量的数据类型,函数里面会去判断
one_independent_test <- function(data_base){data_base %>% select(where(is.double)) %>% imap(~ {shapiro_p <- shapiro.test(.x)$p.valuet_or_wil_test=if_else(shapiro.test(.x)$p.value > 0.05,t.test(.x) %>% tidy(),wilcox.test(.x) %>% tidy())bind_cols(tibble(shapiro_p,t_or_wil_test))}) %>% bind_rows( .id = "variable")
}2、两独立样本检验
#date_base为需要检验变量的数据,group_var为分组的变量,
#虽然两独立样本检验为连续型变量,但这里不用去选择变量的数据类型,函数里面会去判断
two_independent_test <- function(data_base,group_var){group_vec <- data_base %>% pull({{group_var}})test_var_list=data_base %>% select(where(is.double))all_test_res=test_var_list %>% imap(~{shapiro_res=data_base %>%group_by({{group_var}}) %>%summarise(test_result = list(shapiro.test(!!sym(.y)))) %>%mutate(results = map(test_result, tidy)) %>%unnest(results)levene_res=leveneTest(.x~group_vec,data = data_base)%>% tidy() %>%mutate(method="Levene's Test for Homogeneity")t_res_vareaual=t.test(.x~group_vec,data = data_base,var.equal=TRUE) %>% tidy()t_res_varuneaual=t.test(.x~group_vec,data = data_base,var.equal=FALSE) %>% tidy()wilcox_res=wilcox.test(.x~group_vec,data = data_base)%>% tidy()per_test_res=bind_rows(shapiro_res,levene_res,t_res_vareaual,t_res_varuneaual,wilcox_res)})%>%bind_rows(.id = "variable")
}3、配对样本检验
#date_base为需要检验变量的数据,
#配对样本检验为连续型变量,这里不用去选择变量的数据类型,函数里面会去判断,
#但需要配对检验的变量在判断完数据类型后是相邻的,否则得到结果不是预期的
two_paired_test <- function(data_base){test_var_list=data_base %>% select(where(is.double)) %>% names()var_pair_former=head(test_var_list,-1)var_pair_latter=tail(test_var_list,-1)names(var_pair_former) <- var_pair_formernames(var_pair_latter) <- var_pair_latterall_test_res=map2(var_pair_former,var_pair_latter,~{x <- data_base[[.x]]y <- data_base[[.y]]z <- x-yshapiro_res=bind_rows(shapiro.test(x) %>% tidy() %>% mutate(variable_format=.x),shapiro.test(y) %>% tidy() %>% mutate(variable_format=.y),shapiro.test(z) %>% tidy() %>% mutate(variable_format=str_c('diff',.x,'-',.y)))t_paired_res=t.test(x,y,data = data_base,paired=TRUE) %>%tidy() %>% mutate(variable_format=paste0(.x,'~',.y))wilcox_paired_res=wilcox.test(x, y,data = data_base,paired=TRUE)%>% tidy() %>% mutate(variable_format=paste0(.x,'~',.y))per_test_paired_res=bind_rows(shapiro_res,t_paired_res,wilcox_paired_res)})%>%bind_rows( .id= "variable")
}
4、分类样本检验
#date_base为需要检验变量的数据,group_var为分组变量
#分类样本检验为分类变量,这里不用去选择变量的数据类型,函数里面会去判断
multi_category_test <- function(data_base,group_var){test_var_list=data_base %>% select(where(is.character)) %>%select(-{{group_var}}) %>% names()names(test_var_list) <- test_var_listall_test_res=test_var_list %>% imap(~{var_distinct=data_base %>% select(.x) %>% distinct()var_distinct_mum=data_base %>% select(.x) %>%n_distinct()data_base_nomiss=if(str_detect(var_distinct,"missing")&var_distinct_mum==2){data_base %>%filter(is.na(!!sym(.x))==FALSE)}else{data_base %>%filter(!!sym(.x)!='missing') %>%filter(is.na(!!sym(.x))==FALSE)}chisq_base_table=data_base_nomiss %>%tabyl(!!sym(.x),{{group_var}})var_distinct_mum_deal=data_base_nomiss %>% select(.x) %>%n_distinct()warn_msg <- NA_character_chisq_not_correct_res=if(var_distinct_mum_deal==1){return(tibble(test_tag = NA_character_,method = NA_character_,p.value = NA_real_,statistic = NA_real_,warning_message = "Only one unique value"))}else{withCallingHandlers({test_res <- chisq.test(chisq_base_table, correct = F) %>% tidy()%>% remove_rownames()},warning = function(w) {warn_msg <<- conditionMessage(w)invokeRestart("muffleWarning") # 阻止警告打印})}chisq_not_correct_res$warning_message <- warn_msgchisq_not_correct=chisq.test(chisq_base_table,correct=F)chisq_not_correct_observed=chisq_not_correct$observed %>%mutate(test_tag='observed')%>% select(-.x)print(chisq_not_correct_observed)chisq_not_correct_expected=chisq_not_correct$expected %>%mutate(test_tag='expected')%>% select(-.x)chisq_correct_res=chisq.test(chisq_base_table,correct=T) %>%tidy()%>% remove_rownames()fisher_test_res=fisher.test(chisq_base_table) %>% tidy()%>% remove_rownames()per_test_res=bind_rows(chisq_not_correct_res,chisq_not_correct_observed,chisq_not_correct_expected,chisq_correct_res,fisher_test_res)}) %>% bind_rows(.id = "variable") %>% remove_rownames()
}