R语言随机森林分析显示R方和P值
最近在做一个因子的驱动力分析。我参考了相关文献,写了个函数,希望能帮助到大家。
library(ggplot2)
library(tidyverse)
library(randomForest)
library(devtools)
library(rfUtilities)
library(rfPermute)
library(forcats)rfCalulate2 <- function(data_excel,dep_var,formulaTXT,imagename){# 定义变量名映射表 - 在这里自定义变量显示名称var_name_mapping <- c("npp00_23" = "NPP","FVC00_23" = "植被覆盖度","GDP00_23" = "GDP","pop00_23" = "人口","tem00_23" = "温度","pre00_23" = "降雨",#"wind00_23" = "风速","forest00_23" = "森林面积","nature00_23" = "自然生境面积","wetland00_23" = "湿地面积","csh00_23" = "城市化率","TGDP00_23" = "旅游收入","Tpop00_23" = "旅游人口","WZL00_23" = "农业产品","WZC00_23" = "林产品", "WZM00_23" = "牧产品","WZY00_23" = "渔产品","WZS00_23" = "生态能源","WZW00_23" = "水资源")# 定义因变量显示名称dep_var_names <- c("GEP00_23" = "Gross Ecosystem Product","TJ00_23" = "调节服务", "WH00_23" = "文化服务","WZ00_23" = "物质供给")mydata <- read_xlsx(data_excel) %>% # 读取Excel数据#select(4:30) %>%# 获取前八列作为分析变量#log1p() %>% # 对数变换 有负数的话就会存在nodata值na.omit() %>% # 移除缺失值as.data.frame() # 转换为数据框# 打印NaNs值的行mydata <- as.data.frame(mydata)set.seed(123)# for i in ["TJ00_23", "WH00_23", "WZ00_23", "GEP00_23"]:# print(i)formula_str <- paste(dep_var, formulaTXT)#+ wind00_23 + csh00_23treat_rf <- randomForest(as.formula(formula_str), data = mydata, importance = TRUE, proximity = TRUE)# 置换检验显著性set.seed(123)treat_perm <- rf.significance(treat_rf, mydata, nperm = 99, ntree = 500)treat_perm# 检查 treat_perm 的结构,提取 p-valuep_value <- as.numeric(treat_perm$pValue) # 如果出错,请用 `str(treat_perm)` 确认字段名# 如果 p-value 提取失败,提示检查数据结构if (is.na(p_value)) { stop("Unable to extract or convert p-value. Check treat_perm structure.")}# 提取模型的 R-squaredr_squared <- round(treat_rf$rsq[500], 4)# 变量重要性分析并转换数据set.seed(123)ANPP_rfP <- rfPermute(as.formula(formula_str), data = mydata, ntree = 500, na.action = na.omit, nrep = 100, num.cores = 1)ANPP_dat <- importance(ANPP_rfP, sort.by = NULL, decreasing = TRUE)# 数据转换并添加显著性标记plot_data <- ANPP_dat %>% as_tibble(rownames = "names") %>% data.frame() %>% mutate( label = if_else(X.IncMSE.pval < 0.001, "***", if_else(X.IncMSE.pval < 0.01, "**", if_else(X.IncMSE.pval < 0.05, "*", "ns"))), X.IncMSE = as.numeric(X.IncMSE), group = if_else(label == "ns", "Non-Significant", "Significant") ) %>% arrange(X.IncMSE) %>% # 按 X.IncMSE 值升序排序 mutate(names = factor(names, levels = names)) # 确保因子水平顺序为升序# 自定义颜色custom_colors <- c("Non-Significant" = "#FD7D6D", "Significant" = "#008CB7")# 应用变量名重命名plot_data <- plot_data %>%mutate(display_names = ifelse(names %in% names(var_name_mapping), var_name_mapping[as.character(names)], as.character(names)),display_names = factor(display_names, levels = unique(display_names)))# 自定义颜色custom_colors <- c("Non-Significant" = "#FD7D6D", "Significant" = "#008CB7")# 获取因变量显示名称dep_display_name <- ifelse(dep_var %in% names(dep_var_names), dep_var_names[dep_var], dep_var) # 绘图p <- ggplot(plot_data, aes(x = display_names, y = X.IncMSE, fill = group)) + geom_bar(stat = "identity") + scale_fill_manual(values = custom_colors) + geom_text(aes(y = X.IncMSE + 1, label = label), hjust = 0, size = 3) + labs(x = "", y = "%IncMSE", fill = "Significance",title = paste("Variable Importance for", dep_display_name)) + coord_flip() + theme_bw() + theme(legend.position = "right") + # 在右下角添加注释 annotate("text", x = 1, y = max(plot_data$X.IncMSE) * 0.5, hjust = 0, label=paste0("p-value:",round(p_value,4),"\nR-squared:",r_squared))# 保存图形(可选)ggsave(imagename, p, width = 12, height = 9, dpi = 300)#print(p)
运行的代码。
data_excel<- "change_analysis.xlsx"
dep_var <- "WZ00_23"
formulaTXT <- "~ tem00_23 + wetland00_23 + csh00_23 + TGDP00_23 + Tpop00_23 + WZL00_23 + WZC00_23 + WZM00_23 + WZY00_23 + WZS00_23 + WZW00_23"rfCalulate2(data_excel,dep_var,formulaTXT,imagename="image/变化-WZ00_23.png")
目前这个存在的问题
1.目前这个模型还存在一个问题,就是这个模型有些因子加进去可能是增加其均方差的,删除其后其的排序又会有所变化。
2.另外,就是当调整变量顺序的时候,其影响因子的排序和效果也会发生改变。
参考文献
https://mp.weixin.qq.com/s/aKk_DXczV4MtBg9we_PVBQ
