RNA-seq分析之单基因Wilcoxon秩和检验
适用于单基因分析差异显著性,不考虑Log2FC值,仅能做验证使用,所得不可作为差异基因进行后续分析
# 1. 安装并加载必需包(确保包存在,避免报错)
if (!require("dplyr", quietly = TRUE)) install.packages("dplyr")
if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(dplyr)
library(ggplot2)# 2. 基础设置与数据读取
setwd("G:/aml") # 替换为你的实际数据路径
# 读取表达矩阵(行=基因,列=样本)
readCount <- read.table("merged_AML_CML_with_0.txt", header = TRUE, check.names = FALSE, # 保留原始样本名row.names = 1
)
# 读取分组文件(需包含sample列和group列)
conditions <- read.table("group.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE
)# 3. 数据检查与样本顺序匹配(避免后续分析错位)
cat("表达矩阵维度(基因数×样本数):", dim(readCount), "\n")
cat("分组类型及样本数:\n")
print(table(conditions$group)) # 确认分组是否为2组(Wilcoxon适合两组比较)# 匹配样本顺序(关键!确保表达量与分组一一对应)
conditions <- conditions[match(colnames(readCount), conditions$sample), ]
stopifnot(all(colnames(readCount) == conditions$sample)) # 不匹配则报错提醒# 4. 提取目标基因的表达量(这里以SIRT6为例,可替换为其他基因)
gene_interest <- "SIRT6"
# 检查基因是否在表达矩阵中,避免报错
if (!gene_interest %in% rownames(readCount)) {stop(paste("基因", gene_interest, "不在表达矩阵中,请检查基因名!"))
}# 整理基因表达+分组数据(转置确保样本为行)
expr_df <- data.frame(sample = colnames(readCount),expression = as.numeric(readCount[gene_interest, ]), # 目标基因表达量group = conditions$group, # 对应分组stringsAsFactors = FALSE
)# 5. Wilcoxon秩和检验(两组差异分析核心步骤)
# 注:仅适用于2组比较,多组需用Kruskal-Wallis检验
wilcox_result <- wilcox.test(expression ~ group, data = expr_df)
p_val <- wilcox_result$p.value # 提取p值
cat("\n", gene_interest, "基因两组差异检验结果:\n")
cat("p值 =", round(p_val, 4), "\n")
cat("差异显著性:", ifelse(p_val < 0.05, "显著(p<0.05)", "不显著(p≥0.05)"), "\n")# 6. 查看分组表达量统计(辅助解读结果)
expr_stats <- expr_df %>%group_by(group) %>%summarise(样本数 = n(),均值 = round(mean(expression), 2),中位数 = round(median(expression), 2),.groups = "drop")
print(expr_stats)# 7. 绘制箱线图(可视化差异)
# 若表达量为原始计数,可加log2转换(如expression = log2(expression + 1))
p <- ggplot(expr_df, aes(x = group, y = expression, fill = group)) +geom_boxplot(alpha = 0.7, width = 0.6) + # 箱线图主体# 标注p值(放在两组中间,高于最大值10%的位置)annotate("text",x = mean(c(1, 2)),y = max(expr_df$expression) * 1.1,label = paste("p =", round(p_val, 4)),size = 4) +# 图表标签与主题labs(title = paste(gene_interest, "Gene Expression in Different Groups"),x = "Group",y = "Expression Level" # 若用log转换,可改为"Expression (log2(count+1))") +theme_bw() +theme(plot.title = element_text(hjust = 0.5, size = 12), # 标题居中legend.position = "none", # 移除图例(x轴已显示分组)axis.text = element_text(size = 10),axis.title = element_text(size = 11))# 显示并保存图片
print(p)
ggsave(paste(gene_interest, "_expression_boxplot.pdf", sep = ""),plot = p,width = 5,height = 4,dpi = 300
)
cat("\n箱线图已保存为:", paste(gene_interest, "_expression_boxplot.pdf", sep = ""))