RNA-seq分析之基因ID转换
跑了生信项目这么久,还是得做一下总结,将常用的代码记录保存一下,以免电脑坏了又没备份,代码在确保运转稳定后使用AI进行补充和完善,适合立马使用
# 1. 基础设置
setwd("G:/aml") # 替换为你的实际路径
rm(list = ls()) # 清空环境变量# 2. 安装并加载必需的R包
if (!require("BiocManager", quietly = TRUE)) {install.packages("BiocManager")
}# 安装依赖包
packages <- c("AnnotationDbi", "org.Hs.eg.db", "dplyr", "tidyr")
for (pkg in packages) {if (!require(pkg, quietly = TRUE, character.only = TRUE)) {if (pkg %in% c("AnnotationDbi", "org.Hs.eg.db")) {BiocManager::install(pkg, update = FALSE)} else {install.packages(pkg)}}
}# 加载包
library(AnnotationDbi)
library(org.Hs.eg.db)
library(dplyr)
library(tidyr)# 3. 读取原始表达数据
dat <- read.table(file = "AML.txt", # 基因表达矩阵文件header = TRUE, # 第一行为列名sep = "\t", # 制表符分隔check.names = FALSE, # 保留原始列名格式stringsAsFactors = FALSE
)
#可供转换的基因名字
keytypes(org.Hs.eg.db)# 查看数据基本信息
cat("原始数据维度:", dim(dat), "\n")
head(dat[, 1:3])# 4. 基因注释(Entrez ID转基因符号)
# 提取Entrez ID
entrez_ids <- as.character(dat$GeneID)# 注释基因(获取SYMBOL、GENENAME)
annot_result <- AnnotationDbi::select(x = org.Hs.eg.db,keys = entrez_ids,columns = c("SYMBOL", "GENENAME", "ENTREZID"),keytype = "ENTREZID"
)# 查看注释结果
cat("\n注释结果维度:", dim(annot_result), "\n")
head(annot_result)# 5. 注释结果去重
annot_unique <- annot_result %>%distinct(ENTREZID, .keep_all = TRUE) # 按Entrez ID去重cat("\n去重后注释行数:", nrow(annot_unique), "\n")# 6. 合并注释到原始数据并清理
# 转换为字符型确保匹配
dat$GeneID <- as.character(dat$GeneID)
annot_unique$ENTREZID <- as.character(annot_unique$ENTREZID)# 合并注释信息
dat_annotated <- left_join(x = dat,y = annot_unique[, c("ENTREZID", "SYMBOL", "GENENAME")],by = c("GeneID" = "ENTREZID")
)# 调整列顺序并删除冗余列
dat_clean <- dat_annotated %>%select(SYMBOL, GENENAME, everything()) %>% # 注释列放前面select(-GeneID) %>% # 移除原始ID列filter(!is.na(SYMBOL)) # 保留有基因符号的行cat("\n清理后数据维度:", dim(dat_clean), "\n")# 7. 按基因符号合并重复基因(取均值,NA用0代替)
sample_cols <- setdiff(colnames(dat_clean), c("SYMBOL", "GENENAME")) # 样本列名dat_merged <- dat_clean %>%group_by(SYMBOL) %>%summarise(GENENAME = first(na.omit(GENENAME)), # 保留基因名across(all_of(sample_cols), ~mean(.x, na.rm = TRUE)), # 样本列取均值.groups = "drop") %>%mutate(across(all_of(sample_cols), ~replace_na(.x, 0))) # NA替换为0cat("\n合并后数据维度:", dim(dat_merged), "\n")dat_merged<-as.data.frame(dat_merged)# 8. 保存最终结果
rownames(dat_merged) <- dat_merged$SYMBOL # 基因符号设为行名
dat_final <- select(dat_merged, -SYMBOL)#移除重复的基因列
dat_final <- select(dat_final, -GENENAME)write.table(x = dat_final,file = "count_Matrix_annotated.txt",sep = "\t",row.names = TRUE,col.names = TRUE,quote = FALSE,na = ""
)# 最终验证
cat("\n最终矩阵维度:", dim(dat_final), "\n")
cat("结果保存路径:", file.path(getwd(), "count_Matrix_annotated.txt"), "\n")
head(dat_final[, 1:3])