当前位置: 首页 > news >正文

【TCGA-CRC】TCGA数据读取

写在前面

参考已有的帖子写的,但是临床数据和UCSC的不同。有知道的小伙伴欢迎指正。

rm(list = ls()); gc()
test1 = data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00ae9ab8-6eaa-4085-af72-26f96df97fa3/90c9f8cd-4c8c-4f07-af2f-e17db69bd561.rna_seq.augmented_star_gene_counts.tsv")
test2 = data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00cdff29-697a-4a17-ba67-cf55c006b827/0f35c851-1cb8-4f75-a661-eae9111b7362.rna_seq.augmented_star_gene_counts.tsv")
identical(test1$gene_id,test2$gene_id)
identical(test1$gene_name,test2$gene_name)
sum(duplicated(test1$gene_id))
sum(duplicated(test1$gene_name))
x1 <-data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00cdff29-697a-4a17-ba67-cf55c006b827/0f35c851-1cb8-4f75-a661-eae9111b7362.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_id"))
head(x1)
exp_COAD <- dir("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/",pattern = "*.rna_seq.augmented_star_gene_counts.tsv",#*表示任意前缀,$表示固定后缀recursive = T)
exp_READ <- dir("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/",pattern = "*.rna_seq.augmented_star_gene_counts.tsv",#*表示任意前缀,$表示固定后缀recursive = T)
#批量读入所有文件的unstranded列(counts)/tpm_unstranded列(TPM)
exp_dt_coad <- function(x){result <- data.table::fread(file.path("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/",x),select = c("tpm_unstranded"))return(result)
}
exp_dt_read <- function(x){result <- data.table::fread(file.path("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/",x),select = c("tpm_unstranded"))return(result)
}
exp_coad <- lapply(exp_COAD,exp_dt_coad)
exp_read <- lapply(exp_READ,exp_dt_read)exp_coad <- do.call(cbind,exp_coad)#将exp按列合并,并将list转化为data.table
exp_coad[1:6,1:6]
exp_read <- do.call(cbind,exp_read)#将exp按列合并,并将list转化为data.table
exp_read[1:6,1:6]#添加行名(gene_id),所有表达矩阵的gene_id是相同的:
exp_coad <- as.data.frame(exp_coad)#data.table格式不能自定行名,先转换为数据框
rownames(exp_coad) <- x1$gene_id
exp_coad[1:8,1:4]meta_dt <- jsonlite::fromJSON("./00_Rawdata/metadata.cart.2025-05-19.json")
id <- meta_dt$associated_entities
ids <- sapply(id,function(x){x[,1]}) #读取所有样本id,返回一个包含所有id名的向量(和前面lapply的区别,lapply返回的是list)
head(ids)
ids2 <- data.frame(file_name = meta_dt$file_name,id = ids) #生成一个包含有样本id和文件名这两列对应关系的数据框,文件名即为表达矩阵的文件名
head(ids2)
head(gsub("e_counts.tsv","",ids2$file_name))
# ids2$file_name<-gsub("e_counts.tsv","",ids2$file_name)
library(stringr)
exp_COAD2 <- str_split(exp_COAD,"/",simplify = T)#以/做拆分,simplify = T把结果返回成矩阵
head(exp_COAD2)
exp_READ2 <- str_split(exp_READ,"/",simplify = T)#以/做拆分,simplify = T把结果返回成矩阵
head(exp_READ2)
exp_COAD2 <- exp_COAD2[,2]
exp_READ2 <- exp_READ2[,2]
#用百分百in检测是否拆分后的文件名都完全包含在file_name中(此时顺序不同,因此不能用==)
table(exp_COAD2 %in% ids2$file_name)
table(exp_READ2 %in% ids2$file_name)#全部返回TRUE,则拆分无误
# 查看样本数是否一致
length(exp_COAD2)
length(exp_READ2)
length(ids2$file_name)
length(intersect(exp_COAD2,ids2$file_name))
length(intersect(exp_READ2,ids2$file_name))#用match函数调整ids2$file_name的顺序和exp_COAD2一致(因为我们合并的表达矩阵是按照exp_COAD2的顺序进行合并的)
ids3_COAD <- ids2[match(exp_COAD2,ids2$file_name),]
ids3_READ <- ids2[match(exp_READ2,ids2$file_name),]#检查顺序是否一致
head(ids3_COAD$file_name)
head(exp_COAD2)
head(exp_COAD)
identical(ids3_COAD$file_name,exp_COAD2)#返回TRUE,确认无误
#检查顺序是否一致
head(ids3_READ$file_name)
head(exp_READ2)
head(exp_READ)
identical(ids3_READ$file_name,exp_READ2)#返回TRUE,确认无误#修改列名为样本名。
colnames(exp_coad) <- ids3_COAD$id
exp_coad[1:8,1:2]
colnames(exp_read) <- ids3_READ$id
exp_read[1:8,1:2]
##转换为数据框,不然后面合并会丢失exp的行名。
list_coad <- as.data.frame(data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00ae9ab8-6eaa-4085-af72-26f96df97fa3/90c9f8cd-4c8c-4f07-af2f-e17db69bd561.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_name","gene_type")))
list_read <- as.data.frame(data.table::fread("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/00f55a16-0ee5-4939-8efb-de34e68d4ccd/ff11a9e3-d32b-431b-9ebb-c5a3d9eb0e4f.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_name","gene_type")))
exp_coad2 <- cbind(list_coad,exp_coad)
exp_coad2[1:10,1:3]
identical(rownames(exp_coad),rownames(exp_coad2))
exp_read2 <- cbind(list_read,exp_read)
exp_read2[1:10,1:3]
identical(rownames(exp_read),rownames(exp_read2))
mRNA_coad <- exp_coad2[exp_coad2$gene_type %in% c("protein_coding"),]
# lncRNA_coad <- exp_coad2[exp_coad2$gene_type %in% c("lncRNA"),]
save(mRNA_coad, file = './00_Rawdata/TCGA-COAD_mRNA.Rdata')
save(mRNA_coad, file = './00_Rawdata/TCGA-COAD_TPM.Rdata')mRNA_read <- exp_read2[exp_read2$gene_type %in% c("protein_coding"),]
# lncRNA_read <- exp_read2[exp_read2$gene_type %in% c("lncRNA"),]
save(mRNA_read, file = './00_Rawdata/TCGA-READ_mRNA.Rdata')
save(mRNA_read, file = './00_Rawdata/TCGA-READ_TPM.Rdata')########## clinical ###############
rm(list = ls())
library(readr)
library(tidyverse)
library(dplyr)
library(data.table)
clin <- read_tsv("00_Rawdata/clinical/clinical.tsv")
# clin <- fread("00_Rawdata/clinical/clinical.tsv",data.table = F)
clin_time <- clin %>% dplyr::select(cases.submitter_id,demographic.vital_status,demographic.days_to_death,diagnoses.days_to_last_follow_up,demographic.age_at_index,demographic.gender,diagnoses.ajcc_pathologic_t,diagnoses.ajcc_pathologic_m,diagnoses.ajcc_pathologic_n,diagnoses.ajcc_pathologic_stage) %>%dplyr::filter(!duplicated(cases.submitter_id))
#创建新的一列OS.time,如果vital_status列是Alive,就用days_to_last_follow_up数据填充,如果vital_status列是Dead,就用days_to_death数据填充。
#创建一个OS列,生存用0表示,死亡用1表示。
clin_merge <- clin_time %>% dplyr::mutate(OS.time = case_when(demographic.vital_status == "Alive" ~ diagnoses.days_to_last_follow_up,demographic.vital_status == "Dead" ~ demographic.days_to_death)) %>%dplyr::mutate(OS = case_when(demographic.vital_status == "Alive" ~ 0,demographic.vital_status == "Dead" ~ 1))
#使用rename函数修改列名
clin_merge1 <- clin_merge %>% rename(age = demographic.age_at_index,  # 将age_at_index列名修改为ageT = diagnoses.ajcc_pathologic_t,  # ajcc_pathologic_t列名修改为TM = diagnoses.ajcc_pathologic_m,  N = diagnoses.ajcc_pathologic_n,  stage = diagnoses.ajcc_pathologic_stage  )clin<-clin_merge1[,-(3:4)]
# clin<-na.omit(clin)
clin[clin == "'--"] <- NA
write.csv(clin, file = "00_Rawdata/clinical/clinical.csv", row.names = F, quote = FALSE)

相关文章:

  • 基于springboot的在线教育系统【附源码】
  • Kotlin 协程 (三)
  • 9、AI测试辅助-代码Bug分析提示词优化
  • 安卓settings单双屏显示
  • 用typoa写markdown文档笔记
  • 使用布隆过滤器实现java大数据筛选是否存在
  • 微软宣布的五大重要事项|AI日报0520
  • 微软开放代理网络愿景
  • 镜像管理(2)Dockerfile总结
  • vue3/vue2大屏适配
  • 扫盲笔记之NPM
  • Wan2.1 文生视频 支持批量生成、参数化配置和多语言提示词管理
  • C及C++的音频库与视频库介绍
  • WIFI信号状态信息 CSI 深度学习篇之CNN(Python)
  • 第5天-python饼图绘制
  • Jenkins:自动化之魂,解锁高效开发的密钥
  • 三、【数据建模篇】:用 Django Models 构建测试平台核心数据
  • SQLite基础及优化
  • PL/SQL 安装配置与使用
  • 力扣热题——零数组变换 |
  • 中沙深化多领域合作,达成60余项共识
  • 萨洛宁、康托罗夫、长野健……7月夏季音乐节来很多大牌
  • 全国35城居民对公共服务满意度“打分”,上海多项指标居首
  • 国家发改委:正在会同有关方面,加快构建统一规范、协同共享、科学高效的信用修复制度
  • 招商基金总经理徐勇因任期届满离任,“老将”钟文岳回归接棒
  • 世卫大会拒绝涉台提案,外交部:坚持一个中国原则是人心所向