【TCGA-CRC】TCGA数据读取
写在前面
参考已有的帖子写的,但是临床数据和UCSC的不同。有知道的小伙伴欢迎指正。
rm(list = ls()); gc()
test1 = data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00ae9ab8-6eaa-4085-af72-26f96df97fa3/90c9f8cd-4c8c-4f07-af2f-e17db69bd561.rna_seq.augmented_star_gene_counts.tsv")
test2 = data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00cdff29-697a-4a17-ba67-cf55c006b827/0f35c851-1cb8-4f75-a661-eae9111b7362.rna_seq.augmented_star_gene_counts.tsv")
identical(test1$gene_id,test2$gene_id)
identical(test1$gene_name,test2$gene_name)
sum(duplicated(test1$gene_id))
sum(duplicated(test1$gene_name))
x1 <-data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00cdff29-697a-4a17-ba67-cf55c006b827/0f35c851-1cb8-4f75-a661-eae9111b7362.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_id"))
head(x1)
exp_COAD <- dir("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/",pattern = "*.rna_seq.augmented_star_gene_counts.tsv",#*表示任意前缀,$表示固定后缀recursive = T)
exp_READ <- dir("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/",pattern = "*.rna_seq.augmented_star_gene_counts.tsv",#*表示任意前缀,$表示固定后缀recursive = T)
#批量读入所有文件的unstranded列(counts)/tpm_unstranded列(TPM)
exp_dt_coad <- function(x){result <- data.table::fread(file.path("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/",x),select = c("tpm_unstranded"))return(result)
}
exp_dt_read <- function(x){result <- data.table::fread(file.path("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/",x),select = c("tpm_unstranded"))return(result)
}
exp_coad <- lapply(exp_COAD,exp_dt_coad)
exp_read <- lapply(exp_READ,exp_dt_read)exp_coad <- do.call(cbind,exp_coad)#将exp按列合并,并将list转化为data.table
exp_coad[1:6,1:6]
exp_read <- do.call(cbind,exp_read)#将exp按列合并,并将list转化为data.table
exp_read[1:6,1:6]#添加行名(gene_id),所有表达矩阵的gene_id是相同的:
exp_coad <- as.data.frame(exp_coad)#data.table格式不能自定行名,先转换为数据框
rownames(exp_coad) <- x1$gene_id
exp_coad[1:8,1:4]meta_dt <- jsonlite::fromJSON("./00_Rawdata/metadata.cart.2025-05-19.json")
id <- meta_dt$associated_entities
ids <- sapply(id,function(x){x[,1]}) #读取所有样本id,返回一个包含所有id名的向量(和前面lapply的区别,lapply返回的是list)
head(ids)
ids2 <- data.frame(file_name = meta_dt$file_name,id = ids) #生成一个包含有样本id和文件名这两列对应关系的数据框,文件名即为表达矩阵的文件名
head(ids2)
head(gsub("e_counts.tsv","",ids2$file_name))
# ids2$file_name<-gsub("e_counts.tsv","",ids2$file_name)
library(stringr)
exp_COAD2 <- str_split(exp_COAD,"/",simplify = T)#以/做拆分,simplify = T把结果返回成矩阵
head(exp_COAD2)
exp_READ2 <- str_split(exp_READ,"/",simplify = T)#以/做拆分,simplify = T把结果返回成矩阵
head(exp_READ2)
exp_COAD2 <- exp_COAD2[,2]
exp_READ2 <- exp_READ2[,2]
#用百分百in检测是否拆分后的文件名都完全包含在file_name中(此时顺序不同,因此不能用==)
table(exp_COAD2 %in% ids2$file_name)
table(exp_READ2 %in% ids2$file_name)#全部返回TRUE,则拆分无误
# 查看样本数是否一致
length(exp_COAD2)
length(exp_READ2)
length(ids2$file_name)
length(intersect(exp_COAD2,ids2$file_name))
length(intersect(exp_READ2,ids2$file_name))#用match函数调整ids2$file_name的顺序和exp_COAD2一致(因为我们合并的表达矩阵是按照exp_COAD2的顺序进行合并的)
ids3_COAD <- ids2[match(exp_COAD2,ids2$file_name),]
ids3_READ <- ids2[match(exp_READ2,ids2$file_name),]#检查顺序是否一致
head(ids3_COAD$file_name)
head(exp_COAD2)
head(exp_COAD)
identical(ids3_COAD$file_name,exp_COAD2)#返回TRUE,确认无误
#检查顺序是否一致
head(ids3_READ$file_name)
head(exp_READ2)
head(exp_READ)
identical(ids3_READ$file_name,exp_READ2)#返回TRUE,确认无误#修改列名为样本名。
colnames(exp_coad) <- ids3_COAD$id
exp_coad[1:8,1:2]
colnames(exp_read) <- ids3_READ$id
exp_read[1:8,1:2]
##转换为数据框,不然后面合并会丢失exp的行名。
list_coad <- as.data.frame(data.table::fread("./00_Rawdata/GDCdata/TCGA-COAD/Transcriptome_Profiling/Gene_Expression_Quantification/00ae9ab8-6eaa-4085-af72-26f96df97fa3/90c9f8cd-4c8c-4f07-af2f-e17db69bd561.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_name","gene_type")))
list_read <- as.data.frame(data.table::fread("./00_Rawdata/GDCdata/TCGA-READ/Transcriptome_Profiling/Gene_Expression_Quantification/00f55a16-0ee5-4939-8efb-de34e68d4ccd/ff11a9e3-d32b-431b-9ebb-c5a3d9eb0e4f.rna_seq.augmented_star_gene_counts.tsv",select = c("gene_name","gene_type")))
exp_coad2 <- cbind(list_coad,exp_coad)
exp_coad2[1:10,1:3]
identical(rownames(exp_coad),rownames(exp_coad2))
exp_read2 <- cbind(list_read,exp_read)
exp_read2[1:10,1:3]
identical(rownames(exp_read),rownames(exp_read2))
mRNA_coad <- exp_coad2[exp_coad2$gene_type %in% c("protein_coding"),]
# lncRNA_coad <- exp_coad2[exp_coad2$gene_type %in% c("lncRNA"),]
save(mRNA_coad, file = './00_Rawdata/TCGA-COAD_mRNA.Rdata')
save(mRNA_coad, file = './00_Rawdata/TCGA-COAD_TPM.Rdata')mRNA_read <- exp_read2[exp_read2$gene_type %in% c("protein_coding"),]
# lncRNA_read <- exp_read2[exp_read2$gene_type %in% c("lncRNA"),]
save(mRNA_read, file = './00_Rawdata/TCGA-READ_mRNA.Rdata')
save(mRNA_read, file = './00_Rawdata/TCGA-READ_TPM.Rdata')########## clinical ###############
rm(list = ls())
library(readr)
library(tidyverse)
library(dplyr)
library(data.table)
clin <- read_tsv("00_Rawdata/clinical/clinical.tsv")
# clin <- fread("00_Rawdata/clinical/clinical.tsv",data.table = F)
clin_time <- clin %>% dplyr::select(cases.submitter_id,demographic.vital_status,demographic.days_to_death,diagnoses.days_to_last_follow_up,demographic.age_at_index,demographic.gender,diagnoses.ajcc_pathologic_t,diagnoses.ajcc_pathologic_m,diagnoses.ajcc_pathologic_n,diagnoses.ajcc_pathologic_stage) %>%dplyr::filter(!duplicated(cases.submitter_id))
#创建新的一列OS.time,如果vital_status列是Alive,就用days_to_last_follow_up数据填充,如果vital_status列是Dead,就用days_to_death数据填充。
#创建一个OS列,生存用0表示,死亡用1表示。
clin_merge <- clin_time %>% dplyr::mutate(OS.time = case_when(demographic.vital_status == "Alive" ~ diagnoses.days_to_last_follow_up,demographic.vital_status == "Dead" ~ demographic.days_to_death)) %>%dplyr::mutate(OS = case_when(demographic.vital_status == "Alive" ~ 0,demographic.vital_status == "Dead" ~ 1))
#使用rename函数修改列名
clin_merge1 <- clin_merge %>% rename(age = demographic.age_at_index, # 将age_at_index列名修改为ageT = diagnoses.ajcc_pathologic_t, # ajcc_pathologic_t列名修改为TM = diagnoses.ajcc_pathologic_m, N = diagnoses.ajcc_pathologic_n, stage = diagnoses.ajcc_pathologic_stage )clin<-clin_merge1[,-(3:4)]
# clin<-na.omit(clin)
clin[clin == "'--"] <- NA
write.csv(clin, file = "00_Rawdata/clinical/clinical.csv", row.names = F, quote = FALSE)