当前位置：首页 > news >正文

植物中lncRNA鉴定和注释流程，代码（包含Identified，Classification，WGCNA.....)

news 2025/9/2 9:25:48

原文链接：植物中lncRNA鉴定和注释流程，代码（包含Identified，Classification，WGCNA…)

本期文章

一、Identification of lncRNAs

List of used programs:

gffread
EMBOSS/v6.6.0 - infoseq function
kentUtils/v302 - faSomeRecords, gtfToGenePred and genePredToBed functions
TransDecoder/v5.3.0 - TransDecoder.LongOrfs and TransDecoder.Predict functions
ncbi-blast+/v2.6.0 - blastp, blastx and blastdbcmd functions
hmmer/v3.1b2 - hmmscan function
SignalP/v4.1 - signalp function
infernal/v1.1.2 - cmsearch function
get_intron_gtf_from_bed.pl - the whole code

List of input data:

At_stringtie_merged_team.gtf : Our annotation in GTF format on the Araport11 genome.
araport_biotypes.tsv : Transcript list with gene model type based on the Araport11 genome release.
Araport11_gene_type-ed.txt : Gene list with gene model type based on the Araport11 genome release. (from https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FGenes%2FAraport11_genome_release)
Araport11_GFF3_genes_transposons.201606.gtf : Araport11 annotation in GTF format with Col-0 Mitochondrial annotations. (from https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FGenes%2FAraport11_genome_release)
GreeNC_Araport11_lncRNAs.bed : File generated from GreeNC IDs for Arabidopsis thaliana in BED format.

1) Transcripts over 200 nt

Convert GTF to FASTA

gffread At_stringtie_merged_team.gtf -g At_allgenomes.fa -w At_stringm_seq.fasta --keep-genes

Select transcripts over 200 nt

infoseq At_stringm_seq.fasta -only -name -type -length | sort -k 3n > At_transcripts_length
awk '$3 >= 200 {print}' At_transcripts_length > At_transcripts_more_than_200
awk '{print $1}' At_transcripts_more_than_200 | tail -n +2 | sort -V > At_ID_list_more_than_200

Obtain the FASTA file

faSomeRecords At_stringm_seq.fasta At_ID_list_more_than_200 At_filter_200nt_st.fasta

2) Eliminate all sequences with homology to proteins

Predicting the ORF and amino acid sequence

TransDecoder.LongOrfs -m 100 -S -t At_filter_200nt_st.fasta
TransDecoder.Predict -t At_filter_200nt_st.fasta

Obtain all translated ORF or nucleotide sequence had homology to proteins in the Uniprot database (evalue <= 1e10-6)

blastx -query At_filter_200nt_st.fasta -db uniprot-sprot -num_threads 8 -max_target_seqs 100 -strand "plus" -outfmt "6 qseqid sseqid qlen slen qstart sstart qend send score evalue bitscore length positive qcovs" -evalue 1e-6 > At_blastx.outfmt6awk 'BEGIN{FS="\t"}{if($10<=(1e-6)){print $AF}}' At_blastx.outfmt6 > At_blastx_evalua10_6 
awk '{print $1}' At_blastx_evalua10_6 | sort -V | uniq > At_blastx_strand_IDsblastp -query At_filter_200nt_st.fasta.transdecoder.pep -db uniprot-sprot -num_threads 8 -max_target_seqs 100 -outfmt "6 qseqid sseqid qlen slen qstart sstart qend send score evalue bitscore length positive qcovs" -evalue 1e-6 > At_blastp.outfmt6 awk 'BEGIN{FS="\t"}{if($10<=(1e-6)){print $AF}}' At_blastp.outfmt6 > At_blastp_evalua10_6# Identify by strand
awk 'BEGIN { OFS = "\t" } { $13 = $5 - $7 } 1' At_blastp_evalua10_6 | awk 'BEGIN { OFS = "\t" } { $14 = $6 - $8 } 1' | cut -f1,2,13,14 > At_blastp.outfmt6_strand 
awk '$3<(-1) && $4<(-1)' At_blastp.outfmt6_strand > At_blastp.outfmt6_strand_negative
awk '{print $1}' At_blastp.outfmt6_strand_negative | sed 's/.$//' | sed 's/.$//' | sed 's/.$//' | sort -V | uniq > At_blastp_IDsdiff At_blastp_IDs At_blastx_strand_IDs --left-column | grep \> | cut -f2 -d' ' > At_blastxyp_IDs

Obtain all sequences related to protein domains

hmmscan -o At_pfam.log_new --domtblout At_TrinotatePFAM.out  --incE 0.0001 --incdomE 0.0001 --cpu 8 Pfam-A.hmm At_filter_200nt_st.fasta.transdecoder.pep
grep "(+)," At_pfam.log_new > At_pfam.log_strand_plus
awk '{print $7}' At_pfam.log_strand_plus | sed 's/:.*//' > At_hmmer_IDs_strand_plusgrep -Ff At_hmmer_IDs_strand_plus At_TrinotatePFAM.out | awk '{print $1,$3,$4,$6,$7}' > At_hmmer_strand
awk 'BEGIN{FS="\t"}{if($5<=(1e-6)){print $AF}}' At_hmmer_strand > At_hmmer_strand_evalua10_6
awk '{print $3}' At_hmmer_strand_evalua10_6 | sed 's/.$//' | sed 's/.$//' | sed 's/.$//' | sort -V| uniq > list_pfam_200nt

Obtain all sequences related to signal peptide

signalp -f short -n signalp.out At_filter_200nt_st.fasta.transdecoder.pep
awk '$6 >= 0.45 {print}' signalp.out  | awk '{print $1}' | sed '1d' | sed 's/.$//'| sed 's/.$//' | sed 's/.$//' | sed 's/##sequence-n//'| sort -V | uniq > At_signalp_IDs

Remove all sequences related to proteins

cat At_blastxyp_IDs list_pfam_200nt At_signalp_IDs > IDs_filtros
sort -V IDs_filtros | uniq > IDs_uniq_filtrosdiff IDs_uniq_filtros At_ID_list_more_than_200 --suppress-common-lines| grep \> | cut -f2 -d' ' > At_filter5_IDs

Obtain FASTA file

faSomeRecords At_filter_200nt_st.fasta At_filter5_IDs At_filter5_200nt.fasta

3) Remove sequence with an ORF greater than 100 nt

getorf -minsize 300 -find 0 At_filter5_200nt.fasta At_ORFs_300_stop-stop.fasta
infoseq At_ORFs_300_stop-stop.fasta -only -name -type -length | sort -k 3n > At_ORF_300_nt_length
grep -v '[[:space:]]100[[:space:]]' At_ORF_300_nt_length > At_ORF_morethan_300_nt_length
awk '{print $1}' At_ORF_morethan_300_nt_length | tail -n +2 | sort -V > sort_At_ORF_300_nt_length
sed 's/.$//' sort_At_ORF_300_nt_length |sed 's/.$//' | sort -V | uniq > At_ORF100aa_IDs_ed
diff At_ORF100aa_IDs_ed At_filter5_IDs --suppress-common-lines| grep \> | cut -f2 -d' ' > ORF100_At_filter6_IDs

Obtain FASTA file

faSomeRecords At_filter5_200nt.fasta ORF100_At_filter6_IDs ORF100_At_filter6.fasta

4) Eliminate sequences with homology to non-redundant protein (nr)

blastx -query ORF100_At_filter6.fasta -db /data/secuencias/NR/nr -num_threads 8 -max_target_seqs 100 -strand "plus" -outfmt "6 qseqid sseqid qlen slen qstart sstart qend send score evalue bitscore length positive qcovs" -evalue 1e-6 -out At_blastx_filtro7_nr_ORF100aa.outmt6awk 'BEGIN{FS="\t"}{if($10<=(1e-6)){print $AF}}' At_blastx_filtro7_nr_ORF100aa.outmt6 > At_blastxNR_evalua10_6
awk '{print $1}' At_blastxNR_evalua10_6 | sort -V | uniq > At_blastxNR_100aa_IDs

5) Hypothetical protein search

for i in $(cat At_blastxNR_100aa_IDs); do grep $i At_blastx_filtro7_nr_ORF100aa.outmt6 | sort -nrk9,9 | head -1 | cut -d ' ' -f3 | awk '{print $1,$2,$9}' >> At_BlastxNR_IDS_maxvalue; doneawk '{print $2}' At_BlastxNR_IDS_maxvalue | sed 's/ref|//' |  sed 's/gb|//' |  sed 's/dbj|//' |  sed 's/emb|//' | sed 's/prf|//' | sed 's/pir|//' | sed 's/sp|//' | sed 's/|//g' | sort -V | uniq > At_blastx_input
blastdbcmd -db /data/secuencias/NR/nr -entry_batch At_blastx_input  -target_only -out At_BlastxNR_NCBI_IDs
grep ">" At_BlastxNR_NCBI_IDs > At_BlastxNR_NCBI_info_only

Get all categories related to hypothetical proteins according to NCBI

grep "hypothetical protein" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_hypoteticalp_info
grep "unnamed protein product" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_unnamedp_info
grep "unknown" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_unknown_info
grep "putative protein" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_putativep_info
grep "predicted protein" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_predictedp_info
grep "similar to" At_BlastxNR_NCBI_info_only > At_BlastxNR_NCBI_similartop_info

Also we try with “uncharacterized protein” , “novel” and “unnamed gene”, but we did not find anything.

# Obtain IDs
cat At_BlastxNR_NCBI_hypoteticalp_info At_BlastxNR_NCBI_unnamedp_info At_BlastxNR_NCBI_unknown_info At_BlastxNR_NCBI_putativep_info At_BlastxNR_NCBI_predictedp_info At_BlastxNR_NCBI_similartop_info> At_hypothetical_protein_info_blastxNR
awk '{print $1}' At_hypothetical_protein_info_blastxNR | sed 's/>//' | sort -V | uniq > At_hypothetical_protein_info_blastxNR_IDs# Select by maxvalue
grep -Ff At_hypothetical_protein_info_blastxNR_IDs At_BlastxNR_IDS_maxvalue > At_hypothetical_protein_info_maxvalue
awk '{print $1}' At_hypothetical_protein_info_maxvalue > At_hypothetical_protein_BlastxNR_transcript_IDs

Retain hypothetical proteins related to lncRNAs

grep -vf At_hypothetical_protein_BlastxNR_transcript_IDs At_blastxNR_100aa_IDs | grep -v "AT3G09922.1" > At_blastxNR_100aa_IDs_ed
diff At_blastxNR_100aa_IDs_ed ../ORF100_At_filter6_IDs --suppress-common-lines| grep \> | cut -f2 -d' ' > ORF100_At_filter7_IDs

Note: At_hypothetical_protein_BlastxNR_transcript_IDs contain 972 IDs related to hypothetical proteins.

Obtain FASTA file

faSomeRecords ORF100_At_filter6.fasta ORF100_At_filter7_IDs ORF100_At_filter7.fasta

6) Delete tRNA, rRNA and mitochondria sequences

Remove tRNA and rRNA sequences

# Prepare database
wget ftp://ftp.ebi.ac.uk/pub/databases/Rfam/14.5/Rfam.cm.gz
mv Rfam.cm.gz Rfam_14.5.cm.gz
gzip -d Rfam_14.5.cm.gzcmfetch Rfam_14.5.cm tRNA > tRNA_Rfam.cm
cmfetch Rfam_14.5.cm SSU_rRNA_eukarya >SSU_rRNA_eukarya_Rfam.cm
cmfetch Rfam_14.5.cm LSU_rRNA_eukarya > LSU_rRNA_eukarya_Rfam.cm
cmfetch Rfam_14.5.cm 5S_rRNA > 5S_rRNA_Rfam.cm
cmfetch Rfam_14.5.cm 5_8S_rRNA > 5_8S_rRNA_Rfam.cmcat *_Rfam.cm > housekeeping_rfam_14.5.cm# Detection using Infernal
cmsearch --cpu 32 --cut_ga --tblout Inferna_housekeeping_rfam.tsv housekeeping_rfam_14.5.cm  ORF100_At_filter7.fasta
awk '{print $1}' At_Inferna_housekeeping_rfam.tsv | sed 1,2d | head -n-10 > At_tRNA_ORF100aa_IDs# Detection with BLAST
grep -w "rRNA" araport_biotypes.tsv | awk '{print $1}' > Araport_rRNA_IDs
grep -w "tRNA" araport_biotypes.tsv| awk '{print $1}' > Araport_tRNA_IDsfaSomeRecords At_stringm_seq.fasta Araport_rRNA_IDs At_rRNA.fasta
faSomeRecords At_stringm_seq.fasta Araport_tRNA_IDs At_tRNA.fastamakeblastdb -in At_rRNA.fasta -dbtype nucl
blastn -query ORF100_At_filter7.fasta -db At_rRNA.fasta -num_threads 8 -max_target_seqs 1 -outfmt "6 qseqid sseqid qlen slen qstart sstart qend send pident score evalue length positive qcovs" -out At_team_rRNA_ORF100.outfmt6blastn -query ORF100_At_filter7.fasta -db At_tRNA.fasta -num_threads 8 -max_target_seqs 1 -outfmt "6 qseqid sseqid qlen slen qstart sstart qend send pident score evalue length positive qcovs" -out At_team_tRNA_ORF100.outfmt6# Obtain IDs
awk '{print $1}' At_team_rRNA_ORF100.outfmt6 > At_rRNA_ORF100aa_IDs_blastn
awk '{print $1}' At_team_tRNA_ORF100.outfmt6 > At_tRNA_ORF100aa_IDs_blastn
cat At_rRNA_ORF100aa_IDs_blastn At_tRNA_ORF100aa_IDs_blastn | sort -V | uniq > At_tRNA_rRNA_ORF100aa_IDs_blastngrep -vf At_tRNA_ORF100aa_IDs BlastXNR_At_ORF100a_new/ORF100_At_filter7_IDs | grep -vf At_tRNA_rRNA_ORF100aa_IDs_blastn > ORF100_At_filter8_IDs

Remove sequences from mitochondria

grep -v "ATM" ORF100_At_filter8_IDs > ORF100_At_filter8_clean_IDs

Obtain FASTA file

faSomeRecords ORF100_At_filter7.fasta ORF100_At_filter8_clean_IDs ORF100_At_filter8.fasta

Obtain BED and GTF files

# Convert to BED file
for file in At_stringtie_merged_team.gtf; do clean=$(echo $file | sed 's/\.gtf//'); gtfToGenePred $file ${clean}.genePred; done
for file in At_stringtie_merged_team.genePred;  do clean=$(echo $file | sed 's/\.genePred//'); genePredToBed $file ${clean}.bed; done# Obtain GTF file
grep -Ff ORF100_At_filter8_clean_IDs ../../At_stringtie_merged_team.gtf | sort -k 1,1 -k2,2n > At_team_lncRNAs.gtf

7) Delete sequences with introns over 6000 nt

# Obtain exons
grep '[[:space:]]exon[[:space:]]' At_team_lncRNAs.gtf > At_exonlncRNAs.gtf# Convert to BED file
for file in At_exonlncRNAs.gtf; do clean=$(echo $file | sed 's/\.gtf//'); ./gtfToGenePred $file ${clean}.genePred; done
for file in At_exonlncRNAs.genePred;  do clean=$(echo $file | sed 's/\.genePred//'); ./genePredToBed $file ${clean}.bed; done
sort -k 1,1 -k2,2n At_exonlncRNAs.bed > At_exon_lncRNAs_sorted.bed # Obtain introns
perl ./get_intron_gtf_from_bed.pl At_exon_lncRNAs_sorted.bed At_lncRNAs_intrones.gtf
awk '{print $1,$4,$5,$10,$6,$7}' At_lncRNAs_intrones.gtf | sed 's/;//' | sed 's/transcript://' | perl -pi -e 's/[""]//g' > At_introns_lncRNAs.bed# Obtain length of introns
awk 'BEGIN { OFS = "\t" } { $7 = $3 - $2 } 1' At_introns_lncRNAs.bed | cut -f4,7 > At_introns_lncRNAs_length.txt
sort -k2 -n At_introns_lncRNAs_length.txt > At_introns_lncRNAs_length_sorted.txt# Obtain IDs and delete from the file
awk 'BEGIN {OFS=FS="\t"} $2 > 6000' At_introns_lncRNAs_length_sorted.txt | awk '{print $1}' | grep "MSTRG" | sort -V | uniq  > At_team_introns_lncRNAs_more_than_6000_IDs
grep -vf At_team_introns_lncRNAs_more_than_6000_IDs At_team_lncRNAs.gtf > At_team_lncRNAs_clean.gtf# Obtain BED file
for file in At_team_lncRNAs_clean.gtf; do clean=$(echo $file | sed 's/\.gtf//'); ./gtfToGenePred $file ${clean}.genePred; done
for file in At_team_lncRNAs_clean.genePred;  do clean=$(echo $file | sed 's/\.genePred//'); ./genePredToBed $file ${clean}.bed; done# Obtain IDs
awk '{print $4}' At_team_lncRNAs_clean.bed | sort -V | uniq > At_team_lncRNAs_IDs

8) Manual screening to eliminate annotated sequences and transmembrane proteins

Analyze the isoforms of the genes

clusterBed -s -i ../At_team_lncRNAs_clean.bed > cluster_At_team_ORF100_lncRNAs.bed
awk 'BEGIN{FS="\t"; OFS=FS} {print $1,$2,$3,$4,$13}' cluster_At_team_ORF100_lncRNAs.bed > cluster_At_team_lncRNAs_transcript_shorted.bed# ---- First revision and corrections to the file (In R)------
library("dplyr")
library("tidyverse")clusters <- read.table("cluster_At_team_lncRNAs_transcript_filtered.bed", colClasses =c("factor", "numeric", "numeric", "factor", "factor"), col.names = c("chrom", "start", "end", "gene", "clust")) 
lncRNAs_new_genes <- read.table("At_lncRNas_faltantes_IDs", header =F, colClasses ="factor")
numb <- nrow(lncRNAs_new_genes)
numb_plus <- sum(numb,5508) -1 # ultimo numero anotado 5507
lncRNAs_new_genes_df<- data.frame(lncRNAs_new_genes, 5508:numb_plus)
colnames(lncRNAs_new_genes_df) <- c("gene", "clust")
lncRNAs_new_genes_df$clust <- as.factor(lncRNAs_new_genes_df$clust)clusters_genes <-select(clusters, "gene", "clust")
clusters_df = rbind(clusters_genes, lncRNAs_new_genes_df) 
clust_group <- factor(clusters_df$clust)# check isoforms
lncRNAs_by_isoforms <- table(clust_group)[table(clust_group) >1]
lncRNAs_by_genes <- table(clust_group)[table(clust_group) ==1]
lncRNAs_by_isoforms_clusters <- c(x=names(lncRNAs_by_isoforms))# Obtain gene isoforms
lncRNAs_by_isoforms_out<-NULL
lncRNAs_by_isoforms_outdb<-NULL# cluster by groups
for (i in 1:length(lncRNAs_by_isoforms_clusters)) {lncRNAs_by_isoforms_out <- clusters_df %>% group_by(clust) %>% filter(clust == lncRNAs_by_isoforms_clusters[i])lncRNAs_by_isoforms_outdb <- rbind(lncRNAs_by_isoforms_outdb,lncRNAs_by_isoforms_out)
}lncRNAs_by_isoforms_db <- as.data.frame(lncRNAs_by_isoforms_outdb)# change the name in rows
rownames(lncRNAs_by_isoforms_db) <- lncRNAs_by_isoforms_db$gene
head(lncRNAs_by_isoforms_db)write.table(lncRNAs_by_isoforms_db, "./At_team_cluster_isoforms.txt", sep="\t", row.names = FALSE)
#-----# Note: Saves the contents of the lncRNAs_by_isoforms_db file to another file without characters, At_team_cluster_isoforms_IDs.
perl -pi -e 's/[""]//g' At_team_cluster_isoforms_IDs
head -n-1 At_team_cluster_isoforms_IDs > At_team_cluster_isoforms_ed_IDs
awk '{print $1}' At_team_cluster_isoforms_ed_IDs | sort -V | uniq > At_team_cluster_isoforms_uniq_IDs
grep -wf At_team_cluster_isoforms_uniq_IDs cluster_At_team_lncRNAs_transcript_filtered.bed > At_team_cluster_isoforms.bed# Note: Transcriptome coordinates were analyzed to confirm and discard the isoforms of each gene.# ----Second revision and corrections to the file (In R)------
library("dplyr")
library("tidyverse")# Load file
clusters_clean_data <- read.table("./At_team_cluster_isoforms_ed.bed", colClasses =c("factor", "numeric", "numeric", "factor", "factor"), col.names = c("chrom", "start", "end", "gene", "clust")) 
clusters_clean_data_select <-select(clusters_clean_data, "gene", "clust")
clust_group_clean <- factor(clusters_clean_data_select$clust)lncRNAs_by_isoforms_clean <- table(clust_group_clean)[table(clust_group_clean) >1]
lncRNAs_by_genes_clean <- table(clust_group_clean)[table(clust_group_clean) ==1]lncRNAs_by_isoforms_clusters_clean <- c(x=names(lncRNAs_by_isoforms_clean))# Obtain gene isoforms
lncRNAs_by_isoforms_out_clean<-NULL
lncRNAs_by_isoforms_outdb_clean<-NULLfor (i in 1:length(lncRNAs_by_isoforms_clusters_clean)) {lncRNAs_by_isoforms_out_clean <- clusters_clean_data_select %>% group_by(clust) %>% filter(clust == lncRNAs_by_isoforms_clusters_clean[i])lncRNAs_by_isoforms_outdb_clean <- rbind(lncRNAs_by_isoforms_outdb_clean,lncRNAs_by_isoforms_out_clean)
}lncRNAs_by_isoforms_db_clean <- as.data.frame(lncRNAs_by_isoforms_outdb_clean)
rownames(lncRNAs_by_isoforms_db_clean) <- lncRNAs_by_isoforms_db_clean$genewrite.table(lncRNAs_by_isoforms_db_clean, "./At_lncRNAs_by_isoforms_clusters_clean.txt", row.names = F, col.names = F)
# --------------# Note: Saves the contents of the At_lncRNAs_by_isoforms_clusters_clean.txt file to another file without characters, At_team_cluster_isoforms_class_ed.

Manual screening From tx2gene

awk '{print $1}' tx2gene_lncRNAs_team.txt | grep -v "transcriptID" | grep -v "MSTRG"| sort -V | uniq > At_team_lncRNas_transcriptIDs
grep -f At_team_lncRNas_transcriptIDs araport_biotypes.tsv > At_team_lncRNas_transcriptIDs_Araport_results
grep "mRNA" At_team_lncRNas_transcriptIDs_Araport_results > At_team_lncRNas_transcriptIDs_Araport_results_mRNA# Detection of mRNA in the data
awk '{print $1}' At_team_lncRNas_transcriptIDs_Araport_results_mRNA > At_team_lncRNas_transcriptIDs_Araport_results_mRNA_IDs# mRNA related to hypothetical proteins
grep -f At_team_lncRNas_transcriptIDs_Araport_results_mRNA_IDs At_hipotetical_protein_BlastxNR_transcript_IDs > At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_IDs# hypothetical in our data
grep -vf At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_IDs At_team_lncRNas_transcriptIDs_Araport_results_mRNA_IDs > At_team_lncRNas_transcriptIDs_Araport_results_mRNA_ANALIZAR_IDsgrep -f At_team_lncRNas_transcriptIDs_Araport_results_mRNA_ANALIZAR_IDs At_team_lncRNAs_clean.bed > At_team_lncRNAs_ANALIZAR.bed# lncRNAs previously reported in GreeNCawk '{print $4}' GreeNC_Araport11_lncRNAs.bed | sort -V | uniq > GreeNC_lncRNAs_IDs
sed 's/.$//' At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_IDs | sed 's/.$//' | sort -V | uniq > At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_genes_IDs
grep -Ff GreeNC_lncRNAs_IDs At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_genes_IDs > GreeNC_lncRNAs_HP_IDs
grep -Ff GreeNC_lncRNAs_HP_IDs At_team_lncRNAs_clean.bed > At_team_lncRNAs_ANALIZAR_HP.bed# Others IDs
grep -vf GreeNC_lncRNAs_HP_IDs At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_IDs > At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_others_IDs
grep -Ff At_team_lncRNas_transcriptIDs_Araport_results_mRNA_HP_others_IDs At_team_lncRNAs_clean.bed > At_team_lncRNAs_ANALIZAR_HP_others.bed

With this manual review we obtained the file At_team_transcriptIDs_deleted_sorted, this file contains 97 IDs.

grep -vf At_team_transcriptIDs_deleted_sorted At_team_lncRNAs_clean.bed > At_team_lncRNAs_clean_v2.bed

二、Classification of lncRNAs by categories

List of used programs:

kentUtils/v302 - gtfToGenePred and genePredToBed functions
get_intron_gtf_from_bed.pl - the whole code
BEDtools - intersectBed function

List of input data:

Araport11_gene_type-ed.txt : Gene list with gene model type based on the Araport11 genome release. (from https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FGenes%2FAraport11_genome_release)
Araport11_GFF3_genes_transposons.201606.gtf : Araport11 annotation in GTF format with Col-0 Mitochondrial annotations. (from https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FGenes%2FAraport11_genome_release)
At_team_lncRNAs_clean_v2.bed : File generated using the Strict Method (SM) for the identification of lncRNAs in Arabidopsis thaliana in BED format.

1) Obtain the positions of exons and introns from protein-coding genes

# Obtain protein exons 
grep "protein_coding" Araport11_gene_type-ed.txt > Araport11_mRNAs_gene_IDs
grep "AT[1-5]" Araport11_mRNAs_gene_IDs | sort -V | uniq > Araport_mRNAs_gene_IDs_nucleo
awk '{print $1}' Araport_mRNAs_gene_IDs_nucleo > Araport_mRNAs_gene_nucleo_IDs
awk '{print $4}' sort-Coding_Araport11_josian | sed 's/.$//' | sed 's/.$//' | sort -V | uniq | wc -lcat  Araport11_GFF3_genes_transposons.201606.gtf | grep '[[:space:]]exon[[:space:]]' | awk '{print $1,$4,$5,$10,$6,$7}'| sed 's/;//' | perl -pi -e 's/[""]//g' | sort -k 1,1 -k2,2n > At_exon_pcoding_sorted.bedgrep -Ff Araport_mRNAs_gene_nucleo_IDs At_exon_pcoding_sorted.bed > At_coding_transcript_exon_IDs_Araport11.bed# Obtain protein introns
grep "exon" Araport11_GFF3_genes_transposons.201606.gtf > Araport11_exones.gtf
for file in Araport11_exones.gtf; do clean=$(echo $file | sed 's/\.gtf//'); gtfToGenePred $file ${clean}.genePred; done
for file in Araport11_exones.genePred;  do clean=$(echo $file | sed 's/\.genePred//'); genePredToBed $file ${clean}.bed12; done
perl get_intron_gtf_from_bed.pl Araport11_exones.bed12 Araport11_intrones.gtf
cat  Araport11_intrones.gtf | grep '[[:space:]]exon[[:space:]]' | awk '{print $1,$4,$5,$10,$6,$7}'| sed 's/;//' | perl -pi -e 's/[""]//g' > At_intron_pcoding.bed # Delete hypothetical proteins
grep -vf At_hipotetical_protein_BlastxNR_transcript_IDs At_coding_transcript_exon_IDs_Araport11.bed > At_coding_transcript_exon_IDs_Araport11_sinhp.bed
grep -vf At_hipotetical_protein_BlastxNR_transcript_IDs At_coding_transcript_intron_IDs_Araport11.bed > At_coding_transcript_intron_IDs_Araport11_sinhp.bed# Change spaces to tabs 
perl -p -i -e 's/ /\t/g' At_coding_transcript_exon_IDs_Araport11_sinhp.bed
perl -p -i -e 's/ /\t/g' At_coding_transcript_intron_IDs_Araport11_sinhp.bed

2) Classification of lncRNAs

# > sense-exonic
intersectBed -wo -f 0.1 -s -a At_team_lncRNAs_clean_v2.bed -b At_coding_transcript_exon_IDs_Araport11_sinhp.bed > At_team_lncRNAs_sense.bed
awk '{print $1,$2,$3,$4,$6,$13,$14,$15,$16,$18,$19}' At_team_lncRNAs_sense.bed > At_team_lncRNAs_sense_summary.bed
awk '{print $4}' At_team_lncRNAs_sense_summary.bed | sort -V | uniq > At_team_lncRNAs_sentido_IDs# > NAT
intersectBed -wo -f 0.1 -S -a At_team_lncRNAs_clean_v2.bed -b At_coding_transcript_exon_IDs_Araport11_sinhp.bed > At_team_lncRNAs_NAT.bed
awk '{print $1,$2,$3,$4,$6,$13,$14,$15,$16,$18,$19}' At_team_lncRNAs_NAT.bed > At_team_lncRNAs_NAT_summary.bed
awk '{print $4}' At_team_lncRNAs_NAT_summary.bed | sort -V | uniq > At_team_lncRNAs_antisentido_IDs# > intronic
intersectBed -wo -f 1 -a At_team_lncRNAs_clean_v2.bed  -b At_coding_transcript_intron_IDs_Araport11_sinhp.bed > At_team_lncRNAs_intronic.bed
awk '{print $1,$2,$3,$4,$6,$13,$14,$15,$16,$18,$19}' At_team_lncRNAs_intronic.bed > At_team_lncRNAs_intronic_summary.bed
awk '{print $4}' At_team_lncRNAs_intronic_summary.bed | sort -V | uniq > At_team_lncRNAs_intronicos_IDs# > intronic nonsense in the strand
grep -w "[.]" At_team_lncRNAs_intronicos_summary.bed > At_team_lncRNAs_intronicos_SIN_sentido_summary.bed
awk '{print $4}' At_team_lncRNAs_intronicos_SIN_sentido_summary.bed | sort -V | uniq > At_team_lncRNAs_intronicos_SIN_sentido_IDs# > intergenic
intersectBed -wo -v -a At_team_lncRNAs_clean_v2.bed -b At_coding_transcript_exon_IDs_Araport11_sinhp.bed At_coding_transcript_intron_IDs_Araport11_sinhp.bed > At_team_lncRNAs_intergenicos.bed
awk '{print $1,$2,$3,$4,$6}' At_team_lncRNAs_intergenicos.bed > At_team_lncRNAs_intergenicos_summary.bed
awk '{print $4}' At_team_lncRNAs_intergenicos_summary.bed | sort -V | uniq > At_team_lncRNAs_intergenicos_IDs

Analyze the categorized lncRNAs

#--- Load data in R------
library(VennDiagram)# > intergenic
At_team_lncRNAs_intergenicos_IDs          <- read.table("At_team_lncRNAs_intergenicos_IDs", header= F)# > intronic
At_team_lncRNAs_intronicos_IDs             <- read.table("At_team_lncRNAs_intronicos_IDs", header= F)
At_team_lncRNAs_intronicos_SIN_sentido_IDs <- read.table("At_team_lncRNAs_intronicos_SIN_sentido_IDs", header= F)# > sense-exonic
At_team_lncRNAs_sentido_IDs                <- read.table("At_team_lncRNAs_sentido_IDs", header= F)# > NAT
At_team_lncRNAs_antisentido_IDs            <- read.table("At_team_lncRNAs_antisentido_IDs", header= F)ORF100_At_filter8_clean_IDs                <- read.table("./cluster_bed_genes/At_team_lncRNAs_clean_v2_IDs", header= F)At_team_lncRNAs_clasificados  <- c(At_team_lncRNAs_antisentido_IDs$V1,At_team_lncRNAs_sentido_IDs$V1,At_team_lncRNAs_intronicos_IDs$V1, At_team_lncRNAs_intergenicos_IDs$V1)venn.diagram(x = list(At_team_lncRNAs_clasificados, ORF100_At_filter8_clean_IDs$V1),"At_team_lncRNA_clasificados_noclasificados.png" , fill = c("blue", 'red'),category.names = c("Clasificados", "TODOS"))# Obtain the IDs that are not classified
At_team_noanotados_overlap <- calculate.overlap(x= list("TODOS" = ORF100_At_filter8_clean_IDs$V1, "Clasificados" = At_team_lncRNAs_clasificados))
At_team_NOanotados         <- subset(ORF100_At_filter8_clean_IDs,!(At_team_noanotados_overlap$a1 %in% At_team_noanotados_overlap$a3)) 
At_team_compartidos        <- At_team_noanotados_overlap$a3write.table(At_team_NOanotados$V1, "./At_team_NOanotados_noclasificados.txt", row.names = F, col.names = F)
#------# Create a new file without characters
# Note :  At_team_NOanotados_IDs contain the same information of At_team_NOanotados_noclasificados.txt.
perl -pi -e 's/[""]//g' At_team_NOanotados_IDs 
head -n-1 At_team_NOanotados_IDs > At_team_NO_anotados_IDs
grep -Ff At_team_NO_anotados_IDs At_team_lncRNAs_clean_v2.bed > At_team_lncRNAs_NO_anotados.bedintersectBed -wo -S -a At_team_lncRNAs_NO_anotados.bed -b At_coding_transcript_intron_IDs_Araport11_sinhp.bed > At_team_lncRNA_NO_anotados_intronico.bed
intersectBed -wo -S -a At_team_lncRNAs_NO_anotados.bed -b At_coding_transcript_exon_IDs_Araport11_sinhp.bed  > At_team_lncRNA_NO_anotados_exones.bedawk '{print $4}' At_team_lncRNA_NO_anotados_exones.bed | sort -V | uniq > At_team_lncRNAs_NO_anotados_ex_IDs
awk '{print $4}' At_team_lncRNA_NO_anotados_intronico.bed | sort -V | uniq > At_team_lncRNAs_NO_anotados_int_IDscat At_team_lncRNAs_NO_anotados_ex_IDs At_team_lncRNAs_NO_anotados_int_IDs | sort -V| uniq > At_team_lncRNAs_NO_anotados_ex_inIDS
diff At_team_lncRNAs_NO_anotados_ex_inIDS At_team_NO_anotados_IDs --suppress-common-lines | grep \> | cut -f2 -d' ' > At_team_lncRNAs_44_NOanno_IDs# > NAT
grep -v "AT5G00460.1" At_team_lncRNAs_NO_anotados_ex_inIDS > At_team_lncRNAs_NO_anotados_ex_inIDS_ed # Correct annotation of intergenic to NAT
cat At_team_lncRNAs_antisentido_IDs At_team_lncRNAs_NO_anotados_ex_inIDS_ed | sort -V | uniq > At_team_NAT_IDs# > intronic
cat At_team_lncRNAs_intronicos_antisentido_IDs At_team_lncRNAs_intronicos_sentido_IDs | sort -V | uniq > At_team_lncRNAs_intronic_IDs# > sense-exonic
cat At_team_lncRNAs_sentido_IDs At_team_lncRNAs_44_NOanno_IDs | sort -V | uniq > At_team_sense_exonic_IDs#--- Analyze the distribution of classifications (in R)----
At_team_sense_exonic_IDs <- read.table("At_team_sense_exonic_IDs", header= F)
At_team_NAT_IDs <- read.table("At_team_NAT_IDs", header= F)
At_team_lncRNAs_intergenicos_IDs <- read.table("At_team_lncRNAs_intergenicos_IDs", header= F)
At_team_lncRNAs_intronic_IDs  <- read.table("At_team_lncRNAs_intronic_IDs", header= F)# Graph
venn.diagram(x = list(At_team_lncRNAs_intronic_IDs$V1, At_team_sense_exonic_IDs$V1, At_team_NAT_IDs$V1, At_team_lncRNAs_intergenicos_IDs$V1),"At_team_Clasificacion_comparacionIDS_Araport11.png" , fill = c("blue", 'red', 'green', 'orange'),category.names = c("Intronic", "Sense-exonic", "NAT", "Int"))At_team_clasificados_overlap <- calculate.overlap(x= list("Intronic" = At_team_lncRNAs_intronic_IDs$V1, "sense_exonic" = At_team_sense_exonic_IDs$V1, "NAT" = At_team_NAT_IDs$V1, "Intergenic" = At_team_lncRNAs_intergenicos_IDs$V1))At_team_13compartidos_NAT_Intronic <- At_team_clasificados_overlap$a4
At_team_4compartidos_NAT_senseexonic <- At_team_clasificados_overlap$a13
write.table(At_team_13compartidos_NAT_Intronic, "./Comparaciones_database_a13.txt", row.names = F, col.names = F)
#------# Create a new file without characters
# Note :  At_team_20_repetidos_IDs contain the same information of Comparaciones_database_a13.txt.
perl -pi -e 's/[""]//g' At_team_20_repetidos_IDs
head -n-1 At_team_20_repetidos_IDs > At_team_20_repetidos_IDs_ed # Remove shared IDs between NAT and intronic from the intronic category
grep -vf At_team_20_repetidos_IDs_ed At_team_lncRNAs_intronic_IDs > At_team_lncRNAs_intronic_depurado_IDs
grep -v "AT3G09922.1" At_team_sense_exonic_IDs > At_team_sense_exonic_IDs_ed
cat gen_AT5G00460 gen_AT3G09922 At_team_lncRNAs_intergenicos_IDs > At_team_lncRNAs_intergenicos_IDs_edAt_team_lncRNAs_intronic_depurado_IDs  <- read.table("At_team_lncRNAs_intronic_depurado_IDs", header= F)#--- Analyze the distribution of classifications (in R)----
At_team_lncRNAs_intronic_depurado_IDs  <- read.table("At_team_lncRNAs_intronic_depurado_IDs", header= F)
venn.diagram(x = list(At_team_lncRNAs_intronic_depurado_IDs$V1, At_team_sense_exonic_IDs$V1, At_team_NAT_IDs$V1, At_team_lncRNAs_intergenicos_IDs$V1),"At_team_Clasificacion_comparacionIDS_Araport11_depurados.png" , fill = c("blue", 'red', 'green', 'orange'),category.names = c("Intronic", "Sense-exonic", "NAT", "Int"))
#------

Classify by genes

grep -wf At_team_lncRNAs_clean_v2_IDs At_team_lncRNas_class_table.txt > At_team_lncRNAs_class_table_complete.txt# > intronic
grep -Ff At_team_lncRNAs_intronic_depurado_IDs ./cluster_bed_genes/At_team_lncRNAs_class_table_complete.txt > ./cluster_bed_genes/At_team_intronic_stranded_lncRNAs_class_table.txt
grep -Ff At_team_lncRNAs_intronicos_SIN_sentido_IDs ./cluster_bed_genes/At_team_lncRNAs_class_table_complete.txt > ./cluster_bed_genes/At_team_intronic_wstranded_lncRNAs_class_table.txt
cat ./cluster_bed_genes/At_team_intronic_stranded_lncRNAs_class_table.txt ./cluster_bed_genes/At_team_intronic_wstranded_lncRNAs_class_table.txt > At_team_intronic_lncRNAs_table.txt
awk '{print $2}' At_team_intronic_lncRNAs_table.txt | sort -V | uniq > At_team_lncRNAs_intronic_depurado_GENES_IDs# > sense-exonic
grep -Ff At_team_sense_exonic_IDs_ed ./cluster_bed_genes/At_team_lncRNAs_class_table_complete.txt > At_team_sense_exonic_lncRNAs_table.txt
awk '{print $2}' At_team_sense_exonic_lncRNAs_table.txt | sort -V | uniq > At_team_sense_exonic_GENES_IDs# > NAT
grep -Ff At_team_NAT_IDs ./cluster_bed_genes/At_team_lncRNAs_class_table_complete.txt > At_team_NAT_lncRNAs_table.txt
awk '{print $2}' At_team_NAT_lncRNAs_table.txt | sort -V | uniq > At_team_NAT_GENES_IDs# > intergenic
grep -Ff At_team_lncRNAs_intergenicos_IDs_ed ./cluster_bed_genes/At_team_lncRNAs_class_table_complete.txt > At_team_intergenic_lncRNAs_table.txt
awk '{print $2}' At_team_intergenic_lncRNAs_table.txt | sort -V | uniq > At_team_lncRNAs_intergenicos_GENES_IDs

Obtain BED file

# > intronic
grep -Ff At_team_lncRNAs_intronic_depurado_IDs At_team_lncRNAs.bed > At_team_intronic_lncRNAs.bed
grep -Ff At_team_lncRNAs_intronicos_SIN_sentido_IDs At_team_lncRNAs.bed > At_team_wsense_intronic_lncRNAs.bed
awk '$(NF+1) = "intronic_lncRNA"' At_team_intronic_lncRNAs.bed > At_team_intronic_lncRNAs_ed.bed
awk '$(NF+1) = "intronic_lncRNA"' At_team_wsense_intronic_lncRNAs.bed > At_team_wsense_intronic_lncRNAs_ed2.bed# > sense-exonic
grep -Ff At_team_sense_exonic_IDs_ed At_team_lncRNAs.bed > At_team_sense_exonic_lncRNAs.bed
awk '$(NF+1) = "sense_exonic_lncRNA"' At_team_sense_exonic_lncRNAs.bed > At_team_sense_exonic_lncRNAs_ed.bed# > NAT
grep -Ff At_team_NAT_IDs At_team_lncRNAs.bed > At_team_NAT_lncRNAs.bed
awk '$(NF+1) = "NAT_lncRNA"' At_team_NAT_lncRNAs.bed > At_team_NAT_lncRNAs_ed.bed# > intergenic
grep -Ff At_team_lncRNAs_intergenicos_IDs_ed At_team_lncRNAs_clean_v2.bed > At_team_intergenic_lncRNAs.bed
awk '$(NF+1) = "intergenic_lncRNA"' At_team_intergenic_lncRNAs.bed > At_team_intergenic_lncRNAs_ed.bedcat At_team_NAT_lncRNAs_ed.bed At_team_intronic_lncRNAs_ed.bed At_team_wsense_intronic_lncRNAs_ed2.bed At_team_sense_exonic_lncRNAs_ed.bed At_team_intergenic_lncRNAs_ed.bed | sort -k1,1 -k2,2n > At_team_lncRNA_clasification.bed

三、Improve the filter to determine which lncRNAs are unique to the tissue

Use the TPM table
1 Generate the average TPM per category and collapse everything into a table
2 to calculate SPM you have to add all the TPM values and divide it by the TPM value of the specific tegido
3 You have to make an accumulation graph and look for the point where 95% of the genes are closest to 0
4 Test for lncRNAs independently of other genes

knitr::opts_chunk$set(echo = TRUE)library(tximport)library(rhdf5)
library(readr)

##Read transcriptome and gene tables

####kallisto results needed, Table with transcriptomes and tissue data, abundance values

dir <- "./Quant_result" # add directory where files are located when necessary#dir <- "./Quant_result_stranding" # add directory where files are located when necessarylist.files(dir)tx2gene <- read.csv(file = "tx2gene.csv")samples <- read.csv(file = "Transcriptomas_A_thaliana_wgcna_usados.csv")
samples
files <- file.path(dir, samples$Run, "abundance.h5")
names(files) <- samples$Run
all(file.exists(files))txi <- tximport(files, type = "kallisto", tx2gene = tx2gene)

Generate charts of accounts


#head(txi$counts)
names(txi)
#head(txi$countsFromAbundance)
#txi
Cuentas <- txi$counts
Abundance <- txi$abundance
Length <- txi$alength

Create TPM Table

rpk <- txi$counts / txi$length
scale_factor <- sum(rpk) / 1e6
tpm <- rpk / scale_factorwrite.csv(Cuentas, file = "KallistoExpr.csv", row.names = TRUE)

Gene filter with a number greater than 2 TPM this was taken from Julca et al. 2020

#tpm largue matrix
tpm <- as.data.frame(tpm)tpm<-tpm[rowSums(tpm[,-1])>0, ]tpm_tissue<-tpmc=0
for(l1 in tissue_list){#print (l1)c=c+1dft<- filter(table_at_wgcna, tissue == l1)list_trans<-dft$Runlistraw<-paste(list_trans, collapse = ',')list_trans<-strsplit(listraw, ",")list_trans<-unique((unlist(list_trans)))#print (c(list_trans))if (length(list_trans)>1){dft<-tpm[,list_trans]tpm_tissue$Mean <- apply(dft,1,mean)#print (c(list_trans))} else {tpm_tissue$Mean <-tpm[,list_trans]#print (c(list_trans))}names(tpm_tissue)[names(tpm_tissue) == "Mean"] <- l1
}
tpm_tissue<-tpm_tissue[,225:248]tpm_tissue$TOTAL <-rowSums( tpm_tissue)

Calculate SPM values by tissue

library(scales)
spm_tissue <-tpm_tissue
spm_tissue <-tpm_tissue/tpm_tissue$TOTAL
spm_tissue$TOTAL <- NULL
#head (spm_tissue)
## make a density plot
spm_tissue_s <-stack(spm_tissue)#spm_tissue_s
spm_tissue_s<-na.omit(spm_tissue_s)
#spm_tissue_s<-spm_tissue_s%>% filter(values > 0)
x<-nrow(spm_tissue_s)-nrow(spm_tissue_s)/20 #number of tissuesspm_tissue_s<-spm_tissue_s%>% arrange(values)x<-spm_tissue_s[x,1]
print(x)
#tail(spm_tissue_s)
#x
#ggplot(spm_tissue_s, aes(x=values)) + geom_density()
den <- ggplot(spm_tissue_s, aes(x=values, after_stat(count))) + geom_density(color= "blue", fill= "lightblue",alpha=0.6)+geom_vline(xintercept = x,colour = "red")+scale_y_log10() +#scale_y_continuous(name = "Average Gbases",limits = c(-0, 22.5),breaks = seq(from = 0.0, to = 30, by = 10))+#scale_fill_brewer(palette="Set2")+theme(axis.line = element_line(size=1, colour = "black"),legend.position="bottom",panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.border = element_blank(),panel.background = element_blank(),plot.title=element_text(size = 12),text=element_text(size = 12),axis.text.x=element_text(colour="black", size = 12, angle = 0),axis.text.y=element_text(colour="black", size = 12))
#den +  annotation_logticks()
den <- ggplotly(den)
den

四、WGCNA and annotation

### WGCNA#### Install packages 
#BiocManager::install("dynamicTreeCut")
#BiocManager::install("fastcluster")
#BiocManager::install("WGCNA")
#BiocManager::install("foreach")
#BiocManager::install("iterators")
#BiocManager::install("doParallel")
#BiocManager::install("Hmisc")
#BiocManager::install("Formula")
#BiocManager::install("acepack")
#BiocManager::install("base64enc")
#BiocManager::install("latticeExtra")
#BiocManager::install("jpeg")
#BiocManager::install("htmlTable")
#BiocManager::install("checkmate")
#BiocManager::install("htmlwidgets")
#BiocManager::install("htmltools")
#BiocManager::install("knitr")
#BiocManager::install("impute")
#BiocManager::install("preprocessCore")
#BiocManager::install("janitor")
#BiocManager::install("lubridate")#### Directory
setwd("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene")
#load ("D:/R-studio/script_limpio_WGCNA/")
### Cargar paquetes necesarios 
library (GenomicRanges)
library (DESeq2)
library (WGCNA)
library (janitor)
library (dplyr)
library (tidyr)#### Updating table 224 transcriptomes ### Load count tablecountData <- read.csv(file = 'KallistoExpr_gene.csv', header = TRUE)
head (countData) [1:10,1:10]
dim (countData)### Removing NAs 
names(countData)[1] <- "Geneid"
countData <- countData[!(is.na(countData$Geneid) | countData$Geneid==""), ]
### Rounding
countData[,-1] <-round (countData[,-1],0)
countData [1:10,1:5]### Columns in rownames 
countData2 <- countData[,-1]
rownames(countData2) <- countData[,1]
countData <- countData2
rm (countData2)
### Mostrar primeras columnas y filas 
countData [1:5,1:5]
dim (countData)
tail (countData) [1:5,1:5]### Convert SRR in data frame to comparing with trancriptomes table
colnams_df <- data.frame (Run = colnames(countData))
dim (colnams_df)table_at_wgcna <- read.csv (file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/Transcriptomas_A_thaliana_wgcna_usados.csv")
nrow(table_at_wgcna)
#head (table_at_wgcna)
#table_at_wgcna [grepl("*_seedling", table_at_wgcna$code),]### Generating file coldata for Deseq2 
### Add factors to experiments   
coldata <- merge(colnams_df, table_at_wgcna ,by="Run")
colnames(coldata)
nrow (coldata)col_order <- coldata$Run
### Order columns 
countData <- countData[,col_order]
## Add to each column the name of 
### Agregar a cada columna el nombre del corresponding treatment
colnames(countData) <- coldata$code### Columns in factors
coldata  <- mutate_at(coldata , vars(time_hrs, tissue, treatment), as.factor)### Object Deseq2 for vst transformationdds_wgcna <- DESeqDataSetFromMatrix(countData = countData,colData = coldata,design = ~ tissue)
dds_wgcna### Remove genes with few counts
ds_wgcna <- dds_wgcna[rowSums(counts(dds_wgcna)) > 10, ]### transforming (Variance Stabilizing transfomrmation)
vsd_wgcna <- varianceStabilizingTransformation(dds_wgcna, blind=FALSE)
#class (vsd_wgcna)
### Manipulate object 
vsd_wgcna <- assay(vsd_wgcna)### transpose matrix 
datExpr <- t(vsd_wgcna)    
head (datExpr[,1:10])
dim(datExpr)
rm(dds_wgcna)
rm(vsd_wgcna)### Save matrix as csv 
write.csv(datExpr, file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/datExpr.csv",row.names = TRUE)
### Load matrix 
datExpr <- read.csv("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/datExpr.csv", row.names = 1)
head (datExpr[,1:10])
dim (datExpr)###Next we cluster the samples (in contrast to clustering genes that will come later) 
#to see if there are any obvious outliers.
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)sampleTrees <- hclust(dist(datExpr), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(17,13)
pdf(file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/sampleTrees.pdf",width = 20 , height = 10);
par(cex = 1);
par(mar = c(0,4,2,0))
plot(sampleTrees, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,cex.axis = 2, cex.main = 3)
dev.off();# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTrees, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)# Plot a line to show the cut
abline(h = 120, col = "red");
# Determine cluster under the line
clust <- cutreeStatic(sampleTrees, cutHeight = 120, minSize = 10)
table(clust)######################################## WGCNA_automatic ###########################
# Display the current working directory
getwd();
# If necessary, change the path below to the directory where the data files are stored. 
# "." means current directory. On Windows use a forward slash / instead of the usual \.
workingDir = ".";
setwd(workingDir); 
# Load the WGCNA package
library(WGCNA)
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
# Allow multi-threading within WGCNA. This helps speed up certain calculations.
# At present this call is necessary for the code to work.
# Any error here may be ignored but you may want to update WGCNA if you see one.
# Caution: skip this line if you run RStudio or other third-party R environments. 
# See note above.
enableWGCNAThreads()
# Load the data saved in the first part
lnames = load(file = "coexpression_ara.RData");
#The variable lnames contains the names of loaded variables.
lnames# Choose a set of soft-thresholding powers
powers <- c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 4,  networkType = "signed", corFnc = bicor)
sft
# Plot the results:sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
pdf(file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/scale_and_mean_connectivity.pdf",width = 12 , height = 10)
par(mfrow = c(1,2));
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()############################ Order samples by tissue and stage #################################
Trans_tm_tj <- read.csv("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/Transcriptomas_ordenados_tiempo_tejido.csv", row.names = 1)#### Removing sample pollen
Trans_tm_tj[grep("^pollen", Trans_tm_tj$tissue),]
Trans_tm_tj <- Trans_tm_tj[!(Trans_tm_tj$tissue=="pollen"),]datExpr[1:9,1:9]
rownames(datExpr)
#### Rownames as column
datExpr_order <- data.frame(names = rownames(datExpr), datExpr)
datExpr_order[1:9,1:9]
datExpr_order <- datExpr_order[ order(match(datExpr_order$names, Trans_tm_tj$code)), ]
datExpr_order$names <- NULL#### without sample pollen
write.csv(datExpr_order, file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/datExpr_stage.csv",row.names = TRUE)#### Network construction 
getwd()
datExpr_stage <- read.csv("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/datExpr_stage.csv", row.names = 1)
datExpr <- datExpr_stagenet_unmer <- blockwiseModules(datExpr, power = 12,TOMType = "signed", minModuleSize = 50,networkType = "signed",maxBlockSize = 8000,mergeCutHeight = FALSE, deepSplit = 2,corType = "bicor",numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "TOM_wgcna_unmerged", verbose = 4)# open a graphics window
sizeGrWindow(12, 9)
# Convert labels to colors for plottingunmergedColors <- labels2colors(net_unmer$colors)
unmergedColorsunique(mergedColors)
# Plot the dendrogram and the module colors underneath
pdf(file = "C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/clusterDendogram.pdf",width = 20 , height = 10);
plotDendroAndColors(net_unmer$dendrograms[[1]], unmergedColors[net_unmer$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
dev.off()############################################# Gene enrinchment topGO
####### load packages for annotation 
library(igraph)
library(DBI)
library(sqldf)
library(digest)
library(AnnotationDbi)
library(GO.db)
library(topGO)
library(ALL)
library (ath1121501.db)
library(Rcpp)########################## Load gene universe from datExpr matrix 
universe <- colnames(datExpr)
unique (moduleLabels_unmer)
unique (net_unmer$colors)
unique(bwModuleColors_unmer)
moduleLabels_unmer <- net_unmer$colors
#bwLabels_unmermode(universe)
str(universe)
mode (moduleLabels_unmer)
dim (moduleLabels_unmer)
str(moduleLabels_unmer)moduleLabels_unmer[moduleLabels_unmer = 1]############# top node 100 GOresults<-data.frame()
for(module in unique(moduleLabels_unmer))
{genes<-universe[net_unmer$colors==module]#genes<-universe[net_unmer$colors=="1"]geneList <- factor(as.integer(universe %in% genes))names(geneList) <- universe#pdf(file=paste("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/topgo/cME_",module,".pdf", sep=""))# topGO analysisif(exists("enrich")) {remove(enrich)}for(on in c("MF","BP","CC")){print(on)# Make topGO objectGOdata_wgcna <- new ("topGOdata", ontology = on, allGenes = geneList , nodeSize = 10, annot = annFUN.org, mapping = "org.At.tair.db")#GOdata <- new("topGOdata", ontology = on, allGenes = geneList, nodeSize = 5, annot = annFUN.gene2GO, gene2GO = geneID2GO)# fisher testresult <- runTest(GOdata_wgcna, algorithm = "classic", statistic = "fisher")results.table <- GenTable(GOdata_wgcna, result, topNodes = 100)colnames(results.table) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "result1")results.table$result1 <- gsub('<','', results.table$result1)#results.table <- data.frame(sapply(results.table, pattern = "<", replacement = ""))results.table$result1 <- as.numeric(results.table$result1)# reduce results to GO terms passing Benjamini-Hochberg multiple hypothesis corrected pval <= 0.05, FDR <= 5%results.table$qval.bh<-p.adjust(results.table[,"result1"],method="BH")# label ontology typeresults.table$ontology<-on# reduce results to GO terms passing Benjamini-Hochberg multiple hypothesis corrected pval <= 0.05, consider FDR <= 5% in futurekeep <- results.table[as.numeric(results.table[,"qval.bh"])<0.01,]if(exists("enrich")) enrich<- rbind(enrich, keep)if(!exists("enrich")) enrich<- keep# draw figure for GO terms pval<=0.05 before FDR correction#if(is.na(sigNo<-length(keep$ontology))){next}#showSigOfNodes(GOdata_wgcna, score(result), firstSigNodes = sigNo, useInfo = "all")#mtext(on, line=-1)}#dev.off()if(dim(enrich)[1]>0){enrichME<-enrichenrichME$MEs_mer=moduleGOresults<-rbind(GOresults,enrichME)   }
}
write.table(GOresults,file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/topgo/consensus_GOresults_unmer_tnode100.txt", sep="\t", row.names=FALSE)#################### lncRNAs in each module #### New annotation 7270 lncRNAs
lnc_bed_at <- read.table("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/At_team_lncRNA_clasification_revmanual.bed",header=FALSE, sep = "")colnames(lnc_bed_at) <- c("chrom", "chrSt", "chrE", "names", "score", "strand", "tickS","thickE", "itemRgb", "Exons", "blockSizes", "blockStarts","class")### Numbers modules
colors_dat <- as.data.frame(unmergedColors)
numbers_dat <- as.data.frame(moduleLabels_unmer)
all_modules <- cbind (numbers_dat, colors_dat)colnames(all_modules) <- c("module_number", "module_color")#write.table (write.table(all_modules,#                        file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/genes_each_module.txt",#                       sep="\t", row.names=TRUE))
all_modules <- read.table(file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/genes_each_module.txt", header = T)#### Classify the genes in each module
all_modules$names <- rownames(all_modules)
rownames(all_modules) <- NULL
head (all_modules)
uni_modules <-  unique(all_modules)
### Order modules
uni_modules <- uni_modules[order(uni_modules$module_number),]
head (uni_modules)module_gene_freq <- as.data.frame(table(all_modules$module_number))
module_gene_freq_color <- as.data.frame(table(all_modules$module_color))module_gene_freq_color <- module_gene_freq_color[order(module_gene_freq_color$Freq, decreasing = T),]
### Gene frequency per module
table_freq_modules <- cbind (module_gene_freq, module_gene_freq_color)[,c(1,3,4)]
colnames(table_freq_modules) <- c("mod_number", "mod_color", "freq_genes")
head (table_freq_modules)#### Merge the names of modules with names of lncRNAs classifiedlnc_bed_at_class <- lnc_bed_at[,c(4,13)]
head (lnc_bed_at_class)
library (stringr)
#### Remove decimals in IDs of genes
lnc_bed_at_class$names <- str_sub(lnc_bed_at_class$names, end=-3)
lnc_modules <- merge(all_modules, lnc_bed_at_class,by= "names",all = FALSE)### Delete repeated genes (isoforms)
lnc_modules <- unique (lnc_modules)### Annotation updated
write.table(lnc_modules , file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/lnc_modules_gene.txt",sep="\t", col.names = TRUE, row.names=FALSE)lnc_modules <- read.table("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/lnc_modules_gene.txt", header = T)#### Frequency of lncRNAsmodule_lnc_freq <- as.data.frame(table(lnc_modules$moduleLabels_unmer))
line_43 <-data.frame(as.factor(43), as.integer(0))
names(line_43)<-c("Var1","Freq")module_lnc_freq_2 <- rbind(module_lnc_freq, line_43)
module_lnc_freq_2 <- module_lnc_freq_2[c(1:43,46,44,45),]table_freqlnc_modules <- cbind(table_freq_modules, module_lnc_freq_2)[,c(1:3,5)]
colnames(table_freqlnc_modules) <- c("mod_number", "mod_color", "freq_genes", "freq_lnc")write.table (write.table(table_freqlnc_modules,file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/table_freqlnc_modules.txt",sep="\t", row.names=TRUE))### Calculate eigengenes for modules named with numbersMEs_unmer_number <- moduleEigengenes(datExpr,moduleLabels_unmer)$eigengenes;#write.table(MEs_unmer_number, file="C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/MEs_unmer_number.txt",#           sep="\t", col.names = TRUE, row.names= TRUE)MEs_unmer_number <- read.table("C:/Users/Hells/Desktop/wgcna_anolnc/kal_gene/MEs_unmer_number.txt", header = T)