This dataset contains RNA Seq data of an experiment to determine the transcriptomic changes caused by Radiation and IL12 treatment on murine B16 melanoma tumors.
The transcriptomic profile.
KEY
Non-radiated (no RT)= 0Gy
Irradiated (RT) = 8Gy)
AAV #216 = Adenoassociated Virus/ Inducible Luciferase transgene
AAV #189 = Adenoassociated Virus/ Inducible Interleukin 12 transgene
#set wd
setwd("C:/Users/Lenovo/MIB Assignments/NOTES/Bioinformatica/Bioinformatics Practice R/AFRILAB IL12 RNA Seq")
#Load Libraries
library(Biobase)
library(BiocManager)
#EDA, DEA packages
library(edgeR)
library(limma)
library(DESeq2)
#Data visualisation
library(ggplot2)
library(ggrepel)
suppressPackageStartupMessages(library(ComplexHeatmap))
library(VennDiagram)
#BiocManager::install("EnhancedVolcano")
library(EnhancedVolcano)
#Data manipulation
library(tidyverse)
library(dplyr)
library(reshape2)
library(writexl)
library(readxl)
##****** Mouse gene annotation package
#BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
#BiocManager::install("biomaRt")
library(biomaRt)
#BiocManager::install("AnnotationDbi")
library(AnnotationDbi)
#Pathway Analysis
library(clusterProfiler)
library(enrichplot)
Count Matrix (Data was normalised by Limma-Voom)
Tidy up Metadata and extract groups and samples of interest
Create Graphics for Weight. This only makes sense if you have the starting weight for normalization.
## Load Count Data
count_data <- read_xlsx("DataRaw.xlsx")
count_data <- as.data.frame(count_data)
#class(count_data)
head(count_data)
#View(count_data)
##gene_metadata
gene_metadata <- count_data[, 1:11]
## Load Sample Metadata
sample_metadata <- read_xlsx("Muestras transcriptoma.xlsx")
sample_metadata <- as.data.frame(sample_metadata, header=TRUE)
#View(sample_metadata)
With a hint from the previous table (gene_metadata) and as seen below, I found that the RNA data or the annotated genes contains not only the mRNA but many other types of RNA (non-coding RNA) including ribosomal RNA, siRNAs, TECs and other RNAs that maybe involved in regulation of transcription and epigenetics.
Therefore, I will extract mRNAs only and analyse that as they make up majority of the sequences and I will also analyse the non-coding RNA separately. This is exciting!!
rna_types <- table(gene_metadata$gene_type)
rna_types <- as.data.frame(rna_types)
#View(rna_types)
##* Ordering from Highest frequency to least didnt chnage the order on the plot.
##* The approach below did.
## Frequency of the different RNA types
rna_types %>%
arrange(Freq) %>% # First sort by Freq This sort the dataframe but NOT the factor levels
mutate(Var1=factor(Var1, levels=Var1)) %>% # This trick update the factor levels
ggplot(aes(x=Var1, y=Freq)) + #note that the data object (rna_types) is only refered to at the start
ggtitle("RNA types and their Frequency")+
geom_col(color="red", fill="purple") +
theme_classic() +
theme(axis.text.x = element_text(angle=90), axis)+
coord_flip() +
ylab("Frequency")+
xlab("Gene type")
NA
NA
Here we will retain mRNA protein-coding counts and we shall explore the rest of the non-coding RNAs.
## select counts for only mRNA (protein coding genes)
counts_mrna <- count_data[count_data$gene_type=="protein_coding" , ]
#dim(counts_mrna) ## [1] 21797 28 SUCCESS
Select counts for conditions to explore for example;
a) No Radiation, No AAV
b) No radiation, inducible Luciferase
c) Radiation , inducible Luciferase
d) Radiation, inducibe IL12
The BI6-JAK-Knockout samples (G,H,I) will be ommited for this anlaysis.
## protein coding counts for selected samples.
#colnames(counts_mrna)
counts_mrna <- counts_mrna[ , c("gene_id", "A_S7", "B_S8", "C_S9", "D_S10", "E_S11", "F_S12", "J_S16", "K_S17", "L_S18", "M_S19", "N_S20", "O_S21", "P_S22", "Q_S23")]
rownames(counts_mrna) <- counts_mrna$gene_id
counts_mrna <- counts_mrna[,-1]
## metadata_all groups of interest ---------------------------------------
metadata_all <- sample_metadata[-c(4,8,9,10,11, 12, 14, 20), ]
metadata_all$sample_id <- c("A_S7", "B_S8", "C_S9", "D_S10", "E_S11", "F_S12", "M_S19","J_S16", "K_S17", "L_S18","P_S22", "Q_S23", "N_S20", "O_S21")
metadata_all$group <- c("no_RT_no_AAV", "no_RT_no_AAV", "no_RT_no_AAV","RT_iIL12","RT_iIL12","RT_iIL12", "RT_iLuc","RT_iLuc","RT_iLuc", "RT_iLuc", "no_RT_iIL12","no_RT_iIL12", "no_RT_iIL12", "no_RT_iIL12")
metadata_all$group_description <- c("No Radiotherapy, No AAV mouse 1","No Radiotherapy, No AAV mouse 2","No Radiotherapy, No AAV mouse 3", "Radiotherapy + inducible IL12 mouse 1","Radiotherapy + inducible IL12 mouse 2","Radiotherapy + inducible IL12 mouse 3", "Radiotherapy + inducible Luciferase mouse 1", "Radiotherapy + inducible Luciferase mouse 2", "Radiotherapy + inducible Luciferase mouse 3","Radiotherapy + inducible Luciferase mouse 4", "No Radiotherapy + inducible IL12 mouse 1", "No Radiotherapy + inducible IL12 mouse 2", "No Radiotherapy + inducible IL12 mouse 3", "No Radiotherapy + inducible IL12 mouse 4")
#customise rownames to sample_id
rownames(metadata_all) <- metadata_all$sample_id
# make group object
group <- factor(metadata_all$group)
group <- relevel(factor(group), ref = "no_RT_no_AAV") #makes "no_RT+no_AAV" the reference group instead
levels(group)
[1] "no_RT_no_AAV" "no_RT_iIL12" "RT_iIL12" "RT_iLuc"
Now that we have extracted the samples of interest and their related data, we will explore this data using graphics. The B16 Knockout group was removed and is not a part of this analysis.
Each condition has replicates
A comprehensive comparison and contrast will be presented in Chapter II of this analysis.
## Filter counts-----------
# create filtering criteria
myge_filter <- edgeR::filterByExpr(counts_mrna, group=group, min.count = 10, min.total.count = 15)
# apply filter in criteria
counts_mrna_filt <- counts_mrna[myge_filter, ]
#View(counts_mrna_filt)
dim(counts_mrna_filt) #[1] 12417 14
[1] 12417 14
We will, normalise with edgeR TMM method and then apply limma-Voom instead of DEseq2,
Why?
Because we would like a more accurate DEGs with the lowest False
Discovery Rate (FDR) possible.
## Data Normalization using TMM
# create a DGE list
dge <- edgeR::DGEList(counts = counts_mrna_filt, group= group)
counts_mrna_norm <- calcNormFactors(dge, method = "TMM")
Depth Chart
Tumor Weight (Sacrificed same day)
From the DataRaw file, I was able to extract the gene metadata. Therefore there was no need to map all over again.
We will use limma-voom to extract the DEGs.
# create a design matrix
design_matrix <- model.matrix(~group) #choosing intercept
colnames(design_matrix)
[1] "(Intercept)" "groupno_RT_iIL12" "groupRT_iIL12" "groupRT_iLuc"
## Voom transformation
voom_t <- voom(counts_mrna_norm, design_matrix, plot = TRUE)
## limma (linear model fitting)
#?lmFit
#create fit data
fit <- lmFit(voom_t, design = design_matrix)
# Statistical test (eBayes)
fit_ebayes <- eBayes(fit)
#output matrix with fold change and adj.p-value<=0.05 = gene expression significant
#topTable function is part of the limma library
#We will use it to extract the output or result matrix
results <- topTable(fit_ebayes, coef = ncol(design_matrix), number = Inf, adjust.method = "BH")
#Above, Benjamini-Hochberg (BH) was applied for FDR correction
head(results, n=10)
NA
NA
How many genes are differentially expressed if we define a threshold of: adj.p-value<=0.05 and |log2(FCh)|>0.58 ? These will be saved as sigGenes
Both Statistical and Biological relevance.
sigGenes <- results[results$adj.P.Val<=0.05 & results$logFC>0.58, ] #it is the rows that we are filtering based on the defined criteria but we want all columns to come back.
dim(sigGenes) # [1] 11001 6
[1] 115 6
11001 genes are significantly expressed.
pca <- prcomp(t(voom_t$E), scale. = TRUE)
pca_data_to_plot <- data.frame(PC1 = pca$x[,1], PC2 = pca$x[,2], group = group)
pca_plot <- ggplot(pca_data_to_plot, aes(x = PC1, y = PC2, color = group)) +
geom_point(size=4) +
theme_minimal() +
labs(title = "PCA - RNA-seq ALL Samples", x = "PC1", y = "PC2")+
geom_text_repel(aes(label=rownames(pca_data_to_plot)))
#ggsave("PCA_ALL.png", plot = pca_plot)
pca_plot
Observation: The groups are clearly distinct as seen by the huge variance in the PC1 (main) axis. However, the no RT + inducible IL12 duplicate samples appear to be very distinct.
Biological relevance vs Statistical significance.
## Volcano Plot
ggplot(results, aes(x = logFC, y = -log10(adj.P.Val))) +
# All genes = BLACK (These don't change)
geom_point(color = "black", alpha = 0.3) +
#Overexpressed genes = RED
geom_point(data=results[results$logFC > 0.58 & results$adj.P.Val <= 0.05,],
aes(x = logFC, y = -log10(adj.P.Val)), color="red", alpha = 0.7) +
#Under expressed genes = BLUE
geom_point(data=results[results$logFC < -0.58 & results$adj.P.Val <= 0.05,],
aes(x = logFC, y = -log10(adj.P.Val)), color="blue", alpha = 0.7) +
#ref lines
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") +
geom_vline(xintercept = c(0.58, -0.58), linetype = "dashed", color = "gray") +
theme_minimal() +
labs(title = "Volcano Plot - Differential Expression",
x = "Log2 Fold Change",
y = "-Log10 P-value")
NA
NA
NA
Top 20 DEGS
# Since we already Voom transformed the data, we can z transform it without renormalizing.
Z_score_voom <- t(scale(t(voom_t$E), center = TRUE, scale = TRUE))
#Order by Fc from Big to Small
sigGenes_ordFch <- sigGenes[order(sigGenes$logFC, decreasing = TRUE),]
#View(sigGenes_ordFch)
# -----------------------------------
## Adding the gene metadata to the sigGenes_ordFch
rownames(gene_metadata) <- gene_metadata$gene_id
sigGenes_ordFch_metadata <- gene_metadata[match(rownames(sigGenes_ordFch), gene_metadata$gene_id), c("gene_type", "gene_name")]
sigGenes_ordFch_annotated <- cbind(sigGenes_ordFch, sigGenes_ordFch_metadata) #success
#head(sigGenes_ordFch_annotated)
# -----------------------------------
treatment_colors <- c(
"RT_iLuc" = "#EF476F",
"no_RT_iIL12" = "#FFD166",
"no_RT_no_AAV"= "#06D6A0",
"RT_iIL12"= "#118AB2"
)
# Column annotation
column_ha <- HeatmapAnnotation(
Treatment = group, #treatment group
col = list(Treatment = treatment_colors) #treatment stage coloring
)
# heatmap for top 20 DE genes
Heatmap(Z_score_voom[rownames(sigGenes_ordFch)[1:20],],
row_names_gp = gpar(fontsize = 5),
top_annotation = column_ha,
row_names_side = "left",
row_labels = sigGenes_ordFch_annotated$gene_name[1:20]
)
Top 10 DEGs
head(sigGenes_ordFch_annotated, n=10)
#mygenes <- head(sigGenes_ordFch_annotated, n=10)
#mygenes$gene_name
In the previous chapter, a general analysis of the groups has been
performed (compared to baseline group). In this chapter, we will define
and explore changes between different groups with depth.
Groups of interest will be compared against each other.
For this, we will have to fit the data again, starting from the normalisation, design matrix. Define contrast and visualise the comparisons.
For the purposes of this comparison, we will name the groups as below
Control = no_RT_no_AAV
TreatA = no_RT_iIL12
TreatB = RT_iLuc
TreatC = RT_iIL12
## defining group for new analysis
group2 <- factor(metadata_all$group)
levels(group2)
[1] "no_RT_iIL12" "no_RT_no_AAV" "RT_iIL12" "RT_iLuc"
# normalise data
dge2 <- edgeR::DGEList(counts = counts_mrna_filt, group= group2)
counts_mrna_norm2 <- calcNormFactors(dge2, method = "TMM")
# create a design matrix 2
design_matrix2 <- model.matrix(~0 + group2) #no intercept
colnames(design_matrix2) <- levels(group2) #no intercept defined
# Define contrasts of interest
contrast.matrix <- makeContrasts(
TreatA_vs_Control = no_RT_iIL12 - no_RT_no_AAV, #Uninduced AAV-iIL12
TreatB_vs_Control = RT_iLuc - no_RT_no_AAV, #RT effects
TreatC_vs_TreatB = RT_iIL12 - RT_iLuc, #effects of RT + induced iIL12
levels = colnames(design_matrix2)
)
## Voom transformation
voom_t2 <- voom(counts_mrna_norm2, design_matrix2)
## limma (linear model fitting)
fit2 <- lmFit(voom_t2, design = design_matrix2)
fit2c <- contrasts.fit(fit2, contrast.matrix)
# Statistical test (eBayes)
fit_ebayes2 <- eBayes(fit2c)
##RESULTS contrasts
results_TreatA_vs_Control <- topTable(fit_ebayes2, coef = "TreatA_vs_Control", n = Inf, adjust.method = "BH")
results_TreatB_vs_Control <- topTable(fit_ebayes2, coef = "TreatB_vs_Control", n = Inf, adjust.method = "BH")
results_TreatC_vs_TreatB <- topTable(fit_ebayes2, coef = "TreatC_vs_TreatB", n = Inf, adjust.method = "BH")
Pairwise Comparison of groups forexample No Radiation,No AAv vs No Radiation + IL12 to determine the changes caused by an inunduced AAV-IL12.
Volcano Plot ——- no_RT_iIL12 vs no_RT_no_AAV (Control)
Volcano Plot ——- RT_iLuc vs no_RT_no_AAV (Control)
Volcano Plot ——- RT_iLuc vs RT_iIL12
## Significant UPREGULATED Genes for Contrasts------
sigGenes_AvsCtr_UP <- results_TreatA_vs_Control[results_TreatA_vs_Control$adj.P.Val<=0.05 & results_TreatA_vs_Control$logFC>0.58, ]
sigGenes_BvsCtr_UP <- results_TreatB_vs_Control[results_TreatB_vs_Control$adj.P.Val<=0.05 & results_TreatB_vs_Control$logFC>0.58, ]
sigGenes_CvsB_UP <- results_TreatC_vs_TreatB[results_TreatC_vs_TreatB$adj.P.Val<=0.05 & results_TreatC_vs_TreatB$logFC>0.58, ]
## Significant DOWNREGULATED Genes for Contrasts------
sigGenes_AvsCtr_DW <- results_TreatA_vs_Control[results_TreatA_vs_Control$adj.P.Val<=0.05 & results_TreatA_vs_Control$logFC< -0.58, ]
sigGenes_BvsCtr_DW <- results_TreatB_vs_Control[results_TreatB_vs_Control$adj.P.Val<=0.05 & results_TreatB_vs_Control$logFC< -0.58, ]
sigGenes_CvsB_DW <- results_TreatC_vs_TreatB[results_TreatC_vs_TreatB$adj.P.Val<=0.05 & results_TreatC_vs_TreatB$logFC< -0.58, ]
RT_iLuc vs RT_IL12 Heatmap
Pathway analysis in contrasts.
Now that differentially expressed genes have been identified, we will analyse enriched pathways in our data groups. This will help us make biological meaning of the actual changes happening in our data.
Fore each of these analysis only code for one will be shown and just results for the rest.
Our data has Ensembl Ids and ORA Go input requires entrez Ids
library(biomaRt)
# Connect to the Ensembl mouse database
ensembl <- useEnsembl(biomart = "genes", dataset = "mmusculus_gene_ensembl")
# IDs to convert
ensembl_ids <- rownames(gene_metadata)
# Remove version numbers (the part after the dot)
ensembl_ids_no_version <- sub("\\.[0-9]+$", "", ensembl_ids)
# Get the mapping between Ensembl and Entrez IDs
entrez_id_mapping <- getBM(
attributes = c("ensembl_gene_id", "entrezgene_id", "external_gene_name"),
filters = "ensembl_gene_id",
values = ensembl_ids_no_version,
mart = ensembl
)
# View results
print(entrez_id_mapping)
NA
NA
NA
no_RT_iIL12 vs no_RT_no_AAV (Control)
no.RT_iL12 vs no.RT_no.AAV
# no.RT_iL12 vs no.RT_no.AAV
# Define the ranked genes list based on logFCh.
AvsCtrl_geneList <- results_TreatA_vs_Control$logFC
names(AvsCtrl_geneList) <- sub("\\.[0-9]+$", "", rownames(results_TreatA_vs_Control)) #removes version components (everything after dot) --- substitute("these", "with this", in)
AvsCtrl_geneList_ranked <- AvsCtrl_geneList[order(AvsCtrl_geneList, decreasing = TRUE)]
# Perform the GSEA based on GO terms
gsea_go <- gseGO(geneList = AvsCtrl_geneList_ranked,
OrgDb = org.Mm.eg.db,
keyType = "ENSEMBL",
ont = "BP",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.01,
verbose = FALSE)
View(gsea_go@result)
# Results visualization
layout(matrix(c(1, 2, 3), nrow = 1, ncol = 3, byrow = TRUE))
# Dotplot
dotplot(gsea_go,
x = "GeneRatio",
color = "NES",
size = "p.adjust",
showCategory=10) + ggtitle("GSEA: no.RT_AAV.iL12 vs no.RT_no.AAV")
#Network Plot
cnetplot(gsea_go, node_label="category",
cex.params = list(cex_label_category = 1.2),
categorySize="p.adjust", color.params = list(foldChange = AvsCtrl_geneList_ranked))
# Treeplot
gsea_go2 <- pairwise_termsim(gsea_go)
treeplot(gsea_go2, showCategory = 10)
RT_iLuc vs no.RT_no.AAV
GSEA: RT.AAV_iLuc vs RT_AAV.iIL12
The following are significant differentially expressed genes with a adjusted P value < 0.01 according to the contrasts. However, in the graphics below, the expression of these genes in each independent group will be plotted to see how they changed and in which direction.
For this we will use the contrasts to obtain gene ids and then plot them using their normalized expression data (voom$E) for each group. This normalized expression data is independent expression for each gene in each group, not linked to any comparison.
Top 40 sig expression in: no_RT_iIL12 vs no_RT_no_AAV
library(gridExtra)
###****** GENES plot
###*
# TreatA vs Ctr - most sigificant genes (adj.p.val< 0.01) and highest FC
topSig_AvsCtr <- results_TreatA_vs_Control[(results_TreatA_vs_Control$logFC > 1.5 | results_TreatA_vs_Control$logFC < -1.5 & results_TreatA_vs_Control$adj.P.Val < 0.01), ]
#These are arranged from the most different in each group therefore I will not rearrange them according to fold change or p value.
#By adj.P.Val
topSig_AvsCtr <- topSig_AvsCtr[order(topSig_AvsCtr$adj.P.Val, decreasing = FALSE), ]
#By FoldChange
#topSig_AvsCtr_ordFch <- topSig_AvsCtr[order(topSig_AvsCtr$logFC, decreasing = TRUE), ]
#Plot top 40 of each instance in both groups being compared
ids_to_plot_AvsCtr <- head(topSig_AvsCtr, n=40)
#These are arranged from the most different in each group therefore I will not rearrange them according to fold change or p value.
## Expression data to plot all groups----- could change to pval
expr_data_to_plot_AvsCtr <- as.data.frame(t(voom_t$E[rownames(ids_to_plot_AvsCtr), ]))
colnames(expr_data_to_plot_AvsCtr) <- ids_to_plot_AvsCtr$gene_name
expr_data_to_plot_AvsCtr$group <- metadata_all$group
#Only select for **no_RT_no_AAV** and **no_RT_iIL12***
#I used the rownames instead of group names because the group name only selected 2 samples of each group
expr_data_to_plot_AvsCtr <- expr_data_to_plot_AvsCtr[rownames(expr_data_to_plot_AvsCtr)==c( "A_S7","B_S8","C_S9", "N_S20", "O_S21", "P_S22", "Q_S23"), ]
# Convert from wide to long format
expr_data_to_plot_AvsCtr <- expr_data_to_plot_AvsCtr %>%
pivot_longer(cols = -group, names_to = "gene", values_to = "expression")
#dput(group)
## Plot topSig_AvsCtr_ordFch
ggplot(data = expr_data_to_plot_AvsCtr, mapping=aes(x=gene, y=expression, fill = group)) +
geom_boxplot(color="#e9ecef", alpha=0.6, position = 'identity')+
scale_fill_manual(values=c("no_RT_iIL12" ="#69b3a2", "no_RT_no_AAV" ="#404080")) +
geom_hline(yintercept = 0, linetype = "dashed", color = "blue") +
ggtitle("Top 40 sig expression in: no_RT_iIL12 vs no_RT_no_AAV")+
theme(axis.text.x = element_text(angle = 90,hjust=1))
NA
NA
NA
Top 40 sig expression in: RT_AAV.iLuc & no_RT_no_AAV
RT_iLuc vs RT_iIL12 Top 40 Sig genes
Idi genes
Zuniga, O., Byrum, S., & Wolfe, A. R. (2022). Discovery of the inhibitor of DNA binding 1 as a novel marker for radioresistance in pancreatic cancer using genome-wide RNA-seq. Cancer drug resistance (Alhambra, Calif.), 5(4), 926–938. https://doi.org/10.20517/cdr.2022.60
sessionInfo()