# Install the latest version of DEseq2
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("DESeq2")

# define working dir paths
datadir = "/cloud/project/data/bulk_rna"
outdir = "/cloud/project/outdir"

# load R libraries we will use in this section
library(DESeq2)
library(data.table)

# set working directory to data dir
setwd(datadir)

# read in the RNAseq read counts for each gene (produced by htseq-count)
htseqCounts <- fread("gene_read_counts_table_all_final.tsv")
# view class of the data
class(htseqCounts)

# convert the data.table to matrix format
htseqCounts <- as.matrix(htseqCounts)
class(htseqCounts)

# set the gene ID values to be the row names for the matrix
rownames(htseqCounts) <- htseqCounts[,"GeneID"]

# now that the gene IDs are the row names, remove the redundant column that contains them
htseqCounts <- htseqCounts[, colnames(htseqCounts) != "GeneID"]

# convert the actual count values from strings (with spaces) to integers, because originally the gene column contained characters, the entire matrix was set to character
class(htseqCounts) <- "integer"

# view the first few lines of the gene count matrix
head(htseqCounts)

# it can also be useful to view interactively (if in Rstudio)
View(htseqCounts)
# run a filtering step
# i.e. require that for every gene: at least 1 of 6 samples must have counts greater than 10
# get index of rows that meet this criterion and use that to subset the matrix
# note the dimensions of the matrix before and after filtering with dim

dim(htseqCounts)
htseqCounts <- htseqCounts[which(rowSums(htseqCounts >= 10) >=1),]
dim(htseqCounts)

# Hint! if you find the above command confusing, break it into pieces and observe the result
# 
# what does "rowSums(htseqCounts >= 10)" do?
#
# what does "rowSums(htseqCounts >= 10) >=1" do?
# construct a mapping of the meta data for our experiment (comparing UHR cell lines to HBR brain tissues)
# in simple terms this is defining the biological condition/label for each experimental replicate
# create a simple one column dataframe to start
metaData <- data.frame('Condition'=c('UHR', 'UHR', 'UHR', 'HBR', 'HBR', 'HBR'))

# convert the "Condition" column to a factor data type, this will determine the direction of log2 fold-changes for the genes (i.e. up or down regulated)
metaData$Condition <- factor(metaData$Condition, levels=c('HBR', 'UHR'))

# set the row names of the metaData dataframe to be the names of our sample replicates from the read counts matrix
rownames(metaData) <- colnames(htseqCounts)

# view the metadata dataframe
head(metaData)

# check that names of htseq count columns match the names of the meta data rows
# use the "all" function which tests whether an entire logical vector is TRUE
all(rownames(metaData) == colnames(htseqCounts))
# make deseq2 data sets
# here we are setting up our experiment by supplying: (1) the gene counts matrix, (2) the sample/replicate for each column, and (3) the biological conditions we wish to compare.
# this is a simple example that works for many experiments but these can also get more complex
# for example, including designs with multiple variables such as "~ group + condition",
# and designs with interactions such as "~ genotype + treatment + genotype:treatment".

dds <- DESeqDataSetFromMatrix(countData = htseqCounts, colData = metaData, design = ~Condition)
# run the DESeq2 analysis on the "dds" object
dds <- DESeq(dds)

# view the DE results
res <- results(dds)
View(res)
# shrink the log2 fold change estimates
#   shrinkage of effect size (log fold change estimates) is useful for visualization and ranking of genes

#   In simplistic terms, the goal of calculating "dispersion estimates" and "shrinkage" is also to account for the problem that
#   genes with low mean counts across replicates tend of have higher variability than those with higher mean counts.
#   Shrinkage attempts to correct for this. For a more detailed discussion of shrinkage refer to the DESeq2 vignette

# first get the name of the coefficient (log fold change) to shrink
resultsNames(dds)

# now apply the shrinkage approach
resLFC <- lfcShrink(dds, coef="Condition_UHR_vs_HBR", type="apeglm")

# make a copy of the shrinkage results to manipulate
deGeneResult <- resLFC

#contrast the values for a few genes before and after shrinkage
head(res)
head(deGeneResult)
# read in gene ID to name mappings (using "fread" an alternative to "read.table")
mapping <- fread("ENSG_ID2Name.txt", header=F)

# add names to the columns in the "mapping" dataframe
setnames(mapping, c('ensemblID', 'Symbol'))

# view the first few lines of the gene ID to name mappings
head(mapping)

# merge on gene names
deGeneResult$ensemblID <- rownames(deGeneResult)
deGeneResult <- as.data.table(deGeneResult)
deGeneResult <- merge(deGeneResult, mapping, by='ensemblID', all.x=T)

# merge the original raw count values onto this final dataframe to aid interpretation
original_counts = as.data.frame(htseqCounts)
original_counts[,"ensemblID"] = rownames(htseqCounts)
deGeneResult = merge(deGeneResult, original_counts, by='ensemblID', all.x=T)

# view the final merged dataframe
# based on the raw counts and fold change values what does a negative fold change mean with respect to our two conditions HBR and UHR?
head(deGeneResult)
# view the top genes according to adjusted p-value
deGeneResult[order(deGeneResult$padj),]

# view the top genes according to fold change
deGeneResult[order(deGeneResult$log2FoldChange),]

# determine the number of up/down significant genes at FDR < 0.05 significance level
dim(deGeneResult)[1] # number of genes tested
dim(deGeneResult[deGeneResult$padj < 0.05])[1] #number of significant genes

# order the DE results by adjusted p-value
deGeneResultSorted = deGeneResult[order(deGeneResult$padj),]

# create a filtered data frame that limits to only the significant DE genes (adjusted p.value < 0.05)
deGeneResultSignificant = deGeneResultSorted[deGeneResultSorted$padj < 0.05]
# set the working directory to the output dir where we will store any results files
setwd(outdir)

# save the final DE result (all genes)  to an output file
fwrite(deGeneResultSorted, file='DE_all_genes_DESeq2.tsv', sep="\t")

# save the final DE result (significant genes only)  to an output file
fwrite(deGeneResultSignificant, file='DE_sig_genes_DESeq2.tsv', sep="\t")

# save the DESeq2 objects for the data visualization section
saveRDS(dds, 'dds.rds')
saveRDS(res, 'res.rds')
saveRDS(resLFC, 'resLFC.rds')

# to exit R type the following
#quit(save="no")