# Install the latest version of DEseq2# if (!requireNamespace("BiocManager", quietly = TRUE))# install.packages("BiocManager")# BiocManager::install("DESeq2")# define working dir pathsdatadir ="/cloud/project/data/bulk_rna"outdir ="/cloud/project/outdir"# load R libraries we will use in this sectionlibrary(DESeq2)library(data.table)# set working directory to data dirsetwd(datadir)# read in the RNAseq read counts for each gene (produced by htseq-count)htseqCounts <-fread("gene_read_counts_table_all_final.tsv")
# view class of the dataclass(htseqCounts)# convert the data.table to matrix formathtseqCounts <-as.matrix(htseqCounts)class(htseqCounts)# set the gene ID values to be the row names for the matrixrownames(htseqCounts) <- htseqCounts[,"GeneID"]# now that the gene IDs are the row names, remove the redundant column that contains themhtseqCounts <- htseqCounts[, colnames(htseqCounts) !="GeneID"]# convert the actual count values from strings (with spaces) to integers, because originally the gene column contained characters, the entire matrix was set to characterclass(htseqCounts) <-"integer"# view the first few lines of the gene count matrixhead(htseqCounts)# it can also be useful to view interactively (if in Rstudio)View(htseqCounts)
# run a filtering step# i.e. require that for every gene: at least 1 of 6 samples must have counts greater than 10# get index of rows that meet this criterion and use that to subset the matrix# note the dimensions of the matrix before and after filtering with dimdim(htseqCounts)htseqCounts <- htseqCounts[which(rowSums(htseqCounts >=10) >=1),]dim(htseqCounts)# Hint! if you find the above command confusing, break it into pieces and observe the result# # what does "rowSums(htseqCounts >= 10)" do?## what does "rowSums(htseqCounts >= 10) >=1" do?
# construct a mapping of the meta data for our experiment (comparing UHR cell lines to HBR brain tissues)# in simple terms this is defining the biological condition/label for each experimental replicate# create a simple one column dataframe to startmetaData <-data.frame('Condition'=c('UHR', 'UHR', 'UHR', 'HBR', 'HBR', 'HBR'))# convert the "Condition" column to a factor data type, this will determine the direction of log2 fold-changes for the genes (i.e. up or down regulated)metaData$Condition <-factor(metaData$Condition, levels=c('HBR', 'UHR'))# set the row names of the metaData dataframe to be the names of our sample replicates from the read counts matrixrownames(metaData) <-colnames(htseqCounts)# view the metadata dataframehead(metaData)# check that names of htseq count columns match the names of the meta data rows# use the "all" function which tests whether an entire logical vector is TRUEall(rownames(metaData) ==colnames(htseqCounts))
# make deseq2 data sets# here we are setting up our experiment by supplying: (1) the gene counts matrix, (2) the sample/replicate for each column, and (3) the biological conditions we wish to compare.# this is a simple example that works for many experiments but these can also get more complex# for example, including designs with multiple variables such as "~ group + condition",# and designs with interactions such as "~ genotype + treatment + genotype:treatment".dds <-DESeqDataSetFromMatrix(countData = htseqCounts, colData = metaData, design =~Condition)
# run the DESeq2 analysis on the "dds" objectdds <-DESeq(dds)# view the DE resultsres <-results(dds)View(res)
# shrink the log2 fold change estimates# shrinkage of effect size (log fold change estimates) is useful for visualization and ranking of genes# In simplistic terms, the goal of calculating "dispersion estimates" and "shrinkage" is also to account for the problem that# genes with low mean counts across replicates tend of have higher variability than those with higher mean counts.# Shrinkage attempts to correct for this. For a more detailed discussion of shrinkage refer to the DESeq2 vignette# first get the name of the coefficient (log fold change) to shrinkresultsNames(dds)# now apply the shrinkage approachresLFC <-lfcShrink(dds, coef="Condition_UHR_vs_HBR", type="apeglm")# make a copy of the shrinkage results to manipulatedeGeneResult <- resLFC#contrast the values for a few genes before and after shrinkagehead(res)head(deGeneResult)
# read in gene ID to name mappings (using "fread" an alternative to "read.table")mapping <-fread("ENSG_ID2Name.txt", header=F)# add names to the columns in the "mapping" dataframesetnames(mapping, c('ensemblID', 'Symbol'))# view the first few lines of the gene ID to name mappingshead(mapping)# merge on gene namesdeGeneResult$ensemblID <-rownames(deGeneResult)deGeneResult <-as.data.table(deGeneResult)deGeneResult <-merge(deGeneResult, mapping, by='ensemblID', all.x=T)# merge the original raw count values onto this final dataframe to aid interpretationoriginal_counts =as.data.frame(htseqCounts)original_counts[,"ensemblID"] =rownames(htseqCounts)deGeneResult =merge(deGeneResult, original_counts, by='ensemblID', all.x=T)# view the final merged dataframe# based on the raw counts and fold change values what does a negative fold change mean with respect to our two conditions HBR and UHR?head(deGeneResult)
# view the top genes according to adjusted p-valuedeGeneResult[order(deGeneResult$padj),]# view the top genes according to fold changedeGeneResult[order(deGeneResult$log2FoldChange),]# determine the number of up/down significant genes at FDR < 0.05 significance leveldim(deGeneResult)[1] # number of genes testeddim(deGeneResult[deGeneResult$padj <0.05])[1] #number of significant genes# order the DE results by adjusted p-valuedeGeneResultSorted = deGeneResult[order(deGeneResult$padj),]# create a filtered data frame that limits to only the significant DE genes (adjusted p.value < 0.05)deGeneResultSignificant = deGeneResultSorted[deGeneResultSorted$padj <0.05]
# set the working directory to the output dir where we will store any results filessetwd(outdir)# save the final DE result (all genes) to an output filefwrite(deGeneResultSorted, file='DE_all_genes_DESeq2.tsv', sep="\t")# save the final DE result (significant genes only) to an output filefwrite(deGeneResultSignificant, file='DE_sig_genes_DESeq2.tsv', sep="\t")# save the DESeq2 objects for the data visualization sectionsaveRDS(dds, 'dds.rds')saveRDS(res, 'res.rds')saveRDS(resLFC, 'resLFC.rds')# to exit R type the following#quit(save="no")