QC: Replicate Agreement

Abstract

Calculate RLE to evaluate replicate agreement brentlabRnaSeqTools package version: 0.0.0.0

Introduction

This illustrates how to construct the data object for the Sample Agreement QC step – RLE – on each of the data sets. In each case, once you have added the design (the last step for each set), you need to write the object to file and send it to the cluster. I suggest mounting to the cluster and writing to the mounted directory. See the vignette Running DESeq on HTCF for instructions on running DESeq on HTCF. Note that at the bottom of this script, there are instructions on how to copy the necessary scripts to run the DESeq model using MPI on htcf.

You can see all defined sets by entering ?createExperimentSet. If an experiment set you are interested in does not exist as an option in set_names, then you’ll need to use the extractColData(blrs) of the blrs object below to play around and come up with a set of filters to define your new set. See the github repository, “R/ExperimentSetFunctions.R” for examples

Setup the environment

library(brentlabRnaSeqTools)
library(rtracklayer)
library(tidyverse)
library(caret)

# set variables 

KN99_GFF_RDS = Sys.getenv("kn99_stranded_gff_rds")
DB_USERNAME = Sys.getenv("db_username")
DB_PASSWORD = Sys.getenv("db_password")

# note: I mount to the cluster and output directly to it
DDS_OUTPUT_DIR = Sys.getenv("OUTPUT_DIR")
TODAY = format(lubridate::today(),"%Y%m%d")

# controls whether dds objects are written
WRITE_OUT = TRUE

Pull the database

blrs = brentlabRnaSeqSetFromDatabase('kn99',DB_USERNAME, DB_PASSWORD)

Add gene level data (optional)

this adds all of the data regarding each locus as a GRange object to the gene data slot of the brentlabRnaSeqSet object. Useful if you are going to use other Bioconductor packages, or brentlabRnaSeqTools::createIgvBrowserShot()

kn99_gff = readRDS(KN99_GFF_RDS)

kn99_genes = kn99_gff[kn99_gff$ID %in% rownames(blrs)]

rowRanges(blrs) = kn99_genes[order(match(kn99_genes$ID,rownames(blrs)))]

90minuteInduction

2016 grant set

This is the data set defined by the single locus KOs in 90minuteInduction conditions for genotypes in the grant_df (this is a data object which is loaded as part of brentlabRnaSeqTools – take a look if you are interested)

Create the set and quality filter

blrs_90min_grant = createExperimentSet(blrs, 'ninetyMin_2016Grant')

# note that this filters out those samples which failed QC1, 
# but but does not filter on RLE unless the argument rle_iqr_threshold 
# is set to a numeric value
blrs_90min_grant_fltr = qaFilter(blrs_90min_grant)

# remove WT which fall on dates with no perturbed samples
blrs_90min_grant_fltr = 
  filterWtByExperimentalLibdate_90min(blrs_90min_grant_fltr)

Add the design


blrs_90min_grant_fltr = estimateSizeFactorsByProtocol(blrs_90min_grant_fltr)

min_libdate = min(as.Date(colData(blrs_90min_grant_fltr)$libraryDate))

colData(blrs_90min_grant_fltr)$libraryDate = 
  colData(blrs_90min_grant_fltr)$libraryDate %>%
  relevel(ref = as.character(min_libdate)) %>%
  droplevels()

colData(blrs_90min_grant_fltr)$libraryProtocol = 
  colData(blrs_90min_grant_fltr)$libraryProtocol %>%
  factor() %>%
  relevel(ref = "SolexaPrep") %>%
  droplevels()

colData(blrs_90min_grant_fltr)$genotype1 = 
  colData(blrs_90min_grant_fltr)$genotype1 %>%
  relevel(ref = "CNAG_00000") %>%
  droplevels()

mm = model.matrix(~libraryProtocol+libraryDate+genotype1, 
                  extractColData(blrs_90min_grant_fltr))

min_new_protocol = min(as.Date((unique(pull(filter(
  extractColData(blrs_90min_grant_fltr), 
  libraryProtocol == 'E7420L'), 
  libraryDate)))))

mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]

inspect the model matrix for linear dependencies

lin_dep_report = caret::findLinearCombos(mm)

add the design to the object

design(blrs_90min_grant_fltr) = mm
rm(mm)

write out

if(WRITE_OUT){
  
  outpath = file.path(DDS_OUTPUT_DIR,
                      "90minDataFreeze",
                      TODAY,
                      "grant_only_input.rds")
  
  dds = DESeq2::DESeqDataSetFromMatrix(
    colData = extractColData(blrs_90min_grant_fltr),
    countData = counts(blrs_90min_grant_fltr),
    design = design(blrs_90min_grant_fltr))
  
  sizeFactors(dds) = sizeFactors(blrs_90min_grant_fltr)
  
  write_rds(dds, outpath)
}

2016 Grant Set with Doubles

This has all double KO samples in which either of the KO loci are in the grant_df, plus single KO samples corresponding to one of the double KO perturbed loci and WT.

Create set and quality filter

blrs_grant_with_doubles = createExperimentSet(blrs, 'ninetyMin_2016GrantWithDoubles')

# note that this filters out those samples which failed QC1, 
# but but does not filter on RLE unless the argument rle_iqr_threshold 
# is set to a numeric value
blrs_grant_with_doubles_fltr = qaFilter(blrs_grant_with_doubles)

# remove WT which fall on dates with no perturbed samples
blrs_grant_with_doubles_fltr = 
  filterWtByExperimentalLibdate_90min(blrs_grant_with_doubles_fltr)

Add the design

blrs_grant_with_doubles_fltr = estimateSizeFactorsByProtocol(blrs_grant_with_doubles_fltr)

min_libdate = min(as.Date(colData(blrs_grant_with_doubles_fltr)$libraryDate))

colData(blrs_grant_with_doubles_fltr)$libraryDate = 
  colData(blrs_grant_with_doubles_fltr)$libraryDate %>%
  relevel(ref = as.character(min_libdate)) %>%
  droplevels()

# add a 'genotype' column which is a concatenation of genotype1 and genotype2
colData(blrs_grant_with_doubles_fltr)$genotype = 
  paste(colData(blrs_grant_with_doubles_fltr)$genotype1,
        colData(blrs_grant_with_doubles_fltr)$genotype2,
        sep = "_") %>%
  str_remove('_$') %>%
  factor() %>%
  relevel(ref = "CNAG_00000") %>%
  droplevels()

colData(blrs_grant_with_doubles_fltr)$libraryProtocol = 
  colData(blrs_grant_with_doubles_fltr)$libraryProtocol %>%
  factor() %>%
  relevel(ref = "SolexaPrep") %>%
  droplevels()

mm = model.matrix(~libraryProtocol+libraryDate+genotype, 
                  extractColData(blrs_grant_with_doubles_fltr))

min_new_protocol = min(
  as.Date(
    (unique(
      pull(
        filter(extractColData(blrs_grant_with_doubles_fltr), 
               libraryProtocol == 'E7420L'), libraryDate)))))

mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]

inspect the model matrix for linear dependencies

lin_dep_report = caret::findLinearCombos(mm)
lin_dep_report

add the design to the object

design(blrs_grant_with_doubles_fltr) = mm
rm(mm)

Write out

if(WRITE_OUT){
  
  outpath = file.path(DDS_OUTPUT_DIR,
                      "90minDataFreeze",
                      TODAY,
                      "grant_doubles_input.rds")
  
  dds = DESeq2::DESeqDataSetFromMatrix(
    colData = extractColData(blrs_grant_with_doubles_fltr),
    countData = counts(blrs_grant_with_doubles_fltr),
    design = design(blrs_grant_with_doubles_fltr))
  
  sizeFactors(dds) = sizeFactors(blrs_grant_with_doubles_fltr)
  
  write_rds(dds, outpath)
}

All

This set is all samples – singles and doubles – which are in the 90minuteInduction conditions.

Create set and quality filter

blrs_90min_all = createExperimentSet(blrs, 'ninetyMin_all')

# note that this filters out those samples which failed QC1, 
# but but does not filter on RLE unless the argument rle_iqr_threshold 
# is set to a numeric value
blrs_90min_all_fltr = qaFilter(blrs_90min_all)

# remove WT which fall on dates with no perturbed samples
blrs_90min_all_fltr = 
  filterWtByExperimentalLibdate_90min(blrs_90min_all_fltr)

Add the design

blrs_90min_all_fltr = estimateSizeFactorsByProtocol(blrs_90min_all_fltr)

min_libdate = min(as.Date(colData(blrs_90min_all_fltr)$libraryDate))

colData(blrs_90min_all_fltr)$libraryDate = 
  colData(blrs_90min_all_fltr)$libraryDate %>%
  relevel(ref = as.character(min_libdate)) %>%
  droplevels()

# add a 'genotype' column which is a concatenation of genotype1 and genotype2
colData(blrs_90min_all_fltr)$genotype = 
  paste(colData(blrs_90min_all_fltr)$genotype1,
        colData(blrs_90min_all_fltr)$genotype2,
        sep = "_") %>%
  str_remove('_$') %>%
  factor() %>%
  relevel(ref = "CNAG_00000") %>%
  droplevels()

colData(blrs_90min_all_fltr)$libraryProtocol = 
  colData(blrs_90min_all_fltr)$libraryProtocol %>%
  factor() %>%
  relevel(ref = "SolexaPrep") %>%
  droplevels()

mm = model.matrix(~libraryProtocol+libraryDate+genotype, 
                  extractColData(blrs_90min_all_fltr))

min_new_protocol = min(
  as.Date(
    (unique(
      pull(
        filter(extractColData(blrs_90min_all_fltr), 
               libraryProtocol == 'E7420L'), libraryDate)))))

mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]

inspect the model matrix for linear dependencies

lin_dep_report = caret::findLinearCombos(mm)
lin_dep_report

add the design to the object

design(blrs_90min_all_fltr) = mm
rm(mm)

Write out

if(WRITE_OUT){
  
  outpath = file.path(DDS_OUTPUT_DIR,
                      "90minDataFreeze",
                      TODAY,
                      "90min_all_input.rds")
  
  dds = DESeq2::DESeqDataSetFromMatrix(
    colData = extractColData(blrs_90min_all_fltr),
    countData = counts(blrs_90min_all_fltr),
    design = design(blrs_90min_all_fltr))
  
  sizeFactors(dds) = sizeFactors(blrs_90min_all_fltr)
  
  write_rds(dds, outpath)
}

Environmental Perturbation

Create Sets

ep_list = list(
  wt = createExperimentSet(blrs, 'envPert_epWT'),
  titration = createExperimentSet(blrs, 'envPert_titrationWT'),
  perturbed = createExperimentSet(blrs, 'envPert_perturbed')
)

# NOTE!! AS OF 20220201 CNAG_03894 forms a linear depdendence with both 
# concat treatment and libraryDate columns. it is being removed here to solve
# that issue

ep_list$perturbed = ep_list$perturbed[,ep_list$perturbed$genotype1 != "CNAG_03894"]

Quality Filter

ep_list_qc_passing = map(ep_list, qaFilter)

Expression Filter

# How this is done is up to you, and obviously affects what genes are left in.
# Below is an example. You need to think about the thresholds and filter method 
# that suits your purpose best.

expr_fltr_list = map(ep_list, ~rowSums(edgeR::cpm(counts(.))>4) >= 4)

ep_list_qc_passing_fltr = map2(ep_list_qc_passing, expr_fltr_list, ~.x[.y,])

Set Design

setBaseConcatTreatmentBaseCond = function(ep_set, concat_base_cond){
  colData(ep_set)$concat_treatment = 
    relevel(colData(ep_set)$concat_treatment, ref = concat_base_cond)
  
  colData(ep_set) = droplevels(DataFrame(colData(ep_set)))
  
  ep_set
  
}

setEpDesign = function(ep_set, design){
  design(ep_set) = design
  
  ep_set
}

concat_base_cond_list = list(
  
  wt = "YPD_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30",
  titration = "RPMI_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30",
  perturbed = 'PBS_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_0'
)

ep_designs = list(
  wt = formula(~libraryDate + concat_treatment),
  titration = formula(~libraryDate + concat_treatment),
  perturbed = formula(~libraryDate + concat_treatment + genotype1)
)

ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr, 
                               concat_base_cond_list, 
                               setBaseConcatTreatmentBaseCond)

ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr, 
                               ep_designs, 
                               setEpDesign)

Coerce back to DeseqDataObjects for proessing

ep_dds_list = map(ep_list_qc_passing_fltr, coerceToDds)

DDS_OUTPUT_DIR = "/mnt/scratch/rnaseq_pipeline/experiments/epTally"

if(WRITE_OUT){
  today = format(lubridate::today(),"%Y%m%d")

  output_dir = file.path(DDS_OUTPUT_DIR, today)

  dir.create(output_dir, recursive=TRUE)
  
  map(names(ep_dds_list), ~write_rds(ep_dds_list[[.]],
                                     file.path(output_dir,
                                               paste0("ep_",.,".rds"))))
}

Copy the HTCF DESeq scripts to the working directory

These scripts will run DESeq in parallel on HTCF. The only items you’l need to edit is the path to the lookup file (if you are running more than one model), and to the dds_input in deseq_mpi.sh. If you do not keep deseq_de.R in the same directory as deseq_mpi.sh, then you’ll need to update the path to the deseq_de.R script, also.

if(WRITE_OUT){
  htcf_deseq_scripts = c(system.file('bash', 
                            'htcf_parallel_deseq.sh', 
                            package = "brentlabRnaSeqTools"),
                       system.file('R_executable', 
                               'deseq_de.R', 
                                package = "brentlabRnaSeqTools"))

lapply(htcf_deseq_scripts, file.copy, to = DDS_OUTPUT_DIR)
}

After DESeq(), Calculate RLE

Environmental Perturbation

Load in the post DESeq() data sets

Note: this entirely depends on what set you want to look at – in this example, I am doing the sample agreement and tallies for the EP sets

DESEQ_OUTPUT_LIST = list(
  ep_wt = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_wt_20220201_output.rds",
  ep_titration = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_titration_20220201_output.rds",
  ep_perturbed = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_perturbed_20220201_output.rds"
)

Environmental Perturbation

Create the set

# get deseq output object
ep_dds_list = list(
  ep_wt = readRDS(DESEQ_OUTPUT_LIST$ep_wt),
  ep_titr = readRDS(DESEQ_OUTPUT_LIST$ep_titration),
  ep_pert = readRDS(DESEQ_OUTPUT_LIST$ep_perturbed)
)

add a column describing replicate groups if necessary

ep_dds_list$ep_pert$rep_col = 
  paste(ep_dds_list$ep_pert$genotype1, 
        ep_dds_list$ep_pert$concat_treatment, sep = "_")

calculate RLE

ep_rle_list = list(
  ep_wt = removeLibdateByReplicate(ep_dds_list$ep_wt, "concat_treatment"),
  ep_titr = removeLibdateByReplicate(ep_dds_list$ep_titr, "concat_treatment"),
  ep_pert = removeLibdateByReplicate(ep_dds_list$ep_pert, "rep_col")
)

Update the database

if(UPDATE_DB){

 updateRleTable = function(set_name, set_rle_summary){
   iqr_col = paste0(set_name, "_iqr")
   med_dev_col = paste0(set_name, "_median_dev")
   
   update_df = set_rle_summary %>%
   dplyr::select(replicateAgreementNumber, rle_iqr, rle_median_deviation) %>%
   dplyr::rename(!!quo_name(iqr_col) := rle_iqr, 
                 !!quo_name(med_dev_col) := rle_median_deviation)
  
  
 res = patchTable(
  database_info$kn99$urls$replicateAgreement,
  Sys.getenv("kn99_db_token"),
  update_df,
  "replicateAgreementNumber")
 
 res
 }
  
res_list = map(names(ep_rle_list), ~updateRleTable(., ep_rle_list[[.]]$without_libdate_effect$summary))

# check status ----
## failures should only be those without enough replicates to calculate RLE
extract_status = map(res_list, ~map(., ~as.numeric(.$status_code)))

extract_status[[1]][unlist(extract_status[[1]]) != 200]
extract_status[[2]][unlist(extract_status[[2]]) != 200]
extract_status[[3]][unlist(extract_status[[3]]) != 200]
}

Chase Mateusiak

08/04/2022

Introduction

Setup the environment

Pull the database

Add gene level data (optional)

90minuteInduction

2016 grant set

Create the set and quality filter

Add the design

inspect the model matrix for linear dependencies

add the design to the object

write out

2016 Grant Set with Doubles

Create set and quality filter

Add the design

inspect the model matrix for linear dependencies

add the design to the object

Write out

All

Create set and quality filter

Add the design

inspect the model matrix for linear dependencies

add the design to the object

Write out

Environmental Perturbation

Create Sets

Quality Filter

Expression Filter

Set Design

Coerce back to DeseqDataObjects for proessing

Copy the HTCF DESeq scripts to the working directory

After DESeq(), Calculate RLE

Environmental Perturbation

Load in the post DESeq() data sets

Environmental Perturbation

Create the set

add a column describing replicate groups if necessary

calculate RLE

Update the database