QC: Replicate Agreement
Chase Mateusiak
08/04/2022
QC_Sample_Agreement_Objects.Rmd
Abstract
Calculate RLE to evaluate replicate agreement brentlabRnaSeqTools package version: 0.0.0.0
Introduction
This illustrates how to construct the data object for the Sample Agreement QC step – RLE – on each of the data sets. In each case, once you have added the design (the last step for each set), you need to write the object to file and send it to the cluster. I suggest mounting to the cluster and writing to the mounted directory. See the vignette Running DESeq on HTCF for instructions on running DESeq on HTCF. Note that at the bottom of this script, there are instructions on how to copy the necessary scripts to run the DESeq model using MPI on htcf.
You can see all defined sets by entering ?createExperimentSet
. If an experiment set you are interested in does not exist as an option in set_names
, then you’ll need to use the extractColData(blrs)
of the blrs
object below to play around and come up with a set of filters to define your new set. See the github repository, “R/ExperimentSetFunctions.R” for examples
Setup the environment
library(brentlabRnaSeqTools)
library(rtracklayer)
library(tidyverse)
library(caret)
# set variables
KN99_GFF_RDS = Sys.getenv("kn99_stranded_gff_rds")
DB_USERNAME = Sys.getenv("db_username")
DB_PASSWORD = Sys.getenv("db_password")
# note: I mount to the cluster and output directly to it
DDS_OUTPUT_DIR = Sys.getenv("OUTPUT_DIR")
TODAY = format(lubridate::today(),"%Y%m%d")
# controls whether dds objects are written
WRITE_OUT = TRUE
Pull the database
blrs = brentlabRnaSeqSetFromDatabase('kn99',DB_USERNAME, DB_PASSWORD)
90minuteInduction
2016 grant set
This is the data set defined by the single locus KOs in 90minuteInduction conditions for genotypes in the grant_df
(this is a data object which is loaded as part of brentlabRnaSeqTools
– take a look if you are interested)
Create the set and quality filter
blrs_90min_grant = createExperimentSet(blrs, 'ninetyMin_2016Grant')
# note that this filters out those samples which failed QC1,
# but but does not filter on RLE unless the argument rle_iqr_threshold
# is set to a numeric value
blrs_90min_grant_fltr = qaFilter(blrs_90min_grant)
# remove WT which fall on dates with no perturbed samples
blrs_90min_grant_fltr =
filterWtByExperimentalLibdate_90min(blrs_90min_grant_fltr)
Add the design
blrs_90min_grant_fltr = estimateSizeFactorsByProtocol(blrs_90min_grant_fltr)
min_libdate = min(as.Date(colData(blrs_90min_grant_fltr)$libraryDate))
colData(blrs_90min_grant_fltr)$libraryDate =
colData(blrs_90min_grant_fltr)$libraryDate %>%
relevel(ref = as.character(min_libdate)) %>%
droplevels()
colData(blrs_90min_grant_fltr)$libraryProtocol =
colData(blrs_90min_grant_fltr)$libraryProtocol %>%
factor() %>%
relevel(ref = "SolexaPrep") %>%
droplevels()
colData(blrs_90min_grant_fltr)$genotype1 =
colData(blrs_90min_grant_fltr)$genotype1 %>%
relevel(ref = "CNAG_00000") %>%
droplevels()
mm = model.matrix(~libraryProtocol+libraryDate+genotype1,
extractColData(blrs_90min_grant_fltr))
min_new_protocol = min(as.Date((unique(pull(filter(
extractColData(blrs_90min_grant_fltr),
libraryProtocol == 'E7420L'),
libraryDate)))))
mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
add the design to the object
design(blrs_90min_grant_fltr) = mm
rm(mm)
write out
if(WRITE_OUT){
outpath = file.path(DDS_OUTPUT_DIR,
"90minDataFreeze",
TODAY,
"grant_only_input.rds")
dds = DESeq2::DESeqDataSetFromMatrix(
colData = extractColData(blrs_90min_grant_fltr),
countData = counts(blrs_90min_grant_fltr),
design = design(blrs_90min_grant_fltr))
sizeFactors(dds) = sizeFactors(blrs_90min_grant_fltr)
write_rds(dds, outpath)
}
2016 Grant Set with Doubles
This has all double KO samples in which either of the KO loci are in the grant_df
, plus single KO samples corresponding to one of the double KO perturbed loci and WT.
Create set and quality filter
blrs_grant_with_doubles = createExperimentSet(blrs, 'ninetyMin_2016GrantWithDoubles')
# note that this filters out those samples which failed QC1,
# but but does not filter on RLE unless the argument rle_iqr_threshold
# is set to a numeric value
blrs_grant_with_doubles_fltr = qaFilter(blrs_grant_with_doubles)
# remove WT which fall on dates with no perturbed samples
blrs_grant_with_doubles_fltr =
filterWtByExperimentalLibdate_90min(blrs_grant_with_doubles_fltr)
Add the design
blrs_grant_with_doubles_fltr = estimateSizeFactorsByProtocol(blrs_grant_with_doubles_fltr)
min_libdate = min(as.Date(colData(blrs_grant_with_doubles_fltr)$libraryDate))
colData(blrs_grant_with_doubles_fltr)$libraryDate =
colData(blrs_grant_with_doubles_fltr)$libraryDate %>%
relevel(ref = as.character(min_libdate)) %>%
droplevels()
# add a 'genotype' column which is a concatenation of genotype1 and genotype2
colData(blrs_grant_with_doubles_fltr)$genotype =
paste(colData(blrs_grant_with_doubles_fltr)$genotype1,
colData(blrs_grant_with_doubles_fltr)$genotype2,
sep = "_") %>%
str_remove('_$') %>%
factor() %>%
relevel(ref = "CNAG_00000") %>%
droplevels()
colData(blrs_grant_with_doubles_fltr)$libraryProtocol =
colData(blrs_grant_with_doubles_fltr)$libraryProtocol %>%
factor() %>%
relevel(ref = "SolexaPrep") %>%
droplevels()
mm = model.matrix(~libraryProtocol+libraryDate+genotype,
extractColData(blrs_grant_with_doubles_fltr))
min_new_protocol = min(
as.Date(
(unique(
pull(
filter(extractColData(blrs_grant_with_doubles_fltr),
libraryProtocol == 'E7420L'), libraryDate)))))
mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
inspect the model matrix for linear dependencies
lin_dep_report = caret::findLinearCombos(mm)
lin_dep_report
add the design to the object
design(blrs_grant_with_doubles_fltr) = mm
rm(mm)
Write out
if(WRITE_OUT){
outpath = file.path(DDS_OUTPUT_DIR,
"90minDataFreeze",
TODAY,
"grant_doubles_input.rds")
dds = DESeq2::DESeqDataSetFromMatrix(
colData = extractColData(blrs_grant_with_doubles_fltr),
countData = counts(blrs_grant_with_doubles_fltr),
design = design(blrs_grant_with_doubles_fltr))
sizeFactors(dds) = sizeFactors(blrs_grant_with_doubles_fltr)
write_rds(dds, outpath)
}
All
This set is all samples – singles and doubles – which are in the 90minuteInduction conditions.
Create set and quality filter
blrs_90min_all = createExperimentSet(blrs, 'ninetyMin_all')
# note that this filters out those samples which failed QC1,
# but but does not filter on RLE unless the argument rle_iqr_threshold
# is set to a numeric value
blrs_90min_all_fltr = qaFilter(blrs_90min_all)
# remove WT which fall on dates with no perturbed samples
blrs_90min_all_fltr =
filterWtByExperimentalLibdate_90min(blrs_90min_all_fltr)
Add the design
blrs_90min_all_fltr = estimateSizeFactorsByProtocol(blrs_90min_all_fltr)
min_libdate = min(as.Date(colData(blrs_90min_all_fltr)$libraryDate))
colData(blrs_90min_all_fltr)$libraryDate =
colData(blrs_90min_all_fltr)$libraryDate %>%
relevel(ref = as.character(min_libdate)) %>%
droplevels()
# add a 'genotype' column which is a concatenation of genotype1 and genotype2
colData(blrs_90min_all_fltr)$genotype =
paste(colData(blrs_90min_all_fltr)$genotype1,
colData(blrs_90min_all_fltr)$genotype2,
sep = "_") %>%
str_remove('_$') %>%
factor() %>%
relevel(ref = "CNAG_00000") %>%
droplevels()
colData(blrs_90min_all_fltr)$libraryProtocol =
colData(blrs_90min_all_fltr)$libraryProtocol %>%
factor() %>%
relevel(ref = "SolexaPrep") %>%
droplevels()
mm = model.matrix(~libraryProtocol+libraryDate+genotype,
extractColData(blrs_90min_all_fltr))
min_new_protocol = min(
as.Date(
(unique(
pull(
filter(extractColData(blrs_90min_all_fltr),
libraryProtocol == 'E7420L'), libraryDate)))))
mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
inspect the model matrix for linear dependencies
lin_dep_report = caret::findLinearCombos(mm)
lin_dep_report
add the design to the object
design(blrs_90min_all_fltr) = mm
rm(mm)
Write out
if(WRITE_OUT){
outpath = file.path(DDS_OUTPUT_DIR,
"90minDataFreeze",
TODAY,
"90min_all_input.rds")
dds = DESeq2::DESeqDataSetFromMatrix(
colData = extractColData(blrs_90min_all_fltr),
countData = counts(blrs_90min_all_fltr),
design = design(blrs_90min_all_fltr))
sizeFactors(dds) = sizeFactors(blrs_90min_all_fltr)
write_rds(dds, outpath)
}
Environmental Perturbation
Create Sets
ep_list = list(
wt = createExperimentSet(blrs, 'envPert_epWT'),
titration = createExperimentSet(blrs, 'envPert_titrationWT'),
perturbed = createExperimentSet(blrs, 'envPert_perturbed')
)
# NOTE!! AS OF 20220201 CNAG_03894 forms a linear depdendence with both
# concat treatment and libraryDate columns. it is being removed here to solve
# that issue
ep_list$perturbed = ep_list$perturbed[,ep_list$perturbed$genotype1 != "CNAG_03894"]
Quality Filter
ep_list_qc_passing = map(ep_list, qaFilter)
Expression Filter
# How this is done is up to you, and obviously affects what genes are left in.
# Below is an example. You need to think about the thresholds and filter method
# that suits your purpose best.
expr_fltr_list = map(ep_list, ~rowSums(edgeR::cpm(counts(.))>4) >= 4)
ep_list_qc_passing_fltr = map2(ep_list_qc_passing, expr_fltr_list, ~.x[.y,])
Set Design
setBaseConcatTreatmentBaseCond = function(ep_set, concat_base_cond){
colData(ep_set)$concat_treatment =
relevel(colData(ep_set)$concat_treatment, ref = concat_base_cond)
colData(ep_set) = droplevels(DataFrame(colData(ep_set)))
ep_set
}
setEpDesign = function(ep_set, design){
design(ep_set) = design
ep_set
}
concat_base_cond_list = list(
wt = "YPD_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30",
titration = "RPMI_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30",
perturbed = 'PBS_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_0'
)
ep_designs = list(
wt = formula(~libraryDate + concat_treatment),
titration = formula(~libraryDate + concat_treatment),
perturbed = formula(~libraryDate + concat_treatment + genotype1)
)
ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr,
concat_base_cond_list,
setBaseConcatTreatmentBaseCond)
ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr,
ep_designs,
setEpDesign)
Coerce back to DeseqDataObjects for proessing
ep_dds_list = map(ep_list_qc_passing_fltr, coerceToDds)
DDS_OUTPUT_DIR = "/mnt/scratch/rnaseq_pipeline/experiments/epTally"
if(WRITE_OUT){
today = format(lubridate::today(),"%Y%m%d")
output_dir = file.path(DDS_OUTPUT_DIR, today)
dir.create(output_dir, recursive=TRUE)
map(names(ep_dds_list), ~write_rds(ep_dds_list[[.]],
file.path(output_dir,
paste0("ep_",.,".rds"))))
}
Copy the HTCF DESeq scripts to the working directory
These scripts will run DESeq in parallel on HTCF. The only items you’l need to edit is the path to the lookup file (if you are running more than one model), and to the dds_input in deseq_mpi.sh. If you do not keep deseq_de.R in the same directory as deseq_mpi.sh, then you’ll need to update the path to the deseq_de.R script, also.
if(WRITE_OUT){
htcf_deseq_scripts = c(system.file('bash',
'htcf_parallel_deseq.sh',
package = "brentlabRnaSeqTools"),
system.file('R_executable',
'deseq_de.R',
package = "brentlabRnaSeqTools"))
lapply(htcf_deseq_scripts, file.copy, to = DDS_OUTPUT_DIR)
}
After DESeq(), Calculate RLE
Load in the post DESeq() data sets
Note: this entirely depends on what set you want to look at – in this example, I am doing the sample agreement and tallies for the EP sets
DESEQ_OUTPUT_LIST = list(
ep_wt = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_wt_20220201_output.rds",
ep_titration = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_titration_20220201_output.rds",
ep_perturbed = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_perturbed_20220201_output.rds"
)
add a column describing replicate groups if necessary
ep_dds_list$ep_pert$rep_col =
paste(ep_dds_list$ep_pert$genotype1,
ep_dds_list$ep_pert$concat_treatment, sep = "_")
calculate RLE
ep_rle_list = list(
ep_wt = removeLibdateByReplicate(ep_dds_list$ep_wt, "concat_treatment"),
ep_titr = removeLibdateByReplicate(ep_dds_list$ep_titr, "concat_treatment"),
ep_pert = removeLibdateByReplicate(ep_dds_list$ep_pert, "rep_col")
)
Update the database
if(UPDATE_DB){
updateRleTable = function(set_name, set_rle_summary){
iqr_col = paste0(set_name, "_iqr")
med_dev_col = paste0(set_name, "_median_dev")
update_df = set_rle_summary %>%
dplyr::select(replicateAgreementNumber, rle_iqr, rle_median_deviation) %>%
dplyr::rename(!!quo_name(iqr_col) := rle_iqr,
!!quo_name(med_dev_col) := rle_median_deviation)
res = patchTable(
database_info$kn99$urls$replicateAgreement,
Sys.getenv("kn99_db_token"),
update_df,
"replicateAgreementNumber")
res
}
res_list = map(names(ep_rle_list), ~updateRleTable(., ep_rle_list[[.]]$without_libdate_effect$summary))
# check status ----
## failures should only be those without enough replicates to calculate RLE
extract_status = map(res_list, ~map(., ~as.numeric(.$status_code)))
extract_status[[1]][unlist(extract_status[[1]]) != 200]
extract_status[[2]][unlist(extract_status[[2]]) != 200]
extract_status[[3]][unlist(extract_status[[3]]) != 200]
}