Cluster Usage
For large-scale analyses (many perturbed TFs), the workflow is designed to run as a SLURM job array where each array task processes one TF.
Lookup File
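Create a TSV (tab-separated) file with one row per TF and no header row: array task N reads line N of the file, so the `#` line in the example below only labels the columns and must not be included. Only the first two columns are required: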
# response_file perturbed_tf suffix (opt) predictors_file (opt)
/data/response/GAT2.csv GAT2
/data/response/pTF1.csv pTF1 ypd1_run2 /data/predictors_v2.csv
- response_file: path to the response CSV for this TF
- perturbed_tf: the TF name (must match a column in both files)
- suffix (optional): appended to the output subdirectory name
- predictors_file (optional): overrides the global --predictors_file for this row
Job Script
Save the following as perturbation_binding_modeling.sh. It is submitted as a
SLURM array; each task reads one line from the lookup file.
#!/bin/bash
# SLURM job-array task script: each array task reads one line of the lookup
# file and runs `python -m tfbpmodeling` for that row.
#SBATCH --output=logs/%A_%a.out
#SBATCH --open-mode=append
#SBATCH --cpus-per-task=4
#SBATCH --mem=1G
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Write an informational message to stdout, prefixed with "[INFO] ".
# All arguments are joined with spaces; backslash escapes in the message are
# interpreted (echo -e).
log() {
  local message="[INFO] $*"
  echo -e "$message"
}
# Print the command-line usage to stdout and terminate the script.
#
# Arguments: none
# Outputs:   usage text on stdout
# Exits:     always exits with status 1 (also when reached via -h/--help)
print_usage() {
  # The delimiter is quoted so the text is emitted literally. With an unquoted
  # delimiter the shell treats every trailing "\" + newline as a line
  # continuation and collapses the whole example onto one long line.
  cat <<'EOF'
Usage:
sbatch --array=1-N perturbation_binding_modeling.sh \
--venv=PATH_TO_VENV_ACTIVATE \
--lookup=LOOKUP_FILE \
--output_dir=OUTPUT_DIR \
[--predictors=PREDICTORS_CSV] \
[--all_data_ci_level=98.0] \
[--topn_ci_level=90.0] \
[--n_bootstraps=1000] \
[--random_state=42] \
[--stage3_lasso] \
[--stage3_lasso_topn] \
[--stage3_lassocv_bootstrap] \
[--iterative_dropout] \
[--scale_by_std] \
[--bins=0,64,512,np.inf] \
[--exclude_model_variables=VARNAME] \
[--add_model_variables=VARNAME] \
[--stabilization_ci_start=VALUE]
NOTE: --predictors is required unless provided in the lookup file.
EOF
  exit 1
}
# Split one tab-separated lookup row into the globals used to build the
# modeling command.
#
# Arguments:
#   $1 - a single line of the lookup file (fields separated by tabs)
# Globals written:
#   RESPONSE_FILE            - field 1
#   PERTURBED_TF             - field 2
#   SUFFIX                   - field 3, or "" when the row has fewer than 3 fields
#   PREDICTORS_FILE_OVERRIDE - field 4, or "" when the row has fewer than 4 fields
# Returns:
#   0 on success (including rows that only have the two required fields)
parse_lookup_line() {
  local line="$1"
  local num_fields

  # Tolerate lookup files saved with CRLF line endings; otherwise the last
  # field would carry a trailing carriage return.
  line=${line%$'\r'}

  # awk with an explicit tab separator keeps empty fields in position, so an
  # empty suffix followed by a predictors override is parsed correctly.
  num_fields=$(awk -F'\t' '{print NF}' <<<"$line")
  RESPONSE_FILE=$(awk -F'\t' '{print $1}' <<<"$line")
  PERTURBED_TF=$(awk -F'\t' '{print $2}' <<<"$line")
  SUFFIX=""
  PREDICTORS_FILE_OVERRIDE=""

  # Use `if` instead of `[[ ... ]] && ...`: when such a list is the last
  # statement and its test is false, the function returns 1, and a caller
  # running under `set -e` aborts on rows without the optional columns.
  if [[ "$num_fields" -ge 3 ]]; then
    SUFFIX=$(awk -F'\t' '{print $3}' <<<"$line")
  fi
  if [[ "$num_fields" -ge 4 ]]; then
    PREDICTORS_FILE_OVERRIDE=$(awk -F'\t' '{print $4}' <<<"$line")
  fi
  return 0
}
# ---------------------------------------------------------------------------
# Main flow: parse options, validate them, activate the environment, read this
# array task's lookup row, then build and run the tfbpmodeling command.
# ---------------------------------------------------------------------------

# Show usage when invoked with no arguments or with an explicit help flag.
if [[ "$#" -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
  print_usage
fi

# Defaults
MAX_ITER=10000
N_BOOTSTRAPS=1000
ALL_DATA_CI_LEVEL=98.0
TOPN_CI_LEVEL=90.0
VENV="" LOOKUP="" PREDICTORS_FILE="" OUTPUT_OUTER=""
# Optional settings are stored as bare values (without their flag) so they can
# be appended to the command array as individually quoted words.
RANDOM_STATE="" BINS="" STABILIZATION_CI_START=""
EXCLUDE_MODEL_VARIABLES="" ADD_MODEL_VARIABLES=""
# Boolean switches hold the flag to forward, or "" when not requested.
STAGE3_LASSO_FLAG="" STAGE3_LASSO_TOPN_FLAG="" STAGE3_LASSOCV_BOOTSTRAP_FLAG=""
SCALE_BY_STD="" ITERATIVE_DROPOUT=""

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --venv=*) VENV="${1#*=}" ;;
    --lookup=*) LOOKUP="${1#*=}" ;;
    --predictors=*) PREDICTORS_FILE="${1#*=}" ;;
    --output_dir=*) OUTPUT_OUTER="${1#*=}" ;;
    --all_data_ci_level=*) ALL_DATA_CI_LEVEL="${1#*=}" ;;
    --topn_ci_level=*) TOPN_CI_LEVEL="${1#*=}" ;;
    --n_bootstraps=*) N_BOOTSTRAPS="${1#*=}" ;;
    --random_state=*) RANDOM_STATE="${1#*=}" ;;
    --stage3_lasso) STAGE3_LASSO_FLAG="--stage3_lasso" ;;
    --stage3_lasso_topn) STAGE3_LASSO_TOPN_FLAG="--stage3_lasso_topn" ;;
    --stage3_lassocv_bootstrap)
      STAGE3_LASSOCV_BOOTSTRAP_FLAG="--stage3_lassocv_bootstrap" ;;
    --bins=*) BINS="${1#*=}" ;;
    --exclude_model_variables=*) EXCLUDE_MODEL_VARIABLES="${1#*=}" ;;
    --add_model_variables=*) ADD_MODEL_VARIABLES="${1#*=}" ;;
    --scale_by_std) SCALE_BY_STD="--scale_by_std" ;;
    --iterative_dropout) ITERATIVE_DROPOUT="--iterative_dropout" ;;
    --stabilization_ci_start=*) STABILIZATION_CI_START="${1#*=}" ;;
    -h|--help) print_usage ;;
    *) echo "[ERROR] Unknown argument: $1" >&2; print_usage ;;
  esac
  shift
done

# Validate everything that is used below BEFORE using it, so a missing option
# yields the usage message rather than an obscure `source`/`sed` failure.
if [[ -z "$VENV" || -z "$LOOKUP" || -z "$OUTPUT_OUTER" ]]; then
  echo "[ERROR] Missing required argument." >&2
  print_usage
fi
if [[ ! -r "$LOOKUP" ]]; then
  echo "[ERROR] Lookup file not found or unreadable: $LOOKUP" >&2
  exit 1
fi
if [[ -z "${SLURM_ARRAY_TASK_ID:-}" ]]; then
  echo "[ERROR] SLURM_ARRAY_TASK_ID is not set; submit with sbatch --array." >&2
  exit 1
fi

log "Activating environment..."
# shellcheck source=/dev/null
source "$VENV"

# Array task N processes line N of the lookup file.
LINE=$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$LOOKUP")
if [[ -z "$LINE" ]]; then
  echo "[ERROR] No lookup entry on line ${SLURM_ARRAY_TASK_ID} of $LOOKUP." >&2
  exit 1
fi
parse_lookup_line "$LINE"

# A per-row predictors file takes precedence over the global --predictors.
if [[ -n "${PREDICTORS_FILE_OVERRIDE}" ]]; then
  PREDICTORS_FILE="$PREDICTORS_FILE_OVERRIDE"
fi
if [[ -z "$PREDICTORS_FILE" ]]; then
  echo "[ERROR] Missing required argument." >&2
  print_usage
fi

# Build the command as an array so every argument stays a single word even if
# it contains spaces or glob characters.
CMD=(
  python -m tfbpmodeling
  --response_file "$RESPONSE_FILE"
  --predictors_file "$PREDICTORS_FILE"
  --perturbed_tf "$PERTURBED_TF"
  --n_bootstraps "$N_BOOTSTRAPS"
  --output_dir "$OUTPUT_OUTER"
  --max_iter "$MAX_ITER"
  --all_data_ci_level "$ALL_DATA_CI_LEVEL"
  --topn_ci_level "$TOPN_CI_LEVEL"
)

# Append optional arguments only when they were requested.
for flag in \
  "$STAGE3_LASSO_FLAG" \
  "$STAGE3_LASSO_TOPN_FLAG" \
  "$STAGE3_LASSOCV_BOOTSTRAP_FLAG"; do
  if [[ -n "$flag" ]]; then
    CMD+=("$flag")
  fi
done
if [[ -n "$RANDOM_STATE" ]]; then
  CMD+=(--random_state "$RANDOM_STATE")
fi
if [[ -n "$SUFFIX" ]]; then
  CMD+=(--output_suffix "$SUFFIX")
fi
if [[ -n "$BINS" ]]; then
  CMD+=(--bins "$BINS")
fi
if [[ -n "$EXCLUDE_MODEL_VARIABLES" ]]; then
  CMD+=("--exclude_model_variables=$EXCLUDE_MODEL_VARIABLES")
fi
if [[ -n "$ADD_MODEL_VARIABLES" ]]; then
  CMD+=("--add_model_variables=$ADD_MODEL_VARIABLES")
fi
for flag in "$ITERATIVE_DROPOUT" "$SCALE_BY_STD"; do
  if [[ -n "$flag" ]]; then
    CMD+=("$flag")
  fi
done
if [[ -n "$STABILIZATION_CI_START" ]]; then
  CMD+=(--stabilization_ci_start "$STABILIZATION_CI_START")
fi

# Log the exact command (shell-quoted) before running it.
log "Launching:"
printf "%q " "${CMD[@]}"
echo
"${CMD[@]}"
Submission Script
Save the following as submit_jobs.sh alongside the job script:
#!/bin/bash
# Submit perturbation_binding_modeling.sh as a SLURM job array with one task
# per lookup row.
#
# Usage:
# ./submit_jobs.sh OUTPUT_DIR LOOKUP VENV_ACTIVATE [ARRAY_LENGTH]
#
# ARRAY_LENGTH defaults to the number of lines in LOOKUP.
set -euo pipefail

if [[ "$#" -lt 3 || "$#" -gt 4 ]]; then
  echo "Usage: $0 OUTPUT_DIR LOOKUP VENV_ACTIVATE [ARRAY_LENGTH]" >&2
  exit 1
fi

OUTPUT_DIR=$1
LOOKUP=$2
ACTIVATE=$3

if [[ ! -r "$LOOKUP" ]]; then
  echo "[ERROR] Lookup file not found or unreadable: $LOOKUP" >&2
  exit 1
fi

if [[ -n "${4:-}" ]]; then
  ARRAY_LENGTH=$4
else
  # grep -c '' counts every line, including a final line that lacks a trailing
  # newline (which `wc -l` would miss). `|| true` keeps an empty file from
  # tripping `set -e`; the check below reports it instead.
  ARRAY_LENGTH=$(grep -c '' "$LOOKUP" || true)
fi
if ! [[ "$ARRAY_LENGTH" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] ARRAY_LENGTH must be a positive integer (got '$ARRAY_LENGTH')." >&2
  exit 1
fi

# The job script writes its output to logs/%A_%a.out relative to the
# submission directory. NOTE(review): SLURM is not expected to create a missing
# output directory, so make sure it exists before submitting.
mkdir -p logs

sbatch --array="1-${ARRAY_LENGTH}" perturbation_binding_modeling.sh \
  --venv="$ACTIVATE" \
  --lookup="$LOOKUP" \
  --output_dir="$OUTPUT_DIR" \
  --bins="0,64,512,np.inf" \
  --scale_by_std \
  --stage3_lasso \
  --stage3_lasso_topn \
  --stage3_lassocv_bootstrap \
  --iterative_dropout \
  --random_state=42
Submit with:
./submit_jobs.sh /path/to/results lookup.tsv /path/to/venv/bin/activate