#!/bin/bash
#
# Contingency runner: runs inside a single DNAnexus swiss-army-knife job.
# Extracts phenotype, computes contingency tables for all variant sources
# in parallel, and merges results into one output TSV.
#
# Usage (called from run_contingency.sh):
#   bash contingency_runner.sh <PHENOTYPE> <TYPES> [N_PARALLEL]
#

set -euo pipefail

# ---- Arguments ----
PHENOTYPE="${1:?Usage: bash contingency_runner.sh <PHENOTYPE> <TYPES> [N_PARALLEL] [PHENO_TYPE]}"
TYPES="${2:-snv,hla,cypmicro,cypdosage,lof,mpc,cnv}"  # comma-separated variant groups to include
N_PARALLEL="${3:-36}"                                 # max concurrent contingency computations
PHENO_TYPE="${4:-binary}"                             # forwarded to the extraction/contingency scripts

# ---- Path constants (readonly: nothing below reassigns them) ----
readonly DX="/mnt/project"                       # DNAnexus project mount
readonly TOOLS_MNT="${DX}/results/tools"         # analysis scripts location
readonly WORK_DIR="/tmp/contingency_work"        # scratch space for intermediates
readonly RESULTS_DIR="/tmp/contingency_results"  # one TSV per variant source
readonly PHENO_TSV="${WORK_DIR}/${PHENOTYPE}_pheno.tsv"
readonly OUTPUT="/home/dnanexus/out/out/contingency_${PHENOTYPE}.tsv"  # job output path

mkdir -p "$WORK_DIR" "$RESULTS_DIR" /home/dnanexus/out/out

echo "=== Contingency Runner ==="
echo "Phenotype:  $PHENOTYPE"
echo "Types:      $TYPES"
echo "Parallel:   $N_PARALLEL"
echo "Pheno type: $PHENO_TYPE"
echo ""

# ---- Step 1: Extract phenotype ----
# Fast path: reuse a previously extracted phenotype TSV if one exists in the
# project. Slow path: extract from the parquet archives.
echo "--- Step 1: Phenotype extraction ---"

# Use the TOOLS_MNT constant rather than re-spelling ${DX}/results/tools,
# so the tools location is defined in exactly one place.
EXISTING_PHENO="${TOOLS_MNT}/pheno_extracts/${PHENOTYPE}_pheno.tsv"
if [[ -f "$EXISTING_PHENO" ]]; then
    echo "Found existing phenotype extract, copying..."
    cp "$EXISTING_PHENO" "$PHENO_TSV"
else
    echo "Extracting phenotype from parquet archives..."
    pip install pyarrow -q  # extract_phenotype.py reads parquet; worker image may lack pyarrow
    python3 "${TOOLS_MNT}/extract_phenotype.py" \
        --parquet-tar "${DX}/results/pheno/applets-resources/clinical_phenotypes_parquet.tar.gz" \
        --parquet-tar "${DX}/results/pheno/applets-resources/prescription_phenotypes_parquet.tar.gz" \
        --parquet-tar "${DX}/results/pheno/applets-resources/assessment_centre_phenotypes_parquet.tar.gz" \
        --samples "${DX}/data/cohorts/train_set.txt" \
        --phenotype "${PHENOTYPE}" \
        --phenotype-type "${PHENO_TYPE}" \
        --output "${PHENO_TSV}"
fi

# Count data rows (skip the header line).
N_SAMPLES=$(tail -n +2 "$PHENO_TSV" | wc -l)
echo "Phenotype samples: $N_SAMPLES"
echo ""

# ---- Step 2: Define variant sources ----
echo "--- Step 2: Defining variant sources ---"

# Global registry of variant sources; each entry is a pipe-delimited spec:
# "base_path|variant_type|file_extension|group".
declare -a TASKS=()

# Register one variant source in TASKS.
# Arguments: $1 base path, $2 variant type, $3 file extension, $4 group name.
add_task() {
    local spec
    printf -v spec '%s|%s|%s|%s' "$1" "$2" "$3" "$4"
    TASKS+=("$spec")
}

# HLA hardcalls (single chr6 VCF)
add_task "hla/chr6_hla" hardcall ".vcf.gz" hla

# SNV hardcalls, autosomes + X. chrX uses the hemizygous->homozygous recoded
# VCF, which lives under a different path. ({1..22} replaces $(seq 1 22):
# same values, no subshell fork.)
for chr in {1..22} X; do
    if [[ "$chr" == "X" ]]; then
        add_task "snp/chrX/hemizygous_to_homozygous/chrX_snv_homozygous" hardcall ".vcf.gz" snv
    else
        add_task "snp/chr${chr}/chr${chr}_snv" hardcall ".vcf.gz" snv
    fi
done

# cypmicro hardcalls — only the listed chromosomes have sources
for chr in 2 3 4 6 7 8 10 11 12 14 15 19 20 22; do
    add_task "cypmicro/chr${chr}_cypmicro" hardcall ".vcf.gz" cypmicro
done

# cypdosage (dosage format) — only the listed chromosomes have sources
for chr in 1 7 10 12 13 19 22; do
    add_task "cyp2/chr${chr}_cypdosage" dosage ".vcf.gz" cypdosage
done

# WES loss-of-function hardcalls (.vcf.bgz), all chromosomes + X
for chr in {1..22} X; do
    add_task "wes/lof/chr${chr}_lof" hardcall ".vcf.bgz" lof
done

# WES missense/MPC dosages (.vcf.bgz), all chromosomes + X
for chr in {1..22} X; do
    add_task "wes/missense/chr${chr}_mpc" dosage ".vcf.bgz" mpc
done

# CNV (hardcalls_down_up), genes and regions subtypes per chromosome
for chr in {1..22} X; do
    for subtype in genes regions; do
        add_task "hardcalls_down_up/chr${chr}_cnv_${subtype}_dosage_to_hardcall" hardcall ".vcf.bgz" cnv
    done
done

# Keep only the sources whose group appears in the comma-separated TYPES list.
IFS=',' read -ra ALLOWED_TYPES <<< "$TYPES"

# is_type_allowed GROUP — exit status 0 iff GROUP was requested via TYPES.
is_type_allowed() {
    local candidate="$1" allowed
    for allowed in "${ALLOWED_TYPES[@]}"; do
        [[ "$allowed" == "$candidate" ]] && return 0
    done
    return 1
}

declare -a FILTERED_TASKS=()
for task_spec in "${TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    if is_type_allowed "$group"; then
        FILTERED_TASKS+=("$task_spec")
    fi
done

echo "Total sources: ${#FILTERED_TASKS[@]}"
echo ""

# ---- Step 3: Run contingency_all.py in parallel ----
echo "--- Step 3: Running contingency computations (${N_PARALLEL} parallel) ---"

# Check whether a base_path has companion pgen files (enables plink2 mode).
# Prints "true" or "false" on stdout — a string result, not an exit status,
# because callers interpolate it with $(...).
has_pgen() {
    case "$1" in
        hardcalls_down_up/*)
            # CNV hardcalls: VCF only, no pgen companions
            echo "false"
            ;;
        *hemizygous*)
            # chrX hemizygous VCF has no pgen files — use bcftools mode
            echo "false"
            ;;
        *)
            echo "true"
            ;;
    esac
}

# Run contingency_all.py for one variant source.
# Arguments:
#   $1 - base_path, relative to ${DX}/results/gemo (no extension)
#   $2 - variant type: "hardcall" or "dosage"
#   $3 - file extension of the VCF (".vcf.gz" or ".vcf.bgz")
# Writes its result to ${RESULTS_DIR}/<sanitized>.tsv; every line of the
# child's combined stdout/stderr is prefixed with "[<sanitized>] " so
# interleaved logs from parallel tasks stay attributable.
run_one_task() {
    local base_path="$1" vtype="$2" ext="$3"
    # Flatten path separators so each source maps to one flat result file.
    local sanitized="${base_path//\//_}"
    local output_path="${RESULTS_DIR}/${sanitized}.tsv"
    local vcf_mnt="${DX}/results/gemo/${base_path}${ext}"
    local pfile_mnt="${DX}/results/gemo/${base_path}"

    # plink2 mode requires both pgen companions on disk AND hardcall data;
    # anything else (dosage data, CNV, chrX hemizygous) reads the VCF directly.
    if [[ "$(has_pgen "$base_path")" == "true" && "$vtype" == "hardcall" ]]; then
        # plink2 mode
        python3 "${TOOLS_MNT}/contingency_all.py" \
            --use-plink2 \
            --pfile-prefix "${pfile_mnt}" \
            --phenotype-tsv "${PHENO_TSV}" \
            --variant-type hardcall \
            --vcf-source "${base_path}" \
            --phenotype-type "${PHENO_TYPE}" \
            --output "${output_path}" 2>&1 | sed "s/^/[${sanitized}] /"
    else
        # bcftools mode
        python3 "${TOOLS_MNT}/contingency_all.py" \
            --vcf "${vcf_mnt}" \
            --phenotype-tsv "${PHENO_TSV}" \
            --variant-type "${vtype}" \
            --vcf-source "${base_path}" \
            --phenotype-type "${PHENO_TYPE}" \
            --output "${output_path}" 2>&1 | sed "s/^/[${sanitized}] /"
    fi
}

# Launch tasks with semaphore-based parallelism: keep at most N_PARALLEL
# background jobs in flight, reaping one finished job before each launch
# beyond that limit.
RUNNING=0
for task_spec in "${FILTERED_TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    sanitized="${base_path//\//_}"

    echo "LAUNCHING: ${sanitized}"
    run_one_task "$base_path" "$vtype" "$ext" &

    # `wait -n` (bash 4.3+) returns when ANY background job finishes. The
    # `2>/dev/null || true` deliberately swallows both a failed job's exit
    # status and the usage error on bash without -n support; task failures
    # are detected later in the merge step by the absence of the per-source
    # result file.
    # NOTE(review): on bash < 4.3 this silently degrades to unbounded
    # parallelism — confirm the worker image ships bash >= 4.3.
    RUNNING=$((RUNNING + 1))
    if [[ $RUNNING -ge $N_PARALLEL ]]; then
        wait -n 2>/dev/null || true
        RUNNING=$((RUNNING - 1))
    fi
done

# Wait for all remaining tasks. A bare `wait` blocks until every child has
# exited and returns 0 regardless of the jobs' own statuses, so it does not
# trip `set -e` here.
echo ""
echo "Waiting for remaining tasks to complete..."
wait

echo ""
echo "--- Step 4: Merging results ---"

# Write the merged header. printf handles the \t escapes portably, unlike
# the non-portable `echo -e`.
printf 'VARIANT_ID\tVCF_SOURCE\tFORMAT_TYPE\tCAT1_NAME\tCAT2_NAME\tCAT1_CASE\tCAT2_CASE\tCAT1_CTRL\tCAT2_CTRL\tTOTAL_CASE\tTOTAL_CTRL\tMIN_CELL\tEXTRA\n' > "$OUTPUT"

# Append each per-source result minus its header; a missing result file
# means that source's background task failed.
SUCCEEDED=0
FAILED=0
for task_spec in "${FILTERED_TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    sanitized="${base_path//\//_}"
    result_file="${RESULTS_DIR}/${sanitized}.tsv"

    if [[ -f "$result_file" ]]; then
        # Single pass over the file: strip the header, append the data rows
        # to OUTPUT via tee -a, and count them — instead of reading twice.
        n_lines=$(tail -n +2 "$result_file" | tee -a "$OUTPUT" | wc -l)
        echo "  OK: ${sanitized} (${n_lines} variants)"
        SUCCEEDED=$((SUCCEEDED + 1))
    else
        echo "  MISSING: ${sanitized}"
        FAILED=$((FAILED + 1))
    fi
done

echo ""
echo "=== MERGE COMPLETE ==="
echo "Succeeded: $SUCCEEDED / ${#FILTERED_TASKS[@]}"
echo "Failed:    $FAILED / ${#FILTERED_TASKS[@]}"
TOTAL_VARIANTS=$(tail -n +2 "$OUTPUT" | wc -l)
echo "Total variants: $TOTAL_VARIANTS"
echo "Output: $OUTPUT"

if [[ $TOTAL_VARIANTS -gt 0 ]]; then
    echo ""
    echo "Preview (first 10 rows):"
    head -11 "$OUTPUT" | column -t -s $'\t'   # 11 lines = header + 10 data rows
fi
