#!/bin/bash
#
# Contingency runner: runs inside a single DNAnexus swiss-army-knife job.
# Extracts phenotype, computes contingency tables for all variant sources
# in parallel, and merges results into one output TSV.
#
# Usage (called from run_contingency.sh):
#   bash contingency_runner.sh <PHENOTYPE> <TYPES> [N_PARALLEL]
#

set -euo pipefail

# ---- Arguments ----
PHENOTYPE="${1:?Usage: bash contingency_runner.sh <PHENOTYPE> <TYPES> [N_PARALLEL] [PHENO_TYPE]}"
TYPES="${2:-snv,hla,cypmicro,cypdosage,lof,mpc,cnv}"  # comma-separated variant groups to include
N_PARALLEL="${3:-36}"                                 # max concurrent contingency computations
PHENO_TYPE="${4:-binary}"                             # forwarded to the extraction/contingency scripts

# ---- Path constants (readonly: nothing below reassigns them) ----
readonly DX="/mnt/project"                       # DNAnexus project mount
readonly TOOLS_MNT="${DX}/results/tools"         # analysis scripts location
readonly WORK_DIR="/tmp/contingency_work"        # scratch space for intermediates
readonly RESULTS_DIR="/tmp/contingency_results"  # one TSV per variant source
readonly PHENO_TSV="${WORK_DIR}/${PHENOTYPE}_pheno.tsv"
readonly OUTPUT="/home/dnanexus/out/out/contingency_${PHENOTYPE}.tsv"  # job output path

mkdir -p "$WORK_DIR" "$RESULTS_DIR" /home/dnanexus/out/out

echo "=== Contingency Runner ==="
echo "Phenotype:  $PHENOTYPE"
echo "Types:      $TYPES"
echo "Parallel:   $N_PARALLEL"
echo "Pheno type: $PHENO_TYPE"
echo ""

# ---- Step 1: Extract phenotype ----
# Fast path: reuse a previously extracted phenotype TSV if one exists in the
# project. Slow path: extract from the parquet archives.
echo "--- Step 1: Phenotype extraction ---"

# Use the TOOLS_MNT constant rather than re-spelling ${DX}/results/tools,
# so the tools location is defined in exactly one place.
EXISTING_PHENO="${TOOLS_MNT}/pheno_extracts/${PHENOTYPE}_pheno.tsv"
if [[ -f "$EXISTING_PHENO" ]]; then
    echo "Found existing phenotype extract, copying..."
    cp "$EXISTING_PHENO" "$PHENO_TSV"
else
    echo "Extracting phenotype from parquet archives..."
    pip install pyarrow -q  # extract_phenotype.py reads parquet; worker image may lack pyarrow
    python3 "${TOOLS_MNT}/extract_phenotype.py" \
        --parquet-tar "${DX}/results/pheno/applets-resources/clinical_phenotypes_parquet.tar.gz" \
        --parquet-tar "${DX}/results/pheno/applets-resources/prescription_phenotypes_parquet.tar.gz" \
        --parquet-tar "${DX}/results/pheno/applets-resources/assessment_centre_phenotypes_parquet.tar.gz" \
        --samples "${DX}/data/cohorts/train_set.txt" \
        --phenotype "${PHENOTYPE}" \
        --phenotype-type "${PHENO_TYPE}" \
        --output "${PHENO_TSV}"
fi

# Count data rows (skip the header line).
N_SAMPLES=$(tail -n +2 "$PHENO_TSV" | wc -l)
echo "Phenotype samples: $N_SAMPLES"
echo ""

# ---- Step 2: Define variant sources ----
echo "--- Step 2: Defining variant sources ---"

# Global registry of variant sources; each entry is a pipe-delimited spec:
# "base_path|variant_type|file_extension|group".
declare -a TASKS=()

# Register one variant source in TASKS.
# Arguments: $1 base path, $2 variant type, $3 file extension, $4 group name.
add_task() {
    local spec
    printf -v spec '%s|%s|%s|%s' "$1" "$2" "$3" "$4"
    TASKS+=("$spec")
}

# HLA hardcalls (single chr6 VCF)
add_task "hla/chr6_hla" hardcall ".vcf.gz" hla

# SNV hardcalls, autosomes + X. chrX uses the hemizygous->homozygous recoded
# VCF, which lives under a different path. ({1..22} replaces $(seq 1 22):
# same values, no subshell fork.)
for chr in {1..22} X; do
    if [[ "$chr" == "X" ]]; then
        add_task "snp/chrX/hemizygous_to_homozygous/chrX_snv_homozygous" hardcall ".vcf.gz" snv
    else
        add_task "snp/chr${chr}/chr${chr}_snv" hardcall ".vcf.gz" snv
    fi
done

# cypmicro hardcalls — only the listed chromosomes have sources
for chr in 2 3 4 6 7 8 10 11 12 14 15 19 20 22; do
    add_task "cypmicro/chr${chr}_cypmicro" hardcall ".vcf.gz" cypmicro
done

# cypdosage (dosage format) — only the listed chromosomes have sources
for chr in 1 7 10 12 13 19 22; do
    add_task "cyp2/chr${chr}_cypdosage" dosage ".vcf.gz" cypdosage
done

# WES loss-of-function hardcalls (.vcf.bgz), all chromosomes + X
for chr in {1..22} X; do
    add_task "wes/lof/chr${chr}_lof" hardcall ".vcf.bgz" lof
done

# WES missense/MPC dosages (.vcf.bgz), all chromosomes + X
for chr in {1..22} X; do
    add_task "wes/missense/chr${chr}_mpc" dosage ".vcf.bgz" mpc
done

# CNV (hardcalls_down_up), genes and regions subtypes per chromosome
for chr in {1..22} X; do
    for subtype in genes regions; do
        add_task "hardcalls_down_up/chr${chr}_cnv_${subtype}_dosage_to_hardcall" hardcall ".vcf.bgz" cnv
    done
done

# Keep only the sources whose group appears in the comma-separated TYPES list.
IFS=',' read -ra ALLOWED_TYPES <<< "$TYPES"

# is_type_allowed GROUP — exit status 0 iff GROUP was requested via TYPES.
is_type_allowed() {
    local candidate="$1" allowed
    for allowed in "${ALLOWED_TYPES[@]}"; do
        [[ "$allowed" == "$candidate" ]] && return 0
    done
    return 1
}

declare -a FILTERED_TASKS=()
for task_spec in "${TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    if is_type_allowed "$group"; then
        FILTERED_TASKS+=("$task_spec")
    fi
done

echo "Total sources: ${#FILTERED_TASKS[@]}"
echo ""

# ---- Step 3: Run contingency_all.py in parallel ----
echo "--- Step 3: Running contingency computations (${N_PARALLEL} parallel) ---"

# Check whether a base_path has companion pgen files (enables plink2 mode).
# Prints "true" or "false" on stdout — a string result, not an exit status,
# because callers interpolate it with $(...).
has_pgen() {
    case "$1" in
        hardcalls_down_up/*)
            # CNV hardcalls: VCF only, no pgen companions
            echo "false"
            ;;
        *hemizygous*)
            # chrX hemizygous VCF has no pgen files — use bcftools mode
            echo "false"
            ;;
        *)
            echo "true"
            ;;
    esac
}

# Run contingency_all.py for one variant source.
# Arguments:
#   $1 - base_path, relative to ${DX}/results/gemo (no extension)
#   $2 - variant type: "hardcall" or "dosage"
#   $3 - file extension of the VCF (".vcf.gz" or ".vcf.bgz")
# Writes its result to ${RESULTS_DIR}/<sanitized>.tsv; every line of the
# child's combined stdout/stderr is prefixed with "[<sanitized>] " so
# interleaved logs from parallel tasks stay attributable.
run_one_task() {
    local base_path="$1" vtype="$2" ext="$3"
    # Flatten path separators so each source maps to one flat result file.
    local sanitized="${base_path//\//_}"
    local output_path="${RESULTS_DIR}/${sanitized}.tsv"
    local vcf_mnt="${DX}/results/gemo/${base_path}${ext}"
    local pfile_mnt="${DX}/results/gemo/${base_path}"

    # plink2 mode requires both pgen companions on disk AND hardcall data;
    # anything else (dosage data, CNV, chrX hemizygous) reads the VCF directly.
    if [[ "$(has_pgen "$base_path")" == "true" && "$vtype" == "hardcall" ]]; then
        # plink2 mode
        python3 "${TOOLS_MNT}/contingency_all.py" \
            --use-plink2 \
            --pfile-prefix "${pfile_mnt}" \
            --phenotype-tsv "${PHENO_TSV}" \
            --variant-type hardcall \
            --vcf-source "${base_path}" \
            --phenotype-type "${PHENO_TYPE}" \
            --output "${output_path}" 2>&1 | sed "s/^/[${sanitized}] /"
    else
        # bcftools mode
        python3 "${TOOLS_MNT}/contingency_all.py" \
            --vcf "${vcf_mnt}" \
            --phenotype-tsv "${PHENO_TSV}" \
            --variant-type "${vtype}" \
            --vcf-source "${base_path}" \
            --phenotype-type "${PHENO_TYPE}" \
            --output "${output_path}" 2>&1 | sed "s/^/[${sanitized}] /"
    fi
}

# Launch tasks with semaphore-based parallelism: keep at most N_PARALLEL
# background jobs in flight, reaping one finished job before each launch
# beyond that limit.
RUNNING=0
for task_spec in "${FILTERED_TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    sanitized="${base_path//\//_}"

    echo "LAUNCHING: ${sanitized}"
    run_one_task "$base_path" "$vtype" "$ext" &

    # `wait -n` (bash 4.3+) returns when ANY background job finishes. The
    # `2>/dev/null || true` deliberately swallows both a failed job's exit
    # status and the usage error on bash without -n support; task failures
    # are detected later in the merge step by the absence of the per-source
    # result file.
    # NOTE(review): on bash < 4.3 this silently degrades to unbounded
    # parallelism — confirm the worker image ships bash >= 4.3.
    RUNNING=$((RUNNING + 1))
    if [[ $RUNNING -ge $N_PARALLEL ]]; then
        wait -n 2>/dev/null || true
        RUNNING=$((RUNNING - 1))
    fi
done

# Wait for all remaining tasks. A bare `wait` blocks until every child has
# exited and returns 0 regardless of the jobs' own statuses, so it does not
# trip `set -e` here.
echo ""
echo "Waiting for remaining tasks to complete..."
wait

echo ""
echo "--- Step 4: Merging results ---"

# Write the merged header. printf handles the \t escapes portably, unlike
# the non-portable `echo -e`.
printf 'VARIANT_ID\tVCF_SOURCE\tFORMAT_TYPE\tCAT1_NAME\tCAT2_NAME\tCAT1_CASE\tCAT2_CASE\tCAT1_CTRL\tCAT2_CTRL\tTOTAL_CASE\tTOTAL_CTRL\tMIN_CELL\tEXTRA\n' > "$OUTPUT"

# Append each per-source result minus its header; a missing result file
# means that source's background task failed.
SUCCEEDED=0
FAILED=0
for task_spec in "${FILTERED_TASKS[@]}"; do
    IFS='|' read -r base_path vtype ext group <<< "$task_spec"
    sanitized="${base_path//\//_}"
    result_file="${RESULTS_DIR}/${sanitized}.tsv"

    if [[ -f "$result_file" ]]; then
        # Single pass over the file: strip the header, append the data rows
        # to OUTPUT via tee -a, and count them — instead of reading twice.
        n_lines=$(tail -n +2 "$result_file" | tee -a "$OUTPUT" | wc -l)
        echo "  OK: ${sanitized} (${n_lines} variants)"
        SUCCEEDED=$((SUCCEEDED + 1))
    else
        echo "  MISSING: ${sanitized}"
        FAILED=$((FAILED + 1))
    fi
done

echo ""
echo "=== MERGE COMPLETE ==="
echo "Succeeded: $SUCCEEDED / ${#FILTERED_TASKS[@]}"
echo "Failed:    $FAILED / ${#FILTERED_TASKS[@]}"
TOTAL_VARIANTS=$(tail -n +2 "$OUTPUT" | wc -l)
echo "Total variants: $TOTAL_VARIANTS"
echo "Output: $OUTPUT"

if [[ $TOTAL_VARIANTS -gt 0 ]]; then
    echo ""
    echo "Preview (first 10 rows):"
    head -11 "$OUTPUT" | column -t -s $'\t'   # 11 lines = header + 10 data rows
fi
