
    5i;=                         d Z ddlmZmZmZmZ ddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlmZ ddlmZ ddZddZd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd ZddZy)a  
This module provides support for file download and upload. It calculates the
   location of the input and output directories. It also has a utility for parsing
   the job input file ('job_input.json').

We use the following shorthands
   <idir> == input directory     $HOME/in
   <odir> == output directory    $HOME/out

A simple example of the job input, when run locally, is:

{
    "seq2": {
        "$dnanexus_link": {
            "project": "project-1111",
            "id": "file-1111"
        }
    },
    "seq1": {
        "$dnanexus_link": {
            "project": "project-2222",
            "id": "file-2222"
        }
    }
    "blast_args": "",
    "evalue": 0.01
}

The first two elements are files {seq1, seq2}, the other elements are
{blast_args, evalue}. The files for seq1,seq2 should be saved into:
<idir>/seq1/<filename>
<idir>/seq2/<filename>

An example for a shell command that would create these arguments is:
    $ dx run coolapp -iseq1=NC_000868.fasta -iseq2=NC_001422.fasta
It would run an app named "coolapp", with file arguments for seq1 and seq2. Both NC_*
files should be the names of files in a DNAnexus project (and should be resolved to their
file IDs by dx). Subsequently, after dx-download-all-inputs is run,
file seq1 should appear in the execution environment at path:
    <idir>/seq1/NC_000868.fasta

File Arrays

{
    "reads": [{
        "$dnanexus_link": {
            "project": "project-3333",
            "id": "file-3333"
        }
    },
    {
        "$dnanexus_link": {
            "project": "project-4444",
            "id": "file-4444"
        }
    }]
}

This is a file array with two files. Running a command like this:
    $ dx run coolapp -ireads=A.fastq -ireads=B.fasta
will download into the execution environment:
<idir>/reads/A.fastq
             B.fastq

    )print_functionunicode_literalsdivisionabsolute_importN   )
basestring)DXErrorc                     | | }nt         j                  j                  d      }t         j                  j	                  |d      }|S )z
    :param job_homedir: explicit value for home directory, used for testing purposes
    :rtype: string
    :returns: path to input directory

    Returns the input directory, where all inputs are downloaded
    HOMEinosenvirongetpathjoin)job_homedirhome_diridirs      |/home/marpiech/ifpan-abm-pgxpred/analysis/marpiech-gwas-test/venv/lib/python3.12/site-packages/dxpy/utils/file_load_utils.pyget_input_dirr   b   s:     ::>>&)77<<$'DK    c                     | | }nt         j                  j                  d      }t         j                  j	                  |d      }|S )z
    :param job_homedir: explicit value for home directory, used for testing purposes
    :rtype: string
    :returns: path to output directory

    Returns the output directory, where all outputs are created, and
    uploaded from
    r   outr   )r   r   odirs      r   get_output_dirr   r   s:     ::>>&)77<<%(DKr   c                      t         j                  j                  d      } t         j                  j	                  | d      S )z>
    :rtype: string
    :returns: path to input JSON file
    r   zjob_input.jsonr   r   s    r   get_input_json_filer      s,    
 zz~~f%H77<<"233r   c                      t         j                  j                  d      } t         j                  j	                  | d      S )z?
    :rtype: string
    :returns: Path to output JSON file
    r   zjob_output.jsonr   r   s    r   get_output_json_filer!      s,    
 zz~~f%H77<<"344r   c                      t               } 	 t        j                  |        y# t        $ r)}|j                  t        j
                  k(  rn Y d}~yd}~ww xY w)z| Warning: this is not for casual use.
    It erases the output json file, and should be used for testing purposes only.
    N)r!   r   removeOSErrorerrnoENOENT)r   es     r   rm_output_json_filer(      sG      !D
		$ 77ell" s   " 	AAAc                     t         j                  j                  |       st        j                  |        yt         j                  j	                  |       rt        d| z        y)zl
    :param path: path to directory to be created

    Create a directory if it does not already exist.
    z9Path %s already exists, and it is a file, not a directoryN)r   r   existsmkdirisfile	Exception)r   s    r   
ensure_dirr.      sH     77>>$
 77>>$WZ^^__  r   c                 j    ddg}| |v rt        dj                  |             | j                  dd      S )a  
    :param fname: the basename of a file (e.g., xxx in /zzz/yyy/xxx).
    :returns: a valid unix filename
    :rtype: string
    :raises: DXError if the filename is invalid on a Unix system

    The problem being solved here is that *fname* is a python string, it
    may contain characters that are invalid for a file name. We replace all the slashes with %2F.
    Another issue, is that the user may choose an invalid name. Since we focus
    on Unix systems, the only possibilies are "." and "..".
    .z..zInvalid filename {}/z%2F)r	   formatreplace)fnamebad_filenamess     r   make_unix_filenamer6      s=     $KM+2259::==e$$r   c                 r    t        | j                               D ci c]  \  }}||vs|| c}}S c c}}w N)listitems)dict_	excl_keyskvs       r   filter_dictr?      s0    !%++-0GTQAY4FAqDGGGs   33c                 r   	
  fd} |       }t        j                  t              g 

fd		
fd}t        |j                               D ])  \  }}t	        |t              r
 |||         	|d|       + t        |j                               D ci c]  \  }}|vs|| }}}
|fS c c}}w )ag  Extract list of files, returns a set of directories to create, and
    a set of files, with sources and destinations. The paths created are
    relative to the input directory.

    Note: we go through file names inside arrays, and create a
    separate subdirectory for each. This avoids clobbering files when
    duplicate filenames appear in an array.
    c                  t    t              5 } t        j                  |       }|cd d d        S # 1 sw Y   y xY wr8   )openjsonload)fh	job_inputjob_input_files     r   get_input_hashz/get_job_input_filenames.<locals>.get_input_hash   s2    .! 	R		"I	 	 	s   .7c                    t        j                  |      sy t        j                  |      }t        |t         j                        sy t        |j                        }| }| t        j                  j                  ||      }|    j                  t        j                  j                  ||      ||j                  d       j                  |       y )N)	trg_fnamehandlersrc_file_id)dxpy	is_dxlinkget_handler
isinstanceDXFiler6   namer   r   r   appendid)inamesubdirvaluerK   filenametrg_dirdirsfiless         r   add_filez)get_job_input_filenames.<locals>.add_file   s    ~~e$""5)'4;;/%gll3ggll7F3Ge"'',,w*I(/,3JJ8 	9 	Gr   c                     t        |      }|dk(  ry t        t        |dz
              }j                  |        t        |      D ])  \  }}t        |      j	                  |      } | ||       + y )Nr      )lenstrrS   	enumeratezfill)	
input_namelinks	num_files
num_digitsilinkrV   r\   rZ   s	          r   add_file_arrayz/get_job_input_filenames.<locals>.add_file_array   sk    J	>Y]+,
J ' 	/GAtV\\*-FZ.	/r   N)collectionsdefaultdictr9   r:   rP   )rG   rH   rF   ri   rc   rW   keyval	rest_hashr\   rZ   r[   s   `        @@@r   get_job_input_filenamesro      s      I##D)ED,/ ")//"34 .
EeT":u-Zu-. +/y/@*AVhc3SPUEUcVIV	!! Ws   B3&B3c                  b   d} dt         j                  v rit        j                  t        j                        }|d   dk(  rt        j                  |j                  d|j                  d                  }d|v re|d   } n_dt         j                  v rMt         j                  d   }t        |      5 }t        j                  |      }|j                  d      } ddd       | i S d	 }i }| D ]8  }|d
   }	d|v sg ||	<   |d   D ]  }
 ||
      s||	   j                  |
       ! : |S # 1 sw Y   QxY w)z Extract the inputSpec patterns, if they exist -- modifed from dx-upload-all-outputs

    Returns a dict of all patterns, with keys equal to the respective
    input parameter names.
    N	DX_JOB_IDfunctionmainappapplet	inputSpecDX_TEST_DXAPP_JSONc                 
    d| v S )N* )patterns    r   is_legal_patternz1get_input_spec_patterns.<locals>.is_legal_pattern/  s    g~r   rR   patterns)
r   r   rM   describeJOB_IDr   rB   rC   rD   rS   )
input_specjob_descdescpath_to_dxapp_jsonfd
dxapp_jsonr|   patterns_dictspecrR   ps              r   get_input_spec_patternsr     s7    Jbjj ==-J6)==eX\\(5K!LMDd"!+.
		+ZZ(<=$% 	52J#4J	5 	M 2F|"$M$*% 2#A&!$'..q12	2 +	5 	5s   .'D%%D.c                 H    | |S || S t        |      t        |       k  r|S | S r8   )r_   )r   qs     r   choose_shorter_stringr   =  s/    yy
1vAHr   c                 $   t        |       \  }}}t               fd}d }t        j                  |      }t	        |      }t        |j                               D ]  \  }	}
|
D ]  }|d   }t        j                  j                  |      } |||	      }||	   }|d   j                  |d          |d   j                  |       |d   j                  |       |d   j                  t        j                  j                  ||               ||fS )aO  
    This function examines the input file, and calculates variables to
    instantiate in the shell environment. It is called right before starting the
    execution of an app in a worker.

    For each input key, we want to have
    $var
    $var_filename
    $var_prefix
       remove last dot (+gz), and/or remove patterns
    $var_path
       $HOME/in/var/$var_filename

    For example,
    $HOME/in/genes/A.txt
                   B.txt

    export genes=('{"$dnanexus_link": "file-xxxx"}' '{"$dnanexus_link": "file-yyyy"}')
    export genes_filename=("A.txt" "B.txt")
    export genes_prefix=("A" "B")
    export genes_path=("$HOME/in/genes/A.txt" "$HOME/in/genes/B.txt")

    If there are patterns defined in the input spec, then the prefix respects them.
    Here are several examples, where the patterns are:
       *.bam, *.bwa-index.tar.gz, foo*.sam, z*ra.sam

    file name                prefix     matches
    foo.zed.bam              foo.zed    *.bam
    xxx.bwa-index.tar.gz     xxx        *.bwa-index.tar.gz
    food.sam                 food       foo*.sam
    zebra.sam                zebra      z*ra.sam
    xx.c                     xx
    xx.c.gz                  xx

    The only patterns we recognize are of the form x*.y. For example:
      legal    *.sam, *.c.py,  foo*.sam,  a*b*c.baz
      ignored  uu.txt x???.tar  mon[a-z].py
    c           	      j   d }j                  |      }|L|D ]G  }t        j                  | |      s|j                  d      \  }}}t        || d t	        |              }I ||S t
        j                  j                  |       }|d   dk(  r"t
        j                  j                  |d         }|d   S )Nry   r^   z.gzr   )r   fnmatch
rpartitionr   r_   r   r   splitext)	basenamerl   best_prefixr}   r{   _right_piecepartsr   s	           r   
get_prefixz%analyze_bash_vars.<locals>.get_prefixr  s     $$S)# c??8W5(/(:(:3(?%Aq+"7XN`PST_P`O`Ea"bKc " GG$$X.EQx5 ((q28Or   c                      g g g g dS )N)rK   r   prefixr   rz   rz   r   r   factoryz"analyze_bash_vars.<locals>.factory  s    2"bIIr   rJ   rK   r   r   r   )ro   r   rj   rk   r   r9   r:   r   r   r   rS   r   )rG   r   r   file_entriesrn   r   r   file_key_descsrel_home_dirrl   entriesentryrX   r   r   k_descr   s                   @r   analyze_bash_varsr   G  s   N "9!HA|Y+-M"J ,,W5N -L\//12 	HW 	HE[)Hww''1H#.F#C(F9$$U9%56:%%h/8##F+6N!!"'',,|X"FG	H	H 9$$r   c                   
 t        | |      \  }}d 

fd}i fd}t        |j                               D ]  \  }} || ||              t        |j                               D ]V  \  }	} ||	 ||d                 ||	dz    ||d                 ||	dz    ||d                 ||	d	z    ||d
                X S )a  
    :param job_input_file: path to a JSON file describing the job inputs
    :param job_homedir: path to home directory, used for testing purposes
    :param check_name_collision: should we check for name collisions?
    :return: list of lines
    :rtype: list of strings

    Calculates a line for each shell variable to instantiate.
    If *check_name_collision* is true, then detect and warn about
    collisions with essential environment variables.
    c                    d }t        | t              r| }nXt        | t        j                        r)t	        j
                  t        j                  |             }nt	        j
                  |       }t        j                  |      S r8   )	rP   r   rM   rQ   rC   dumpsdxlinkshlexquote)elemresults     r   string_of_elemz%gen_bash_vars.<locals>.string_of_elem  sX    dJ'Fdkk*ZZD 12FZZ%F{{6""r   c                     t        | t              r6dj                  | D cg c]
  } |       c}      }dj                  |      S  |       S c c}w )N z( {} ))rP   r9   r   r2   )rm   vitemstringr   s      r   string_of_valuez&gen_bash_vars.<locals>.string_of_value  sK    c4 XX#F~e4FGF??6**!#&& Gs   Ac                     ru| t         j                  vr
| vr|| <   yt        j                  j	                  t
        j                  j                  j                  dj                  |             dz          y|| <   y)zW In the absence of a name collision, create a line describing a bash variable.
        z?Creating environment variable ({}) would cause a name collision
N)
r   r   sysstderrwriterM   utilsprintingfillr2   )rl   rm   check_name_collisionvar_defs_hashs     r    gen_text_line_and_name_collisionz7gen_bash_vars.<locals>.gen_text_line_and_name_collision  sq      "**$M)A%(c"

  !4!4!9!9U\\]`a""  "%M#r   rK   _namer   _prefixr   _pathr   )r   r9   r:   )rG   r   r   r   rn   r   r   rl   r   file_keyr   r   s     `       @@r   gen_bash_varsr     s     !2.+ NNI#' M% )//+, E	T(od.CDE~3356 \$(?4	?3ST(G);_TR\M]=^_(I)=tT\~?^_(G);_TRX\=Z[	\ r   r8   )NT)__doc__
__future__r   r   r   r   rC   r   r   r   r   rj   r%   rM   compatr   
exceptionsr	   r   r   r   r!   r(   r.   r6   r?   ro   r   r   r   r   rz   r   r   <module>r      s{   "@D S R   	  
       "45`%(HE"P'VJ%Z8r   