
    5iL/                        d dl mZ d dlZd dlZd dlZddlmZ ddlmZ dZ	dZ
dZd	 Zd
 Z	 	 	 	 ddZd Zd Zd Zd Zd Zd Zd Zd Zd ZddZ	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZy)    )annotationsN   )extract_utils_basepath)GENOTYPE_TYPES)refno-callz#SELECT\s+(DISTINCT\s+)?(.+?)\s+FROMz((`?(\w+)?`?\.)?`?(\w+)`?( AS `?(\w+)`?)?c                p    | j                  d      xs t        }|D cg c]  }|t        vs| c}S c c}w )zl
    Returns a list of genotype types for the filter that are use in genotype and allele table queries.
    genotype_type)getr   GENOTYPE_ONLY_TYPES)filter_dictgenotype_typesr
   s      /home/marpiech/ifpan-abm-pgxpred/analysis/marpiech-gwas-test/venv/lib/python3.12/site-packages/dxpy/dx_extract_utils/germline_utils.pyget_genotype_typesr      s3    
 !___5GN/=jmViAiMjjjs   33c                    |||d}| j                  d      xs t        |      }g }|j                         D ]!  \  }}||v s||r|j                  |       # |S )zf
    Returns a list of genotype types for the filter that are use in genotype table only queries.
    )r   halfr   r
   )r   listitemsappend)	r   exclude_refdataexclude_halfrefexclude_nocallgenotype_type_exclude_flag_mapr   genotype_only_typesr
   excludeds	            r   get_genotype_only_typesr      sx     !&"
 !___5]>\9]N#A#G#G#I 6xN*x/CH&&}56     c                <    | rdD cg c]  }|| vr|
 c}S g S c c}w )z
    Infer option require all genotypes types to be queried.
    If users wishes to obtain only certain types of genotypes,
    reminder of the types should be filtered out post querying
    )r   zhet-refhomzhet-altr   r    )requested_typestypes     r   %get_types_to_filter_out_when_inferingr#   /   s<     
 L& 	 ! !	! s   c                n    d| d   v r.| j                  d      r| j                  d      |d<   | d   |d<   y y )NCohortBrowserrecordTypesbaseSqlbase_sqlfilters)r   )resppayloads     r   add_germline_base_sqlr,   >   sA    $}--88I"&((9"5GJ!)_	 .r   c                :   d| v r| d   r| d   j                  d      \  }}}}n%d| v r!| d   r| d   j                  d      d d \  }}d}| j                  dd      }j                         rt        |      dt              |fS t	        d      |t              |fS )N	allele_id_locus_id    	sample_idinf)splitr   isdigitintfloat)dchromposr/   altr3   s         r   sort_germline_variantr=   E   s    aAkN{^11#6sAs	qQz]z]((-bq1
sk2&I}}5z2s3xi77<C#y88r   c              #  <  K   t        j                  t        |       j                  d      }|j	                  d      D cg c]  }|j                          c}D ]:  }t        j                  t        |      j                         }|d   |d   |d   f < yc c}w w)zz
    Parses the SELECT list of a SQL statement and returns a generator of named expressions table, column, and alias.
    r1   ,r      N)rematchSELECT_LIST_REGEXgroupr5   stripNAMED_EXPRESSION_REGEXgroups)sqlselect_list_matchxnamed_expressionnamed_expression_matchs        r   "_parse_sql_select_named_expressionrM   Q   s      !2C8>>qA0A0G0G0LM1QWWYM ^!#*@BR!S!Z!Z!\$Q')?)BDZ[\D]]]^Ms   =BBABc                X   t        |       D ci c]  \  }}}|||f }}}}g }|D ]  }t        |      d   }||v r,||   \  }}|j                  dj                  |||             A||v r/|j                  dj                  ||   d   ||   d                t|j                  dj                  |              d	j	                  |      }	t        j                  t        |       j                  d      }
t        j                  t        d
j                  |
|	      |       S c c}}}w )z
    Harmonizes the SELECT list of a SQL statement to include all columns in return_columns. NULL values are used for
    columns not in the SELECT list.
    r   z)`{table}`.`{column}` AS `{return_column}`)tablecolumnreturn_columnz`{table}`.`{column}` AS `ref`r   )rO   rP   zNULL AS `{return_column}`)rQ   z, z#SELECT {distinct}{select_list} FROM)distinctselect_list)
rM   tupler   formatjoinrA   rB   rC   rD   sub)rH   return_columnskwargsrO   rP   aliasselect_infoselect_listsrQ   rS   rR   s              r   &_harmonize_sql_select_named_expressionr]   [   sb   
 GiilFmnn.BeVU55&/)nKnL' am,Q/K''6ME6 K R RY^Z`an !S !p q f$ ? F FVTaMbcdMeNTUbNcdeNf !G !h i  ; B BQ^ B _`a ))L)Kxx)3/55a8H66-44hT_4` % os   D%c                P   t        t        j                  j                  t        d            5 }t        j                  |      }ddd       d}t        j                  ||       j                  d      }t        j                  |d|       } t        | |df      } | S # 1 sw Y   YxY w)z
    Harmonize the SQL statement for genotype table only queries to include columns to UNION with genotype and allele
    table queries. JOIN genotype table to allele table on locus_id to include ref columns values.
    zreturn_columns_genotype.jsonNz6ON\s+`(\w+)`\.`(a_id)`\s+=\s+`(\w+)`\.`(a_id)`\s+WHEREr1   z*ON `\1`.`locus_id` = `\3`.`locus_id` WHEREa_id)r   )openospathrV   r   jsonloadrA   searchrD   rW   r]   )rH   infilegenotype_return_columnsjoin_condition_regexallele_tables        r   harmonize_germline_sqlrj   y   s    
 
bggll13QR	S 4W]"&))F"34 U99137==a@L
&&%'UWZ
[C 16MT`bhSi
jCJ4 4s   BB%c                p    g }| D ].  }i }|D ]  }||v r	||   ||<   d||<    |j                  |       0 |S )zy
    Harmonizes raw query results to include all columns in fields_list. Columns not in fields_list have value None.
    N)r   )resultsfields_listharmonized_resultsresultharmonized_resultfields         r   harmonize_germline_resultsrr      sh      5  	0E+1%=!%(+/!%(		0
 	!!"345 r   c                    t        d | D              }g }|sy|D ]!  \  }}}|j                  d|g|||dgd       # |d   ddd	d
diddigdd|d   d   d   |d   d   d   d|ididS )zu
    Create a payload to query locus_id/ref pairs from the allele table for genotypes missing ref column values.
    c              3  D   K   | ]  }|d    	|d   |d   |d   f  yw)r   Nr0   
chromosomestarting_positionr    ).0rs     r   	<genexpr>z+get_germline_ref_payload.<locals>.<genexpr>   s2     rQRabchaiaqQz]AlOQ7J5KLrs   
  Ninchrstartend	conditionvalues	geno_binsproject_contextFTandr0   allele$locus_idr   
allele$refassay_filtersraw_filtersidnameallele$a_idr   r   r)   r   adjust_geno_binsrR   logicfields	is_cohortr   )setr   )rl   genotype_payload	locus_idsallele_filtersr0   r|   r;   s          r   get_germline_ref_payloadr      s     rV]rrIN' #sj"%C@A
 	 ,,=>!*+L!
 &}5oFtL(7HP!>
 r   c                p    |d   D ci c]  }|d   |d    }}| D ]  }|d   	||d      |d<    yc c}w )zW
    Update genotype results with ref column values from locus_id/ref query result
    rl   r0   r   Nr    )rl   locus_id_refsro   locus_id_ref_maps       r   update_genotype_only_refr      sc     IVV_H`afz*F5M9aa =%=$(
);<u= bs   3c                    |d   dddddiddid	d
iddigdd|d   d   d   |d   d   d   ddg | D cg c]  }|d   |d	   |d	   d c}dgididS c c}w )zZ
    Create a payload to query locus ids from the allele table with a location filter
    r   FTr   r0   r   ru   z
allele$chrrv   z
allele$posr   r   r   r   r   r   r   rz   r{   r   r   r   r    )	locationsr   locations      r   get_germline_loci_payloadr      s    
 ,,=>!*+<( ,/L!	
 &}5oFtL(7HP!%)"$ /8&
 #+ (0'=)12E)F'/0C'D&
% 
$
 (&s   Ac           	         i }| D ](  }t               d|d   |d   |d   |d   ddd||d   <   * |D ]  }||d      d   j                  |d	          ! |S )
z]
    Produces a dictionary with locus_id as key and a set of samples and entry as value.
    Nr0   ru   rv   r   )r.   r0   ru   rv   r   r<   )samplesentryr   r3   )r   add)lociresults_entries	loci_dictlocusr   s        r   _produce_loci_dictr      s     I 
u!!*-#L1%*+>%?U|
(
	%
#$
 ! H%
#$Y/33E+4FGH r   c                    t        ||      }g }|D ]3  }| D ],  }|||   d   vs|j                  d|i||   d   d|i       . 5 ||z   S )a  
    If the result_entries does not contain entry with sample_id of specifific starting_position the the genotype type is either no-call or ref.
    Args:
        samples: list of all samples
        loci: list of information on each loci within the filter  e.g.
            {
            "locus_id": "1_1076145_A_T",
            "chromosome": "1",
            "starting_position": 1076145,
            "ref": "A",
            }
        result_entries: list of results from extract_assay query. e.g.
            {
            "sample_id": "SAMPLE_2",
            "allele_id": "1_1076145_A_AT",
            "locus_id": "1_1076145_A_T",
            "chromosome": "1",
            "starting_position": 1076145,
            "ref": "A",
            "alt": "AT",
            "genotype_type": "het-alt",
            }
        type_to_infer: type to infer either  "ref" or "no-call"
    Returns: list of infered entries with added inferred genotype type and other entries retrieved from result for loci of interest.
    r   r3   r   r
   )r   r   )r   r   result_entriestype_to_inferr   inferred_entriesr   samples           r   infer_genotype_typer   	  s    8 #48I 	 	FYu-i88 ''#V#E*73 (		 ,,,r   c                <    | D cg c]  }||   |vs| c}S c c}w )a3  
    Filters results by key and restricted_values.
    Args:
        results: list of results from extract_assay query. e.g.
            {
            "sample_id": "SAMPLE_2",
            "allele_id": "1_1076145_A_AT",
            "locus_id": "1_1076145_A_T",
            "chromosome": "1",
            "starting_position": 1076145,
            "ref": "A",
            "alt": "AT",
            "genotype_type": "het-alt",
            }
        key: key to filter by
        restricted_values: list of values to filter by
    Returns: list of filtered entries
    r    )rl   keyrestricted_valuesr   s       r   filter_resultsr   4  s$    *  'Ne%*<M*MENNNs   )r!   z	list[str]returnr   )r   
list[dict]r   r   r   dict)
r   r   r   r   r   r   r   strr   r   )rl   r   r   r   r   r   r   r   )
__future__r   rc   ra   rA   filter_to_payloadr   input_validationr   r   rC   rF   r   r   r#   r,   r=   rM   r]   rj   rr   r   r   r   r   r   r   r    r   r   <module>r      s    "  	 	 5 , 
 ; D k&!!	!-	9^<& !H=#L0(-(-#(-5?(-PS(-(-VOO!O6:OOr   