o
    h
K                     @   sl  d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efddZde
j fddZ!e" dd Z#e" de$fddZ%de$fddZ&e" de$fddZ'e"ddd Z(de$fddZ)eddG d d! d!Z*G d"d# d#eZ+dS )$    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   dt tttf fdd}|S )Nreturnc                 S   s0   | j j}|j j}||ksJ d|dkrdS dS )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )ZscalarZprimitive_bitwidth)Zlhs_typeZrhs_typeZlhs_bitwidthZrhs_bitwidth r   e/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibility   s   z-min_dot_size.<locals>.check_dot_compatibility)r   int)r   r   r   r   r   min_dot_size   s   	r   r   c                   C   s   t jjS N)r	   r   ptxasr   r   r   r   	get_ptxas!   s   r   c                  C   s0   t jj} | d ur
| S tt jdgd}|S )Nz	--versionutf-8)r	   r   Zmock_ptx_version
subprocesscheck_outputr   pathdecode)Zmock_verversionr   r   r   get_ptxas_version%   s
   r%   c                 C   sr   t | tsJ tt| d\}}|dkr#|dk rd| S d| d S |dkr+d| S |dkr3d	| S td
|  )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P         F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr   splitRuntimeError)cuda_versionmajorminorr   r   r   ptx_get_version.   s   r7   archc                 C   s"   | j }|d u rt j}t|}|S r   )ptx_versionr   r$   r7   )optionsr8   r9   r4   r   r   r   get_ptx_version_from_optionsA   s
   r;   c                 C   s"   t | |}td|}d| }|S )NV   z+ptx)r;   min)r:   r8   r9   Zllvm_ptx_versionfeaturesr   r   r   get_featuresI   s   


r?   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r"   fr   r   r   	file_hashW   s   $rG   
capabilityc                 C   s   | dkrdnd}d|  | S )NZ   a Zsm_r   )rH   suffixr   r   r   sm_arch_from_capability]   s   rM   T)frozenc                   @   s.  e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZe	e ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"d# Z d$d% Z!dS )&CUDAOptions   	num_warpsr*   num_ctas   
num_stagesNmaxnreg)r*   r*   r*   cluster_dimsr9   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)Zfp8e5fp8e4b15supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)r_   Ztf32x3Zieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowr8   c                 C   s   t tjd }| jd u ri nt| j}|dd s%tjjp"t	|d |d< t
| dt|  | jdkr?| j| jd @ dksCJ dd S )Nlib	libdevicezlibdevice.10.bcrc   r   r*   znum_warps must be a power of 2)r   __file__parentrc   dictgetr	   r   Zlibdevice_pathr0   object__setattr__tupleitemsrQ   )selfZdefault_libdirrc   r   r   r   __post_init__}   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S r   )rG   ).0kvr   r   r   	<genexpr>   s    z#CUDAOptions.hash.<locals>.<genexpr>rc   _c                 S   s   g | ]\}}| d | qS )-r   )rt   namevalr   r   r   
<listcomp>   s    z$CUDAOptions.hash.<locals>.<listcomp>r   )
rl   __dict__rp   sortedjoinrq   rB   rC   encoderE   )rr   Z	hash_dictkeyr   r   r   hash   s   
zCUDAOptions.hash)"__name__
__module____qualname__rQ   r   __annotations__rR   rT   rU   r   rV   rp   r9   rW   r0   rX   rY   boolrZ   r[   r]   r   r^   r`   ra   rb   rc   rl   rd   rf   rg   r8   rs   r   r   r   r   r   rO   c   s0   
 
rO   c                       s   e Zd ZedefddZdd ZdefddZdedd	f fd
dZ	de
fddZdd Zdd Zdeeef fddZdd Zedd Zedd Zdd Zdd Zdd Zd d! Zd"d# Ze d$d% Z  ZS )&CUDABackendr   c                 C   s
   | j dkS )Nre   )backend)r   r   r   r   supports_target   s   
zCUDABackend.supports_targetc                 C   s0   d}t ||}|std| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r*   )re	fullmatch
ValueErrorr   group)rr   r8   patternmatchr   r   r   _parse_arch   s
   zCUDABackend._parse_archr   c                 C   s   |  |j}d| S )Ncuda:)r   r8   )rr   r:   rH   r   r   r   get_target_name      
zCUDABackend.get_target_nameNc                    s   t  | d| _d S )Ncubin)super__init__Z
binary_ext)rr   r   	__class__r   r   r      r   zCUDABackend.__init__c                    s   dt jjpd| jj i}| fddtj D  t	| 
|d }d|vr?ttj}|dkr7|d tt||d< d|vrK|d	krKd
|d< d|vrUt jj|d< |d	kr[dnd|d< tdi |S )Nr8   smc                    s*   i | ]}| v r | d ur| | qS r   r   )rt   ru   optsr   r   
<dictcomp>   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>r]   Y   Zfp8e4nvr^   rI   )r\   rY   i   @r   rb   r   )r	   ZruntimeZoverride_archr   r8   updaterO   __dataclass_fields__keysr   r   setr]   addrp   r~   languageZdefault_fp_fusion)rr   r   argsrH   r]   r   r   r   parse_options   s   

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r*      )rQ   rR   sharedrV   )rr   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr|jn|jt	| j
d}|S )Nr   r)   )Zconvert_custom_typesr   )triton.language.extra.cudar   extrare   r   r   r8   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70r   r   )rr   r:   re   rH   Zcodegen_fnsr   r   r   get_codegen_implementation   s   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )ri   ztriton.language.extra.libdevice)r   ri   )rr   ri   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s   t | d S r   )r   load_dialects)rr   ctxr   r   r   r      s   zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| |d dk r"tj	| tj
| tj| tj| tj| tj| tj| ||  | S )Nr-   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttirZadd_rewrite_tensor_pointerZ(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizerZadd_combineZadd_reorder_broadcastadd_cseadd_symbol_dceZadd_loop_unrollrun)modr   optrH   pmr   r   r   	make_ttir   s   
zCUDABackend.make_ttirc                 C   sv  |j d ur| dt| j|j  t }|jd ur.|jd |_	|jd |_
|jd |_t| j}| }tj|d| |jd|j tj| |d dkrYtj| tjj|| tj| tj| tj| tj| tj||d	k tjj| tj| |d d
v rtj| tj | tj!| tj | tj"| tjj#$||j%| tj&||j% tj'| tj(||j%| n_|d dkr.tj| tj | tj!| tj)| tj*| tjj+| tj&||j% tj'| tj,||j% tj(||j%| tj"| tjj-| ntj!| tj | tj| tj.| tj||d	k tj/| tjj0| tj| tjj1| tj2| tj3| tj| tj4| |d dkrtjj5| tjj6| tj7| tj | |8|  |j	|j
|jf|d< | 9 }||d< | S )Nzttg.maxnregr   r*   r   r   r   r-   r   r)   )r   r   r   rV   tensordesc_meta):rU   Zset_attrr   builderr   Zget_int32_attrr   ZClusterInforV   ZclusterDimXZclusterDimYZclusterDimZr   r   r   r   Zadd_convert_to_ttgpuirrQ   rR   ttgpuirZadd_coalesceZadd_f32_dot_tc	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsZ add_optimize_descriptor_encodingadd_loop_aware_cseZadd_fuse_nested_loopsr   r   Zadd_triton_licm add_combine_tensor_select_and_ifZhopperZadd_hopper_warpspecrT   Zadd_assign_latenciesZadd_schedule_loopsZadd_pipelineZadd_optimize_accumulator_initZadd_hoist_tmem_allocZadd_promote_lhs_to_tmemZadd_warp_specializeZadd_remove_tmem_tokensZadd_prefetchZadd_coalesce_async_copyZadd_optimize_tmem_layoutsZadd_interleave_tmemZadd_reduce_data_duplicationZadd_reorder_instructionsr   Zadd_tma_loweringZadd_fence_insertionadd_sccpr   get_tensordesc_metadata)r   r   r   rH   Zcluster_infor   Zdump_enabledr   r   r   r   
make_ttgir   s   


zCUDABackend.make_ttgirc                 C   sn   |}t |j}|  tj| tj| tj	
| tj| tj| || | |d< |S )Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rr   srcr   r:   rH   r   r   r   r   r   	ttgir_opt0  s   
zCUDABackend.ttgir_optc                 C   s   t || jj}|}t|j}|  tjj	
| tj| tj| tj| tj| tjj	| tj| tjj||| tj| tj| tjj	| tjj	| tj| tj| tj| tjjs~tj| || t !  t  }tjj"rt#dt $||}	t%|}
t&|| jj}d}t'  t (|	||
| t)|	 |j*rdd |j*D }t +|	| t ,|	t j- |.d}|d ur||d< |.d|d< |.d	|d
< |.d|d< |.d|d< t/|	}~	~|S )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )rt   rz   r"   r   r   r   r|   h      z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsrQ   z
ttg.sharedr   zttg.tensor_memory_sizeZ	tmem_sizezttg.global_scratch_memory_sizeZglobal_scratch_sizez#ttg.global_scratch_memory_alignmentZglobal_scratch_align)0r;   r   r8   r   r   r   r   r   r   r   Zadd_lower_mmar   r   Zadd_allocate_warp_groupsconvertZadd_scf_to_cfZadd_allocate_shared_memoryZadd_allocate_tensor_memoryZ"add_allocate_global_scratch_memoryZadd_to_llvmirr   r   r   Zadd_nvgpu_to_llvmZadd_warp_specialize_to_llvmr   r	   compilationdisable_line_infoZllvmirZadd_di_scoper   r   Zinit_targetsZenable_asanr3   Z	to_modulerM   r?   Zset_short_ptrZattach_datalayoutZset_nvvm_reflect_ftzrc   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3Zget_int_attrr0   )rr   r   r   r:   rH   r9   r   r   r   Zllvm_modprocr>   triplepathsZtotal_num_warpsretr   r   r   	make_llir?  sd   


zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}t||||g |jd}	t	d|	}
t
|
dks.J |
d |d< |d  d|d  }tjd	d
| |	tjd}	tjdd| |	tjd}	tdd|	}	tjjrltd t|	 |	S )Nr   Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r*   r   rz   r-   r&   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rK   z // -----// NVPTX Dump //----- //)r;   r   r8   rM   r?   r   Ztranslate_to_asmrY   r   findalllensub	MULTILINEr	   r   Z
dump_nvptxprint)rr   r   r   r   rH   r9   r   r   r>   r   namesr   r   r   make_ptx{  s    zCUDABackend.make_ptxc                 C   s|  t  j}tjdddd&}tjdddd}|| |  |jd }tjj	r.dd	gndg}	|j
r6g nd
g}
t|}tjjrEddgng }|jrP|jdng }|g|	|
d||d| |jd|}z%tj|dd|d tj|jrt|j tj|jrt|j W n\ tjy } zOt|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }t| d| dd| dd }~ww t|d}| }W d    n	1 sw   Y  tj|rt| W d    n1 sw   Y  W d    |S W d    |S 1 s7w   Y  |S )NFwz.ptx)deletemoderL   rz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
r@   )r   r"   tempfileNamedTemporaryFilewriteflushrz   r	   r   r   rY   rM   r   Zdisable_ptxas_optrW   r2   r    r   osexistsremoveCalledProcessErrorrA   rD   
returncodesignalSIGSEGVr
   r   )rr   r   r   r   rH   r   fsrcZflogZfbinZ	line_infoZfmadr8   Zdisable_optZptx_extra_optionsZ	ptxas_cmdeZlog_filelogerrorrF   r   r   r   r   
make_cubin  s   






*///zCUDABackend.make_cubinc                    s    j |tjkr  fdd|d<  fdd|d< n|tjkr/ fdd|d<  fdd|d< fd	d|d
< fdd|d< d S )Nc                        | | S r   )r   r   r   rH   r:   rr   r   r   <lambda>      z(CUDABackend.add_stages.<locals>.<lambda>r   c                    r   r   )r   r   r   r   r   r     r   Zttgirc                    r   r   )r   r   r   r   r   r     r   c                    r   r   )r   r   r   r   r   r     r   Zllirc                        | | jjS r   )r   r   r8   r   r:   rr   r   r   r     r   ptxc                    r   r   )r   r   r8   r   r   r   r   r     r   r   )r   r8   r   ZTRITONZGLUON)rr   Zstagesr:   r   r   r   r   
add_stages  s   

zCUDABackend.add_stagesc                 C   s   t  }| d| jj S )Nry   )r%   r   r8   )rr   r$   r   r   r   r     s   zCUDABackend.hash)r   r   r   staticmethodr   r   r   r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  	functools	lru_cacher   __classcell__r   r   r   r   r      s,    



J<3r   ),Ztriton.backends.compilerr   r   r   Ztriton._C.libtritonr   r   r   r   Ztritonr	   Ztriton.runtime.errorsr
   dataclassesr   r  typingr   r   r   r   typesr   rB   r   r   r   r   r    pathlibr   r   Z
NvidiaToolr   r  r%   r   r7   r;   r?   rG   rM   rO   r   r   r   r   r   <module>   s<    

*