o
    h6                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dl	mZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dlm3Z3 d dl4Z4d dl5Z5d d	l6m7Z7 d d
l8m9Z9 d dl:m;Z; dgZ<e)r'd dlm=Z=m>Z>m?Z? d dl5m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZWmXZX ddlYmZZZ ddl[m\Z\m]Z] g dZ^e*dZ_ej`dddZad d lbmcZc d d!ldmeZe d d"lfmgZg d d#lhmiZi d d$ljmkZk d d%llmmZm d d&lnmoZompZpmqZqmrZrmsZs d d'ltmuZumvZv d d(lwmxZxmyZy dd)lzm{Z{ dd*l|m}Z~ ejd+kZeeZe*d,Zee4je4jf Ze'e+e5jee5jBf  Zd-d.d/Zd0Zd0Zd0Zd1Zd2Zeed @ d kred3ksJ d4dd7d8Zdd<d=ZG d>d? d?e4jZejd@dAG dBdC dCZdddKdLZ	EdddMdNZej`ddOdPZddTdUZddXdYZdd]d^ZddadbZddfdgZ}ddjdkZddodpZddsdtZddwdxZdd{d|Zd}d~ fdddZdddZddddZ		ddddZ					ddddZdddZdddZdddZdddZdddZe/dZe*dd@dZG dd de(e$eef ZdddZdddÄZdddȄZddd̈́Z	ddddԄZdddلZddd܄ZdddZdddZdddZdddZdddZdddZdddZdddZdddZdddZdÐd dZd dlZdĐddZg ZdeĐd< dŐddZŐdĐddZej			@dƐdǐddZeZeZeZːdȐddZ̐dɐddZed3dʐddZG dd de&ZejG d d! d!ZG d"d# d#ZG d$d% d%e҃Zejǐdːd&d'ZG d(d) d)ZG d*d+ d+eՃZej`d̐d͐d.d/Zejΐdΐd0d1Zؐdΐd2d3Z	ddϐd8d9ZڐdАd>d?ZېdѐdAdBZܐdѐdCdDZݐdEdEdFdҐdIdJZސdӐdMdNZߐdԐdRdSZdՐdUdVZdWZdXZg dYZe+ee4jf ZdZeĐd[< d֐d\d]Zej`dאd^d_Zej`dؐd`daZej`dِdcddZdڐdedfZdԐdgdhZdԐdidjZdڐdkdlZdڐdmdnZdېdrdsZ	E	@	E	dܐdݐdxdyZddzd{ZG d|d} d}ZdސddZdސddZdߐddZdddZdddZdddZdddZejǐdddZ	ddddZdddZdddZdddZdddZdddZ dddZejǐdddZdΐddZej`dΐddZej`dʐddZej`dΐddZdΐddZdddZdddZ	dddZ
dddZddŐdƄZddǐdȄZG dɐdʄ dejZddΐdτZddҐdӄZddԐdՄZ	dddِdڄZddܐd݄ZdddZdddZdddZdddZdd~ fdddZdd~ fdddZdddZdddZejG d d dZejǐdddZdddZdddZ dd	d
Z!dddZ"dddZ#dddZ$dddZ%d ddZ&dddZ'dd!d"Z(dd%d&Z)dd+d,Z*dd-d.Z+	ddd5d6Z,dd8d9Z-dd;d<Z.d	d?d@Z/ddAdBZ0ddCdDZ1dEdFdGdHdIdJdJdKZ2dLdM e23 D Z4e5dNZ6d
dOdPZ7ddQdRZ8ddUdVZ9ddWdXZ:ej`ddZd[Z;ejG d\d] d]Z<i Z=d^eĐd_< ddcddZ>e9 Z?deeĐdf< ddgdhZ@ddidjZAddkdlZBe*dmZCe*dnZDG dodp dpeeCeDf ZEe.d@dqdd@dAddudvZFddxdyZG	Eddd}d~ZHG dd dejZIej`dddZJdddZKdddZLdddZMdddZNdddZOdZPdddZQdddZRdS (      )annotationsN)
CollectionIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)DeviceProperties)
OrderedSet)tree_map_onlyZ!activation_quantization_aten_pass)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)ShapeEnv)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelExternKernelOutIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpuTreturnstrc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xr@   r@   [/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>`   s    z get_gpu_type.<locals>.<listcomp>r)   r   r:   )	GPU_TYPESlenpop)Z
avail_gpusZgpu_typer@   r@   rG   get_gpu_type^   s   rL   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRanges)config)ceildivwin32_Tz.cubinz.spv)r:   r<         @      zmust be power of 2nbytesintc                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr)   )ALIGN_BYTES)rd   r@   r@   rG   _align   s   rg   v
sympy.Exprboolc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddZMaxallmap_is_alignedargsaligngcdrf   )rh   r@   r@   rG   rp      s   rp   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )rr   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr)   Tvalueri   r>   Optional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)rk   re   rl   Integerrg   rp   )clsru   r@   r@   rG   eval   s
   z
align.evalN)ru   ri   r>   rv   )__name__
__module____qualname____doc__nargs
is_integerclassmethodrz   r@   r@   r@   rG   rr      s    rr   Tfrozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    re   idzlist[Optional[int]]Zinput_index_mappingZoutput_index_mapping	list[str]Zconstant_namesN)r{   r|   r}   r~   __annotations__r@   r@   r@   rG   r      s   
 r      d   fnCallable[[], Any]warmuprepfloatc              
   C  s   |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|	  |   q)|  t j  |
|d }tdt|| }tdt|| }	t|D ]}|   qYdd	 t|	D }d
d	 t|	D }t jjt jjjgdP}
t j  t|	D ],}|	  ||   t jjd |   W d   n1 sw   Y  ||   qt j  t dd	 t||D }W d   n1 sw   Y  t | }td t|
 jddd tdd	 |
 D }|r|tdd |D d 8 }td| |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        Ar:   dtypedeviceTZenable_timing   r)   c                 S     g | ]	}t jjd dqS Tr   rB   r:   EventrE   _r@   r@   rG   rH          zfp8_bench.<locals>.<listcomp>c                 S  r   r   r   r   r@   r@   rG   rH      r   Z
activitiesZRunCudaModuleNc                 S  s   g | ]	\}}| |qS r@   )elapsed_time)rE   ser@   r@   rG   rH      r   
raw eventsself_device_time_totalZsort_by	row_limitc                 S  s&   g | ]}|j tjkrd |jv r|qS )Zfused_abs_max_0device_typerO   CUDAnamerE   eventr@   r@   rG   rH          
c                 s      | ]}|j V  qd S rw   Zdevice_time_totalr   r@   r@   rG   	<genexpr>       zfp8_bench.<locals>.<genexpr>     @@profiling results: %s ms)rB   r:   synchronizeemptyre   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   Znvtxtensorzipmeanitemlogdebugkey_averagestablerP   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventsr@   r@   rG   	fp8_bench   sh   	




r   c                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt j  t jjt jjjgd}
t|	D ]	}|  |   qtt j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )r   r   r:   r   Tr   r   r)   r   Nr   r   r   r   c                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Syncr   r   r@   r@   rG   rH   7  r   z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   r@   )rE   r   r   Znum_event_per_groupr@   rG   rH   F  s    
zprofiling time breakdown)r   c                 s  r   rw   r   r   r@   r@   rG   r   R  r   z+do_bench_using_profiling.<locals>.<genexpr>r   r   )rB   r:   r   r   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rP   r   rJ   RuntimeError	enumerateZ_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   Zactual_eventsr   r@   r   rG   do_bench_using_profiling  sj   





r   c               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsZMetaZtorchvisionr   Fztorchvision::nms does not exist)Ztorchvision.opsr   rB   _CZ%_dispatch_has_kernel_for_dispatch_keyhasattrrA   opsImportErrorr   r?   )r   r   r@   r@   rG   has_torchvision_roi_alignW  s   
r   r   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
rB   r   r   rk   r?   typer   rM   ZWorkerZcurrent_devicer   Zdevice_interfacer@   r@   rG   decode_deviceg  s   


r   itIterable[sympy.Expr]c                 C  s   t tj| tjjS rw   )	functoolsreduceoperatormulrl   SZOner   r@   r@   rG   sympy_productr     r   seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S rw   r@   )rE   abr@   r@   rG   r   x  s    zsympy_dot.<locals>.<genexpr>)rJ   rl   expandr   r   )r   r   r@   r@   rG   	sympy_dotv  s   r   Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS r@   )r   rD   r@   r@   rG   
<dictcomp>|      zunique.<locals>.<dictcomp>)valuesr   r@   r@   rG   unique{     r   numberUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )rk   rl   ExprrS   sympifyre   r   runtime_ceildiv)r   r  r@   r@   rG   r]     s    
r]   keyOptional[torch.dtype]c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|d'd( t| D  t| t r`| S d)||  S )*Nz*i8.r   rj   i1Z
float8e4nvZfp8e4nvZfloat8e5Zfp8e5Zfloat8e4b15Zfp8e4b15Zfloat8e4b15x4Z
fp8e4b15x4float8_e4m3fnfloat8_e5m2Zfloat8_e8m0fnuu8Zfloat4_e2m1fn_x2r   Zfp16bfloat16Zbf16float32Zfp32float64Zfp64int8i8Zint16Zi16int32Zi32int64Zi64u16u32Zu64)uint8Zuint16Zuint32uint64c                 S  s   i | ]}||qS r@   r@   )rE   rh   r@   r@   rG   r     s    z_type_of.<locals>.<dictcomp>*)r?   splitupdatelistr   rk   )r  Z	dtype_strZtysr@   r@   rG   _type_of  sZ   
r  lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                 C     dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS r@   )rl   r  rE   r   r@   r@   rG   rH     r   z-convert_shape_to_inductor.<locals>.<listcomp>r@   r  r@   r@   rG   convert_shape_to_inductor  s   r#  r   Union[int, torch.SymInt]c                 C  sB   ddl m} t| tr| S t| tjrt| S |jjjj	| ddS )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r)   VN)hint)
virtualizedr&  rk   re   rl   rx   graphsizevars	shape_envZcreate_symintnode)r   r&  r@   r@   rG   convert_to_symint  s   
r,   Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                 C  r   )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                 S  s   g | ]}t |qS r@   )r,  r!  r@   r@   rG   rH         z+convert_shape_to_symint.<locals>.<listcomp>r@   r"  r@   r@   rG   convert_shape_to_symint  s   r0  optorch._ops.OpOverloadc                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S rw   )Z
alias_inforE   r   r@   r@   rG   r         zis_view.<locals>.<genexpr>)any_schema	argumentsr1  r@   r@   rG   is_view  s   r9  c                 C     dS NFr@   )r   r@   r@   rG   <lambda>      r<  user(   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   s~   | j dksdS t| jtjjs| jtju sdS ttjj| j}|tju s(t	|r4t
 fdd| jD S tjj|jv p> |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S rw   )is_pointwise_use)rE   ur?  r@   rG   r     r4  z#is_pointwise_use.<locals>.<genexpr>)r1  rk   targetrB   _ops
OpOverloadr   getitemr   r9  rn   usersTagZ	pointwisetags)r>  r?  rE  r@   rD  rG   rB    s   


rB  rE  r
   rq   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c                   s   t j  g d
 fdd} j| gtt j|||fR  }t| jjdkr5t	| jjd j
d	kr5|f} | t ji  }|fS )Nargtorch.Tensorr>   r(   c                   s    |   dt S )NrP  )appendplaceholderrJ   )rP  gZ
graph_argsr@   rG   add_tensor_arg  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr)   r   Tensor)rP  rQ  r>   r(   )rB   fxZGraphrA  r   rW  rJ   r6  returnsr?   r   outputr&   )rE  rq   rM  rV  nodegmr@   rT  rG   gen_gm_and_inputs  s    

r]  r:   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr   )rM   rC   r   r   r@   r@   rG   r     s   r   modelCallable[..., Any]example_inputsSequence[Any]r   c                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r   rB   Zmanual_seedtimeperf_counterr   )r`  rb  r   r   t0r   resultt1r@   r@   rG   timed  s   

ri  r@   
         ?repeatbaselinec                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]	}t  qS r@   )ri  r   r   rb  r`  r   r@   rG   rH   3  r   z%print_performance.<locals>.<listcomp>z.6f)rB   r   r   Zmedianprintr   )r`  rb  r   rl  rm  r   ZtimingsZtookr@   rn  rG   print_performance*  s   rp  objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S rw   r@   r@   rg  r@   rG   r<  =  r=  z#precompute_method.<locals>.<lambda>N)rA   setattr)rq  rr  r@   rs  rG   precompute_method:  s   ru  methodsr   c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)ru  )rq  rv  rr  r@   r@   rG   precompute_methods@  s   rw  r   r   c                 C  s   t | |kt | |k  S rw   )re   )r   r   r@   r@   rG   cmpF     rx  rF   Union[int, Sequence[int]]sizeSequence[int]c                 C  s:   t | tr
| g| S t| dkrt| | d g| S | S )Nr)   r   )rk   re   rJ   r   )rF   r{  r@   r@   rG   pad_listlikeJ  s
   

r}  tuple[_T, ...]list[_T]c                 C  s&   t | dkrg S d	dd}t| |dS )
Nr   elemr_   r>   r?   c                 S  s0   t | tr| S ddlm} t | |sJ |  S )Nr)   )r8   )rk   r?   	schedulerr8   get_name)r  r8   r@   r@   rG   	sort_funcW  s
   
ztuple_sorted.<locals>.sort_funcr  )r  r_   r>   r?   )rJ   sorted)rF   r  r@   r@   rG   tuple_sortedS  s   
	r  PRV)	covariantc                   @  s$   e Zd ZedddZdddZdS )CachedMethodr   r
   r>   r^  c                 C     d S rw   r@   )r   r@   r@   rG   clear_cacheh     zCachedMethod.clear_cacherq   P.argsrM  P.kwargsr  c                 O  r  rw   r@   selfrq   rM  r@   r@   rG   __call__k  r=  zCachedMethod.__call__N)r   r
   r>   r^  )rq   r  rM  r  r>   r  )r{   r|   r}   staticmethodr  r  r@   r@   r@   rG   r  g  s    r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sl   | j }d| d d| i}td| d  d  d | t| || d }d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        Z_cache_on_selfr  r
   r>   r^  c                   s   t |  rt|   d S d S rw   )r   delattrr  r  r@   rG   r    s   
z"cache_on_self.<locals>.clear_cache)r  r
   r>   r^  )r{   execlstripr   wrapsr  )r   r   ctxwrapperr  r@   r  rG   cache_on_selfo  s$   	r  node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr)   irc                 S  s$   g | ]}t |d r|jr|jjqS )r[  )r   r[  originsrE   r[  r@   r@   rG   rH     s    z%aggregate_origins.<locals>.<listcomp>) r  rk   r  r   r   r   or_r   r0   r  )r  r  r@   r@   rG   aggregate_origins  s   
	r  Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]}|j d krd|jv r|jd dur|jd jjqS )rA  r  N)r1  r   _overloadpacketr{   rE   originr@   r@   rG   rH     s    z)get_fused_kernel_name.<locals>.<listcomp>rB   rA  Zsource_fn_stackr   r)   Zinductor_nodec                 S  s   g | ]
}|j d kr|jqS rA  )r1  r   r  r@   r@   rG   rH     s
    
r   Zfused)r  r  r   r1  r   rk   r?   rR  r{   NotImplementedErrorjoin)r  r  all_originssourcesr  Z	source_fnr@   r@   rG   get_fused_kernel_name  s.   r  r  r,   tuple[str, str]c                   s  t | }dd |D }tt}tt}d  t|rKtdd |D }t|dkrK|d j t dsAdd	 t j	D }| _
|j fd
dd |D ]3}d|jv rk|jd d urkt|jd j}	||	 |j d|jv r|jd d j}	||	 |j qM d urdnd}
|j d|
 dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  q d ur||j d |D ]}||j d|   q|d|fS )Nc                 S  s   g | ]	}|j d kr|qS r  r8  r  r@   r@   rG   rH     r   z'get_kernel_metadata.<locals>.<listcomp>c                 s  r   rw   )r)  )rE   nr@   r@   rG   r     r   z&get_kernel_metadata.<locals>.<genexpr>r)   r   )_inductor_kernel_metadata_node_to_idx_mapc                 S     i | ]\}}||qS r@   r@   )rE   idxr  r@   r@   rG   r     r   z'get_kernel_metadata.<locals>.<dictcomp>c                   s
    j |  S rw   )r  r  Zsingle_graphr@   rG   r<    s   
 z%get_kernel_metadata.<locals>.<lambda>r  r  Z	from_nodezTopologically SortedZUnsorted z Source Nodes: [r  z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  collectionsdefaultdictr  rJ   r   r)  r   r   nodesr  sortr   r?   r  rR  r   commentr  keysr  itemsZformat_node)r  r  r  Zinductor_nodesZfrom_node_dictZoriginal_aten_dictZunique_graphsZnode_to_idx_mapr[  r  Zsort_strmetadataZdetailed_metadataZoriginal_noder  r  r@   r  rG   get_kernel_metadata  sL   





r  initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)r  r   rK   rI  addrR  )r  r  Zdominated_setr[  userr@   r@   rG   dominated_nodes  s   


	r  Sequence[IRNode]dict[str, IRNode]OrderedSet[IRNode]c                   sd   dd l }ddlm  d fdd	fd
d| D }fdd| D }t|jg ||R  S )Nr   r)   r  r  r2   r>   rj   c                   sD   t |  jr| jS t |  jr| jS t |  jo!t |  jS rw   )rk   	TensorBoxdata
StorageBoxr2   Z	Pointwiser  r  is_unrealized_noder@   rG   r    s
   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS r@   r  )rE   valr  r@   rG   rH      r   z"gather_origins.<locals>.<listcomp>c                   r  r@   r  rE   rP  r  r@   rG   rH   !  r   )r  r2   r>   rj   )	itertoolsr  r  r   r   chain)rq   rM  r  Zkwarg_originsZarg_originsr@   r  rG   gather_origins  s   r  exprc                   s@   ddd d fdd	d fd
ddfdd| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    r  ri   r>   rj   c                 S  s(   t | tjot| jdko| jd dkS )N   r   r   )rk   rl   MulrJ   rq   r  r@   r@   rG   is_neg_lead,  s   &zsympy_str.<locals>.is_neg_leadr?   c                   sj   t | tjr1t| jdkr( | jd r(| jd  d| jd jd  S dt| jS | S )Nr  r)   r   z - z + )rk   rl   rm   rJ   rq   r  ro   r  )r  sympy_str_mulr@   rG   sympy_str_add1  s
   (z sympy_str.<locals>.sympy_str_addc                   sB   t | tjr | rd| jd  S dt| jS | S )N-r)   z * )rk   rl   r  rq   r  ro   r  )r  sympy_str_atomr@   rG   r  <  s
   z sympy_str.<locals>.sympy_str_mulc                   sp   t | tjr	| jS t | tjtjfrd |  dS t | tttt	fr4| j
j ddtt| j dS t| S )N()r  )rk   rl   Symbolr   rm   r  rW   rT   rU   rV   funcr{   r  ro   	sympy_strrq   r?   r  )r  r@   rG   r  G  s   "z!sympy_str.<locals>.sympy_str_atomN)r  ri   r>   rj   r  ri   r>   r?   r@   r  r@   )r  r  r  r  rG   r  %  s
   

r  r   ValueRanges[Any]c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr)   r%  Zcurrent_nodeZ
index_expr)
r(  r&  r\   Zcompute_all_boundsrA   interpreterrE  rZ   r[   unknown)r   r&  Zfx_noder@   r@   rG   get_bounds_index_exprT  s   r  prefixc                 C  s   | d dkS )Nr   rr@   )r  r@   r@   rG   prefix_is_reductionb     r  rY   r  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    TintegerZnonnegative)rY   ZSIZErX   )r  r  r@   r@   rG   sympy_index_symbol_with_prefixf  s   r  checkc                 C  s   | st jot jS rw   )r\   Zdebug_index_assertsZassert_indirect_indexing)r  r@   r@   rG   generate_assertr     r  r   c                 C  s    | d dksJ t j| dddS )r  r   r   Tr  )rl   r  r   r@   r@   rG   sympy_index_symbolv  s   r  replacementsdict[sympy.Expr, Any]c                   s,   ddd t |  fd	d
| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedri   replacementUnion[sympy.Expr, str]r>   r  c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nr  )rk   rl   r  r?   r  r   Zis_nonnegative)r  r  r@   r@   rG   	to_symbol  s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS r@   r@   rE   krh   r  r@   rG   r         zsympy_subs.<locals>.<dictcomp>N)r  ri   r  r  r>   r  )rl   r  Zxreplacer  )r  r  r@   r  rG   
sympy_subs  s   

r  ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s:   t | tjpt | tjotdd t|  |  D S )Nc                 s      | ]}t |V  qd S rw   is_symbolicrD   r@   r@   rG   r         zis_symbolic.<locals>.<genexpr>)	rk   rB   r$   rW  r5  r  r  r{  stride)r   r@   r@   rG   r    s    r  c                  G     t dd | D S )Nc                 s  r  rw   r  r3  r@   r@   rG   r     r  z"any_is_symbolic.<locals>.<genexpr>r5  )rq   r@   r@   rG   any_is_symbolic  r   r  r\  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  s   ddl m} tg d}t r|d | jjD ]9}t|j	|v r&|  S tj
jjs@t|j	tjjr@tjjj|j	jv r@|  S |jd }d urR||rR|  S qd S )Nr   )free_unbacked_symbols)z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultZrun_and_save_rng_stateZrun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesr  r   rB   $are_deterministic_algorithms_enabledr  r)  r  r?   rE  	_inductorr\   Zgraph_partitionrk   rF  rG  r   rJ  cudagraph_unsaferK  r   get)r\  r  Zforbidden_setr[  r  r@   r@   rG   %get_first_incompatible_cudagraph_node  s,   r  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphrZ  )nextiterreversedr)  r  r1  )r\  Z	last_noder@   r@   rG   output_node  s   r#  OrderedSet[torch.device]c                 C  s\   | j jdd}tdd |D }t| jd }t|tr|n|f}tdd |D }||B S )NrS  r8  c                 s  s0    | ]}t |jd tjr|jd  jV  qdS r  N)rk   r   r  rB   rW  r   r  r@   r@   rG   r     s    

z"get_all_devices.<locals>.<genexpr>r   c                 s  s>    | ]}t |tjjrt |jd tjr|jd  jV  qdS r%  )rk   rB   rX  r(   r   r  rW  r   r  r@   r@   rG   r     s    

)r)  Z
find_nodesr   r#  rq   rk   tuple)r\  Zplaceholder_nodesZinput_devicesZout_argZout_argsZout_devicesr@   r@   rG   get_all_devices  s   r'  c                  C  s   t tj D ]B} | dsqtj|  }|j D ]+}|drDt||}t|tj	j
jjrD|jD ]}t|tj	j
jjrC|jjj  q1qtj| = qdtjv r_tjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.Ztriton_ztriton.runtime.driver)r  sysmodulesr  
startswith__dict__rA   rk   rB   r  runtimeZtriton_heuristicsZCachingAutotunerZcompile_resultsZTritonCompileResultkernelrunmod__del__r   driveractiveutilsinstancegcZcollect)module_namemZ	attr_namer-  rg  r/  r@   r@   rG   unload_xpu_triton_pyds  s.   








r8  _registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler:  AttributeErrorr9  rR  rq  r@   r@   rG   clear_on_fresh_cache  s   
r>  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r9  r:  r=  r@   r@   rG   clear_caches*  s   
r?  cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              	   #  sP   t   tj|d zztjtjd iX t	d  tj
 dtjtjdi1 dV  t| trXt| dksAJ dtj
rXt}| fd	d
|D  W d   n1 sbw   Y  W d   n1 sqw   Y  |rt rtj rt  tj  fddd W n ty   td   w W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )rB  ZTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonZTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)ospathgetsizer  )rE   f)triton_cache_dirr@   rG   r   N  s    zfresh_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  rH  rL  )inductor_cache_dirr@   rG   r<  ]  s
    zfresh_cache.<locals>.<lambda>)onerrorz(on error, temporary cache dir kept at %s)r?  tempfilemkdtempr   patchdictrG  environr   r   rH  r  rk   rJ   existslistdirr  
is_windowsrB   r<   rC   r8  shutilrmtree	ExceptionrM  )r@  rB  rD  filesr@   )rN  rK  rG   fresh_cache2  sL   




r\  seq	list[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r  reverse)__getitem__r   rJ   r  r"  r  )r]  getterZa_rr@   r@   rG   argsortp  s   rb  r+  r'   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                   sD   d fdd}dd	 t |D }t|t|d
}dd	 |D }|S )Nr   tuple[int, sympy.Expr]r   r>   re   c                   sZ   | \}}|\}}d
 fdd}|||k rdS |||krdS ||k r%dS ||kr+dS d	S )Nr  %Union[bool, torch.SymInt, sympy.Expr]r>   rj   c                   s   t | tr| S  j| ddS )NT)Zsize_oblivious)rk   rj   Zevaluate_exprr  r+  r@   rG   evaluate~  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r)   r   )r  re  r>   rj   r@   )r   r   Za_idxZa_valZb_idxZb_valrg  rf  r@   rG   rx  z  s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS r@   )rk   rB   r$   r[  r  )rE   r  r   r@   r@   rG   rH     s    zargsort_sym.<locals>.<listcomp>r  c                 S  s   g | ]\}}|qS r@   r@   )rE   r  r   r@   r@   rG   rH     r/  )r   rd  r   rd  r>   re   )r   r  r   
cmp_to_key)r+  r]  rx  exprsrg  r@   rf  rG   argsort_symw  s   rj  r   torch.dtypec                 C  s    | t jkrdS t jd| d S )Nrc   r@   r   )rB   r  r   Zelement_sizerl  r@   r@   rG   get_dtype_size  s   
rm  c                   @  s   e Zd ZU ded< dS )LineContextr
   contextNr{   r|   r}   r   r@   r@   r@   rG   rn    s   
 rn  c                   @     e Zd ZU ded< ded< dS )ValueWithLineMapr?   ru   zlist[tuple[int, LineContext]]Zline_mapNrp  r@   r@   r@   rG   rr       
 rr  c                   @  s   e Zd ZdZd@dAddZejdBddZdCddZdDddZ	dDddZ
dEddZdFddZdDddZdEddZdGd d!ZdHd$d%ZdIdJd)d*ZdIdKd+d,ZdIdKd-d.Z	/dLdMd3d4ZdNd7d8ZdDd9d:ZdOd=d>Zd?S )PIndentedBuffer   r   initial_indentre   r>   r^  c                 C  s   g | _ || _d S rw   )_lines_indent)r  rv  r@   r@   rG   __init__     
zIndentedBuffer.__init__tabwidthrE  c                 c  s*    | j }z|| _ d V  W || _ d S || _ w rw   )r{  )r  r{  prevr@   r@   rG   set_tabwidth  s   zIndentedBuffer.set_tabwidthrr  c                 C  s   t  }d}g }| jD ]:}t|tr| }|d u rq
nt|tr(|||jf q
|}t|ts1J || |d |d|	d 7 }q
t
| |S )Nr)   r  )r	   rw  rk   DeferredLineBasern  rR  ro  r?   writecountrr  getvalue)r  bufr   Zlinemapliliner@   r@   rG   getvaluewithlinemap  s$   




z"IndentedBuffer.getvaluewithlinemapr?   c                 C  s
   |   jS rw   )r  ru   r  r@   r@   rG   r       
zIndentedBuffer.getvaluec                 C  s   t  }| jD ]8}t|tr| }|d u rqnt|trq|}t|ts%J |dr4||d d  q|| |d q| S )N\r   r  )	r	   rw  rk   r~  rn  r?   endswithr  r  )r  r  r  r  r@   r@   rG   getrawvalue  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S rw   )rw  clearr  r@   r@   rG   r       zIndentedBuffer.clearrj   c                 C  
   t | jS rw   )rj   rw  r  r@   r@   rG   __bool__  r  zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )rx  r{  r  r@   r@   rG   r    r   zIndentedBuffer.prefixc                 C  s   |  d d S )Nr  	writeliner  r@   r@   rG   newline  r  zIndentedBuffer.newliner  )Union[LineContext, DeferredLineBase, str]c                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )rk   rn  rw  rR  r~  with_prefixr  stripr  r  r@   r@   rG   r    s   

zIndentedBuffer.writelinelines3Sequence[Union[LineContext, DeferredLineBase, str]]c                 C  s   |D ]}|  | qd S rw   r  )r  r  r  r@   r@   rG   
writelines  s   zIndentedBuffer.writelinesr)   offset'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )Nr>   rE  c                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w rw   rx  r@   r  r  r@   rG   r    
   "z"IndentedBuffer.indent.<locals>.ctxr>   rE  )
contextlibcontextmanager)r  r  r  r@   r  rG   indent  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S rw   r  r  r  r@   r@   rG   	do_indent  r   zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S rw   r  r  r@   r@   rG   do_unindent  r   zIndentedBuffer.do_unindentF
other_codeUnion[IndentedBuffer, str]r  c                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r  )rk   rt  r   rw  rn  minrJ   r  mathisinfrR  r  re   textwrapdedentrstripr  )r  r  r  r  r  r   r@   r@   rG   splice  s,   





zIndentedBuffer.splicer  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )Nrv  c                   s   g | ]} |qS r@   r@   )rE   r  r  r@   rG   rH   1  r/  z&IndentedBuffer.map.<locals>.<listcomp>)rt  rx  rw  )r  r  r   r@   r  rG   ro   /  s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr  r  )r   r  r  r@   r@   rG   __repr__4  ry  zIndentedBuffer.__repr__otherr   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nr  )rx  rt  r  rw  )r  r  r   r@   r@   rG   __add__7  s
   zIndentedBuffer.__add__Nr   )rv  re   r>   r^  )r{  re   r>   rE  )r>   rr  r>   r?   r>   r^  r>   rj   )r  r  r>   r^  )r  r  r>   r^  rt   )r  re   r>   r  )r  re   r>   r^  F)r  r  r  rj   r>   r^  )r  r  r>   rt  )r  r   r>   rt  )r{   r|   r}   r{  ry  r  r  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  ro   r  r  r@   r@   r@   rG   rt    s,    











rt  c                      s(   e Zd Zd
 fddZddd	Z  ZS )FakeIndentedBufferr>   r^  c                   s   t    d S rw   )superry  r  	__class__r@   rG   ry  A  r  zFakeIndentedBuffer.__init__r   r?   r
   c                 C  s$   |dkr
t | |S td| d)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   r@   r@   rG   r  D  s
   
z#FakeIndentedBuffer.__getattribute__r  )r   r?   r>   r
   )r{   r|   r}   ry  r  __classcell__r@   r@   r  rG   r  @  s    r  c               	   c  s<    t jt j} }zd V  W | |t _t _d S | |t _t _w rw   )r(  stdoutstderr)Zinitial_stdoutZinitial_stderrr@   r@   rG   restore_stdout_stderrO  r  r  c                   @  s`   e Zd ZdZdddZddd	ZdddZd ddZd!ddZd"ddZ	d#ddZ
d$ddZdS )%r~  z.A line that can be 'unwritten' at a later timer  r?   c                 C  s   |  sd}|| _d S r  )r  r  r  r@   r@   rG   ry  [  s   
zDeferredLineBase.__init__r>   Union[str, None]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  r@   r@   rG   r  `     zDeferredLineBase.__call__r   c                 C  r  )z3Returns a new deferred line with the same conditionr  r  r@   r@   rG   	_new_lined  r  zDeferredLineBase._new_liner  c                 C  s   |  | | j S rw   r  r  )r  r  r@   r@   rG   r  h  r   zDeferredLineBase.with_prefixc                 C  s   |  | j S rw   )r  r  r  r  r@   r@   rG   r  k  r   zDeferredLineBase.lstripr   Union[int, slice]c                 C  s   |  | j| S rw   r  )r  r   r@   r@   rG   r`  n  r   zDeferredLineBase.__getitem__rj   c                 C  r  rw   )rj   r  r  r@   r@   rG   r  q  r  zDeferredLineBase.__bool__re   c                 C  r  rw   )rJ   r  r  r@   r@   rG   __len__t  r  zDeferredLineBase.__len__N)r  r?   )r>   r  )r  r?   r>   r   )r  r?   r>   r   )r>   r   )r   r  r>   r   r  r>   re   )r{   r|   r}   r~   ry  r  r  r  r  r`  r  r  r@   r@   r@   rG   r~  X  s    






r~  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r  r?   value_fnCallable[[], str]r  c                   s   t  | || _|| _d S rw   )r  ry  r  r  )r  r  r  r  r  r@   rG   ry  {  s   
zDelayReplaceLine.__init__r>   c                 C  s   | j | j|  S rw   )r  replacer  r  r  r@   r@   rG   r    r   zDelayReplaceLine.__call__c                 C  s   t | j| j|S rw   )r  r  r  r  r@   r@   rG   r    r   zDelayReplaceLine._new_line)r  r?   r  r  r  r?   r  )r  r?   r>   r  )r{   r|   r}   r~   ry  r  r  r  r@   r@   r  rG   r  x  s
    
r  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntt | }t|}tjjr3|jd us J |jdk s*|jdkr1t	
d dS dS |jdkr:dnd}|j}||k rOt	j
d	||d
d dS dS )N	   rj  z6GPU arch does not support max_autotune_gemm mode usageFTr<   r`   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rk   rB   r   rL   r   createversionhipmajorr   rM  r   multi_processor_count)r  r   propr  r  r@   r@   rG   
is_big_gpu  s&   

r  c                   C  s   t jdjS )Nr:   )rB   r:   get_device_propertiesr  r@   r@   r@   rG   get_max_num_sms     r  c                  C  s"   t j } t | dur|  S d S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rB   r   Z_get_sm_carveout_experimentalr  )Zcarveoutr@   r@   rG   get_num_sms  s   
r  num_tma_descriptorsnum_programsOptional[int]r*   c                 C  sH   ddl m}m} |du rt }|d}||  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r)   )r*   WorkspaceZeroModeNF)r  	zero_moder   Z
outer_name)codegen.commonr*   r  r  Z	from_boolTMA_DESCRIPTOR_SIZEZunique_name)r  r   r  r*   r  r  r{  r@   r@   rG   get_tma_workspace_arg  s   
r  layoutr3   allowed_layout_dtypeslist[torch.dtype]c                 C  s:   | j |vrtd| j | t| jjo| j |v ot| jS )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r   r  )r  r  r@   r@   rG   _use_template_for_gpu  s   
r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS r@   r  rD   r@   r@   rG   rH         z)_use_autotune_backend.<locals>.<listcomp>,)upperr\   Zmax_autotune_gemm_backendsr  r  r@   r@   rG   _use_autotune_backend     r  c                 C  r  )Nc                 S  r  r@   r  rD   r@   r@   rG   rH     r  z._use_conv_autotune_backend.<locals>.<listcomp>r  )r  r\   Zmax_autotune_conv_backendsr  r  r@   r@   rG   _use_conv_autotune_backend  r  r  F)enable_int32enable_float8r  r  c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g t| jjo1t| |p<| jjdko<| j|v oMtjpBtjoMtdoM|| j|jS )Nr)   )BackendFeaturehas_backend_featurer   ZTRITON)r  r  r  rB   r   r  r  r  extendr
  r  r  r   r   r  r   r\   max_autotunemax_autotune_gemmr  ZTRITON_TEMPLATES)r  r  r  r  r  layout_dtypesr@   r@   rG   use_triton_template  s"   
	r  matricesr2   c                    s^   ddl m}m} ddlm  d fd	d
| rtjrdS tjjo.| o.t	fdd| D S )Nr   )has_triton_stable_tma_apihas_triton_tma_devicer)   r%  rF   r2   r>   rj   c                   s   t |  dkr
dS |  }|tjtjtjfvrdS |  }| }|	 s*|s*dS |j
d }|r6|j
d }|tjkrE jj|drEdS ||j } jj|tS )Nr  Fr)   r       )rJ   get_size	get_dtyperB   r   r  r
  Z
get_layoutZis_transposedis_contiguousr{  r)  r*  statically_known_ltitemsizeZstatically_known_multiple_ofTMA_ALIGNMENT)rF   r   r  Z
transposedZ	inner_dimZinner_bytesr%  r@   rG   _is_tma_compatible  s$   


z3use_triton_tma_template.<locals>._is_tma_compatibleFc                 3      | ]} |V  qd S rw   r@   )rE   r7  )r  r@   rG   r     r  z*use_triton_tma_template.<locals>.<genexpr>rF   r2   r>   rj   )
Ztorch.utils._tritonr  r  r(  r&  r\   cpp_wrapperrF  Zenable_persistent_tma_matmulrn   )r  r  r  r@   )r&  r  rG   use_triton_tma_template  s   r  r7  r  r
  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jg}t| |oAtjp=tjoAtd}|rN| sNtd	 dS |S )
Nr)   r%  r   fallbackr   F)try_import_cutlassZCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r(  r&  r)  r*  	size_hintr\   r:   Zcutlass_backend_min_gemm_sizeZcodegen.cuda.cutlass_utilsr  rB   r  r  r   r  r  r  r  r   r  r   rM  )	r  r7  r  r
  r&  Z	gemm_sizer  r  r   r@   r@   rG   use_cutlass_template!  s(   

r  op_namec                 C  s4   t jj }|dkrdS |  dd |dD v S )z8Check if CUTLASS should be used for the given operation.ZALLTc                 S  r  r@   r  rD   r@   r@   rG   rH   F  r/  z'_use_cutlass_for_op.<locals>.<listcomp>r  )r\   r:   Zcutlass_enabled_opsr  r  )r  Zenabled_opsr@   r@   rG   _use_cutlass_for_opA  s   r  r  r   )r`   r  rb   ra      r   _IntLikec              
   C  sV   ddl m} |jjtt|t|  t|t| o*|jj	 o*|jj
 o*tj S )Nr   r%  )torch._inductor.virtualizedr&  r)  r*  statically_known_truerl   AndZGedecompose_k_thresholdZaot_moder  r\   Zdisable_decompose_k)r7  r  r
  r&  r@   r@   rG   use_decompose_k_choiceT  s   r  c           
        s$  t |tjr|jstS t | tjr| jrt |tjr |js d n	t||  ||  dt|} fdd|D }g g g }}}|D ].}|| }|dk rOqD||d @ dkra|dkra|| qD|d dkrm|| qD|| qDtj	d	kr~|| | S t
|tkr|S || | }	|	d t S )
Nr  r  c                   s    g | ]}| kr|kr|qS r@   r@   )rE   ZdivisorZmax_k_splitZmin_k_splitr@   rG   rH   u  s    z get_k_splits.<locals>.<listcomp>ra   r)   r   r  Z
EXHAUSTIVE)rk   rl   r  	is_numberdefault_k_splitsr  divisorsrR  r\   Zmax_autotune_gemm_search_spacerJ   k_splits_limit)
r7  r  r
  r#  Zpow_of_2_divisorsZmul_of_32_divisorsZrest_of_splitsdZkPartZbest_splitsr@   r   rG   get_k_splitsd  s<   


r&  c                 C  s   t j| jS rw   )rB   r:   r  ZgcnArchNamer   r@   r@   rG   _rocm_native_device_arch_name  r  r(  Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s|   zdd l } ddlm}m} ddlm} tj| j	}W n t
y7   ddd}ddd	}G d
d d}d }Y nw ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationr>   rL  c                   S     g S rw   r@   r@   r@   r@   rG   r*    r  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  r-  rw   r@   r@   r@   r@   rG   r+    r  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r{   r|   r}   r@   r@   r@   rG   r,    s    r,  )r>   rL  )ck4inductorZ(ck4inductor.universal_gemm.gen_instancesr*  r+  Zck4inductor.universal_gemm.opr,  rG  rH  dirname__file__r   )r.  r*  r+  r,  Zpackage_dirnamer@   r@   rG   try_import_ck_lib  s   

r1  c                   s   t jst jsdS tjjsdS | jjdksdS t| j}dd t j	j
D p,|dd |i  fdd  t j	j@ D }|s@dS | jtjtjtjfvrMdS t \}}}}|s]td	 dS t  re|t j	_t j	jsptd
 dS |t j	jkr}td dS dS )NFr:   c                 S  s   i | ]
}| d d |qS ):r   )r  rE   r
  r@   r@   rG   r     r  z#use_ck_template.<locals>.<dictcomp>r2  r   c                   s   g | ]} | qS r@   r@   r3  Zrequested_archsr@   rG   rH     s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r\   r  r   rB   r  r  r   r   r(  Zrocmarchr  r  Zck_supported_archr   r   r  r  r1  r   rM  	is_fbcodeZck_dir)r  Znative_archZrequested_supported_archsZck_package_dirnamer   r@   r4  rG   use_ck_template  s<   




r7  c                 C  :   ddl m} tdot| o|jjj|| | dddkS )Nr)   r%  CKr   r  r   r(  r&  r  r7  r)  r*  r  r  r7  r  r
  r&  r@   r@   rG   use_ck_gemm_template     r<  c                 C  r8  )Nr)   r%  ZCKTILEr   r  r   r:  r;  r@   r@   rG   use_ck_tile_gemm_template  r=  r>  c                 C  s   t dot| S )Nr9  )r  r7  r  r@   r@   rG   use_ck_conv_template   r   r@  c                 C  s   t jpt jo| jjdkS r_  )r\   r  r   r   r   r?  r@   r@   rG   _use_template_for_cpu  s   

rA  mat1Union[ReinterpretView, Buffer]mat2c                 C  s6   ddl m} t|j|sJ t| ||ddo|j S )Nr)   )r3   F)require_constant_mat2)r  r3   rk   r  use_cpp_gemm_templater	  )r  rB  rD  r3   r@   r@   rG   use_cpp_bmm_template
  s
   rG  mat2_transposedrE  is_woq_int4q_group_sizec                 C  s:  ddl m} ddlm} ddlm}	 ddlm}
 t| r t	ds"dS t
jjs(dS | tjtjfv }tjtjtjtjg}|
|||rD| jnd ||d\}}}} }}t||frXdS t||jrb| }|	| \}}|d	|||| | |t | |d

}ddd}| j|v o|d uo||ot||jo| p| S )Nr)   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsZCPPF)	out_dtyperH  Zuse_4x2_dim
micro_gemm)Zinput_dtypeZinput2_dtypeoutput_dtypeZnum_threadsZuse_refrJ  rF   r2   r>   rj   c                 S  s   |    |  d dkS )Nr   r)   )Zfreeze_layoutZ
get_striderF   r@   r@   rG   is_last_dim_stride1J  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1r  )r  r  Zcodegen.cpp_micro_gemmrK  Zcodegen.cpp_utilsrL  Zkernel.mm_commonrM  rA  r  r\   cppZweight_prepackr  rB   r  r  r  r  Zhalfr   has_free_symbolsrk   BaseViewZunwrap_viewparallel_num_threadsr  Zis_module_buffer)r  rB  rD  rH  rE  rI  rJ  r  rK  rL  rM  Z	int8_gemmr  r7  r  r
  rP  r   rO  rR  r@   r@   rG   rF    sX   		


rF  c                   C  s   t jpt j p
tdS )NZATEN)r\   r  r   r  r@   r@   r@   rG   use_aten_gemm_kernelsW  s   
rW  c                   @  s>   e Zd ZU edZded< dddZddd	ZdddZ	dS )DebugDirManagerr   r?   prev_debug_namer>   r^  c                 C  s   t tj| _d S rw   )r   rX  counterr   r  r@   r@   rG   ry  a  r   zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )NZ_tmp_)rB   _dynamor\   debug_dir_rootrY  r   new_namer  r@   r@   rG   	__enter__d  s   zDebugDirManager.__enter__rq   r
   c                 G  s   t | j | jtjj_d S rw   )rX  rY  r]  rY  rB   r[  r\   r\  )r  rq   r@   r@   rG   __exit__i  s   zDebugDirManager.__exit__Nr  )rq   r
   r>   r^  )
r{   r|   r}   r  r  rZ  r   ry  r^  r_  r@   r@   r@   rG   rX  ]  s   
 


rX  Callable[P, _T]r  r  tuple[_T, list[str]]c                   st   ddl m} g  d
 fdd}tj|d	| tj  | |i |}W d    | fS 1 s1w   Y  | fS )Nr)   r-   coder?   r>   r^  c                        |  d S rw   rR  rb  source_codesr@   rG   save_output_codew  r  z*run_and_get_code.<locals>.save_output_coderh  rb  r?   r>   r^  r)  r.   r   rR  r  rB   r[  reset)r   rq   rM  r.   rh  rg  r@   rf  rG   run_and_get_coden  s   

rl  c                 O  sF   t | g|R i |\}}g }|D ]}|td|tj q||fS )Nz	'''.*?''')rl  r  refindallDOTALL)r   rq   rM  rg  rg  Zkernelsrb  r@   r@   rG   run_and_get_kernels  s
   rp  tuple[Any, list[str]]c                   s   d fdd}t |S )Nr>   r
   c                    s     } |     | S rw   )r   Zbackwardrs  r   r@   rG   run_with_backward  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)r>   r
   )rl  )r   rs  r@   rr  rG   run_fw_bw_and_get_code  s   rt  c              	     s   ddl m} g dfdd d fdd}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r)   r-   rb  r?   r>   r^  c                   rc  rw   rd  re  rf  r@   rG   rh    r  z"get_code.<locals>.save_output_coder  r.   r
   c                   sF   G dd d}| j r|  n|  \}} |j |r  |j | S )Nc                   @  s$   e Zd ZdZdddZdd	d
ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton moduler>   r^  c                 S  r  rw   r@   r  r@   r@   rG   ry    r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__rq   r
   rM  c                 _  r  rw   r@   r  r@   r@   rG   call  r  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNr  rq   r
   rM  r
   r>   r^  )r{   r|   r}   r~   ry  ru  r@   r@   r@   rG   DummyModule  s    
rw  )r  Zcodegen_with_cpp_wrapperZcodegenru   )r  rw  Zwrapper_codekernel_code)rh  r@   rG   patched_compile_to_module  s   

z+get_code.<locals>.patched_compile_to_moduleZcompile_to_modulerh  Nri  )r  r.   r>   r
   rj  )r   rq   rM  r.   ry  r   r@   )rh  rg  rG   get_code  s$   
(


rz  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr)   r  z%expected one or two code outputs got r   )rz  rJ   )r   rq   rM  rg  r@   r@   rG   get_triton_code  s
   r|  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r{  )rl  rJ   )r   rq   rM  r   rg  r@   r@   rG   run_and_get_triton_code  s
   r}  tuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg d fd	d
}tj|d| | |i |}W d    |fS 1 s7w   Y  |fS )Nr   r-   r6   rq   r
   rM  r>   r^  c                    s2   | i | | d }t | sJ | d S )Nr  )rk   rR  )rq   rM  r)  r.   Zgraph_loweringsZ	real_initr@   rG   	fake_init  s   z-run_and_get_graph_lowering.<locals>.fake_initry  rv  )Ztorch._inductor.graphr.   Ztorch._inductor.output_coder7   ry  r   rR  r  )r   rq   rM  r7   r  rg  r@   r  rG   run_and_get_graph_lowering  s   
r  aten_opoverride_fnc              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)Ztorch._inductorr  Z	loweringsr   partial)r  r  r  orig_fnr@   r@   rG   override_lowering  s   
r  pre_fnpost_fnOptional[Callable[..., Any]]c                   s6   ddl m} |j d fdd}tjj|d	|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r
   r  r>   c                   s&   | |  | |}r| | |S rw   r@   )r  r  outr  r  r  r@   rG   r    s
   


z(add_scheduler_init_hook.<locals>.wrapperry  N)r  r
   r  r
   r>   r
   )torch._inductor.schedulerr  ry  unittestr   rR  r  )r  r  r  r  r@   r  rG   add_scheduler_init_hook  s   r  msgc                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r\   Zdeveloper_warningsr   rM  info)r  r@   r@   rG   developer_warning  s   r  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr)   r   r  z--only=N)r(  argvr   rJ   
ValueErrorr*  )r  rP  r@   r@   rG   get_benchmark_name  s$   

r  r  c                 C  r  )Nc                 s      | ]}|d kV  qdS r)   Nr@   rD   r@   r@   rG   r   =  r  zis_ones.<locals>.<genexpr>rn   r  r@   r@   rG   is_ones<  r   r  c                 C  r  )Nc                 s  r  )r   Nr@   rD   r@   r@   rG   r   A  r  zis_zeros.<locals>.<genexpr>r  r  r@   r@   rG   is_zeros@  r   r  inputsSequence[torch.Tensor]c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r   N)rk   rB   rW  r   )rE   r   r@   r@   rG   r   E  s    

z is_cpu_device.<locals>.<genexpr>r  )r  r@   r@   rG   is_cpu_deviceD  s   r  r  c                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rk   rl   r  r   rB   r  r  )r  r@   r@   rG   get_sympy_Expr_dtypeL  s   r  should_profileIterator[Any]c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S rw   )rB   r   r   )r  rq   rM  r   r@   r@   rG   maybe_profileV  s   "
r  c                  C  s   t jj} | dk rt } | S Nr)   )r\   rS  threadsrB   Zget_num_threads)r  r@   r@   rG   rV  _  s   rV  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr)   )get_backend_optionsZ
num_stagesr     )Zruntime.triton_helpersr  r  rB   r  r  )r  optionsr@   r@   rG   get_backend_num_stagesf  s   r  c                 C  s   ddl m}m} | tjtjtjfv sJ t|j	
drEddlm} | }| tjtjfv r3|| |S tjjjjr?|tj|S |tj|S | tjtjfv rQ|| S tjjjjr\|tjS |tjS )Nr   )get_max_simd_tflopsget_max_tensorcore_tflopsZ
clock_rate)max_clock_rate)triton.testingr  r  rB   r   r  r  inspect	signature
parametersr  Ztorch._utils_internalr  backendsr:   matmulZ
allow_tf32)r   r  r  r  Zsm_clockr@   r@   rG   get_device_tflopsn  s   


r  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r  r  r  r@   r@   rG   get_gpu_dram_gbps  s   r  c                  C  s"   ddl m}  | jjdddS )Nr   r1  Zmax_shared_mem)Ztriton.runtimer1  r2  r3  r  r  r  r@   r@   rG   get_gpu_shared_memory  s   r  reduction_typec                 C  s
   |  dS )NZwelford)r*  r  r@   r@   rG   is_welford_reduction  r  r  c                 C  s   t | rdS | dkrdS dS )Nr  Zonline_softmax_reducer  r)   )r  r  r@   r@   rG   reduction_num_outputs  s
   r  c                   C  s   t  dkS )NLinux)platformsystemr@   r@   r@   rG   is_linux  r  r  c                   C  s
   t jdkS )Nr^   )r(  r  r@   r@   r@   rG   rW    r  rW  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S rw   )rk   rl   r  r!  rD   r@   r@   rG   r     s   " z#has_free_symbols.<locals>.<genexpr>r  )r  r@   r@   rG   rT    r   rT  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr)   r  r@   Tzunexpected type for is_dynamic F)r  r  rk   r  r  rU  ZComputedBufferr/   rT  Zmaybe_get_sizeZmaybe_get_strider2   	TypeErrorr   )rq   r  tr@   r@   rG   
is_dynamic  s   
r  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r{   r|   r}   r  r  r@   r@   r@   rG   r    s    r  r  r&   inpc              	   C  s4  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 sLw   Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    d S 1 sw   Y  d S )Nr)   )stable_topological_sortwzutf-8F)modeencodingrD  )r\  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)Zpattern_matcherr  rP  NamedTemporaryFileior	   rR   rN   	propagatero  r)  r   nowrQ   ZlintZ	recompiler  r   r  r   )r  r\  r  r  r  rJ  Z	before_ioZafter_io
start_timeZtime_elapsedr  r@   r@   rG   pass_execution_and_save  s>   

"r  	input_buf"Optional[Union[Buffer, Operation]]c                 C  s&   ddl m} t| |jot| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r)   r  )r  r  rk   ZCppTemplateBufferr  ZMultiOutputLayoutr  r  r@   r@   rG   is_multi_outputs_template  s   r  c                 C  s4   ddl m} t| |jot| jdkot| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r)   r  r   )r  r  rk   ZMultiOutputrJ   r  r  r  r@   r@   rG   #is_output_of_multi_outputs_template  s   r  r[   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                 C  s   | d u rdS ddl m} t| |jko|d u p| j|u pRt| |jkoRttjj	do2| jtjj	j
jkpRttjj	doB| jtjj	jjkpRttjj	doR| jtjj	jjkS )NFr)   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   Z_CollectiveKernelop_overloadFallbackKernelr   rB   r   Ztorchrecr  defaultr  r  r[  r1  r  r@   r@   rG   is_collective	  s"   

r  "Optional[Union[IRNode, Operation]]c                 C  s   ddl m} t| |jkS Nr)   r  )r  r  r   Z_WaitKernelr[  r  r@   r@   rG   is_wait0	  s   r  snoder8   c                 C  4   ddl m} t| |rtdd | jD S t| jS )Nr   GroupedSchedulerNodec                 s  r  rw   )contains_collectiverD   r@   r@   rG   r   :	  r  z&contains_collective.<locals>.<genexpr>)r  r  rk   r5  snodesr  r[  r  r  r@   r@   rG   r  6	     

r  c                 C  r  )Nr   r  c                 s  r  rw   )contains_waitrD   r@   r@   rG   r   C	  r  z contains_wait.<locals>.<genexpr>)r  r  rk   r5  r  r  r[  r  r@   r@   rG   r  ?	  r  r  Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr|g}t| |jo| j|v S r  )r  r  rk   rB   rF  rG  r  r  r  r@   r@   rG   is_fallback_opH	  s   r  buf_namename_to_bufname_to_fused_nodec                 C  s   |||  j   S rw   )Zdefining_opr  )r  r  r  r@   r@   rG   buf_name_to_fused_snodeS	  s   r  c                 C  r:  r;  r@   r  r@   r@   rG   r<  ^	  r=  collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                 C  sP   || rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Nr  )r  Zunmet_dependenciesr  r   find_recursive_deps_of_node)r  r  r  r  r  depZdefining_op_for_depr@   r@   rG   r  Y	  s"   

r  c                 C  r:  r;  r@   r  r@   r@   rG   r<  w	  r=  c              	   C  s   || rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr%q|j |vr-q||j  }||v r9qt|||||d qqd S )NZOUTPUTr   )r  Zget_outputsrI  r[  r  find_recursive_users_of_node)r  r  r  r  r  or  Zuser_opr@   r@   rG   r  r	  s,   

r  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rB   Z
_functorchr\   Zfunctionalize_rng_ops)r  r  Znum_rng_seed_offset_inputsr@   r@   rG   num_fw_fixed_arguments	  s   r  fx_gc                 C  sd   ddd}d}g }| j jD ]}|jdkr!||r|| |d	7 }q|ttt|ks.J t|S )z>
    Infers which inputs are static for a backwards graph
    rF   r(   r>   rj   c                 S  s(   d| j vod| j vod| j vod| j vS )NZtangentsZbwd_seedZbwd_base_offsetZbwd_rng_stater  rQ  r@   r@   rG   is_saved_tensor	  s   
z'count_tangents.<locals>.is_saved_tensorr   rS  r)   N)rF   r(   r>   rj   )r)  r  r1  rR  r  r   rJ   )r  r	  	arg_countZstatic_arg_idxsr  r@   r@   rG   count_tangents	  s   


r  c                   @  s.   e Zd ZU ded< dddZedd	d
ZdS )	BoxedBoolrj   ru   r>   c                 C  s   | j S rw   )ru   r  r@   r@   rG   r  	  s   zBoxedBool.__bool__rq  r
   Union[BoxedBool, bool]c                 C  s   t | tr
d| _| S dS r;  )rk   r  ru   r=  r@   r@   rG   disable	  s   
zBoxedBool.disableNr  )rq  r
   r>   r  )r{   r|   r}   r   r  r  r  r@   r@   r@   rG   r  	  s
   
 
r  kernel_listc                 #  sh    ddl m} |j	 		 dd fdd}tj|d| d V  W d    d S 1 s-w   Y  d S )Nr)   r+   Tr  r,   kernel_namer?   rx  r  rC  gpurj   cpp_definitionr>   r
   c                   s     | | |||||S rw   rd  )r  r  rx  r  r  r  r  Zorig_define_kernelr@   rG   define_kernel	  s   
z.collect_defined_kernels.<locals>.define_kernelr  )NTN)r  r,   r  r?   rx  r?   r  rC  r  rj   r  rC  r>   r
   )codegen.wrapperr,   r  r   rR  r  )r  r,   r  r@   r  rG   collect_defined_kernels	  s   "r  c                 C  s   | d S )NZ__original__r@   r  r@   r@   rG    get_cloned_parameter_buffer_name	     r  c                 C  s   | t v S rw   )rI   r'  r@   r@   rG   r  	  r  r  c                 C  s   | dkot | S )Nr;   )r  r'  r@   r@   rG   device_need_guard	  r   r  c                 C  sL   t  r| tjkrtj rtj dkrt jrdS | ttj	tj
tjgv S )N)r  r   F)r\   r6  rB   r  r:   rC   Zget_device_capabilityZbfloat16_atomic_adds_enabledr   r  rj   rl  r@   r@   rG   ,needs_fallback_due_to_atomic_add_limitations	  s   r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |fvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjfv p]t S )NFr  r   r   r)   )ZoverloadpacketrB   r   ZatenZscatter_reduce_Zscatter_reduceZscatter_r  r  r\   rS  Zfallback_scatter_reduce_sumZdynamic_threadsrV  rj   r  r  )r  r  r  r  r  r  Z	reduce_tyr@   r@   rG   use_scatter_fallback	  s<   	r  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r2  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )Ztorch._inductor.codegen.simdr!  r"  r  r#  ro  rJ   r   rk   Zis_reductionr[  r  Zreduction_hintZread_writesZreadsZwritesr   r   )r  r!  r"  r#  r  r[  Zis_redr  r@   r@   rG   dump_node_schedule
  s0   




r'  r   rQ  c                 C  s*   ddl m} ||  t| j t dkS )Nr   )r  )r  r  storage_offsetrm  r   GPU_ALIGN_BYTES)r   r  r@   r@   rG   tensor_is_aligned;
  s   r*  example_inputc                 C  s   t | jjsdS tjpt| S r;  )r  r   r   r\   Zassume_aligned_inputsr*  )r+  r@   r@   rG   should_assume_input_alignedI
  s   r,  r  c                  C  s4   t jj } | st S | jj}|st S | S rw   )	rB   _guardsTracingContexttry_getr  nullcontextr  r+  Zsuppress_guards)tracing_contextr+  r@   r@   rG   #maybe_get_suppress_shape_guards_ctxR
  s   r2  tuple[_T, str]c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)r  r   rR  r  r\   rB   r[  rk  r  loggingr	   StreamHandlerZtorch._inductor.codecacher4  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   rq   rM  r  r5  Zlog_capture_stringchr4  Z
prev_levelrg  r   r@   r@   rG   run_and_get_cpp_codec
  s$   




r=  Sequence[InputType]Optional[ShapeEnv]c                 C  s<   t | }|d ur|jS | D ]}t|tjr|jj  S qd S rw   )rN   r+  rk   rB   r$   r[  )r  r  inputr@   r@   rG   shape_env_from_inputs|
  s   rA  Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                   s&   t  dkrS d fdd}|S )	Nr   
new_inputslist[InputType]r>   r
   c                   s0   t |  \}}| }t|rt|| |S rw   )copy_misaligned_inputsrJ   rB   Z_foreach_copy_)rF  old_tensorsnew_tensorsr  rC  r`  rD  r@   rG   r.  
  s   z)align_inputs_from_check_idxs.<locals>.run)rF  rG  r>   r
   )rJ   )r`  rC  rD  r.  r@   rK  rG   align_inputs_from_check_idxs
  s   rL  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS r  r@   )rE   shaper  r@   r@   rG   r   
  s    z)clone_preserve_strides.<locals>.<genexpr>r)   rt   )r{  r   r   r  rB   Z
as_stridedclone)rF   Zneeded_sizebufferr@   r@   rG   clone_preserve_strides
  s   "rP  rF  rG  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                 C  s   g }g }|du}|D ]3}| | }t |tjsJ dt| | t r=t|| |< |r=||v r=|| || |  q
||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rk   rB   rW  r   data_ptr	ALIGNMENTrP  rR  )rF  rQ  rR  rI  rJ  Zret_pair_definedr   Z_inpr@   r@   rG   rH  
  s   

rH  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rk   rB   rW  rU  rV  rR  rJ   )r  rW  Zaligned_static_input_idxsr  r@  r@   r@   rG   remove_unaligned_input_idxs
  s   
rX  r   c                 C  sZ   ddl m} ttjj}|jjj}|jjj	j
}|jj| |kr#dS || o,|| |kS )Nr)   r%  T)r(  r&  rB   Ziinfor  r   r)  r*  r  r+  has_hintr  )r   r&  Zint_maxr  rY  r@   r@   rG   expr_fits_within_32bit
  s   
rZ  compiled_graphr7   c                   s   t jj }|d urX|jd urZt|jdksJ t| |jd us#J |jD ]5}|d u r3|jd  q&d t jj  }r@|j d fdd|jt	fd	d
|D  q&d S d S d S )Nr   Fr   r
   r>   ,Union[float, int, SymInt, SymFloat, SymBool]c                   s(   d u rt | S  r| S | S rw   )re   Zdeserialize_symexprZevaluate_symexpr)r   )fakify_first_callr+  r@   rG   map_expr  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  r  rw   r@   )rE   r   )r^  r@   rG   r     r  z5set_tracing_context_output_strides.<locals>.<genexpr>)r   r
   r>   r\  )
rB   r-  r.  r/  Zoutput_stridesrJ   rA  rR  r]  r&  )rb  r[  ro  ri  r  r@   )r]  r^  r+  rG   "set_tracing_context_output_strides
  s"   
r_  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
r\   Zfx_graph_remote_cacher6  rB   Z_utils_internalZis_fb_unit_testZtorch._inductor.fb.remote_cachera  ModuleNotFoundErrorZjustknobs_getval_intr`  r@   r@   rG    should_use_remote_fx_graph_cache  s   

rc  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )rm  subr  r@   r@   rG   normalize_name#  r  re  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                 C  r  r@   r@   r	  r@   r@   rG   r   3  r   r   z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rerd  r?   _triton_type_mappingr  )r   Ztriton_type_namer@   r@   rG   triton_type9  s   ri  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nrf  r  )_torch_triton_mappingr  r  rA   rB   rk   r   )r   Zadjusted_type	type_namerN  r@   r@   rG   triton_type_to_torch?  s
   
rl  r  ru   c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS rw   )	is_mkldnnr{  r  r   r   Zuntyped_storagerU  r(  r  ru   r@   r@   rG   is_same_tensorG  s   

ro  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS rw   )rm  r{  r   r   rB   r   mkldnnrU  rn  r@   r@   rG   is_same_mkldnn_tensorS  s   

rq  tuple[str, ...]c                   C  r:  )N)r  isnanZlogical_notlogical_andZsignbitand_leltgegteqner  xorr@   r@   r@   r@   rG   boolean_ops]  r  r}  c                   @  rq  )OpDtypeRuler%   type_promotion_kindr  override_return_dtypeNrp  r@   r@   r@   rG   r~  q  rs  r~  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr  r%   r  c                 C  s   t ||t| < d S rw   )r~  r  )r   r  r  r@   r@   rG   #register_op_dtype_propagation_rulesz  s   r  zOrderedSet[str]op_requires_libdevice_fp64c                 C  s   t |  d S rw   )r  r  r  r@   r@   rG   #register_op_requires_libdevice_fp64  r  r  c                  C  s8   ddl m}  | j j}|dkrtjS |dkrdS tjS )Nr   r%  r   r;   )r  r&  r)  Zget_current_device_or_throwr   r\   Zcpu_backendZcuda_backend)r&  Z
device_strr@   r@   rG   get_current_backend  s   r  c                 C  s,   | t jt jfv rtjjrt dkrt jS | S )z"Maybe upcast [b]float16 to float32rF  )rB   r   r  r\   rF  Zcodegen_upcast_to_fp32r  r  rl  r@   r@   rG   upcast_compute_type  s   r  KeyTypeValTypec                   @  sl   e Zd ZdZd#ddZd$d
dZd%ddZd&ddZd'd(ddZd)ddZ	d*ddZ
d+dd Zd,d!d"ZdS )-
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                 C  s   || _ i | _d S rw   r  	new_items)r  r  r@   r@   rG   ry    rz  zScopedDict.__init__r  r  r>   r  c                 C  s   || j v r
| j | S | j| S rw   r  r  r  r  r@   r@   rG   r`    s   


zScopedDict.__getitem__ru   r^  c                 C  s   || j |< d S rw   )r  )r  r  ru   r@   r@   rG   __setitem__  r  zScopedDict.__setitem__r  rj   c                 C  s   || j v p	|| jv S rw   r  r  r@   r@   rG   __contains__  r   zScopedDict.__contains__Nr  Optional[ValType]c                 C  s"   || j v r
| j | S | j||S rw   )r  r  r  )r  r  r  r@   r@   rG   r    s   

zScopedDict.getre   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S r  )rJ   r  r  )r  r  r
  r@   r@   rG   r    s   


zScopedDict.__len__Iterator[KeyType]c                 c  s.    | j E d H  | jD ]
}|| j vr|V  q
d S rw   r  )r  r
  r@   r@   rG   __iter__  s   

zScopedDict.__iter__c                 C  s   t | jp| jS rw   )rj   r  r  r  r@   r@   rG   r    r   zScopedDict.__bool__c                 C  r  rw   r  r  r@   r@   rG   __delitem__  r  zScopedDict.__delitem__)r  r  )r  r  r>   r  )r  r  ru   r  r>   r^  )r  r  r>   rj   rw   )r  r  r  r  r>   r  r  )r>   r  r  )r  r  r>   r^  )r{   r|   r}   r~   ry  r`  r  r  r  r  r  r  r  r@   r@   r@   rG   r    s    






r  )Zfrozen_defaultry   Optional[type[Any]]r   c                 s"   d fdd}| d u r|S || S )Nry   r_   r>   c                   s(   t jdkrtj| d dS tj|  dS )N)r  rj  T)kw_onlyr   r   )r(  version_infodataclasses	dataclass)ry   r   r@   rG   wrap  s   
zir_dataclass.<locals>.wrap)ry   r_   r>   r_   r@   )ry   r   r  r@   r   rG   ir_dataclass  s   r  Optional[list[int]]c                  C  s&   t jj } | d ur| jr| jjS d S rw   )rB   r-  r.  r/  Zfw_metadataZbw_donated_idxs)r1  r@   r@   rG   get_donated_idxs  s   r  3Union[Sequence[BaseSchedulerNode], ExternKernelOut]r  	is_externc                   s   ddl m}m} ddlm} ddlm} |r4t| |sJ |jj	
|g    fdd| jD  d S t| ts;J | D ]#}|||fvr`|jd ur`|jj	
|g    fdd|jjD  q=d S )Nr)   r   )r1   r%  c                 3       | ]}|j  vr|j V  qd S rw   r  r  Zcurr_node_infor@   rG   r         
z:set_kernel_post_grad_provenance_tracing.<locals>.<genexpr>c                 3  r  rw   r  r  r  r@   rG   r   	  r  )Zcodegen.simd_kernel_featuresr!  r"  r  r1   r(  r&  rk   r   Z._inductor_triton_kernel_to_post_grad_node_info
setdefaultr  r  r  r[  )r  r  r  r!  r"  r1   r&  r  r@   r  rG   'set_kernel_post_grad_provenance_tracing  s0   
r  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r)   r  r  ru  N)r{   r|   r}   V0_NO_TRITONV1_COMPILERV2_BACKENDSZV3_BACKENDS_TUPLEV4_DICTr@   r@   r@   rG   r    s    r  c                  C  sT   t jdd u rtjS dd l} dd l} t| jj	drtj
S t| j	j	dr'tjS tjS )NrF  r   ZAttrsDescriptor)	importlibutil	find_specr  r  Ztriton.backends.compilerZtriton.compiler.compilerr   r  compilerr  r  r  )rF  r@   r@   rG   #get_triton_attrs_descriptor_version  s   r  c                   C  s   t  tjkS rw   )r  r  r  r@   r@   r@   rG   triton_version_uses_attrs_dict4  r  r  r4   c                 C  sF   ddl m} t| |jsdS t| jtjjr!tjj	j
| jjv r!dS dS )zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r)   r  FT)r  r  rk   r  r  rB   rF  rG  r   rJ  r  rK  r  r@   r@   rG   is_cudagraph_unsafe_op8  s   r  c                  C  sX   t jdd} t r*ddlm} | }|r*t j|dd}| r(t j	|| gn|} | S )NZLD_LIBRARY_PATHr  r   )get_runtime_pathr,  lib)
rG  rT  r  r\   r6  Zlibfb.py.parutilr  rH  r  pathsep)rH  r  Zruntime_pathZlib_pathr@   r@   rG   get_ld_library_pathK  s   r  c                 C  s    ddl m} t| |o| jd uS )Nr   )SubgraphPythonWrapperCodegen)Ztorch._inductor.codegen.wrapperr  rk   Zpartition_signatures)r  r  r@   r@   rG   #is_codegen_graph_partition_subgraphX  s   
r  c                 C  s8   ddl m} |jj| dr|jj| drtjS tjS )Nr)   r%  l        i   )	r(  r&  r)  r*  r
  Zstatically_known_geqrB   r  r  )r{  r&  r@   r@   rG   dtype_from_sizea  s   r  )r   r<   r   c                 C  $   | dkr
t jj S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r   r<   TF)rB   r   rp  Z_is_mkldnn_bf16_supportedr   r@   r@   rG   is_mkldnn_bf16_supportedo  
   r  c                 C  r  )z;
    Returns True if the device supports MKL-DNN FP16.
    r   r<   TF)rB   r   rp  Z_is_mkldnn_fp16_supportedr  r@   r@   rG   is_mkldnn_fp16_supported{  r  r  r  )rd   re   r>   re   )rh   ri   r>   rj   )r   r   )r   r   r   re   r   re   r>   r   r  )r   r   r>   r   )r   r   r>   ri   )r   r   r   r   r>   ri   )r   r   r>   r   )r   r   r  r   r>   r   )r  r  r>   r?   )r  r  r>   r  )r   r   r>   r$  )r  r-  r>   r.  )r1  r2  r>   rj   )r>  r(   r?  r@  r>   rj   )rE  r
   rq   rL  rM  rN  r>   rO  )r:   )r   r?   r>   r^  )r)   r:   )
r`  ra  rb  rc  r   re   r   r?   r>   r   )r@   rj  rj  rk  r:   )r`  ra  rb  rc  r   re   rl  re   rm  r   r   r?   r>   r   )rq  r
   rr  r?   r>   r^  )rq  r
   rv  r   r>   r^  )r   re   r   re   r>   re   )rF   rz  r{  re   r>   r|  )rF   r~  r>   r  )r   r  r>   r  )r  r  r>   r  )r  r  r  r  r>   r?   )r  r  r  r,   r>   r  rw   )r  r  r  r  r>   r  )rq   r  rM  r  r>   r  r  )r   ri   r>   r  )r  r?   r>   rj   )r  rY   r  re   r>   r  )r  rj   r>   rj   )r   r?   r>   r  )r  ri   r  r  r>   ri   )r   r
   r>   r  )rq   r
   r>   rj   )r\  r  r>   r  )r\  r  r>   r(   )r\  r  r>   r$  r  )rq  r
   r>   r
   )NNT)r@  rA  rB  rC  rD  rj   r>   rE  )r]  rc  r>   r^  )r+  r'   r]  rc  r>   r^  )r   rk  r>   re   r  r   )r  r  r>   rj   r  )r  re   r   r   r  r  r>   r*   )r  r3   r  r  r>   rj   )r  r?   r>   rj   )r  r3   r  rj   r  rj   r>   rj   )r  r2   r>   rj   )
r  r3   r7  re   r  re   r
  re   r>   rj   )r  r?   r>   rj   )r7  r  r  r  r
  r  r>   rj   )r7  r  r  r  r
  r  r>   r^  )r   r?   r>   r?   )r>   r)  )r  r3   r>   rj   )r  r3   rB  rC  rD  r2   r>   rj   )FTFN)r  r3   rB  r2   rD  r2   rH  rj   rE  rj   rI  rj   rJ  r  r>   rj   )r   r`  rq   r  rM  r  r>   ra  )r   ra  r>   rq  )r   r`  rq   r  rM  r  r>   r   )r   r`  rq   r  rM  r  r>   r?   )r   r`  rq   r  rM  r  r>   r~  )r  ra  r  ra  r>   rE  )r  ra  r  r  r>   r
   )r  r?   r>   r^  )r>   rC  )r  rc  r>   rj   )r  r  r>   rj   )r  ri   r>   rk  )r  rj   rq   r
   rM  r
   r>   r  )r  r?   r>   rj   )r  r?   r>   re   )r  r  r>   rj   )
r  ra  r\  r&   r  rc  r  r?   r>   r^  )r  r  r>   rj   )r[  r  r1  r  r>   rj   )r[  r  r>   rj   )r  r8   r>   rj   )r[  r  r1  r  r>   rj   )r  r?   r  rN  r  rN  r>   r
   )r  r8   r  r  r  r  r  r  r  r  r>   r^  )r  re   r  re   r>   re   )r  r  r>   re   )r  r   r>   rE  )r   r?   r>   r?   )r   rC  r>   rj   )r   r?   r>   rj   )r   rk  r>   rj   )r  r2  r  rC  r  rk  r  rk  r  r?   r  rj   r>   rj   )r  r  r>   r^  )r   rQ  r>   rj   )r+  rQ  r>   rj   )r>   r  )r   r`  rq   r  rM  r  r>   r3  )r  r>  r>   r?  )r`  rB  rC  r|  rD  rE  r>   rB  )rF   rQ  r>   rQ  )rF  rG  rQ  r|  rR  rS  r>   rT  )r  r>  rW  r|  r>   r|  )r   ri   r>   rj   )rb  rc  r[  r7   r>   r^  )r   rk  r>   r?   )r   r?   r>   rk  )r  rQ  ru   rQ  r>   rj   )r>   rr  )r   r?   r  r%   r  r  r>   r^  )r   r?   r>   r^  )r   rk  r>   rk  )ry   r  r   rj   r>   r
   )r>   r  r  )r  r  r  r?   r  rj   r>   r^  )r>   r  )r[  r4   r>   rj   )r  r,   r>   rj   )r{  re   r>   rk  )r   r?   r>   rj   (S  
__future__r   r  r  r  enumr   r  r  r  r  r5  r  r   rG  r  rm  rX  r   r(  rP  r  rd  r  collections.abcr   r   r   r   r   r   r	   typingr
   r   r   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   r   r   r   r   rl   rB   Ztorch._inductor.runtime.hintsr   Ztorch.utils._ordered_setr   Ztorch.utils._pytreer   ZOPTIMUS_EXCLUDE_POST_GRADr   r    r!   r"   r#   r$   Ztorch._prims_commonr%   Ztorch.fxr&   r  r'   Ztorch.fx.noder(   r  r*   r  r,   r)  r.   r  r/   r0   r1   r2   r3   r4   r5   Zoutput_coder7   r  r8   r9   rI   r=   r   rL   Ztorch._dynamo.device_interfacerM   Ztorch._dynamo.utilsrN   Ztorch.autogradrO   Ztorch.autograd.profiler_utilrP   Z(torch.fx.passes.graph_transform_observerrQ   Ztorch.fx.passes.shape_proprR   Ztorch.utils._sympy.functionsrS   rT   rU   rV   rW   Ztorch.utils._sympy.symbolrX   rY   Ztorch.utils._sympy.value_rangesrZ   r[   r  r\   Zruntime.runtime_utilsr]   r  Z_IS_WINDOWS	getLoggerr{   r   r_   rS  r  Z	VarRangesrW  re   Z	InputTypeZGPU_KERNEL_BIN_EXTSr)  rV  r  r  rf   rg   rp   Functionrr   r  r   r   r   r   r   r   r   r   r  r#  r,  r0  r9  rB  r]  r   ri  rp  ru  rw  rx  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r'  r5  r8  r9  r   r>  r?  r  r\  Zclear_on_fresh_inductor_cacheZclear_inductor_cachesZfresh_inductor_cacherb  rj  	lru_cacherm  rn  rr  rt  r  r  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r$  r"  r  r  r&  r(  r1  r7  r<  r>  r@  rA  rG  rF  rW  rX  rl  rp  rt  rz  r|  r}  r  r  r  r  r  r  r  r  r  r  rV  r  r  r  r  r  r  r  rW  rT  r  Enumr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r*  r,  r2  r=  rA  rL  rP  rH  rX  rZ  r_  rc  re  rh  r  rj  compilerg  ri  rl  ro  rq  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  ZSUPPORTED_MKLDNN_DEVICESr  r  r@   r@   r@   rG   <module>   s~   4 $	


$
HV&
		$;/;8$  
) 
6.

@
	,	!
	
$$		'	


$
0
$
	