o
    hH                    @  s
  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dl
mZmZmZmZmZmZmZmZmZ d dlmZmZm Z  d d	l!m"Z" d dl#Z#d d
l#m$Z$m%Z%m&Z& d dl'm(  m)  m*Z+ d dl,m-  m.Z/ d dl0Z1d dl2Z1d dl3m.  m4Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZP d dlQmRZR d dlSmTZTmUZUmVZV d dlWmXZX ddlYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z` ddl[maZambZbmcZcmdZdmeZe ddlfmgZg ddlhmiZimjZjmkZkmlZl ddlmmnZn ddlompZpmqZq ddl.mrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZmZ ddlmZmZmZ erd dlmZ d d lmZ dd!lmZ dd"lmZ dd#l.mZ neZd$ed%< zd dlZejZd&ZW n ey   dZd'ZY nw ed(Zed)Zed*Zeee$f Zd$ed+< eeee$f Zd$ed,< eeZejejd-d.Ze1jjZ	 eed/eed/f d0d1eeeeeed/f d/d0d1f   f Zd$ed2< dd7d8Zejd&d9G d:d; d;Zdd?d@ZddDdEZddIdJZddKdLZddRdSZddWdXZg dYZg dZZ	ddd_d`ZddbdcZ	dddddeZedddhdiZedddkdiZ	&dddndiZddrdsZddvdwZddydzZdd{d|ZdddZdddZdddZdddZdddZG dd1 d1Ze~d'd9G dd dZe~G dd deZĐdddZe~G dd deăZe~G dd deƃZededededededdZded< 	&ddddZe~G dd deăZeee$ ee$ gef ZG dd deʃZG dd dẽZG dd dẽZe~G dd deăZe~G dd deσZe~G dd deăZѐdddĄZҐdddƄZ	&	'		'	ddddτZԐdddфZՐdddԄZe~G ddք deZe~G dd؄ de׃Ze~G ddڄ de׃Ze~G dd܄ de׃Ze~G ddބ de׃Ze~G dd deۃZe~G dd de׃Ze~G dd de׃ZG dd de܃Ze~G dd deZe~G dd deZe~G dd deZdddZdddZG dd dZe~G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZe~G d d deZG dd deZe~d'd9G dd dee^Ze~d'd9G dd deeÃZG dd	 d	eZG d
d deZG dd deZe~G dd deZe~G dd deZe~d'd9G dd deZG dd deZG dd deZeeeeeeeeeeef  f ZG dd dZG dd deZG dd deZG dd deZG d d! d!eZe~d'd9G d"d# d#eZ G d$d% d%e ZG d&d' d'eZe~d'd9G d(d) d)e Ze~d'd9G d*d+ d+eZG d,d- d-eZG d.d/ d/eZG d0d1 d1eZG d2d3 d3eZG d4d5 d5eZ	G d6d7 d7eZ
G d8d9 d9eZG d:d; d;eZG d<d= d=eZG d>d? d?eZG d@dA dAeZG dBdC dCeZG dDdE dEeZG dFdG dGeZG dHdI dIeZG dJdK dKeZG dLdM dMeZG dNdO dOeZe~d'd9G dPdQ dQZG dRdS dSeZe~d'd9G dTdU dUeZe~G dVdW dWeZG dXdY dYeZejG dZd[ d[eZG d\d/ d/eZG d]d^ d^eZe~d'd9G d_d` d`eZddcddZ e~d'd9G dedf dfeZ!e~d'd9G dgdh dheZ"ddldmZ#e~d'd9G dndo doeZ$G dpdq dqeZ%G drds dseZ&e~G dtdu due&Z'e~G dvdw dwe&Z(G dxdy dyeZ)G dzd{ d{e)Z*dd~dZ+dddZ,dS (      )annotationsN)	GeneratorIterableSequence)AbstractContextManagernullcontext)Enum)partial)	AnyCallableClassVarLiteralOptionaloverloadTYPE_CHECKINGTypeVarUnion)assert_neverNever	TypeAlias)patch)ExprIntegerSymbol)identity)GraphModuleSerializer)can_auto_functionalize)metrics)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_for
StrideType)get_schema_info)
&_remove_effect_token_unbacked_bindingscompute_unbacked_bindingsfree_symbolsfree_unbacked_symbolsIterateExprsrebind_unbackedresolve_unbacked_bindingsShapeEnvstatically_known_trueSymTypes
OrderedSet)CleanDivFloorDivModularIndexing)SymT   )configdependencies)BackendFeatureCodegenSymbolget_scheduling_for_deviceindex_prevent_reordering)Depextract_free_symbols#extract_input_node_reduction_rangesextract_read_writesvar_builder)LoopBody)OpCounterCSEOpCountResultReductionType	StoreMode)benchmarker)DevicePropertiesReductionHint)argsortargsort_symcache_on_selfceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningdo_bench_using_profilingdtype_from_sizeget_dtype_sizeget_kernel_metadataGPU_ALIGN_BYTESir_dataclass
is_dynamicis_gpu	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_substensor_is_aligned)opsOpsValueV)FakeScriptObject)Node)CUDATemplate)GraphLowering)IndentedBufferr   rb   TF_T_U_V_IntLike_NumLikez  prefix	TensorBoxr   IRNode_NodeOrNodesxobjectreturnboolc                 C  s   t | ttfS N)
isinstanceintr   ro    rw   X/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/torch/_inductor/ir.py
_is_static      ry   )frozenc                   @  s>   e Zd ZU ded< ded< ded< ded< d	ed
< ded< dS )GraphPartitionSignatureOrderedSet[sympy.Symbol]Zsymbol_inputsz5dict[str, Union[IRNode, sympy.Expr, TorchBindObject]]input_nodeslist[IRNode]Zoutput_nodeszdict[str, bool]Zinput_deallocationrr   Zskip_cudagraphz	list[str]Zconstant_namesN__name__
__module____qualname____annotations__rw   rw   rw   rx   r|      s   
 r|   node_or_nodesOptional[_NodeOrNodes]Nonec                   s   d fdd  |  d S )Nnodesr   rq   r   c                   s   | d u rd S t | ttfr| D ]} | qd S t | tr*|  D ]} | q!d S t | ttttt	j
jjttttf	sEJ dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])rt   listtupledictvalues
ExpandViewDynamicScalarAssertScalarrl   sympylogicboolalgBooleanr   ru   EffectfulKernelShapeAsConstantBuffertype)r   node_check_tensorboxrw   rx   r      s2   


z%validate_ir.<locals>._check_tensorbox)r   r   rq   r   rw   )r   rw   r   rx   validate_ir   s   r   namestrCallable[..., OpsValue]c                   s    t  tsJ d fdd}|S )	Nargsrp   kwargsrq   r^   c                    s   t t | i |S rs   )getattrr]   r   r   r   rw   rx   fn     zops_wrapper.<locals>.fn)r   rp   r   rp   rq   r^   )rt   r   )r   r   rw   r   rx   ops_wrapper   s   r   orderSequence[int]&Callable[[Sequence[_T]], Sequence[_T]]c                   s(   t t| tt|  d fdd}|S )NindexSequence[_T]rq   c                   0   t  t ks
J  fddtt  D S )Nc                      g | ]} |  qS rw   rw   .0i)r   	inv_orderrw   rx   
<listcomp>      z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenranger   r   r   rx   reindex
     z inverse_reorder.<locals>.reindexr   r   rq   r   )r   zipr   r   r   r   rw   r   rx   inverse_reorder  s   r   c                   s   d fdd}|S )Nr   r   rq   c                   r   )Nc                   r   rw   rw   r   )r   r   rw   rx   r     r   z1same_reorder.<locals>.reindex.<locals>.<listcomp>r   r   r   r   rx   r     r   zsame_reorder.<locals>.reindexr   rw   r   rw   r   rx   same_reorder  s   r   reindex1&Callable[[Sequence[_U]], Sequence[_V]]reindex2&Callable[[Sequence[_T]], Sequence[_U]]&Callable[[Sequence[_T]], Sequence[_V]]c                   s   d fdd}|S )Nr   r   rq   Sequence[_V]c                       | S rs   rw   r   r   r   rw   rx   r        z fuse_reindexing.<locals>.reindex)r   r   rq   r   rw   )r   r   r   rw   r   rx   fuse_reindexing  s   r   r(   unbacked_onlyr}   c                 C  s   |rt | S t| S rs   )r'   r&   )ro   r   rw   rw   rx   get_free_symbols#  s   r   )   r      r4   )   r   r   r   r4   seq(Sequence[Union[int, torch.SymInt, Expr]]	shape_envOptional[ShapeEnv]c                 C  s"   |du r
t | }|S t|| }|S )z1
    Convert strides to fill order (argsort)
    N)rH   rI   )r   r   
sorted_idxrw   rw   rx   get_fill_order.  s
   
r   Sequence[Union[int, Integer]]c                   s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S     i | ]\}}||qS rw   rw   r   idxposrw   rw   rx   
<dictcomp>C      z+stride_order2fill_order.<locals>.<dictcomp>c                      g | ]} | qS rw   rw   r   lookuprw   rx   r   D      z+stride_order2fill_order.<locals>.<listcomp>)	enumerater   r   )r   
fill_orderrw   r   rx   stride_order2fill_order<  s   r   c                 C  s>   t | |}dd tt| D }t|D ]\}}|||< q|S )z)
    Convert strides to stride order
    c                 S     g | ]}d qS r   rw   r   _rw   rw   rx   r   O      z$get_stride_order.<locals>.<listcomp>)r   r   r   r   )r   r   r   outr   elemrw   rw   rx   get_stride_orderH  s
   

r   Literal[None]guard_shapec                 C     d S rs   rw   ro   r   rw   rw   rx   ir_node_to_tensorU     r   torch.Tensorc                 C  r   rs   rw   r   rw   rw   rx   r   Y  r   Optional[IRNode]Optional[torch.Tensor]c                   s   | d u rd S |st jjj nt  fdd|  D }t| r, fdd|  jD }nt	
|}|  }|  }t|}t|}t jjj  tj||||d }W d    |S 1 s_w   Y  |S )Nc                      g | ]} |qS rw   rw   r   sZshape_fnrw   rx   r   h  r   z%ir_node_to_tensor.<locals>.<listcomp>c                   r   rw   rw   r   r   rw   rx   r   k  r   )sizestridedtypedevice)r_   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   FlexibleLayoutcontiguous_strides	get_dtype
get_devicerM   r   Zsuppress_guardstorchempty_stridedZzero_)ro   r   r   r   r   r   trw   r   rx   r   ]  s.   

valueOptional[Sequence[_T]] Optional[Sequence[Optional[_T]]]c                 C  s   t | tr
| s
d gS | S rs   )rt   r   r	  rw   rw   rx   may_convert_to_optionaly  s   r  2Union[IRNode, OutputSpec, torch.device, None, str]Optional[str]c                 C  sb   t | ts	| d u r| S t | tjr| jS t | ttfr!t|  S t	d|  dt| j
 d d S )Nzget_device_type(: ))rt   r   r  r   r   rm   
OutputSpecget_device_typer  r   r   rv   rw   rw   rx   r    s    r  &Union[IRNode, torch.device, None, str]c                 C  sl   t | }|dv rtt| ddkrdS dS |d u s"t| }d u r$dS ddlm} t|ts1J t||S )N)cpucudaZ_backendtritonTFr4   )TritonScheduling)	r  r   r5   r9   Zcodegen.tritonr  rt   r   
issubclass)ro   r   Zdevice_schedulingr  rw   rw   rx   	is_triton  s   
r  c                 C  s   t | dkS )Nr  )r  rv   rw   rw   rx   is_cpu  r   r  Union[Buffer, TensorBox]	alignmentru   c                   s~   t tr d u rdS t fddtt d D }tjj	
 d dkp:tjj	
 d dk}|o>|S )NFc                 3  s.    | ]}t jj |   d kV  qdS )r   N)r_   r   r   size_hint_or_throw
get_strider   r  ro   rw   rx   	<genexpr>  s
    
z-is_aligned_realized_tensor.<locals>.<genexpr>r4   )rt   rm   maybe_get_strideallr   r   r  r_   r   r   r  r   )ro   r  Zaligned_stridesZaligned_last_dimrw   r   rx   is_aligned_realized_tensor  s   r%  strides1Sequence[_IntLike]strides2shapec                 C  s   t |t | krt | t |ksJ t|| |D ]'\}}}tjj|dr&qtjj||s?tjj|tjj|ks? dS qdS )zP
    Returns true if the strides are equal, ignoring dimensions of size 1 .
    r4   FT)r   r   r_   r   r   statically_known_leqstatically_known_equalssymbolic_hint)r&  r(  r)  dims1s2rw   rw   rx   significant_strides_equal  s   $r0  tensorUnion[TensorBox, BaseView]strides"Sequence[Union[int, torch.SymInt]]c                 C  s   t | s| S tdd t||  D r| S t||  |  s"| S t| \}}g |j}t|  D ]\}}t	j
j|drE|| ||< q3t|j|j|j||j}tt||dS )a  
    Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
    dimensions - size 0 or 1 - will be updated.

    If there are real stride differences (NHWC vs NCHW), or the tensor is not realized, then the input will be returned
    c                 s  $    | ]\}}t jj||V  qd S rs   r_   r   r   r+  r   r.  r/  rw   rw   rx   r!    
    
z2try_match_insignificant_strides.<locals>.<genexpr>r4   datalayout)r   r$  r   r  r0  r   as_storage_and_layoutr   r   r_   r   r   r*  FixedLayoutr   r   r   offsetrl   ReinterpretView)r1  r3  storage
old_layout
new_strider   r   
new_layoutrw   rw   rx   try_match_insignificant_strides  s,   

rD  gmtorch.fx.GraphModulec                 C  sD   | j jddd }dd t|jD |jd< ddlm} ||  d S )Noutput)opr   c                 S  s   g | ]\}}|qS rw   rw   )r   r   r   rw   rw   rx   r     s    z.gm_original_output_strides.<locals>.<listcomp>Zuser_visible_output_idxs)record_original_output_strides)r   Z
find_nodesr   r   metaZtorch._inductor.compile_fxrI  )rE  output_noderI  rw   rw   rx   gm_original_output_strides  s   rL  inputslist[Buffer]
list[Expr]c                 C  s@   t  }| D ]}|t| ddO }|t| ddO }qt|S )NFr   )r/   r   r   r  r   )rM  Zsym_varsinprw   rw   rx   get_symbolic_inputs  s
   rR  c                   @  s  e Zd ZU e Zded< ejddZded< ejddZ	ded< ejddZ
d	ed
< eejdddZdddZdddZdddZdddZdddZdd!d"Zddd'd(Z	#ddd,d-Zdd/d0Zdd2d3Zdd5d6Zdd8d9Zdd;d<Zdd>d?Zdd@dAZddCdDZddFdGZe ddIdJZ!ddLdMZ"ddNdOZ#ddQdRZ$dddVdWZ%ddYdZZ&dd\d]Z'dd^d_Z(ddadbZ)ddddeZ*ddgdhZ+ddidjZ,ddkdlZ-ddmdnZ.ddodpZ/dddsdtZ0ddwdxZ1ddydzZ2dd{d|Z3dd}d~Z4	ddddZ5dddZ6dddZ7	ddddZ8dddZ9dddZ:dddZ;dddZ<	ddddZ=dddZ>dddZ?dddZ@dddZAdddZBdddZCdddZDdddZEeFr[e dddZGdSS dSS )rm   zClassVar[OrderedSet[Any]]_current_originsF)initOrderedSet[Any]originsOptional[list[str]]	tracebackOptional[torch.fx.Node]origin_nodeOrderedSet[Node]rq   Generator[None, None, None]c                 c  s.    t j}|| B t _z	d V  W |t _d S |t _w rs   )rm   rS  )rV  oldrw   rw   rx   current_origins  s   
zIRNode.current_originsattrr   r	  r
   r   c                 C  s   t | || d S rs   )rp   __setattr__)selfr_  r	  rw   rw   rx   _post_init_setattr  s   zIRNode._post_init_setattrc                 C  s<   |  dt| j |  dtjrt nd  |  dd  d S )NrV  rX  rZ  )rb  r/   rS  r5   Zdebug_ir_tracebackrX  format_stackra  rw   rw   rx   __post_init__  s
   zIRNode.__post_init__OrderedSet[str]c                 C     t dd |  D S )Nc                 s      | ]}|j V  qd S rs   r   r   deprw   rw   rx   r!  '      z(IRNode.get_read_names.<locals>.<genexpr>r/   	get_readsrd  rw   rw   rx   get_read_names&     zIRNode.get_read_namesc                 C     | j S rs   )rX  rd  rw   rw   rx   get_traceback)     zIRNode.get_tracebackc                 C  rp  rs   rZ  rd  rw   rw   rx   get_origin_node,  rr  zIRNode.get_origin_nodeOptional[Operation]c                 C  r   rs   rw   rd  rw   rw   rx   get_defining_op/  r   zIRNode.get_defining_opTshortenrr   Sequence[str]c                 C  s:   dt | dd }|rt|dkr|d d  d}|gS )Nzorigins=rV   @   =   z...)r   r   )ra  rw  rV  rw   rw   rx   common_repr2  s   zIRNode.common_reprlinesSequence[object]	multilinec                 C  sb   t |t | | }t tt|}|r&td|}t| j d| dS t| j d| dS )Nz,
z(
z
)(r  )r   r|  mapr   indentjoinr   r   )ra  r}  rw  r  	new_linesrw   rw   rx   
str_helper9  s   zIRNode.str_helpertorch.dtypec                 C  rp  rs   r   rd  rw   rw   rx   r  D  rr  zIRNode.get_dtypeOptional[torch.dtype]c                 C      z|   W S  ty   Y d S w rs   )r  NotImplementedErrorrd  rw   rw   rx   maybe_get_dtypeG  
   
zIRNode.maybe_get_dtypeLayoutc                 C     t dt|  d)Nz#get_layout() is not implemented by !r  r   rd  rw   rw   rx   r  M  r   zIRNode.get_layoutOptional[Layout]c                 C  r  rs   )r  r  rd  rw   rw   rx   maybe_get_layoutP  r  zIRNode.maybe_get_layoutr  c                 C     |   S rs   )r  rd  rw   rw   rx   get_output_specV     zIRNode.get_output_specOptional[OutputSpec]c                 C  r  rs   )r  r  rd  rw   rw   rx   maybe_get_output_specY  r  zIRNode.maybe_get_output_specc                 C  s   t |  tS )z4True for single tensor output (excludes MultiOutput))rt   r  r  rd  rw   rw   rx   has_tensor_output_  s   zIRNode.has_tensor_outputSequence[Expr]c                 C  r  )Nz!get_size() is not implemented by r  r  rd  rw   rw   rx   r   c  r   zIRNode.get_sizeOptional[Sequence[_IntLike]]c                 C  r  rs   )r   r  rd  rw   rw   rx   maybe_get_sizef  r  zIRNode.maybe_get_size.Union[_IntLike, sympy.Rel, Sequence[_IntLike]]c                 C  r  rs   r   rd  rw   rw   rx   r)  l     zIRNode.shaper   c                 C     t |  S rs   )rZ   r   rd  rw   rw   rx   	get_numelp  r   zIRNode.get_numelc                 C     t jjt|  dS Nr   r_   r   r   r,   r   Eqr  rd  rw   rw   rx   is_zero_elementss     zIRNode.is_zero_elementsr  c                 C     t dt|  )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on r  rd  rw   rw   rx   realizev  s   zIRNode.realizeNwriterOptional[IndentedBuffer]c                 C  r  )Nzcodegen_reference NYI on r  ra  r  rw   rw   rx   codegen_reference     zIRNode.codegen_referenceOptional[torch.device]c                 C  r   rs   rw   rd  rw   rw   rx   r    r   zIRNode.get_devicetorch.devicec                 C  s   |   }|d us
J |S rs   r  ra  r   rw   rw   rx   get_device_or_error  s   zIRNode.get_device_or_errorc                 C     dS NFrw   rd  rw   rw   rx   has_exceeded_max_reads  r   zIRNode.has_exceeded_max_reads$Callable[[Sequence[Expr]], OpsValue]c                 C     t t| jrs   r  r   r   rd  rw   rw   rx   make_loader  rz   zIRNode.make_loader Callable[[Sequence[Expr]], Expr]c                 C  r  rs   r  rd  rw   rw   rx   make_indexer  rz   zIRNode.make_indexerr'  c                 C  r  rs   r  rd  rw   rw   rx   r    rz   zIRNode.get_stridec                 C  r  rs   )r  r  rd  rw   rw   rx   r#    r  zIRNode.maybe_get_stridec                 C  r  rs   r  rd  rw   rw   rx   get_name  rz   zIRNode.get_namec                 C  r  rs   )r  r  rd  rw   rw   rx   maybe_get_name  r  zIRNode.maybe_get_namec                 C  s(   z	|   tjjv W S  ty   Y dS w r  )r  r_   r   graph_inputsr  rd  rw   rw   rx   is_input_buffer  s
   zIRNode.is_input_buffer	thresholdOptional[int]c                 C  r  r  rw   ra  r  rw   rw   rx   has_large_inner_fn  r   zIRNode.has_large_inner_fnusersru   c                 C  r   rs   rw   ra  r  rw   rw   rx   
mark_reuse  r   zIRNode.mark_reusec                 C  r   rs   rw   rd  rw   rw   rx   realize_hint  r   zIRNode.realize_hintc                 C  r  rs   r  rd  rw   rw   rx   unwrap_view  rz   zIRNode.unwrap_viewc                 C  r  rs   r  rd  rw   rw   rx   freeze_layout  rz   zIRNode.freeze_layoutr   	list[int]allow_paddingc                 C  r  rs   r  ra  r   r  rw   rw   rx   freeze_layout_with_stride_order     z&IRNode.freeze_layout_with_stride_orderc                 C  r  rs   r  ra  r   rw   rw   rx   freeze_layout_with_fill_order  rz   z$IRNode.freeze_layout_with_fill_orderr   list[_IntLike]c                 C  r  rs   r  ra  r   rw   rw   rx   freeze_layout_with_same_order  rz   z$IRNode.freeze_layout_with_same_orderexact_stridesc                 C  r  rs   r  ra  r  r  rw   rw   rx    freeze_layout_with_exact_strides  r  z'IRNode.freeze_layout_with_exact_stridesdependencies.ReadWritesc                 C  r  rs   r  rd  rw   rw   rx   get_read_writes  rz   zIRNode.get_read_writesOrderedSet[Dep]c                 C  
   |   jS rs   r  readsrd  rw   rw   rx   rm       
zIRNode.get_readsc                 C  r  rs   )r   rm  rd  rw   rw   rx   	num_reads  r   zIRNode.num_readsrh   c                 C  r  rs   r  rd  rw   rw   rx   get_storage_numel  rz   zIRNode.get_storage_numelr   r}   c                 C  r  rs   r  ra  r   rw   rw   rx   get_free_symbol_uses  r  zIRNode.get_free_symbol_usesc                 C  r  rs   r  rd  rw   rw   rx   get_reduction_type  rz   zIRNode.get_reduction_typeSequence[sympy.Expr]c                 C  r  rs   r  rd  rw   rw   rx   get_reduction_size  rz   zIRNode.get_reduction_sizec                 C  r  r  rw   rd  rw   rw   rx   	is_extern  r   zIRNode.is_externc                 C  r  r  rw   rd  rw   rw   rx   is_no_op  r   zIRNode.is_no_opr   c                 C  r  rs   r  r  rw   rw   rx   constant_to_device  rz   zIRNode.constant_to_devicec                 C  r  rs   r  rd  rw   rw   rx   get_mutation_names  rz   zIRNode.get_mutation_namesc                 C  r  rs   r  rd  rw   rw   rx   get_operation_name  rz   zIRNode.get_operation_namec                 C  r  rs   r  rd  rw   rw   rx   get_inputs_that_alias_output  rz   z#IRNode.get_inputs_that_alias_outputc                 C  r   rs   rw   rd  rw   rw   rx   r     r   zIRNode.dtype)rV  r[  rq   r\  )r_  r   r	  r
   rq   r   rq   r   rq   rf  )rq   rW  rq   rY  rq   ru  T)rw  rr   rq   rx  )TT)r}  r~  rw  rr   r  rr   rq   r   rq   r  )rq   r  rq   r  )rq   r  rq   r  )rq   r  rq   rr   rq   r  )rq   r  )rq   r  rq   r   rq   r  rs   r  r  rq   r   rq   r  rq   r  rq   r  rq   r  rq   r'  rq   r   r  r  rq   rr   r  ru   rq   r   rq   rm   Fr   r  r  rr   rq   r   r   r  rq   r   r   r  rq   r   r  r  r  rr   rq   r   rq   r  rq   r  rq   ru   rq   rh   r   rr   rq   r}   rq   r  r   r  rq   rm   rq   rx  )Hr   r   r   r/   rS  r   dataclassesfieldrV  rX  rZ  staticmethod
contextlibcontextmanagerr^  rb  re  rn  rq  rt  rv  r|  r  r  r  r  r  r  r  r  r   r  propertyr)  r  r  r  r  r  r  r  r  r  r  r#  r  r  r  r  r  r  r  r  r  r  r  r  r  rm  r  r  r  r  r  r  r  r  r  r  r  r   r   rw   rw   rw   rx   rm     s   
 













































c                   @  s   e Zd Zd0ddZd1ddZd2d	d
Zd3ddZd4ddZd5ddZd5ddZ	d6ddZ
d7ddZd8ddZd9d d!Zd:d#d$Zd;d&d'Z	(d<d=d*d+Zd>d-d.Zd/S )?	Operationrq   r   c                 C  s
   d | _ d S rs   Zoperation_namerd  rw   rw   rx   re    r  zOperation.__post_init__r  c                 C     t rs   r  rd  rw   rw   rx   r    r   zOperation.get_devicerY  c                 C     t | dsJ | jS NrZ  )hasattrrZ  rd  rw   rw   rx   rt  
     zOperation.get_origin_noderU  c                 C  r  )NrV  )r  rV  rd  rw   rw   rx   get_origins  r  zOperation.get_originsr   c                 C  s   | j d usJ | j S rs   r  rd  rw   rw   rx   r    r  zOperation.get_operation_namerr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zOperation.is_externc                 C  r  r  rw   rd  rw   rw   rx   r    r   zOperation.is_no_opr  c                 C  r  rs   r  rd  rw   rw   rx   r    r   zOperation.get_read_writesr   c                 C  s   ||   v S rs   )rn  )ra  r   rw   rw   rx   
is_user_of  r   zOperation.is_user_ofrf  c                 C  rg  )Nc                 s  rh  rs   r   ri  rw   rw   rx   r!  #  rk  z+Operation.get_read_names.<locals>.<genexpr>rl  rd  rw   rw   rx   rn  "  ro  zOperation.get_read_namesr  c                 C  r  rs   r  rd  rw   rw   rx   rm  %  r  zOperation.get_readsrN  c                 C  r  rs   r  rd  rw   rw   rx   get_outputs(  r   zOperation.get_outputsr}   c                 C     t  S rs   r.   rd  rw   rw   rx   get_unbacked_symbol_defs+  rr  z"Operation.get_unbacked_symbol_defsFr   c                 C  r  )a  
        When unbacked_only=True:
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.

        When unbacked_only=False:
        Similar to `unbacked_only=True` but including all free symbols
        instead of only free unbacked symbols.
        r.   r  rw   rw   rx   r  .  s   zOperation.get_free_symbol_usesru   c                 C  r  )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   rw   rd  rw   rw   rx   get_workspace_sizeF  s   zOperation.get_workspace_sizeNr  r  r  )rq   rU  r  r  r  )r   r   rq   rr   r  r  rq   rN  rq   r}   r  r
  r  )r   r   r   re  r  rt  r  r  r  r  r  r  rn  rm  r  r   r  r!  rw   rw   rw   rx   r    s"    












r  c                      s*  e Zd ZU ded< ded< ded< ded< 		dQdRddZdSddZdT fddZdUddZeZdVddZ	dWddZ
dXd!d"ZdXd#d$ZedYd)d*ZeejfdZd-d.Zed[d0d1Zd\d3d4ZedUd5d6Zd]d^d:d;ZdQd_d=d>Zd`d@dAZdadCdDZdbdFdGZdcdIdJZdddLdMZdedOdPZ  ZS )fLoopsr  r   r  r   Callable[..., Any]inner_fnr'  rangesFr   rr   rq   r}   c                   s,   t  jg  fdd| jD |  R  S )Nc                 3      | ]}t | V  qd S rs   r   r   erP  rw   rx   r!  Y      z-Loops.get_free_symbol_uses.<locals>.<genexpr>)r/   unionr'  inner_fn_free_symbolsr  rw   rP  rx   r  U  s
   zLoops.get_free_symbol_usesnamesrx  r   c                   sF     d jj dt j  g fdd|D  d jg S )N'c                   s    g | ]}| d t  | qS =)r   )r   r   rd  rw   rx   r   d  s     z!Loops._to_str.<locals>.<listcomp>origin_node=)r  r   r   r   r   inner_fn_strrZ  )ra  r/  rw   rd  rx   _to_str]  s   zLoops._to_strr   c                   s   t    d S rs   )superre  rd  	__class__rw   rx   re  h  rz   zLoops.__post_init__c                 C  
   |  dS )Nr'  r5  rd  rw   rw   rx   __str__k  r  zLoops.__str__r  c                 C  rp  rs   r   rd  rw   rw   rx   r  p  rr  zLoops.get_devicerY  c                 C  rp  rs   rs  rd  rw   rw   rx   rt  s  rr  zLoops.get_origin_noder  c                 C  rp  rs   r:  rd  rw   rw   rx   r   v  rr  zLoops.get_sizec                 C  rp  rs   r:  rd  rw   rw   rx   get_pointwise_sizey  rr  zLoops.get_pointwise_sizer   r
   r   rl   c                 O  sN   | dd }| dd }| |i |}|d| |d|p |j t|S )NrZ  rX  )poprb  rX  rl   create)clsr   r   rZ  tbrrw   rw   rx   r@  |  s   
zLoops.createrk   r3   c                   s    fddt | D S )Nc                   s*   g | ]\}}|d krt jjnt |qS r4   )r   SZerorY   )r   nr   rj   rw   rx   r         z Loops._index.<locals>.<listcomp>)r   )r'  rk   rw   rj   rx   _index  s   
zLoops._indexrB   c              	   C  s   t t }t|2 ttdd | j|    |	 W  d    W  d    S 1 s0w   Y  W d    d S 1 s@w   Y  d S Nallow_indexingT)
rA   r_   ZMockHandlerZset_ops_handlerr   rp   r  r&  inner_fn_argsgetvalue)ra  Z	opcounterrw   rw   rx   inner_fn_opcount  s   RzLoops.inner_fn_opcountSequence[Sequence[_IntLike]]c                 C  s   |  | jfS rs   )rI  r'  rd  rw   rw   rx   rL    rz   zLoops.inner_fn_argsc                 C  s   t jj| jg|  R  S rs   )r_   ZKernelFormatterHandlerZir_to_stringr&  rL  rd  rw   rw   rx   r4    s
   zLoops.inner_fn_strNr  r  c                 C  s&   |d u rd}t |tj}|  j|kS r  )maxr5   Zrealize_opcount_thresholdrN  Znum_opsr  rw   rw   rx   r    s   zLoops.has_large_inner_fnOrderedSet[Symbol]c                 C  s   |  | j}t| j||dS NrP  )rI  r'  r<   r&  )ra  r   r   rw   rw   rx   r.    s   zLoops.inner_fn_free_symbolsr  c                 C  sv   t tdd* |  r t|  |  |  jW  d    S t|  |  jW  d    S 1 s4w   Y  d S rJ  )	r   rp   r  r  r>   r  r   r  r  rd  rw   rw   rx   rm    s   $zLoops.get_readsrf  c                 C     t |  jS rs   )r/   rN  read_buffersrd  rw   rw   rx   rn    rz   zLoops.get_read_namesru   c                 C  rS  rs   )r   rN  rT  rd  rw   rw   rx   r    rz   zLoops.num_readsr  c                 C  r  )Nz+get_reduction_size() is not implemented by r  r  rd  rw   rw   rx   r       zLoops.get_reduction_sizer  c                 C  r  )Nz+get_reduction_type() is not implemented by r  r  rd  rw   rw   rx   r    rU  zLoops.get_reduction_typerm   c                 C  r  )Nz+constant_to_device() is not implemented by r  r  r  rw   rw   rx   r    rU  zLoops.constant_to_devicer  r
  )r/  rx  rq   r   r  r  r  r  r  )r   r
   r   r
   rq   rl   )r'  r'  rk   r3   rq   r  )rq   rB   rq   rO  rs   r  r   rr   rq   rQ  r  r  r  r  r  r  ) r   r   r   r   r  r5  re  r<  __repr__r  rt  r   r>  classmethodr@  r  r3   INDEXrI  rJ   rN  rL  r4  r  r.  rm  rn  r  r  r  r  __classcell__rw   rw   r7  rx   r$  N  s@   
 






	




r$  r   Union[Expr, Sequence[Expr]]r   r  r^   c                C  s"   |j rttd|S td|S )Nnanr   )is_floating_pointr]   constantfloat)r   r   rw   rw   rx   nop_loader_fn  s   ra  c                   @  s>   e Zd ZdddZdddZdd	d
ZdddZdddZdS )	Pointwiserq   r  c                 C  s   |   rtt| jdS | jS Nr  )r  r	   ra  r   r&  rd  rw   rw   rx   r    s   zPointwise.make_loaderr  c                 C  s   g S rs   rw   rd  rw   rw   rx   r    r   zPointwise.get_reduction_sizer  c                 C  r   rs   rw   rd  rw   rw   rx   r    r   zPointwise.get_reduction_typeoutput_nameindexer!Callable[[Sequence[Expr]], Never]varsr  r   c                 C  s"   |   }t|p	d||||S Nunnamed)r  r]   storera  rd  re  rg  loaderrw   rw   rx   store_output  s   zPointwise.store_outputr   r  rm   c                 C  s.   |   }ttd||}t|| j|| jdS FMove this to a given device. Requires that all reads are to constants.override_devicer   r   r&  r'  )r  r   rp   ConstantBufferrb  r   r'  ra  r   rl  rw   rw   rx   r    s
   zPointwise.constant_to_deviceNr  r  r  rd  r  re  rf  rg  r  rq   r   r  )r   r   r   r  r  r  rm  r  rw   rw   rw   rx   rb    s    



	rb  c                   @  s6   e Zd ZU ded< dZded< dd
dZdddZdS )Scatterr  output_indexerNrD   scatter_moder   r  rq   rm   c                 C  s6   |   }ttd||}t|| j|| j| j| jdS )ro  rp  )r   r   r&  r'  rv  rw  )	r  r   rp   rr  ru  r   r'  rv  rw  rs  rw   rw   rx   r    s   zScatter.constant_to_devicerd  r  re  rf  rg  r  r   c                 C  s6   |   }|d u r
d}tj||| |||| jdS )Nri  )mode)r  r]   rj  rv  rw  rk  rw   rw   rx   rm  
  s   zScatter.store_outputr  rt  )r   r   r   r   rw  r  rm  rw   rw   rw   rx   ru    s
   
 
ru  
logical_ormaximumminimummuladdZbitwise_xor)anyrP  minprodsumxor_sumz"dict[str, Callable[..., OpsValue]]REDUCTION_COMBINE_FNreduction_typearg_break_ties_leftCallable[..., object]c                   sR   t v rt  S dv rd fdd}|S d	kr"ddd}|S td )Nargmaxargminatuple[object, object]brq   tuple[OpsValue, OpsValue]c                   s   | \}}|\}}dkrt ||}nt ||}t ||}trCt ||}t ||}	t |t ||	}t |t ||	} rKt ||nt ||}
t |t ||
}t |||t |||fS )Nr  )	r]   ltgteqr    nery  logical_andwhere)r  r  Za_valueZa_indexZb_valueZb_indexmaskequalZa_isnanZb_isnanZtier  r   r  rw   rx   argmax_combine_fn-  s&   
z3get_reduction_combine_fn.<locals>.argmax_combine_fnwelford_combine#tuple[OpsValue, OpsValue, OpsValue]c                 S  sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS rs   rw   )r  r  Za_meanZa_m2Za_weightZb_meanZb_m2Zb_weightdeltaZ
new_weightZ	w2_over_wrw   rw   rx   welford_combine_fnN  s   


z4get_reduction_combine_fn.<locals>.welford_combine_fnzunknown reduction_type=)r  r  r  r  rq   r  )r  r  r  r  rq   r  )r  r  )r  r   r  r  r  rw   r  rx   get_reduction_combine_fn%  s   
r  c                      sT  e Zd ZU ded< ded< ded< ded< djddZeZdkdl fddZdmddZdnddZdod!d"Z	dpd$d%Z
dqd'd(Zdkdld)d*Zdrd.d/Ze	0dsdtd;d<Zedud?d@Zeejd0fdvdCdDZedwdGdHZedwdIdJZedxdNdOZedydQdRZe	0dsdzdWdXZed{d_d`Zed|dddeZe	0dsd}dfdgZed~dhdiZ  ZS )	Reductionr'  reduction_rangesrC   r  r  	src_dtyperG   reduction_hintrq   r   c                 C  r9  )N)r'  r  r  r;  rd  rw   rw   rx   r<  l  r  zReduction.__str__Fr   rr   rQ  c                   s(   t   t j fdd| jD  B S )Nc                 3  r(  rs   r)  r*  rP  rw   rx   r!  s  r,  z1Reduction.get_free_symbol_uses.<locals>.<genexpr>)r6  r  r/   r-  r  r  r7  rP  rx   r  q  s   zReduction.get_free_symbol_usesr  c                 C  rp  rs   )r  rd  rw   rw   rx   r  v  rr  zReduction.get_reduction_sizer  c                 C  rp  rs   )r  rd  rw   rw   rx   r  y  rr  zReduction.get_reduction_typerd  re  rf  rg  r  reduction_varsSequence[Symbol]r   c              	   C  s4   t | j| j| j| ||}t |pd|||S rh  )r]   	reductionr   r  r  r&  store_reduction)ra  rd  re  rg  r  r	  rw   rw   rx   r  |  s   
zReduction.store_reductionru   c                 C     t | jt | j S rs   )r   r'  r  rd  rw   rw   rx   index_length  r   zReduction.index_lengthSequence[Sequence[Expr]]c                 C  s$   |  | j}|  | jtj}||fS rs   )rI  r'  r  r3   R0_INDEX)ra  r   rindexrw   rw   rx   rL    s   zReduction.inner_fn_argsc                 C  s.   |  | j}|  | jtj}t| j|||dS rR  )rI  r'  r  r3   r  r<   r&  )ra  r   r   r  rw   rw   rx   r.    s
   
zReduction.inner_fn_free_symbolsr   r  rm   c              
   C  s>   |   }ttd||}t|| j|| j| j| j| j	t
jdS )ro  rp  r   r   r&  r'  r  r  r  r  )r  r   rp   rr  r  r   r'  r  r  r  rG   DEFAULTrs  rw   rw   rx   r    s   zReduction.constant_to_deviceN	dst_dtyper&  r   r'  %Union[ReductionType, Literal['scan']]reduction_numelr   
input_noder   tuple[ReductionHint, _IntLike]c	           "   
   C  s  t jj|}	t jjt|}
|dkp#t j| tj o#|dvo#tj	}t
|	r,t
|
s1tjdfS t| }|j}d}|rRtjt jj| dd}tjt jj| dd}nddd}|}|
dkr||	|
}|dkrktj|fS |d urt|trttdd t|\}}W d    n1 sw   Y  |d ur|d urt jjt|| }|	|krtd||||| tjdfS tj|fS |	|ks|
|d d krtjdfS t| |||||dkr|nd|tjd}ddd}||\}}|r||\}}t|dkrtjdfS t |! |" \\}}}d}d}|D ].}t jj#||}t jj$||t%|& } t'dd | D }!|!r;|d7 }q|d7 }q||krNtj||	|
fS tj(||	|
fS )Nscanr  r4       T)Zinner_reductionFreduction_numel_hintru   
numel_hintrq   c                 S  r  Nr4   rw   )r  r  rw   rw   rx   inner_reduction_splits     z4Reduction.num_splits.<locals>.inner_reduction_splitsrK  zUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %sr"  r   r  r  rC  r  tuple[Sequence[Expr], bool]c                   s   t d t|  |  |  d| d}| }|jd usJ dd |jD }g }d}t|jdd dD ]7 t	 fd	d
|D ri|
 j  jtjjv ritjj j }t|jdd }|  t|jdd |krid}q2||fS )Nr   r   r   r   r;  r:  c                 S  s&   g | ]}t |trt |tjs|qS rw   )rt   r   r   Numberr   rC  rw   rw   rx   r     s    
zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S  rp  rs   r   rv   rw   rw   rx   <lambda>      z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>keyc                 3  s    | ]	}| j jv V  qd S rs   )r   r&   r  mdrw   rx   r!         zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>r   T)ComputedBufferr  r  r  r   r  
range_varssortedr  r$  appendr   r   r_   r   Zname_to_bufferr   r;  decide_layout)rC  cbread_writesr  indiceschangedbufZoriginal_striderw   r  rx   get_read_indices	  s6   	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s  s    | ]}|d kV  qdS r4   Nrw   r   rw   rw   rx   r!  <      z'Reduction.num_splits.<locals>.<genexpr>)r  ru   r  ru   rq   ru   )rC  r  rq   r  ))r_   r   r   r,  rZ   has_featurer7   ZREDUCE_TO_SINGLE_ELEMENTr5   Zsplit_reductionsry   rG   r  rF   r@  Zmulti_processor_count	functoolsr	   choicesZreduction_split_factorZINNERrt   rl   r   rp   r  r=   logdebugr  r   r6   index_vars_squeezer   r  simplify_with_rangesstride_hintsr   keysr$  OUTER)"r   r  r  r&  r'  r  r  r  r  r  r  Zshould_splitpropsZnum_smZmin_elements_per_threadr  Zouter_reduction_splitssplit
new_rangesnew_reduction_rangesZextracted_numel_hintrC  r  r  r  r   r  Zranges1Z	num_outerZ	num_innerr   jr3  outerrw   rw   rx   
num_splits  s   	











!

zReduction.num_splits<Callable[[Sequence[_IntLike], Sequence[_IntLike]], OpsValue](Callable[[Sequence[_IntLike]], OpsValue]c                   sn   dd D t || d fdd|d	v r3td
d
t dfddfddS S )z1Convert inner_fn from a reduction to an pointwisec                 S     g | ]	}t jj|qS rw   )r_   r   r   Zevaluate_static_shaper   ro   rw   rw   rx   r   R  s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>r   r'  rq   r
   c                   s,   t  fddtjdd D  D S )Nc                 3  s    | ]} |V  qd S rs   rw   )r   r  )r   value_fnrw   rx   r!  [  s
    
z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S  s   g | ]}t |qS rw   )r   r  rw   rw   rx   r   ^  r   z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)r  reduce	itertoolsproductr   )
combine_fnr  r  r   rx   r   X  s   z*Reduction._unroll_reduction_fn.<locals>.fnr  r  Nr  r  c                   s*   dd |D }| |t  |tjfS )Nc                 S     g | ]}t |qS rw   )r   expandr   rw   rw   rx   r   o  r   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)r]   
index_exprr  int64r   r  )flatten_indexr&  rw   rx   r  l  s   z0Reduction._unroll_reduction_fn.<locals>.value_fnc                   s    | d S r  rw   r   )r   rw   rx   r  u  s    z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r   r'  rq   r
   )r   r'  r  r'  rq   r  )r  r=  r  r  r  )r&  r  r  r  rw   )r  r  r   r&  r  r  rx   _unroll_reduction_fnJ  s$   
	zReduction._unroll_reduction_fnr%  rl   c
                   s  t jjtdkrDd fdd}
|
d|
d|
d|
dd	 v s0J  d
d fdd}tj|||t|dS dkredv rTd fdd}ndfdd}tj| ||dS t	t
rt jjtjk rt|dkst|jrtj| | ||dS | | |||		\}}dfdd}||}|tjkr|}|dkr|	d usJ t|	\}}|d usJ |d usJ | | |||||
S |dkr| | |||||	
S tt| |||dS )Nr   valrp   rq   Union[bool, float, int]c                   sH    t jkr	t| S  jrt| tjsJ t| S t| tjs J t| S rs   )	r  rr   r^  rt   typingSupportsFloatr`  SupportsIntru   r  r  rw   rx   py_cnst  s   
z!Reduction.create.<locals>.py_cnstr4   )r  r  r  r~  z* not supported for zero-dimension tensors!r   ru   r^   c                   s   t   S rs   r]   r_  r   )r  r  rtypes_to_initsrw   rx   const_fn     z"Reduction.create.<locals>.const_fnrq  r  c                   s   t d S r  r  r   r  rw   rx   r     r   zReduction.create.<locals>.fnc                      dd D } | |S )Nc                 S     g | ]}t jjqS rw   r   rE  rF  r   rw   rw   rx   r     r   z0Reduction.create.<locals>.fn.<locals>.<listcomp>rw   r   reduction_index)r&  r  rw   rx   r        
r  c                   s$   t  r| S | dkrt| tjS | S r  )ry   rP  r5   Zmin_num_split)r  )r  rw   rx   _maybe_increase_split  s
   z/Reduction.create.<locals>._maybe_increase_splitr"  r  )r  rp   rq   r  )r   ru   rq   r^   )r  ru   rq   ru   )r_   r   r   simplifyrZ   r  rb  r@  r   rt   r   r  r5   Zunroll_reductions_thresholdrV   r   r  r  rG   r  r=   !create_multilayer_existing_rangescreate_multilayerrl   r  )rA  r   r  r  r&  r'  r  r  r  r  r  r  r   hintr  r  r  r  rw   )r  r&  r  r  r  r  rx   r@  z  s   

	
zReduction.creater   #Union[_NumLike, Sequence[_NumLike]]c                 C  s   | dv rt |rtdS t|rdS t|jS | dv r0t |r$tdS t|r*dS t|jS t|r6dnd}t|r>dnd}|||||||f|||ftd|fd	|  S )
N)rP  r  z-infF)r  r  infTr   r4   )r  r  r  r~  welford_reducer  online_softmax_reduce)r    r`  r   r  iinfor  rP  )r  r   zeroonerw   rw   rx   default_accumulator  s0   
zReduction.default_accumulatorc                 C  s   | dkrdS t | |S )Nr  r   )r  r  r  r   rw   rw   rx   default_value:  s   zReduction.default_valuer  rh   r  c                 C  sP   | dkr|S | dkr|dkr|t jkrt jS | dkr&|dkr&|t jkr&t jS |S )Nr"     i      )rG   r  Z
OUTER_TINY)r  r  r  rw   rw   rx   _multilayer_second_step_hintB  s   z&Reduction._multilayer_second_step_hintr  c                 C  s   |du rdS t jj| |sdS |  zt| W n
 ty&   Y dS w | }t	|dd D ]\}}t jj|drC|  S q3dS )z
        If we are reducing over the full tensor, and it is non-dense in the last dimension,
        reindex so we reduce over the dense dimension. initially just handle complete
        reduction case
        Nr"  r4   )
r_   r   r   r+  r  r  r<  r  r  r   )rA  r  r  r3  r   r   rw   rw   rx   $check_for_split_dense_dim_reindexingS  s$   	z.Reduction.check_for_split_dense_dim_reindexingrl  
block_sizedefaultr  c           
        sT   |  |}t|g|tjjt| d d	 fdd}	|	S )
Nr   r   r  r  rq   r^   c                   sl   |\}| ^ }| |  d fdd}r3t }tt |t|}t||S | S )Nrq   r^   c                     s    gS rs   rw   rw   )r  rl  	new_indexr   rw   rx   body  r  zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)rq   r^   )rP   r]   r  r  Zmasked)r   r  Zreduction_blockr  Zindex_dtyper  r  r  rl  	need_maskr  r   )r  r  rx   
wrapper_fn  s   


z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)r   r  r  r  rq   r^   )	r  Viewdynamic_reshape_indexerr_   r   r   r,   r   r  )
rA  rl  r  r  r  r  r  r  Zdense_indexr  rw   r  rx   _multilayer_wrap_loaders  s   z!Reduction._multilayer_wrap_loader@Callable[[Sequence[sympy.Expr], Sequence[sympy.Expr]], OpsValue]original_rangesoriginal_reduction_rangesr  Sequence[Integer]r  c                   sN   t dd D sJ dt|t|t| d fd	d
}|S )Nc                 s  s    | ]}|d kV  qdS r  rw   r  rw   rw   rx   r!    r  zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=merged_indexr  new_reduction_indexrq   r^   c                   s:   | d t  }| t d  } |t|t| S rs   )r   r   )r&  r'  Zoriginal_idxr  rl  r#  r   rw   rx   r    s   zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r&  r  r'  r  rq   r^   )r$  r  r   r   )rA  rl  r#  r$  r  r  r  rw   r(  rx   '_multilayer_wrap_loader_existing_ranges  s   	z1Reduction._multilayer_wrap_loader_existing_rangesr  rO  list[Integer]c                   s   |t jt jfvr
|nt j}t|||||||	|}|  |  d
 fdd}tj	j
t|}| |
||}||dt| ksDJ tt|||||t|d |	||d	S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r   r'  r  rq   r^   c                   s    g | |S rs   rw   r  Zintermediate_loaderrw   rx   intermediate_fn     z;Reduction.create_multilayer_helper.<locals>.intermediate_fnNr  )r   r'  r  r'  rq   r^   )r  Zfloat16Zbfloat16r`  r  r@  r  r  r_   r   r   r   rZ   r  r   rl   )rA  r   r  r  r  r#  r$  r  r  r  r  r  Zintermediate_dtypeZintermediater-  r  rw   r,  rx   create_multilayer_helper  sD   
z"Reduction.create_multilayer_helperc                 C  sd   t |}t||d  |}| ||}| |||||||
}| ||||||g |||g|||	S )r+  r4   )rZ   r1   r  r!  r/  )rA  r   r  r  r&  r'  r  r  r  r  r  r  r  r  r  rw   rw   rx   r    s2   

zReduction.create_multilayerc                 C  s8   |  |||||}| ||||||g ||||	d|
S )r+  r"  )r)  r/  )rA  r   r  r  r&  r#  r$  r  r  r  r  r  rw   rw   rx   r  )  s(   
z+Reduction.create_multilayer_existing_rangesr  r  rW  r  r  
rd  r  re  rf  rg  r  r  r  rq   r   r  rq   r  r  rs   )r   r  r  r  r  r  r&  r   r'  r'  r  r'  r  r  r  r   r  r   rq   r  )
r&  r  r  r'  r  r   r  r  rq   r  )r   r  r  r  r  r  r&  r%  r'  r  r  r  r  rC   r  rG   r  r   rq   rl   r  r   r   r  rq   r
  )r  rh   r  ru   r  rG   rq   rG   )r  rh   r  r   rq   r  )rl  r   r  r'  r  rh   r  rh   r  rh   r  r
  r  r   rq   r  )rl  r"  r#  r  r$  r  r  r%  r  r%  rq   r"  )r   r  r  r  r  r  r  r%  r#  r  r$  r  r  rO  r  r*  r  rC   r  rh   r  rG   rq   rl   )r   r  r  r  r  r  r&  r%  r'  r  r  r  r  rC   r  rh   r  rG   r  r   rq   rl   )r   r  r  r  r  r  r&  r%  r#  r  r$  r  r  r*  r  r*  r  rC   r  rG   rq   rl   )r   r   r   r   r<  rX  r  r  r  r  r  rL  r.  r  r  r  r  rY  rG   r  r@  r  r  r  r  r!  r)  r/  r  r  r[  rw   rw   r7  rx   r  d  s\   
 







 !/
 !	*?-r  c                      s2   e Zd ZU ded< d fddZd ddZ  ZS )!MultiOutputReductionru   output_indexr   r  r  r  	inner_fns)Union[INNER_FN_TY, Sequence[INNER_FN_TY]]r'  r%  r  r  rC   r  r  rG   c
              
     sX   t  r f t dkr d }
nd fdd	}
t j|||
|||||d
 |	| _d S )Nr4   r   r   r  reduction_idxrq   tuple[OpsValue, ...]c                   s   t  fddD S )Nc                 3  s    | ]}| V  qd S rs   rw   r   r   r   r7  rw   rx   r!  n  r,  z@MultiOutputReduction.__init__.<locals>.loader.<locals>.<genexpr>)r   r:  r5  r:  rx   rl  k  s   z-MultiOutputReduction.__init__.<locals>.loaderr  )r   r  r7  r  rq   r8  )callabler   r6  __init__r4  )ra  r   r  r5  r'  r  r  r  r  r4  rl  r7  r;  rx   r=  W  s    


zMultiOutputReduction.__init__rd  r  re  rf  rg  r  r  r  rq   r   c              	   C  sZ   t | j| j| j| ||}t|ttfsJ t	| || j
 }t |p'd|||S rh  )r]   r  r   r  r  r&  rt   r   r   r   r4  r  )ra  rd  re  rg  r  r   r	  rw   rw   rx   r  |  s   

z$MultiOutputReduction.store_reduction)r   r  r  r  r5  r6  r'  r%  r  r%  r  rC   r  r  r  rG   r4  ru   r0  )r   r   r   r   r=  r  r[  rw   rw   r7  rx   r3  T  s   
 %r3  c                   @  s"   e Zd ZeejdfdddZdS )OnlineSoftmaxReductionNr   r  r  r  r  r&  r%  r'  r  r  
num_outputru   r  rG   r  r   rq   Sequence[TensorBox]c
                   s<   t  fddt|D }
|
D ]}|  q|
S )z>
        Create the reduction disregarding splitting.
        c                 3  s.    | ]}t t d |	V  qdS )r  N)rl   r@  r3  r   Z
output_idxr   r  r&  r'  r  r  r  rw   rx   r!    s"    
z0OnlineSoftmaxReduction.create.<locals>.<genexpr>)r   r   r  )rA  r   r  r  r&  r'  r  r?  r  r  resultsr  rw   rB  rx   r@    s   
zOnlineSoftmaxReduction.create)r   r  r  r  r  r  r&  r%  r'  r  r  r  r?  ru   r  rG   r  r   rq   r@  )r   r   r   rY  rG   r  r@  rw   rw   rw   rx   r>    s
    
r>  c                   @  s<   e Zd ZeejfdddZedddZedddZ	dS )WelfordReductionr   r  r   r  r5  Sequence[Callable[..., Any]]r'  r*  r  r  rC   r  rG   rq   r@  c              
     s6  dv sJ t jjt}dfdd}	|dkr-|	d}
|	d}|	d}|
||fS |d	krWdfdd dkrL d |	d|	d	fS t fddD S tjd |d\}}tj	kro||d	kr| 
|S fddtdD }|D ]}|  q|S )N)r  r  r  ru   rq   rl   c                   s&   d fdd}t j|tdS )	Nr   r  rq   r^   c                   s   t  S rs   r  r   )r   r  rw   rx   r&    s   z8WelfordReduction.create.<locals>.const.<locals>.inner_fnrq  r   r  rq   r^   rb  r@  r   )r  r&  )r   r   r'  r  rx   const  s   z&WelfordReduction.create.<locals>.constr   r4   rl  4Callable[[Sequence[Expr], Sequence[Expr]], OpsValue]c                   s&   d fdd}t j|tdS )	Nr   r  rq   r^   c                   r  )Nc                 S  r   rw   r  r   rw   rw   rx   r     r   zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>rw   )r   r  )rl  r  rw   rx   r&    r  z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnrq  rG  rH  )rl  r&  )r   r   r'  r  rl  rx   copy  s   z%WelfordReduction.create.<locals>.copyr  c                 3  s    | ]} |V  qd S rs   rw   r9  )rL  rw   rx   r!    r  z*WelfordReduction.create.<locals>.<genexpr>)r  r  c                   s*   g | ]}t t |	qS rw   )rl   r@  rD  rA  )r   r   r5  r'  r  r  r  rw   rx   r     s     z+WelfordReduction.create.<locals>.<listcomp>r   )r  ru   rq   rl   )rl  rJ  rq   rl   )r_   r   r   r  rZ   r   r  r  rG   r  r  r   r  )rA  r   r   r5  r'  r  r  r  r  rI  meanm2weightr	  r  rC  r  rw   )rL  r   r   r5  r'  r  r  r  rx   r@    sT   



zWelfordReduction.creater   r
  c                 C  r  )N)r   r   r   rw   r  rw   rw   rx   r  .  r  zWelfordReduction.default_valuer  rh   c	              
     s$  t tjjt d }	|	r9|dkr9dfd
d}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd|D g | g||}|D ]}|  qadddtjjt |}||}t
|tfdd|D |gd|S )r+  r   r  r   r  r7  r	  ru   rq   r^   c                   s   t | S rs   r  )r   r7  r	  r  rw   rx   r_  M     z4WelfordReduction.create_multilayer.<locals>.constantr  r4   )r   r   r5  r'  r  r  r  r  c              	   3  s&    | ]}j | d dV  qdS )r   )r  N)r!  )r   rl  )r  rA  r  r  r  rw   rx   r!  e  s    	
z5WelfordReduction.create_multilayer.<locals>.<genexpr>r   r  rl  r  c                 S  s   |g | |S rs   rw   )r   r  rl  rw   rw   rx   intermediate_loader_fnx  s   zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3  s     | ]}t  | d V  qdS )rK  N)r	   r  r   )rQ  rw   rx   r!    
    
N)r   r  r7  r  r	  ru   rq   r^   )r   r  r  r  rl  r  rq   r^   )rZ   r_   r   r   r,   r   r  r  r	   r1   rD  r@  r   r  r   r  )rA  r   r   r5  r'  r  r  r  r  r  r_  Zintermediatesr   r  rw   )r  rA  r   rQ  r  r  r  rx   r  4  sb   

	


z"WelfordReduction.create_multilayerN)r   r  r   r  r5  rE  r'  r*  r  r*  r  rC   r  rG   rq   r@  r2  )r   r  r   r  r5  rE  r'  r*  r  r*  r  rC   r  rh   r  rG   rq   r@  )
r   r   r   rY  rG   r  r@  r  r  r  rw   rw   rw   rx   rD    s    	xrD  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< dJdK fddZdL fddZdMd"d#ZdNd$d%ZdOd'd(ZdPd)d*Z	dPd+d,Z
dQd-d.ZdRd0d1ZdJdKd2d3Zeejfd4d5dSd>d?ZedTdHdIZ  ZS )UScanr*  scan_rangesr   =Callable[[tuple[Any, ...], tuple[Any, ...]], tuple[Any, ...]]r  zFCallable[[Sequence[_IntLike], Sequence[_IntLike]], Sequence[_IntLike]]r   rG   r  ru   r4  tuple[torch.dtype, ...]dtypestuple[Callable[..., Any], ...]r5  Fr   rr   rq   rQ  c                   D   t   t j fdd| jD  B t j fdd| jD  B S )Nc                 3  r(  rs   r)  r*  rP  rw   rx   r!    r,  z,Scan.get_free_symbol_uses.<locals>.<genexpr>c                 3  r(  rs   r)  r*  rP  rw   rx   r!    r,  )r6  r  r/   r-  rT  r   r  r7  rP  rx   r    s   
zScan.get_free_symbol_usesr   c                   0   t | jt | j t | jksJ t   d S rs   )r   r'  rT  r   r6  re  rd  r7  rw   rx   re       "zScan.__post_init__rd  r  re  %Callable[[Sequence[_IntLike]], Never]rg  r  	scan_varsr  c                   sR   |  || t fdd| jD }t| j| j|}t|p d| || j S )Nc                 3      | ]}| V  qd S rs   rw   r   r&  rF  rw   rx   r!    r  z'Scan.store_reduction.<locals>.<genexpr>ri  )	r   r   r5  r]   r  rW  r  rj  r4  )ra  rd  re  rg  r]  r   resultrw   rF  rx   r    s   zScan.store_reductionc                 C  r  )NZcustomrw   rd  rw   rw   rx   r       zScan.get_reduction_typer  c                 C  rp  rs   )rT  rd  rw   rw   rx   r    rr  zScan.get_reduction_sizec                 C  rp  rs   r   rd  rw   rw   rx   r     rr  zScan.get_sizec                 C  rp  rs   r:  rd  rw   rw   rx   r>    rr  zScan.get_pointwise_sizec                 C  r  rs   )r   r'  rT  rd  rw   rw   rx   r    r   zScan.index_lengthrO  c                 C  .   |  | j}|  | jtj}| ||}|fS rs   )rI  r'  rT  r3   r  r   ra  r   r  r   rw   rw   rx   rL       zScan.inner_fn_argsc                 C  8   |  | j}|  | jtj}| ||}t| j||dS rR  )rI  r'  rT  r3   r  r   r<   r&  ra  r   r   r  r   rw   rw   rx   r.       zScan.inner_fn_free_symbolsT)can_fallback_to_atenr   r  +tuple[Callable[[Sequence[Expr]], Any], ...]axisri  r   r
   Sequence[Optional[TensorBox]]c                  s  g d    d d    g	t jtjs$d gt S tdkr9t jtjs9d gt S t jj}
|
t		}ttksNJ |

t|drgfddttD S | jd d  	|d\}t
|dkrtjjd u ptotdkotdk}|s|rd gt S d}nt
d 	fdd	
fddttD }|D ]}|  q|S )Nr4   c                   &   g | ]}t j | | d qS rq  rb  r@  r   r4  r   rW  r5  r   rw   rx   r         zScan.create.<locals>.<listcomp>r   )r   r   r&  rk  pointwise_rangesrT  r  
scan_numelz3.3.0r   r  
scan_indexrq   rO  c                   H   t |t ks
J t | t ksJ g | d   ||  d  S rs   r   )r   ru  )rk  rs  rT  rw   rx   r   	      zScan.create.<locals>.reindexc                   sB   g | ]}t 	d| | 
 |d qS ))r   r   rW  r&  r5  r   r'  rT  r  r   r  r4  rw   )rl   r@  rp  )r  r   rW  r5  r   rs  r  r   rT  	scan_typer   rw   rx   r   $	  s*    )r   r  ru  r  rq   rO  )r_   r   r  r7   ZSCANr   ZTUPLE_REDUCTIONr   r  rZ   r,   r   Ler   r  rS  r  versionZhip
has_tritontriton_version	SplitScanr  )rA  r   rW  r5  r   rk  r  r  ri  r   r   rt  r  Zsupports_splitrC  r`  rw   )rk  r  r   rW  r5  r   rs  r  r   rT  ry  r   rx   r@    sV    







zScan.creater   r  r&  r  rs  rt  r   r  c	           
   
     s*   d
 fdd}	t j||||	||d|d	S )Nr   r  r7  rq   r^   c                   s$   g | d   ||  d  S rs   rw   r:  rk  r&  rw   rx   r  K	  s   $z#Scan.num_splits.<locals>.wrapper_fnr  )r   r  r  r&  r'  r  r  r  )r   r  r7  r  rq   r^   )r  r  )
rA  r   r   r&  rk  rs  rT  r  rt  r  rw   r  rx   r  >	  s   zScan.num_splitsr  rW  r  )
rd  r  re  r\  rg  r  r]  r  rq   r   r  r  r  r  rV  )r   r  rW  rV  r5  rj  r   r*  rk  ru   r  rU  r  rG   ri  rr   r   r
   rq   rl  )r   r  r   r  r&  r  rk  ru   rs  r*  rT  r*  r  rU  rt  r   rq   r  )r   r   r   r   r  re  r  r  r  r   r>  r  rL  r.  rY  rG   r  r@  r  r[  rw   rw   r7  rx   rS    s4   
 






	arS  c                   @     e Zd ZdS )r~  Nr   r   r   rw   rw   rw   rx   r~  [	  s    r~  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< d;d< fddZd= fddZd>d!d"Zd?d#d$Zd@d%d&Zd@d'd(Z	d@d)d*Z
dAd+d,ZdBd.d/Zd;d<d0d1ZeejfdCd9d:Z  ZS )DSortr*  sort_rangesr   z:Callable[[Sequence[Expr], Sequence[Expr]], Sequence[Expr]]r   rG   r  ru   r4  rV  rW  rX  r5  rr   stable
descendingFr   rq   rQ  c                   rY  )Nc                 3  r(  rs   r)  r*  rP  rw   rx   r!  u	  r,  z,Sort.get_free_symbol_uses.<locals>.<genexpr>c                 3  r(  rs   r)  r*  rP  rw   rx   r!  x	  r,  )r6  r  r/   r-  r  r   r  r7  rP  rx   r  q	  s   
zSort.get_free_symbol_usesr   c                   rZ  rs   )r   r'  r  r   r6  re  rd  r7  rw   rx   re  |	  r[  zSort.__post_init__rd  r  re  r  rg  r  r  c                   sV   |  || t fdd| jD }t| j|| j| j}t|p"d| || j	 S )Nc                 3  r^  rs   rw   r_  rF  rw   rx   r!  	  r  z'Sort.store_reduction.<locals>.<genexpr>ri  )
r   r   r5  r]   sortrW  r  r  rj  r4  )ra  rd  re  rg  r  r   r`  rw   rF  rx   r  	  s   zSort.store_reductionc                 C  r  )Nr  rw   rd  rw   rw   rx   r  	  r   zSort.get_reduction_typec                 C  rp  rs   )r  rd  rw   rw   rx   r  	  rr  zSort.get_reduction_sizec                 C  rp  rs   rb  rd  rw   rw   rx   r   	  rr  zSort.get_sizec                 C  rp  rs   r:  rd  rw   rw   rx   r>  	  rr  zSort.get_pointwise_sizec                 C  r  rs   )r   r'  r  rd  rw   rw   rx   r  	  r   zSort.index_lengthr  c                 C  rc  rs   )rI  r'  r  r3   r  r   rd  rw   rw   rx   rL  	  re  zSort.inner_fn_argsc                 C  rf  rR  )rI  r'  r  r3   r  r   r<   r&  rg  rw   rw   rx   r.  	  rh  zSort.inner_fn_free_symbolsr   r  'tuple[Callable[[list[Expr]], Any], ...]rk  r   r
   rl  c	                   s*  g 	d   	 d d  	  g
t jtjs$d gt S t jj}
|
t
}d}t	j
jo=|
t||}|sGd gt S ttksQJ |
t|drj	fddttD S d 
fd
d	
fddttD }|D ]}|  q|S )Nr4   r  c                   rm  rn  ro  rp  rq  rw   rx   r   	  rr  zSort.create.<locals>.<listcomp>r   r  
sort_indexrq   rO  c                   rv  rs   rw  )r   r  )rk  rs  r  rw   rx   r   	  rx  zSort.create.<locals>.reindexc                   sD   g | ]}t td| | 	|
 d qS ))r   r   rW  r&  r5  r   r'  r  r   r  r4  r  r  rw   )rl   r@  r  rp  )r  r   rW  r5  r   rs  r  r   r   r  r  rw   rx   r   	  s,    )r   r  r  r  rq   rO  )r_   r   r  r7   ZSORTr   r   r  rZ   r5   r  Zpersistent_reductionsr,   r   rz  r   r  )rA  r   rW  r5  r   rk  r  r  r  r   r   Z
sort_numelZ
max_rblockZis_persistent_kernelrC  r`  rw   )rk  r  r   rW  r5  r   rs  r  r   r   r  r  rx   r@  	  s0    




zSort.creater  rW  r  )
rd  r  re  r  rg  r  r  r  rq   r   r  r  r  r1  )r   r  rW  rV  r5  r  r   r*  rk  ru   r  rr   r  rr   r  rG   r   r
   rq   rl  )r   r   r   r   r  re  r  r  r  r   r>  r  rL  r.  rY  rG   r  r@  r[  rw   rw   r7  rx   r  `	  s.   
 







r  c                 C  s(   z	t | dd W dS  ty   Y dS w )NFfreezeT)r<  r  rv   rw   rw   rx   r   	  s   r   c                 C  s@   zt | dd\}}| r|  | W S  ty   Y dS w NFr  )r<  should_pad_stridespad_stridesis_contiguousr  )ro   _bufferr;  rw   rw   rx    is_contiguous_storage_and_layout
  s   
r  r  want_contiguousstride_order'Optional[Sequence[Union[int, Integer]]]r  r  tuple[StorageBox, Layout]c           	      C  s   t | trt| j|||||dS t | tr)t| j|||||d\}}| | j fS t | trc|r[|r?|   |   s>J n|durK| j	||d n|durW| j
||d n|   t| |  fS t | trvt| j|d\}}|| jfS t)z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    r  r  r  r  r  Nr  r  )rt   rl   r<  r:  
StorageBoxr  Bufferr  r  r  r  r  r?  r;  r  )	ro   r  r  r  r  r  r   r;  bufferrw   rw   rx   r<  
  sR   






r<  c                 C  s2   zt | dd\}}||W S  ty   Y dS w r  )r<  is_stride_orderedr  )ro   r  r  r;  rw   rw   rx   "is_stride_order_storage_and_layoutI
  s   r  r   c                 C  sr   t | ttfrt| jS t | tr*| j}t|jt	|j
 t dk }t| jp)|S t | tr7|  tjjv S dS )Nr   F)rt   rl   r  is_unalignedr:  r?  r;  r,   r>  rQ   r   rS   r  r  r_   r   unaligned_buffers)r   r;  Zhas_unaligned_layoutrw   rw   rx   r  S
  s   


r  c                   @  s   e Zd ZU ded< dDdEdd	ZdFddZdGddZdHddZedIddZ	dJddZ
dKddZdLddZdMd d!ZdNd#d$ZdOd(d)ZdPd*d+ZdQd-d.Zd/d0 Zd1d2 ZdPd3d4ZdPd5d6ZdRd8d9ZdSd;d<Zd=d> ZdTdAdBZdCS )UBaseViewrm   r:  Fr   rr   rq   rQ  c                 C     | j |S rs   r:  r  r  rw   rw   rx   r  i
  r   zBaseView.get_free_symbol_uses*Callable[[Sequence[Expr]], Sequence[Expr]]c                 C  s   t d|  )Nzmake_reindexer NYI on r  rd  rw   rw   rx   make_reindexerl
  rz   zBaseView.make_reindexerr  c                   &   | j   |  d fdd}|S )Nr   r  rq   r   c                   r   rs   rw   rF  innerr   rw   rx   re  s
  r   z&BaseView.make_indexer.<locals>.indexer)r   r  rq   r   )r:  r  r  ra  re  rw   r  rx   r  o
     
zBaseView.make_indexerr  c                   r  )Nr   r  rq   r^   c                   r   rs   rw   rF  r  rw   rx   rl  |
  r   z$BaseView.make_loader.<locals>.loaderrG  )r:  r  r  ra  rl  rw   r  rx   r  x
  r  zBaseView.make_loaderr  c                 C  
   | j  S rs   )r:  r  rd  rw   rw   rx   r   
     
zBaseView.dtyper  c                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.get_layoutr  c                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.get_devicerY  c                 C  r   rs   rw   rd  rw   rw   rx   rt  
  r   zBaseView.get_origin_noder   c                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.get_namer  c                 C  r  rs   r  rd  rw   rw   rx   r>  
  r  zBaseView.get_pointwise_sizer  ru   r   c                 C  r  rs   r:  r  r  rw   rw   rx   r  
  r   zBaseView.mark_reusec                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.has_exceeded_max_readsr  c                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.realizec                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.realize_hintc                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.get_storage_numelc                 C  r  rs   r:  r  rd  rw   rw   rx   r  
  r  zBaseView.is_externc                 C  r  rs   )r:  is_module_bufferrd  rw   rw   rx   r  
  r  zBaseView.is_module_bufferrf  c                 C  r  rs   r:  rn  rd  rw   rw   rx   rn  
  r  zBaseView.get_read_namesr  c                 C  sF   t tdd t|  |  jW  d    S 1 sw   Y  d S rJ  )r   rp   r  r>   r  r   r  rd  rw   rw   rx   rm  
  s   $zBaseView.get_readsc                 C  s"   | }t |tr|j}t |ts|S rs   )rt   r  r:  )ra  ro   rw   rw   rx   r  
  s
   

zBaseView.unwrap_viewr   r  c                 C  s2   |   }ttd||}t||  ||  dS rn  )r  r   rp   rr  rb  r  r   rs  rw   rw   rx   r  
  s   zBaseView.constant_to_deviceNr  rW  )rq   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r   r   r   r   r  r  r  r  r  r   r  r  rt  r  r>  r  r  r  r  r  r  r  rn  rm  r  r  rw   rw   rw   rx   r  e
  s0   
 


		











r  c                   @  s@   e Zd ZU ded< edd Zedd Zdd	d
Zdd Z	dS )r   rO  r   c                 C  s   t jj}tttj|}|  }dgt|t|  t| }t|t|ks)J t	t|D ]A}|| dkrF|| dus?J || ||< q/|| du s\t jjj
jt|| dddr]q/|j|| ||  dddkspJ dq/|S )	zReplace `-1` with correct sizesNr"  r4   TZsize_obliviousr   fallbackzKBroadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i})r_   r   r   r   r  r   r  r   r   r   r   evaluate_exprr  r   )ro   new_sizer   old_sizer   rw   rw   rx   _normalize_size
  s"    zExpandView._normalize_sizec           
      C  s   |  ||}t|r\t|\}}t|t|j }|dksJ tjjg| }t|j	|jD ]\}}|
tjjjjt|dddsC|ntjj q-t|j|jt|||j}	t||	dS t||dS )Nr   r4   Tr  r9  )r:  r   )r  r   r<  r   r   r   rE  rF  r   r   r  r_   r   r   r   r  r  r=  r   r   r   r>  r?  r   )
rA  ro   r  r@  rA  skiprB  r   r   rC  rw   rw   rx   r@  
  s.   
zExpandView.createrq   r  c                 C  rp  rs   rb  rd  rw   rw   rx   r      rr  zExpandView.get_sizec                   s4   |   }| j   t|t   fdd}|S )Nc                   sR   t | d  } t| t ksJ tt D ]} | dkr&tjj| |< q| S r  )r   r   r   r   rE  rF  )r   r   actualr  rw   rx   r     s   z*ExpandView.make_reindexer.<locals>.reindex)r   r:  r   )ra  targetr   rw   r  rx   r    s
   
	zExpandView.make_reindexerNr  )
r   r   r   r   r  r  rY  r@  r   r  rw   rw   rw   rx   r   
  s   
 


r   c                   @  s@   e Zd ZU ded< edd Zedd Zdd	d
Zdd ZdS )PermuteViewrO  dimsc                   s   |  |}t|ttt|ksJ t|r<t|\} t j j fdd|D  fdd|D  j	}t
||dS t||dS )Nc                      g | ]} j | qS rw   rb  r   rA  rw   rx   r   "  r   z&PermuteView.create.<locals>.<listcomp>c                   r  rw   r   r   r  rw   rx   r   #  r   r9  )r:  r  )_map_neg_dimsr/   r   r   r   r<  r=  r   r   r>  r?  r  )rA  ro   r  r@  rC  rw   r  rx   r@    s   
zPermuteView.createc                   s    fdd D S )Nc                   s$   g | ]}|d kr
|nt  | qS r   rw  )r   r-  r  rw   rx   r   ,  s   $ z-PermuteView._map_neg_dims.<locals>.<listcomp>rw   )rA  r  rw   r  rx   r  *     zPermuteView._map_neg_dimsrq   r  c                   sD   t | | jt tt| jksJ | j   fdd| jD S )Nc                   r   rw   rw   r   rb  rw   rx   r   3  r   z(PermuteView.get_size.<locals>.<listcomp>)r/   r  r  r   r   r:  r   rd  rw   rb  rx   r   .  s
   

zPermuteView.get_sizec                   s^   dd t | jD   fddtt| jD  t ttt| jks'J  fdd}|S )Nc                 S  r   rw   rw   )r   r   r  rw   rw   rx   r   6  r   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                   r   rw   rw   r   invrw   rx   r   7  r   z.PermuteView.make_reindexer.<locals>.<listcomp>c                   s    fddD S )Nc                   r   rw   rw   r   r   rw   rx   r   ;  r   z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>rw   r   r  r   rx   r   :  r  z+PermuteView.make_reindexer.<locals>.reindex)r   r  r   r   r/   )ra  r   rw   r  rx   r  5  s
   zPermuteView.make_reindexerNr  )	r   r   r   r   rY  r@  r  r   r  rw   rw   rw   rx   r    s   
 


r  c                   @  s6   e Zd ZeddddZedddZdddZdS )SqueezeViewNr-  c                  s>  t |rst|\}}g }g } d ur(t tsJ dd kr& t|jk s(J tt|j|jD ]0\}\}}	 d u rJ|dkrI|	| |	|	 q1| krY|	| |	|	 q1|dksaJ dq1t
|j|j|||j}
t||
dS  d u rt|dd | D S |   dksJ t| fddt| D S )	Nzexpected integer dim argumentr   r4   zexpected squeezed size to be 1r9  c                 S     g | ]}|d kr|qS rD  rw   r   rw   rw   rx   r   c  r   z&SqueezeView.create.<locals>.<listcomp>c                   s   g | ]
\}}| kr|qS rw   rw   r   r   r   r  rw   rx   r   f      )r   r<  rt   ru   r   r   r   r   r   r  r=  r   r   r>  r?  r  r@  r   )rA  ro   r-  r@  rA  r  rB  r   r   r   rC  rw   r  rx   r@  B  s:   


"zSqueezeView.creater   r  c                   s@   dd | D }dd t | D t|  d
 fdd	}||fS )Nc                 S  r  rD  rw   r   rw   rw   rx   r   j  r   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S  s   g | ]
\}}|d kr|qS rD  rw   r  rw   rw   rx   r   k  r  r   list[sympy.Expr]rq   tuple[sympy.Expr, ...]c                   sT   t | t ksJ |  d tjjg  }t| D ]\}}|||< qt|S )N )r   r   rE  rF  r   r   )r   r  r   r   lengthZnot_onerw   rx   r   n  s
   "
z%SqueezeView.squeezer.<locals>.reindex)r   r  rq   r  )r   r   )r   r  r   rw   r  rx   squeezerh  s
   zSqueezeView.squeezerrq   r   c                 C  s   t d)Nzuse SqueezeView.create())AssertionError)ra  r:  rw   rw   rx   r=  w  r  zSqueezeView.__init__)r   r  r  )r   r   r   rY  r@  r  r  r=  rw   rw   rw   rx   r  @  s    %r  c                   @  sT   e Zd ZU ded< ded< dd Zdd	d
ZdddZeZedd Z	dddZ
dS )GenericViewrO  r   r%  r   c                 C  rp  rs   )r   rd  rw   rw   rx   r    rr  zGenericView.make_reindexerrq   r   c                 C  sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S     g | ]}t tj|qS rw   )rY   r3   rZ  )r   rG  rw   rw   rx   r         z+GenericView.reindex_str.<locals>.<listcomp>zlambda , r  )r   r   r   r   r   r  r  r   )ra  Z	index_oldZ	index_newrw   rw   rx   reindex_str  s
   zGenericView.reindex_strc                 C  s$   |  | jd| j d|   gS )Nsize=zreindex=)r  r:  r   r  rd  rw   rw   rx   r<    s   zGenericView.__str__c                 C  s   | |t ||dS )Nr:  r   r   )r   )rA  ro   r  r   rw   rw   rx   r@    r  zGenericView.creater  c                 C  rp  rs   rb  rd  rw   rw   rx   r     rr  zGenericView.get_sizeNr  r  )r   r   r   r   r  r  r<  rX  rY  r@  r   rw   rw   rw   rx   r  {  s   
 


r  c                   @  sR   e Zd Zedd Zedd Zedd Ze	ddddZeddddZ	dS )r  c                 C  s<   t | } t |}tjjjj}|t | dr| | } | S r  )r   r  r_   r   r   r   r  Lt)r   r   r  rw   rw   rx   handle_negative_index  s   

zView.handle_negative_indexc           	        s   t |ttfs	J | | |\ }tjj |r|S d}t	t
 dks/t	t
|dkr1d}d|v rD fdd}| |t||dS t|sJ|rq|rUt|sUt|}t|dd\}}t|j|j|t||j}t||dS |  |}| |t||dS )	NFr   Tc                   s   t dgt  S r  )r   r   r   r  rw   rx   fake_reindex  r  z!View.create.<locals>.fake_reindexr  )r  r9  )rt   r   r   resolve_negative_sizer   r_   r   r   Zstatically_known_list_equalsr   r'   r  ExternKernelrequire_contiguousr<  r=  r   r   r  r  r>  r?  r   )	rA  ro   r  Zunbacked_symbols_in_sizesr  r@  rA  rC  r   rw   r  rx   r@    s6   
zView.createc                 C  s   dd |D }dd | D } t |}tt|D ]}|| dkr3tjj||< tt| t|||<  q4qtj	j
t| t| | |fS )Nc                 S  r  rw   r_   r   r   r  r  rw   rw   rx   r         z.View.resolve_negative_size.<locals>.<listcomp>c                 S  r  rw   r  r  rw   rw   rx   r     r  r"  )r   r   r   r   rE  Oner0   rZ   r_   r   r   guard_equals)r  r  r   rw   rw   rx   r    s   zView.resolve_negative_sizeNr  r'  r  	dense_dimr  rq   r   c              	   C  sZ   z
|  |||}W |S  ttfy,   t|g}|  ||}|  ||}t||}Y |S w rs   )_dynamic_reshape_indexerr  
IndexErrorrZ   r   )rA  r  r  r  r   Zflatr   r   rw   rw   rx   r     s   
zView.dynamic_reshape_indexerc                   s  t jjj}dd tt|D  tt |}t| }|duo,|t|d ko,t|dk}|r?|dus5J ||}|	| g |r|r| }| \}	}
|dkrd	t
jj |	|	|
f n|
dkrn|	| n||
||kr	|	 t jj|
| n||
||k r||
||k r| \}}||
 |	 }	|
| }
||
||k s	|	 t jj|
| nL||
||krt
jj}|}	t|	|| || }||
||kr| }	t|	|| || }|| }||
||kst jj|
| nt|r|sG|r$| }t jj|d 	t
jj |s|r8| \}	}
t jj|
d |s'|durSt|dkrS   }|| n  tt| ksbJ  fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S  r  rw   )rY   r3   ZVIEWr   rw   rw   rx   r     r  z1View._dynamic_reshape_indexer.<locals>.<listcomp>Nr4   c                   sH   t | t ksJ t | t ftt|  t fddD S )Nc                 3  r(  rs   )r[   r  Zreplacementsrw   rx   r!  B  r,  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)r   r   r   r   r   rg  Z	view_exprr  rx   r   ?  s   $z.View._dynamic_reshape_indexer.<locals>.reindex)r_   r   r   r   r   r   r   r   r?  r  r   rE  rF  r  r  r2   r  reverseinsert)r  r  r  r   Z	stack_newZ	stack_oldZreordering_dense_dimZold_dimZsize_oldvarZsize_newZvar2Z	size_new2ZdivisormodulusZ
dense_exprr   rw   r  rx   r    s   







 zView._dynamic_reshape_indexerrs   )r  r'  r  r'  r  r  rq   r   )r  r  )
r   r   r   r  r  rY  r@  r  r   r  rw   rw   rw   rx   r    s    

,
r  c                      s   e Zd ZU dZded< d2 fddZd3d	d
ZeZd3ddZd4ddZ	d5ddZ
edd Zd6ddZdd Zd7ddZd8ddZd9d d!Zd"d# Z	$d:d;d(d)Zd<d=d-d.Zd>d0d1Z  ZS )?r?  z*Pretend our storage has a different layoutr  r;  rq   r   c                   s2   t    t| jtrt| d| j  d S d S )Nr:  )r6  re  rt   r:  r  rp   r`  r  rd  r7  rw   rx   re  M  s   
zReinterpretView.__post_init__r   c                 C     |  | j| jgS rs   )r  r:  r;  rd  rw   rw   rx   r<  R  s
   zReinterpretView.__str__c                 C  r  rs   r  rd  rw   rw   rx   r  \  r  zReinterpretView.get_namer  c                 C     | j jS rs   )r;  r   rd  rw   rw   rx   r  _  r  zReinterpretView.get_devicerY  c                 C  r   rs   rw   rd  rw   rw   rx   rt  b  r   zReinterpretView.get_origin_nodec                 C  r  rs   )r;  r   rd  rw   rw   rx   r   e  r  zReinterpretView.dtyper  c                 C     t | jjS rs   )r   r;  r   rd  rw   rw   rx   r   i  r   zReinterpretView.get_sizec                 C  r  rs   )r   r;  r   rd  rw   rw   rx   r  l  r   zReinterpretView.get_strider  c                      d fdd}|S )Nr   r  rq   r^   c                   sF    j  }t  || } j j jjkr!t| j jjS |S rs   )r;  r  r]   loadr  r   r:  to_dtype_bitcast)r   re  Z
tmp_loaderrd  rw   rx   rl  p  s
   
z+ReinterpretView.make_loader.<locals>.loaderr   r  rq   r^   rw   r  rw   rd  rx   r  o  s   zReinterpretView.make_loaderr  c                 C  r  rs   )r;  r  rd  rw   rw   rx   r  z  r  zReinterpretView.make_indexerc                 C  rp  rs   r;  rd  rw   rw   rx   r  }  rr  zReinterpretView.get_layoutc                 C  r   rs   rw   rd  rw   rw   rx   r    r   zReinterpretView.freeze_layoutFr   rr   r}   c                 C  s*   t | jj|t | jj|B t | jj|B S rs   )r   r;  r   r   r>  r  rw   rw   rx   r    s   z$ReinterpretView.get_free_symbol_usesNr  r  c                 C  s@   t jjj| j| jj| jj| jj|d ur|j	nt jjj	| jj
dS rc  )r_   r   wrapper_codeZcodegen_reinterpret_viewr:  r;  r   r   r>  	writeliner   r  rw   rw   rx   r    s   z!ReinterpretView.codegen_referenceru   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zReinterpretView.num_readsr  r  r  r  r  r  r  r  r  r
  rs   r  r  )r   r   r   __doc__r   re  r<  rX  r  r  rt  r  r   r   r  r  r  r  r  r  r  r  r[  rw   rw   r7  rx   r?  G  s*   
 








	r?  c                   @  sT   e Zd ZU dZded< edd Zddd	ZeZe	d
d Z
dddZdddZdS )	DtypeViewz(Pretend our storage has a different typer  target_dtypec                 C  sD   t |rt|\}}t|j||j|j|j}t||dS t||dS )Nr9  )r:  r  )	r   r<  r=  r   r   r   r>  r?  r  )rA  ro   Z	new_dtyper@  rA  rC  rw   rw   rx   r@    s   zDtypeView.createrq   r   c                 C  r  rs   )r  r:  r  rd  rw   rw   rx   r<    r  zDtypeView.__str__c                 C  rp  rs   )r  rd  rw   rw   rx   r     s   zDtypeView.dtyper  c                 C  r  rs   r:  r   rd  rw   rw   rx   r     r  zDtypeView.get_sizer  c                   s   j    fdd}|S )Nc                   s   t  | jjjS rs   )r]   r  r  r:  r   rF  r  ra  rw   rx   rl    s   z%DtypeView.make_loader.<locals>.loaderr:  r  r  rw   r   rx   r    s   
zDtypeView.make_loaderNr  r  r  )r   r   r   r  r   rY  r@  r<  rX  r  r   r   r  rw   rw   rw   rx   r    s   
 



r  c                   @  s&   e Zd Zedd ZedddZdS )		SliceViewc                   s   t jj| | tdd ||fD rtjtjnjj	fdd  fdd}||dd}|||}||fS )zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s      | ]}t |V  qd S rs   )r'   r  rw   rw   rx   r!    r  z0SliceView.normalize_start_end.<locals>.<genexpr>c                   s<    | |r| n | |}||r|}|S ||}|S rs   )statically_known_geqr*  )ro   lowerupperZclamped_lowerZclamped_full)max_funcmin_funcr   rw   rx   clamp  s   
z,SliceView.normalize_start_end.<locals>.clampc                   s$   | d u r|S  | }  | ||S rs   )r  )r  r  r  r  )r	  rA  dim_sizerw   rx   
clamp_wrap  s   z1SliceView.normalize_start_end.<locals>.clamp_wrapr   )
r_   r   r   r   r~  r   ZMinZMaxZevaluate_minZevaluate_max)rA  ro   r-  startendr  rw   )r	  rA  r
  r  r  r   rx   normalize_start_end  s   zSliceView.normalize_start_endr4   Tc                   s  t tt jsdksJ zdkr!|dkr!dkr!|W S W n	 ty+   Y nw t| |r>| | |\}t| d   < t	|rzt
|\}}t|j}	|	   |	 < t|j|j|	|j|j    }
t||
dS  fdd}t||dS )Nr   l    r4   r9  c                   sD   t | t ksJ d|  d t| } |     |  < | S )Nzwrong ndim r  )r   r   r   r-  r  r  steprw   rx   r     s   $z!SliceView.create.<locals>.reindexr  )r   r  rt   r   	TypeErrorr   r   r  r1   r   r<  r   r=  r   r   r>  r?  r  )rA  ro   r-  r  r  r  r	  r@  rA  rB  rC  r   rw   r  rx   r@    s6   

zSliceView.createN)r4   T)r   r   r   rY  r  r@  rw   rw   rw   rx   r    s
    
$r  c                   @  sF   e Zd ZU ded< ded< dddZdd
dZdddZdddZdS )BaseConstantr  r   r  r   rq   r  c                 C  r  Nrw   rw   rd  rw   rw   rx   r     r   zBaseConstant.get_sizer  c                 C  rp  rs   r=  rd  rw   rw   rx   r     rr  zBaseConstant.get_devicerY  c                 C  r   rs   rw   rd  rw   rw   rx   rt  #  r   zBaseConstant.get_origin_noder  c                 C  r  rs   r.   rd  rw   rw   rx   rm  &  rr  zBaseConstant.get_readsNr  r  r  r  )r   r   r   r   r   r  rt  rm  rw   rw   rw   rx   r    s   
 


r  c                   @  sD   e Zd ZU ded< ded< ded< dd	d
ZdddZdddZdS )Constantr
   r	  r  r   r  r   rq   r  c                   r  )Nr   r  rq   r^   c                      t  j jS rs   )r]   r_  r	  r   r   rd  rw   rx   rl  1  r  z$Constant.make_loader.<locals>.loaderr  rw   r  rw   rd  rx   r  0     zConstant.make_loaderr  c                 C  r   rs   rw   rd  rw   rw   rx   r  6  r   zConstant.realizerm   c                 C     t | j| j|dS )N)r	  r   r   )r  r	  r   r  rw   rw   rx   r  9  r  zConstant.constant_to_deviceNr  r  r  )r   r   r   r   r  r  r  rw   rw   rw   rx   r  *  s   
 

r  c                   @  s:   e Zd ZU ded< ded< ded< dd	d
ZdddZdS )IndexingConstantr
   r   r  r   r  r   rq   r  c                   r  )Nr   r  rq   r^   c                   r  rs   )r]   r  r   r   r   rd  rw   rx   rl  D  r  z,IndexingConstant.make_loader.<locals>.loaderr  rw   r  rw   rd  rx   r  C  r  zIndexingConstant.make_loaderrm   c                 C  r  )N)r   r   r   )r  r   r   r  rw   rw   rx   r  I  r  z#IndexingConstant.constant_to_deviceNr  r  )r   r   r   r   r  r  rw   rw   rw   rx   r  =  s   
 
r  r   c                 C  s    t dd t| t||D S )Nc                 s  s&    | ]\}}}|d kp||kV  qdS r  rw   )r   leftrightr   rw   rw   rx   r!  P  s
    
z2is_contiguous_strides_for_shape.<locals>.<genexpr>)r$  r   r  r  )r   r)  rw   rw   rx   is_contiguous_strides_for_shapeM  s
   r  c                 C  s   t j| j S rs   )r5   Zpadding_alignment_bytesitemsizer  rw   rw   rx   get_align_for_dtypeX  r   r  c                   @  s$   e Zd ZdZd
ddZdddZd	S )r  zxAbstract base for Layout, MultiOutputLayout, NoneLayout.
    Represents the memory layout of the output of an Operation.rq   r  c                 C  r  rs   r  rd  rw   rw   rx   r  `  rz   zOutputSpec.get_deviceru   c                 C  r  rs   r  rd  rw   rw   rx   storage_sizec  rz   zOutputSpec.storage_sizeNr  r  )r   r   r   r  r  r  rw   rw   rw   rx   r  \  s    
r  c                   @  s   e Zd Zdedfd7ddZd8ddZeZd9ddZd:ddZd;ddZ	e
d<dd Zd;d!d"Zd;d#d$Zd%d& Ze
d'd( Zd)d* Zd+d, Zd-d. Zd=d0d1Zd;d2d3Zd>d5d6ZdS )?r  Nr   r   r  r   r  r   rO  r   Optional[list[Expr]]r>  r   rq   r   c                 C  sn   |d u r	t |}|| _|| _t|t|ks!J d| d| tdd |D s,J || _|| _|| _d S )Nr  	, stride=c                 s  s    | ]
}t |ttfV  qd S rs   )rt   r   ru   r   rw   rw   rx   r!  v  s    z"Layout.__init__.<locals>.<genexpr>)	r  r  r   r   r   r$  r   r   r>  )ra  r   r   r   r   r>  rw   rw   rx   r=  i  s   
$
zLayout.__init__r   c                 C  sr   d}| j dkrd| j  }| jjd u rdnd| jj }t| j d| jj | d| j d| j d| j | d	S )
Nry  r   z	, offset=:z('z', z, size=r   r  )r>  r   r   r   r   r   r   r   )ra  r>  Zdevice_index_strrw   rw   rx   r<  {  s   
"zLayout.__str__c                 C  rp  rs   r=  rd  rw   rw   rx   r    rr  zLayout.get_devicer   c                 C  sL   t j tjt| jt| j| j| jdW  d    S 1 sw   Y  d S )Nr   r   )	r_   	fake_moder  r  rM   r   r   r   r   rd  rw   rw   rx   get_example  s   $zLayout.get_examplerr   c                 C  s   t | j| jS rs   )r  r   r   rd  rw   rw   rx   r    rz   zLayout.is_contiguousr)  r'  r3  c                 C  sV   t | }|dvs| d dkrdS t|t| | D ]\}}}|dkr(||kr( dS qdS )N)r      r4   FT)r   r   r!   )r)  r3  ndimr  r  r   rw   rw   rx   is_channels_last_contiguous  s   
z"Layout.is_channels_last_contiguousc                 C  sJ   t | jtttt| j| jD ]\}}}|dkr"||kr" dS qdS )Nr4   FT)r   r   reversedr  r  r   r   )ra  r  r  r   rw   rw   rx   is_transposed  s   zLayout.is_transposedc                   s   t jt  ksJ dd tjD }fdd|D } fdd|D  dd }|  dgt   }tt  D ]
}|| | | < q<tt  d D ]'}|| ||d  k}t|tsqtjj	j
|| ||d  kd	d
}|rv dS qOd	S )Nc                 S  s*   g | ]\}}t jjj|d ddkr|qS )r   r  r4   )r_   r   r   r   )r   r   r-  rw   rw   rx   r     s    z,Layout.is_stride_ordered.<locals>.<listcomp>c                   r  rw   r  r   rd  rw   rx   r     r   c                   r   rw   rw   r   r   rw   rx   r     r   c                   s   t |   fdd| D S )Nc                      g | ]}  |qS rw   r   )r   elementZ
sorted_arrrw   rx   r     r   zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)r  )Zarrrw   r,  rx   sorted_indices  s   z0Layout.is_stride_ordered.<locals>.sorted_indicesr"  r4   Tr  F)r   r   r   r   r   rt   rr   r_   r   Z
_shape_envr  )ra  r   Znon_1_indicesr   r-  stride_orderedr   exprrw   )r   ra  rx   r    s*   
zLayout.is_stride_orderedc                 C  s:   dgt ttdt| jd  }t|g| }| |S Nr   r4   )r   r(  r   r   r   r  r  rw   rw   rx   is_channels_last_stride_ordered  s   "
z&Layout.is_channels_last_stride_orderedc                 C  s*  t |}t| dkr| S tjst|| r| S t }t|dr)|j	
ddr)| S tdd t| |D s8| S t| }t|}dd tt| D }d	||d < d}t|d	d
 d	dD ]*\}	}
||	d	  }|| ||  }|tjkr|| dkrt||| }d}|||
< q]|s| S t jd	7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   rJ  Zdislike_paddingFc                 s  s     | ]}t |ttjfV  qd S rs   )rt   ru   r   r   r   rw   rw   rx   r!    rR  z&Layout._pad_strides.<locals>.<genexpr>c                 S  r   r   rw   r   rw   rw   rx   r     r   z'Layout._pad_strides.<locals>.<listcomp>r4   N)r  T)r  r   r5   Zpad_channels_lastr  r'  r_   Zget_current_noder  rJ  getr$  r  chainr   r   r   r   Zpadding_stride_thresholdrK   r   Znum_comprehensive_padding)Z
in_stridesr   r   alignZcurrent_fx_noder  r   Znew_stridespaddedrankr   Zprev_idxr   rw   rw   rx   _pad_strides  s@   


zLayout._pad_stridesc                 C  s6   t | tsJ | jd usJ | | j| j| j| _d S rs   )rt   r  r   r7  r   r   rd  rw   rw   rx   r    s   zLayout.pad_stridesc                 C  s   t jot| tS rs   )r5   comprehensive_paddingrt   r  rd  rw   rw   rx   r    r  zLayout.should_pad_stridesc                 C  s8   t | tr| S |  r|   t| j| j| j| j| jS rs   )	rt   r=  r  r  r   r   r   r   r>  rd  rw   rw   rx   as_fixed  s   
zLayout.as_fixedr  c                 C  s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r  rK  r   r   r9  r  rd  rw   rw   rx   r  )  s   zLayout.make_indexerc                 C  s<   | j |j ko| j|jko| j|jko| j|jko| j|jkS rs   r   r   r   r   r>  )ra  otherrw   rw   rx   __eq__/  s   



zLayout.__eq__
sympy.Exprc                 C  s   t | j| j| jS rs   )r   r   r   r>  rd  rw   rw   rx   r  8  r  zLayout.storage_size)r   r  r   r  r   rO  r   r  r>  r   rq   r   r  r  )rq   r   r  )r)  r'  r3  r'  rq   rr   r  rq   r=  )r   r   r   r   r=  r<  rX  r  r$  r  r  r'  r)  r  r1  r7  r  r  r9  r  r<  r  rw   rw   rw   rx   r  g  s,    



	


#
:

	r  c                   @  s   e Zd ZdZdddZdS )r=  z A Tensor layout we cannot changerq   r  c                   s    fdd}|S )z1A closure containing math to read a given elementc                   sf   t | t  jksJ t | t  jksJ  j}t|  j jD ]\}}}|dkr0|||  }q!|S r  )r   r   r   r>  r   )r   r`  r   r   szrd  rw   rx   re  B  s   z)FixedLayout.make_indexer.<locals>.indexerrw   r  rw   rd  rx   r  ?  s   	zFixedLayout.make_indexerNr  )r   r   r   r  r  rw   rw   rw   rx   r=  <  s    r=  c                      s   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	edd Z
dddZdddZdd Zdd Zdd fddZ  ZS )r  z(A Tensor layout we are allowed to changeFc                 C  sN   t | dkrg S tjjg}t| dd  D ]}|||d   qtt|S )Nr   r4   r"  )r   r   rE  r  r(  r  r   )sizesZreversed_stridesr   rw   rw   rx   r  T  s   
z!FlexibleLayout.contiguous_stridesc                 C  s\   t tt| t |ksJ | |ftjj}dgt| }|D ]}|||< || |  }q|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        N)r/   r   r   r   rE  r  )r@  r   Znext_strider3  r   rw   rw   rx   fill_ordered]  s   $zFlexibleLayout.fill_orderedc                 C  s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r/   r   r   r   r  rA  )r@  r   r   rw   rw   rx   r.  n  s   zFlexibleLayout.stride_orderedc                 C  sP   |t jkrt| tS |t jkrt| tS |t jkr t| S t	
d| t)aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %s)r  channels_lastr  r.  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERZcontiguous_formatr  r  r  r  )r@  memory_formatrw   rw   rx    stride_ordered_for_memory_formatz  s   



z/FlexibleLayout.stride_ordered_for_memory_formatc                 C  sD   t | t |ks
J dd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S  r  rw   )r_   r   r   r  r  rw   rw   rx   r     r  z/FlexibleLayout.same_ordered.<locals>.<listcomp>r  )r   r  r   __getitem__r  rA  )r@  r   r   rw   rw   rx   same_ordered  s   zFlexibleLayout.same_orderedc                 C  sD   |  | j|}|  r|r| || j| j}t| j| j| j|| jS rs   )r.  r   r  r7  r   r=  r   r>  )ra  r   r  rB  rw   rw   rx   as_stride_order  s   zFlexibleLayout.as_stride_orderc                 C  s:   |}|   r|r| || j| j}t| j| j| j|| jS rs   )r  r7  r   r   r=  r   r>  )ra  r  r  rB  rw   rw   rx   as_exact_strides  s   zFlexibleLayout.as_exact_stridesc                 C  @   |  | j|}|  r| || j| j}t| j| j| j|| jS rs   )rA  r   r  r7  r   r=  r   r>  )ra  r   rB  rw   rw   rx   as_fill_order     zFlexibleLayout.as_fill_orderc                 C  rL  rs   )rI  r   r  r7  r   r=  r   r>  )ra  r   rB  rw   rw   rx   as_same_order  rN  zFlexibleLayout.as_same_orderNrq   r   c                   s2   |r	t ||}nt |}t |||| d S rs   )r  rA  r  r6  r=  )ra  r   r   r   r  r3  r7  rw   rx   r=    s   
zFlexibleLayout.__init__r  rs   r  )r   r   r   r  rK  r  r  rA  r.  rG  rI  rJ  rK  rM  rO  r=  r[  rw   rw   r7  rx   r  N  s$    






r  c                      s4   e Zd ZdZd fddZdd	d
Zdd Z  ZS )NonOwningLayoutz,Is a view into the storage of another tensorviewUnion[BaseView, TensorBox]rq   r   c                   s,   |  }t |j|j|j|j || _d S rs   )r  r6  r=  r   r   r   r   rQ  )ra  rQ  r;  r7  rw   rx   r=    s   
zNonOwningLayout.__init__r  c                 C     |    S rs   )r9  r  rd  rw   rw   rx   r    r   zNonOwningLayout.make_indexerc                 C  s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr4   )	ALIGNMENT)	rQ  r  r>  utilsrT  r_   r   r   Zstatically_known_multiple_of)ra  r>  rT  rw   rw   rx   maybe_guard_aligned  s
   z#NonOwningLayout.maybe_guard_aligned)rQ  rR  rq   r   r  )r   r   r   r  r=  r  rV  r[  rw   rw   r7  rx   rP    s
    

rP  c                   @     e Zd ZdZdS )CommBufferTypeZsymm_memN)r   r   r   ZSYMM_MEMrw   rw   rw   rx   rX        rX  c                      s4   e Zd ZU dZded< ded< d
 fdd	Z  ZS )CommBufferLayoutax  
    A layout that signifies the buffer is a comm buffer.
    In terms of striding, the layout is identical to `FixedLayout`.

    Buffers with this layout do not participate in in-place reuse - it can be
    neither the source nor the target for in-place reuse.

    For detailed motivation and usage of this layout, see
    NOTE [lowering-time collective optimization].
    rX  comm_buffer_typer   
group_namer;  r  c                   sR   t |tstd| d| }t j|j|j|j|j	|j
d || _|| _d S )NzJA `CommBufferLayout` can only be initialized with a `FlexibleLayout` (got z).r:  )rt   r  r  r9  r6  r=  r   r   r   r   r>  r[  r\  )ra  r;  r[  r\  fixedr7  rw   rx   r=    s    

zCommBufferLayout.__init__)r;  r  r[  rX  r\  r   )r   r   r   r  r   r=  r[  rw   rw   r7  rx   rZ    s
   
 rZ  c                   @  sb   e Zd ZU ded< ejdd dZded< ejdd dZded	< dddZdd Z	dddZ
dS )
NoneLayoutr  r   c                   C     dgS r  rw   rw   rw   rw   rx   r  )  r  zNoneLayout.<lambda>default_factoryr  r   c                   C  r_  r  rw   rw   rw   rw   rx   r  *  r  r   rq   ru   c                 C  r  r  rw   rd  rw   rw   rx   r  ,  r   zNoneLayout.storage_sizec                 C     | S rs   rw   rd  rw   rw   rx   r9  /  r   zNoneLayout.as_fixedc                 C  rp  rs   r=  rd  rw   rw   rx   r  2  rr  zNoneLayout.get_deviceNr  r  )r   r   r   r   r  r  r   r   r  r9  r  rw   rw   rw   rx   r^    s   
 

r^  c                      sx   e Zd Zd fddZeddd	Zejddd	Zd ddZd!ddZdd Z	e
d"ddZdd Zd#ddZ  ZS )$MutationLayoutSHOULDREMOVEr  rm   rq   r   c                   s@   t  | | | d  || _|   }tj	
| d S rs   )r6  r=  r  r  r   r  
get_bufferr  r_   r   mark_buffer_mutated)ra  r  r   r7  rw   rx   r=  7  s   z#MutationLayoutSHOULDREMOVE.__init__rO  c                 C  r  rs   )real_layoutr   rd  rw   rw   rx   r   B  r  z!MutationLayoutSHOULDREMOVE.strider	  r   c                 C  r   rs   rw   )ra  r	  rw   rw   rx   r   F  ra  r=  c                 C  rS  rs   )rf  r  rd  rw   rw   rx   r  J  r   z'MutationLayoutSHOULDREMOVE.storage_sizer  c                   s,    fdd  | j }t|tsJ d|S )Nc                   sB   t | tr
 | jS t | tr |  S t | tr | jS | S rs   )rt   rc  r  r  r  
MutableBoxr:  )r  unwrap_viewsrw   rx   ri  N  s   




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_viewsz1MutationLayoutSHOULDREMOVE must refer to a buffer)r  rt   r  )ra  r`  rw   rh  rx   rd  M  s   
	z%MutationLayoutSHOULDREMOVE.get_bufferc                 C  r  rs   )rd  r;  rd  rw   rw   rx   rf  ]  r  z&MutationLayoutSHOULDREMOVE.real_layoutFc              	   C  s   |   tj|  t|tr|j}|  |s6t	j
| | | dd t| | D dj}|   t|jjtsCJ t||j_|jS )Nc                 S      g | ]\}}t jj||qS rw   r_   r   r   r  r   r  r  rw   rw   rx   r   x      z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>rq  )r  r_   r   re  r  rt   rl   r:  r  rb  r@  r  r  r  r   r   r;  r  rc  )rA  srcdstZunsafe_aliasrw   rw   rx   realize_into`  s(   

z'MutationLayoutSHOULDREMOVE.realize_intoc                 C  rb  rs   rw   rd  rw   rw   rx   r9    r   z#MutationLayoutSHOULDREMOVE.as_fixedr  c                 C  r  rs   )r  r  rd  rw   rw   rx   r    r  z'MutationLayoutSHOULDREMOVE.make_indexer)r  rm   rq   r   rq   rO  )r	  r   rq   r   r>  )rq   r  r  r  )r   r   r   r=  r  r   setterr  rd  rf  rY  rp  r9  r  r[  rw   rw   r7  rx   rc  6  s    

"rc  c                      sP  e Zd ZU ded< ded< dU fddZdVd
dZdWddZdXddZdYddZdZddZ	e
d[ddZd\ddZd]dd Zd^d"d#Zd_d%d&Zd`d'd(Zd)d* Zd+d, ZdadUd.d/ZdUd0d1ZdUd2d3Z	-dadUd4d5Zd6d7 Zdbd9d:Zdcddd>d?Zd@dA ZdedCdDZdedEdFZdfdHdIZ	-dadgdMdNZdhdOdPZdidQdRZ djdSdTZ!  Z"S )kr  r  r   r  r;  rq   r   c                   s   t    | dd  d S r  )r6  re  rb  rd  r7  rw   rx   re    s   
zBuffer.__post_init__r  c                 C  rS  rs   )r  r  rd  rw   rw   rx   r    r   zBuffer.make_indexerr   c                 C  s   | j sJ | | j S rs   r   rd  rw   rw   rx   r    r  zBuffer.get_name!Union[torch.Tensor, sympy.Symbol]c                 C  s&   t | jtr| j S tt| jjrs   )rt   r;  r  r$  r  r   r   rd  rw   rw   rx   r$    s   
zBuffer.get_exampler  c                 C  rS  rs   )r  r  rd  rw   rw   rx   r    r   zBuffer.get_deviceru  c                 C  r   rs   rw   rd  rw   rw   rx   rv    r   zBuffer.get_defining_opr  c                 C  r  rs   )r  r   rd  rw   rw   rx   r     r  zBuffer.dtyper  c                 C     g |   jS rs   )r  r   rd  rw   rw   rx   r     rz   zBuffer.get_sizerO  c                 C  rt  rs   )r  r   rd  rw   rw   rx   r    rz   zBuffer.get_strider   c                 C  r  rs   )r  r>  rd  rw   rw   rx   
get_offset  r  zBuffer.get_offsetr  c                 C  s"   t | jtr	| jS tt| jjrs   )rt   r;  r  r  r   r   rd  rw   rw   rx   r    s   zBuffer.get_layoutc                 C  rp  rs   r  rd  rw   rw   rx   r    rr  zBuffer.get_output_specc                 C  r  rs   )r  rd  rw   rw   rx   r    r  zBuffer.get_storage_numelc                 C  s0   t | jtrt | jts| j | _d S d S d S rs   )rt   r;  r  rP  r9  rd  rw   rw   rx   r    s
   zBuffer.freeze_layoutFc                 C  &   t | jtsJ | jj||d| _d S Nr  )rt   r;  r  rJ  r  rw   rw   rx   r    s   z&Buffer.freeze_layout_with_stride_orderc                 C  "   t | jtsJ | j|| _d S rs   )rt   r;  r  rM  r  rw   rw   rx   r       z$Buffer.freeze_layout_with_fill_orderc                 C  rx  rs   )rt   r;  r  rO  r  rw   rw   rx   r    ry  z$Buffer.freeze_layout_with_same_orderc                 C  rv  rw  )rt   r;  r  rK  r  rw   rw   rx   r    s   z'Buffer.freeze_layout_with_exact_stridesc                 C  r  r  r  rd  rw   rw   rx   r    r  zBuffer.is_zero_elementsr  c                   s(      rtt  dS  fdd}|S )Nr  c                   s      }t jp
d|| S rh  )r  r]   r  r   r   re  rd  rw   rx   rl    s   z"Buffer.make_loader.<locals>.loader)r  r	   ra  r  r  rw   rd  rx   r    s   zBuffer.make_loaderNr  r  c                 C  r  rs   r  r  rw   rw   rx   r    r  zBuffer.codegen_referencec                 C  r   rs   rw   rd  rw   rw   rx   r    r   zBuffer.decide_layoutrx  c                 C     t | jtr| jj gS dS r  )rt   r;  rP  rQ  r  rd  rw   rw   rx   r       z#Buffer.get_inputs_that_alias_outputc                 C  r|  r  )rt   r;  rc  r  r  rd  rw   rw   rx   r    r}  zBuffer.get_mutation_namesrf  c                 C  s   t |  gS rs   )r/   r  rd  rw   rw   rx   rn    rz   zBuffer.get_read_namesr   rr   r}   c                 C  r  rs   r.   r  rw   rw   rx   r       zBuffer.get_free_symbol_usesc                 C  r  rs   r.   rd  rw   rw   rx   r     rr  zBuffer.get_unbacked_symbol_defsc                 C  r   rs   rw   rd  rw   rw   rx   r    r   zBuffer.realizec                 C  r  r  rw   rd  rw   rw   rx   should_allocate  ra  zBuffer.should_allocater  r  r  )rq   rs  r  r  r  r  rq  r  r  r  r  r  rs   r  r  r  r
  r#  r  r  )#r   r   r   r   re  r  r  r$  r  rv  r  r   r   r  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rn  r  r   r  r  r[  rw   rw   r7  rx   r    sF   
 

















r  c                   @  s0   e Zd ZdddZdddZejZdd	d
ZdS )OperationBufferrq   rN  c                 C  s   | gS rs   rw   rd  rw   rw   rx   r    rr  zOperationBuffer.get_outputsr  c                 C  rb  rs   rw   rd  rw   rw   rx   rv    r   zOperationBuffer.get_defining_opr   c                 C  s   t |  t|  d S rs   )r  re  r  rd  rw   rw   rx   re    s   
zOperationBuffer.__post_init__Nr"  rq   r  r  )r   r   r   r  rv  r  r  re  rw   rw   rw   rx   r    s
    

r  c                   @     e Zd ZdddZdS )InputBufferrq   ru   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zInputBuffer.num_readsNr  )r   r   r   r  rw   rw   rw   rx   r        r  c                   @  rW  )DonatedBufferaY  
    Represents a donated buffer which is a saved tensor that is not alias to any
    fwd inputs, fwd user outputs, and bwd outputs. We generally cannot inplace
    reuse the input tensor memory during backward since it might be used in another
    function. However, donated buffer can be inplace reused during backward
    to save memory.
    N)r   r   r   r  rw   rw   rw   rx   r  #  rY  r  c                   @  s.   e Zd ZU dZded< dddZdddZdS )rr  Nr  rp  rq   r  c                   r  )Nr   r  rq   r^   c                   s,       }ttj   j|| S rs   )	r  r  r]   r  r_   r   constant_namer  rp  rz  rd  rw   rx   rl  1  s
   z*ConstantBuffer.make_loader.<locals>.loaderr  rw   r  rw   rd  rx   r  0  s   zConstantBuffer.make_loaderr   r  rm   c                 C  s   t tj|  || jdS N)r   r;  )rr  r_   r   r  r  r;  r  rw   rw   rx   r  :  s   z!ConstantBuffer.constant_to_devicer  r  )r   r   r   rp  r   r  r  rw   rw   rw   rx   rr  -  s   
 

rr  c                   @  sD   e Zd ZdddZ	ddd	d
ZddddZdddZdddZdS )NoneAsConstantBufferrq   r  c                 C  r  rs   r.   rd  rw   rw   rx   rm  B  rr  zNoneAsConstantBuffer.get_readsFr   rr   r}   c                 C  r  rs   r.   r  rw   rw   rx   r  E  r~  z)NoneAsConstantBuffer.get_free_symbol_usesNr  r  r   c                 C  s
   t jjjS rs   )r_   r   r  none_strr  rw   rw   rx   r  J  r  z&NoneAsConstantBuffer.codegen_referencer  c                 C  s
   t d dS Nr=  )r^  rd  rw   rw   rx   r  M  r  z$NoneAsConstantBuffer.get_output_specc                 C  r  r  rw   rd  rw   rw   rx   r  P  r   z&NoneAsConstantBuffer.has_tensor_outputr  r  r
  rs   r  r  r  )r   r   r   rm  r  r  r  r  rw   rw   rw   rx   r  @  s    

r  c                   @  s:   e Zd ZU ded< 	dddd	ZddddZdddZd
S )r   r   r/  Fr   rr   rq   r}   c                 C     t | j|S rs   )r   r/  r  rw   rw   rx   r  X  rP  z*ShapeAsConstantBuffer.get_free_symbol_usesNr  r  r   c                 C  s   t jj| jS rs   )r_   r   r  Zcodegen_sizevarr/  r  rw   rw   rx   r  ]  r  z'ShapeAsConstantBuffer.codegen_referencec                 C  r  r  rw   rd  rw   rw   rx   r  `  r   z'ShapeAsConstantBuffer.has_tensor_outputr  r
  rs   r  r  )r   r   r   r   r  r  r  rw   rw   rw   rx   r   T  s   
 r   c                      s   e Zd ZU ded< d@ddZdAdd	ZdBddZdCddZdDddZ	dEdFddZ	dG fddZ
dHddZdId d!ZdJd#d$ZedKd&d'Z	(	(dLdMd.d/Ze	(dNd0d1ZdOd3d4Zd@d5d6ZdPd7d8ZdPd9d:ZdQd>d?Z  ZS )Rr  r$  r:  rq   r  c                 C  s(   | j dur| j S t| jdr| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nr   )r   r  r:  rd  rw   rw   rx   get_computed_buffer_nameh  s
   
z'ComputedBuffer.get_computed_buffer_nameru   c                 C  r  rs   r:  r  rd  rw   rw   rx   r  s  r  zComputedBuffer.num_readsr  c                 C  r  rs   r:  rm  rd  rw   rw   rx   rm  v  r  zComputedBuffer.get_readsrf  c                 C  r  rs   r  rd  rw   rw   rx   rn  y  r  zComputedBuffer.get_read_namesr  c                 C  sz   t tdd, | j r"t|  | j | j W  d    S t|  | j	 W  d    S 1 s6w   Y  d S rJ  )
r   rp   r  r:  r  r>   get_store_functionr>  r  r   rd  rw   rw   rx   r  |  s   
$zComputedBuffer.get_read_writesFr   rr   r}   c                 C  s6   t |  |t |  |B t |  |B | j|B S rs   )r   r   r  ru  r:  r  r  rw   rw   rx   r    s   
z#ComputedBuffer.get_free_symbol_usesr  c                   s6   |   s| jtjjvr|  dkr| j S t  S r  )	r  r   r_   r   Zmutated_buffersr  r:  r  r6  rd  r7  rw   rx   r    s   


zComputedBuffer.make_loaderCallable[..., None]c                 C  sV   |     }t| jtttfrt| jj	| j
|S t| jts"J t| jj| j
|S rs   )r  r9  r  rt   r:  r  rS  r  r	   r  r   rb  rm  r  rw   rw   rx   r    s
   z!ComputedBuffer.get_store_functionOptional[list[int]]c                   s   t | jtrYt| j | j \\}}|  j	}t
dd |D s&J fdd|D }|rYt | jttfrA| j| n|  fdd|D }ddlm} |||  S dS )	al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 s  s"    | ]}t |tjtjfV  qd S rs   )rt   r6   StarDep	MemoryDepr  rw   rw   rx   r!    
    
z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                   s.   g | ]}t |tjrt|jd d  D qS )c                 S  s   i | ]}|d kr|t jjqS r   r  )r   vrw   rw   rx   r         z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)rt   r6   r  r[   r   r  )r  rw   rx   r     s    
z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                   s   g | ]
}t jj| qS rw   r_   r   r   r  r   r/  )r  rw   rx   r     s    r4   pick_loop_orderN)rt   r;  r  r6   r  r:  r>  r  r  r  r$  rS  r  r   	schedulerr  r   )ra  
index_varsr   r  Zstride_lengthsr  rw   )r  r  rx   r     s*   


zComputedBuffer.get_fill_orderr   c                 C  s6   t | jtr|  }|r| | d S |   d S d S rs   )rt   r;  r  r   r  r  r  rw   rw   rx   r    s   zComputedBuffer.decide_layoutetuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody, tuple[list[sympy.Expr], list[sympy.Expr]]]c           
      C  s   t j| j | j dd\}}ttd|   t	| 
 |  r$|n|d d |g|R  }W d    n1 s:w   Y  g }g }g }g }| D ]+\}}	||d v rd|rYJ || ||	 qK||d v slJ || ||	 qK||f|||ffS )Nqrj   rp  r4   r   )r6   r  r:  r>  r  r   rp   rr  r  r@   r  r  itemsr  )
ra  r   
var_rangesr  r  reduce_vars
index_sizereduce_sizer  r   rw   rw   rx   get_default_sizes_body  s2   



z%ComputedBuffer.get_default_sizes_bodyNextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[..., Any]]:tuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody]c                   s    \\}}}\}}|r|||f|||f\\}}}\}}g |j  |durmt|tr4t|dks6J |\}}	t|tsAJ t|	tsHJ tdd |	D sSJ |j	}
|
|ks`J |
|f fdd|	D }	 |	7  g |
 tjtjs|   fdd}|| }tt ptj }|||||\}}}|||||\}}}tj||d	d
\\}}}t|||||g|||}||f|fS )an  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Optional argument recompute_sizes_body_func can be used to recompute sizes and body
        on the default body. This can be useful to append additional loop transformations.
        Nr   c                 s  s    | ]}t |tV  qd S rs   )rt   r   )r   frw   rw   rx   r!  5  r,  z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                   s   g | ]}| vr|qS rw   rw   r*  )index_formulasrw   rx   r   =  
    z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>c           	        s\    | ||\}}}|| } |r'tjj| |t | |\}}}t||}n|}|||fS rs   )_apply_loop_reorderingr_   r   r   _simplify_loopsr:   r   )	Zx_varssupport_varsr@  Zsimplify_loopsZreindex0r   r   _pruner   r  memory_addrsra  rw   rx   simplify_and_reorderF  s   



zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderprj   )r  Zindexing_exprsr   rt   r   r   r   r   r$  r  Zget_write_exprsr_   r   r  r7   ZPREFER_STORE_LOOP_ORDERextendZget_read_exprsrV   r  r5   Zloop_ordering_after_fusionr6   Zindex_vars_no_squeezer@   )ra  r  r  r  r  r  r  r  Zextra_indexing_rangesZextra_indexing_exprZexpected_var_rangesr  r  Zshould_merge_loopsZiter_rangesZiter_reindexr   Zreduce_rangesZreduce_reindexZ	iter_varsr  rw   r  rx   r    sz   




z#ComputedBuffer.simplify_and_reorderc              
     s   ddl m} |du rg }z* fdd|D }t|t|kr)t|d t ks+J tt|||}W n  tyV   tjrLt	dt
t | ttt}Y nw fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r4   r  Nc                   s   g | ]}t jj| qS rw   r  r  )r  r  rw   rx   r         z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                   r   rw   rw   r   )r@  rw   rx   r     r   )r  r  r   r   r(  	Exceptionr5   r  r  warningr   r   r   r   r   )r  r  r@  r  Zpriority_idxr  r3  r   rw   )r  r@  r  rx   r  z  s,   
z%ComputedBuffer._apply_loop_reorderingr  c                 C  r  rs   r:  r  rd  rw   rw   rx   r    r  z!ComputedBuffer.get_reduction_sizec                 C  r  rs   r:  r  rd  rw   rw   rx   r    r  z!ComputedBuffer.get_reduction_typec                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zComputedBuffer.is_no_opc                 C  r  NTrw   rd  rw   rw   rx   r    r   zComputedBuffer.should_allocater   r  rm   c                 C  r  )ro  r:  r  r  rw   rw   rx   r       z!ComputedBuffer.constant_to_devicer  r  r  r  r  r  r
  r  )rq   r  )rq   r  r  )rq   r  NN)r  r  r  r  rq   r  rs   r  r  r  )r   r   r   r   r  r  rm  rn  r  r  r  r  r   r  rJ   r  r  r  r  r  r  r  r  r  r[  rw   rw   r7  rx   r  d  s4   
 








'"s
#


r  c                      sb   e Zd ZdZd! fd
dZd"ddZdd Zd#ddZd$ddZd%ddZ			d&d'dd Z
  ZS )(TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    r;  r  rM  Sequence[IRNode]make_kernel_renderr%  rq   r   c                   s@   t  jd |d t|| _|| _tj| | _	tj
|  d S r  )r6  r=  InputsKernelunwrap_storagerM  r  r_   r   register_bufferr   register_operation)ra  r;  rM  r  r7  rw   rx   r=    s
   zTemplateBuffer.__init__r  c                 C  s   | j ddS )NT	normalize)r>   rd  rw   rw   rx   r    r   zTemplateBuffer.get_read_writesc              	     s   |   |     fdd}tj||  d|d}| jD ]j   fdd}| jtj| dddjO  _q|S )Nc                   s"   t |dksJ t | dS )Nr   Zfake)r   r]   rj  r  )re  r   rw   rx   dummy  ry  z1TemplateBuffer.extract_read_writes.<locals>.dummyrw   r  c                   s(   t |dksJ t  |  d S r  )r   r]   r  r  r  )re  rQ  rw   rx   r    s   T)	r  r  r  r6   r>   r   rM  r;  r  )ra  r  r  depsrw   )re  rQ  r   rx   r>     s   


z"TemplateBuffer.extract_read_writesr  c                 C  s   t jjS rs   )r   rE  r  rd  rw   rw   rx   r    r  z!TemplateBuffer.get_reduction_sizer  c                 C  r   rs   rw   rd  rw   rw   rx   r    r   z!TemplateBuffer.get_reduction_typerr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zTemplateBuffer.should_allocateNr  r  r  r  c                 C  s   |   dfd fS r  r  )ra  r  r  rw   rw   rx   r    s
   z#TemplateBuffer.simplify_and_reorder)r;  r  rM  r  r  r%  rq   r   r  r  r  r  r  )r  r  r  r  )r   r   r   r  r=  r  r>   r  r  r  r  r[  rw   rw   r7  rx   r    s    



r  c                      sT   e Zd Z		dd fdd	Z	
dd fddZdddZdddZdddZ  ZS ) TritonTemplateBufferNmutated_inputsOptional[Iterable[IRNode]]allowed_prologue_inpsOptional[OrderedSet[str]]rq   r   c                   s   t  ||| |_g_|durFtjjjtjjjf}t	j
jj}||v s0J d| d| jd    j fdd|D 7  _|rJ|nt _d_d_dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currently our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        Nz$Mutated inputs are only allowed for z	 but got r   c                      g | ]}t t d |qS r=  MutationOutputr^  r   r  r   ra  rw   rx   r     r  z1TritonTemplateBuffer.__init__.<locals>.<listcomp>)r6  r=  r  outputsr  r]   Zhigher_orderZflex_attentionZflex_attention_backwardr_   r   current_noder  rM  r  r/   r  subgraph_inpssubgraph_outs)ra  r;  rM  r  r  r  Zallowed_setr  r7  r  rx   r=    s&   



zTritonTemplateBuffer.__init__Fr   rr   r}   c                   s   t  |}| jr| jng }| jr| jng }|D ]%}t|tjr)|t|| qt|t	r7||| q|d u s=J q|D ]}t|t	rP||| q@|d u sVJ q@|S rs   )
r6  r  r  r  rt   r   r   updater   rm   )ra  r   resr  r  rQ  r   r7  rw   rx   r     s   

z)TritonTemplateBuffer.get_free_symbol_usesrN  c                 C  rp  rs   )r  rd  rw   rw   rx   r  7  rr  z TritonTemplateBuffer.get_outputsrf  c                 C  rp  rs   )r  rd  rw   rw   rx   get_allowed_prologue_inps:  rr  z.TritonTemplateBuffer.get_allowed_prologue_inpsr   c                 C  s   d| j  d}|S )NzTritonTemplateBuffer(layout=r  r  )ra  r   rw   rw   rx   r<  =  s   zTritonTemplateBuffer.__str__r  )r  r  r  r  rq   r   r  r
  r"  r  r  )	r   r   r   r=  r  r  r  r<  r[  rw   rw   r7  rx   r    s    ,

r  c                      sp   e Zd ZdZd  fddZd!ddZd"ddZdd Zd"ddZd"ddZ	d#ddZ
d$ddZd"ddZ  ZS )%ChoiceCallera.  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUDATemplateCaller.
    r   r   r~   rN  r;  r  descriptionrq   r   c                   s&   t    || _|| _|| _|| _d S rs   )r6  r=  r   r;  r~   r  )ra  r   r~   r;  r  r7  rw   rx   r=  N  s
   

zChoiceCaller.__init__r`  c                  s2   |    tjrt fddS t d|iS )Nc                     s     S rs   rw   rw   algor   rw   rx   r  `  s    z(ChoiceCaller.benchmark.<locals>.<lambda>r   )to_callabler5   Z/profile_bandwidth_with_do_bench_using_profilingrO   rE   	benchmark)ra  r   r   rw   r  rx   r  ]  s   zChoiceCaller.benchmarkc                 C  r  rs   r  rd  rw   rw   rx   	call_namec  r   zChoiceCaller.call_namec                 C  r  rs   r  rd  rw   rw   rx   r  f  r   zChoiceCaller.to_callablec                 C  r  )z
        Hash key for the underlying kernel. By default, we assume there are no
        runtime params, so kernel hash key defaults to choice caller's hash key.
        )hash_keyrd  rw   rw   rx   kernel_hash_keyi  s   zChoiceCaller.kernel_hash_keyc                 C  r  rs   r  rd  rw   rw   rx   r  p  r   zChoiceCaller.hash_keyrl   c                 C  r  rs   r  rd  rw   rw   rx   rK  s  r   zChoiceCaller.output_node<dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]c                 C  s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.rw   rd  rw   rw   rx   	info_dictv  ra  zChoiceCaller.info_dictc                 C  r  )NZunsupported_choicerw   rd  rw   rw   rx   autoheuristic_idz  r   zChoiceCaller.autoheuristic_id)
r   r   r~   rN  r;  r  r  r   rq   r   )rq   r`  r  )rq   rl   )rq   r  )r   r   r   r  r=  r  r  r  r  r  rK  r  r  r[  rw   rw   r7  rx   r  E  s    





r  c                   @  r  )TritonTemplateCallerBaserq   r
   c                 C  r  rs   r  rd  rw   rw   rx   get_make_kernel_render  r   z/TritonTemplateCallerBase.get_make_kernel_renderN)rq   r
   )r   r   r   r  rw   rw   rw   rx   r  ~  r  r  c                      sb   e Zd ZdZd fddZed ddZed!ddZej	d"ddZ
d#ddZd$ddZ  ZS )%MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    r;  r  rM  r   choice_timings_fn'Callable[[], dict[ChoiceCaller, float]]unfiltered_choiceslist[ChoiceCaller]r  rf  rq   r   c                   s>   t  j||d |d || _d | _|| _tdd |D | _d S )N)r;  rM  r  r  c                 s  s0    | ]}t |tpt |tjjjo|jV  qd S rs   )rt   r  r  	_inductorselect_algorithmZExternKernelCallerZhas_out_variant)r   choicerw   rw   rx   r!    s    

z/MultiTemplateBuffer.__init__.<locals>.<genexpr>)r6  r=  _choice_timings_fn_choice_timingsZoriginal_inputsr$  _output_plannable)ra  r;  rM  r  r  r  r7  rw   rx   r=    s   zMultiTemplateBuffer.__init__rr   c                 C  rp  )z^
        Are all possible choices TritonTemplates or Extern Kernels with out variants
        )r  rd  rw   rw   rx   output_plannable  s   z$MultiTemplateBuffer.output_plannabledict[ChoiceCaller, float]c                 C  s   | j d u r
|  | _ | j S rs   )r  r  rd  rw   rw   rx   choice_timings  s   

z"MultiTemplateBuffer.choice_timingscallerr  c                 c  sR    t |tjjjsJ | j|jksJ | j}| | _z	d V  W || _d S || _w rs   )rt   r  r  r  TritonTemplateCallerr;  r  r  )ra  r  renderrw   rw   rx   swap_as_triton_caller  s   
z)MultiTemplateBuffer.swap_as_triton_callerc                 C  sJ   t |tjjjs
J |  |jjksJ |  |jj	ksJ |
 | _d S rs   )rt   r  r  r  r  r   r;  r   r  r   r  r  )ra  r  rw   rw   rx   finalize_as_triton_caller  s   z-MultiTemplateBuffer.finalize_as_triton_callertuple[ChoiceCaller, float]c                 C  s    t | j| jjd}|| j| fS )Nr  )r  r  r2  )ra  Z
min_choicerw   rw   rx   get_min_choice  s   z"MultiTemplateBuffer.get_min_choice)r;  r  rM  r   r  r  r  r  r  rf  rq   r   r  )rq   r  )r  r  )r  r  rq   r   )rq   r  )r   r   r   r  r=  r  r  r  r  r  r  r  r  r[  rw   rw   r7  rx   r    s    
r  c                      s0   e Zd Zd fd	d
Zdd ZdddZ  ZS )CUDATemplateBufferworkspace_sizeru   templaterb   supports_epilogue_fusionrr   rq   r   c                   s&   t  ||| || _|| _|| _d S rs   )r6  r=  r   r  r  )ra  r;  rM  r  r   r  r  r7  rw   rx   r=    s   	
zCUDATemplateBuffer.__init__c                 C  s   | j d ur| j S dS r  )r   rd  rw   rw   rx   r!    r   z%CUDATemplateBuffer.get_workspace_sizec                 C  s$   |   D ]}t| d d  qd S rs   )r  r]   rj  r  )ra  rG  rw   rw   rx   emulate_store_fn  s   z#CUDATemplateBuffer.emulate_store_fn)r   ru   r  rb   r  rr   rq   r   r  )r   r   r   r=  r!  r  r[  rw   rw   r7  rx   r    s    r  c                      s,   e Zd Zd fddZd	 fddZ  ZS )
CppTemplateBufferrq   r   c                   s&   t  ||| || _|| _d | _d S rs   )r6  r=  r  r  r  )ra  r;  rM  r  r  r  r7  rw   rx   r=    s   
zCppTemplateBuffer.__init__r  c                   sV   t | jtr&t | jtsJ | jd }t |tsJ |j}t |ts$J |S t  S r  )	rt   r;  MultiOutputLayoutr  r   r  r  r6  r  )ra  Zfirst_outputr;  r7  rw   rx   r    s   

zCppTemplateBuffer.get_layoutr  r  )r   r   r   r=  r  r[  rw   rw   r7  rx   r    s    r  c                   @  sX   e Zd ZU ded< dddZddd	ZedddZedd Z	dddZ
dddZdS )r  rN  rM  rq   r  c                   s   t tj  }tj | jD ]#}t|tr | fdd|D  qt|tr&q|	 |
  qt tj  fdd|  D }tj||t  dS )Nc                 3      | ]	} |  V  qd S rs   r{  r  r  rw   rx   r!    r  z/InputsKernel.get_read_writes.<locals>.<genexpr>c                 3  r  rs   r{  r  r  rw   rx   r!        
)r  writesZindex_exprs)r/   r6   r;   r  rM  rt   r   r  r   r}  r  r  Z
ReadWrites)ra  r  inputr	  rw   r  rx   r    s    


zInputsKernel.get_read_writesr  c                 C  r  rs   r  rd  rw   rw   rx   rm    r  zInputsKernel.get_readsro   rm   c                 C  sz   t |tr|j}t |tr|j}t |trt |tst|}t |tr)| |S t |t	r0|S t |t
tfs;J ||S rs   )rt   rl   r:  r  r  r?  r  realize_inputunwrap_storage_for_inputTorchBindObjectr  rA  ro   rw   rw   rx   r    s   





z%InputsKernel.unwrap_storage_for_inputc                 C  s@   g }| D ]}t |trdd |D }nt|}|| q|S )Nc                 S  r  rw   )r  r  r   rw   rw   rx   r   *  r   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)rt   r   r  r  r  )rM  Z
inputs_newro   rw   rw   rx   r  %  s   

zInputsKernel.unwrap_storagerr   c                 C  r  r  rw   rd  rw   rw   rx   r  0  r   zInputsKernel.is_externru   c                 C  r  r  rw   rd  rw   rw   rx   r  3  r   zInputsKernel.num_readsNr  r  )ro   rm   rq   rm   r  r  )r   r   r   r   r  rm  rY  r  r  r  r  r  rw   rw   rw   rx   r    s   
 




r  c                   @  s    e Zd Zd	ddZd
ddZdS )	NopKernelrq   rr   c                 C  r  r  rw   rd  rw   rw   rx   r  8  r   zNopKernel.is_no_opr  c                 C  r  rs   r.   rd  rw   rw   rx   rm  ;  rr  zNopKernel.get_readsNr  r  )r   r   r   r  rm  rw   rw   rw   rx   r  7  s    
r  c                   @  s@   e Zd ZdZedd ZedddZedd ZdddZdS )ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C  s  |d   }|d  }t|d  }dg}|| g}d|  kr)t|k s,J  J tdt|D ]Z}||  }	|||  t|	t|ksLJ ||  |ksVJ ||   |ks`J tt|D ]}
|
|krw||
 |	|
  ||
< qftjj	
||
 |	|
 ||
< qf|||  q3t|}tjrt|||d j}tt|D ]!}|| }t|r| }t|trt|j|jrt|} qqtdd |D }tjjjd }t|tsJ |du rtdd |D rt|}td t||||dg d}t|}g }tt|D ]N}| || t j!|||| || dd	}|j"| t|| j#t$r6|| j#% }n|| j#}|& rVt'||   j(rVt)|sV||*  q	t|dkrntj+|t,j-rntj.| tj/||_0| 1|j"|_"tj2| |S )
Nr   r4   c                 s  r  rs   )r   r  rw   rw   rx   r!  n  r  z&ConcatKernel.create.<locals>.<genexpr>Fc                 s  sB    | ]}d |j v o|j d  jtjdp|j d  jtjdV  qdS )r  rF  N)rJ  r  r  rB  rD  r   argrw   rw   rx   r!  r  s    

)r   r   r   r   r   r;  rM  )r	  )3r  r  r   r   r   r   r  r_   r   r   r  r  r  r5   r8  r  r7  r   r   r  rt   r=  r'  r   r   r!   r~  r  r   r  r  rp  r  r@  rM  r:  r  r  r  rV   r   rU   r  r  r7   ZFOREACHZregister_operation_listr  r   r  r  )rA  rM  r-  r   r   r  Zoffsets_startZoffsets_endr   Z
input_sizer  Zoutput_stridero   r;  Zany_input_is_storage_and_layoutZfx_node_argsZconcat_kernelkernelZop_namesZinput_bufferZinput_unwrappedrw   rw   rx   r@  E  s   
 



 zConcatKernel.createNc                 C  s   t |tr| |j|S t |jtrCt |jjtr|jjsdS |d u r%dS t|	 t|	 ks3dS t
dd t|	 |	 D S t |jjtoPt |jt S )NFTc                 s  r5  rs   r6  r7  rw   rw   rx   r!    r8  z=ConcatKernel.can_realize_into_without_copy.<locals>.<genexpr>)rt   rl   can_realize_into_without_copyr:  r  r;  r=  r  r   r  r$  r   r  ExternKernelAlloc)rA  rn  ro  rw   rw   rx   r    s$   
z*ConcatKernel.can_realize_into_without_copyc              	   C  s   t |tst|rt|\}}t||d}t |tsJ |t |tr*| |j|S t |trJ|  t	|jds;J | 
||rJt||j_|jS tj| | | dd t| | D d}| ||S )Nr9  r;  c                 S  rj  rw   rk  rl  rw   rw   rx   r     rm  z-ConcatKernel.realize_into.<locals>.<listcomp>rq  )rt   r?  r   r<  rl   rp  r:  r  r  r  r  rP  r;  rb  r@  r  r  r  r   r   )rA  rn  ro  r@  r;  pwrw   rw   rx   rp    s,   


	zConcatKernel.realize_intorq   rr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zConcatKernel.should_allocaters   r  )	r   r   r   r  rY  r@  r  rp  r  rw   rw   rw   rx   r  ?  s    
`
 r  c                      sJ  e Zd ZU dZded< ejedZded< dZ	ded	< dZ
d
ed< dZd
ed< ejedZded< dZded< dZded< dZded< ejedZded< ejedZded< 							dndo fddZdpddZdqd!d"Zd#d$ Zd%d& Zdod'd(Zd)d* Zdrdsd+d,Zdtd-d.Zd/d0 Zed1d2 Zedud4d5Z ed6d7 Z!ed8d9 Z"ed:d; Z#e			<dvdwdAdBZ$edxdCdDZ%edxdEdFZ&edGdH Z'edIdJ Z(edKdL Z)edMdN Z*dodOdPZ+dQdR Z,drdydUdVZ-dWdX Z.dYdZ Z/dxd[d\Z0dzd^d_Z1dod`daZ2dodbdcZ3ddde Z4dfdg Z5	<dxd{djdkZ6dzdldmZ7e7Z8  Z9S )|r  rw   ztuple[Any, ...]constant_argsr`  zdict[str, Any]r   NzOptional[ReinterpretView]output_viewr  python_kernel_namecpp_kernel_namezIterable[str]ordered_kwargs_for_cpp_kernelzFOptional[Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]]op_overloadzOptional[list[dict[str, Any]]]arg_propertiesz#Optional[dict[str, dict[str, Any]]]kwarg_propertiesz"dict[sympy.Symbol, pytree.KeyPath]unbacked_bindingszlist[MutationOutput]mutation_outputsrq   r   c                   sn   t  j|||d || _|r|ni | _|| _|
| _| | | | |	| _| 	  i | _
g | _tjj| _d S Nr  )r6  r=  r  r   r  r  set_cpp_kernel_nameset_python_kernel_namer  collect_arg_kwarg_propertiesr!  r"  r_   r   r  fx_node)ra  r   r;  rM  r  r   r  r  r  r  r  r7  rw   rx   r=     s    

zExternKernel.__init__rN  c                 C  s   | g| j S rs   )r"  rd  rw   rw   rx   r    r   zExternKernel.get_outputsr}   c                 C  r  rs   r.   rd  rw   rw   rx   r   !  rr  z%ExternKernel.get_unbacked_symbol_defsc                 C  s   t | jtjjrdd | jjjD ndd tt| j	D | _
t | jtjjr1dd | jjjD ni | _t | jtjjrW| jsJdd | jjjD | _dd | jjjD | _d S g | _d S )Nc                 S  s$   g | ]}|j s|j|j|jd qS ))r   r   r  )
kwarg_onlyr   	real_typer  r  rw   rw   rx   r   (  s    z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S  s   g | ]}i qS rw   rw   r   rw   rw   rx   r   2  r   c                 S  s   i | ]}|j |j|jd qS ))r   r  )r   r)  r  r  rw   rw   rx   r   5  r  z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S     g | ]}|j r|jqS rw   r(  r   r  rw   rw   rx   r   @  
    c                 S  s   g | ]}|j r|qS rw   )r(  r  rw   rw   rx   r   C  s
    )rt   r  r  _ops
OpOverload_schema	argumentsr   r   rM  r  allarg_propertiesr  schema_kwargsrd  rw   rw   rx   r&  $  s*   


z)ExternKernel.collect_arg_kwarg_propertiesc                 C  s$   t | jtr|   |   d S d S rs   )rt   r;  r  apply_constraintr  rd  rw   rw   rx   r  I  s   zExternKernel.decide_layoutc                 C  s$   t | |\}}|r|| d S d S rs   )rR   Zmake_comment)ra  wrapperZ
origin_strZ_detailed_origin_strrw   rw   rx   codegen_commentN  s   zExternKernel.codegen_commentc                 C  r  rs   r  ra  r4  rw   rw   rx   codegenS  r   zExternKernel.codegenc                 C  s   || _ tjjrt| jtjjsd S | j}| j d u rB|j	dkr;|j
dkr+|jdd n|jdd}d| d| _ d S |jj| _ d S d S )Natenr  .r   r   z
at::_ops::z::call)r  r_   r   cpp_wrapperrt   r  r  r-  r.  	namespaceZ_overloadnamer   r  replacer/  r   )ra  r  r  opnamerw   rw   rx   r$  V  s   



z ExternKernel.set_cpp_kernel_namec                 C  sd   || _ |d ur	d S | j}|d u rd S t|tjjr"d|j | _ d S |jdd d|j | _ d S )Nztorch.ops.higher_order.._ops..ops.r9  )	r  r  rt   r  r-  HigherOrderOperatorr   r   r<  )ra  r  r  rw   rw   rx   r%  n  s   z#ExternKernel.set_python_kernel_namec                 C  s:   |    }r	|jntjj}tjjrtjj| j|S | j	S rs   )
r  r   r_   r   Zdevice_typer:  r  Zget_c_shim_func_namer  r  )ra  dr   rw   rw   rx   get_kernel_name}  s   zExternKernel.get_kernel_namec                 C  s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r&  r'  rZ  rX  )	rb  r@  r  r  r  r   rt  rq  r  )ro   r  rw   rw   rx   
copy_input  s   zExternKernel.copy_inputituple[Any, list[Any], list[Any], Callable[[Any, Any], Any], Optional[dict[sympy.Symbol, pytree.KeyPath]]]c                   s  ||d}t |\} g g }g }|D ]/}t|to"t|t  d r.|| qt|tjr>tj	j
jj|d d}|| q fdd}	fdd|D }|D ]}
t|
rbt|
dd	 qVg }|D ]n}
t|
ts|
 tj	jv r|tj	j|
   qgt|
ts|
 tj	jv r|tj	j|
   qgt|
tr||
  qgt|
tjjjr|
jj}|
jjd
kr|d usJ |tjj|   qg|t|
dd qg|	||\}}||i |}d }tjj }r*tj j!"d}t# }tj j$tj%j&j'kr
|d }t(tj }| t)|tj | W d    n	1 sw   Y  t*|||}t|t+t,fs5|gn|}|D ]'}t|tj-r_|j.r_d}tj	j j!"dd  }r[| d| }|tj	_/q9||||	|fS )Nr   r"  )r	  c                   sd   g }t | }t |}D ]}|r|t| q|t| qt| }|dg |di fS )Nr   r   )iterr  nextpytreeZtree_unflattenr2  )Znew_tensor_argsZnew_non_tensor_argsr`  Z
it_tensorsZit_non_tensorsZ	is_tensorrC  )	args_specis_arg_tensorrw   rx   unflatten_args  s   z3ExternKernel.process_kernel.<locals>.unflatten_argsc                   r*  rw   r  r  rA  rw   rx   r     r   z/ExternKernel.process_kernel.<locals>.<listcomp>Tr  r  )r   r  r4   zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 )0rG  tree_flattenr  rt   rm   GeneratorStater   r   r_   r   r   r   Zcreate_symintnoder   r<  r  r  	constantsZtorchbind_constantsr  	get_valuer  r  Zirr   r   r   r  Zdefault_generatorsZclone_stater   r#  r  rJ  r2  r   r  _higher_order_opsZeffectsZwith_effectsr$   r)   r%   r   r   TensorZ	is_sparseZdisable_cudagraphs_reason)rA  r  r   r   Zbinded_argsZ	args_flattensor_argsnon_tensor_argsr  rJ  ro   Zexample_argsZdevice_indexnew_argsZ
new_kwargsexample_outputr!  r   Znode_meta_valctxZexample_out_lir  msgrM  rw   )rH  rA  rI  rx   process_kernel  s   

		


zExternKernel.process_kernelc              	   C  sV  t |tsJ t |tr|S | }tj| }|dus J | }|durQd|j	v rQt |j
trQ|j	d jtjdsG|j	d jtjdrQ|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}	tjj||}
t||	|
 }||krtd|	|
| tt|jt |! |" | |	|
dd	S )
z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr  r  rC  rj   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr:  r9  )#rt   r  r?  r  r_   r   rd  r  rt  rJ  r;  r  r  r  rB  rD  r  r!   r   r  r6   r  r  r   r  Zstride_varsZ
offset_varrW   r  r  r  r:  r=  r  r  )rA  ro   Zx_unwrap_viewr  Zx_unwrap_view_fx_nodeZ
index_argsr  r  r   r3  r>  expectedrw   rw   rx   convert_to_reinterpret_view
  sj   





z(ExternKernel.convert_to_reinterpret_viewc                 C  s  |d u rt  S t|tjtjjjtfrt|dS t|t	r.t
jtj|j| | dS t|tr5|S t|tr@| |jS t|trQt| |j| dS t|trp|  t| rpz| |W S  tyo   Y nw t|tr{|  |S t|ttfr|S |  |S )N)r/  r"  r9  )!r  rt   r   r   r   r   r   ru   r   r  r_   r   Zadd_tensor_constantr  r1  r	  r  r  rr  rl   r  r:  r?  r  r  r  r   r  r\  r  r  NonTensorObjrC  r  rw   rw   rx   r  O  s<   







zExternKernel.realize_inputc                 C  sD   t |rt| dkr|S | D ]
}|dkr|  S q| |S r0  )r   r   r  rC  )rA  ro   r   rw   rw   rx   require_stride1p  s   
zExternKernel.require_stride1Fr   Optional[Sequence[int]]r  r  c              	     s  |d us
 d us
J   dv r sS trt trI|r=tddt|r6ttj	j
 jn||d S tddd | d S t ttfrs|r[ |sh rst  j rs d urqt S S t trt  trtdt  tr|r  |s rt   j rS ttr|r |sǈ rt  j rɈS ttrtjtrtjtst rt jtsz!| j_|r| j||dW S  r| j |dW S W n
 t y   Y nw d } } d urItj	j
 fdd	t!t" D }|D ]}t#j$j%&|d
dq;| 'tdd|| d |ret|scJ S |r|d urr d ustJ t#j$j%(|t S S )N)r   r4   TF)r  r  r  r  r  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutr  c                   s4   g | ]}  | d r | dr|qS )r   r   )r+  r  r   r   r  r   ro   rw   rx   r     s    z0ExternKernel.require_strides.<locals>.<listcomp>r   r4   ))r  r   rt   r  r  r<  r  r   r_   r   r   Z
size_hintsr   r=  rP  r  r0  r   rD  rc  rf  r  r  rl   r:  r  r?  r  r  r\  require_stride_orderrequire_exact_stridesr  r   r   r  r  loweringZslice_rC  r  )rA  ro   r   r  r  Zexpanded_dimsZ	orig_sizer-  rw   r`  rx   require_stridesz  s   

	


	


zExternKernel.require_stridesc                 C     | j |||dS )N)r  r  rd  )rA  ro   r  r  rw   rw   rx   rb    s   z"ExternKernel.require_exact_stridesc                 C  re  )N)r   r  rf  )rA  ro   r   r  rw   rw   rx   ra       z!ExternKernel.require_stride_orderc                 C     |  |tS rs   )ra  rC  r  rw   rw   rx   require_channels_last!  r  z"ExternKernel.require_channels_lastc                 C  rh  rs   )ra  rE  r  rw   rw   rx   require_channels_last_3d%  r  z%ExternKernel.require_channels_last_3dc                 C  s*   dd }||r
|S |  |t| S )Nc                 S  s*   dd }|| t jjv ot jj||  jS )Nc              	   S  s$   z|   W S  ttfy   Y d S w rs   )r  AttributeErrorr  rv   rw   rw   rx   safe_get_name,  s
   
zPExternKernel.require_contiguous.<locals>.is_mkldnn_tensor.<locals>.safe_get_name)r_   r   rP  Z	is_mkldnn)ro   rl  rw   rw   rx   is_mkldnn_tensor+  s   z9ExternKernel.require_contiguous.<locals>.is_mkldnn_tensorrb  r  r  r   )rA  ro   rm  rw   rw   rx   r  )  s   zExternKernel.require_contiguousc                 C  s   |  |t| S rs   rn  r  rw   rw   rx   require_contiguous_strides?  s   z'ExternKernel.require_contiguous_stridesc                 C  r   rs   rw   rd  rw   rw   rx   r3  G  r   zExternKernel.apply_constraintc                 C  s   t |ttfs	J t |trt|}| jsJ dt|}t| j}||k rQtd| j||  t||D ]}| j| d }|	||v rH|| n| j| d  q5|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.r   r  )
rt   r   r   r  r   r  r  r  r   r  )ra  r   r   Zn_argsZ
n_pos_argsr   arg_namerw   rw   rx   fill_non_provided_argsJ  s(   	


z#ExternKernel.fill_non_provided_argsr/  rW  c           	      C  s   t jjrig }d }|r"| jr"t| jt|ksJ ddd | jD }t| jD ]?\}}|d ur@||| }|r=|dnd }nt| j| }| jrY|t| jk rY| j| dnd }|	t jj
|| q'|S tt jj
j| jS )NzDnames passed to codegen_const_args does not match self.constant_argsc                 S  s   i | ]}| d |qS r   )r2  r  rw   rw   rx   r   x  r  z3ExternKernel.codegen_const_args.<locals>.<dictcomp>r   )r_   r   r:  r  r   r  r   r2  rM  r  r  val_to_arg_strr  )	ra  r/  r`  Zname_to_arg_propertiesr   ro   proptype_r   rw   rw   rx   codegen_const_argsl  s0   
zExternKernel.codegen_const_argsc                 C  s   t jjr| jd ur| g | j| j| j}d}n| j}d}g }t|D ]4\}}t jjrN| j	r6|t
| j	k s:J d| j	| d}|t jj|| q$|t jj| q$|rb||   |S )NFTz-Invalid access to ExternKernel.arg_propertiesr   )r_   r   r:  r  rq  rM  r  r   r   r  r   r2  r  r  rr  r  ru  )ra  rM  Zneed_codegen_constant_argsr   r   ro   rt  rw   rw   rx   codegen_args  s&   zExternKernel.codegen_argsc                 K  sX   ||v r	| |S || jv r| j |S | jr%|| jv r%| j | dS t| d)zGiven an argument name, queries for values in (in order):
        1. any provided kwargs for this function.
        2. the class self.kwargs member.
        3. any available default arguments in self.allarg_properties.r  z not in self.allarg_properties)r2  r   r1  r  )ra  rp  r   rw   rw   rx   get_kwargs_value  s   

zExternKernel.get_kwargs_valuec                 C  s   t jjrR| jd urt| jdkrg S g }| jD ]8}|r |dkr q| |}t|t	j
r1|| q| jrB|| jv rB| j|dnd }|t jj|| q|S dd | j D }|S )Nr   r   r   c                 S  s(   g | ]\}}| d t jj| qS r1  r_   r   r  rr  )r   kr  rw   rw   rx   r     s    z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)r_   r   r:  r  r   r2  r  rw  rt   r   r   r  r1  r2  r  rr  r   r  )ra  Zskip_outr   rp  r  rt  rw   rw   rx   codegen_kwargs  s,   

zExternKernel.codegen_kwargsr   c                 C  sT   | j d ur&| j j}t|dd}|dd}|ddd }| d| }|S d}|S )	Nr   Zunknown_namespacer>  r?  r9  r4   r   Z
unknown_op)r'  r  r   r<  rsplit)ra  r  Zop_namespaceop_namerw   rw   rx   get_op_name  s   
zExternKernel.get_op_namec                 C  s   t jr=tjjs?t|  dkrd S tjj|  }tjj| 	 }| 
 }|d|   d| d| d|d	 d S d S d S )Nr   zassert_size_stride(r  r  )r5   Zsize_assertsr_   r   r:  rZ   r   r  Zcodegen_shape_tupler  r}  r  r  )ra  r4  r   r   r|  rw   rw   rx   codegen_size_asserts  s    z!ExternKernel.codegen_size_assertsc              	   C  st   t jr6tjjs8|  }|tjjv}|  }|r(|d| dt	 d|d d S |d| d| d d S d S d S )Nzassert_alignment(r  r  z	# buffer z (op: z) is assumed to be not aligned)
r5   Zalignment_assertsr_   r   r:  r  r  r}  r  rS   )ra  r4  r   Zalignedr|  rw   rw   rx   codegen_alignment_asserts  s   z&ExternKernel.codegen_alignment_assertsc                 C  s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r  )ra  _sizeZ_striderw   rw   rx   get_group_stride  s   zExternKernel.get_group_stridec                   s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                   r*  rw   )r   r  )r   rw   rx   r     r   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S  s   g | ]	}t d | qS )rA  )rX   r   rw   rw   rx   r     r  T)r  r  c                 S  r   rw   rw   r   rw   rw   rx   r     r   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                   r   rw   rw   r   r   rw   rx   r     r   c                   r   rw   rw   r   )r  rw   rx   r     r   cc                   r   rw   rw   r  )add_varrw   rx   r     r   )r_   r   r   r   r  r   r   r  rH  r   r  r  r?   r   r   r[   r   r  r   )ra  r@  r3  Zindex_orderr   re  r   Z	new_sizesr   r  r   replacementrw   )r  r  r   r   rx   canonicalize  s$   
 zExternKernel.canonicalizer   rr   c                 C  sP   |rt nt}ttj  }| jD ]}|||O }q| j D ]}|||O }q|S rs   )maybe_free_unbacked_symbolsmaybe_free_symbolsr/   r   r   r  r   r   )ra  r   Zmaybe_get_symbolsrC  r  rw   rw   rx   r    s   

z!ExternKernel.get_free_symbol_usesc                   sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nr  zpython_kernel_name=c                   s$   g | ]}|j  d t |j  qS r1  )r   r   )r   r  rd  rw   rx   r   1      z(ExternKernel.__str__.<locals>.<listcomp>r3  )r   r  fieldsr  rZ  r  )ra  Zkernel_namer}  rw   rd  rx   r<  ,  s   
zExternKernel.__str__rw   NNNNrw   Nr  r"  r#  rs   r  r  rq   r   )r  r  rq   r   )rq   rD  )NNF)r   r_  r  r  r  )r/  rW  r  r
  ):r   r   r   r  r   r  r  r   r   r  r  r  r   r  r  r  r   r!  r"  r=  r  r   r&  r  r5  r7  r$  r%  rB  r  rC  rY  rZ  r\  r  r^  rd  rb  ra  ri  rj  r  ro  r3  rq  ru  rv  rw  rz  r}  r~  r  r  r  r  r<  rX  r[  rw   rw   r7  rx   r    s   
 


%


w
D
 
	 




" 



	 
r  c                      sB   e Zd ZdddZ							dd fddZdd
dZ  ZS )ExternKernelOutrq   r   c                 C     | |  d S rs   )Zgenerate_extern_kernel_outr6  rw   rw   rx   r7  =  rz   zExternKernelOut.codegenrw   Nc
           
        sF   t  d || |||pi d ||||	
 tj| | _tj|  d S rs   )r6  r=  r  r_   r   r  r   r  )
ra  r;  rM  r  r   r  r  r  r  r  r7  rw   rx   r=  @  s   zExternKernelOut.__init__rr   c                 C  r  r  rw   rd  rw   rw   rx   r  [  r   zExternKernelOut.should_allocater  r  r  )r   r   r   r7  r=  r  r[  rw   rw   r7  rx   r  ;  s    
r  c                      s   e Zd Zd	 fddZ  ZS )
RandomSeedscountru   r   r  rq   r   c                   sF   t t j}t jt|t j|gdg |j|j|ggddtj	j
d d S )Nr  zaten.randint.low_outzat::_ops::randint_low_out::call)r;  rM  r  r  r  r  )r  r  r  r6  r=  r=  r  rP  r8  randintZlow_out)ra  r  r   Zlimitsr7  rw   rx   r=  `  s   
zRandomSeeds.__init__)r  ru   r   r  rq   r   r   r   r   r=  r[  rw   rw   r7  rx   r  _      r  c                      sH   e Zd ZdddZ						dd fddZdd
dZdd Z  ZS )r  rq   r   c                 C  r  rs   )Zgenerate_extern_kernel_allocr6  rw   rw   rx   r7  t  rz   zExternKernelAlloc.codegenrw   Nc	           	        sL   t  d || |||pi d ||||
 g | _tj| | _tj|  d S rs   )	r6  r=  r  r  r_   r   r  r   r  )	ra  r;  rM  r  r   r  r  r  r  r7  rw   rx   r=  w  s   zExternKernelAlloc.__init__rr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   z!ExternKernelAlloc.should_allocatec                 C  r  rs   r  rd  rw   rw   rx   r3    r   z"ExternKernelAlloc.apply_constraintr  )rw   NNNrw   Nr  )r   r   r   r7  r=  r  r3  r[  rw   rw   r7  rx   r  s  s    

r  c                      s@   e Zd ZdZd fddZddd	ZdddZdddZ  ZS )r  zP
    An output buffer that represents the mutation of a pre-existing buffer
    mutating_noder  rq   r   c                   sD   t  jd |d | }tj| |g| _|| _tj| | _	d S r  )
r6  r=  r  r_   r   re  mutation_namesr  r  r   )ra  r;  Zmutated_noder  Zmutated_node_namer7  rw   rx   r=    s   zMutationOutput.__init__c                 C  rp  rs   )r  rd  rw   rw   rx   rv    rr  zMutationOutput.get_defining_oprx  c                 C  rp  rs   )r  rd  rw   rw   rx   r    rr  z!MutationOutput.get_mutation_namesrr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zMutationOutput.should_allocate)r  r  rq   r   r  r  r  )	r   r   r   r  r=  rv  r  r  r[  rw   rw   r7  rx   r    s    

r  c                      s`   e Zd ZU dZi Zded< edd	d
ZedddZd fddZ	dddZ
dddZ  ZS )TMADescriptorad  
    An IR node representing a generic host-side TMA descriptor in the Triton API
    Mostly useful for user-defined Triton kernels relying on host-side TMA;
    but can, in principle, be used for Inductor's Triton templates, too.

    See TMADescriptorExperimental and TMADescriptorStable for the two implementations
    (the old API and the new API)
    zdict[Any, TMADescriptor]_CACHEr1  rm   tma_metatuple[str, tuple[Any, ...]]rq   c                 C  sT   t |dksJ |d dkrt|g|d R  S |d dks J t|g|d R  S )Nr   r   Zexperimentalr4   r  )r   TMADescriptorExperimentalTMADescriptorStable)rA  r1  r  rw   rw   rx   _create_impl  s
   zTMADescriptor._create_implc                 C  s2   t ||f}|| jvr| ||| j|< | j| S rs   )idr  r  )rA  r1  r  r  rw   rw   rx   r@    s   

zTMADescriptor.createc                   sL   t  d tt|| d|t|d  || _tj	| | _
tj|  d S )Nr9  )r6  r=  rP  r?  r  r   r1  r_   r   r  r   r  )ra  r1  rM  r  r7  rw   rx   r=    s   zTMADescriptor.__init__r   c                 C  r  rs   )Zgenerate_tma_descriptorr6  rw   rw   rx   r7    rz   zTMADescriptor.codegenc                 C  rp  rs   )r1  rd  rw   rw   rx   
get_tensor  rr  zTMADescriptor.get_tensor)r1  rm   r  r  rq   r  )r1  rm   r  r   )r   r   r   r  r  r   rY  r  r@  r=  r7  r  r[  rw   rw   r7  rx   r    s   
 

r  c                      s&   e Zd ZdZ	dd fddZ  ZS )r  z
    the new host-side TMA Descriptor API:
    (the ones obtained via create_{1d,2d}_tma_descriptor calls).

    See also TMADescriptorStable for the new API.
    Nr1  rm   r  list[Union[int, torch.SymInt]]
block_dimselement_sizer  rq   r   c                   s   t |dv sJ t |t |ksJ |d u r| j}|| _|| _|| _t | j| _|g}g | j| j| j}t j|||d d S )N)r4   r   r1  rM  r  )	r   r  r  r  r  r  r6  r6  r=  )ra  r1  r  r  r  rM  r  r7  rw   rx   r=    s*   

z"TMADescriptorExperimental.__init__rs   )
r1  rm   r  r  r  r  r  r  rq   r   r   r   r   r  r=  r[  rw   rw   r7  rx   r    s    r  c                      s"   e Zd ZdZd fddZ  ZS )	r  z
    the new host-side TMA descriptor API
    (the ones obtained via TensorDescriptor.from_tensor).

    See also TMADescriptorExperimental for the old API.
    r1  rm   block_shaper  c                   s   || _ t j||g|d d S )Nr  )r  r6  r=  )ra  r1  r  r7  rw   rx   r=  !  s   
zTMADescriptorStable.__init__)r1  rm   r  r  r  rw   rw   r7  rx   r    s    r  c                      s(   e Zd Zd fddZdddZ  ZS )SubgraphBufferr;  r  r~   rN  rE  rF  example_inputs	list[Any]subgraph_namer   c           	   	     s  t  d || || _|| _tj| | _tj|  tj	| j||| _
t| j}|D ]}|| j
j|j< | j
j|j q,dd |D | _dd lm  m} t| j
0 |jdddd | j
j| j  W d    n1 spw   Y  W d    d S W d    d S 1 sw   Y  d S )Nc                 S  s   g | ]}|j qS rw   r   )r   Zsym_varrw   rw   rx   r   B  s    z+SubgraphBuffer.__init__.<locals>.<listcomp>r   FZATEN)Zmax_autotuneZmax_autotune_gemmZmax_autotune_gemm_backends)r6  r=  rE  r  r_   r   r  r   r  make_subgraphsubgraphrR  rM  r  Zgraph_input_namesr  
sym_inputsZtorch._inductor.configr  r5   set_graph_handlerr   run)	ra  r;  r~   rE  r  r  r  Zsym_inpZinductor_configr7  rw   rx   r=  ,  s.   
"zSubgraphBuffer.__init__rq   r   c                 C  sD   G dd d}dd | j D }||| jg | j|| jg d S )Nc                   @  r  )z,SubgraphBuffer.codegen.<locals>.CodegenGraphr   rc   c                 S  s   || _ |j| _d S rs   )r   r   )ra  r   rw   rw   rx   r=  Q  s   z5SubgraphBuffer.codegen.<locals>.CodegenGraph.__init__N)r   rc   )r   r   r   r=  rw   rw   rw   rx   CodegenGraphP  r  r  c                 S     g | ]}|  qS rw   r  r   r  rw   rw   rx   r   U  r   z*SubgraphBuffer.codegen.<locals>.<listcomp>)rM  Z'codegen_subgraph_with_flattened_outputsr  r  r   )ra  r4  r  Zouter_inputsrw   rw   rx   r7  O  s   zSubgraphBuffer.codegen)
r;  r  r~   rN  rE  rF  r  r  r  r   r  )r   r   r   r=  r7  r[  rw   rw   r7  rx   r  +  s    #r  c                      s`   e Zd Zdd ZdddZ	dd fddZdddZd fddZdddZdddZ	  Z
S )UserDefinedTritonKernelc                   s   ddl m} ddlm} || j g }g }g }t |ret dr0| fdd j	D  nt ds7J | j
 t drR jD ]}| jj|  qEnt d	sYJ | j  j} j  |||fS )
Nr   )	Autotuner)kernel_side_tablerestore_idxc                 3  s    | ]	} j j| V  qd S rs   )r   	arg_namesr   r  rw   rx   r!  k  r  zBUserDefinedTritonKernel.get_kernel_and_metadata.<locals>.<genexpr>restore_value	reset_idxreset_to_zero)Ztriton.runtime.autotunerr  *torch._higher_order_ops.triton_kernel_wrapr  Z
get_kernel
kernel_idxrt   r  r  r  r  r  r  r   r  r  configs)ra  r  r  r  restore_value_argsreset_to_zero_argsr   rw   r  rx   get_kernel_and_metadata^  s,   




z/UserDefinedTritonKernel.get_kernel_and_metadatarq   r   c                   s  ddl m}  \ }}}| |j||j\}}}fddjD }	t fdd jD }
g }g }g }g }t	
|	 tt	d|D ]o\}}|| || t|trk||  ||  qIt|ttttjfr|| |t| qI||
v r|d |t qI|d u r	 | r|d |t qI|  |  qItd	t| d
| | |j||||||d jjd	 d S )Nr   )triton_version_uses_attrs_dictc                   s   i | ]}|  |qS rw   rw  r   ry  rd  rw   rx   r     r  z3UserDefinedTritonKernel.codegen.<locals>.<dictcomp>c                   r  rw   )r  r   r  rw   rx   r     r   z3UserDefinedTritonKernel.codegen.<locals>.<listcomp>ry  r"  zUnsupported arg type: r  T)	arg_typesZraw_argsZraw_keystriton_metar  r   Zoriginal_fxnode_name) Ztorch._inductor.utilsr  r  Z!define_user_defined_triton_kernelr   gridr  r/   Z
constexprsr  r3  r  r   repeatr  rt   rm   r  r  ru   r`  rr   r   r   r   r?  r  r5  Zgenerate_kernel_callr  r'  r   )ra  r4  r  r  r  r  new_namer  Zextra_launch_argsZ
named_argsZconstexpr_namesr   r  Zraw_keys_filteredZraw_args_filteredr   r  rw   )r  ra  rx   r7  }  sz   
	




	



zUserDefinedTritonKernel.codegenFr   rr   r}   c                   s   t  |t| j|B S rs   )r6  r  r   r  r  r7  rw   rx   r    s   z,UserDefinedTritonKernel.get_free_symbol_usesc                 C  r  rs   r.   rd  rw   rw   rx   r     rr  z0UserDefinedTritonKernel.get_unbacked_symbol_defsc                  sV  g }i }g }   D ]0\}}	t|	tr1t|	}
||v r't|
|| }
||
 |
||< q
||	 |	||< q
t	|dksCJ |d 
 _t d tjd|t|| |_|_ \}}}} fdd|jD _ddlm} t	|dkr|d jni } fdd||i  ||D _fddjD _tj d S )Nr   r=  c                   s   g | ]}| v r|qS rw   rw   r  kernel_argsrw   rx   r     r  z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_mutated_tensorsc                   r   rw   rw   r   r  r  rw   rx   r     s    c                   s    g | ]}t t jd | qS r  )r  r^  r   r  rd  rw   rx   r     s    )r  rt   rl   r  r  r  r  r@  r  r   r  r   r6  r=  r^  r   r  r  r  r  r  r  r  r   Zmutable_argsr"  r_   r   r  )ra  r  r  Ztma_descriptor_metadatar  rM  r   r  ry  r  r  r  r  r   r  Zautotuned_kwargsr7  )r  ra  rx   r=    sL   








z UserDefinedTritonKernel.__init__rN  c                 C  s
   t | jS rs   )r   r"  rd  rw   rw   rx   r    r  z#UserDefinedTritonKernel.get_outputsr  c                 C  rp  rs   r=  rd  rw   rw   rx   r    rr  z"UserDefinedTritonKernel.get_devicer  r  r
  r#  r"  r  )r   r   r   r  r7  r  r   r=  r  r  r[  rw   rw   r7  rx   r  ]  s    
Q
	
5r  c                      J   e Zd ZdZdddZdddZdd
dZdddZd fddZ  Z	S )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    rq   r   c                 C  s   dd | j D \}tjjr)||   d| ddtt| j	 d|j
  d S ||   d| ddtt| j	 d|j
  d S )Nc                 s      | ]}|  V  qd S rs   r  r  rw   rw   rx   r!    r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>r  r  z, NULL)r  )rM  r_   r   r:  r  rB  r  r  reprr  ending)ra  r4  ro   rw   rw   rx   r7    s   ,,z InplaceBernoulliFallback.codegenrr   c                 C  r  r  rw   rd  rw   rw   rx   r  (  r   z(InplaceBernoulliFallback.should_allocaterx  c                 C     | j d  gS r  rM  r  rd  rw   rw   rx   r  +  r  z+InplaceBernoulliFallback.get_mutation_namesr}   c                 C  r  rs   r.   rd  rw   rw   rx   r   .  rr  z1InplaceBernoulliFallback.get_unbacked_symbol_defsc                   sV   t  jd t| d| |g||d tj|  tj	| | _
tj|  d S )Nr=  r  )r6  r=  r^  r  r  r_   r   re  r  r  r   r  )ra  r  ro   r  r7  rw   rx   r=  1  s   
z!InplaceBernoulliFallback.__init__r  r  r  r#  
r   r   r   r  r7  r  r  r   r=  r[  rw   rw   r7  rx   r        



r  c                      sZ   e Zd ZdZdddZdddZdd
dZdddZd fddZe	ddddZ
  ZS )InplaceCopyFallbackr  rq   r   c                 C  s    |   \}}}|||| d S rs   )rv  codegen_device_copy)ra  r4  ro  rn  non_blockingrw   rw   rx   r7  D  s   zInplaceCopyFallback.codegenrr   c                 C  r  r  rw   rd  rw   rw   rx   r  H  r   z#InplaceCopyFallback.should_allocaterx  c                 C  r  r  r  rd  rw   rw   rx   r  K  r  z&InplaceCopyFallback.get_mutation_namesr}   c                 C  r  rs   r.   rd  rw   rw   rx   r   N  rr  z,InplaceCopyFallback.get_unbacked_symbol_defsc                   sJ   t  jd |||ddd tj|d   tj| | _tj|  d S )Nz
aten.copy_Zaoti_torch_copy_)r  r  r   )	r6  r=  r_   r   re  r  r  r   r  )ra  r;  rM  r  r7  rw   rx   r=  Q  s   zInplaceCopyFallback.__init__Fr  c                   s6    fdd||fD }|f}t t| d||}|S )Nc                   r*  rw   rK  r  rL  rw   rx   r   e  r   z.InplaceCopyFallback.create.<locals>.<listcomp>r=  )r  r^  r  )rA  ro  rn  r  rM  r  r`  rw   rL  rx   r@  c  s   zInplaceCopyFallback.creater  r  r  r#  r  )r  rr   )r   r   r   r  r7  r  r  r   r=  rY  r@  r[  rw   rw   r7  rx   r  ?  s    



r  c                   @  sB   e Zd ZdZdddZdddZdd
dZdddZdddZdS )MutatingFirstArgExternKernelr  rq   r   c                 C  sJ   g dd | j D tt| j}||   dd| d|j  d S )Nc                 s  r  rs   r  r  rw   rw   rx   r!  v  r  z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>r  r  r  )rM  r  r  r  r  rB  r  r  )ra  r4  Zargrefsrw   rw   rx   r7  t  s   
z$MutatingFirstArgExternKernel.codegenrr   c                 C  r  r  rw   rd  rw   rw   rx   r  }  r   z,MutatingFirstArgExternKernel.should_allocaterx  c                 C  r  r  r  rd  rw   rw   rx   r    r  z/MutatingFirstArgExternKernel.get_mutation_namesr}   c                 C  r  rs   r.   rd  rw   rw   rx   r     rr  z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C  r  r  rw   rd  rw   rw   rx   has_side_effects  r   z-MutatingFirstArgExternKernel.has_side_effectsNr  r  r  r#  )	r   r   r   r  r7  r  r  r   r  rw   rw   rw   rx   r  o  s    

	

r  c                      s   e Zd Zd fddZ  ZS )ResizeStorageBytesrq   r   c                   s   t |ts	J dt jd t| d| |g|fd tj	|
  tj| | _tj|  d| _d| _tjj|j
  d S )NzTODO: dynamic shapesr=  )r  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)rt   ru   r6  r=  r^  r  r  r_   r   re  r  r  r   r  r  r  never_reuse_buffersr}  r:  )ra  variabler  r7  rw   rx   r=    s   
zResizeStorageBytes.__init__r  r  rw   rw   r7  rx   r    r  r  c                      s(   e Zd Zd fddZd	ddZ  ZS )
SetSourceTensorKernelrq   r   c                   s   |   t j| ||gdtjjjjd t	j
j|j  t	j
j|  t	j
j|   | }tt|d|| tt|d|| g| _d S )Nz!torch.ops.aten.set_.source_Tensor)r  r  r=  )r  r6  r=  r  r  r]   r8  Zset_Zsource_Tensorr_   r   r  r}  r:  r  r  r  r^  r"  )ra  Zself_tensorZstorage_tensorr   r7  rw   rx   r=    s   

zSetSourceTensorKernel.__init__rx  c                 C  s   | j d  | j d  gS r0  r  rd  rw   rw   rx   r    s   z2SetSourceTensorKernel.get_inputs_that_alias_outputr  r  )r   r   r   r=  r  r[  rw   rw   r7  rx   r    s    r  c                      sR   e Zd ZdZdddZdddZdd
dZdddZdddd fddZ  Z	S )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    rq   r   c              
   C  s   | j d }tjjrddd}||v r|| }| jr%dd | jD \}}}ndd | jD \}}| jd }|||| jd	 ||g| j| j	| j|| 
  d S )
Nr  r  r  )r}  multiplyc                 s  r  rs   r  r  rw   rw   rx   r!    r  z*ScatterFallback.codegen.<locals>.<genexpr>c                 s  r  rs   r  r  rw   rw   rx   r!    r  r4   r   )r   r_   r   r:  src_is_tensorrM  r  Zgenerate_scatter_fallbackr  r  rz  )ra  r4  r  Zget_operator_enumro   r   rn  rw   rw   rx   r7    s$   


zScatterFallback.codegenrr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zScatterFallback.should_allocaterx  c                 C  r  r  r  rd  rw   rw   rx   r    r  z"ScatterFallback.get_mutation_namesr}   c                 C  r  rs   r.   rd  rw   rw   rx   r     rr  z(ScatterFallback.get_unbacked_symbol_defsNTr  include_selfr-  ru   r  r  r  c          
   
     s   t |t _ jr fdd|||fD }|f}	n fdd||fD }||f}	t jd t| d ||	||dt|ddg|d t	j
|  t	j
  _t	j
  d S )	Nc                   r*  rw   rK  r  rd  rw   rx   r     r   z,ScatterFallback.__init__.<locals>.<listcomp>c                   r*  rw   rK  r  rd  rw   rx   r     r   r=  r  r  r  )r  r  r  )rt   rl   r  r6  r=  r^  r  r  r   r_   r   re  r  r  r   r  )
ra  r  ro   r-  r   rn  r  r  tensorsr  r7  rd  rx   r=    s&   
zScatterFallback.__init__r  r  r  r#  )r-  ru   r  r  r  rr   rq   r   r  rw   rw   r7  rx   r    s    



r  c                      r  )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    rq   r   c           	      C  s   dd | j D ^}}}g }t|}t| jD ]\}}| j| d ur)|t| q|tjjj	 q|j
|  |||g|  R   d S )Nc                 s  r  rs   r  r  rw   rw   rx   r!    r  z+IndexPutFallback.codegen.<locals>.<genexpr>)rM  rE  r   r  r  rF  r_   r   r  r  Zgenerate_index_put_fallbackrB  ru  )	ra  r4  ro   r   valid_indicesr  Ziter_valid_indicesr   r   rw   rw   rx   r7    s   zIndexPutFallback.codegenrr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   z IndexPutFallback.should_allocaterx  c                 C  r  r  r  rd  rw   rw   rx   r    r  z#IndexPutFallback.get_mutation_namesr}   c                 C  r  rs   r.   rd  rw   rw   rx   r     rr  z)IndexPutFallback.get_unbacked_symbol_defsc           	   	     s   | _ dd |D } fdd||g|D }d}t jd t| d ||fd||d tj j	d 
  tj  _tj  d S )	Nc                 S  s   g | ]}|d ur|qS rs   rw   r   rw   rw   rx   r     r   z-IndexPutFallback.__init__.<locals>.<listcomp>c                   r*  rw   rK  r  rd  rw   rx   r     r   Zaoti_torch_index_put_outr=  zaten.index_put_)r  r  r  r   )r  r6  r=  r^  r  r  r_   r   re  rM  r  r  r   r  )	ra  r  ro   r  r   
accumulater  r  r  r7  rd  rx   r=    s    	zIndexPutFallback.__init__r  r  r  r#  r  rw   rw   r7  rx   r    r  r  c                   @  s"   e Zd Zedd ZdddZdS )	
DeviceCopyc                 C  s   |  stdd | D rtjjs||S tj	| tj	|
  td |f}tt|| | d| |g|S )Nc                 s  s    | ]	}|t jjv V  qd S rs   )r_   r   rP  r  rw   rw   rx   r!  0  r  z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr  )r  r$  rn  r5   aot_inductorZuse_runtime_constant_foldingr  r_   r   Zadd_device_infor  rN   r  r  r  r   r  )rA  ro   r   r  r  rw   rw   rx   r@  ,  s(   

zDeviceCopy.createrq   r   c                 C  s\   |   }t|dksJ | jr||d | j |d  d S ||d |  |d  d S )Nr   r   r4   )rv  r   r  r  r  )ra  r4  r   rw   rw   rx   r7  D  s   zDeviceCopy.codegenNr  )r   r   r   rY  r@  r7  rw   rw   rw   rx   r  +  s    
r  c                      sJ   e Zd ZdZdddZdddZd fd
dZdddZdddZ  Z	S )r   z;
    The result of a call to aten._local_scalar_dense.
    rq   r  c                 C  r  rs   r.   rd  rw   rw   rx   rm  T  rr  zDynamicScalar.get_readsrr   c                 C  r  r  rw   rd  rw   rw   rx   r  W  r   zDynamicScalar.should_allocater   c                   s<   |   t d ttdd| |g || _|| _d S Nr  r=  )	r  r6  r=  r^  r  r   r  symkeypath)ra  r  r  r:  r7  rw   rx   r=  Z  s   
zDynamicScalar.__init__r}   c                 C  s   t | jgS rs   )r/   r  rd  rw   rw   rx   r   b  r   z&DynamicScalar.get_unbacked_symbol_defsc                 C  r  rs   )Zcodegen_dynamic_scalarr6  rw   rw   rx   r7  e  rz   zDynamicScalar.codegenr  r  r  r#  )
r   r   r   r  rm  r  r=  r   r7  r[  rw   rw   r7  rx   r   O  s    


r   c                      sV   e Zd ZdZdddZdddZd fd
dZdddZddddZdddZ	  Z
S )r   z5
    The result of a call to aten._assert_scalar
    rq   r  c                 C  r  rs   r.   rd  rw   rw   rx   rm  n  rr  zAssertScalar.get_readsrr   c                 C  r  r  rw   rd  rw   rw   rx   r  q  r   zAssertScalar.should_allocater   c                   s,   t  d ttddg  || _|| _d S r  )r6  r=  r^  r  r   scalarrY  )ra  r  rY  r7  rw   rx   r=  t  s   
zAssertScalar.__init__c                 C  r  r  rw   rd  rw   rw   rx   r    r   zAssertScalar.has_side_effectsFr   c                 C  r  rs   )r   r  r  rw   rw   rx   r    r   z!AssertScalar.get_free_symbol_usesc              	   C  s   t jsd S tt| jdd}tjjr5d| d}tjjj	| j
dd}|d| d| j d| d	 d S tjjj| j
dd}|d
| d |dt| j d ||   d d S )NFrP  zstd::to_string(r  )r  zif (!(z()) { throw std::runtime_error("Expected z but received " + z); }zif not (z):z    raise RuntimeError(z = None)r5   Zscalar_assertsrF  rE  r  r_   r   r:  r  Zcodegen_cpp_sizevarr  r  rY  Zcodegen_python_sizevarr  r  )ra  r4  symbolZ
symbol_strZsizevarrw   rw   rx   r7    s"   zAssertScalar.codegenr  r  r  r  )r   rr   )r   r   r   r  rm  r  r=  r  r  r7  r[  rw   rw   r7  rx   r   i  s    


r   c                   @  s   e Zd ZU ded< ded< dS )ExternKernelNoder   r   zexport_schema.Noder   Nr   rw   rw   rw   rx   r    s   
 r  c                      s   e Zd ZdZ	d'ddd( fddZd) fd	d
Zd(ddZd*ddZdd Ze	dd Z
dd Zdd Zd+ddZdd Zd(ddZe	d,d!d"Zed#d$ Z fd%d&Z  ZS )-FallbackKernelz
    A class that represents a fallback kernel for handling operators that are not
    directly support by inductor. It currently supports functional ops, view ops,
    inplace aten ops, and mutating ops that are auto-functionalizable.
    Nr!  rq   r   c                  sN  t  j|t|t||d d _| _t|tjjtjj	fs,J d| dt
| d| _| _|d u r8i n| _tj j g  _g  _t jtjj	rRd S d j v r[d S  jj}tjj jrs j|d   d S |jrt|std|   j j\}	}d fdd}
tjj ||	|D ]	\}}|
|| qd S )Nr  Fz#Fails to create FallbackKernel for r  z not supportedZ_c10d_functionalr   z'NYI: Can't generate FallbackKernel for rq   r   c                   s   t  jtjrt |ttfsJ t jrt |ttfrJ |d u r%d S  jd u r,d S d fdd}t	 jrK|d urG|D ]}|| q@d S d S t jsSJ || d S )Nrq   r   c                   s>   j |    jjrjtt|  d|  d S d S r  )	alias_namesr  r  
alias_infoZis_writer"  r  r^  r  r  )infora  rw   rx   	add_alias	  s   zPFallbackKernel.__init__.<locals>.handle_aliasing_and_mutation.<locals>.add_aliasr  )
rt   r   r  ListTyper   r   library_utilsZis_tensor_like_typer  Zis_tensorlist_like_type)r  r  r  Zoptional_tensor_argrd  )r  rx   handle_aliasing_and_mutation  s"   

z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutationr  )!r6  r=  r   use_runtime_dispatchr!  rt   r  r-  r.  r@  r   r  rJ  r   r_   r   Zwarn_fallbackr  r  r  r   r/  _libraryrU  Zmutates_and_returns_first_argr  r  
is_mutabler   r  rM  r  Z
zip_schema)ra  r;  r  rT  nontensor_argsrJ  r   r!  schemar   r  r  r  r7  rd  rx   r=    sN   zFallbackKernel.__init__r  c                   sH   t   }| jtjjju r"| jD ]}t|t	r!|
t| }q|S rs   )r6  r  r  r  Z_primsZ	rng_primsZgraphsafe_run_with_rng_stater  rt   rO  Z	with_readr6   r  r  )ra  r  r  r7  rw   rx   r    s   


zFallbackKernel.get_read_writesc                 C  s   | |  | jt| dd S Nr!  )(codegen_unbacked_symbol_defs_for_outputsr  r  r   r6  rw   rw   rx   codegen_unbacked_symbol_defs'  s   z+FallbackKernel.codegen_unbacked_symbol_defsr}   c                 C  :   t | dd  }rttjjj|}|d usJ | S t S r   r   r*   r_   r   r   r   r  r/   ra  r!  resolvedrw   rw   rx   r   ,     
z'FallbackKernel.get_unbacked_symbol_defsc                   s   t jG dd d  fdd| jD }| || j\}}tjjr=t| j	t
jjr=| ||}dd t| j	jj|D }ndd |D }| j| |S )Nc                   @      e Zd ZU ded< dddZdS )	z)FallbackKernel.codegen_args.<locals>.Shimr
   refrq   r   c                 S  rp  rs   )r	  rd  rw   rw   rx   rX  ;  rr  z2FallbackKernel.codegen_args.<locals>.Shim.__repr__Nr  )r   r   r   r   rX  rw   rw   rw   rx   Shim7     
 r
  c                   s   g | ]} |  qS rw   r  r  r
  rw   rx   r   >  r   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S  s"   g | ]\}}t jj||jqS rw   )r_   r   r  rr  r)  )r   paramro   rw   rw   rx   r   B  s    c                 S  r  rw   rx  r  rw   rw   rx   r   G  r  )r  	dataclassrM  rJ  r  r_   r   r:  rt   r  r  r-  r.  rq  r   r/  r0  r   r  )ra  rT  r   r   rw   r  rx   rv  6  s   zFallbackKernel.codegen_argsc                 C  s   | r	dd | D nd }|rdd | D }|d S t |tjr!|jS t |ttfrTtdd |D }dd |D }t|dkrB|d S |D ]}t|j	rO|  S qD|d S d S )	Nc                 S  s   g | ]	}t |ts|qS rw   )rt   r  r  rw   rw   rx   r   P  r  z.FallbackKernel.find_device.<locals>.<listcomp>c                 S  s   g | ]
}|  r|  qS rw   r  r  rw   rw   rx   r   U  r  r   c                 s  s    | ]	}t d |V  qd S rs   )r  find_devicer  rw   rw   rx   r!  Z  r  z-FallbackKernel.find_device.<locals>.<genexpr>c                 S  s   g | ]}|r|qS rw   rw   )r   r   rw   rw   rx   r   ^  r   r4   )
rt   r  rS  r   r   r   r/   r   rV   r   )rT  rW  Znon_torch_bind_tensor_argsZdevicesZ
device_setr   rw   rw   rx   r  M  s,   
zFallbackKernel.find_devicec                 C  s"   t | jtjjr
dS t| j S r  )rt   r  r  r-  r@  r#   r  rd  rw   rw   rx   r  g  s   zFallbackKernel.has_side_effectsc                 C  rp  rs   )r  rd  rw   rw   rx   r  l  rr  z+FallbackKernel.get_inputs_that_alias_outputrx  c                 C  s   t | jdks	J | jS r  )r   r  rd  rw   rw   rx   r  o  s   z!FallbackKernel.get_mutation_namesc                   sP  t d j ttsJ jj\}	|}fddj
D }j}tjjs9g ||S tdd}|||}dd  t|tjjjr]||d |d j}n|jj}t|dkr|jrmjnj}|d j} ||g}	n fd	dt|jD }	t tjj ||	i d
d}
tjj !|
 g ||S )a  
        ProxyExecutor Design Note
        We export the ExternFallbackNodes (for custom ops) into a serialized file
        and run it with a host side proxy executor to address the ABI problem
        This is currently only implemented for fbcode. Eventually, we will also make this work for OSS.
        Detailed design doc can be found at
        https://docs.google.com/document/d/1wC4DOZFaYym2t1Esz0X5yxlLI3RDnSiyRbUus3bkJ64/edit?usp=sharing
        z4Extern kernel node added for node %s with target %s.c                   s   g | ]}j |fi  qS rw   r  r  r   ra  rw   rx   r     r  z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>Nc                 S  s<  t | tjtjfr>|}t |ttfrt|dksJ |d }t | tjr1tjj	tj
| ddS |d u s7J tjj	ddS t | tjrXt |  tjrXtjj	dd |D d	S t | tjrt |  tjr|d u rvtjj	tjj	ddd
S tjj	tjj	tj
| ddd
S t | tjrtjj	|dS tdt|  )Nr4   r   r   )Z	as_tensorT)Zas_nonec                 S  s   g | ]
}t j| d qS )r   )export_schemaTensorArgumentr  )r   r   rw   rw   rx   r     s    zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)Z
as_tensors)Zas_optional_tensor)Zas_intzUnsupported return type )rt   r  Z
TensorTypeNoneTyper   r   r   r  ZArgumentr@  r  r  r  getElementTypeOptionalTypeZOptionalTensorArgumentZIntTypeRuntimeErrorr   )return_typerG  r   rw   rw   rx   handle_single_output  sL   

zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr   r4   c                   s   g | ]
\}} |j |qS rw   )r)  )r   Zreturn_schemarG  )r  rw   rx   r     s    
)r  rM  r  metadata)r   r   )"r  r  r  r  rt   r  rJ  rM  r  rq  r  r_   r   Zaot_moder   Zserialize_inputsr  rR  	torchbindCallTorchBindr  returnsr/  r   r  r"  r)  r   r  r  ra   r   Zextern_kernel_nodesr  )ra  r   Zordered_kwargsr  
serializerZnamed_argumentsr  r  r  Zoutput_argumentsr   rw   )r  r   ra  rx   export_extern_kernel_nodes  sL   	
/



z(FallbackKernel.export_extern_kernel_nodec                   s  j }|jdkr+t|tjjsJ tjjr*ddl	m
} t||vr*td| d_n|jdkr:t|tjjs9J ntjjrE|tjjv_tjjrt|tjjrjsd fdd jj\}t|fddjD }t fddt||jjD _| jr }| j fddj |j!rj!nj" n|# tj$t%r&| '| (| d S )Nr8  r   )inductor_fallback_opszG%s is missing a c-shim implementation, using proxy executor as fallbackTZ
_quantizedr  torch.JitTyperq   rr   c                   s$   t | tjr |  S t | tjS rs   )rt   r  r  r  Z
NumberTyper  	is_numberrw   rx   r"    s   z)FallbackKernel.codegen.<locals>.is_numberc                 3  s"    | ]}j |fi  V  qd S rs   r  r  r  rw   rx   r!    r  z)FallbackKernel.codegen.<locals>.<genexpr>c                 3  s(    | ]\}}t |to |jV  qd S rs   )rt   complexr)  )r   r  r  r!  rw   rx   r!    s
    
c                     s   g      S rs   )rv  rz  rw   rd  rw   rx   r  "  r   z(FallbackKernel.codegen.<locals>.<lambda>)r  r   rq   rr   ))r  r;  rt   r  r-  r.  r_   r   r:  Ztorchgen.aoti.fallback_opsr  r   r  r  r  r5   r  Zcustom_ops_to_c_shimsrJ  rM  r  r  r3  r  r~  r   r/  r0  r5  r  Z,generate_fallback_kernel_with_runtime_lookupr  r  r  r"  Zgenerate_fallback_kernelr;  r  r~  r  r  )ra  r4  r  r  r   Z	args_iterZexported_argsrw   )r"  r   ra  rx   r7    sd   









zFallbackKernel.codegenrG  r   c                 C  s"   t | j| jt|  t|  S rs   )r=  r   r   rL   r   r   )rG  rw   rw   rx   tensor_to_layout0  s   

zFallbackKernel.tensor_to_layoutc                   s,  t jf}||vrtjjnt }|  j|g|R i |\}}}}	}
W d    n1 s-w   Y  tdd |D  ||}|sPt	|t
jjjrPt
d}|d u rb t|d||||	|
dn|shJ d t|d||||	|
d fdd|g }t	|tttfr|_|S |g_|S )	Nc                 s  r  rs   )r  r  rw   rw   rx   r!  J  r  z(FallbackKernel.create.<locals>.<genexpr>r  r=  r  z"Not sure where to find device infoc                   s   t ttfrt fddttD S t tr, fdd D S t tj	rNt
 }tjsDsDtsLtjj|j |S t trUS t tjr_jjS d u smJ dt dd S )Nc                 3  s,    | ]} | t |fg V  qd S rs   r   r   generate_outputr  rG  rw   rx   r!  k  s
    
zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                   s*   i | ]\}}| |t |fg qS rw   r%  )r   r  r  r&  rw   rx   r   p  rH  zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>zFallbackKernel output type z is not supported)rt   r   r   r   r   r   r   r  r  rS  MultiOutputr$  r5    assume_unaligned_fallback_outputr\   r_   r   r  r}  r   ru   ZSymIntr   r/  )rG  r  r  rA  r'  Zhas_unaligned_inputpacked)r  rG  rx   r'  i  s<   



z.FallbackKernel.create.<locals>.generate_output)r8  Z*_fused_moving_avg_obs_fq_helper_functionalr_   r   r#  r   rZ  r~  r  rt   r  rR  r  r  r   r^  r  r   r   r   r  )rA  r  r   r   Zfake_incorrect_kernelscontextrW  rT  rU  rJ  r!  r   r  rw   r*  rx   r@  9  sV   


	
"zFallbackKernel.createc                   s
   t   S rs   )r6  r3  rd  r7  rw   rx   r3    r  zFallbackKernel.apply_constraintrs   r  r  r#  r  )rG  r   )r   r   r   r  r=  r  r  r   rv  r  r  r  r  r  r  r7  r$  rY  r@  r3  r[  rw   rw   r7  rx   r    s,    	l





oN
Xr  c                      s<   e Zd ZdZdddZdddZd	d
d fddZ  ZS )ComplexViewz9View a complex number as two dtyped numbers or vice versarq   rr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   zComplexView.should_allocaterx  c                 C  r  r  r  rd  rw   rw   rx   r    rg  z(ComplexView.get_inputs_that_alias_outputNr  r   c                  s   t  j||||||d d S )Nr  )r6  r=  )ra  r;  r  rT  r  rJ  r!  r7  rw   rx   r=    s   

zComplexView.__init__r  r  r  )r   r   r   r  r  r  r=  r[  rw   rw   r7  rx   r-    s    

r-  c                   @  r  )	r  r  r   rq   r  c                 C  rp  rs   r=  rd  rw   rw   rx   r    rr  zMultiOutputLayout.get_deviceNr  )r   r   r   r   r  rw   rw   rw   rx   r    r  r  c                      sN   e Zd ZdddZ	dd fd
dZ	ddddZdddZdddZ  ZS )r(  rq   r   c                 C  s,   | |  | js| | | | d S d S rs   )Zcodegen_multi_output!skip_size_stride_alignment_checksr~  r  r6  rw   rw   rx   r7    s
   

zMultiOutput.codegenFr;  r  r  list[tuple[Any, ...]]c                   s>   t  d ||gd tj| | _tj|  || _|| _d S r  )	r6  r=  r_   r   r  r   r  r  r.  )ra  r;  r
  r  r.  r7  rw   rx   r=    s
   
zMultiOutput.__init__r   rr   r}   c                 C  s   | j d |S r  )rM  r  r  rw   rw   rx   r    r.  z MultiOutput.get_free_symbol_usesc                 C  s&   t | jdkrt| jd trdS dS )Nr4   r   TF)r   rM  rt   r  rd  rw   rw   rx   r    s
   zMultiOutput.should_allocaterx  c                 C  s   dd | j D S )Nc                 S  s.   g | ]}t |trt| d kr| qS r   )rt   r  r   r  r  r   rQ  rw   rw   rx   r     s    z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>)rM  rd  rw   rw   rx   r    s   z(MultiOutput.get_inputs_that_alias_outputr  r  )r;  r  r  r/  rq   r   r
  r  r  )	r   r   r   r7  r=  r  r  r  r[  rw   rw   r7  rx   r(    s    

r(  c                   @  s  e Zd ZU dZded< dwddZdxd	d
ZdyddZdzddZd{ddZ	d|ddZ
d}d~ddZddd Zdd!d"Zdd#d$Zdwd%d&Zdd'd(Z	)ddd-d.Zdd/d0Zdd3d4Z	)ddd6d7Zdd9d:Zdd<d=Zdd>d?ZddAdBZddDdEZddGdHZdwdIdJZdwdKdLZddOdPZddRdSZd|dTdUZddVdWZ ddXdYZ!	)ddd\d]Z"dd_d`Z#ddbdcZ$d}ddfdgZ%e&ddidjZ'ddldmZ(ddndoZ)ddqdrZ*e&dsdt Z+d|dudvZ,e,Z-dS )rg  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    rm   r:  rq   rr   c                 C  r  rs   r  rd  rw   rw   rx   r    r  z!MutableBox.has_exceeded_max_readsr  c                 C  r  rs   r  rd  rw   rw   rx   r    r  zMutableBox.get_devicer  c                 C  r  rs   r  rd  rw   rw   rx   r    r  zMutableBox.make_loaderr  c                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zMutableBox.make_indexerr'  c                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zMutableBox.get_strider   c                 C  r  rs   r  rd  rw   rw   rx   r    r  zMutableBox.get_nameNr  r  c                 C  r  rs   )r:  r  r  rw   rw   rx   r    r   zMutableBox.has_large_inner_fnr  ru   r   c                 C  r  rs   r  r  rw   rw   rx   r    r   zMutableBox.mark_reusec                 C  r  rs   r  rd  rw   rw   rx   r    r  zMutableBox.realize_hintc                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zMutableBox.unwrap_viewc                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zMutableBox.is_input_bufferc                 C  r  rs   )r:  r  rd  rw   rw   rx   r    r  zMutableBox.freeze_layoutFr   r  r  c                 C     | j ||S rs   )r:  r  r  rw   rw   rx   r    r  z*MutableBox.freeze_layout_with_stride_orderc                 C  r  rs   )r:  r  r  rw   rw   rx   r    r   z(MutableBox.freeze_layout_with_fill_orderr   r  c                 C  r  rs   )r:  r  r  rw   rw   rx   r    r   z(MutableBox.freeze_layout_with_same_orderr  c                 C  r1  rs   )r:  r  r  rw   rw   rx   r    r  z+MutableBox.freeze_layout_with_exact_stridesr  c                 C  r  rs   )r:  r  rd  rw   rw   rx   r  $  r  zMutableBox.get_read_writesr  c                 C  r  rs   r  rd  rw   rw   rx   rm  '  r  zMutableBox.get_readsc                 C  r  rs   r  rd  rw   rw   rx   r  *  r  zMutableBox.num_readsrh   c                 C  r  rs   r  rd  rw   rw   rx   r  -  r  zMutableBox.get_storage_numelr  c                 C  r  rs   r  rd  rw   rw   rx   r  0  r  zMutableBox.get_reduction_typer  c                 C  r  rs   r  rd  rw   rw   rx   r  3  r  zMutableBox.get_reduction_sizec                 C  r  rs   r  rd  rw   rw   rx   r  6  r  zMutableBox.is_externc                 C  r  rs   )r:  r  rd  rw   rw   rx   r  9  r  zMutableBox.is_no_opr   r  c                 C  r  rs   r  r  rw   rw   rx   r  <  r   zMutableBox.constant_to_devicerx  c                 C  r  rs   )r:  r  rd  rw   rw   rx   r  ?  r  zMutableBox.get_mutation_namesc                 C  r  rs   )r:  r  rd  rw   rw   rx   r  B  r  zMutableBox.get_operation_namec                 C  r  rs   )r:  r  rd  rw   rw   rx   r  E  r  z'MutableBox.get_inputs_that_alias_outputc                 C  r  rs   r  rd  rw   rw   rx   r  H  r  zMutableBox.realizer   r}   c                 C  r  rs   r  r  rw   rw   rx   r  K  rP  zMutableBox.get_free_symbol_usesrf  c                 C  r  rs   r  rd  rw   rw   rx   rn  P  r  zMutableBox.get_read_namesru  c                 C  r  rs   )r:  rv  rd  rw   rw   rx   rv  S  r  zMutableBox.get_defining_opr  r  c                 C  r  rs   )r:  r  r  rw   rw   rx   r  V  r   zMutableBox.codegen_referencer  c                 C  r  rs   r:  r  rd  rw   rw   rx   r;  Y  s   
zMutableBox.layoutr  c                 C  r  rs   r  rd  rw   rw   rx   r  ^  r  zMutableBox.get_layoutc                 C  r  rs   r2  rd  rw   rw   rx   r  a  r  zMutableBox.get_output_specr  c                 C  r  rs   r  rd  rw   rw   rx   r   d  r  zMutableBox.get_sizec                 C  r  rs   )r:  r   rd  rw   rw   rx   r   g  r  zMutableBox.dtypec                 C  sn   t | jtrt| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r  
)rt   r:  rg  r   r   r  r   r  )ra  Zline0Zendlr  r}  rw   rw   rx   r<  k  s   


zMutableBox.__str__r  r  r  r  r  r  rs   r  r  r  r   r  r  r  r  r  r  r  r  r	  r  r  r  r  r
  r  r  r  r  r  r  ).r   r   r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rm  r  r  r  r  r  r  r  r  r  r  r  r  rn  rv  r  r  r;  r  r  r   r   r<  rX  rw   rw   rw   rx   rg    s`   
 
































rg  c                   @  s   e Zd Zedd ZdS )rl   c                 C  s   t | tr| S tt| S rs   )rt   r   rl   r  )r:  rw   rw   rx   r@    s   
zTensorBox.createN)r   r   r   r  r@  rw   rw   rw   rx   rl     s    c                   @  sV   e Zd ZdddZdd Zddd	ZdddZdddZdd ZdddZ	dd Z
dS )r  rq   rr   c                 C  s&   t | jttfr| j tjjv S dS r  )rt   r:  r  r?  r  r_   r   r  rd  rw   rw   rx   r    s   zStorageBox.is_input_bufferc                 C  s   t | jto| j tjjv S rs   )rt   r:  rr  r  r_   r   rP  rd  rw   rw   rx   r    s   zStorageBox.is_module_bufferr  c                 C  s   t | jtttttfr| j S t | jtt	t
tfs!J t| j| j }| j }td t| j | j | j d| jd| _tj| j| j_tj| j | j| j_|| j_|| j_| jjS )Nr  r  )rt   r:  r  r  r  r?  r  r  rb  r  rS  r  r   rt  rq  r  r  r  r   r_   r   r  r   r  rV  rZ  rX  )ra  rZ  rX  rw   rw   rx   r    s<   



	
zStorageBox.realizer   c                 C  s4   t | jttfr| j jdkr|   dS dS dS )zL
        Called on buffers we expect to be forced to realize later.
        r4   N)rt   r:  rb  r  rN  Znontrivial_read_countr  rd  rw   rw   rx   r    s   zStorageBox.realize_hintc                 C  s"   t | jto|  tjkp|  S rs   )rt   r:  rb  r  r5   Zrealize_acc_reads_thresholdr  rd  rw   rw   rx   r    s   z!StorageBox.has_exceeded_max_readsc                   sh   |dkr2t | jttfr2t| jr'| j  ddg}t fdd|D r'dS |  tj	kp1| 
 S dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        r4   expZsigmoidc                 3  s    | ]}| j v V  qd S rs   )Zused_opsr  Zopcountrw   rx   r!    r,  z5StorageBox.should_realize_on_reuse.<locals>.<genexpr>TF)rt   r:  rb  r  r  rN  r~  r  r5   Zrealize_reads_thresholdr  )ra  r  Z	heavy_opsrw   r5  rx   should_realize_on_reuse  s   

z"StorageBox.should_realize_on_reuser  ru   c                 C  s   |  |r|   d S d S rs   )r6  r  r  rw   rw   rx   r    s   
zStorageBox.mark_reusec                 C  r  rs   r  rd  rw   rw   rx   r    r  zStorageBox.num_readsNr  r  r  r  )r   r   r   r  r  r  r  r  r6  r  r  rw   rw   rw   rx   r    s    


!


r  c                   @  s*   e Zd ZU ded< ded< dZded< dS )Subgraphr   r   rF  graph_moduleNzOptional[GraphLowering]r   )r   r   r   r   r   rw   rw   rw   rx   r7    s   
 r7  buffersr  c                 C  s,   dd | D } t tdd | D t | k S )Nc                 S  s"   g | ]}t |tr| n|qS rw   )rt   r?  r  r   r  rw   rw   rx   r     s    z(_has_aliased_buffers.<locals>.<listcomp>c                 s  r  rs   )r  r:  rw   rw   rx   r!    r  z'_has_aliased_buffers.<locals>.<genexpr>)r   r/   )r9  rw   rw   rx   _has_aliased_buffers  s   r;  c                      s`   e Zd ZU dZdZded< dZded< dZded< d fddZe	dddZ
dddZ  ZS )InvokeSubgraphz.
    Ir node for the invoke_subgraph HOP.
    NOptional[Subgraph]r  zOptional[list[TensorBox]]operandsOptional[list[MultiOutput]]r  r7  list[TensorBox]r;  r  rq   r   c                   s6   t  jd ||d || _tj| | _tj|  d S r#  )r6  r=  r  r_   r   r  r   r  )ra  r  r>  r;  r7  rw   rx   r=    s   zInvokeSubgraph.__init__c                   s  ddl m} tjj}d }|jd }r|d dd  }n|jdd  }dd |D } fdd|D }g }t|D ]\}	}
t	|
t
rI||
 q:|||
||	  q:|}|jd u rtjj|j||jd	|_t|j |jj|  W d    n1 s~w   Y  |jj}d }|D ]}
t	|
t
s|
 } qq|d usJ t||t|d
ddfddfddt|D }|_|S )Nr4   )constrain_to_fake_tensoreager_input_valsr   r   c                 S     g | ]}|j d  qS r  rJ  r  rw   rw   rx   r     r   z)InvokeSubgraph.create.<locals>.<listcomp>c                   r*  rw   rK  r  rL  rw   rx   r     r   rE  r  r  r=  )r  r>  r;  rG  rm   indru   c                   sL   t | ttfr	| S tt|  |  |  |  | 	 j
d t|fgddS )Nr:  T)r.  )rt   r   r  r(  r=  r  r  r   r  r  r>  r   )rG  rF  )invoke_subgraphrw   rx   create_output@  s   z,InvokeSubgraph.create.<locals>.create_outputc                   s   g | ]	\}} ||qS rw   rw   )r   r   rG  )rH  rw   rx   r   Q  r  )rG  rm   rF  ru   )rc  rA  r_   r   r  rJ  r2  r   r   rt   r   r  r  r8  r   r  r  graph_outputsr  r<  r  r  )rA  r  r>  rA  r  fake_operandsrB  fx_operandsZnew_operandsr   operandr  r   rw   )rA  rH  rG  rx   r@    sT   


zInvokeSubgraph.createc                 C  r  rs   )Zcodegen_invoke_subgraphr6  rw   rw   rx   r7  U  rz   zInvokeSubgraph.codegen)r  r7  r>  r@  r;  r  rq   r   )r  r7  r  )r   r   r   r  r  r   r>  r  r=  rY  r@  r7  r[  rw   rw   r7  rx   r<    s   
 Pr<  c                      s~   e Zd ZU dZded< dZded< dZded< dZded< dZd	ed
< d  fddZ	e
d!ddZd"ddZd#ddZ  ZS )$ConditionalNr   	predicate7Optional[list[Union[TensorBox, ShapeAsConstantBuffer]]]r>  r=  true_subgraphfalse_subgraphr?  r  rm   -list[Union[TensorBox, ShapeAsConstantBuffer]]r7  r;  r  r!  ,Optional[dict[sympy.Symbol, pytree.KeyPath]]rq   r   c           	        sj   || _ || _|| _|| _t|g| \}}t jd |||d |d ur&|| _tj	
| | _tj	|  d S N)r   r;  rM  r  )rN  r>  rP  rQ  _split_by_sym_typer6  r=  r!  r_   r   r  r   r  )	ra  rN  r>  rP  rQ  r;  r!  sym_argsrT  r7  rw   rx   r=  a  s   	zConditional.__init__rl   true_fnfalse_fnc              	     s   |}fdd|D }tjjjd }dd |D }||fD ]/}|jd u rOtjj|j||jd|_t|j |jj	|  W d    n1 sJw   Y  q |jj
}|jj
}	d|fd|	ffD ]\}
}t|rrtd|
 d	| q`t|t|	ksJ ||	ftt||	D ]5\}\}}| | ksJ |||f| | ksJ |||f| j| jksJ |||fqtd
d |g| D }ttjjjtjjjdd }|d usJ dt||||t|d|dddd  fddtt|tjjjd D }|_|S )Nc                   r*  rw   rK  r  rL  rw   rx   r     r   z&Conditional.create.<locals>.<listcomp>r"  c                 S  rC  r  rD  r  rw   rw   rx   r     r   rE  rW  rX  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: c                 s  s"    | ]}t |ts| V  qd S rs   )rt   r   r  )r   orw   rw   rx   r!    s    
z%Conditional.create.<locals>.<genexpr>r!  zcannot determine devicer=  )rN  r>  rP  rQ  r;  r!  r   Union[int, torch.SymInt]rq   Union[int, sympy.expr]c                 S  s   t | tr| S | jjS rs   )rt   ru   r   r/  )r   rw   rw   rx   _maybe_expr  s   
z'Conditional.create.<locals>._maybe_exprc              
     sf   g | ]/\}\}}t t| |  fd d| D  fdd| D | jdt|fgqS )c                   r   rw   rw   r   r?  r\  rw   rx   r     r   z1Conditional.create.<locals>.<listcomp>.<listcomp>c                   r   rw   rw   r]  r^  rw   rx   r     r   r:  )	r(  r=  r  r  r   r   r  r>  r   )r   r   rG  Zmerged_output)r\  conditionalrw   rx   r     s    
r  )r   rZ  rq   r[  )r  r_   r   r  r   r  r8  r   r  r  rI  r;  r  r   r   r   r  r  r  r>  rF  r*   r   r   rJ  r2  rM  r  r  )rA  rN  rW  rX  r>  rK  rJ  r  Ztrue_outputsZfalse_outputsr   r  r   Zt_oZf_or   r!  rw   )r\  rA  r_  rx   r@  }  sp   

$
	zConditional.createc                 C  s*   | |  ||  | jt| di  d S r   )Zcodegen_conditionalr  r  r  r   r6  rw   rw   rx   r7    s   
zConditional.codegenr}   c                 C  r  r   r  r  rw   rw   rx   r     r  z$Conditional.get_unbacked_symbol_defs)rN  rm   r>  rR  rP  r7  rQ  r7  r;  r  r!  rS  rq   r   )rN  rl   rW  r7  rX  r7  r>  rR  r  r#  )r   r   r   rN  r   r>  rP  rQ  r  r=  rY  r@  r7  r   r[  rw   rw   r7  rx   rM  Y  s   
 
VrM  r   r  -tuple[list[ShapeAsConstantBuffer], list[Any]]c                 C  s<   g }g }| D ]}t |tr||j q|| q||fS rs   )rt   r   r  r/  )r   Znon_sym_argsrV  r  rw   rw   rx   rU    s   
rU  c                      st   e Zd ZU dZded< dZded< dZded< dZded< dZded	< d fddZ	e
dddZdddZ  ZS )	WhileLoopNrO  carried_inputsadditional_inputsr=  cond_subgraphbody_subgraphr?  r  rR  r7  r;  r  rq   r   c                   sZ   || _ || _|| _|| _t|| \}}t jd |||d tj	| | _
tj|  d S rT  )rb  rc  rd  re  rU  r6  r=  r_   r   r  r   r  )ra  rb  rc  rd  re  r;  rV  rT  r7  rw   rx   r=    s   zWhileLoop.__init__cond_fnbody_fnc              	     s  ddl m} d%dd	}tjjjd
 }tjjjd }|| }	dd |	D }
dd |D }dd |D }fdd|D }|||}fdd|D }|||}||  ||fD ]H}|jd u rtjj|j|	|jd|_t	|j' |jj
|
  ||u rt|jjt|ksJ ||jj||j_W d    n1 sw   Y  qV|jj}|jj}t|rtd| t|dksJ ||d }t|ts| tjksJ |t| dksJ |t dksJ d d  }|d usJ t|t|ksJ ||ftt||D ]Q\}\}}d&dd}|| |  || |  | | ks6J ||||f| | ksFJ |||f| j| jksXJ |||fqt||||t|dd|jd urvt|jjtjjsxJ ||jj|
d }t | fddD }fd d!t|D }fd"d|! D }|_"fd#d|D _#t$|t$|fd$dt%t|D }t||D ]\}}|& tjj'v rtjj()|&  q|S )'Nr   )check_input_alias_and_mutationtensor_boxes'list[TensorBox | ShapeAsConstantBuffer]fake_tensors,list[Union[int, torch.SymInt, torch.Tensor]]rq   c                 S  sb   t | t |ks
J g }t| |D ]\}}t|tjr)|tj|| dd q|| q|S )NFr  )	r   r   rt   r  rS  r  r  rb  r   )ri  rk  retrB  Zfkrw   rw   rx   _require_exact_strides  s   
z0WhileLoop.create.<locals>._require_exact_stridesr"  c                 S  rC  r  rD  r  rw   rw   rx   r   1  r   z$WhileLoop.create.<locals>.<listcomp>c                 S  rC  r  rD  r  rw   rw   rx   r   2  r   c                 S  rC  r  rD  r  rw   rw   rx   r   3  r   c                   r*  rw   rK  r  rL  rw   rx   r   5  r   c                   r*  rw   rK  r  rL  rw   rx   r   7  r   rE  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: r4   z9torch.while_loop is assumed to have at least one operand.	lhs_exprsSequence[Union[int, Any]]	rhs_exprsr   c                 S  s(   t | |D ]\}}tjj|| qd S rs   )r   r_   r   r   r  )rp  rr  lhsrhsrw   rw   rx   _guard_list_equalsq  s   z,WhileLoop.create.<locals>._guard_list_equalsr=  )rb  rc  rd  re  r;  r   c                   r   rw   rw   r   r   )
all_inputsrw   rx   r     r   c                   s   i | ]\}}| vr||qS rw   rw   )r   r   r   )mutated_idx_setrw   rx   r     s    z$WhileLoop.create.<locals>.<dictcomp>c              
     sF   g | ]\}}t t| | | | | jd  t|fgqS )r:  )	r(  r=  r  r  r   r  r  r>  r   )r   r   rG  
while_looprw   rx   r     s    c                   s   g | ]	}t |j| qS rw   )r  r;  r0  ry  rw   rx   r     s    c                   s$   g | ]}| v rt nt qS rw   )rF  rv  )rx  mutated_inputs_iteroutputs_iterrw   rx   r     r  )ri  rj  rk  rl  rq   rj  )rp  rq  rr  rq  rq   r   )*Ztorch._higher_order_ops.utilsrh  r_   r   r  r   r  r8  r   r  r  r   rI  r;  r  rt   r   r  r  rr   r   r  r   r   r  r  r>  ra  r  moduleZfxZGraphModuler/   r  r  r"  rE  r   r  r  r  r}  )rA  rf  rg  rb  rc  rh  rn  Zfx_carried_inputsZfx_additional_inputsZfx_all_inputsZfake_all_inputsZfake_carried_inputsZfake_additional_inputsr  Zcond_outputsZbody_outputsr  r   r   rH  Zboru  Zmutated_idxsr  Zreal_outputsZall_outputsrQ  r   rw   )rw  rA  rx  r{  r|  rz  rx   r@    s   





" (	




zWhileLoop.createc                 C  r  rs   )Zcodegen_while_loopr6  rw   rw   rx   r7    rz   zWhileLoop.codegen)rb  rR  rc  rR  rd  r7  re  r7  r;  r  rq   r   )rf  r7  rg  r7  rb  rR  rc  rR  r  )r   r   r   rb  r   rc  rd  re  r  r=  rY  r@  r7  r[  rw   rw   r7  rx   ra    s   
  )ra  c                      s@   e Zd Z	dddd fddZd fdd	ZdddZ  ZS )r   Nr  rq   r   c             	     s~   t  j|||||d |d ddlm} dd |D }	||g ||	R |}
|
d us+J |
| _tjj|
d | _	| tjj|
< d S )N)r   r!  r   )get_effect_keyc                 S  s    g | ]}t |tr|jn|qS rw   )rt   r  r	  )r   r  rw   rw   rx   r     s    z,EffectfulKernel.__init__.<locals>.<listcomp>)
r6  r=  Ztorch._higher_order_ops.effectsr~  effect_typer_   r   Zeffectful_opsr2  prev_effect_buffer)ra  r;  r  rT  r  rJ  r   r!  r~  Zuncovered_argsr  r7  rw   rx   r=    s$   
zEffectfulKernel.__init__r  c                   s0   t   }| jd ur|jt| j  |S rs   )r6  r  r  r  r}  r6   r  r  )ra  r  r7  rw   rx   r    s   

zEffectfulKernel.get_read_writesrr   c                 C  r  r  rw   rd  rw   rw   rx   r    r   z EffectfulKernel.has_side_effectsrs   r  r  r  )r   r   r   r=  r  r  r[  rw   rw   r7  rx   r     s    	 
r   c                   @  r  )r]  Nr  rw   rw   rw   rx   r]    s    r]  c                   @  sR   e Zd ZU ded< ded< dddZddddZdddZdddZdddZdS )r  r   r   +Union[FakeScriptObject, torch.ScriptObject]r	  rq   c                 C  rp  rs   r   rd  rw   rw   rx   r    rr  zTorchBindObject.get_nameNr  r  c                 C  rp  rs   r   r  rw   rw   rx   r    rr  z!TorchBindObject.codegen_referencec                 C  rp  rs   r  rd  rw   rw   rx   rQ    rr  zTorchBindObject.get_valuetorch.ScriptObjectc                 C  s   t | jtjr
| jS | jjS rs   )rt   r	  r  ZScriptObjectZreal_objrd  rw   rw   rx   get_real_obj   s   zTorchBindObject.get_real_objru   c                 C  s@   |   }t| }t|d }dd |D }ttj|dS )Nr   c                 S  s(   g | ]}t |tjr| |  qS rw   )rt   r  rS  r  Znumelr  rw   rw   rx   r      s    
z1TorchBindObject.get_buf_bytes.<locals>.<listcomp>)	r  r   Z__obj_flatten__rG  rN  r  r  operatorr}  )ra  Zreal_script_objZ	flat_dictZ
flat_elemsZ
flat_sizesrw   rw   rx   get_buf_bytes   s   zTorchBindObject.get_buf_bytesr  rs   r  )rq   r  )rq   r  r  )	r   r   r   r   r  r  rQ  r  r  rw   rw   rw   rx   r    s   
 


r  c                   @  s4   e Zd ZU ded< ded< dddZddddZdS )rO  r   r   r  r   rq   c                 C  rp  rs   r   rd  rw   rw   rx   r     rr  zGeneratorState.get_nameNr  r  c                 C  rp  rs   r   r  rw   rw   rx   r     rr  z GeneratorState.codegen_referencer  rs   r  )r   r   r   r   r  r  rw   rw   rw   rx   rO     s
   
 
rO  c                   @  sH   e Zd ZdddZdddZddddZedddZedddZdS )_CollectiveKernelrq   rr   c                 C  r  r  rw   rd  rw   rw   rx   r  !   r   z!_CollectiveKernel.should_allocatec                 C  r  r  rw   rd  rw   rw   rx   r  $   r   z"_CollectiveKernel.has_side_effectsNr  r  r   c                 C  sB   t | jtjju sJ d| j}|jj| _dd |jjD | _	d S )Nz,Setting cpp kernel needs a valid op_overloadc                 S  r*  rw   r+  r  rw   rw   rx   r   0   r,  z9_CollectiveKernel.set_cpp_kernel_name.<locals>.<listcomp>)
r   r  r  r-  r.  r/  r   r  r0  r  )ra  r  r  rw   rw   rx   r$  )   s   
z%_CollectiveKernel.set_cpp_kernel_namerM  !Union[TensorBox, list[TensorBox]]c                   s  t jj | j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
|
  q3|d   | t d||||t|}j	
 fdd|D  j
dd |D  d|v rj	tt d|d  j|d   d S d S )Nr  r   r=  c                   r  r  r  r  r   r+  rw   rx   r   U   r  z4_CollectiveKernel.create_inplace.<locals>.<listcomp>c                 S  r  rw   r{  r0  rw   rw   rx   r   Y   r   r   )r_   r   r#  rZ  r  r  r^  rG  Ztree_leavesr"  r  r  r  r  r  )rA  r  rM  r   r   _example_outputrT  rU  rJ  r!  
tensor_argZinpsrw   r  rx   create_inplace:   s>   


z _CollectiveKernel.create_inplacec                   s4  t jj  j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
|
  q3t|trz ||} t|d|||| fddt	|D _
tj
|D ]\}}tjsnt|svt jj|j qcj
S   |||||tjst|st jjj g_
S )Nr  r=  c                   s(   g | ]\}}t  |t|fgqS rw   )r(  r$  r   )r   r   r1  rA  r+  rw   rx   r      s    z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)r_   r   r#  rZ  r  rt   r   r  r  r   r  r   r5   r)  r\   r  r}  r   r$  )rA  r  rM  r   r   rW  rT  rU  rJ  r!  r  r   r  r1  rw   r  rx   create_out_of_placew   sX   


z%_CollectiveKernel.create_out_of_placer  rs   r  )rM  r  rq   r   )rM  r  )	r   r   r   r  r  r$  rY  r  r  rw   rw   rw   rx   r      s    

<r  c                      s4   e Zd Zdd ZedddZd fd
dZ  ZS )_WaitKernelc                 C  s`   | j d }t|tr|j d gS t|tr.|j d }t|tr,|jd \}}|j | gS g S g S r  )rM  rt   r  r(  r  )ra  rQ  Zcollr   r   rw   rw   rx   get_volatile_reads   s   




z_WaitKernel.get_volatile_readsrQ  rl   rq   r   c           	      C  s   t jj | ||\}}}}}W d    n1 sw   Y  |r*J | d| | t| d||||}|jtt| d|| d S )Nr  r=  )	r_   r   r#  rZ  r^  r  r"  r  r  )	rA  r  rQ  r  rT  rU  rJ  r!  r+  rw   rw   rx   create_wait   s(   

z_WaitKernel.create_waitr  c                   s6   t   }|  }|D ]}|jt|  q|S rs   )r6  r  r  r  r}  r6   r  r  )ra  r  Zvolatile_readsZvrr7  rw   rx   r     s
   
z_WaitKernel.get_read_writes)rQ  rl   rq   r   r  )r   r   r   r  rY  r  r  r[  rw   rw   r7  rx   r     s
    r  r   rQ  c                 C  d   t | ttfrt| S t | ttfr%ttj  }| D ]}|t	|O }q|S t | t
jr/t| S t S rs   )rt   r-   r   r'   r   r   r/   r   r   r  r  rS  r   rC  r  rw   rw   rx   r        r  c                 C  r  rs   )rt   r-   r   r&   r   r   r/   r   r   r  r  rS  r  rw   rw   rx   r     r  r  )ro   rp   rq   rr   )r   r   rq   r   )r   r   rq   r   )r   r   rq   r   )r   r   r   r   rq   r   )ro   r(   r   rr   rq   r}   rs   )r   r   r   r   rq   r   )r   r   rq   r   r  )ro   r   r   rr   rq   r   )ro   rm   r   rr   rq   r   )ro   r   r   rr   rq   r   )r	  r
  rq   r  )ro   r  rq   r  )ro   r  rq   rr   )ro   r  r  ru   rq   rr   )r&  r'  r(  r'  r)  r'  rq   rr   )r1  r2  r3  r4  rq   r2  )rE  rF  rq   r   )rM  rN  rq   rO  )r   r\  r   r  rq   r^   )r  r   r   r  r  rr   rq   r  )ro   rm   rq   rr   )TFNFN)ro   rm   r  rr   r  rr   r  r  r  rr   r  r  rq   r  )ro   rm   r  r   rq   rr   )r   rm   rq   rr   )r   r'  r)  r'  rq   rr   )r   r  rq   ru   )r9  r  rq   rr   )r   r  rq   r`  )r   rp   rq   rQ  (-  
__future__r   r  r  r  r  loggingr  textwraprX  r  collections.abcr   r   r   r   r   enumr   r	   r
   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   Zunittest.mockr   r   r   r   r   Ztorch._export.serde.schemaZ_exportZserder  r  Ztorch._library.utilsr  rU  r  Ztorch._loggingr  Ztorch.fxZtorch.utils._pytreeZ_pytreerG  Ztorch._dynamo.utilsr   Ztorch._export.serde.serializer   Z*torch._higher_order_ops.auto_functionalizer   Ztorch._inductorr   Ztorch._prims_commonr   r   r    r!   r"   Ztorch._subclasses.fake_tensorr#   Z%torch.fx.experimental.symbolic_shapesr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   Ztorch.utils._ordered_setr/   Ztorch.utils._sympy.functionsr0   r1   r2   Ztorch.utils._sympy.symbolr3   ry  r5   r6   Zcodegen.commonr7   r8   r9   r:   r;   r<   r=   r>   r?   Z	loop_bodyr@   Zops_handlerrA   rB   rC   rD   Zruntime.benchmarkingrE   Zruntime.hintsrF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   Zvirtualizedr]   r^   r_   Z"torch._library.fake_class_registryr`   Ztorch.fx.nodera   Zcodegen.cuda.cuda_templaterb   r   rc   rd   rp   r   r  __version__r}  r|  ImportErrorre   rf   rg   ru   rh   r`  ri   	getLoggerr   r  r  r8  r   r   rn   ry   r  r|   r   r   r   r   r   r   rC  rE  r   r   r   r   r  r  r  r  r%  r0  rD  rL  rR  rm   r  r$  ra  rb  ru  r  r  r  ZINNER_FN_TYr3  r>  rD  rS  r~  r  r   r  r<  r  r  r  r   r  r  r  r  r?  r  r  r  r  r  r  r  r  r  r=  r  rP  rX  rZ  r^  rc  r  r  r  r  rr  r  r   r  r  r  rr   r   ZPrimitiveInfoTyper  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r  r  r-  r  r(  rg  rl   r  r7  r;  r<  rM  rU  ra  r   r]  r  rO  r  r  r  r  rw   rw   rw   rx   <module>   sP   ,0\
*

"	


&
	 |K  #?     q:& _ I <
_N+: .U(R U 'T 
  LE"N9GA ,      V#)>(2 9*0K/$9   q+ W	j  M/  8