o
    h                 	   @   s  U d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
Z	d dlmZ d dlmZmZmZ e jdddkZdZdee fd	d
Zdee fddZdee fddZdee fddZdee fddZdefddZdefddZe jdddkZdZdZdZ dZ!e"e#d< eddddZ$ee#d< e Z%ee e#d< e Z&ee e#d < ed!d"ddZ'ee#d#< dZ(ee#d$< e Z)ee e#d%< e Z*ee e#d&< ed'd(ddZ+ee#d)< dZ,ee#d*< i Z-e.e/e/f e#d+< dZ0ee" e#d,< d-Z1ed. e#d/< e jd0d1dkZ2ee#d2< e jd3d1dkZ3ee#d4< e  Z4ee#d5< e jd6ddkZ5dZ6dZ7e jd7ddkZ8e jd8dkZ9e jd9ddkZ:e jd:e rJd1nddkZ;dZ<dZ=dZ>e jd;d1dkZ?e jd<dkZ@dZAe jd=d>ZBed? e#d@< dZCdZDe ZEdZFdZGdZHdZIe	jJjKjLe#dA< dZMe	jJjKjLe#dB< dZNeee	jOjPgdf  e#dC< dZQeee	jOjPgdf  e#dD< dZReee	jOjSjPgdf  e#dE< dZTeeeUdF geUdF f  e#dG< dZVeeeUdF geUdF f  e#dH< dZWdZXdZYdZZdZ[i Z\e.e/e.e/ef f e#dI< i Z]e.e/e.e/ef f e#dJ< dZ^e jdKddkZ_dZ`dZaddLdMddNZbe.e/ef e#dO< dPZcedQ e#dR< dZdg dSZeeUee/eeUdF geUdF f f  e#dT< dZfee" e#dU< dZgdVZhdWZidXZjeddYdZd[Zkee#d\< e jd]dkZle jd^dkZme jd_dkZne jd`dkZodaZpee" e#db< dZqe rdne jdcdkZre jdddes Zte jdfdgs Zue jdhdis Zvedj e#dk< e jdldis Zwedj e#dm< dZxdnZye jdodkZze jdpdkZ{e jdqdkZ|drZ}dsZ~dsZe jdtdkZe jdudkZe jdvdkZe"e jdwdZe jdxdyZe jdzd{Zd|e/defd}d~Zd|e/defddZd|e/defddZe jddiZe	jjsFdnd1Ze jdedkZe jdd1dkZe jdddkZe jddkZdZdZdZdZdZe jddkZdZe jddkZee#d< e jddkZee#d< e jddyZe jdd1dkZee#d< daZe jdddkZdMZdZdZdZdZdZdZdZdZe"e jdd Ze jdd1dkZdZdZdZdZdZdZdMZdMZdZdZdZe jdd1dkZde	jv pde	jv Ze p	eZe jdddkZdZee/ e#d< dZee/ e#d< de/fddZe Ze/e#d< eddddZee#d< dZdXZddgZeUeed e/f  e#d< dZee#d< G dd dZdefddZde"fddZe rqdneă Zee" e#d< e Zee#d< eddddZee#d< e jdd1dkZee#d< ee/ e#d< e rz!d dlmZ eːreʠe j͠eˠde jСdZneʠdZW q eefy   dZY qw dZdaZe jdddkZe jdddkZdZdZdZdZdZdZdZee#d< e jdd1dkZdZdZdZdZe jddyZedykZedkrdyneZe jddZee/ e#d< e jd¡dkZdZe jdd1dkZee#d< dZee#d< dZee#d< dZee#d< dZee#d< dZee#d< dZee#d< e jdd1dkZe jdd1dkZee#d< dZee#d< G ddЄ dЃZG dd҄ d҃ZG ddԄ dԃZG ddք dփZG dd؄ d؃ZdZed e#d< dZed e#d< G ddބ dރZG dd dZg dZeUe/ e#d< g dZeUe/ e#d< g ZeUee	je	je	jgdf  e#d< G dd dZ erd dlT eeje  dS )    N)AnyCallableLiteralOptionalTYPE_CHECKINGUnion)	is_fbcode)Configget_tristate_envinstall_config_moduleZTORCHINDUCTOR_INPLACE_PADDING1Freturnc                   C      t dS )NZ#TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHEr
    r   r   \/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/torch/_inductor/config.pyfx_graph_remote_cache_default      r   c                   C   s,   t jddkr
dS t jddkrdS d S )NTORCHINDUCTOR_VEC_ISA_OKr   T0F)osenvirongetr   r   r   r   vec_isa_ok_default   s
   r   c                   C   r   )NZ#TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHEr   r   r   r   r   autotune_remote_cache_default   r   r   c                   C   r   )NZ+TORCHINDUCTOR_BUNDLED_AUTOTUNE_REMOTE_CACHEr   r   r   r   r   %bundled_autotune_remote_cache_default   r   r   c                   C   s   t dt sdS d S )NZ/TORCHINDUCTOR_BUNDLE_TRITON_INTO_FX_GRAPH_CACHET)r
   r   r   r   r   r   )bundle_triton_into_fx_graph_cache_default#   s   r   c                  C   s<   d} dt jv rt jddkS t rtjd}|| kS dS )N   Z&TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHERr   z-pytorch/inductor:static_cuda_launcher_versionTr   r   r   r   torch_utils_internaljustknobs_getval_int)ZSTATIC_CUDA_LAUNCHER_VERSIONversionr   r   r   static_cuda_launcher_default*   s   
r#   c                  C   s@   d} dt jv rt jddkS t rd}tj|}|| kS dS )Nr   ZTORCHINDUCTOR_PROLOGUE_FUSIONr   z(pytorch/inductor:prologue_fusion_versionTr   )ZENABLE_PROLOGUE_FUSION_VERSIONjk_namer"   r   r   r   prologue_fusion_enabled9   s   
r%   Z"TORCHDYNAMO_AUTO_FUNCTIONALIZED_V2Ti  precompilation_timeout_secondsz0pytorch/remote_cache:enable_local_fx_graph_cacheZTORCHINDUCTOR_FX_GRAPH_CACHE)justknobenv_name_forcedefaultfx_graph_cachefx_graph_remote_cache!bundle_triton_into_fx_graph_cachez>pytorch/remote_cache:enable_non_blocking_remote_cache_write_v2Z-TORCHINDUCTOR_NON_BLOCKING_REMOTE_CACHE_WRITEnon_blocking_remote_cache_writeautotune_local_cacheautotune_remote_cachebundled_autotune_remote_cachez)pytorch/remote_cache:force_disable_cachesZ"TORCHINDUCTOR_FORCE_DISABLE_CACHESforce_disable_caches&unsafe_skip_cache_dynamic_shape_guards!unsafe_marked_cacheable_functionssleep_sec_TESTING_ONLYneeds_fixed_stride_order)r5   Zflexible_layout'triton_kernel_default_layout_constraintZTORCHINDUCTOR_CPP_WRAPPERr   cpp_wrapperZ(TORCHINDUCTOR_CPP_WRAPPER_BUILD_SEPARATEcpp_wrapper_build_separatecpp_cache_precompile_headersZTORCHINDUCTOR_ONLINE_SOFTMAXZTORCHINDUCTOR_SIZE_ASSERTSZTORCHINDUCTOR_NAN_ASSERTSZTORCHINDUCTOR_SCALAR_ASSERTSZTORCHINDUCTOR_ALIGNMENT_ASSERTSZTORCHINDUCTOR_MEMORY_PLANNINGZTORCHINDUCTOR_USE_FAST_MATHZTORCHINDUCTOR_MEMORY_POOLintermediates)noner:   Zoutputscombinedmemory_poolpost_grad_custom_pre_passpost_grad_custom_post_passjoint_custom_pre_passjoint_custom_post_passpre_grad_custom_passz+torch._inductor.scheduler.BaseSchedulerNode_pre_fusion_custom_pass_post_fusion_custom_passpre_grad_fusion_optionspost_grad_fusion_optionsZ"TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCKg-C6?r   )Zpre_grad	precisionZnum_iterationsZrequires_optimizerfx_passes_numeric_check	heuristic)r)   tritonZatenrI   mixed_mm_choice)Zreorder_compute_for_overlapZ
sink_waitsZraise_comms'reorder_for_compute_comm_overlap_passesreorder_prefetch_limitr)   i,     Z*TORCHINDUCTOR_USE_EXPERIMENTAL_BENCHMARKERz-pytorch/inductor:use_experimental_benchmarker)r)   r(   r'   use_experimental_benchmarkerZTORCHINDUCTOR_MAX_AUTOTUNEZ$TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISEZTORCHINDUCTOR_MAX_AUTOTUNE_GEMMZ!TORCHINDUCTOR_DISABLE_DECOMPOSE_K
   autotune_num_choices_displayedZ"TORCHINDUCTOR_FORCE_SAME_PRECISIONZ(TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDSzATEN,TRITON,CPPZ(TORCHINDUCTOR_MAX_AUTOTUNE_CONV_BACKENDSzATEN,TRITONZ,TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACEDEFAULT)rR   Z
EXHAUSTIVEmax_autotune_gemm_search_spaceZ,TORCHINDUCTOR_MAX_AUTOTUNE_FLEX_SEARCH_SPACEmax_autotune_flex_search_spacei    Z#TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHEZTORCHINDUCTOR_SAVE_ARGSZ!TORCHINDUCTOR_AUTOTUNE_IN_SUBPROCg      N@g        Z#TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICEZ'TORCHINDUCTOR_COORDINATE_DESCENT_TUNINGZ5TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONSZ'TORCHINDUCTOR_COORDINATE_DESCENT_RADIUSZ#TORCHINDUCTOR_AUTOHEURISTIC_COLLECT ZTORCHINDUCTOR_AUTOHEURISTIC_USEZmixed_mmnamec                 C   s   t | pt| S )N)collect_autoheuristicuse_autoheuristicrV   r   r   r   run_autoheuristic  s   rZ   c                 C      | t jjjdv S N,)r   	_inductorconfigautoheuristic_collectsplitrY   r   r   r   rW        rW   c                 C   r[   r\   )r   r^   r_   autoheuristic_usera   rY   r   r   r   rX   
  rb   rX   Z$TORCHINDUCTOR_AUTOHEURISTIC_LOG_PATHZ!TORCHINDUCTOR_LAYOUT_OPTIMIZATIONZTORCHINDUCTOR_FORCE_LAYOUT_OPTZ TORCHINDUCTOR_KEEP_OUTPUT_STRIDEZTORCHINDUCTOR_WARN_MIX_LAYOUT         Z.TORCHINDUCTOR_ASSUME_UNALIGNED_FALLBACK_OUTPUTZTORCHINDUCTOR_DEBUG_FUSIONdebug_fusionZTORCHINDUCTOR_BENCHMARK_FUSIONbenchmark_fusionZ#TORCHINDUCTOR_ENABLED_METRIC_TABLESZ(TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSIONloop_ordering_after_fusionZ'TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION@   ZTORCHINDUCTOR_MIN_NUM_SPLITZTORCHINDUCTOR_BENCHMARK_KERNELZ%TORCHINDUCTOR_EMULATE_PRECISION_CASTSdevgitZ0TORCHINDUCTOR_OPTIMIZE_SCATTER_UPON_CONST_TENSORadd_pre_grad_passesremove_pre_grad_passesc                  C   s4   dt jv rt jd } nd} | dv sJ d|  | S )NZTORCHINDUCTOR_WORKER_START
subprocess)ro   forkZspawnzInvalid start method: )r   r   )Zstart_methodr   r   r   decide_worker_start_method  s   

rq   worker_start_methodz(pytorch/compiler:worker_suppress_loggingZ%TORCHINDUCTOR_WORKER_SUPPRESS_LOGGINGworker_suppress_loggingZfuse_ddp_with_concat_opZschedule_comm_wait).N_fuse_ddp_communication_passes_micro_pipeline_tpc                   @   s&   e Zd ZU dZeed< dZeed< dS )_collectiveFauto_selecti   #one_shot_all_reduce_threshold_bytesN)__name__
__module____qualname__rw   bool__annotations__rx   intr   r   r   r   rv     s   
 rv   c                  C   s   d} d}t j|}| |kS )a   
    TODO: Remove when parallel compiled is fully enabled internally. For rollout, use a
    knob to enable / disable. The justknob should not be performed at import, however.
    So for fbcode, we assign compile_threads to 'None' below and initialize lazily in
    async_compile.py.
    r   z0pytorch/inductor:enable_parallel_compile_version)r   r    r!   )ZENABLE_PARALLEL_COMPILE_VERSIONr$   r"   r   r   r   #parallel_compile_enabled_internally  s   r   c                  C   s   ddl } | t}dtjv rttjd }|d| |S tjdkr+d}|d |S t	 r:t
 s:d}|d |S ttd	rFttdnt }|sNJ td
|}|d| |S )a!  
    Here are the precedence to decide compile_threads
    1. User can override it by TORCHINDUCTOR_COMPILE_THREADS.  One may want to disable async compiling by
       setting this to 1 to make pdb happy.
    2. Set to 1 if it's win32 platform
    3. decide by the number of CPU cores
    r   NZTORCHINDUCTOR_COMPILE_THREADSz!compile_threads set to %d via envwin32r   z"compile_threads set to 1 for win32z"compile_threads set to 1 in fbcodesched_getaffinity    zcompile_threads set to %d)logging	getLoggerry   r   r   r~   infosysplatformr   r   hasattrlenr   	cpu_countmin)r   logcompile_threadsr   r   r   r   decide_compile_threads   s,   





r   r   use_static_cuda_launcherz:pytorch/inductor:static_launch_user_defined_triton_kernelsZ7TORCHINDUCTOR_STATIC_LAUNCH_USER_DEFINED_TRITON_KERNELS)static_launch_user_defined_triton_kernelsZ)TORCHINDUCTOR_STRICT_STATIC_CUDA_LAUNCHERstrict_static_cuda_launcherglobal_cache_dir)parutil.zfb/cacheZTORCHINDUCTOR_SHAPE_PADDINGZ#TORCHINDUCTOR_COMPREHENSIVE_PADDING   i   force_shape_padZTORCHINDUCTOR_PERMUTE_FUSIONZTORCHINDUCTOR_PROFILEZTORCHINDUCTOR_PROFILE_OUTPUTprofile_bandwidth_outputZ3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGZTORCHINDUCTOR_FREEZINGfreezingfreezing_discard_parametersdecompose_mem_bound_mmassume_aligned_inputs.unsafe_ignore_unsupported_triton_autotune_args"check_stack_no_cycles_TESTING_ONLY*always_complex_memory_overlap_TESTING_ONLYZ*TORCHINDUCTOR_ENABLE_LINEAR_BINARY_FOLDINGZTORCHINDUCTOR_ANNOTATE_TRAININGannotate_training)enable_caching_generated_triton_templatesc                   @   s  e Zd ZU dZejdddkZejdddkZdZ	e
e ed< eejdd	Zdejd
ejdkr6dndfZeed ef ed< ejdddkZejdddkZdZe
e ed< dZe
e ed< edZe
e ed< dZed ed< eejddZejdddkZejdddkZejddZejdddkZ dZ!eejd dZ"ejd!dZ#ejd"dZ$d#Z%dZ&ejd$ddkZ'dZ(dS )%cppZ$TORCHINDUCTOR_CPP_NO_REDUNDANT_LOOPSr   Z!TORCHINDUCTOR_CPP_DYNAMIC_THREADSr   NsimdlenZ TORCHINDUCTOR_CPP_MIN_CHUNK_SIZEZ4096CXXdarwinzclang++zg++cxxZ'TORCHINDUCTOR_CPP_ENABLE_KERNEL_PROFILEZ TORCHINDUCTOR_CPP_WEIGHT_PREPACKinject_relu_bug_TESTING_ONLYinject_log1p_bug_TESTING_ONLYr   
vec_isa_okoriginal_atenr   r   Zinductor_nodedescriptive_namesZ,TORCHINDUCTOR_CPP_MAX_HORIZONTAL_FUSION_SIZEZ16Z-TORCHINDUCTOR_CPP_FALLBACK_SCATTER_REDUCE_SUMZ-TORCHINDUCTOR_CPP_ENABLE_UNSAFE_MATH_OPT_FLAGZ5TORCHINDUCTOR_CPP_ENABLE_FLOATING_POINT_CONTRACT_FLAGoffZ)TORCHINDUCTOR_CPP_ENABLE_TILING_HEURISTICFZ#TORCHINDUCTOR_CPP_GEMM_MAX_K_SLICESZ%TORCHINDUCTOR_CPP_GEMM_CACHE_BLOCKINGZ%TORCHINDUCTOR_CPP_GEMM_THREAD_FACTORSTZ$TORCHINDUCTOR_CPP_USE_DECOMPOSE_TANH))ry   rz   r{   threadsr   r   r   Zno_redundant_loopsZdynamic_threadsr   r   r~   r}   Zmin_chunk_sizer   r   r   tupler   strZenable_kernel_profileZweight_prepackr   r   r
   r   r|   r   Zmax_horizontal_fusion_sizeZfallback_scatter_reduce_sumZenable_unsafe_math_opt_flagZ#enable_floating_point_contract_flagZenable_tiling_heuristicsZenable_grouped_gemm_templateZgemm_max_k_slicesZgemm_cache_blockingZgemm_thread_factorsZenable_loop_tail_vecZenable_concat_linearZuse_decompose_tanhZuse_small_dequant_bufferr   r   r   r   r     sN   
 r   c                   @   s  e Zd ZU ejddkZdZdZdZ	e
eeeeedf f   ed< dZdZe r,dndZdZd	Ze
e ed
< dZdZdZdZdZdZdZejde sOdnddkZeed< dZe
e ed< dZeed< dZ dZ!dZ"e
e ed< dZ#eed< dZ$eed< dZ%dZ&ejdddkZ'ejdddkZ(dZ)e*d ed< ejdddkZ+ejdddkZ,dZ-eejddZ.e*d ed< ejdddkZ/dZ0dZ1dZ2eed < dZ3dZ4e
e5 ed!< dZ6ejd"ddkZ7ejd#ddkZ8dZ9dS )$rJ   ZTORCHINDUCTOR_CUDAGRAPHSr   TFN.cudagraph_capture_sizesr   2   "cudagraph_dynamic_shape_warn_limitZ&TORCHINDUCTOR_COALESCE_TILING_ANALYSISr   coalesce_tiling_analysis	max_tilesprefer_nd_tilingautotune_at_compile_timeautotune_with_sample_inputstile_reductionsZ!TORCHINDUCTOR_UNIQUE_KERNEL_NAMESZ&TORCHINDUCTOR_UNIQUE_USER_KERNEL_NAMESr   r   r   Z#TORCHINDUCTOR_PERSISTENT_REDUCTIONSZ$TORCHINDUCTOR_COOPERATIVE_REDUCTIONSZTORCHINDUCTOR_MULTI_KERNEL)r   r         multi_kernelZTORCHINDUCTOR_DIVISIBLE_BY_16      spill_thresholdr   ZENABLE_PERSISTENT_TMA_MATMULZTORCHINDUCTOR_SKIP_L1):ry   rz   r{   r   r   r   Z
cudagraphsZcudagraph_treesZcudagraph_skip_dynamic_graphsr   r   r   r   r~   r}   Zslow_path_cudagraph_assertsZ!cudagraph_trees_history_recordingr   Z cudagraph_support_input_mutationZ#cudagraph_unexpected_rerecord_limitr   Zforce_cudagraph_syncZforce_cudagraphs_warmupZfast_path_cudagraph_assertsZskip_cudagraph_warmupZdebug_sync_graphZdebug_sync_kernelZdense_indexingr   r|   r   r   Zautotune_pointwiseZautotune_cublasLtr   r   r   Z tiling_prevents_pointwise_fusionZ tiling_prevents_reduction_fusionZunique_kernel_namesZunique_user_kernel_namesr   r   Zpersistent_reductionsZcooperative_reductionsZforce_cooperative_reductionsr   Zdivisible_by_16Zmin_split_scan_rblockZstore_cubinr   Zuse_block_ptrr   r   Zcodegen_upcast_to_fp32Zenable_persistent_tma_matmulZskip_l1_cacheZ.disallow_failing_autotune_kernels_TESTING_ONLYr   r   r   r   rJ   F  sr   
 $

	rJ   c                   @   s  e Zd ZU dZdZejdddkZejddZ	ejddZ
ed	 ed
< ejddZdZdZdZeed< dZeed< dZeed< dZeed< i Zeeef ed< ejdddkZeed< ejdddkZeed< eejddZeed< i Zeeef ed< dZeed< dZeed< dZ eed< dZ!eed< e"  Z#eed < dZ$eed!< dZ%eed"< dZ&e'e ed#< i Z(ee)j*j+e,e f ed$< dZ-e'e,e  ed%< dS )&aot_inductorz9
    Settings for Ahead-Of-Time Inductor Compilation
    rU   ZAOT_INDUCTOR_DEBUG_COMPILEr   r   Z&AOT_INDUCTOR_COMPILE_WRAPPER_OPT_LEVELZO1Z-AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER)r   r   23 debug_intermediate_value_printerZ&AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINTNFuse_runtime_constant_foldingforce_mmap_weightspackagepackage_cpp_onlymetadataZ/AOTINDUCTOR_RAISE_ERROR_ON_IGNORED_OPTIMIZATION#raise_error_on_ignored_optimizationZDUMP_AOTI_MINIFIERdump_aoti_minifierZAOTINDUCTOR_REPRO_LEVELr   repro_levelpresetsallow_stack_allocationuse_minimal_arrayref_interfaceTpackage_constants_in_sopackage_constants_on_diskprecompile_headersembed_kernel_binaryemit_multi_arch_kernelmodel_name_for_generated_filescustom_ops_to_c_shimscustom_op_libs).ry   rz   r{   __doc__Zoutput_pathr   r   r   Zdebug_compileZcompile_wrapper_opt_levelr   r   r}   Zfiltered_kernel_namesZserialized_in_specZserialized_out_specr   r|   r   r   r   r   dictr   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Z_opsZ
OpOverloadlistr   r   r   r   r   r     sF   
 
	
r   c                
   @   s  e Zd ZU dZdZee ed< dZee ed< dZ	e
d ed< dZdZdZdZejd	ejejejejd
ZdZee ed< g dZee ed< ejdddkZdZdZee ed< dZ eed< ejdddkZ!e"ed< ejdZ#ee ed< ejdZ$ee ed< ejddZ%eed< ejdZ&ee ed< ejdddkZ'e"ed< ejd ddkZ(e"ed!< ejd"d#Z)eed$< d%Z*e"ed&< dZ+e"ed'< dZ,e"ed(< dS ))cudaz9Settings for cuda backend, today this consists of cutlassNarchr"   -O1)-O0r   -O2-O3z-OScompile_opt_levelFZTORCHINDUCTOR_CUTLASS_DIRz../third_party/cutlass/cutlass_max_profiling_configs)r   r   rd   rf   %cutlass_max_profiling_swizzle_optionsZCUTLASS_EPILOGUE_FUSIONr   r   cuda_cxxr   cutlass_backend_min_gemm_sizeZ/INDUCTOR_CUDA_BACKEND_GENERATE_TEST_RUNNER_CODEgenerate_test_runnerZTORCHINDUCTOR_CUTLASS_ALLOWLISTcutlass_op_allowlist_regexZTORCHINDUCTOR_CUTLASS_DENYLISTcutlass_op_denylist_regexZ)TORCHINDUCTOR_CUTLASS_INSTANTIATION_LEVELcutlass_instantiation_levelZTORCHINDUCTOR_CUTLASS_PRESETScutlass_presetsZ+TORCHINDUCTOR_CUTLASS_HASH_WITH_COMPILE_CMDcutlass_hash_with_compile_cmdZ"TORCHINDUCTOR_CUTLASS_PRESCREENINGcutlass_prescreeningZ!TORCHINDUCTOR_CUTLASS_ENABLED_OPSallcutlass_enabled_opsTuse_binary_remote_cacheupload_to_binary_remote_cachebinary_remote_cache_force_write)-ry   rz   r{   r   r   r   r   r}   r"   r   r   Zenable_cuda_ltoZenable_ptxas_infoZenable_debug_infouse_fast_mathr   r   r   pathabspathjoindirnamer   __file__Zcutlass_dirr   r~   r   r   Zcutlass_epilogue_fusion_enabledZcutlass_tma_onlyr   r   r   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r     sX   
 




r   c                   @   s   e Zd ZU g Zee ed< ddgZee ed< dZe	d ed< dZ
dZd	Zd	ZdZd
Zee ed< ejdZejdddkZeed< d
Zee ed< d
Zee ed< d
Zee ed< dZeed< d
Zeee  ed< dZeed< d
S )rocmr   Zgfx90aZgfx942ck_supported_archr   )	r   r   r   r   z-Osz-Ozz-Ominz-Ofastz-Omaxr   FTN	rocm_homeZTORCHINDUCTOR_CK_DIRZ-INDUCTOR_CK_BACKEND_GENERATE_TEST_RUNNER_CODEr   r   r   n_max_profiling_configsck_max_profiling_configsck_tile_max_profiling_configsuse_preselected_instanceskBatch_sweepr   split_k_threshold)ry   rz   r{   r   r   r   r}   r   r   r   Zis_debugZ
save_tempsr   Zflush_denormalsZprint_kernel_resource_usager   r   r   r   r   Zck_dirr   r|   r   r~   r   r   r   r   r   r   r   r   r   r     s,   
 
r   )r   rJ   halidecpu_backend)rJ   r   cuda_backendc                   @   sB   e Zd ZU dZdZdZed ed< dZed ed< dZ	dZ
dZd	S )
r   hostz	host-cudaAnderson2021)r  ZLi2018	Adams2019ZMullapudi2016scheduler_cudar  scheduler_cpuFN)ry   rz   r{   Z
cpu_targetZ
gpu_targetr  r   r}   r  ZassertsdebugZscan_kernelsr   r   r   r   r   N  s   
 r   c                   @   s   e Zd ZU ejdddkZejdddkZdZe	e
 ed< dZdZdZdZdZdZdZejd	ddkZejd
ddkZejddZejddZdZdZe	ee
gdf  ed< ejdddkZdZeed< dS )traceZTORCH_COMPILE_DEBUGr   r   ZTORCH_COMPILE_DEBUG_SAVE_REALN	debug_dirFTZINDUCTOR_POST_FUSION_SVGZINDUCTOR_ORIG_FX_SVGZINDUCTOR_DOT_GRAPH_SHAPE_SVGZ INDUCTOR_LOG_URL_FOR_GRAPH_XFORM
upload_tarZLOG_AUTOTUNE_RESULTS1log_inductor_triton_kernel_to_post_grad_node_info)ry   rz   r{   r   r   r   ZenabledZsave_real_tensorsr
  r   r   r}   Z	debug_logZinfo_logZfx_graphZfx_graph_transformedZir_pre_fusionZir_post_fusionZoutput_codeZgraph_diagramZdraw_orig_fx_graphZdot_graph_shapeZlog_url_for_graph_xformZcompile_profiler  r   Zlog_autotuning_resultsr  r|   r   r   r   r   r	  j  s&   
 
r	  )
ztrace.upload_tarr@   rA   rB   zaot_inductor.repro_levelzaot_inductor.dump_aoti_minifierr>   r?   rt   rC   _save_config_ignore)r	  zcuda.cutlass_dirrr   r   r?   r>   rt   rC   r   r*   r+   r.   r/   _cache_config_ignore_prefixexternal_matmulc                   @   sV   e Zd ZU dZeed< dZee ed< dZ	dZ
dZee ed< dZee ed< dZdS )test_configsF%force_extern_kernel_in_multi_templateNmax_mm_configsautotune_choice_name_regexautotune_choice_desc_regex)ry   rz   r{   r  r|   r}   r  r   r~   Zruntime_triton_dtype_assertZstatic_cpp_dtype_assertr  r   r  Z*graphsafe_rng_func_ignores_fallback_randomr   r   r   r   r    s   
 r  )*(  r   r   typingr   r   r   r   r   r   r   Z!torch._inductor.custom_graph_passZtorch._environmentr   Ztorch.utils._config_moduler	   r
   r   r   r   Zinplace_paddingZcan_inplace_pad_graph_inputr|   r   r   r   r   r   r#   r%   Zenable_auto_functionalized_v2r  Zdisable_progressZverbose_progressr&   r~   r}   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r   r   r4   r6   r7   r8   r9   Zonline_softmaxZdceZstatic_weight_shapesZsize_assertsZnan_assertsZscalar_assertsZalignment_assertsZpick_loop_ordersZinplace_buffersZallow_buffer_reuseZmemory_planningr   Zbfloat16_atomic_adds_enabledr=   Zbenchmark_harnessZepilogue_fusionZprologue_fusionZepilogue_fusion_firstZpattern_matcherZb2b_gemm_passr>   r^   Zcustom_graph_passZCustomGraphPassTyper?   r@   ZfxZGraphrA   rB   graphrC   r   rD   Zsplit_cat_fx_passesZ efficient_conv_bn_eval_fx_passesZis_predispatchZgroup_fusionZbatch_fusionrE   rF   Zreorder_for_localityZdynamic_scale_rblockZforce_fuse_int_mm_with_mulZuse_mixed_mmrH   rK   Z reorder_for_compute_comm_overlaprL   rM   Zreorder_for_peak_memoryZestimate_op_runtimeZintra_node_bwZinter_node_bwrO   Zmax_autotuneZmax_autotune_pointwiseZmax_autotune_gemmZdisable_decompose_krQ   Zgraph_partitionZforce_same_precisionupperZmax_autotune_gemm_backendsZmax_autotune_conv_backendsrS   rT   Zautotune_fallback_to_atenZunbacked_symint_fallbackZsearch_autotune_cacheZ	save_argsZautotune_in_subprocZ+max_autotune_subproc_result_timeout_secondsZ-max_autotune_subproc_graceful_timeout_secondsZ.max_autotune_subproc_terminate_timeout_secondsZautotune_multi_deviceZcoordinate_descent_tuningZ'coordinate_descent_check_all_directionsZ coordinate_descent_search_radiusr`   rc   rZ   rW   rX   Zautoheuristic_log_pathr"   ZhipZlayout_opt_defaultZlayout_optimizationZforce_layout_optimizationZkeep_output_strideZwarn_mix_layoutZrealize_reads_thresholdZrealize_opcount_thresholdZrealize_acc_reads_thresholdZfallback_randomZimplicit_fallbacksZ assume_unaligned_fallback_outputZaggressive_fusionrg   rh   Zenabled_metric_tablesri   Zscore_fusion_memory_thresholdZbenchmark_epilogue_fusionZ max_epilogue_benchmarked_choicesZmax_fusion_sizeZ)max_fusion_buffer_group_pairwise_attemptsZmax_pointwise_cat_inputsZforce_pointwise_catZunroll_reductions_thresholdZcomment_originZconv_1x1_as_mmZsplit_reductionsZmin_num_splitZbenchmark_kernelZconstant_and_index_propagationZalways_keep_tensor_constantsZassert_indirect_indexingZcompute_all_boundsZcombo_kernelsZbenchmark_combo_kernelZcombo_kernels_autotuneZcombo_kernel_allow_mixed_sizesZ#combo_kernel_foreach_dynamic_shapesZjoint_graph_constant_foldingZdebug_index_assertsZemulate_precision_casts__version__Zis_nightly_or_sourceZdeveloper_warningsZ"optimize_scatter_upon_const_tensorrm   rn   rq   rr   rs   Z_fuse_ddp_communicationZ_fuse_ddp_bucket_sizert   ru   rv   r   r   r   r   r   r   Zlibfb.pyr   __package__Zget_dir_pathr   r   replacesepr   
ValueErrorImportErrorZkernel_name_max_opsZshape_paddingZcomprehensive_paddingZpad_channels_lastZdisable_padding_cpuZpadding_alignment_bytesZpadding_stride_thresholdZpad_outputsZbw_outputs_user_visibler   Zpermute_fusionZprofiler_mark_wrapper_callZgenerate_intermediate_hooksZdebug_ir_tracebackZ_raise_error_for_testingZ_profile_varZprofile_bandwidthZprofile_bandwidth_regexr   Z/profile_bandwidth_with_do_bench_using_profilingZdisable_cpp_codegenr   r   r   r   r   r   r   Zenable_linear_binary_foldingr   r   r   rJ   r   r   r   r  r  r   r	  r  r  r  ZTensorr  Ztorch.utils._config_typingmodulesry   r   r   r   r   <module>   s  
  


 

	%


v Gu CC$
