o
    h0                     @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 de j
fddZdeddfdd	Zde j
fd
dZ				d#dededee dedee dee defddZG dd dZG dd dZ	d$deeef deee  deeeedf f fddZ	d%deded edefd!d"ZdS )&    N)AnyOptionalUnion)_get_device_indexreturnc                   C      t jdkr
tdS tdS )Nwin32z
nvcuda.dllzlibcuda.so.1sysplatformctypesCDLL r   r   W/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/torch/cuda/_utils.py_get_cuda_library   s   


r   resultc                 C   sR   | dkrd S t  }t }|| t | |jd ur |j nd}td| )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr   ZcuGetErrorStringbyrefvaluedecodeRuntimeError)r   err_strlibcudaerror_messager   r   r   _check_cuda   s   r   c                   C   r   )Nr   znvrtc64_120_0.dllzlibnvrtc.sor	   r   r   r   r   _get_nvrtc_library    s   


r    kernel_sourcekernel_namecompute_capabilityheader_codecuda_include_dirsnvcc_optionsc              	      s  ddl }t d dtddf fdd}|  ds!d|  } |r*|d	 |  }n| }|d
}	|du rF|j|j }
|
j	 |
j
 }g }|d|   |rc|D ]}|d|   qV|rr|D ]
}||d
 qgddlm} dd |D }|dd |D  t|}tj| | }t }|t||	| d ddd |||}| krt }|t| t|j}|| td|j  t }||t| t|j}||| t| |jS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        header_code (str, optional): Additional header code to prepend to the kernel source
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC

    Returns:
        str: The compiled PTX code
    r   Nr   r   c                    sL   |  kr$t  }| t | |jd ur|j nd}td| d S )Nr   r   )r   r   ZnvrtcGetErrorStringr   r   r   r   )r   r   r   ZNVRTC_SUCCESSZlibnvrtcr   r   check_nvrtcJ   s   z#_nvrtc_compile.<locals>.check_nvrtcz
extern "C"zextern "C" 
utf-8z--gpu-architecture=sm_z-I)COMMON_NVCC_FLAGSc                 S   s   g | ]}|d kr|qS )z--expt-relaxed-constexprr   .0flagr   r   r   
<listcomp>y   s
    z"_nvrtc_compile.<locals>.<listcomp>c                 S   s   g | ]}| d qS )r(   )encoder*   r   r   r   r-   |   s    z.cuzKernel compilation failed:
) 
torch.cudar   intstrip
startswithr.   cudaZget_device_propertiesZcurrent_devicemajorminorappendZtorch.utils.cpp_extensionr)   extendlenr   r   c_void_pZnvrtcCreateProgramr   ZnvrtcCompileProgramZc_size_tZnvrtcGetProgramLogSizeZcreate_string_bufferr   ZnvrtcGetProgramLogr   r   ZnvrtcGetPTXSizeZnvrtcGetPTXZnvrtcDestroyProgram)r   r    r!   r"   r#   r$   torchr&   Zfull_sourcesource_bytespropsoptions	directoryoptionr)   Znvrtc_compatible_flagsZnum_optionsZoptions_arrayprogresZlog_sizelogZptx_sizeptxr   r%   r   _nvrtc_compile)   sh   

rD   c                   @   s2   e Zd ZdejddfddZdeddfdd	ZdS )
_CudaModulemoduler   Nc                 C   s   || _ i | _d S N)_module_kernels)selfrF   r   r   r   __init__      
z_CudaModule.__init__name_CudaKernelc              
   C   s   || j v r
| j | S ddlm} | }t }zt|t|| j|	d t
|| j}|| j |< |W S  tyJ } z	td| d|d }~ww )Nr   )r   r(   zNo kernel named 'z' in this module)rI   Ztorch.cuda._utilsr   r   r9   r   cuModuleGetFunctionr   rH   r.   rN   r   AttributeError)rJ   rM   r   r   funcZkernelerrr   r   r   __getattr__   s$   


z_CudaModule.__getattr__)__name__
__module____qualname__r   r9   rK   strrS   r   r   r   r   rE      s    rE   c                   @   st   e Zd ZdZdejdejddfddZ						dd
eeeef deeeef de	e
 dede	e ddfddZdS )rN   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    rQ   rF   r   Nc                 C   s   || _ || _d S rG   )rQ   rF   )rJ   rQ   rF   r   r   r   rK      rL   z_CudaKernel.__init__   rY   rY   r   gridblockargs
shared_memstreamc                 C   sl  ddl }|jj }|sg }g }g }	|D ]Y}
t|
|jr?|
js*|
jr&|
 s*t	dt
|
 }|| |	t
| qt|
trRt
|
}|	t
| qt|
tret
|
}|	t
| qtdt|
 t
jt|	  }t|	D ]\}}
t
|
t
j||< qz|du rddl}|j }t|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type: rY      )r:   r3   Z_utilsr   
isinstanceZTensorZis_cudaZis_cpu	is_pinned
ValueErrorr   r9   Zdata_ptrr6   r   r0   c_intfloatc_float	TypeErrortyper8   	enumeratecastr/   current_streamr   ZcuLaunchKernelrQ   Z_as_parameter_)rJ   rZ   r[   r\   r]   r^   r:   r   Zprocessed_argsZc_argsargZptrrc   re   Zc_args_arrayir   r   r   __call__   sV   





z_CudaKernel.__call__)rX   rX   Nr   N)rT   rU   rV   __doc__r   r9   rK   tupler0   r   listr   rm   r   r   r   r   rN      s*    rN   rC   kernel_namesc           	   	   C   s   ddl }t }t| tr| d} t }|j }| t	|
t||  W d   n1 s2w   Y  |s=t|S i }|D ]}t }t	|t|||d t||||< qA|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr(   )r/   r   r`   rW   r.   r   r9   r3   rj   r   ZcuModuleLoadDatar   rE   rO   rN   )	rC   rq   r:   r   rF   r^   ZkernelsrM   rQ   r   r   r   _cuda_load_module  s*   


rr   Fdeviceoptional	allow_cpuc                 C   s   t | tr| S t | trt| } t | tjr2|r&| jdvr%td|  n| jdkr2td|  tj sAt | tj	jrA| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )r3   cpuz(Expected a cuda or cpu device, but got: r3   z!Expected a cuda device, but got: )r`   r0   rW   r:   rs   rg   rb   ZjitZis_scriptingr3   idx_torch_get_device_index)rs   rt   ru   r   r   r   r   N  s   





r   )Nr   NNrG   )FF)r   r
   typingr   r   r   r:   Ztorch._utilsr   rx   r   r   r0   r   r   rW   rp   bytesrD   rE   rN   dictrr   boolr   r   r   r   <module>   s^    
|]


1