o
    ,h0                     @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 de j
fddZdeddfdd	Zde j
fd
dZ				d#dededee dedee dee defddZG dd dZG dd dZ	d$deeef deee  deeeedf f fddZ	d%deded edefd!d"ZdS )&    N)AnyOptionalUnion)_get_device_indexreturnc                   C      t jdkr
tdS tdS )Nwin32z
nvcuda.dllzlibcuda.so.1sysplatformctypesCDLL r   r   L/var/www/html/scripts/venv/lib/python3.10/site-packages/torch/cuda/_utils.py_get_cuda_library   s   


r   resultc                 C   sR   | dkrd S t  }t }|| t | |jd ur |j nd}td| )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr   cuGetErrorStringbyrefvaluedecodeRuntimeError)r   err_strlibcudaerror_messager   r   r   _check_cuda   s   r   c                   C   r   )Nr   znvrtc64_120_0.dllzlibnvrtc.sor	   r   r   r   r   _get_nvrtc_library    s   


r    kernel_sourcekernel_namecompute_capabilityheader_codecuda_include_dirsnvcc_optionsc              	      s  ddl }t d dtddf fdd}|  ds!d|  } |r*|d	 |  }n| }|d
}	|du rF|j|j }
|
j	 |
j
 }g }|d|   |rc|D ]}|d|   qV|rr|D ]
}||d
 qgddlm} dd |D }|dd |D  t|}tj| | }t }|t||	| d ddd |||}| krt }|t| t|j}|| td|j  t }||t| t|j}||| t| |jS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        header_code (str, optional): Additional header code to prepend to the kernel source
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC

    Returns:
        str: The compiled PTX code
    r   Nr   r   c                    sL   |  kr$t  }| t | |jd ur|j nd}td| d S )Nr   r   )r   r   nvrtcGetErrorStringr   r   r   r   )r   r   r   NVRTC_SUCCESSlibnvrtcr   r   check_nvrtcJ   s   

z#_nvrtc_compile.<locals>.check_nvrtcz
extern "C"zextern "C" 
utf-8z--gpu-architecture=sm_z-I)COMMON_NVCC_FLAGSc                 S   s   g | ]}|d kr|qS )z--expt-relaxed-constexprr   .0flagr   r   r   
<listcomp>y   s    z"_nvrtc_compile.<locals>.<listcomp>c                 S   s   g | ]}| d qS )r,   )encoder.   r   r   r   r1   |   s    z.cuzKernel compilation failed:
) 
torch.cudar   intstrip
startswithr2   cudaget_device_propertiescurrent_devicemajorminorappendtorch.utils.cpp_extensionr-   extendlenr   r   c_void_pnvrtcCreateProgramr   nvrtcCompileProgramc_size_tnvrtcGetProgramLogSizecreate_string_bufferr   nvrtcGetProgramLogr   r   nvrtcGetPTXSizenvrtcGetPTXnvrtcDestroyProgram)r    r!   r"   r#   r$   r%   torchr*   full_sourcesource_bytespropsoptions	directoryoptionr-   nvrtc_compatible_flagsnum_optionsoptions_arrayprogreslog_sizelogptx_sizeptxr   r'   r   _nvrtc_compile)   sh   

rZ   c                   @   s2   e Zd ZdejddfddZdeddfdd	ZdS )
_CudaModulemoduler   Nc                 C   s   || _ i | _d S N)_module_kernels)selfr\   r   r   r   __init__      
z_CudaModule.__init__name_CudaKernelc              
   C   s   || j v r
| j | S ddlm} | }t }zt|t|| j|	d t
|| j}|| j |< |W S  tyJ } z	td| d|d }~ww )Nr   )r   r,   zNo kernel named 'z' in this module)r_   torch.cuda._utilsr   r   r@   r   cuModuleGetFunctionr   r^   r2   rd   r   AttributeError)r`   rc   r   r   funckernelerrr   r   r   __getattr__   s$   


z_CudaModule.__getattr__)__name__
__module____qualname__r   r@   ra   strrk   r   r   r   r   r[      s    r[   c                   @   st   e Zd ZdZdejdejddfddZ						dd
eeeef deeeef de	e
 dede	e ddfddZdS )rd   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    rh   r\   r   Nc                 C   s   || _ || _d S r]   )rh   r\   )r`   rh   r\   r   r   r   ra      rb   z_CudaKernel.__init__   rq   rq   r   gridblockargs
shared_memstreamc                 C   sl  ddl }|jj }|sg }g }g }	|D ]Y}
t|
|jr?|
js*|
jr&|
 s*t	dt
|
 }|| |	t
| qt|
trRt
|
}|	t
| qt|
tret
|
}|	t
| qtdt|
 t
jt|	  }t|	D ]\}}
t
|
t
j||< qz|du rddl}|j }t|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type: rq      )rJ   r7   _utilsr   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr   r@   data_ptrr<   r   r4   c_intfloatc_float	TypeErrortyper?   	enumeratecastr3   current_streamr   cuLaunchKernelrh   _as_parameter_)r`   rr   rs   rt   ru   rv   rJ   r   processed_argsc_argsargptrr   r   c_args_arrayir   r   r   __call__   sV   





z_CudaKernel.__call__)rp   rp   Nr   N)rl   rm   rn   __doc__r   r@   ra   tupler4   r   listr   r   r   r   r   r   rd      s*    rd   rY   kernel_namesc           	   	   C   s   ddl }t }t| tr| d} t }|j }| t	|
t||  W d   n1 s2w   Y  |s=t|S i }|D ]}t }t	|t|||d t||||< qA|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr,   )r3   r   ry   ro   r2   r   r@   r7   r   r   cuModuleLoadDatar   r[   rf   rd   )	rY   r   rJ   r   r\   rv   kernelsrc   rh   r   r   r   _cuda_load_module  s*   


r   Fdeviceoptional	allow_cpuc                 C   s   t | tr| S t | trt| } t | tjr2|r&| jdvr%td|  n| jdkr2td|  tj sAt | tj	jrA| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )r7   cpuz(Expected a cuda or cpu device, but got: r7   z!Expected a cuda device, but got: )ry   r4   ro   rJ   r   r   r~   jitis_scriptingr7   idx_torch_get_device_index)r   r   r   r   r   r   r   N  s   





r   )Nr   NNr]   )FF)r   r
   typingr   r   r   rJ   torch._utilsr   r   r   r   r4   r   r   ro   r   bytesrZ   r[   rd   dictr   boolr   r   r   r   <module>   s^    
|]


1