o
    h
K                     @   sl  d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efddZde
j fddZ!e" dd Z#e" de$fddZ%de$fddZ&e" de$fddZ'e"ddd Z(de$fddZ)eddG d d! d!Z*G d"d# d#eZ+dS )$    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   dt tttf fdd}|S )Nreturnc                 S   s0   | j j}|j j}||ksJ d|dkrdS dS )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidth r   Z/var/www/html/scripts/venv/lib/python3.10/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibility   s   z-min_dot_size.<locals>.check_dot_compatibility)r   int)r   r   r   r   r   min_dot_size   s   	r!   r   c                   C   s   t jjS N)r	   r   ptxasr   r   r   r   	get_ptxas!   s   r$   c                  C   s0   t jj} | d ur
| S tt jdgd}|S )Nz	--versionutf-8)r	   r   mock_ptx_version
subprocesscheck_outputr$   pathdecode)mock_verversionr   r   r   get_ptxas_version%   s
   r-   c                 C   sr   t | tsJ tt| d\}}|dkr#|dk rd| S d| d S |dkr+d| S |dkr3d	| S td
|  )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P         F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr    splitRuntimeError)cuda_versionmajorminorr   r   r   ptx_get_version.   s   r?   archc                 C   s"   | j }|d u rt j}t|}|S r"   )ptx_versionr$   r,   r?   )optionsr@   rA   r<   r   r   r   get_ptx_version_from_optionsA   s
   rC   c                 C   s"   t | |}td|}d| }|S )NV   z+ptx)rC   min)rB   r@   rA   llvm_ptx_versionfeaturesr   r   r   get_featuresI   s   


rH   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r)   fr   r   r   	file_hashW   s   $rP   
capabilityc                 C   s   | dkrdnd}d|  | S )NZ   a sm_r   )rQ   suffixr   r   r   sm_arch_from_capability]   s   rW   T)frozenc                   @   s.  e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZe	e ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"d# Z d$d% Z!dS )&CUDAOptions   	num_warpsr2   num_ctas   
num_stagesNmaxnreg)r2   r2   r2   cluster_dimsrA   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)rj   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowr@   c                 C   s   t tjd }| jd u ri nt| j}|dd s%tjjp"t	|d |d< t
| dt|  | jdkr?| j| jd @ dksCJ dd S )Nlib	libdevicezlibdevice.10.bcrp   r   r2   znum_warps must be a power of 2)r   __file__parentrp   dictgetr	   r   libdevice_pathr8   object__setattr__tupleitemsr[   )selfdefault_libdirrp   r   r   r   __post_init__}   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S r"   )rP   ).0kvr   r   r   	<genexpr>   s    z#CUDAOptions.hash.<locals>.<genexpr>rp   _c                 S   s   g | ]\}}| d | qS )-r   )r   namevalr   r   r   
<listcomp>   s    z$CUDAOptions.hash.<locals>.<listcomp>r%   )
ry   __dict__r~   sortedjoinr   rK   rL   encoderN   )r   	hash_dictkeyr   r   r   hash   s   
zCUDAOptions.hash)"__name__
__module____qualname__r[   r    __annotations__r\   r^   r_   r   r`   r~   rA   ra   r8   rb   rc   boolrd   re   rh   r   ri   rk   rn   ro   rp   ry   rq   rs   rt   r@   r   r   r   r   r   r   rY   c   s0   
 
rY   c                       s   e Zd ZedefddZdd ZdefddZdedd	f fd
dZ	de
fddZdd Zdd Zdeeef fddZdd Zedd Zedd Zdd Zdd Zdd Zd d! Zd"d# Ze d$d% Z  ZS )&CUDABackendr   c                 C   s
   | j dkS )Nrr   )backend)r   r   r   r   supports_target   s   
zCUDABackend.supports_targetc                 C   s0   d}t ||}|std| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r2   )re	fullmatch
ValueErrorr    group)r   r@   patternmatchr   r   r   _parse_arch   s
   zCUDABackend._parse_archr   c                 C   s   |  |j}d| S )Ncuda:)r   r@   )r   rB   rQ   r   r   r   get_target_name      
zCUDABackend.get_target_nameNc                    s   t  | d| _d S )Ncubin)super__init__
binary_ext)r   r   	__class__r   r   r      r   zCUDABackend.__init__c                    s   dt jjpd| jj i}| fddtj D  t	| 
|d }d|vr?ttj}|dkr7|d tt||d< d|vrK|d	krKd
|d< d|vrUt jj|d< |d	kr[dnd|d< tdi |S )Nr@   smc                    s*   i | ]}| v r | d ur| | qS r"   r   )r   r   optsr   r   
<dictcomp>   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>rh   Y   fp8e4nvri   rR   )rg   rc   i   @r   ro   r   )r	   runtimeoverride_archr   r@   updaterY   __dataclass_fields__keysr    r   setrh   addr~   r   languagedefault_fp_fusion)r   r   argsrQ   rh   r   r   r   parse_options   s   

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r2      )r[   r\   sharedr`   )r   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr|jn|jt	| j
d}|S )Nr   r1   )convert_custom_typesr!   )triton.language.extra.cudar   extrarr   r    r   r@   convert_custom_float8_sm80convert_custom_float8_sm70r!   r   )r   rB   rr   rQ   codegen_fnsr   r   r   get_codegen_implementation   s   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )rv   ztriton.language.extra.libdevice)r   rv   )r   rv   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s   t | d S r"   )r   load_dialects)r   ctxr   r   r   r      s   zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| |d dk r"tj	| tj
| tj| tj| tj| tj| tj| ||  | S )Nr5   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optrQ   pmr   r   r   	make_ttir   s   
zCUDABackend.make_ttirc                 C   sv  |j d ur| dt| j|j  t }|jd ur.|jd |_	|jd |_
|jd |_t| j}| }tj|d| |jd|j tj| |d dkrYtj| tjj|| tj| tj| tj| tj| tj||d	k tjj| tj| |d d
v rtj| tj | tj!| tj | tj"| tjj#$||j%| tj&||j% tj'| tj(||j%| n_|d dkr.tj| tj | tj!| tj)| tj*| tjj+| tj&||j% tj'| tj,||j% tj(||j%| tj"| tjj-| ntj!| tj | tj| tj.| tj||d	k tj/| tjj0| tj| tjj1| tj2| tj3| tj| tj4| |d dkrtjj5| tjj6| tj7| tj | |8|  |j	|j
|jf|d< | 9 }||d< | S )Nzttg.maxnregr   r2   r   r   r   r5   r   r1   )r   r   r   r`   tensordesc_meta):r_   set_attrr   builderr   get_int32_attrr   ClusterInfor`   clusterDimXclusterDimYclusterDimZr   r   r   r   add_convert_to_ttgpuirr[   r\   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecr^   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_tma_loweringadd_fence_insertionadd_sccpr   get_tensordesc_metadata)r   r   r   rQ   cluster_infor   dump_enabledr   r   r   r   
make_ttgir   s   


zCUDABackend.make_ttgirc                 C   sn   |}t |j}|  tj| tj| tj	
| tj| tj| || | |d< |S )Nr   )r   r   r   r   r   r   r   r   r  r   r   r   r   r   r  )r   srcr   rB   rQ   r   r   r   r   r   	ttgir_opt0  s   
zCUDABackend.ttgir_optc                 C   s   t || jj}|}t|j}|  tjj	
| tj| tj| tj| tj| tjj	| tj| tjj||| tj| tj| tjj	| tjj	| tj| tj| tj| tjjs~tj| || t !  t  }tjj"rt#dt $||}	t%|}
t&|| jj}d}t'  t (|	||
| t)|	 |j*rdd |j*D }t +|	| t ,|	t j- |.d}|d ur||d< |.d|d< |.d	|d
< |.d|d< |.d|d< t/|	}~	~|S )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )r   r   r)   r   r   r   r   h      z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsr[   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_align)0rC   r   r@   r   r   r   r   r   r   r   add_lower_mmar   r   add_allocate_warp_groupsconvertadd_scf_to_cfadd_allocate_shared_memoryadd_allocate_tensor_memory"add_allocate_global_scratch_memoryadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   r	   compilationdisable_line_infollvmiradd_di_scoper   r   init_targetsenable_asanr;   	to_modulerW   rH   set_short_ptrattach_datalayoutset_nvvm_reflect_ftzrp   link_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr8   )r   r  r   rB   rQ   rA   r   r   r   llvm_modprocrG   triplepathstotal_num_warpsretr   r   r   	make_llir?  sd   


zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}t||||g |jd}	t	d|	}
t
|
dks.J |
d |d< |d  d|d  }tjd	d
| |	tjd}	tjdd| |	tjd}	tdd|	}	tjjrltd t|	 |	S )Nr  Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r2   r   r   r5   r.   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rT   z // -----// NVPTX Dump //----- //)rC   r   r@   rW   rH   r   translate_to_asmrc   r   findalllensub	MULTILINEr	   r   
dump_nvptxprint)r   r  r   r   rQ   rA   r-  r,  rG   r0  namesr   r   r   make_ptx{  s    zCUDABackend.make_ptxc                 C   s|  t  j}tjdddd&}tjdddd}|| |  |jd }tjj	r.dd	gndg}	|j
r6g nd
g}
t|}tjjrEddgng }|jrP|jdng }|g|	|
d||d| |jd|}z%tj|dd|d tj|jrt|j tj|jrt|j W n\ tjy } zOt|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }t| d| dd| dd }~ww t|d}| }W d    n	1 sw   Y  tj|rt| W d    n1 sw   Y  W d    |S W d    |S 1 s7w   Y  |S )NFwz.ptx)deletemoderV   rz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
rI   )r$   r)   tempfileNamedTemporaryFilewriteflushr   r	   r  r  rc   rW   r   disable_ptxas_optra   r:   r'   r   osexistsremoveCalledProcessErrorrJ   rM   
returncodesignalSIGSEGVr
   r   )r   r  r   r   rQ   r#   fsrcflogfbin	line_infofmadr@   disable_optptx_extra_options	ptxas_cmdelog_filelogerrorrO   r   r   r   r   
make_cubin  s   






*///zCUDABackend.make_cubinc                    s    j |tjkr  fdd|d<  fdd|d< n|tjkr/ fdd|d<  fdd|d< fd	d|d
< fdd|d< d S )Nc                        | | S r"   )r   r  r   rQ   rB   r   r   r   <lambda>      z(CUDABackend.add_stages.<locals>.<lambda>r   c                    ra  r"   )r  rb  rc  r   r   rd    re  ttgirc                    ra  r"   )r  rb  rc  r   r   rd    re  c                    ra  r"   )r1  rb  rc  r   r   rd    re  llirc                        | | jjS r"   )r;  r   r@   rb  rB   r   r   r   rd    r  ptxc                    rh  r"   )r`  r   r@   rb  ri  r   r   rd    r  r   )r   r@   r   TRITONGLUON)r   stagesrB   r   r   rc  r   
add_stages  s   

zCUDABackend.add_stagesc                 C   s   t  }| d| jj S )Nr   )r-   r   r@   )r   r,   r   r   r   r     s   zCUDABackend.hash)r   r   r   staticmethodr   r   r   r8   r   r   r   r   r   r   r   r   r   r   r   r  r  r1  r;  r`  rn  	functools	lru_cacher   __classcell__r   r   r   r   r      s,    



J<3r   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r   tritonr	   triton.runtime.errorsr
   dataclassesr   rp  typingr   r   r   r   typesr   rK   r   rH  rR  rM  r'   pathlibr   r!   
NvidiaToolr$   rq  r-   r    r?   rC   rH   rP   rW   rY   r   r   r   r   r   <module>   s<    

*