o
    ,h                 -   @   s6  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlmZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZm Z m!Z! d dlm"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z: d dl;m<Z= edZ>edZ?ej@jAZAejBCdddZDeEd\ZFZGZHde
e
e?e>f ge
e?e>f f fddZIde'fddZJdd ZKdd ZLeIeAjMeAjNge4 dddejOd d fd!d"ZPeIeAjQjReAjQjSge4 d#d$ ZTeIeAjUjReAjUjSge4 d%d&d'd(ZUdhd)d*ZVd+d, ZWdid.d/ZXeIeAjYjRd0d1 ZZeIeAj[e4 d2d3 Z[eIeAj\jReAj\jSeAj]jReAj]jSge4d4d5d6d7 Z^eIeAj_jReAj_jSge4 d8d9 Z_d:d; Z`dhd<ed=eaeb d>ecfd?d@ZdeIeAjejReAjejSge4 dAdB ZfdZgd=eaeb fdCdDZheIeAjijReAjijSge4 dEdF ZjeIeAjkjlddGdHdIZmeIeAjkjRejnddddJdKdLZoeIeAjpjReAjpjSge4 ejnddddJdMdNZqeIeAjpjreAjpjsge4 ejnddddJdOdPZteIeAjujReAjujSge4 dddddJdQdRZveIeAjwjReAjwjSge4 d<ed=eaeb dSebdTebfdUdVZxeIeAjyjRdhdWdXZzdYdZ Z{eIeAj|jRd[d\ Z}eIeAj~			djd]ed^ed_ed`ee daee dbeej fdcddZeIeAj	dkdeedfedgedbeej fdhdiZeIeAjdjdjddkd]edeedfedgedbeej f
dldmZeIeAj				 	 	j	%dldnejdoejd`ee dpee dbeej dqecdrebdsebdtebfdudvZeIeAjjRd-dwd<ed=ebdxedyejdzed{ecdefd|d}ZeIeAjjRd-dwd<ed=ebdxedyejdzed{ecdefd~dZe4 eIeAjjRdd ZeIeAjjRdddd d dddedzedee d5ee dee debdecdefddZeIeAjjReAjjge4 dd ZeIeAjjdhddZeIeAjjReAjjge4 dd ZeIeAjjdhddZeIeAjjRdd ZeIeAjjSdd ZeIeAjjRdd ZeIeAjjdd ZeIeAjjRdd ZeIeAjjRddddddddZeIeAjjRdmddZeIeAjjRdjddZeIeAjjRdmddZeIeAjjRdd ZeIeAjjdd Zd<edefddZd<ededefddZ	-didededecfddZdndededefddZdededecdefddZ	dodeded]edefddZdefddZeIeAjjReAjjge4ddădpdededecfddȄZeIeAjjReAjjSge4 d]edefddʄZeIeAjge4ddăd]efdd̄ZdedefddτZeIeAje4 d<ededecdefdd҄ZeIeAje4 dhd<ededecdefddԄZeIeAje4 dhd<edecdefddքZeIeAje4 dhd<edecdefdd؄ZeIeAjjRdqdedecdecfddۄZeIeAjjReAjjSge4 d]ededefddބZeIeAjjRdhdedecfddZeIeAjjReAjjSge4dddd d dd<edecdecdeeeef fddZeIeAjjReAjjSge4 d ddedededecdef
ddZeIeAjjReAjjSge4dddd-ddedecdeeeef fddZeIeAjjReAjjSge4dddd-d ddedecdecdeeeef fddZeIeAjjReAjjSge4 d-d ddedededecdecdefddZeIeAj΃e4ddd	-	-drdededecdecdeeeef f
ddZdedeececf fddZeIeAjjReAjjSge4d ddsdededeeef fddZeIeAjjReAjjge4dddddedeeeeef fddZeIeAjjR	 	-	dtded	ecd
ecdee fddZ֐dededeeaeb eaeb f fddZאdededee deeef fddZd]ededecfddZeIeAjڃd-d ddddddededecdecdee dee dee dee deeeeef fddZeIeAjjReAjjSgd-d dddededecdecdecdee defddZeIeAj݃e4dd d-d!	-	 	 dud<ededecd"ecdecdeeef fd#d$ZeIeAjjRd%d& ZeIeAje4 	-	 dvd]edededecd"ecdefd'd(Zd)d* Zd+d, ZeIeAje4 d-d. ZeIeAje4 d/d0 Zd1d2 ZeIeAje4d3d4d5 ZeIeAje4d3d6d7 Zd8d9 ZeIeAje4 d:d; ZeIeAje4 d<d= ZeIeAjjReAjjeAjjReAjjge4d3d>d? Zd@dA ZeIeAje4 dBdC ZeIeAje4 dDdE ZeIeAjjReAjjeAjjReAjjge4d3dFdG ZeIeAje4 dwd<edIedefdJdKZ eIeAje4 dLed<edIedMedef
dNdOZeIeAjjReAjjSge4d-d!djdjdPdQdRZeIeAjjReAjjSge4 ddGdSdTZeIeAjjdxdVdWZeIeAjj	dxdXdYZ
eIeAjjReAjjSge4 dkdZd[ZeIeAjjR	 	 dqd\d]ZeIeAje4d-d!d^d_ Zd`da ZdydcddZ	dkdeejd^ejdfeeaeb ebf dgeeaeb ebf dheeaeb ebf diecdjebdkeeeaeb ebf  fdldmZdndo ZeIeAjjRdeejd^ejd`eej dpeej dqeej drecdsedtefdudvZeIeAjjRdeejd^ejd`ejdfeaeb dgeaeb dheaeb diecdkeaeb djebfdwdxZejj
rejBCdyddZeIej@jjjRdzd{ ZeIej@jjjRd|d} Z ejj!	rejBCd~ddZ"eIej@j#j$dd Z%ejBCdddZ&eIej@j'j(jReIej@j'j)jRdd Z*eIej@j'j(j+dd Z,eIej@j'j-jReIej@j'j-j.dd Z/eIej@j'j-j+eIej@j'j-j0dd Z1eIej@j'j2jReIej@j'j3jRdd Z4ejBCdddZ5eIej@j6j7				 dzddZ8eIej@j6j9dd Z:dd Z;eIeAj<jR			 	-	d{ddZ=dd Z>eIeAj?jRdd Z@eIeAjAe4 			 	-	d{ddZBeIeAjCe4d3dd ZDeIeAjEjRdd ZFeIeAjGjRdd ZHeIeAjIjRdd ZJeIeAjKe4d3dd ZLdedefddZMeIeAjNe4dd5dd ZOeIeAjPe4d3dd ZQeIeAjRe4dd5dd ZSeIeAjTe4d3dd ZUeIeAjVjdkddZWeIeAjXjReAjXjSge4 dd ZYeIeAjZjReAjZjSge4 d%ddebfddZZeIej@jAj[jRej@jAj[jSge4 dd Z[eIeAj\jeAj]jgdd Z^eIeAj_jRgdd Z`eIeAjajReAjajSge4d-d!djdjdPddZbeIeAjcjgdÐdĄ ZdeIeAjejReAjfjRgdddŜdƐdǄZgeIeAjhjRgdddŜdȐdɄZieIeAjjge4 dʐd˄ ZkeIeAjlgd̐d̈́ ZmeIeAjngdΐdτ ZoeIeAjpgdАdф ZqeIeAjrgdҐdӄ ZseIeAjtgdԐdՄ ZtdebdebdebfdؐdلZudڐdۄ ZveIeAjwgd`ee fdܐd݄ZxeIeAjygdސd߄ ZzeIeAj{gdd Z|eIeAj}jRdd Z~eIeAje4 dd ZeIeAjjR	 	 	 		 	%d|ddZeIeAjjRdd ZdiddZeIeAjjReAjjSge4 d}ddddZeIeAjjReAjjRgdd ZeIeAjjeAjjeAjjeAjjeAjjReAjjge4d4d5d~ddZeIeAjjRdd ZeIeAjjRdd ZeIeAjjRdd ZeIeAjjeAjjeAjjeAjjeAjjReAjjReAjjRgdd ZeIeAjjeAjjeAjjeAjjgdddZeIeAjjReAjjgdd Zdd  ZeIeAjjeAjjgdd ZeIeAjjeAjjgdd ZeIeAjjRdd ZeIeAjjeAjjgdd ZeIeAjjeAjjgd	d
 ZeIeAjjRdd ZeIeAjje4 ddefddZeIeAjge4 	dddZeIeAjg	dddZeIeAjg	dddZeIeAjjReAjjRgdhddZeIeAjjdd ZeIeAjjRdd ZeIeAjdd ZeIeAje4 dd  ZeIeAjd!d" ZeIeAjjRdhd#d$ZeIeAjjRd%d& Zǐdmd'd(ZeIeAjjRd)d* ZeIeAjjd+d, Zːd-d. Z̐d/d0 Z͐d1d2 Zΐd3d4 Z	 dhd]ed5ebd6ebd7ebd8ebd9ebd:ebd;ebd<ebd=ebd>ebd?ebd@ebdAebdBebdCebdDebdEebdFebdGebdedHecf,dIdJZАdKdL Zd]eded5ebd6ebd7ebd8ebd9ebd:ebd;ebd<ebd=ebd>ebdBebdCebdDebdEebdFebdGebdef&dMdNZҐdOdP ZeIeAjjRdQdR ZeIeAjjR				 dzdSdTZeIeAjjRdUdV ZeIeAjڃe4dd5				 dzdWdXZeIeAj܃e4d3dYdZ Zd]ed[efd\d]ZG d^d_ d_eZd]ed[ed`ebfdadbZeIeAjjRdcdd ZeIeAje4 dedf ZeIeAje4d3dgdhdi ZeIeAjjRgdjdk ZeIeAjjR					ddldmZeIeAjjReAjjSge4 ddddd dndodpZeIeAjjReAjjSge4 ddddd dndqdrZeIeAjjbdsdt ZeIeAjjRdudv ZeIeAjjRddwdxZdid=ebdyebdzecfd{d|Zd}d~ Zdd ZeIeAjjRdhddZdhddZdkddZdd ZdkddZdddZeIeAjjRdd ZeIeAjdd ZeIeAjj eAjjeAjjeAjjge4 dkddZeIeAjj eAjjeAjjeAjjgdkddZeIeAjg		 	 	ddededededecdecdee fddZdedeebdf fddZeIeAj	g		 	 	ddedededee decdedecdecdee fddZ
eIeAjg			 	 	ddedededee dedecdecdee fddZeIeAjg	dkdededededededededebdebdedecdededee fddZeIeAjg		 		ddededededecdee dee fddZeIeAjg		dmdedededededededecdee dee fddZeIeAjg		 	ddedededee decdecdee fddZeIeAjg	 	ddededededee dedededededeaec decdee fddZeIeAjg	dkdedededededededededededebdebdedecdee f ddZeIeAjg					ddedededee dee debdebdedecdecdee deeb deeb dee dee fdÐdĄZeIeAjg			djdededededededededebdebdedecdededee deeb deeb f"dŐdƄZeIeAjg	 				ddededed`ee dee dee deeb deeb dedebdecdee dee dee deeb fdϐdЄZeIeAjg			 ddedededed`ee dee dee dej dej dededededebdecdee deeb decf$dԐdՄZ!eIeAj"jRg				 dd<ejdgejdejdejd`eej deej dbeej decfdڐdۄZ#eIeAj$j%eAj$j&ge4 didܐd݄Z'eIeAj(j%didސd߄Z)eIeAj*jReAj*jSge4 dhddGddZ+dd Z,dd Z-eIeAj.jReAj/jRgdkddZ.eIeAj0jReAj1jRgdmddZ0eIeAj2jReAj3jRg		dmdedeeebej f  deeebej f  dee dee f
ddZ2eIeAj4jReAj5jRgdjddZ4eIeAj6jReAj6j7eAj6jeAj6j8gdddZ9dd Z:eIeAj;jR		dmddZ<eIeAj=jRdd Z=eIeAj>jRdd Z>dd Z?dd Z@eIeAjAjReAjBjRgd}d dZCeIeAjDjRdddZDeIeAjEjRdddZFeIeAjGe4 	dddZHeIeAjIjReAjIjge4d4d5d~dd	ZJejKZLd
d ZMeIeAjNjRdd ZNeIeAjOjRdd ZOeIeAjPjRdd ZQeIeAjRjRdd ZReIeAjSjeAjSjTge4 d d dddZUeIeAjVge4 dddZWeIeAjXjReAjYjRg		dmddZZeIeAj[jRg		dmddZ\eIeAj]jRdd Z]eIeAj^jReAj^jSge4 djd d!Z^eIej@jAj_d"d# Z_eIej@jAj`d$d% Z`eIeAjae4 d d ddd&d'd(Zbd)d* ZceIeAjdd+d, ZeeIeAjf	%dd-d.ZgeIeAjh	%dd/d0ZieIeAjj	%dd1d2ZkeIeAjle4 d d d3d4d5ZmeIeAjne4 d6ebd<edefd7d8ZoeIeAjpd<efd9d:ZqeIeAjre4d-d!d<edefd;d<ZreIeAjse4 d<edefd=d>Zsd?d@ Zt					 ddAedBedeej deej dCee d`ee deej dbeej decfdDdEZueIeAjve4 			djdAedBedCee d`ee dbeej defdFdGZweIeAjxjRg					 ddAejdBejdejdejdCeej d`eej deej dbeej decfdHdIZyeIeAjze4 dJed=ebdKecdefdLdMZ{eIeAj|e4 ddNdOZ}eIeAj~e4 	%	 	 dd^ed5edPebdQecdRecdefdSdTZ~eIeAjjR	dd4edeae dUeaeb dVefdWdXZdYdZ Zd[d\ ZeeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj d]d^ ZeIeAje4 d_d` ZeIeAje4 djdadbdcZeIeAje4 djdadddeZeeAjZeeAjZeeAjZd dl5Zd dlZd dlZdfdg Ze  dS (      N)Sequence)Enum)reducewraps)CallableOptionalTypeVarUnion)	ParamSpec)SymBoolSymFloatTensor)_add_op_to_registry_convert_out_paramsglobal_decomposition_table
meta_table)
OpOverload)_prim_elementwise_meta$ELEMENTWISE_PRIM_TYPE_PROMOTION_KINDview_of)BoolLikecorresponding_complex_dtypecorresponding_real_dtypedefinitely_contiguouselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KIND	FloatLikeIntLikeis_contiguousmake_contiguous_strides_forNumbersuggest_memory_format
TensorLike)_maybe_convert_to_dtype_maybe_resize_out_resize_output_check_safe_copy_outout_wrapper)_broadcast_shapes_maybe_broadcast)_config)_pytree_T_PatenIMPLMeta   returnc                    s    fdd}|S )Nc                    s$   t    fdd}t|  S )Nc                    s   t t|   d S N)r   r   opfn T/var/www/html/scripts/venv/lib/python3.10/site-packages/torch/_meta_registrations.pyregisterA      z0register_meta.<locals>.wrapper.<locals>.register)r   pytree	tree_map_)r7   r:   r4   r6   r9   wrapper>   s   zregister_meta.<locals>.wrapperr8   )r5   r>   r8   r4   r9   register_meta=   s   	r?   type_promotionc                    s>   t j|d| i\}  fdd|D }t| }t|dtjiS )Ntype_promotion_kindc                    s   g | ]}t | qS r8   )r#   .0xresult_dtyper8   r9   
<listcomp>S       z$elementwise_meta.<locals>.<listcomp>r@   )utilsr   r)   r   r   DEFAULT)r@   args_r8   rE   r9   elementwise_metaJ   s   
rM   c                 C   s(   t jt jt jt jt jt ji}|| | S r3   )torch	complex32halfcfloatfloatcdoubledoubleget)dtypefrom_complexr8   r8   r9   toRealValueType^   s
   rX   c                    s2   t tg|R   t k fdd d S )Nc                         d d  S )Nzoutput with shape z# doesn't match the broadcast shape r8   r8   broadcasted_shape
self_shaper8   r9   <lambda>k       z)check_inplace_broadcast.<locals>.<lambda>)tupler(   rN   _check)r\   
args_shaper8   rZ   r9   check_inplace_broadcastg   s
   rb   Fc	           	         s  t tjrt dkdd  t tjr$t dkdd  tdd fD rMtt  d u r> ntt	 fdd npRt t tj
s[J tt tfdd t tsqJ tdkd	d  tjf|d
||dS )Nr   c                   S      dS Nz:linspace only supports 0-dimensional start and end tensorsr8   r8   r8   r8   r9   r]          z(meta_linspace_logspace.<locals>.<lambda>c                   S   rc   rd   r8   r8   r8   r8   r9   r]      re   c                 s   s    | ]}t |tV  qd S r3   )
isinstancecomplex)rC   argr8   r8   r9   	<genexpr>       z)meta_linspace_logspace.<locals>.<genexpr>c                         d  d S )Nzlinspace(): inferred dtype z& can't be safely cast to passed dtype r8   r8   )default_complex_dtyperV   r8   r9   r]      r^   c                      s*   dt j dt  j dt j dS )Nz4received an invalid combination of arguments - got (, ))type__name__r8   )endstartstepsr8   r9   r]      s    c                   S   rc   )Nz$number of steps must be non-negativer8   r8   r8   r8   r9   r]      re   metarV   layoutdevice
pin_memoryrequires_grad)rf   rN   r   r`   dimanyrI   r   get_default_dtypeis_complex_dtyperV   _check_typer   empty)	rr   rq   rs   baserV   rw   rv   rx   ry   r8   )rl   rV   rq   rr   rs   r9   meta_linspace_logspaceo   sH   

r   c                    sN   t  jt jk fdd t |  dko  dk dd  |  jS )Nc                         d j  S )Nz2take(): Expected a long tensor for index, but got rV   r8   indexr8   r9   r]          zmeta_take.<locals>.<lambda>r   c                   S   rc   )Nz*take(): tried to take from an empty tensorr8   r8   r8   r8   r9   r]      re   )rN   r`   rV   long_check_indexnumel	new_emptyshape)selfr   r8   r   r9   	meta_take   s   

r   rz   c                   sh   j }j }t||kdd  t dko dk fdd tjj}|S )Nc                   S   rc   )Nz=linalg.cross: inputs must have the same number of dimensions.r8   r8   r8   r8   r9   r]      re   zlinalg_cross.<locals>.<lambda>r1   c                      s"   d  d   d   S )Nzlinalg.cross: inputs dimension z must have length 3. Got  and sizer8   rz   otherr   r8   r9   r]      s
   )ndimrN   r`   r   r(   r   r   )r   r   rz   x_dy_d	out_shaper8   r   r9   linalg_cross   s   
r   c                    s  ddl m mm}  fdd}fdd}t| dkr%dgt| S ttj| d}||dk}|r=||| |r=|S dgt| }	|rstt|d ddD ] }
|
t|d kr_d|	|
< qPt	||
d  d|	|
d   |	|
< qP|	S t|d }
|d }d}d}tt| d ddD ]m}|| | 9 }|dks|| |d  dkr|||d  || kr|
dkr|||k s|||
 dkr|| |	|
< |||
 9 }|
d8 }
|
dkr|||k s|||
 dks|||kr d S |dkr||d  }d}d}q|
dkrd S |	S )	Nr   )guard_or_falseguard_or_truesym_eqc                       r | S | S r3   r8   rD   )r   size_obliviousr8   r9   maybe_guard_or_false      z-_compute_stride.<locals>.maybe_guard_or_falsec                    r   r3   r8   r   )r   r   r8   r9   maybe_guard_or_true   r   z,_compute_stride.<locals>.maybe_guard_or_true   r   )
%torch.fx.experimental.symbolic_shapesr   r   r   lenr   operatormulrangemax)	old_shape
old_stride	new_shaper   r   r   r   r   
zero_numel
new_strideview_dchunk_base_stridetensor_numel
view_numeltensor_dr8   )r   r   r   r9   _compute_stride   sj   


r   c                    sV   ddl m  t fdd|  D p*t fdd|  D p*t fdd|D S )Nr   has_hintc                 3       | ]} | V  qd S r3   r8   rC   sr   r8   r9   ri     rj   z+_view_has_unbacked_input.<locals>.<genexpr>c                 3   r   r3   r8   r   r   r8   r9   ri     rj   c                 3   r   r3   r8   r   r   r8   r9   ri     rj   )r   r   r{   r   stridear   r8   r   r9   _view_has_unbacked_input  s   r   Tc                    s  ddl m}m} tjddt   jdkr; }D ]}t	|dk tj
|d}q | u r9t S |S tdkra } jD ]}t	|dk tj
|d}qF| u r_t S |S ttjd}t	  |k fdd tt jkr|| jrt S |rt rnt rt} |S t    |d	}	|	d ur |	S |rtjjjjst rt dd
S d j d   d d}
t |
)Nr   )r   r   F)validater   r   c                         d j  d dS )Nz&Could not reshape a tensor with shape  as a tensor with shape !r   r8   r   r8   r9   r]   E      z%_view_unbacked_meta.<locals>.<lambda>)r   )size_oblivious_enabledz Cannot view a tensor with shape z and strides r   r   )!r   r   r   rI   extract_shape_from_varargs
infer_sizer   r   rN   r`   _refs	unsqueezer   r   r   squeezer   r   r   r   r   r   
as_stridedr   r   r   fxexperimentalr*   backed_size_obliviousr   _view_unbacked_meta
ValueError)r   r   r   r   r   _alengthshape_numelstridesnew_stridesmsgr8   r   r9   r   !  sT   


"

r   c                 G   s:   t jjjjst| |rt| |S t jj| g|R ddiS )N
allow_copyF)	rN   r   r   r*   r   r   r   r   _reshape_view_helperr   r8   r8   r9   
_view_metae  s
   
r   c                 C   s$   t | d t| d tj| tjdS )Nzlinalg.matrix_expmemory_format)squareCheckInputscheckFloatingOrComplexrN   
empty_likecontiguous_formatr   r8   r8   r9   linalg_matrix_expo  s   

r   valuesindicesc                 C   sV   t j| j| j| jd}t j| j| jt jd}|  dkr'| jdkr't|| j ||fS )Nrw   rV   r   )	rN   r   r   rw   rV   int64r   r   maybe_wrap_dim)r   rz   r   r   r8   r8   r9   	cummaxminw  s
   r   c                 C   s   t || j tj| tjdS Nr   )r   r   rN   r   r   )r   rz   r8   r8   r9   logcumsumexp  s   r   c                   s  |j }t|}|| }tt|}dd t|D }	|D ]}
d|	|
< qg g }}|D ]}
|	|
 s6||
 q*||
 q*|| }t|}|  |d | }|j fdddd |||d   }||}dgt|j|d   }|	|}|
d}||d< t|}tt|D ]}|||  ||d	 < q| j|tjd
 dd t|D }d	}|d	 }|dkr|| d ||| < ||||  9 }|d	8 }|dkst||D ]}| d	||  ||| < q| |||   | S )Nc                 S      g | ]}d qS Fr8   rC   rL   r8   r8   r9   rG     r^   z_exec_fft.<locals>.<listcomp>Tc                        |  S r3   r8   r   self_stridesr8   r9   r]         z_exec_fft.<locals>.<lambda>keyreverser   r   r   r   c                 S   r   r   r8   r   r8   r8   r9   rG     r^   )r   r   listr   appendr   sortpermuter   reshaper   resize_rN   r   as_strided_storage_offset)outr   	out_sizesrz   forwardr   signal_ndim
batch_dimsdim_permuteis_transformed_dimdleftright	batch_endtmpinputbatched_sizes
batch_sizebatched_out_sizesiout_stridesbatch_numelr8   r   r9   	_exec_fft  sN   




r  r   rz   exclude_lastc                    s<   t |}|   |d t|t|  j fddd |S )Nc                    r   r3   r8   r  r   r8   r9   r]     r   z_sort_dims.<locals>.<lambda>)r   )r   r   r   intr   )r   rz   r  sorted_dimsr8   r   r9   
_sort_dims  s   
r  c                 C   sH   t | jj |s|  S t| |}| |  }t|| |  ||dS )Nr   )	rN   r`   rV   
is_complexcloner  r   r   r  )r   rz   normalizationr   r  r   r8   r8   r9   meta_fft_c2c  s   
r  c                 C   s8   t | tkst | dkr| d dkr| d dkrdS dS )N   r   r   FT)r   cufft_max_ndimr   r8   r8   r9   use_optimized_cufft_path  s   0r  c                    s  t | jj t|  }t|}|d }|| d d }t|}|||< |r+|||< t| dks7t| dkr| j|t	| jd}	| }
t| dkrXt
|rXt|	|
||dd ngt|dkr`|n|}t|	|
||gdd t|dkr}| j|t	| jd}
|d d }|r|
|	}	}
|
  |j fd	d
dd ttt|}|t|| d  }t|	|
||dd |d t||  }|s|s|	||| kr|
j|t jd |
}	|	S | j|t	| jdS )Nr   r  r   cudaxpur   Tr  c                    r   r3   r8   r  r   r8   r9   r]     r   zmeta_fft_r2c.<locals>.<lambda>r   r   )rN   r`   rV   is_floating_pointr   r   device_hintr   rI   r   r  r  r   r   r   minr  r   r   )r   rz   r  onesidedinput_sizesr   last_dimlast_dim_halfsizeonesided_sizesoutputworking_tensortarget_sizesr  max_dims	last_dimsr8   r  r9   meta_fft_r2c  sX   

r-  )	generatorc                C   s   t |t| gS r3   )r$   rN   Size)nr.  r   r8   r8   r9   meta_randperm%  s   r1  rV   rv   rw   rx   c                C      t j| ||||dS Nr2  rN   r   )r0  rV   rv   rw   rx   r8   r8   r9   meta_randperm_default*  s   	
r6  c                   s2   dt  k fdd t j|||||dS )Nr   c                      rY   Nz:random_ expects 'from' to be less than 'to', but got from=z >= to=r8   r8   highlowr8   r9   r]   F  r^   zmeta_randint.<locals>.<lambda>r2  rN   r`   r   )r9  r   rV   rv   rw   rx   r8   r8  r9   meta_randint8  s   
r<  c                   s.   t  k fdd t j|||||dS )Nc                      rY   r7  r8   r8   r8  r8   r9   r]   [  r^   z"meta_randint_low.<locals>.<lambda>r2  r;  )r:  r9  r   rV   rv   rw   rx   r8   r8  r9   meta_randint_lowM  s   
r=  c                C   r3  r4  r5  )r   rV   rv   rw   rx   r8   r8   r9   meta_rand_defaultb  s   
r>  r  lastdimc           
      C   s*  t | jj t| dkrZt|  }|||d < | j|t| jd}t	|r5t
|| jt jd||ddS t|dkrGt| |d d d|}n| jt jd}t
||||d gddS | }t|dkrv|d d }t| ||dd}|dd  }t| }|||d < | j|t| jd}	t
|	|||ddS )	Nr  r   r   r   Fr  r   r   )rN   r`   rV   r  r!  r   r   r   rX   r  r  r  r   r   r  )
r   rz   r  r?  r   r(  tempr  c2c_dimsr   r8   r8   r9   meta_fft_c2rj  s4   	rB  c                 C   sf   ddl m} || st| dkrtdt|tr1|| |}|  | kr1t	j
||   | S )Nr   )free_unbacked_symbolsr   zQmore than one element of the written-to tensor refers to a single memory location)r   rC  rN   _debug_has_internal_overlapRuntimeErrorrf   r   tor   r.   expand_copydefault)r   srcnon_blockingrC  intermediater8   r8   r9   
meta_copy_  s   
rL  c                 C   sX   t |  }t |  }||  krdn|| ||  }||d ||| ||fS Nr   )r   r   r   rz   insert)tensorrz   result_sizesresult_stridesr   r8   r8   r9   inferUnsqueezeGeometry  s    rR  c                 C   s0   t ||  d }t| |\}}| || | S rM  )r   rz   rR  r   )r   rz   g_sizes	g_stridesr8   r8   r9   meta_unsqueeze_  s   rU  r  weight_metabias_activation_opt	out_dtypec           	      C   s   t | j}|d ur|d|dksJ d|d| dd ks%J |d|d< t| jdks7J dd| df}|d urQ| jtjkrM|tjksQJ d| j||d u r[| jn|d	||}|S )	Nr   zoutput size mismatchr   r   r  z*we can only handle the squashed input case9out_dtype is only supported for i8i8->i32 linear operatorr   )
r   r   r   r   rV   rN   int8int32r   r   )	r  rV  rW  rX  rY  rZ  output_sizestransposed_stridesr(  r8   r8   r9   meta_sparse_structured_linear  s$   
	r`  mat1	mat1_metamat2c                 C   s   t | jdks	J t |jdksJ t |jdksJ | d|dd ks)J | d|dg}|d urF|jtjkrB|tjksFJ d|j||d u rP|jn|d}|S )Nr  r   r   r[  r   r   r   r   rV   rN   r\  r]  r   )ra  rb  rc  rZ  r^  r(  r8   r8   r9   meta_sparse_structured_mm  s   re  r   )alphabetarZ  c          	      C   s   t | jdksJ dt |jdksJ t |jdksJ t |jdks&J | d|dks4J d|d|dd ksBJ |d|dg}|d ur_|jtjkr[|tjks_J d|j||d u ri|jn|d}|S )Nr   zEonly input broadcasted to columns of mat1 * mat2 product is supportedr  r   r[  r   rd  )	r  ra  rb  rc  rf  rg  rZ  r^  r(  r8   r8   r9   meta_sparse_structured_addmm  s(   rh  compressed_Adense_Brf  transpose_resultalg_idsplit_ksplit_k_modec	                 C   s  |j tjtjtjtjtjhv sJ d| j |j ksJ dt|jdks(J d| j tjtjfv }	|	r5dnd}
|	rA|	 rAJ d|
d}|
d	}|  d
 |
|  }|d urb||
dksbJ |d urx|	rt|tjtjtjtjhv sxJ d|r~||fn||f}|j||dS )Nz;_cslt_sparse_mm only supports fp16, bf16, int8, and fp8e4m3zinputs must have the same dtyper  z'_cslt_sparse_mm only supports 2d inputs
   	   z.dense input must be transposed for 8bit dtypesr   r      z\out_dtype is not supported for {compressed_A.dtype} x {dense_B.dtype} -> {out_dtype} matmul!r   )rV   rN   float32float16bfloat16r\  float8_e4m3fnr   r   r   r   r   r]  r   )ri  rj  rX  rf  rZ  rk  rl  rm  rn  is_8bit_input_typecompression_factorkr0  moutput_shaper8   r8   r9   meta__cslt_sparse_mm  sB   


r{  )include_selfr   sourcer   r|  c                C      t j| t jdS r   rN   r   r   r   rz   r   r}  r   r|  r8   r8   r9   meta_index_reduceL  s   
r  c                C      | S r3   r8   r  r8   r8   r9   meta_index_reduce_Y  s   
r  c                 C   s.   t |  }|  dkr| ||< | |S Nr   )r   r   rz   r   r   )r   rz   r   result_sizer8   r8   r9   meta_index_selectg  s   
r  )lengthsr   offsetsaxisunsafeinitialdatar  r  r  r  c          
         sf   |d urt d fdd}|d ur||jS |d ur/|jd d |jd d f }	||	S td)Nz?segment_reduce(): indices based reduction is not supported yet.c                    s(   t j| j d d   jdt jdS )Nr   rt   rV   rw   r   )rN   r   r   rV   r   )lengths_shaper  r  r8   r9   segment_reduce_lengths_tensor  s   z:meta_segment_reduce.<locals>.segment_reduce_lengths_tensorr   r   z<segment_reduce(): Either lengths or offsets must be defined.)NotImplementedErrorr   rE  )
r  r   r  r   r  r  r  r  r  r  r8   r  r9   meta_segment_reducep  s   
r  c                 C   
   |  dS Nr8   r   r   r8   r8   r9   meta_max     
r  c                 C   6   t | j|f}t| ||}| || j|tjdfS Nr   rI   reduction_dimsr   _compute_reduction_shaper   rN   r   r   rz   keepdimrz  r8   r8   r9   meta_max_dim  
   r  c                 C   r  r  r  r   r8   r8   r9   meta_min  r  r  c                 C   r  r  r  r  r8   r8   r9   meta_min_dim  r  r  c                 C   s4   |   r
t| j}n	t| tjd\}}tj| |dS NrA   r   )r  r   rV   r   r   INT_TO_FLOATrN   r   )r   rF   rL   r8   r8   r9   
meta_angle  s   
r  c                 C   s$   t ||  | j |t | S r3   )rN   _resize_output_r   rw   copy_angle)r   r   r8   r8   r9   meta_angle_out  s   r  c                 C      d S r3   r8   )valr8   r8   r9   assert_async     r  c                 C   r  r3   r8   )r  
assert_msgr8   r8   r9   assert_async_meta  r  r  c                 C   r  r3   r8   )r   r8   r8   r9   
print_meta  r  r  rV   rv   rw   rx   r   c                 C   s   t jdddS )Nr   rt   rw   r5  r  r8   r8   r9   make_dep_token  s   	r  c                 C   s4   ddl m} t| ttfrtd|| ||d d S )Nr   )constrain_range'Constraining SymFloat or Symbool is nyir"  r   )r   r  rf   r   r   r   )r   r"  r   r  r8   r8   r9   sym_constrain_range  s   r  c                 C      t j| ||d |S Nr  )r.   r  r   r"  r   	dep_tokenr8   r8   r9   functional_sym_constrain_range     r  c                 C   s   ddl m} |d u r|d u rt|  d S t| ttfr tdt| t	u r>|d ur1t
| |k |d ur<t
| |k d S || ||d d S )Nr   )_constrain_range_for_sizer  r  )r   r  rN   _check_is_sizerf   r   r   r   ro   r  r`   )r   r"  r   r  r8   r8   r9   sym_constrain_range_for_size  s   
r  c                 C   r  r  )r.   r  r  r8   r8   r9   'functional_sym_constrain_range_for_size  r  r  c                 C   s   |S r3   r8   )r  r  r  r8   r8   r9   functional_assert_async_meta  r  r  f_namec                 C   sX   |   dksJ | d| d| dks*J | d| d d| d dd S )Nr  z3: The input tensor must have at least 2 dimensions.r   z5: A must be batches of square matrices, but they are  by 	 matrices)rz   r   )r   r  r8   r8   r9   r     s    r   Anamec                    s   t j jk fdd t j jk fdd t  d dk fdd t  ddk fdd d S )Nc                         dj  d j  dS )Nz:Expected b and A to be on the same device, but found b on z
 and A on 	 instead.r  r8   r  r   r8   r9   r]      
   z(linearSolveCheckInputs.<locals>.<lambda>c                      r  )Nz=Expected b and A to have the same dtype, but found b of type z and A of type r  r   r8   r  r8   r9   r]   (  r  r   r  c                      s   d  d d  d dS )Nz3A must be batches of square matrices, but they are r  r  r   r  r   r8   r  r8   r9   r]   0  s
   c                      s:   d d  d d  d d d d d 
S )NzIncompatible matrix sizes for z: each A matrix is r   r  z but each b matrix is r  r   r8   r  r  r   r8   r9   r]   8  s   )rN   r`   rw   rV   r   )r   r  r  r8   r  r9   linearSolveCheckInputs  s    


r  tallow_low_precision_dtypesc                    s^   | j  t|  p|   fdd |s-t tjtjtjtjfv  fdd d S d S )Nc                          d  S )Nz<: Expected a floating point or complex tensor as input. Got r8   r8   rV   r  r8   r9   r]   I      z(checkFloatingOrComplex.<locals>.<lambda>c                      r  )Nz*: Low precision dtypes not supported. Got r8   r8   r  r8   r9   r]   N  r  )	rV   rN   r`   r   r  rR   rT   rQ   rS   )r  r  r  r8   r  r9   r   A  s   r   arg_namec                    s"   t |  dk fdd d S )Nr  c                          d  dS )Nz: The input tensor z! must have at least 2 dimensions.r8   r8   r  r  r8   r9   r]   V  r^   zcheckIsMatrix.<locals>.<lambda>)rN   r`   rz   )r  r  r  r8   r  r9   checkIsMatrixS  s   
r  Br  c                    sZ   t   t tr ddkn	 ddk fdd d S )Nr  r   c                      sH    drdnd d  d d  d d d d d d	S )
Nz2: Incompatible shapes of A and B for the equation zAX = BzXA = Bz (r  rD   r   r   rn   r   r8   r  r  r  r  r8   r9   r]   _  s   
z#checkInputsSolver.<locals>.<lambda>)r   r  rN   r`   r   )r  r  r  r  r8   r  r9   checkInputsSolverZ  s   

*r  resultfn_nameresult_namec                    s&   t jjk fdd d S )Nc                	      s$     d d dj  dj  	S )Nz: Expected z5 and input tensors to be on the same device, but got z on z and input on r  r8   r  r  r  r  r8   r9   r]   o  s   z!checkSameDevice.<locals>.<lambda>)rN   r`   rw   )r  r  r  r  r8   r  r9   checkSameDeviceg  s   
r  UPLOc                    s8      }tt dko|dkp|dk fdd d S )Nr   ULc                      
   d  S )Nz1Expected UPLO argument to be 'L' or 'U', but got r8   r8   r  r8   r9   r]   z     
 zcheckUplo.<locals>.<lambda>)upperrN   r`   r   )r  UPLO_uppercaser8   r  r9   	checkUplov  s
   
r  eigenvalueseigenvectorsr  	compute_vc                 C   sp   t | d t| t| j}|r | |}||t|dd n| dg}|  | j|t| j	d}||fS )Nzlinalg.eighF	row_majorr   r   )
r   r  r   r   r   r   r   poprX   rV   )r  r  r  r   vecsvalsr8   r8   r9   meta__linalg_eigh~  s   


r  c                 C   s@   t | d t| jr| jnt| j}| j| jd d |dS )Nzlinalg.eigvalsr   r   r   rI   r}   rV   r   r   r   )r  complex_dtyper8   r8   r9   meta__linalg_eigvals  s   


r  c                 C   sX   t | d t| jr| jnt| j}| j| jd d |d}| j| j|d}||fS )Nz
linalg.eigr   r   r  )r  r  r   vectorsr8   r8   r9   meta_linalg_eig  s   


r  rI  c                 C   s   | j jtjdddS )Nr   r  r   )mTr  rN   r   	transpose)rI  r8   r8   r9   cloneBatchedColumnMajor     r  r  c                 C   s   t | S r3   )r  )r   r  r  r8   r8   r9   _cholesky_solve_helper  s   r  c                    sP   t jdkfdd t  jdk fdd t d\}}t|||S )Nr  c                         d j  dS )Nz-b should have at least 2 dimensions, but has  dimensions insteadr   r8   r   r8   r9   r]     r  z cholesky_solve.<locals>.<lambda>c                      r  )Nz-u should have at least 2 dimensions, but has r  r   r8   r  r8   r9   r]     r  cholesky_solve)rN   r`   r   !_linalg_broadcast_batch_dims_namer  )r   r  r  self_broadcastedA_broadcastedr8   r  r9   r    s   

r  c                 C   s.   |   dkrtj| tjdS t| d t| S )Nr   r   cholesky)r   rN   r   legacy_contiguous_formatr   r  r   r  r8   r8   r9   r    s   
r  c                 C   s   t | d t| S )Ncholesky_inverse)r   r  r  r8   r8   r9   r    s   
r  check_errorsc                 C   sf   t | d t| d | j}t|}t|d}| |}||| | j|d|d  tjd}||fS )Nzlinalg.choleskyFr   r  r   )	r   r   r   r   r   r   r   rN   r]  )r  r  r	  A_shaper   	L_stridesr  infosr8   r8   r9   linalg_cholesky_ex  s   



r  tauc                    s  t jdkdd  t ddkdd  t ddkdd  t jj dkfd	d jdkr[jd d }jd d  t  |k fd
d t jjkfdd tdd t jjtjddjj	dS )Nr  c                   S   rc   )NzHtorch.linalg.householder_product: input must have at least 2 dimensions.r8   r8   r8   r8   r9   r]     re   z,linalg_householder_product.<locals>.<lambda>r  r   c                   S   rc   )Nzbtorch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz`torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]r8   r8   r8   r8   r9   r]     re   r   c                         dj  d j  S )Nzptorch.linalg.householder_product: Expected tau to have one dimension less than input, but got tau.ndim equal to  and input.ndim is equal to r   r8   r  r  r8   r9   r]     
   c                      r  )Nzltorch.linalg.householder_product: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r8   r8   actual_batch_tau_shaper8   r9   r]        c                      r  )Nz,torch.linalg.householder_product: tau dtype z does not match input dtype r   r8   r  r8   r9   r]     s   
z torch.linalg.householder_productr  Fr  r   r   rV   rw   )
rN   r`   r   r   r   rV   r  empty_stridedr   rw   )r  r  expected_batch_tau_shaper8   )r  r  r  r9   linalg_householder_product  sD   


r  c                 C   s^   t | d t| ddd | | j}|| jt| jdd | j| jd d tjd}||fS )Nzlinalg.inv_exF)r  r  r  r   r   r   r   r   r   r   rN   r]  )r  r	  r  r  r8   r8   r9   linalg_inv_ex_meta  s   
r  LDpivotsinfo)	hermitianr	  r  c                C   st   t | d t| d tj| jt| jdd| j| jd}| j| jd d tj	d}| j| jd d tj	d}|||fS )Nztorch.linalg.ldl_factor_exFr  r  r   r   r  )
r   r   rN   r  r   r   rV   rw   r   r  )r   r  r	  r  r  r  r8   r8   r9   linalg_ldl_factor_ex_meta+  s   


r   )r  c                   s   t d td t d t jdk fdd jd d }t|jkfdd ttj	fdd tj	 j	k fdd t
 \}}tj|t|d	d
 j	 jdS )Nztorch.linalg.ldl_solver  c                      r  )NzMtorch.linalg.ldl_solve: Expected B to have at least 2 dimensions, but it has r  r   r8   )r  r8   r9   r]   N     z'linalg_ldl_solve_meta.<locals>.<lambda>r   c                      r  )Nzjtorch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, but got pivots with shape  insteadr   r8   r  r8   r9   r]   V  r!  c                      r   )Nz<torch.linalg.ldl_solve: Expected pivots to be integers. Got r   r8   r#  r8   r9   r]   ]  r   c                      r  )Nz!torch.linalg.ldl_solve: LD dtype z does not match b dtype r   r8   )r  r  r8   r9   r]   a  r   Fr  r  )r   r   r  rN   r`   r   r   rI   is_integer_dtyperV   _linalg_broadcast_batch_dimsr  r   rw   )r  r  r  r  expected_pivots_shapeB_broadcast_sizerL   r8   )r  r  r  r9   linalg_ldl_solve_meta@  s6   
	






r(  Pr  )pivotr*  c          	         s   t  jdk fdd t j}|d }|d }t||}||d< |r+ |}n dg}||d<  |}||d< ||d<  |}|||fS )Nr  c                      r  )Nz@linalg.lu: Expected tensor with 2 or more dimensions. Got size: r"  r   r8   r  r8   r9   r]   q  r  z linalg_lu_meta.<locals>.<lambda>r  r   r   )rN   r`   r   r   r   r"  r   )	r  r*  sizesry  r0  rx  r)  r  r  r8   r  r9   linalg_lu_metal  s$   





r,  LU)r*  r	  c          	         s   t  jdk fdd t j}|d }|d }t j|t|dd j jd}|	  t
|||d<  j|t jd	}|	   j|t jd	}|||fS )
Nr  c                      r  )NzFtorch.lu_factor: Expected tensor with 2 or more dimensions. Got size: r"  r   r8   r  r8   r9   r]     r  z*linalg_lu_factor_ex_meta.<locals>.<lambda>r  r   Fr  r  r   )rN   r`   r   r   r   r  r   rV   rw   r  r"  r   r  )	r  r*  r	  r+  ry  r0  r-  r  r  r8   r  r9   linalg_lu_factor_ex_meta  s&   



r.  )r  adjointr/  c                   s   t d tj jk fdd tjtjkdd  td t |d tddkdd  tjd d jkfdd t	 \}}tj
|t|| d	 j jd
}| dkru|su| ru| }|S )Nztorch.linalg.lu_solvec                      r  )NzPlinalg.lu_solve: Expected LU and B to have the same dtype, but found LU of type  and B of type r"  r   r8   )r  r-  r8   r9   r]     r  z&linalg_lu_solve_meta.<locals>.<lambda>c                   S   rc   )NzElinalg.lu_solve: pivots should be a Tensor of scalar type torch.int32r8   r8   r8   r8   r9   r]     re   zlinalg.lu_solver   c                   S   rc   )NzYlinalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrixr8   r8   r8   r8   r9   r]     re   c                      r  )Nzclinalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, but got pivots with shape r"  r   r8   r#  r8   r9   r]     r!  r  r  r   )r   rN   r`   rV   r  r   r  r   r   r%  r  r   rw   r   r  conj)r-  r  r  r  r/  r'  rL   r  r8   )r  r-  r  r9   linalg_lu_solve_meta  s<   




r2  unpack_dataunpack_pivotsc                    s   t  jdk fdd |rt |jt jkdd  t j}|d }|d }t||}||d< |r9 |}n dg}|rX||d<  |}	||d< ||d<  |}
n dg}	 dg}
||	|
fS )Nr  c                      r  )NzFtorch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: r"  r   r8   r-  r8   r9   r]     r  z lu_unpack_meta.<locals>.<lambda>c                   S   rc   )Nztorch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.
Note: this function is intended to be used with the output produced by torch.linalg.lu_factorr8   r8   r8   r8   r9   r]        r  r   r   )	rN   r`   r   rV   r]  r   r   r"  r   )r-  r  r3  r4  r+  ry  r0  rx  r)  r  r  r8   r5  r9   lu_unpack_meta  s4   





r7  modec                    sd    dkrd}d}||fS  dkrd}d}||fS  dkr$d}d}||fS t d fdd ||fS )NreducedTcompleteFrc                         d  dS )Nzqr received unrecognized mode 'z=' but expected one of 'reduced' (default), 'r', or 'complete'r8   r8   r8  r8   r9   r]     s   z _parse_qr_mode.<locals>.<lambda>rN   r`   )r8  	compute_qr9  r8   r=  r9   _parse_qr_mode  s"   	
r@  QRr9  c                 C   s   t | d t| d t|\}}| jd }| jd }t||}|r>t| j}|r*|n||d< | |}||t|dd n| dg}t| j}	|sM|sO|n||	d< | |	}
|
|	t|	dd ||
fS )Nz	linalg.qrr  r   Fr  r   )	r  r   r@  r   r"  r   r   r   r   )r  r8  r?  reduced_modery  r0  rx  Q_shaperA  R_shaperB  r8   r8   r9   linalg_qr_meta$  s"   








rF  sign	logabsdetc                 C   s   t | d t| dd | j}| |d d }| j|d d t| jd}tj|t|d| j| j	d}| j|d d tj
d}||||fS )Nzlinalg.slogdetFr  r   r  r   )r   r   r   r   rX   rV   rN   r  r   rw   r]  )r  r   rG  rH  r-  r  r8   r8   r9   _linalg_slogdet@  s   
rI  full_matrices
compute_uvdriverc                 C   s   t | d t| d t| jd d }| jd }| jd }t||}|r]|||r*|n|g }| |}	|	|t|dd ||rB|n||g }
| |
}t| dk}||
t|
|d n| dg}	| dg}| j||g t	| j
d}|	||fS )	Nz
linalg.svdr  r   Fr  r  r   r   )r  r   r   r   r"  r   r   r   r!  rX   rV   )r  rJ  rK  rL  r   ry  r0  rx  U_shaper  V_shapeVis_cudaSr8   r8   r9   _linalg_svd_metaT  s$   







rR  arg1arg2c                 C   sn   | j d d }|j d d }t||}t|}|| d| dg7 }t|}||d|dg7 }||fS )Nr  r   )r   r(   r   r   )rS  rT  arg1_batch_sizesarg2_batch_sizesexpand_batch_portionarg1_expand_sizearg2_expand_sizer8   r8   r9   r%  z  s   
r%  c                 C   sV   |rt | || t| |\}}|| jkr| n| |}||jkr"|n||}||fS r3   )r  r%  r   expand)rS  rT  r  rX  rY  arg1_broadcastedarg2_broadcastedr8   r8   r9   r    s   r  r   c                 C   s6   | j d d }|jdkp| jd |jko|j |k}|S )Nr   r   )r   r   )r  r   expected_batched_rhs_shapevector_caser8   r8   r9   linalg_solve_is_vector_rhs  s
   
r_  )r  r	  r  r-  r  r  c                   sh  t  d t jjk fdd t }|r dn}	t |	|d t|	 \}
}t|p6| dd  |rC|
d d n|
}tj|t	|| jj
d} j}tj|t	|d j j
d} j|d d tjd} j|d d	 tjd}||||f}||||f}td
d |D rt||D ]\}}t||j ||j|  t||dd q|S )Nzlinalg.solvec                         d j  dj  dS )NzKlinalg.solve: Expected A and B to have the same dtype, but found A of type r0  r"  r   r8   r  r  r8   r9   r]     r  z"_linalg_solve_ex.<locals>.<lambda>r   c                   S   rc   )Nzlinalg.solve: Vector broadcasting of the left hand side is not supported for left=False. In this case linalg.solve is equivalent to B / A.squeeze(-1)r8   r8   r8   r8   r9   r]     r6  r  Fr   r  c                 s   s    | ]}|d uV  qd S r3   r8   rB   r8   r8   r9   ri         z#_linalg_solve_ex.<locals>.<genexpr>)	copy_fromcopy_toexact_dtype)r   rN   r`   rV   r_  r   r  r%  r  r   rw   r   r   r]  allzipr$   r   r   r&   )r  r  r  r	  r  r-  r  r  r^  B_B_broad_shaperL   result_shaperesult_r   LU_pivots_info_r   resr;  or8   ra  r9   _linalg_solve_ex  sJ   



rq  )r  unitriangularr   rr  r   c          	      C   s   |d u r
|  dg}t|tsJ t| ||d t|| d \}}|dd o+| }|r6t||j	}|S t
||j	rL||ddj	 |dd |S )Nr   zlinalg.solve_triangularr  r   )r   rf   r"   r  r  r  r   is_conjr$   r   r%   r   
transpose_)	r  r  r  r  rr  r   rh  A_avoid_copy_Ar8   r8   r9   linalg_solve_triangular_meta  s   
rw  XM)re  r  c           	         s   t jdkfdd t  jdk fdd t d  jt jkrOt \}}t j|t|ddj	j
d}t j|t|dd j	 j
d}||fS  jt jks[ jt jkrjt }d	g}||fS t dd
d  ||fS )Nr  c                      r  )NzMtorch.triangular_solve: Expected b to have at least 2 dimensions, but it has r  r   r8   r   r8   r9   r]     r!  z'triangular_solve_meta.<locals>.<lambda>c                      r  )NzMtorch.triangular_solve: Expected A to have at least 2 dimensions, but it has r  r   r8   r  r8   r9   r]     r!  triangular_solveFr  r  r   c                   S   rc   )Nz+triangular_solve: Got an unexpected layout.r8   r8   r8   r8   r9   r]   (  re   )rN   r`   r   r  rv   stridedr%  r  r   rV   rw   
sparse_csr
sparse_bsrr   r   )	r   r  r  r  rr  self_broadcast_sizeA_broadcast_sizesolutioncloned_coefficientr8   r  r9   triangular_solve_meta  s<   	




r  c                 C   sp   t | d t| d | | jd d }| | j}|| jt| jdd | j| jd d tjd}|||fS )Nz
linalg.detr  Fr  r   r   r  )r  detr-  r  r8   r8   r9   _linalg_det_meta-  s   


r  c                    s  t jdkdd  t jdkdd  |rdndt j jd kfdd t j jd kfdd t jd jd kd	d  t jj d
kfdd t jjkfdd jdkrjd d }jd d t |kfdd jd d  t  |k fdd t jjkfdd t jjkfdd tdd tdd t jjtjddjjdS )Nr  c                   S   rc   )Nz3torch.ormqr: input must have at least 2 dimensions.r8   r8   r8   r8   r9   r]   E  re   zormqr.<locals>.<lambda>c                   S   rc   )Nz3torch.ormqr: other must have at least 2 dimensions.r8   r8   r8   r8   r9   r]   H  re   r  r   c                      r<  )Ntorch.ormqr: other.shape[z0] must be greater than or equal to tau.shape[-1]r8   r8   left_size_conditionr8   r9   r]   N  r   c                      r<  )Nr  z"] must be equal to input.shape[-2]r8   r8   r  r8   r9   r]   R  r   c                   S   rc   )NzHtorch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]r8   r8   r8   r8   r9   r]   W  re   r   c                      r  )Nz[torch.ormqr: Expected tau to have one dimension less than input, but got tau.ndim equal to r  r   r8   r  r8   r9   r]   \  r  c                      r  )Nzhtorch.ormqr: Expected other to have the same number of dimensions as input, but got other.ndim equal to r  r   r8   r  r   r8   r9   r]   c  r  c                      r  )NzWtorch.ormqr: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r8   r8   r  r8   r9   r]   n  r  c                      r  )NzYtorch.ormqr: Expected batch dimensions of other to be equal to input.shape[:-2], but got r8   r8   )actual_batch_other_shaper8   r9   r]   w  r  c                         d j  dj  S )NzPtorch.ormqr: Expected input and tau to have the same dtype, but input has dtype z and tau has dtype r   r8   r  r8   r9   r]     r  c                      r  )NzRtorch.ormqr: Expected input and other to have the same dtype, but input has dtype z and other has dtype r   r8   r  r8   r9   r]     r  ztorch.ormqrr  r   Fr  r  )	rN   r`   r   r   rV   r  r  r   rw   )r  r  r   r  r  expected_batch_shaper8   )r  r  r  r  r   r  r9   ormqr;  sn   	







r  c                   s   t td  k fdd j}| d k}|}| }|r3td|D ]}|o0|dk}q&ntd|D ]}|oB|dk}q8t |pI| fdd d S )Nr  c                      s   dd   dt  S )Nzpadding size is expected to be r  z, but got: r   r8   )rz   paddingr8   r9   r]         z,_padding_check_valid_input.<locals>.<lambda>r   r   c                      s    d d  d d  dj  S )N	Expected r   zD or r  zcD (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: r   r8   )rz   r  r8   r9   r]     s   )rN   r`   r   r   r   r   )r  r  rz   	input_dimis_batch_modevalid_batch_modevalid_non_batch_moder  r8   )rz   r  r  r9   _padding_check_valid_input  s$   r  c                   s   d}d d}j dkrd} d7  |d7 }t|dd |\|}   |rHtk o>k  fdd tdkfdd j dkra|fS ||fS )	Nr   r   r1   r   c                         d d d  dj  S NzcArgument #4: Padding size should be less than the corresponding input dimension, but got: padding (rm   ) at dimension 
 of input r   r8   dim_wr  pad_lpad_rr8   r9   r]        z_pad1d_common.<locals>.<lambda>c                      rk   )Nz
input (W: z%) is too small. Calculated output W: r8   r8   )input_woutput_wr8   r9   r]     r^   r  )r   r   r  rN   r`   r   )r  r  is_reflection	dim_planenbatchnplaner8   )r  r  r  r  r  r  r9   _pad1d_common  s0   




r  c                 C      t | |ddS NTr  )r  r  r  r8   r8   r9   meta_reflection_pad1d     r  c                    *   t  jt jk fdd t |ddS )Nc                         d j   dS )Nz)"replication_pad1d" not implemented for ''rV   __str__r8   r  r8   r9   r]         z(meta_replication_pad1d.<locals>.<lambda>Fr  )rN   r`   rV   boolr  r  r8   r  r9   meta_replication_pad1d  
   

r  c                   s   d |st t|dkdd  jdkr d7  |\ }|  |r=t |k o3|k  fdd t  k fdd jS )Nr   r  c                   S   rc   )Nz padding size is expected to be 2r8   r8   r8   r8   r9   r]     re   z(_pad1d_backward_common.<locals>.<lambda>r1   c                      r  r  r   r8   r  r8   r9   r]     r  c                         d d   S Nz(grad_output width unexpected. Expected: , Got: r   r8   r  grad_outputr  r8   r9   r]     rH   rN   r`   r   r   r   r   r   )r  r  r  r  r  r8   )r  r  r  r  r  r  r9   _pad1d_backward_common  s$   

r  
grad_inputc                 C      t | ||ddS r  r  r  r  r  r8   r8   r9   meta_reflection_pad1d_backward
     r  c                 C   r  )NFr  r  r  r8   r8   r9   meta_replication_pad1d_backward  r  r  c                   s2  dd d}d}t |dd j}|dkr'd}d7  d7  |d7 }|\	
|} 
   	 |rptk oS	k 	fdd t
k ofk  
fdd tdkpydkfd	d jd
kr|fS ||fS )Nr  r   r   r      c                      r  r  r   r8   r  r8   r9   r]   0  r  z_pad2d_common.<locals>.<lambda>c                         d d d  dj  S NzcArgument #6: Padding size should be less than the corresponding input dimension, but got: padding (rm   r  r  r   r8   dim_hr  pad_bpad_tr8   r9   r]   7  r  c                      s   d  d d d S )Nz
input (H:  W: z%) is too small. Calculated output H: r8   r8   )input_hr  output_hr  r8   r9   r]   ?  s
   r1   r  r   r   rN   r`   r   )r  r  r  
dim_slicesr  r   r  r8   )r  r  r  r  r  r  r  r  r  r  r  r9   _pad2d_common  sB   




r  c                 C   r  r  )r  r  r8   r8   r9   meta_reflection_pad2dK  r  r  c                    r  )Nc                      r  )Nz)"replication_pad2d" not implemented for 'r  r  r8   r  r8   r9   r]   V  r  z(meta_replication_pad2d.<locals>.<lambda>Fr  )rN   r`   rV   r  r  r  r8   r  r9   meta_replication_pad2dQ  r  r  c                    s   dd d}|j }| dkrd7  d7  |d7 }|\}}}}|  }	| }
|	| | |
| | tkfdd t k fdd ||j S )Nr  r   r   r  c                      r  r  r   r8   r  r8   r9   r]   x  rH   z%meta_pad2d_backward.<locals>.<lambda>c                      r  Nz)grad_output height unexpected. Expected: r  r   r8   r  r  r  r8   r9   r]   |  rH   )r   rz   rN   r`   r   r   )r  r   r  r  r\   r  r  r  r  r  r  r8   )r  r  r  r  r  r9   meta_pad2d_backward[  s,   
r  c             	      s  ddd d}t |dd jdk}|r+d}d7 d7  d7  |d7 }|\
|}    
   	|rtk odk fdd tk ow
k 
fd	d tk ok  fd
d t	dkpdkpdk	fdd |r||	fS |	fS )Nr1   r  r   r   r      c                      r  r  r   r8   r  r8   r9   r]     r  z_pad3d_common.<locals>.<lambda>c                      r  r  r   r8   r  r8   r9   r]     r  c                      r  )NzcArgument #8: Padding size should be less than the corresponding input dimension, but got: padding (rm   r  r  r   r8   )dim_dr  pad_bkpad_fr8   r9   r]     r  c                      s(   d  d d d d d S )Nz
input (D:  H: r  z%) is too small. Calculated output D: r8   r8   )input_dr  r  output_dr  r  r8   r9   r]     s   r  )r  r  r  r  
batch_moder  r  r8   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r9   _pad3d_common  sP   





r  c                 C   r  r  )r  r  r8   r8   r9   meta_reflection_pad3d  r  r  c                    r  )Nc                      r  )Nz)"replication_pad3d" not implemented for 'r  r  r8   r  r8   r9   r]     r  z(meta_replication_pad3d.<locals>.<lambda>Fr  )rN   r`   rV   r  r  r  r8   r  r9   meta_replication_pad3d  r  r  c                    s(  t t|dkdd  |jdksJ j|jksJ ddd |jdkr2d7 d7  d7  |\}}}}}}| }	|}
|}|	| | |
| | || | t kfdd t kfd	d t  k fd
d ||jS )N   c                   S   rc   )Nz padding size is expected to be 6r8   r8   r8   r8   r9   r]     re   z%meta_pad3d_backward.<locals>.<lambda>r1   r  r   r  c                      r  r  r   r8   r  r8   r9   r]     rH   c                      r  r  r   r8   r  r8   r9   r]     rH   c                      r  )Nz(grad_output depth unexpected. Expected: r  r   r8   )r  r  r  r8   r9   r]     rH   r  )r  r  r  r  r  r  r  r  r  r  r  r  r8   )r  r  r  r  r  r  r  r9   meta_pad3d_backward  s<   




r  r  pc                 C   s^   t |  dd  | d}|dkr| dgjt jdS | ||d  d fjt jdS )Nc                   S   rc   )Nz(_pdist_forward requires contiguous inputr8   r8   r8   r8   r9   r]   	  re   z%meta__pdist_forward.<locals>.<lambda>r   r   r   r  )rN   r`   r   r   r   rF  r  )r   r  r0  r8   r8   r9   meta__pdist_forward 	  s   
r  gradpdistc                 C   s8   t | dd  t | dd  t j|t jdS )Nc                   S   rc   )Nz._pdist_backward requires self to be contiguousr8   r8   r8   r8   r9   r]   	  re   z&meta__pdist_backward.<locals>.<lambda>c                   S   rc   )Nz/_pdist_backward requires pdist to be contiguousr8   r8   r8   r8   r9   r]   	  re   r   )rN   r`   r   r   r  )r  r   r  r  r8   r8   r9   meta__pdist_backward	  s   r  )rg  rf  c                   s  ddl m}m}  d} d}d}	|t|j|||	fr-|||	ft 	 dkdd  t	 dkdd  t
jsatj j  koVjkn   fd	d  j}
j|
d |
d td ko|d kfd
d  S )Nr   )r   r   r   r  r1   c                   S   rc   Nzbatch1 must be a 3D tensorr8   r8   r8   r8   r9   r]   %	  re   zmeta_baddbmm.<locals>.<lambda>c                   S   rc   Nzbatch2 must be a 3D tensorr8   r8   r8   r8   r9   r]   &	  re   c                      s   dj  d j  dj  S )Nz+Input dtypes must be the same, got: input: z
, batch1: z
, batch2: r   r8   )batch1batch2r   r8   r9   r]   *	      c                	      &   d d d d  d d  d	S Nz@Expected size for first two dimensions of batch2 tensor to be: [rm   z] but got: [r   r   ].r8   r8   batch2_sizesbscontraction_sizer8   r9   r]   2	  s   )r   r   r   r   rN   sym_notr   rZ  r`   rz   
exp_config&skip_dtype_check_in_meta_registrationsrV   r   )r   r  r  rg  rf  r   r   dim1dim2dim3batch1_sizesr8   )r  r  r  r  r  r   r9   meta_baddbmm	  s,   


r  c                C   r~  r   r  r   r.  r8   r8   r9   meta_bernoulli:	  s   r        ?c                 C   r  r3   r8   r   r  r.  r8   r8   r9   meta_bernoulli_A	  r  r   c                 C   r~  r   r  r  r8   r8   r9   meta_bernoulli_pF	  r  r  c                 C   
   t | S r3   rN   r   r  r8   r8   r9   meta_poissonL	  r  r  c                 C   s6   t |
|  k dd  t j| t jd}t | |fS )Nc                   S   rc   )NzJError in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()r8   r8   r8   r8   r9   r]   d	  re   z6meta__fused_moving_avg_obs_fq_helper.<locals>.<lambda>r   )rN   r`   rz   r   r  )r   observer_onfake_quant_onrunning_minrunning_maxscale
zero_pointaveraging_const	quant_min	quant_maxch_axisper_row_fake_quantsymmetric_quantmaskr8   r8   r9   $meta__fused_moving_avg_obs_fq_helperR	  s   
r  c                    sn   t |  dkdd  t | dkdd  | j\ |j\t  k fdd | S )Nr  c                   S   rc   )Nza must be 2Dr8   r8   r8   r8   r9   r]   m	  re   zmeta_mm.<locals>.<lambda>c                   S   rc   )Nzb must be 2Dr8   r8   r8   r8   r9   r]   n	  re   c                	      s   d d  d d d	S )Nz/a and b must have same reduction dim, but got [rm   z] X [r  r8   r8   M1M2Nr)  r8   r9   r]   s	  s    )rN   r`   rz   r   r   r   br8   r  r9   meta_mmj	  s   

r  c                    s0   |rt  fddtjD S tj S )Nc                 3   s&    | ]}| vrj | nd V  qdS )r   Nr   rC   r  dimsr   r8   r9   ri   z	  s   $ z+_compute_reduction_shape.<locals>.<genexpr>)r_   r   r   rI   compute_reduction_output_shaper   )r   r  r  r8   r  r9   r  x	  s   r  strc                 C   sD   t | tjjr| jjS t| dr t| jdr | jjdkr | jjS dS )Nrw   ro   rt   r  )rf   rN   _subclasses
FakeTensorfake_devicero   hasattrrw   )rO  r8   r8   r9   r!  	  s   
r!  input_tensorr   r  dilationis_transposedgroupsoutput_paddingc                    s@  dt dt dt dt dt dt fdd}dt dt dt dt dt d	t dt fd
d}	|jdd  }
| jdd   |r<||jd  }n|jd }|jd | | jd krQtd| jd |gt|tre|gt  }nt|dkrt|d gt  }t|tr|gt  }nt|dkr|d gt  }t|tr|gt  }nt|dkr|d gt  }d }|rt|tr|gt  }nt|dkr|d gt  }n|}tt D ]2}|r|	 | || || |
| || ||  qՈ| | || || |
| ||  qt	t
dd dd  D  fdd S )Nlnr  r  rx  r   r2   c                 S   s$   | d|  ||d   d | d S )a  
        Formula to apply to calculate the length of some dimension of the output

        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
        Returns:
            The output length
        r  r   r8   )r(  r  r  rx  r   r8   r8   r9   _formula	  s   $z+calc_conv_nd_return_shape.<locals>._formular5   c                 S   s(   | d | d|  ||d   | d S )a  
        Formula to apply to calculate the length of some dimension of the output
        if transposed convolution is used.
        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
            op: output padding in that dim

        Returns:
            The output length
        r   r  r8   )r(  r  r  rx  r   r5   r8   r8   r9   _formula_transposed	  s   (z6calc_conv_nd_return_shape.<locals>._formula_transposedr  r   r   zInvalid channel dimensionsc                 s       | ]}|d kV  qdS r   Nr8   rB   r8   r8   r9   ri   	  rb  z,calc_conv_nd_return_shape.<locals>.<genexpr>c                      s   dt   ddd   dS )NzGiven input size per channel: z&. Calculated output size per channel: r  z. Output size is too small)r   r8   r  	ret_shaper8   r9   r]   	  s    
z+calc_conv_nd_return_shape.<locals>.<lambda>)r  r   rE  rf   r   r   r   r   rN   r`   r{   )r#  rV  r   r  r$  r%  r&  r'  r)  r*  kernel_sizeout_channelsoutput_padding_listr  r8   r-  r9   calc_conv_nd_return_shape	  sb   "
&




"r2  c                 C      t j| t jkS r3   rN   _prims_commonr!   channels_lasttenr8   r8   r9   is_channels_last	     r9  running_meanrunning_vartrainingexponential_average_factorepsilonc                    s    j }|d ur
|j n|j }	|d ur|j n|j }
 fdd} |j| d}|r4 |	} |
}n
 d} d}|||fS )Nc                      s(   t  rtjS  jtjdrtjS tjS r   )r9  rN   r6  r   r   r8   r#  r8   r9   pick_memory_format
  s
   z2meta_miopen_batch_norm.<locals>.pick_memory_formatr   r   )r   r   rF  )r#  rV  rX  r;  r<  r=  r>  r?  r   save_mean_shapesave_var_shaperA  r   	save_meansave_varr8   r@  r9   meta_miopen_batch_norm 
  s   



rF  c	              	      sf    fdd}	t  ||||||r|nd }
d}d} |dkr%d|
|<  |
}|j|	 d}|S )Nc                      s^   t  dkrt strtjS nt rtjS  jtjdr#tjS  jtjdr-tjS d S Nr  r   )r!  r9  rN   r6  r   r   preserve_formatr8   r#  rV  r8   r9   rA  2
  s   z%meta_conv.<locals>.pick_memory_formatr   r   r   )r2  r   r   rF  )r#  rV  rX  r   r  r$  r%  r'  r&  rA  	shape_outinput_channels_dimoutput_channels_dimr   r8   rI  r9   	meta_conv&
  s$   

rM  mkldnnc
              	   C   sH   t | ||||d|g }
| |
}tj}|  dkrtj}|j|d}|S )NFr  r   )r2  r   rN   r6  rz   channels_last_3drF  )r#  rV  rX  r  r   r$  r&  attrscalars	algorithmrJ  r   out_memory_formatr8   r8   r9   meta_mkldnn_convolution_defaultX
  s   
rT  c                 C   s$   |  g | jd d |jd R S Nr   r   r   r   )r#  rV  rX  rP  rQ  rR  r8   r8   r9   meta_linear_pointwise_defaulto
  s   $rW  mklc                 C   s$   |  g | jd d |jd R S rU  rV  )r#  packed_weightorig_weightrX  r
  r8   r8   r9   meta_mkl_linearz
  s   r[  onednnc              	   C   s|   t | ||||	d|
d }|tjtjtjtjfv sJ | j||d}t|dv s*J dt|dkr3tjntj	}|j
|d}|S )NFr   r1   r  zonly conv1d/2d are supportedr  r   )r2  rN   rr  rt  uint8r\  r   r   r6  r   rF  )rD   x_scalex_zpww_scalew_zprX  r   r  r$  r&  output_scaleoutput_zero_pointoutput_dtyperP  rQ  rR  rJ  r   formatr8   r8   r9   meta_qconv_pointwise
  s    
rh  c                 C   s   |dksJ |S )Nsumr8   )rD   r_  r`  ra  rb  rc  accumrX  r   r  r$  r&  rd  re  rf  accum_scaleaccum_zero_pointbinary_op_namerf  unary_op_nameunary_op_argsunary_op_algorithmr8   r8   r9   meta_qconv2d_pointwise_binary
  s   rq  c                 C   sF   t | j}|jd |d< |	tjtjtjtjfv sJ | j||	d}|S )Nr   r   r   )r   r   rN   rr  rt  r\  r^  r   )rD   r_  r`  ra  rb  rc  rX  rd  re  rf  post_op_namepost_op_argspost_op_algorithmrz  r   r8   r8   r9   meta_qlinear_pointwise
  s
   
ru  c                 C   sR   |dkr|S t | j}|jd |d< |
tjtjtjtjfv s J | j||
d}|S )Nri  r   r   r   )r   r   rN   rr  rt  r^  r\  r   )rD   r_  r`  ra  rb  rc  x_2rX  rd  re  rf  x2_scalex2_zprm  rf  rn  ro  rp  rz  r   r8   r8   r9   meta_qlinear_pointwise_binary
  s   
ry  c                 C   s&   t | j}|jd |d< | |}|S )Nr   r   )r   r   r   )rD   ra  rX  rz  r   r8   r8   r9   meta_linear_dynamic_fp16
  s   

rz  	quantizedr8   r   r   c                 C   sr   t | |||||\}}}|  dkr| dnd}	tj}
|  dkr(|||g}n|	|||g}tj|| j| j|
dS Nr  r   r1   r  )#max_pool2d_checks_and_compute_shaperz   r   rN   r6  r   rV   rw   r  r/  r   r  r$  	ceil_modenInputPlaneoutputHeightoutputWidthr  r   r   r8   r8   r9   meta_quantized_max_pool2d  s$   r  c                 C   s   t |  dkd|   d t | dkd|  d t | jt jt jt jfv d| j  t |jt jkd|j  t |jt jkd|j  t |j| jkd|j  | j	| 
d	|
d	| jd
S )Nr  zx must be a 2D tensor, got Dzw must be a 2D tensor, got #expected x to be f32/f16/bf16, got expected w to be uint8, got z q_group_size must be int64, got z5q_scale_and_zeros must have the same dtype as x, got r   r   )rN   r`   rz   rV   rr  rs  rt  r^  r   r   r   rD   ra  q_group_sizeq_scale_and_zerosr8   r8   r9   meta_int4mm_packed_weight_cpu+  s      




r  c                    s4   t   koj k fdd d S )Nc                      s8   d  d d dd   d dj   S )NzExpected a tensor of dimension z and tensor.size[z] == rm   zbut got : dimension z] = rz   r   r8   rz   dim_sizer   rO  r8   r9   r]   C  s    z check_dim_size.<locals>.<lambda>)rN   r`   rz   r   )rO  rz   r  r   r8   r  r9   check_dim_size@  s   r  c                    s  dd }|d|\}}	t t|dv dd  t  jt jt jt jt jfv fdd t|dkr8||	}
}nt|d	krH|d |d }
}n|d
|\}
}|d|\}}t |d u p_|dkdd    dkro 	dnd	} 	d} 	d} 	d}t
||||
d	|}t
||	||d	|}t }t ||	|
|||d	d	||||||   dkr|||g}n||||g}t j| j j|dS )Nc                    D   t t|dv  fdd |d }t|dkr|n|d }||fS )Nr   r  c                      r<  )Nzavg_pool2d: 4 must either be a single int, or a tuple of two intsr8   r8   r  r8   r9   r]   U  r   z1meta_avg_pool2d.<locals>.unpack.<locals>.<lambda>r   r   rN   r`   r   r  r  HWr8   r  r9   unpackR     

zmeta_avg_pool2d.<locals>.unpackr/  r   r   r  c                   S   rc   NzOavg_pool2d: stride must either be omitted, a single int, or a tuple of two intsr8   r8   r8   r8   r9   r]   ^  re   z!meta_avg_pool2d.<locals>.<lambda>c                      r  )Nz""avg_pool2d" not implemented for 'r  r  r8   r  r8   r9   r]   b  r  r   r   r   r  c                   S   rc   Nzdivisor must be not zeror8   r8   r8   r8   r9   r]   o  re   r  r~  r  r   r1   r  )rN   r`   r   rV   r^  uint16uint32uint64rz   r   pooling_output_shaperI   r!   pool2d_shape_checkr   rw   )r  r/  r   r  r  count_include_paddivisor_overrider  kHkWdHdWpadHpadWr  r  inputHeight
inputWidthr  r  r   r   r8   r  r9   meta_avg_pool2dH  sj   
	





r  c                 C   sj   t | ||||||dd|	|
|||| |  }|	}t|||d | t|||d | t|||d | d S )Nr   r1   r  )r  rz   r  )r  
gradOutputr  r  r  r  r  r  r  r  r  r  r  r  
mem_formatr   nOutputPlaner8   r8   r9   avg_pool2d_backward_shape_check  s,   r  c                 C   s  t t|dkpt|dkdd  |d }t|dkr|n|d }	t t|dkp5t|dkp5t|dkdd  t|dkrB|n|d }
t|dkrN|	nt|dkrV|
n|d }t t|dkpgt|dkdd  |d }t|dkrx|n|d }t |d u p|dkdd  |j}| d	kr|d
 nd}|d }|d }|d }t||||
d|}t||	||d|}t|}t|| |||	|
||||||||| t j	||j
|j|dS )Nr   r  c                   S   rc   )NzKavg_pool2d: kernel_size must either be a single int, or a tuple of two intsr8   r8   r8   r8   r9   r]     re   z*meta_avg_pool2d_backward.<locals>.<lambda>r   c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   c                   S   rc   )NzGavg_pool2d: padding must either be a single int, or a tuple of two intsr8   r8   r8   r8   r9   r]     re   c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r  r~  r  r  r   r  )rN   r`   r   r   rz   r  rI   r!   r  r   rV   rw   )gradOutput_r  r/  r   r  r  r  r  r  r  r  r  r  r  
input_sizer  r  r  r  r  r  r  r8   r8   r9   meta_avg_pool2d_backward  sj   "(
r  c                    s6  t t|dv dd  |d }t|dkr|n|d }t|dkr$|n|d }	t | p2t|dv dd  t  jt jt jt jt jfv fdd |sP|n|d }
|sX|nt|dkr`|
n|d }|sh|	nt|dkrp|
n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t  jd
v dd  t | p|dkdd   	d} 	d} 	d} 	d} 	d}t
||||
d|}t
||||d|}t
||	||d|}t ||||	|
|||||ddd||||||ddd  jdkr ||||fS  |||||fS )Nr   r1   c                   S   rc   NzFavg_pool3d: kernel_size must be a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]     re   z!meta_avg_pool3d.<locals>.<lambda>r   r   r  c                   S   rc   NzJavg_pool3d: stride must be omitted, a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]   $  re   c                      r  )Nz""avg_pool3d" not implemented for 'r  r  r8   r  r8   r9   r]   (  r  c                   S   rc   NzBavg_pool3d: padding must be a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]   0  re   r  r  c                   S   rc   Nz9non-empty 4D or 5D (batch mode) tensor expected for inputr8   r8   r8   r8   r9   r]   8  re   c                   S   rc   r  r8   r8   r8   r8   r9   r]   =  re   r~  r  r  r   zavg_pool3d()T)check_input_sizer  )rN   r`   r   rV   r^  r  r  r  r   r   r  pool3d_shape_checkr   )r  r/  r   r  r  r  r  kTr  r  dTr  r  padTr  r  r  nslicesitimeiheightiwidthotimeoheightowidthr8   r  r9   meta_avg_pool3d  s   

  





r  c                 C   s  t t|dv dd  |d }t|dkr|n|d }	t|dkr$|n|d }
t | p2t|dv dd  |s;|n|d }|sC|	nt|dkrK|n|d }|sS|
nt|dkr[|n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t |jd	v d
d  t | p|dkdd  |d}|d}|d}|d}t||||d|}t||	||d|}t||
||d|}t|| |||	|
||||||||||||d ||jS )Nr  c                   S   rc   r  r8   r8   r8   r8   r9   r]   w  re   z*meta_avg_pool3d_backward.<locals>.<lambda>r   r   r  c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r  c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r~  r  r  r   zavg_pool3d_backward())	rN   r`   r   r   r   r  avg_pool3d_backward_shape_checkr   r   )r  r  r/  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  otime_for_shape_checkoheight_for_shape_checkowidth_for_shape_checkr8   r8   r9   meta_avg_pool3d_backwardi  st   
  




r  c                    sZ   t  jdkp jdk fdd  jd d t| }t }t j| j j	|dS )Nr1   r  c                      r   )Nz"Expected 3D or 4D tensor, but got r   r8   r   r8   r9   r]     r   z*meta_adaptive_avg_pool2d.<locals>.<lambda>r  r  )
rN   r`   r   r   r_   rI   r!   r   rV   rw   )r   output_sizerz  r   r8   r   r9   meta_adaptive_avg_pool2d  s   

r  c                    s@   t  jdkp jdk fdd   jd d t| S )Nr  r  c                      r   )Nz"Expected 4D or 5D tensor, but got r   r8   r   r8   r9   r]     r   z*meta_adaptive_avg_pool3d.<locals>.<lambda>r  )rN   r`   r   r   r   r_   )r   r  r8   r   r9   meta_adaptive_avg_pool3d  s
   
r  c                    s    j }td|D ]t dk fdd qt|dkp$|dkfdd tj jk fdd tj}trDtj}	j
j|d	S )
Nr   r   c                      r   )Nz{adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero                       size for non-batch dimensions,  with dimension  being emptyr   r8   )grad_outr  r8   r9   r]     s
    z4meta__adaptive_avg_pool2d_backward.<locals>.<lambda>r1   r  c                      r   )NzBadaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got r   r8   r   r8   r9   r]     r   c                      r  Nexpected dtype z! for `grad_output` but got dtype r   r8   )r  r   r8   r9   r]     r   r   )r   r   rN   r`   r   rV   r   r9  r6  r   r   rF  )r  r   r   r   r8   )r  r  r   r9   "meta__adaptive_avg_pool2d_backward  s$   

r  c                 C   s   t | d tj|tjdS )Nadaptive_avg_pool3d_backwardr   )!_adaptive_pool_empty_output_checkrN   r   r  r  r   r8   r8   r9   "meta__adaptive_avg_pool3d_backward  s   
r  r  c                    s<   j }td|D ]tdk fdd qd S )Nr   r   c                      s     dj  d dS )Nzc(): Expected grad_output to have non-zero size for non-batch dimensions, but grad_output has sizes r  r  r   r8   r  r  r  r8   r9   r]     s
   z3_adaptive_pool_empty_output_check.<locals>.<lambda>)r   r   rN   r`   r   )r  r  r   r8   r  r9   r    s   r  c                    s"  j }t|dv fdd td|D ] t dk fdd qtt|dkdd  d}d}d}j d	krGd}|d7 }|d }|\}}j d
krm|||f}|}	j|tjd}
|	|
fS ||||f}t	}|j
|d}	j|tjdj
|d}
|	|
fS )Nr]  c                      r   )Nz:adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: r   r8   r  r8   r9   r]     r   z*meta_adaptive_max_pool2d.<locals>.<lambda>r   r   c                         dj  d  dS )Nzjadaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r  r  r   r8   r  r  r8   r9   r]   	  
   r  c                   S   rc   )NzCadaptive_max_pool2d(): internal error: output_size.size() must be 2r8   r8   r8   r8   r9   r]     re   r  r1   r   r   )r   rN   r`   r   r   r   r   r   rI   r!   rF  )r  r  r   dimHsizeBsizeDosizeHosizeWr   r   r   r   r8   r  r9   meta_adaptive_max_pool2d  sD   







r  c                    sd    j }t|dv  fdd t d tj jk fdd t}jj	|dS )Nr]  c                      r   )NzKadaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: r   r8   r  r8   r9   r]   4  r   z3meta_adaptive_max_pool2d_backward.<locals>.<lambda>adaptive_max_pool2d_backwardc                      r  r  r   r8   r  r  r8   r9   r]   ;  r   r   )
r   rN   r`   r  rV   rI   r!   r   r   rF  )r  r  r   r   r   r8   r  r9   !meta_adaptive_max_pool2d_backward.  s   



r  c                    s   j }t|dv fdd td|D ] t dk fdd qtt|dkdd  d}d}d}|d	krFd}|d7 }|}|\}}}|d
kr[||||f}	n|||||f}	|	}
j|	tjd}|
|fS )Nr  c                      r   )Nz:adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: r   r8   r  r8   r9   r]   H  r   z*meta_adaptive_max_pool3d.<locals>.<lambda>r   r   c                      r  )Nzjadaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r  r  r   r8   r  r8   r9   r]   M  r  r1   c                   S   rc   )NzCadaptive_max_pool3d(): internal error: output_size.size() must be 3r8   r8   r8   r8   r9   r]   U  re   r  r  r   )r   rN   r`   r   r   r   r   r   )r  r  r   dimDr  r  osizeTr  r  r   r   r   r8   r  r9   meta_adaptive_max_pool3dB  s8   





r  c                 C   s   t | d ||jS )Nadaptive_max_pool3d_backward)r  r   r   )r  r  r   r8   r8   r9   !meta_adaptive_max_pool3d_backwardn  s   
r  c                 C   s   |d u rt d| |S )Nz:cannot repeat_interleave a meta tensor without output_size)rE  r   )repeatsr  r8   r8   r9   meta_repeat_interleave_Tensoru  s   
r  c                 C   s:   | j jsJ |j jsJ t| j|j}| j|t| j dS r  )rV   r   r(   r   r   r   )realimagr   r8   r8   r9   meta_complex|  s   r  )
fill_valuer  c                C   s   | j ||  ftjdS r  )r   rz   rN   r   )r   r   r  r8   r8   r9   nonzero_static  s   r  c                 C   s<   t tjdd  t j|  |  fd|  ft j| jdS )Nc                   S   rc   )NaY  The register_meta function for torch.nonzero() raises unimplemented by default, as a correct data-independent implementation does not exist. This implementation returns a fake value, assuming all elements of the tensor are non-zero. To enable this registration, please set 'torch.fx.experimental._config.meta_nonzero_assume_all_nonzero' to True.r8   r8   r8   r8   r9   r]     re   znonzero.<locals>.<lambda>r   rV   rw   )	rN   _check_not_implementedr  meta_nonzero_assume_all_nonzeror  r   rz   r   rw   r   r8   r8   r9   nonzero  s   
r  c              
      s@  t tdd  g }tD ]q\d ur|t jt jt jt jt jfv dd  jt jt jfv rv }t	|t 
j jkfdd tjD ]#t 
j j  kfdd ||d qQq| q| q|t t	jkfdd dd lm} t|j t	jk rd  t	jk sd}d	}D ]|dkrǈd urd}q|dkr҈d u rd
}qd ur nqd}|sg }g }tD ]\d ur| | qtD ]\d u r| | q||g g  g tD ]&\}	d u rBr8 j|	  q"j|	  q"tjq" fdd}
   }ddlm} | dkrk|S |
}t|}t|ttt	|krt|j|}t|}t|t|}|| |}|S )Nc                   S   rc   )Nz#at least one index must be providedr8   r8   r8   r8   r9   r]     re   z#meta_index_Tensor.<locals>.<lambda>c                   S   rc   )Nz?tensors used as indices must be long, int, byte or bool tensorsr8   r8   r8   r8   r9   r]     re   c                      r   )N)too many indices for tensor of dimension r   r8   r   r8   r9   r]     r   c                	      s$   dj  d  dj  d  S )NzThe shape of the mask 
 at index z0 does not match the shape of the indexed tensor r   r8   )r  r   jrx  r   r8   r9   r]     s
    r   c                      s   dj  dt  dS )Nr  z (got rn   )r   r   r8   )r   r   r8   r9   r]     r  r   Fr  Tc                    sL      }t |  }dgt |tt| jt  < | ||S )zI
        This follows restride_src in TensorAdvancedIndexing.cpp
        r   )r   r   r   r   r   )r   r   r   )after_shapebefore_shapereplacement_shaper8   r9   _restride_src   s    z(meta_index_Tensor.<locals>._restride_srcguard_size_oblivious) rN   r`   r  	enumeraterV   r   r  r\  r  r   r   r   r   r   r   selecttorch._refsr   r   r)   r   r   r   r  r   rI   3compute_elementwise_output_logical_to_physical_perm
apply_permr   invert_permr   r   )r   r   r  r  refsstatehas_contiguous_subspacer  transposed_indicesrz   r  r   r  restrided_selfperm
perm_shaper   r8   )	r  r  r  r   r   r  rx  r  r   r9   meta_index_Tensor  s   










r  c                 C   sT   d }d }d }|
d r|  | }|
d r|  | }|
d r%|  |}|||fS )Nr   r   r  r   r   )grad_output_input_weight_bias_sizes_optr   r  r$  
transposedr'  r&  output_maskbackend_grad_inputbackend_grad_weightbackend_grad_biasr8   r8   r9   meta_convolution_backward!  s   

r  c                   s     d} d}| ||f} t  dkdd  t dkdd  t  d dk fdd t  d dk fd	d t|  d|ko^|  d|kd
d  | |   S )Nr   r  r1   c                   S   rc   r  r8   r8   r8   r8   r9   r]   E  re   zmeta_addbmm.<locals>.<lambda>c                   S   rc   r  r8   r8   r8   r8   r9   r]   F  re   r   c                         d  d d d S )Nz8batch1 and batch2 must have same number of batches, got r   r   r   r8   r  r  r8   r9   r]   I  r  c                
      6   d  d d  d d d d d d	S )Nz#Incompatible matrix sizes for bmm (r   rD   r  r   rn   r   r8   r  r8   r9   r]   M  
   c                   S   rc   )Nz.self tensor does not match matmul output shaper8   r8   r8   r8   r9   r]   T  re   )r   rZ  rN   r`   rz   r   )r   r  r  rg  rf  r  r  r8   r  r9   meta_addbmm?  s$   

r  c                 K   s   |  |  S r3   r  )r   r9  kwargsr8   r8   r9   meta_randint_likeY  s   r!  )
grad_scale	found_infc       	            s4   | |||||fD ] t t t fdd qd S )Nc                         dt   S Nz'exponent must be a tensor list but got ro   r8   lr8   r9   r]   t  r  z#meta__fused_adam_.<locals>.<lambda>rN   r`   rf   r   )r   gradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepslrbeta1beta2weight_decayepsamsgradmaximizer"  r#  r8   r'  r9   meta__fused_adam_^  s   
r6  c       	            sZ   | |||||fD ] t t t fdd qdd }|| ||||||||fS )Nc                      r$  r%  r&  r8   r'  r8   r9   r]     r  z"meta__fused_adam.<locals>.<lambda>c                 S   s   dd | D S )Nc                 S   s   g | ]}t |qS r8   r  )rC   r  r8   r8   r9   rG     rH   z=meta__fused_adam.<locals>.empty_like_list.<locals>.<listcomp>r8   )tensor_listr8   r8   r9   empty_like_list  s   z)meta__fused_adam.<locals>.empty_like_listr)  )r   r*  r+  r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r"  r#  r8  r8   r'  r9   meta__fused_adamx  s   
r9  c                    s   t   dkdd  t  dkdd  t  jt ju  fdd t jt ju fdd t  ddk fd	d  j ddft jd
S )Nr  c                   S   rc   )Nza must be a 2D tensorr8   r8   r8   r8   r9   r]     re   zmeta__int_mm.<locals>.<lambda>c                   S   rc   )Nzb must be a 2D tensorr8   r8   r8   r8   r9   r]     re   c                      r   )Nzexpected self to be int8, got r   r8   )r   r8   r9   r]     r   c                      r   )Nzexpected mat2 to be int8, got r   r8   )r  r8   r9   r]     r   r   r   c                
      r  )Nz'Incompatible matrix sizes for _int_mm (r   rD   r   r   rn   r   r8   r  r8   r9   r]     r  r   )rN   r`   rz   rV   r\  r   r   r]  r  r8   r  r9   meta__int_mm  s   



 r:  c                    st   t   dkdd  t  jt ju  fdd  d} dd } j|d ||d  d	|d ft jd
S )Nr  c                   S   rc   Nzw must be a 2D tensorr8   r8   r8   r8   r9   r]     re   z2meta__convert_weight_to_int4pack.<locals>.<lambda>c                      r   Nr  r   r8   ra  r8   r9   r]     r   r   r      rq      r   )rN   r`   rz   rV   r^  r   r   r]  ra  inner_k_tilesr0  rx  r8   r=  r9    meta__convert_weight_to_int4pack  s   



rB  c                    s`   t   dkdd  t  jt ju  fdd  d} d} j||d ft jdS )Nr  c                   S   rc   r;  r8   r8   r8   r8   r9   r]     re   z:meta__convert_weight_to_int4pack_for_cpu.<locals>.<lambda>c                      r   Nzexpected w to be int32, got r   r8   r=  r8   r9   r]     r   r   r   r   )rN   r`   rz   rV   r]  r   r   r^  r@  r8   r=  r9   (meta__convert_weight_to_int4pack_for_cpu  s   




rD  c                    s   t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	dd	 jd
S )Nr  c                   S   rc   Nzx must be a 2D tensorr8   r8   r8   r8   r9   r]     re   z*meta__weight_int4pack_mm.<locals>.<lambda>r  c                   S   rc   )Nzw must be a 4D tensorr8   r8   r8   r8   r9   r]     re   c                      r   Nr  r   r8   r   r8   r9   r]     r   c                      r   rC  r   r8   r=  r8   r9   r]     r   r   r>  r   
rN   r`   rz   rV   rr  rs  rt  r]  r   r   r  r8   ra  rD   r9   meta__weight_int4pack_mm  s   


"rI  c                       t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	djdS )	Nr  c                   S   rc   rE  r8   r8   r8   r8   r9   r]     re   z2meta__weight_int4pack_mm_for_cpu.<locals>.<lambda>c                   S   rc   r;  r8   r8   r8   r8   r9   r]     re   c                      r   rF  r   r8   r   r8   r9   r]     r   c                      r   r<  r   r8   r=  r8   r9   r]     r   r   r   )
rN   r`   rz   rV   rr  rs  rt  r^  r   r   r  r8   rH  r9    meta__weight_int4pack_mm_for_cpu     


rK  c                    rJ  )	Nr  c                   S   rc   rE  r8   r8   r8   r8   r9   r]     re   z;_weight_int4pack_mm_with_scales_and_zeros.<locals>.<lambda>c                   S   rc   r;  r8   r8   r8   r8   r9   r]     re   c                      r   rF  r   r8   r   r8   r9   r]     r   c                      r   rC  r   r8   r=  r8   r9   r]     r   r   r   rG  )rD   ra  r  qScaleqZerosr8   rH  r9   )_weight_int4pack_mm_with_scales_and_zeros  rL  rO  r   r  c                 C   s   | | d | | S rM  r8   r  r8   r8   r9   kai_roundup  s   rP  c           	         s   | dkrv||kr/d}d}d}d
dddd 
fddfd	d
}||||||S |d dkrx|| dkrzd}d}d}d
ddd  fdd} 	
fdddd  fdd fdd	|||||||S d S d S d S )Nr  r>  rq  r  c                 S   s   t || d}t | |S )Nr  rP  )rx  krsrkr_sr_roundedup4r8   r8   r9   kai_k_roundedup  s   
z3get_kai_packed_weight_size.<locals>.kai_k_roundedupc                    s8    | ||}|d dksJ d||d     S )Nr  r   zk_internal must be evenr8   )rx  nrrR  rS  
k_internal)rU  kai_num_bytes_biaskai_num_bytes_multiplier_rhskai_num_bytes_sum_rhsr8   r9   9kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0  s   z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0c                    s    t | || }| |||| S r3   rQ  )r0  rx  rV  rR  rS  num_rows)r[  r8   r9   7kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0'  s   z[get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0r?  r   c                    sR   || dksJ | dksJ |  dksJ t | || }|||||| S r  rQ  )r0  rx  rV  rR  rS  blr\  )kai_bl_multiple_of;kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0kai_nr_multiple_ofr8   r9   9kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0?  s   
z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                    s^   || dksJ | dksJ |  dksJ  }| |}||}|||    S r  r8   )rx  rV  rR  rS  r^  num_bytes_multiplier_rhsnum_blocks_per_rownum_bytes_per_block)r_  #kai_get_bf16_datatype_size_in_bytesra  kai_num_blocks_per_rowrX  kai_num_bytes_per_blockrZ  r8   r9   r`  O  s   
z_get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                   S   rc   )Nr  r8   r8   r8   r8   r9   rf  e  r6  zGget_kai_packed_weight_size.<locals>.kai_get_bf16_datatype_size_in_bytesc                    s   |  dksJ t | || S r  rQ  )rx  r^  r_  r8   r9   rg  h  s   z:get_kai_packed_weight_size.<locals>.kai_num_blocks_per_rowc                    s   |   dksJ | d | S )Nr   r  r8   )r^  rc  ri  r8   r9   rh  l  s   z;get_kai_packed_weight_size.<locals>.kai_num_bytes_per_blockr8   )	n_bitsr  K	groupsizekai_nrkai_krkai_srr]  rb  r8   )r_  rf  r`  r[  rU  ra  rg  rX  rY  rh  rZ  r9   get_kai_packed_weight_size  s@   
-rp  c                    s   t  jt ju  fdd t jj rE||kr|jt jks4||k rE|d dkrE|| dkrE|jt jkrEt	d|||} j
t|t jdS   |  } j
|t jdS )Nc                      r   r<  r   r8   weightsr8   r9   r]   {  r   z2meta__dyn_quant_pack_4bit_weight.<locals>.<lambda>r?  r   r  r   )rN   r`   rV   r^  backendskleidiaiis_availablerR   rt  rp  r   r  r   )rr  scales_zerosrX  
block_sizein_featuresout_featurespacked_weight_sizer8   rq  r9    meta__dyn_quant_pack_4bit_weightu  s    

r{  c                    sR   t   dkdd  t  jt jfv  fdd  d} j|| jdS )Nr  c                   S   rc   )Nzinput must be a 2D tensorr8   r8   r8   r8   r9   r]     re   z-meta__dyn_quant_matmul_4bit.<locals>.<lambda>c                      r   )Nzexpected input to be f32, got r   r8   inpr8   r9   r]     r   r   r   )rN   r`   rz   rV   rr  r   r   )r}  packed_weightsrw  rx  ry  ry  r8   r|  r9   meta__dyn_quant_matmul_4bit  s   

r  c                    s   t  dkdd  t jt jt jt jfv fdd t   dkdd  t  jt ju  fdd j	d 	djdS )	Nr  c                   S   rc   rE  r8   r8   r8   r8   r9   r]     re   z*meta__weight_int8pack_mm.<locals>.<lambda>c                      r   rF  r   r8   r   r8   r9   r]     r   c                   S   rc   r;  r8   r8   r8   r8   r9   r]     re   c                      r   )Nzexpected w to be int8, got r   r8   r=  r8   r9   r]     r   r   r   )
rN   r`   rz   rV   rr  rs  rt  r\  r   r   )rD   ra  q_scalesr8   rH  r9   meta__weight_int8pack_mm  s   


r  c           	         s  t  dkfdd t  dkfdd t ddkfdd t tjdd  t tjdd  t |d	kd
d  t  dv  fdd d}d}jd d }jd d }tt 	||}|
||g |S )Nr  c                         d    dS )Nz1cdist only supports at least 2D tensors, X1 got: r  r   r8   )x1r8   r9   r]     r^   z$meta_cdist_forward.<locals>.<lambda>c                      r  )Nz1cdist only supports at least 2D tensors, X2 got: r  r   r8   )x2r8   r9   r]     r^   r   c                      r  )Nz4X1 and X2 must have the same number of columns. X1: r   z X2: r   r8   )r  r  r8   r9   r]     r  c                   S   rc   )Nz=cdist only supports floating-point dtypes, X1 got: {x1.dtype}r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz=cdist only supports floating-point dtypes, X2 got: {x2.dtype}r8   r8   r8   r8   r9   r]     re   r   c                   S   rc   )Nz)cdist only supports non-negative p valuesr8   r8   r8   r8   r9   r]     re   Nr   r  c                      r  )Nz%possible modes: None, 1, 2, but was: r8   r8   )compute_moder8   r9   r]     r  r  )rN   r`   rz   r   rI   is_float_dtyperV   r   r   broadcast_shapesextendr   )	r  r  r  r  r1r2batch_tensor1batch_tensor2rz  r8   )r  r  r  r9   meta_cdist_forward  s@   









r  c                 C   s   |j d }|j d }|j d }|j d d }|j d d }	tt||	}
|
 }|||g t|
}|dksE|dksE|dksE|dkrJt|S |t|j krV|	|}tj
|tjdS )Nr   r  r   r   )r   r   rN   r  copyr  mathprod
zeros_likerZ  r   r   )r  r  r  r  cdistc1r  r  r  r  rW  tensor1_expand_sizebatch_productr8   r8   r9   meta_cdist_backward  s   



 

r  c	                    s  t  jt jt jfv  fdd t jt jt jfv fdd t tjfdd d}	|rEt |	dkdd  |	d8 }	|	d}
d urzt |t	kdd  t j
dkfd	d t    k fd
d fdddd fdd}tdkr  d}  }|tkr |	d}nR d}nL||
|}|ttfv s|s̈ d}nd}|	}jd }|tkr|rt |dkdd  |d8 }|jd }n| }|
|||fS )Nc                      r   )Nz(expected indices to be long or int, got r   r8   )r   r8   r9   r]     r   z$meta_embedding_bag.<locals>.<lambda>c                      r   )Nz(expected offsets to be long or int, got r   r8   )r  r8   r9   r]     r   c                      r   )Nz/expected weight to be floating point type, got r   r8   )rV  r8   r9   r]     r   r   r   c                   S   rc   Nz1include_last_offset: numBags should be at least 1r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz@embedding_bag: per_sample_weights only supported with mode='sum'r8   r8   r8   r8   r9   r]     re   c                      r  )Nz1expected per_sample_weights to be 1D tensor, got r  r   r8   )per_sample_weightsr8   r9   r]     r  c                      s   d   d    dS )Nz%expected per_sample_weights.numel() (z$ to be the same as indices.numel() (rn   r   r8   )r   r  r8   r9   r]     s   c                    s    | ||o| ddkS Nr   r   r   rI  r	  r(  padding_idx)is_fast_path_index_selectr8   r9   is_fast_path_index_select_scale  s   z;meta_embedding_bag.<locals>.is_fast_path_index_select_scalec                 S   s<   | j tjks| j tjko| ddko|ddko|dk S Nr   r   )rV   rN   rR   rP   r   )rI  r(  r  r8   r8   r9   r  "  s   z5meta_embedding_bag.<locals>.is_fast_path_index_selectc                    s"   |d ur| |||S  | ||S r3   r8   r  )r  r  r8   r9   is_fast_path*  s   z(meta_embedding_bag.<locals>.is_fast_pathcpuc                   S   rc   r  r8   r8   r8   r8   r9   r]   D  re   )rN   r`   rV   r   r  rI   r  r   r   MODE_SUMr   r   r!  MODE_MAX	MODE_MEANr   )rV  r   r  scale_grad_by_freqr8  sparser  include_last_offsetr  num_bagsr(  r  
offset2bagbag_sizemax_indicesfast_path_sumnumBagsr8   )r   r  r  r  r  rV  r9   meta_embedding_bag  st   








r  c                 G   sB   t | ||g|R  \}}}}t|dkr|| }||||fS )Nr  )r  r!  r   r   )rV  r   r  rK   r(  r  r  r  r8   r8   r9   meta_embedding_bag_forward_onlyM  s   r  c                 C   s.   |r|S | j js| j jr| j S |rtjS | j S r3   )rV   r   r  rN   r   )r  rV   promote_int_to_longr8   r8   r9   _get_reduction_dtypeW  s   r  r   c                C   s6   t | |dd}t| j|}t| ||}| j||dS )NT)r  r   )r  rI   r  r   r  r   )r  r  r  rV   rf  rz  r8   r8   r9   meta_nansumd  s   r  c                 C   s$   t | jtt|  }| |S r3   )rI   r  r   r_   r   rz   r   )r  rz  r8   r8   r9   meta_medianm  s   
r  c                 C   sL   t | dkrtd t| j|f}t| ||}| || j|tjdfS )Nr  zmedian CUDA with indices outputr   )	r!  rI   alert_not_deterministicr  r   r  r   rN   r   )r  rz   r  rz  r8   r8   r9   meta_median_mode_dimu  s   
r  c                 C   r  r3   r8   r   r8   r8   r9   meta_logical_not_  r  r  c                    s   t t|  kdd  tD ]\ t dk fdd qt|   }d| t| j fddttD }| |S )Nc                   S   rc   )NzZNumber of dimensions of repeat dims can not be smaller than number of dimensions of tensorr8   r8   r8   r8   r9   r]     re   zmeta_repeat.<locals>.<lambda>r   c                      rY   )Nz"Repeats cannot be negative, found r  r8   r8   )r  repr8   r9   r]     r^   r|  c                    s   g | ]
} | |  qS r8   r8   r  )padded_sizer  r8   r9   rG     r  zmeta_repeat.<locals>.<listcomp>)	rN   r`   r   rz   r  r_   r   r   r   )r   r  num_new_dimensionstarget_sizer8   )r  r  r  r  r9   meta_repeat  s   
r  c                 C   r  r3   r8   r   r8   r8   r9   
meta_zero_  r  r  c                 C   s   t |tjrt| j|j | S r3   )rf   rN   r   rb   r   r   r   r8   r8   r9   meta_binop_inplace  s   r  c                 C   sf   dd }dd }dd }|| r||rt d|| r$||s$t dt|tjr1t| j|j | S )	a*  
    Some checks for inplace ops.
    Checks for promotion rules for some dtypes.
    int.add/sub_(float) and bool.add/sub_(others) are rejected.
    Promoting in these in-place operations would require reallocating
    and copying over elements, hence not allowed.
    Checks for alpha param.
    c                 S       t | trt| jS t | tS r3   )rf   r"   rI   r$  rV   r   rh   r8   r8   r9   is_integeric     

z.meta_binop_inplace_alpha.<locals>.is_integericc                 S   r  r3   )rf   r"   rI   r  rV   r   r  r8   r8   r9   
is_floatic  r  z,meta_binop_inplace_alpha.<locals>.is_floaticc                 S   r  r3   )rf   r"   rI   is_boolean_dtyperV   r   r  r8   r8   r9   is_booleanic  r  z.meta_binop_inplace_alpha.<locals>.is_booleanicz]Promotion of int.add/sub_(float) in in-place ops are not possible due to element size change.z_Promotion of book.add/sub_(others) in in-place ops are not possible due to element size change.)rE  rf   rN   r   rb   r   )r   r   rf  r  r  r  r8   r8   r9   meta_binop_inplace_alpha  s   r  c                 K      t | tjdS Nr@   rM   r   rJ   )r   r   r8   r8   r9   
meta_round  s   r  c                    sl   t tj fdd tt jr&t tj fdd d S t tt fdd d S )Nc                           dj  S )Nz7: Expected input tensor to have an integral dtype. Got r   r8   )r  r   r8   r9   r]     r^   z#shift_dtype_check.<locals>.<lambda>c                      r  )Nz6: Expected shift value to have an integral dtype. Got r   r8   r  r  r8   r9   r]     r^   c                      s     d S )Nz): Expected shift value to be an int. Got r8   r8   r  r8   r9   r]     r  )rN   r`   rI   r$  rV   rf   r   r   r  r   r  r8   r  r9   shift_dtype_check  s   

r  c                 C      t d| | t| |tjdS )Nrshiftr  r  rM   r   rJ   r  r8   r8   r9   meta_rshifts     r  c                 C   r  )Nlshiftr  r  r  r8   r8   r9   meta_lshifts  r  r  c                 C      |  | jS r3   rV  r   r8   r8   r9   	meta_zero     r  c                 C   r  r3   r8   r   r  r8   r8   r9   
meta_fill_  r  r  c                 C   r  r3   r  r  r8   r8   r9   	meta_fill!     
r  c                 C   r  r3   r8   r   r8   r8   r9   
meta_relu_&  r  r  c                 C      t | |tjdS r  r  )r   r   rf  r8   r8   r9   meta__add_relu+     r        ?UUUUUU?c                 C   r  r3   r  r   noiselowerr  r=  r.  r8   r8   r9   meta_rrelu_with_noise3  s   
r  c                 C   s   t | t |fS r3   r  r  r8   r8   r9    meta_rrelu_with_noise_functional;  s   r  c                 C   r  r3   r8   )r   r  r  r=  r.  r8   r8   r9   meta_rrelu_with_noise_B  s   r  c                 C   r  r3   r  r   r   r   
accumulater8   r8   r9   meta_index_putI  r  r  c                 C   s   t | j|j | S r3   rb   r   )r   r  valuer8   r8   r9   meta_masked_fill_N  s   r  c                 C   s    |  |  jt| d}|S r   )r   r   rF  rI   r!   )r   r  r	  masked_scaler8   r8   r9   meta__masked_scaleT  s   r  c                    s@   t |jt jt jfv dd  t  jjk fdd  S )Nc                   S   rc   )NzMask must be bool or uint8r8   r8   r8   r8   r9   r]   _  re   z&meta_masked_scatter_.<locals>.<lambda>c                      r  )NzEmasked_scatter: expected self and source to have same dtypes but got r   r   r8   r   r}  r8   r9   r]   c  s
    )rN   r`   rV   r  r^  )r   r  r}  r8   r  r9   meta_masked_scatter_\  s   
r  c                 C   s*   t | |\} }tj| tjd}t|||S r   )r)   rN   r   r   r  )r   r  r}  r(  r8   r8   r9   meta_masked_scatteri  s   r  c                 C   s
   |  |S r3   r  )r   r  r+  r8   r8   r9   meta_masked_scatter_backwardq  r  r  c                 C   r  r3   r8   r  r8   r8   r9   meta_index_put_v  r  r  c                 C   r  r3   )viewr   r   r8   r8   r9   
meta_alias{  r  r  c           
         s8  t |  dkdd  t | dkdd  |  }|  |d |d |d } d }||ft  d koB d k fdd |rt| jt jkpX| jt jko]|t jk}t || jkpf|d	d  |	|}	n|}	|sd urt  dkd
d  t  kfdd |	S )Nr1   c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   z)common_meta_baddbmm_bmm.<locals>.<lambda>c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r   r  r   c                	      r  r  r8   r8   r  r8   r9   r]     s    c                   S   rc   )Nzfout_dtype only supported for torch.float32 output with float16/bfloat16 inputs or same as input dtypesr8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nzself must be a 3D tensorr8   r8   r8   r8   r9   r]     re   c                      s   d  d   S )Nz*Expected an input tensor shape with shape z but got shape: r   r8   )r  self_baddbmmr8   r9   r]     r   )
rN   r`   rz   r   rV   rs  rt  rr  r   rF  )
r  r  is_bmmr  rZ  r  res_rowsres_colssupported_out_dtyper(  r8   )r  r  r  r  r  r9   common_meta_baddbmm_bmm  s>   


r   c                 C   s   t | |dS )NTr   )r   rc  r8   r8   r9   meta_bmm  r  r  c                 C   s   t | |d|dS )NT)rZ  r  )r   rc  rZ  r8   r8   r9   meta_bmm_dtype  s   r  c                 C   s<   | | }| | }|dkrt |dk t |dk kr|d8 }|S r  )r  )rD   yqr;  r8   r8   r9   div_rtn  s
    r  c                 C   sZ   t | | | ||d   d |r|d nd |d }|r+|d | | | kr+|d8 }|S r  )r  )	inputSize
kernelSizer  r  r   r$  r  
outputSizer8   r8   r9   pooling_output_shape_pad_lr  s*   

	r
  c                    sl   t |dkdd  t dkfdd t d   d d k fdd t| | |S )Nr   c                   S   rc   )Nzstride should not be zeror8   r8   r8   r8   r9   r]     re   z&pooling_output_shape.<locals>.<lambda>c                      r  )Nz'pad must be non-negative, but got pad: r8   r8   padr8   r9   r]     r  r   r  c                      s   d d d  S )NzApad should be at most half of effective kernel size, but got pad=z, kernel_size=z and dilation=r8   r8   r$  r  r  r8   r9   r]     s
   )rN   r`   r
  )r  r  r  r   r$  r  r8   r  r9   r    s   r  c              	      sN     }tdkodkdd  t|dko|dkdd  t|dko+|dkdd   ddko= ddk}|tjkrWt|dkoQ|oQ d	dkd
d  n"t|d	krf ddkrf|pr|dkor|or d	dk fdd td 
kod 	k	
fdd tdkodkfdd d S )Nr   c                   S   rc   )NzCkernel size should be greater than zero, but got kH: {kH}, kW: {kW}r8   r8   r8   r8   r9   r]     re   z$pool2d_shape_check.<locals>.<lambda>c                   S   rc   )Nz>stride should be greater than zero, but got dH: {dH}, dW: {dW}r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz\dilation should be greater than zero, but got dilationH: {dilationH}, dilationW: {dilationW}r8   r8   r8   r8   r9   r]     re   r   r  r  r1   c                   S   rc   )NzExpected 4D (batch mode) tensor expected for input with channels_last layout with optional 0 dim batch size for input, but got: {input.size()}r8   r8   r8   r8   r9   r]     re   c                         d    S )NzYExpected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: r   r8   r  r8   r9   r]     r  c                      s   d d d d  S )NzKpad should be smaller than or equal to half of kernel size, but got padW = z	, padH = z, kW = z, kH = r8   r8   )r  r  r  r  r8   r9   r]     s    c                      s*   d d  d d d d dS NzGiven input size: (rD   z). Calculated output size: (z). Output size is too smallr8   r8   )r  r  r  r  r  r  r8   r9   r]     s    )rz   rN   r`   r   r6  )r  r  r  r  r  r  r  	dilationH	dilationWr  r  r  r  r  r   r   
valid_dimsr8   )r  r  r  r  r  r  r  r  r  r  r  r9   r    sB   

r  r  r  r  r  r  r  r  pTpHpW	dilationTr  r  r  r  r  r  r  r  r  c              
      s  	j }tdkodkodkfdd tdko&dko& dk fdd tdko<dko<dkfdd t|dv 	fdd t|D ]|dkradkraqVt	dk	fd	d qV|rt
kokok
fd
d td kod kod kfdd tdkodkodk
fdd d S )Nr   c                         d d  d S )Nz5kernel size should be greater than zero, but got kT: z, kH: z, kW: r8   r8   )r  r  r  r8   r9   r]   A     z$pool3d_shape_check.<locals>.<lambda>c                      r  )Nz0stride should be greater than zero, but got dT: z, dH: z, dW: r8   r8   )r  r  r  r8   r9   r]   H  s   c                      r  )Nz9dilation should be greater than zero, but got dilationT: z, dilationH: z, dilationW: r8   r8   )r  r  r  r8   r9   r]   N  r  r  c                      r  )Nz/: Expected 4D or 5D tensor for input, but got: r   r8   )r  r  r8   r9   r]   V  r^   r  c                      s     dj  d dS )NzZ: Expected input's non-batch dimensions to have positive length, but input has a shape of z and non-batch dimension z has length zero!)r   r   r8   )r  r  r  r8   r9   r]   _  s
   c                      s*   d d  d d d d dS )Nzinput image (T: r  r  z ) smaller than kernel size (kT:  kH:  kW: rn   r8   r8   )r  r  r  r  r  r  r8   r9   r]   i  s   r  c                      s(   d d d  d d d S )NzHpad should be smaller than or equal to half of kernel size, but got kT: r  r  z padT: z padW: z padH: r8   r8   )r  r  r  r  r  r  r8   r9   r]   q  s   r   c                      s6   d d d  d d d d d dS r  r8   r8   )r  r  r  r  r  r  r  r8   r9   r]   y  s   )r   rN   r`   r   r   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r8   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r9   r  %  sJ   	"r  c                 C   s   | j }t| |||||||	|
|||||||||||| t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | d S )Nr  r1   r  r   r   r  r  )r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r8   r8   r9   max_pool3d_backward_shape_check  s@   r  c                 C   s   | j }t| ||||||||	|
|ddd|||||||d t|||d | t|||d | t|||d | t|||d | d S )Nr   Tr  r1   r  r  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r8   r8   r9   r    s:   r  c                 C   sB  dd }|d|\}}t t|dv dd  t|dkr#||}	}
n|d|\}	}
|d	|\}}|d
|\}}| d}| d}| d}t| }|t jkr^t |  dkdd  n|t jkrpt |  dv dd  nt ddd  t	||||	||}t	||||
||}t
| |||	|
|||||||||| |||fS )Nc                    r  )Nr  c                      r<  )Nzmax_pool2d: r  r8   r8   r  r8   r9   r]     r   zEmax_pool2d_checks_and_compute_shape.<locals>.unpack.<locals>.<lambda>r   r   r  r  r8   r  r9   r    r  z3max_pool2d_checks_and_compute_shape.<locals>.unpackr/  r  c                   S   rc   )NzOmax_pool2d: stride must either be omitted, a single int, or a tuple of two intsr8   r8   r8   r8   r9   r]     re   z5max_pool2d_checks_and_compute_shape.<locals>.<lambda>r   r   r  r$  r  r  r   r  c                   S   rc   )NzMnon-empty 4D (batch mode) tensor expected for input with channels_last layoutr8   r8   r8   r8   r9   r]     re   r]  c                   S   rc   )Nz9non-empty 3D or 4D (batch mode) tensor expected for inputr8   r8   r8   r8   r9   r]   !  re   Fc                   S   rc   )Nz?Unsupport memory format. Supports only ChannelsLast, Contiguousr8   r8   r8   r8   r9   r]   &  re   )rN   r`   r   r   rI   r!   r6  rz   r   r  r  )r  r/  r   r  r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r8   r8   r9   r    sb   		









r  c                    s   t |||||\}tj jk fdd |jfdd}	|	  |	| t}
tjjjj	|
dS )Nc                      r  )NzExpected dtype z  for `gradOutput` but got dtype r   r8   r  r8   r9   r]   V  r   z7meta_max_pool2d_with_indices_backward.<locals>.<lambda>c                    s:   t | d   t | d  t | d  d S )Nr1   r  r   )r  )r  )r  r   r  r  r8   r9   _check_dim_size\  s   z>meta_max_pool2d_with_indices_backward.<locals>._check_dim_sizer  )
r  rN   r`   rV   r   rI   r!   r   r   rw   )r  r   r/  r   r  r$  r  r   r  r  r   r8   )r  r  r   r  r  r   r9   %meta_max_pool2d_with_indices_backwardA  s.   

r  c                 C   s   t | |||||\}}}|  dkr| dnd}	t| }
|  dkr*|||g}n|	|||g}tj|| j| j|
dtj|tj	| j|
dfS r}  )
r  rz   r   rI   r!   rN   r   rV   rw   r   r  r8   r8   r9   meta_max_pool2d_with_indicesm  s2   
r  c           
   	      s  t jdv fdd j}t|d |D ] t  dkd  d  d qt td	kd
d  t t|d	kdd  d}dd|dkr_d}nd}t jjkdd  t jdkfdd d}d}d	 t ||kd t ||kdd  t  d	k fdd t |d d  d kfdd t |d d  d kfdd  dkr|||d |d g}	n	||d |d g}	t j|	jj	dt j|	t j
j	dfS )Nr]  c                      r   )Nz:fractional_max_pool2d: Expected 3D or 4D tensor, but got: r   r8   r   r8   r9   r]     r   z,meta_fractional_max_pool2d.<locals>.<lambda>r1   r   z^fractional_max_pool2d: Expected input to have non-zero  size for non-batch dimenions, but got r  z emptyr  c                   S   rc   )NzNfractional_max_pool2d: kernel_size musteither be a single int or tuple of Intsr8   r8   r8   r8   r9   r]     re   c                   S   rc   )NzOfractional_max_pool2d: output_size must either be a single int or tuple of Intsr8   r8   r8   r8   r9   r]     re   r  r  r   r  r   c                   S   rc   )Nz6Expect _random_samples to have the same dtype as inputr8   r8   r8   r8   r9   r]     re   c                      r   )Nz1Expect _random samples to have 3 dimensions got, r   r8   )random_samplesr8   r9   r]     r   z=Expect _random_samples.size(0) no less then input batch size.c                   S   rc   )Nz<Expect _random_samples.size(1) equals to input channel size.r8   r8   r8   r8   r9   r]     re   c                      r<  )Nz/Expect _random_samples.size(2) equals to 2 got .r8   r8   )r  r8   r9   r]     r   c                         dd  d  S )Nz%fractional_max_pool2d: kernel height r   z' is too large relative to input height r8   r8   )input_heightr/  r8   r9   r]     r   c                      r"  )Nz$fractional_max_pool2d: kernel width r   z& is too large relative to input width r8   r8   )input_widthr/  r8   r9   r]     r   r  )rN   r`   r   r   r   r   rV   rz   r   rw   r   )
r   r/  r  r   r   input_channelsinput_batchr0  cr   r8   )r  r#  r$  r/  r   r   r9   meta_fractional_max_pool2d  s   










r(  c                 C   s  t t|dv dd  |d }t|dkr|n|d }t|dkr$|n|d }t | p2t|dv dd  |s;|n|d }	|sC|nt|dkrK|	n|d }
|sS|nt|dkr[|	n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t | jd
v dd  | jdkr| dnd}| d}| d}| d}| d}t||||	||}t||||
||}t||||||}t| |||||	|
|||||||||||||d | jdkot| t j	k}| jdkr:| 
d}|  o2|jt j	d}||||f}n|||||f}| |}| j|t jd}|r_|jt j	d}|jt j	d}||fS )Nr  c                   S   rc   NzMmax_pool3d: kernel_size must either be a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]     re   z.meta_max_pool3d_with_indices.<locals>.<lambda>r   r   r  c                   S   rc   NzQmax_pool3d: stride must either be omitted, a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]     re   c                   S   rc   NzImax_pool3d: padding must either be a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]     re   c                   S   rc   NzJmax_pool3d: dilation must be either a single int, or a tuple of three intsr8   r8   r8   r8   r9   r]     re   r  c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r  r~  r  r  r   zmax_pool3d_with_indices()r  r   r   )rN   r`   r   r   r   r  r  rI   r!   rO  r   r   r   r   rF  )r  r/  r   r  r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r6  input_channels_last_checkr   r   r   r8   r8   r9   meta_max_pool3d_with_indices  s   

  







r/  c                 C   s^  t t|dv dd  |d }t|dkr|n|d }	t|dkr$|n|d }
t | p2t|dv dd  |s;|n|d }|sC|	nt|dkrK|n|d }|sS|
nt|dkr[|n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t |jd
v dd  |d}|d}|d}|d}| d}| d}| d}t|| ||||	|
|||||||||||||||d |jdkot|t jk}|jdkr|	d}|
  o|j
t jd}||j}|r-|jt jd}|S )Nr  c                   S   rc   r)  r8   r8   r8   r8   r9   r]   `  re   z7meta_max_pool3d_with_indices_backward.<locals>.<lambda>r   r   r  c                   S   rc   r*  r8   r8   r8   r8   r9   r]   h  re   c                   S   rc   r+  r8   r8   r8   r8   r9   r]   p  re   c                   S   rc   r,  r8   r8   r8   r8   r9   r]   x  re   r  c                   S   rc   r  r8   r8   r8   r8   r9   r]     re   r~  r  r  r   z"max_pool3d_with_indices_backward()r  r  r   )rN   r`   r   r   r   r  rI   r!   rO  r   r   r   r   rF  )r  r  r/  r   r  r$  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r6  r.  r  r8   r8   r9   %meta_max_pool3d_with_indices_backwardR  s   
  









r0  gridc                    s   t j jk fdd t jt jko jt jk fdd t jd  jd k fdd t  jd jd k fdd tdjD ]t j dkfd	d qPd S )
Nc                      r  )NzNgrid_sampler(): expected input and grid to be on same device, but input is on z and grid is on r  r8   r1  r  r8   r9   r]     r  z+check_grid_sampler_common.<locals>.<lambda>c                      r  )NzTgrid_sampler(): expected input and grid to have torch.strided layout, but input has z and grid has )rv   r8   r2  r8   r9   r]     r  r   c                      r  )NzZgrid_sampler(): expected grid and input to have same batch size, but got input with sizes  and grid with sizes r   r8   r2  r8   r9   r]     r  r   r  c                      s   dj d  d j S )Nz+grid_sampler(): expected grid to have size r  z, in last dimension, but got grid with sizes )r   r   r8   r2  r8   r9   r]     s   c                      r  )NzYgrid_sampler(): expected input to have non-empty spatial dimensions, but input has sizes r  r  r   r8   r  r8   r9   r]     r  )rN   r`   rw   rv   r{  r   r   r   )r  r1  r8   )r1  r  r  r9   check_grid_sampler_common  s,   
r4  c                   @   s   e Zd ZdZdZdZdS )GridSamplerInterpolationr   r   r  N)rp   
__module____qualname__BILINEARNEARESTBICUBICr8   r8   r8   r9   r5    s    r5  interpolation_modec                    sP   t jdkoj jk fdd t jdko |tjjk dd  d S )Nr  c                      r  )Nzdgrid_sampler(): expected 5D input and grid with same number of dimensions, but got input with sizes r3  r   r8   r2  r8   r9   r]     s
   z'check_grid_sampler_3d.<locals>.<lambda>c                   S   rc   )Nz<grid_sampler(): bicubic interpolation only supports 4D inputr8   r8   r8   r8   r9   r]     re   )rN   r`   r   r5  r:  r  )r  r1  r;  r8   r2  r9   check_grid_sampler_3d  s   

r<  c           
      C   s:   |d }|rt j|t jd}nd }t j|t jd}	||	fS Nr   r   )rN   r  r   r   
r  r  r1  r;  padding_modealign_cornersr  input_requires_gradr  	grad_gridr8   r8   r9   grid_sampler_2d_backward_meta  s   
rC  c           
      C   s\   t | | t| || | jd }| jd }|jd }|jd }|jd }	| |||||	fS )Nr   r   r  r1   )r4  r<  r   r   )
r  r1  r;  r?  r@  r  Cout_Dout_Hout_Wr8   r8   r9   grid_sampler_3d  s   
	




rH  rB  c           
      C   sP   t || t||| |d }|rtj|tjd}nd }tj|tjd}	||	fS r=  )r4  r<  rN   r  r  r   r>  r8   r8   r9   grid_sampler_3d_backward  s   
rI  c                 O   s:   | dd }|st|}||d< tj| g|R i |S )NrV   )rU   rI   	get_dtyperN   r   )r   r  rK   r   rV   r8   r8   r9   full7  s
   
rK  c                 C   s   |t jkrJt |d u dd  t jd|d u r| jn|||d u r"| jn||d}| jr8||  | 	 | 
  n||  |  d |d |S tjj| |||||d}|d |S )Nc                   S   rc   )Nz9memory format option is only supported by strided tensorsr8   r8   r8   r8   r9   r]   M  re   zzeros_like.<locals>.<lambda>r   r2  Tr  )rN   
sparse_coor`   r   rV   rw   	is_sparsesparse_resize_and_clear_r   
sparse_dim	dense_dimrz   _coalesced_r.   r   rH  fill_)r   rV   rv   rw   rx   r   ro  r8   r8   r9   r  A  s:   
	

	r  ru   c                C   B   |d u rt  }|d u rt  }|d u rt j}t j| ||||dS r4  rN   r|   get_default_devicer{  r   r   rV   rv   rw   rx   ry   r8   r8   r9   	meta_onesn     
rW  c                C   rS  r4  rT  rV  r8   r8   r9   
meta_zeros  rX  rY  c           	         s   ddl m}  }t|dkdd   dkr n |   }t| |kp1||k  fdd dkrAn| t }t } |    }| = | = 	|||S )Nr   r   c                   S   rc   )Nz-select() cannot be applied to a 0-dim tensor.r8   r8   r8   r8   r9   r]     re   zmeta_select.<locals>.<lambda>c                      s   d d   d  S )Nzselect(): index z! out of range for tensor of size z at dimension r   r8   rz   r   r   r8   r9   r]     s
    )
r   r  rz   rN   r   r   r   r   r   r   )	r   rz   r   r  r   r   new_sizer   new_storage_offsetr8   rZ  r9   meta_select  s(   
r]  c                 C   r  r3   rI   clone_preserve_strides)r   rI  rz   r   r8   r8   r9   meta_select_scatter  r  r`  c                 C   r  r3   r^  )r   rI  rz   rr   rq   stepr8   r8   r9   meta_slice_scatter  r  rb  dim_post_exprwrap_scalarc                 C   sb   |dkr
|sJ d}| }|d }| |k s| |kr'J d|  d| d| d| dk r/| |7 } | S )Nr   r   zdim z out of bounds (rm   rn   r8   )rz   rc  rd  r"  r   r8   r8   r9   r     s   ,r   c                 C   s   |   dkrdS | j| S r  r  )r  rz   r8   r8   r9   ensure_nonempty_size  s   re  c                    st   t  d}t  d}t||kdd  t|D ] kr7tttk fdd qd S )Nr   c                   S   rc   )NzDIndex tensor must have the same number of dimensions as input tensorr8   r8   r8   r8   r9   r]     re   z$gather_shape_check.<locals>.<lambda>c                      s$   d dj  dj  d   S )Nz!Size does not match at dimension z expected index  to be no larger than self  apart from dimension r   r8   rz   r  r   r   r8   r9   r]     s    )r   rz   rN   r`   r   re  )r   rz   r   	self_dims
index_dimsr8   rh  r9   gather_shape_check  s   rk  c                    sn   ddl m} t||  }|  dk}|s1t jtjkp$ jtj	k fdd t
| |  |  jS )Nr   r   c                      r   )Nz8gather(): Expected dtype int32/int64 for index, but got r   r8   r   r8   r9   r]     r   zmeta_gather.<locals>.<lambda>)r   r  r   rz   r   rN   r`   rV   r   r  rk  r   r   )r   rz   r   sparse_gradr  wrapped_dimis_index_emptyr8   r   r9   meta_gather  s   
ro  c                 C   s   |r*| dkrdS | dkrdS | dkrdS | dkrdS | d	kr d
S t ddd  d S | dkr0dS | dkr6dS t ddd  d S )Nri  
REDUCE_ADDr  REDUCE_MULTIPLYmeanREDUCE_MEANamaxREDUCE_MAXIMUMaminREDUCE_MINIMUMFc                   S   rc   )Nz=reduce argument must be either sum, prod, mean, amax or amin.r8   r8   r8   r8   r9   r]     re   z#get_operator_enum.<locals>.<lambda>addmultiplyc                   S   rc   )Nz/reduce argument must be either add or multiply.r8   r8   r8   r8   r9   r]     re   r>  )reduce_use_new_optionsr8   r8   r9   get_operator_enum  s,   r|  c                    sp   ddl m} || dkr"t|jtjkp|jtjk fdd |d ur6t|j|jk fdd d S d S )Nr   r   c                      
     dS )Nz((): Expected dtype int32/int64 for indexr8   r8   method_namer8   r9   r]     r  z,scatter_gather_dtype_check.<locals>.<lambda>c                      r}  )Nz0(): Expected self.dtype to be equal to src.dtyper8   r8   r~  r8   r9   r]     r  )r   r  r   rN   r`   rV   r   r  )r  r   r   src_optr  r8   r~  r9   scatter_gather_dtype_check  s   


r  c                 C   s
   t | dS rM  )r   r   r8   r8   r9   ensure_nonempty_dim"  s   
r  c           	         s0  ddl m} | dkrd S tt t kdd  d}t }t|D ]}t|}| kr:q.|t|krEd} nq.|scd urct|D ]}t|}|t|krbd} nqPd urtt t kdd  t|  fdd d S t|  fd	d d S )
Nr   r   c                   S   rc   NzCIndex tensor must have the same number of dimensions as self tensorr8   r8   r8   r8   r9   r]   .  re   z%scatter_shape_check.<locals>.<lambda>FTc                   S   rc   r  r8   r8   r8   r8   r9   r]   H  re   c                      s&   dj  dj  d  dj   S )NExpected index rf  rg  z and to be no larger than src r   r8   rz   r   r   r  r8   r9   r]   L  s    c                      s   dj  dj  d   S )Nr  rf  rg  r   r8   rZ  r8   r9   r]   R  s    )	r   r  r   rN   r`   r  rz   r   re  )	r   rz   r   r  r  is_wrong_shaperi  r  index_d_sizer8   r  r9   scatter_shape_check'  sJ   

r  c                 C   sD   t ||  }td| || t| ||| |d ur t|| d S d S )Nscatter)r   rz   r  r  r|  )r   rz   r   rI  rz  r{  rm  r8   r8   r9   scatter_meta_implX  s   r  c                 C   s   t | |||d | | jS Nrx  r  r   r   r   rz   r   rI  r8   r8   r9   meta_scatter_adda  s   r  c                 C   s   t | |||d | S r  r  r  r8   r8   r9   meta_scatter_add_g  r  r  c                 C   s0   t |tjr|nd }t| |||| | | jS r3   )rf   rN   r   r  r   r   r   rz   r   src_or_valuer   rI  r8   r8   r9   meta_scatterm  s   
r  c                 C   s(   t |tjr|nd }t| |||| | S r3   )rf   rN   r   r  r  r8   r8   r9   meta_scatter_|  s   	r          queryr   r  	dropout_p	is_causalreturn_debug_maskr	  c              	   C   sJ  |  d}|  d}|  d}	|  d}
| d}| dd}t|dd}tj|||	ftj| jd}|rb|
dkr=dnd}t|	| }|dkrMd}n|dkrSd}tj|||	|f| j	| jd}n
tjd| j	| jd}tj
jrtj rtjd	tjd
d}tjd	tjd
d}ntjdtjd
d}tjd	tjd
d}||d d |	||||f	S )Nr   r   r  r1   r  @         r8   rt   )r   r  rN   r   r   rR   rw   r  ceilrV   versionhipr  ru  r   r  )r  r   r  r  r  r  r	  r
  	num_headsmax_seqlen_batch_qhead_dimmax_seqlen_batch_kquery_t	attention	logsumexpblocksize_cmax_seqlen_k
debug_maskseedoffsetr8   r8   r9   (meta__scaled_dot_product_flash_attention  sP   






r  	res_shape.c                    s   t jkrdd}t|dd}|S tg dfdddd fdd	 D } fd
d	tt D }tj|j	j
d|}|S )Nr   r  )r   r   r  r1   c                    s      |  S r3   r  )idx)r  r8   r9   r]     r   z,alloc_with_matching_layout.<locals>.<lambda>Tr   c                    s   g | ]} | qS r8   r8   )rC   r  )r  r8   r9   rG     r   z.alloc_with_matching_layout.<locals>.<listcomp>c                    s   g | ]}  |qS r8   r   r  )	dim_orderr8   r9   rG     rH   r  )r_   r   r  rN   r   sortedr   r   r   rV   rw   r   )r  r  r  ro  permuted_shapefinal_permuter8   )r  r  r  r9   alloc_with_matching_layout  s   
r  	attn_biascompute_log_sumexpc	              	   C   s   |  d}	|  d}
|  d}| d}| d}|	|
||f}t| |}tj|	|
|ftj| jd}tjdtjdd}tjdtjdd}||d d ||||d f	S Nr   r   r  r   r  r8   rt   r   r  rN   r   rR   rw   r   )r  r   r  r  r  r  r  r  r	  r  r  S_QS_KVD_Vr  ro  
logsum_expr  r  r8   r8   r9   (meta__scaled_dot_product_cudnn_attention  s0   





r  c              	   C   s   |  d}|  d}	|  d}
| d}| d}||	|
|f}t| |}tj||	|
ftj| jd}tjdtjdd}tjdtjdd}||d d |
|||d f	S r  r  )r  r   r  r  r  r  r  r	  r  H_Qr  r  r  r  ro  r  r  r  r8   r8   r9   5meta__scaled_dot_product_fused_attention_overrideable  s0   





r  r  r  	cum_seq_q	cum_seq_kmax_qmax_kphilox_seedphilox_offsetc                 C   sX   t |dddd}t |dddd}t |dddd}|||fS r  )rN   r   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r	  grad_qgrad_kgrad_vr8   r8   r9   'meta__scaled_dot_product_flash_backward6  s   
r  	attn_maskc                 C   sR   |  d}|  d}|  d}	t| }
tj||	|ftj| jddd}|
|fS )Nr   r   r  r  )r   rN   r   r   rR   rw   r  )r  r   r  r  r  r  r	  r
  r  r  r  r  r8   r8   r9   0meta__scaled_dot_product_flash_attention_for_cpuR  s"   




r  c
                 C   s   | d}
| d}| d}| d}| d}tj|
|||fd|j|jd}tj|
|||fd|j|jd}tj|
|||fd|j|jd}|||fS )Nr   r   r1   r  r   r  r   r1   r  )r   rN   empty_permutedrV   rw   )r  r  r   r  r   r  r  r  r  r	  r
  r  r  len_qlen_kr  r  r  r8   r8   r9   9meta__scaled_dot_product_flash_attention_for_cpu_backwardt  s0   








r  c                 C   s   |  dd} | dd}| dd}| d}| d}	| d}
|d}tj||	|
|| j| jd}tjjrDtj	 rD	 |rA|	nd}n|rOt
|	d d nd}tj||
|ftj| jd}| dd}tjdtjd	d}tjdtjd	d}||||fS )
Nr   r  r   r  r   r  r?  r8   rt   )r  r   rN   r   rV   rw   r  r  r  ru  r  r  rR   r   )r  r   r  r  r  r  r  r	  r  ry  r  Kvro  logsumexp_dimr  r  r  r8   r8   r9   ,meta__scaled_dot_product_efficient_attention  s*   



r  grad_input_maskc                 C   s  | d}| d}| d}| d}| d}| d}tj||||fd|j|jd}tj||||fd|j|jd}tj||||fd|j|jd}d }|d ur|
d r| d}|d dkrb|n|d |d  }t|  }||d< tj||j|jd}|d	d |f }||||fS )
Nr   r   r  r1   r  r  r   rq  .)r   rN   r  rV   rw   r   r   )r  r  r   r  r  r   r  r  r  r  r  r  r	  r
  r  r  r  
head_dim_vr  r  r  r  	grad_biaslastDimlastDimAligned	new_sizesr8   r8   r9   +meta__scaled_dot_product_efficient_backward  sF   









 
r  c                 C   s(   t |}t |}t |}|||fS r3   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r8   r8   r9   'meta__scaled_dot_product_cudnn_backward  s   



r  window_size_leftwindow_size_right	seqused_kalibi_slopesc                 C   s  |d u r	|  dn| d }|d u r|  dn|}|d u r#| dn|}|  d}|  d}t| }|d u rFtj|||ftj| jd}n|  d}tj||ftj| jd}|	r|dkr_dnd}t|| }|dkrod}n|dkrud}tj||||f| j	| jd}n
tjd| j	| jd}d	\}}tj
jrtj rtjd
tjdd}tjd
tjdd}ntjdtjdd}tjd
tjdd}|||||fS )Nr   r   r  r   r  r  r  r  NNr8   rt   r  )r   r   rN   r   r   rR   rw   r  r  rV   r  r  r  ru  r   r  )r  r   r  r  r  r  r  r  r  r  r	  r  r  r  r  r
  r  r  r  r  r  r  total_qr  r  r  r  r  r8   r8   r9   meta__flash_attention_forward)  sR   




r  c                 C   s(   t |}t |}t |}|||fS r3   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r	  r  r  
grad_querygrad_key
grad_valuer8   r8   r9   meta__flash_attention_backwardy  s   



r  cu_seqlens_qcu_seqlens_kmax_seqlen_qr  custom_mask_typecausal_diagonalseqlen_kwindow_sizec                 C   s   |  d}|  d}| d}|  d}| d}tj||||| j| jd}|d ur1| dd n|}|}|d urA|d us?J |}|d urG|n|}|
rTt|d d nd}tj|||ftj| jd}tjdtjdd}tjdtjdd}||||||fS )	Nr   r   r  r   r  r?  r8   rt   )	r   rN   r   rV   rw   r  r  rR   r   )r  r   r  rX  r  r  r  r  r  r  r  r	  r  r  r  r  ry  r  r  r  ro  logsumexp_batch_dimactual_max_seqlen_qactual_max_seqlen_kr  r  r  r  r8   r8   r9   !meta__efficient_attention_forward  s,   




r  bias_requires_gradnum_splits_keyshared_storage_dqdkdvc                 C   sL  |rSt |jd |jd kdd  t |jd |jd kdd  t jg |jdd d|jd |jd R |j|jd	}|d
d}|d
d}|d
d}nt |}t |}t |}|d ur|d}|d dkrs|n|d |d  }t	| }||d< t j||j|jd	}|dd |f }nt jd|jd}||||fS )Nr   c                   S   rc   )Nz,seqlen must match for `shared_storage_dqdkdvr8   r8   r8   r8   r9   r]     re   z4meta__efficient_attention_backward.<locals>.<lambda>r1   c                   S   rc   )Nz3embedding dim must match for `shared_storage_dqdkdvr8   r8   r8   r8   r9   r]     re   r   r  r   r  r  r  rq  .r8   r  )
rN   r`   r   r   rV   rw   r  r   r   r   )r  r  r   r  rX  r  r  r  r  r  r  r  r  r  r  r	  r  r  chunkr  r  r  r  r  r  r  r8   r8   r9   "meta__efficient_attention_backward  s:   *



 r  scale_ascale_bscale_resultuse_fast_accumc                    sl  dd }t  dko dkfdd t |jo$|jfdd tdkrdd	 }	d
d }
dd }t |	 pJ|fdd t |
 p\|fdd t dd dkfdd t dd dkodd dkfdd j\}djt jkojt jkpjt j	kojt j	k}
 dkrψ
 dkrt jt jkoɈjt jkdd  n|r6jt j	krd}|d }nd}d}dd }|||}||dd }||| |  ||| | 
  kr(
 kr(t  dd  t  dd  ntt d fdd nft jt jkoDjt jkdd  t  dkoW dkfd d dkrddkrddkrdkrt  o d!d  nt dfd"d |d ur|nj}t jdd|jd#S )$Nc                 S   s   | t jt jt jt jt jfv S r3   )rN   ru  float8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzfloat4_e2m1fn_x2r   r8   r8   r9   is_fp8_or_fp4_type  s   z*meta_scaled_mm.<locals>.is_fp8_or_fp4_typer  c                      s   d   d    S )Nz%Inputs must be 2D but got self.dim()=z and mat2.dim()=r   r8   rc  r   r8   r9   r]     r  z meta_scaled_mm.<locals>.<lambda>c                      r  )Nz?Expected both inputs to be fp8 or fp4 types but got self.dtype=z and mat2.dtype=r   r8   r  r8   r9   r]   "  r   r  c                 S   s   | d | d ko| d dkS r  r8   r  r8   r8   r9   is_row_major'     z$meta_scaled_mm.<locals>.is_row_majorc                 S   s   | d dko| d dkS r  r8   r  r8   r8   r9   is_col_major*  r  z$meta_scaled_mm.<locals>.is_col_majorc                 S   s   |  ddkp|  ddkS r  r   )	tensor_2dr8   r8   r9   has_zero_dim-  r  z$meta_scaled_mm.<locals>.has_zero_dimc                      r  )Nz#self must be row_major, got stride r  r8   r   r8   r9   r]   2  r  c                      r  )Nz#mat2 must be col_major, got stride r  r8   rc  r8   r9   r]   6  r  r   rq  r   c                      s   d  d S )NzBExpected self.size(1) to be divisible by 16, but got self.size(1)=r   r   r8   r   r8   r9   r]   :  r^   c                      r   )Nz>Expected both dimensions of mat2 to be divisble by 16 but got r   r8   r  r8   r9   r]   >  r   c                   S   rc   )NzNFor tensorwise scaling, both scale_a and scale_b must be float (fp32) tensors.r8   r8   r8   r8   r9   r]   R  re   r?  r  c                 S   s   | | d | S rM  r8   r  r8   r8   r9   ceil_divb  r;   z meta_scaled_mm.<locals>.ceil_divr  c                   S   rc   )Nzscale_a must be contiguousr8   r8   r8   r8   r9   r]   u  re   c                   S   rc   )Nzscale_b must be contiguousr8   r8   r8   r8   r9   r]   y  re   Fc                	      s&   d  d   d d   d	S )NzTInvalid blockwise scaling configuration. For blockwise scaling, scale_a should have  elements, got z, scale_b should have r!  r  r8   )expected_a_sizeexpected_b_sizer  r  r8   r9   r]   ~  s   c                   S   rc   )NzKFor rowwise scaling, both scale_a and scale_b must be float (fp32) tensors.r8   r8   r8   r8   r9   r]     re   c                      s   d   d  S )NzLFor non-tensorwise scaling, scale tensors must be 2D, but got scale_a.dim()=z and scale_b.dim()=r   r8   )r  r  r8   r9   r]     r  c                   S   rc   )Nz@Both scale_a and scale_b must be contiguous for rowwise scaling.r8   r8   r8   r8   r9   r]     re   c                      sB   d  d d d d d d d d d dS )	Nz}Invalid scaling configuration. For tensorwise scaling, both scales should be scalar. For rowwise scaling, scale_a should be (z, 1), scale_b should be (1, z). Got scale_a.size()=(r   rm   r   z) and scale_b.size()=(rn   r   r8   )ry  r0  r  r  r8   r9   r]     s   r  )rN   r`   rz   rV   r!  r   r   r   float8_e8m0fnuru  r   rr  r   r   rw   )r   rc  r  r  rX  r  rZ  r  r  r   r  r  _kis_blockwise_scalingblock_size_kblock_size_mnr  num_k_blockspadded_num_k_blocks
_out_dtyper8   )r  r	  ry  rc  r0  r  r  r   r9   meta_scaled_mm  s   	


"






	 r  c                 C   s    t | ||||dd | | jS NT)r{  r  r   rz   r   rI  r   r|  r8   r8   r9   meta_scatter_reduce_two  s   r  c                 C   s   t | ||||dd | S r  r  r  r8   r8   r9   meta_scatter_reduce__two  s   r  c                   sh   t d    k odkn   fdd   dkr&t j|t j jdS t j d|t j jdS )Nr   r  c                      r  )Nz@The probabilty distributions dimensions must be 1 or 2, but got r   r8   r  r8   r9   r]     r  z"meta_multinomial.<locals>.<lambda>r   r  )rN   r`   rz   r   r   rw   r   )r  num_samplesreplacementr.  r8   r  r9   meta_multinomial  s   
r  c                 C   s   d}| D ]}||9 }q|S rM  r8   )vsr;  vr8   r8   r9   multiply_integers  s   
r  c                    s   t tkfdd d  t t k fdd t tdd dd  D o9tdd D fdd d d \}}||gR S )Nc                         d  dt  S )Nz%It is expected output_size equals to , but got size r  r8   )num_spatial_dimsr  r8   r9   r]     r   z'upsample_common_check.<locals>.<lambda>r  c                      r  )Nz$It is expected input_size equals to r  r  r8   )expected_input_dimsr  r8   r9   r]     r   c                 s   r+  r,  r8   r   r8   r8   r9   ri     rb  z(upsample_common_check.<locals>.<genexpr>c                      rk   )NzDInput and output sizes should be greater than 0, but got input size z and output size r8   r8   )r  r  r8   r9   r]     s
    )rN   r`   r   rf  )r  r  r  r  channelsr8   )r   r  r  r  r9   upsample_common_check  s   

*r"  c                    sZ   t   dkpt  dd   fdd t  |dd} |jt	 dS )Nr   r   c                      r  )Nz>Non-empty 3D data tensor expected but got a tensor with sizes r   r8   r  r8   r9   r]     r  z$upsample_nearest1d.<locals>.<lambda>r  r   
rN   r`   r   r  r   r"  r   rF  rI   r!   )r  r  scalesfull_output_sizer8   r  r9   upsample_nearest1d     


r'  c           	         s   t   dkpt  dd   fdd t  |dd} |}t } j	\}}}} j
jdkr?|dk r?t j}|j|d	}|S )
Nr   r   c                      r  Nz>Non-empty 4D data tensor expected but got a tensor with sizes r   r8   r  r8   r9   r]     r  z$upsample_nearest2d.<locals>.<lambda>r  r#  r  r  r   )rN   r`   r   r  r   r"  r   rI   r!   r   rw   ro   r   
contiguous)	r  r  scales_hscales_wr&  r(  r   rL   
n_channelsr8   r  r9   upsample_nearest2d  s   



r.  r  r  r+  r,  c                    st   t ||dd tjdkfdd tdD ]t  k fdd q|jt	dS )Nr  r#  r  c                      r   NzFExpected grad_output to be a tensor of dimension 4 but got: dimension r   r8   r  r8   r9   r]     r   z-upsample_nearest2d_backward.<locals>.<lambda>c                
      &   d d   d d  S )NzCExpected grad_output to have the same shape as output; output.size() = z but got grad_output.size(r   r8   r&  r  r  r8   r9   r]   #  s   r   )
r"  rN   r`   r   r   r   r   rF  rI   r!   )r  r  r  r+  r,  r8   r2  r9   upsample_nearest2d_backward  s   

	r3  c                    sZ   t   dkpt  dd   fdd t  |dd} |jt	 dS )Nr   r   c                      r  )Nz>Non-empty 5D data tensor expected but got a tensor with sizes r   r8   r  r8   r9   r]   5  r  z$upsample_nearest3d.<locals>.<lambda>r1   r#  r   r$  )r  r  scales_dr+  r,  r&  r8   r  r9   upsample_nearest3d/  r(  r5  c           
      C   s   t | t j| t jd}}|d urQ|d urQt|tsJ t|ts$J |j}| }	t||}t||}|||	 |||	 t	||d t	||d ||fS ||fS )Nr   )rc  rd  )
rN   r   r   rf   r"   r   r   r$   r   r&   )
r   stablerz   
descendingr   r   r  r  r   
out_strider8   r8   r9   	meta_sort?  s   	

r9  c                    s  t jdkfdd t jjkfdd dd urPt jdkfdd t  kfdd t jjkfdd t jdkfd	d d
   t   k fdd t tfddfD dd  d S )Nr  c                          j  dS Nz != 2r   r8   input_gatesr8   r9   r]   b  r   z%rnn_cell_checkSizes.<locals>.<lambda>c                         j  d j  S N != r   r8   )hidden_gatesr=  r8   r9   r]   e  r  r   c                      r:  )Nz != 1r   r8   )
input_biasr8   r9   r]   i  r   c                      s      d  S r?  r  r8   )
gates_sizerB  r8   r9   r]   l  r  c                      r>  r?  r   r8   )hidden_biasrB  r8   r9   r]   p  r  c                      r:  r;  r   r8   )prev_hiddenr8   r9   r]   r  r   r   c                
      s,      dd d d d  d
S )Nr@  r   z * z // z (aka rn   )r   r   r8   )expected_prev_hidden_numelfactorrC  r=  rE  r8   r9   r]   v  s   , c                 3   s    | ]	}|j  j kV  qd S r3   r  rB   r<  r8   r9   ri   y  s
    

z&rnn_cell_checkSizes.<locals>.<genexpr>c                   S   rc   )Nz%expected all inputs to be same devicer8   r8   r8   r8   r9   r]   }  re   )rN   r`   r   r   r   r   rf  )r=  rA  rB  rD  rG  rE  r8   )rF  rG  rC  rD  rA  rB  r=  rE  r9   rnn_cell_checkSizesZ  s8   





rH  c                 C   sL   t | |||d| tj| tjd}tj|tjd}tj|tjd}|||fS )Nr  r   )rH  rN   r   r   )r=  rA  cxrB  rD  	workspacehycyr8   r8   r9   _thnn_fused_lstm_cell_meta  s
   
rM  c                 C   s(  t |dk}|rt |}|d }| jd }n|
r| jd n| jd }|
r)| jd n| jd }d}|r4dnd}|dkr<|n|}|rG||| g}n|
rP|||| gn|||| g}| |}|	| ||g}|d u rptjd| jd}n||}||	| ||g}|rdnd}| j|tjd}|||||fS )Nr   r   r   r  r  r   )r   r   r   rN   r   rw   r^  )r  rV  weight_stride0
weight_bufhxrI  r8  hidden_size	proj_size
num_layersbatch_firstdropouttrainbidirectionalbatch_sizesdropout_stateis_input_packed
seq_length
mini_batchbatch_sizes_sumnum_directionsout_sizer   r(  
cell_shaperL  rK  reserve_shapereserver8   r8   r9   
_cudnn_rnn  s2   

rc  c                 C   s   |r| j d n| j d }|r| j d n| j d }|
}|r!|||gn|||g}| |}|d u r8tjd| jd}n||j }|d u rKtjd| jd}n||j }tjd| jtjd}||||fS )Nr   r   r  r   )r   r   rN   r   rw   r^  )r  w0w1w2w3hx_cx_r   rX  r8  rQ  rS  
has_biasesrW  rT  rV  r[  r\  output_chanelsr   r(  rK  rL  rJ  r8   r8   r9   mkldnn_rnn_layer  s    
rl  c                    sT   | j dkrt dkp dk fdd d S t|  dk fdd d S )Nr   r   c                      r  )Nz4: Expected reduction dim -1 or 0 for scalar but got r8   r8   rz   r  r8   r9   r]     r  z'zero_numel_check_dims.<locals>.<lambda>c                      r  )Nz: Expected reduction dim z to have non-zero size.r8   r8   rm  r8   r9   r]     r^   )r   rN   r   r   )r   rz   r  r8   rm  r9   zero_numel_check_dims  s   
rn  c                    sF   |d urt || }t||  d S t| dk fdd d S )Nr   c                      r}  )Nz@: Expected reduction dim to be specified for input.numel() == 0.r8   r8   r  r8   r9   r]     r  z%check_argmax_argmin.<locals>.<lambda>)r   rz   rn  rN   r`   r   )r  r   rz   r8   r  r9   check_argmax_argmin  s   

ro  c                 C   sD   t d| | t| j|d ur|fnd }t| ||}| j|tjdS )Nargmaxr   )ro  rI   r  r   r  r   rN   r   )r   rz   r  r  r   r8   r8   r9   argmax_argmin_meta	  s   rq  c                 C   s$   |t jkrt j}t jd||||dS )Nr8   r2  )rN   jaggedr{  r   )r   rV   rv   rw   rx   r8   r8   r9   scalar_tensor  s
   

rs  c                 C   s   t ||  dd}|  dkrdn| |}t| t||kdd  t| j}t|dkr4|||< | 	|| j	|tj
dfS )NTrd  r   r   c                   S   rc   )Nzk not in range for dimensionr8   r8   r8   r8   r9   r]   #  re   ztopk_meta.<locals>.<lambda>r   )r   rz   r   rN   r  r`   r   r   r   r   r   )r   rx  rz   largestr  	sliceSizetopKSizer8   r8   r9   	topk_meta  s   

rx  c           
      C   s@   |d us|d usJ d|  }|   }	tj||	j|	j|	jdS )Nz;segment_reduce(): Either lengths or offsets must be defined)rV   rw   rv   )r*  rN   r   rV   rw   rv   )
r  r(  r  r   r  r  r  r  data_contiggrad_contigr8   r8   r9   meta__segment_reduce_backward+  s   r{  c                    s   ddl m} t |  dd |  dkr|  nd}t||dk||k fdd t| jd   | j d d   }|rM|  dkrM|	 d | 
|| j
|tjdfS )	Nr   )sym_andTrt  r   c                      r  )Nz9kthvalue(): selected number k out of range for dimension r8   r8   r   r8   r9   r]   F  r  zkthvalue_meta.<locals>.<lambda>r   )r   r|  r   rz   r   rN   r`   r   r   rN  r   r   )r   rx  rz   r  r|  dimSizer   r8   r   r9   kthvalue_meta=  s   
$r~  c                 C   s   | d ur| n|}t | dkdd  | }| d ur(t |  |kdd  |d ur8t | |kdd  t | |kdd  t | |kdd  t | dkdd  t | |d	 |d
  d kdd  d S )Nr  c                   S   rc   N r8   r8   r8   r8   r9   r]   U  re   z(checkLSTMBackwardSizes.<locals>.<lambda>c                   S   rc   r  r8   r8   r8   r8   r9   r]   X  re   c                   S   rc   r  r8   r8   r8   r8   r9   r]   Z  re   c                   S   rc   r  r8   r8   r8   r8   r9   r]   [  re   c                   S   rc   r  r8   r8   r8   r8   r9   r]   \  re   c                   S   rc   r  r8   r8   r8   r8   r9   r]   ]  re   r   r   r  c                   S   rc   r  r8   r8   r8   r8   r9   r]   ^  re   )rN   r`   rz   r   r   )grad_hygrad_cyrI  rL  rJ  defined_gradexp_sizer8   r8   r9   checkLSTMBackwardSizesS  s   ,r  c           	      C   s`   | d u r
|d u r
dS t | |||| tj|td}tj|td}|r)|jdddnd }|||fS )NNNNr   r   F)r  )r  rN   r   legacy_contiguous_memory_formatri  )	r  r  rI  rL  rJ  has_bias
grad_gatesgrad_cxr  r8   r8   r9   #_thnn_fused_lstm_cell_backward_implb  s   
r  c                 C   sf   d }d }d }|d r| |  }|d s|d r.| |d| df}| |d}|||fS )Nr   r   r  r   r  )r  r  r  r  r  grad_weightr  r8   r8   r9   linear_backwardp  s   
r  c                    s   t jdkrjd ||  dksJ dj d| dd   fdd	}jd ||  }jd
 | }jd | }g jd d |||R }|}|j| d}|S )Nr  r  r   z'Invalid input shape for pixel_shuffle: z with upscale_factor = c                 S   r3  r3   r4  r7  r8   r8   r9   r9    r:  z,meta_pixel_shuffle.<locals>.is_channels_lastc                      sL    rt dkrtjS tjS jtjdrtjS jtjdr$tjS d S rG  )r!  rN   r   r6  r   rH  r8   r9  r   r8   r9   rA    s   z.meta_pixel_shuffle.<locals>.pick_memory_formatr  r   r   )r   r   r   rF  )r   upscale_factorrA  rD  HrWrr   r   r8   r  r9   meta_pixel_shuffle}  s   & 
r  c                 C   sZ   |  | j}| |j}| |j}| |j}| |j}| |j}|||||||fS r3   rV  )r  weight0weight1weight2weight3rh  cx_tmpr(  hy_cy_grad_output_r_optgrad_hy_r_optgrad_cy_r_optr   r8  rQ  rS  rj  rV  rW  rX  rT  rJ  diff_xdiff_hxdiff_cxdiff_w1diff_w2diff_br8   r8   r9   mkldnn_rnn_layer_backward  s   r  )	out_int32r  c                C   s   t j| |rt jnt jt jdS )NrV   r   )rN   r   r]  r   r   )r   
boundariesr  r  r8   r8   r9   meta_bucketize  s
   r  d   c                    s   dt dkrt fdd t dkr# r#td tt t fdd t dk fd	d tttfd
d tttfdd tkdd  tj	 j
jdS )Nzhistc()r  c                      r  )Nz%"histogram_cpu" not implemented for 'r  r   r8   r  r8   r9   r]     r  zmeta_histc.<locals>.<lambda>r  z%_histc_cuda with floating point inputc                      s    dt   S )Nz#: argument 'bins' must be int, not r&  r8   binsr  r8   r9   r]     r  r   c                      r  )Nz: bins must be > 0, but got r8   r8   r  r8   r9   r]     r  c                           dt  S )Nz%: argument 'min' must be Number, not r&  r8   )r  r"  r8   r9   r]     r  c                      r  )Nz%: argument 'max' must be Number, not r&  r8   )r  r   r8   r9   r]     r  c                   S   rc   )Nz&{fn_name}: max must be larger than minr8   r8   r8   r8   r9   r]     re   r   )r!  rN   r`   r   rI   r  rf   r   r    r   rw   rV   )r  r  r"  r   r8   )r  r  r  r   r"  r9   
meta_histc  s.   

r  c                    sd   t   |dd}t  dkptdd   dd  D  fdd  |jt	 d	S )
Nr  r#  r   c                 s   r+  r,  r8   )rC   r   r8   r8   r9   ri     rb  z,meta_upsample_bimode2d_aa.<locals>.<genexpr>r   c                      r  r)  r   r8   r  r8   r9   r]     r  z+meta_upsample_bimode2d_aa.<locals>.<lambda>r   )
r"  r   rN   r`   r   rf  r   rF  rI   r!   )r  r  r@  r+  r,  r&  r8   r  r9   meta_upsample_bimode2d_aa  s   

(

r  c                    st   t ||dd tjdkfdd tdD ]tj   k fdd q|jt	dS )Nr  r#  r  c                      r   r/  r   r8   r  r8   r9   r]   
  r   z4meta_upsample_bimode2d_aa_backward.<locals>.<lambda>c                
      r0  )NzD
Expected grad_output to have the same shape as output; output.size(r1  z
but got grad_output_size(r   r8   r2  r8   r9   r]     s    r   )
r"  rN   r`   r   r   r   r   rF  rI   r!   )r  r  r  r@  r+  r,  r8   r2  r9   "meta_upsample_bimode2d_aa_backward  s   	

r  c                 C   s\   t | dkdd  t | dkdd  t |jjdd  t |jjdd  d S )Nr   c                   S   rc   )Nz%found_inf must be a 1-element tensor.r8   r8   r8   r8   r9   r]     re   z<_amp_foreach_non_finite_check_and_unscale_.<locals>.<lambda>c                   S   rc   )Nz%inv_scale must be a 1-element tensor.r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz!found_inf must be a float tensor.r8   r8   r8   r8   r9   r]   #  re   c                   S   rc   )Nz!inv_scale must be a float tensor.r8   r8   r8   r8   r9   r]   '  re   )rN   r`   r   rV   r   )r   r#  	inv_scaler8   r8   r9   *_amp_foreach_non_finite_check_and_unscale_  s   r  c                 C   s   t |  }| |S r3   )r   r   r   )r   nanposinfneginfr  r8   r8   r9   
nan_to_num,  s   
r  c                 C   s   | j tjtjtjtjhvsJ d| j  d| j}t||}t||}||kr)| S t| 	 }t| 
 }|| || ||< ||< || || ||< ||< | || | S )Nz>torch.transpose_: in-place transposition is not supported for z layout)rv   rN   r|  
sparse_cscr}  
sparse_bscr   r   r   r   r   r   )r   dim0r  ndimsr   r   r8   r8   r9   rt  3  s&   	

rt  c                 C   sz   | j }| jr"|  }|  }|dkr|dks!J d| d| dn|  dks0J d| dt| d|dk r:dS dS )	Nr  r   zEt_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got z sparse and z dense dimensionsz6t_ expects a tensor with <= 2 dimensions, but self is r  r   )r   rM  rO  rP  rz   rt  )r   r  rO  rP  r8   r8   r9   t_P  s   
r  )r  r  sidesorterc                   s   t tjdkpjd d  jd d k fdd t d u p)jjkfdd t |dkp9| d |rAt jnt j}t t jrSt j |t j	dS t j
d	|jd
S )Nr   r   c                      s   dt j dt  j S )Nztorch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor and input value tensor must match, but we got boundaries tensor z and input value tensor r   r   r8   )r   sorted_sequencer8   r9   r]   s  s
   z#meta_searchsorted.<locals>.<lambda>c                      s,   dt  j dd urt j S g  S )Nz[torch.searchsorted(): boundary and sorter must have the same size, but got boundary tensor z and got sorter tensor r  r8   )r  r  r8   r9   r]   ~  s   r  zetorch.searchsorted(): side and right can't be set to opposites, got side of left while right was Truer  r8   r  )rN   r`   r   r   r]  r   rf   r   r   r   r   rw   )r  r   r  r  r  r  rV   r8   )r   r  r  r9   meta_searchsortedc  s&   
r  c                    s(   t  t jt jt jfv fdd d S )Nc                      r  )Nz/Unsupported input type encountered for isin(): r8   r8   r   r8   r9   r]     r  z3_check_for_unsupported_isin_dtype.<locals>.<lambda>)rN   r`   r  
complex128	complex64r   r8   r   r9   !_check_for_unsupported_isin_dtype  s   
r  c                 C   s   |  || df}|S )Nr   r  )r  r   num_weightsr  r  r  r8   r8   r9   meta_embedding_dense_backward  s   r  c                 C   s:   |	rt | ||||||||
|
S t| ||||||||
|
S r3   )r.   _embedding_bag_sparse_backward!meta_embedding_bag_dense_backward)r  r   r  r  r  maximum_indicesr  r  r8  r  r  r  r8   r8   r9   meta_embedding_bag_backward  s2   r  c
                    sX   t  jt jt jt jt jfv  fdd |tkr t |d u  | 	df}
|
S )Nc                      r   )Nz$Unsupported input type encountered: r   r8   r  r8   r9   r]     r   z3meta_embedding_bag_dense_backward.<locals>.<lambda>r   )
rN   r`   rV   rs  rt  rr  float64r  r   r   )r  r   r  r  r  r  r  r8  r  r  index_grad_weightr8   r  r9   r    s   
r  c           
      C   s~   |  d}t|tkd t|  dk t| dk | d}t| dk t| d|k | |f}	|	S )Nr   zHembedding_bag_backward: per_sample_weights only supported for mode='sum'r  r   )r   rN   r`   r  rz   r   )
r  rV  r   r  r  r8  r  embedding_featuresr  r(  r8   r8   r9   .meta_embedding_bag_per_sample_weights_backward  s   


r  )assume_uniqueinvertc                C   sx   t t| tpt|tdd  t| tst j| |jd} t|ts*t j|| jd}t| j t|j t j| t j	dS )Nc                   S   rc   )Nz<At least one of elements and test_elements must be a Tensor.r8   r8   r8   r8   r9   r]     re   zmeta_isin.<locals>.<lambda>r  r   )
rN   r`   rf   r   rO  rw   r  rV   r   r  )elementstest_elementsr  r  r8   r8   r9   	meta_isin  s   



r  r0  c                 C   s4   t | dkdd  t|tjd\}}t j||dS )Nr   c                   S   rc   )Nz,polygamma(n, x) does not support negative n.r8   r8   r8   r8   r9   r]     re   z meta_polygamma.<locals>.<lambda>r  r   )rN   r`   r   r   r  r   )r0  r   rL   rF   r8   r8   r9   meta_polygamma  s   
r  c                 C   s   t d)Nz.Tensor.item() cannot be called on meta tensors)rE  r   r8   r8   r9   meta_local_scalar_dense  s   r  c                 C   r  r3   r  r   r8   r8   r9   silu$  r  r  c                 C   s    t | tjd\}}tj| |dS r  )r   r   r  rN   r   )r   rL   rF   r8   r8   r9   sigmoid*  s
   
r  c                 C   sF  |   dk}|  dk}|r8|r|d| d|dg}nPt|d|dkd | d|dg}n7|rSt|d| dkd | d|dg}nt| d|dkd | d| d|dg}|ps| j}d|j }|d | d | | }||kr|d | |dg}	n|dg}	tj||	|| jd}
|
S )	Nr  r   r   z matrix batch sizes have to matchr   zbatched dimension has to matchrq  r  )rz   r   rN   r`   rV   itemsizer  rw   )ra  rc  offsrZ  
mat1_is_2d
mat2_is_2dr_  	alignmentsize_paddedr8  r   r8   r8   r9    _create_grouped_mm_output_tensor4  s0   

r  mat_amat_br  c	                    sv  t |d u |d u kdd  |d uo|d u}	|	r.t  jt jko%jt jk fdd nt  jt jko;jt jk fdd t   dv oP dv  fdd   dk}
 dk}|	rdd	 }d
d }t |  fdd t |fdd dd }|d  |d |d ur|d urt |jt jko|jt jkdd  d dd}d ur|
r|rjd nd}|d| d| |d|d| t |d u dd  |
s|rt d u fdd d urt  dkfdd t jt jkfdd n
t d u dd  t |d u dd  t |d u p/|t jkdd  t	 |S )!Nc                   S   rc   )Nz,Either both scale factors are given, or noner8   r8   r8   r8   r9   r]   d  re   z)_meta_grouped_mm_common.<locals>.<lambda>c                      r`  )Nz5Expected inputs of E4M3 FP8 type but got mat_a.dtype= and mat_b.dtype=r!  r   r8   r  r  r8   r9   r]   o  rH   c                      r`  )Nz1Expected inputs of BF16 type but got mat_a.dtype=r  r!  r   r8   r  r8   r9   r]   t  rH   )r  r1   c                      s   d    d   S )Nz3Multiplicands must be 2D or 3D but got mat_a.dim()=z and mat_b.dim()=r   r8   r  r8   r9   r]   y  r  r  c                 S   s    |   }|d dko|d dkS Nr  r   r   r  mat
mat_strider8   r8   r9   r        z-_meta_grouped_mm_common.<locals>.is_row_majorc                 S   s    |   }|d dko|d dkS r  r  r  r8   r8   r9   r    r  z-_meta_grouped_mm_common.<locals>.is_col_majorc                         d   dd   S )NzNExpected mat_a tensor to be row major in the last two dimensions, got strides r  r  r8   )r  r8   r9   r]     rH   c                      r  )NzQExpected mat_b tensor to be column major in the last two dimensions, got strides r  r  r8   )r  r8   r9   r]     rH   c                    s     d  d  }  d  dkr:  tdj d  kr:t  | dk fdd d S   dkrd d  tdj  krdt d  | dk fdd d S tdfdd d S )	Nr   rq  r   c                      s   d d  d   dS )Nr   stride along % dim to be multiple of 16 bytes, got r!  r8   r8   end_dimmat_namer  r8   r9   r]     r  zF_meta_grouped_mm_common.<locals>.check_valid_strides.<locals>.<lambda>c                      s$   d d d  d d   dS )Nr  r  r   r  r!  r8   r8   r  r8   r9   r]        $ Fc                      s   d d j  dS )NzInvalid strides/sizes, got z for strides and z for sizes.r   r8   r  r8   r9   r]     r   )rz   element_sizer   r   r   rN   r`   )r  r  r  r8   )r  r  r  r  r9   check_valid_strides  s*   
z4_meta_grouped_mm_common.<locals>.check_valid_stridesr  r  c                   S   rc   )NzBoth scale_a and scale_b must be float (fp32) tensors, but got scale_a.dtype={scale_a.dtype} and scale_b.dtype={scale_b.dtype}.r8   r8   r8   r8   r9   r]     re   r   c                    s     dkr;t  dkfdd t fdd tjd  j  k fdd d S t  dkfdd tddkfd	d tjd  jd k fd
d tjd  jd  k fdd d S )Nr  r   c                         d d    dS )Nr  z to be 1D tensor, but got 	D tensor.r   r8   r	  
scale_namer8   r9   r]     rH   z>_meta_grouped_mm_common.<locals>.check_scale.<locals>.<lambda>c                      r<  )Nr  z to be contiguous.r8   r8   r  r8   r9   r]     r   r   c                      s(   d d j    dj d  dS )Nr  z	 to have r  r   z
 elements.r   r8   r  r	  scale_multiplierr  
scaled_dimr8   r9   r]        ( c                      r  )Nr  z to be 2D tensor, but got r  r   r8   r  r8   r9   r]     rH   c                      r<  )Nr  z( to be contiguous in the last dimension.r8   r8   r  r8   r9   r]     r   c                      s$   d d j d  dj d  dS )Nr  z batch dimension to be r   , got r!  r   r8   )r  r	  r  r8   r9   r]     r  c                      s(   d d j d   dj d  dS )Nr  z non-batch dimension to be r   r  r!  r   r8   )r  r	  r  r  r8   r9   r]     r  )rz   rN   r`   r   r   r   )r  r	  r  r  r  r8   r  r9   check_scale  s:   



z,_meta_grouped_mm_common.<locals>.check_scaler   r  r  c                   S   rc   )Nz:Scale result tensor provided, but it is not supported yet.r8   r8   r8   r8   r9   r]     re   c                      s   d    d   dS )Nz/Offsets tensor not provided, but is needed for zD/zD multiplicand layouts.r   r8   r  r8   r9   r]     s    c                      r  )Nz.Offsets tensor must be 1D, but got offs.dim()=r!  r   r8   r  r8   r9   r]     r^   c                      r  )Nz7Offsets tensor must be integer (int32) tensor, but got r!  r   r8   r  r8   r9   r]     r  c                   S   rc   )NzJOffsets tensor provided, but is not needed for 3D/3D multiplicand layouts.r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz2Bias tensor provided, but it is not supported yet.r8   r8   r8   r8   r9   r]     re   c                   S   rc   )Nz4If output dtype provided, it must be torch.bfloat16.r8   r8   r8   r8   r9   r]     re   r|  )
rN   r`   rV   ru  rt  rz   rr  r   r]  r  )r  r  r  r  r  rX  r  rZ  r  scaledmat_a_is_2dmat_b_is_2dr   r  r  r  r  r8   )r  r  r  r9   _meta_grouped_mm_commonW  s   




!





r  c              
   C   s   t | |d d ||d |dS )N)r  r  r  rX  r  rZ  r  )r  r  r  rX  rZ  r8   r8   r9   
grouped_mm  s   	r   c	           	      C   s   t | ||||||||d	S )N)r  r  r  rX  r  rZ  r  r  )	r  r  r  r  r  rX  r  rZ  r  r8   r8   r9   meta_scaled_grouped_mm  s   r  rD   half_to_floatc                 C   sL   |r
| j tjks
J tj| tjjd\}}|s|n|}tj| |tjd}|S )Nr  r  )	rV   rN   rP   rI   r   r   rJ   r   r   )rD   rz   r  computation_dtyperF   ro  r8   r8   r9   softmax-  s   
r  c              	      s   t td dkfdd | jttd }| t |kfdd td  }t|D ]1 t d d       d   }t |dk fdd || q:t j|| j| j	| j
t| dS )	Nr  r   c                      r$  )Nz1Length of pad must be even but instead it equals r  r8   r  r8   r9   r]   A  r  z'_constant_pad_nd_meta.<locals>.<lambda>c                      s   dt  d  dS )Nz`Length of pad should be no more than twice the number of dimensions of the input. Pad length is z while the input has z dimensions.r  r8   )l_inpr  r8   r9   r]   K  s
    r   c                	      s6   d    d  dd   d   d	S )NzThe input size z, plus negative padding r   r   zG resulted in a negative output size, which is invalid. Check dimension z of your input.r8   r8   )r  r$  l_diffr  pad_idxr8   r9   r]   V  s    
)rV   rw   ry   r   )rN   r`   r   r   r   r   r   r   rV   rw   ry   r!   )r  r  r  l_padr   new_dimr8   )r  r$  r  r  r  r  r9   _constant_pad_nd_meta;  s8   
 r
  r  r  r  c           	      C   sx   |   dks
J d| j}|j}|jdkr|d f}n|jdkr)|d |d f}n	g ||d R }| j}| j||dS )Nr  z'weight' must be 2-Dr   r   r   )rz   r   r   rV   r   )	rV  r   r  r  r  weight_shapeindices_shaper   rZ  r8   r8   r9   	embeddinge  s   	

r  max_lengthspadding_valuec                 C   s\   t |dksJ t |dksJ |d jd d }|d }||g| jdd  R }| |S r  )r   r   r   )r   r  r  r  r  rQ  rz  r8   r8   r9   $meta__jagged_to_padded_dense_forward}  s   
r  c                 C      t | t dd }|S )Nc                 S   r  r  rM   r   r  r   r8   r8   r9   _f  s   z)_create_unary_float_meta_func.<locals>._fr?   r'   funcr  r8   r8   r9   _create_unary_float_meta_func     r  c                 C   r  )Nc                 S   r  r  r  )rD   r  r8   r8   r9   r    r  z*_create_binary_float_meta_func.<locals>._fr  r  r8   r8   r9   _create_binary_float_meta_func  r  r  c                    s<   t   fdd} j d}||_ttt||}|S )Nc                    s(    | g|R i |}t | j|j | S r3   r  )r   rK   r   r   r6   r8   r9   _fn  s   z#_register_inplace_meta.<locals>._fnrL   )r   rp   r?   getattrr.   )r7   r  inplace_namer8   r6   r9   _register_inplace_meta  s   r  c                    sr   t j jk fdd  g}ttr1jdkr,t jjkfdd | t|dtj	iS )Nc                      r  )Nr  z for `end`, but got dtype r   r8   )rq   rr   r8   r9   r]     r   zlerp.<locals>.<lambda>r   c                      r  )Nr  z for `weight`, but got dtype r   r8   )rr   rV  r8   r9   r]     r   r@   )
rN   r`   rV   rf   r"   r   r   rM   r   rJ   )rr   rq   rV  rK   r8   )rq   rr   rV  r9   lerp  s"   




r  )r  c                C   s   t | ||tjdS r  r  r  tensor1tensor2r  r8   r8   r9   addcmul  s   
r"  c                C   s8   t t|jot|j dd  t| ||tjdS )Nc                   S   rc   )N)zFInteger division with addcdiv is no longer supported, and in a future zErelease addcdiv will perform a true division of tensor1 and tensor2. z4The historic addcdiv behavior can be implemented as zA(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) zfor integer inputs and as z6(input + value * tensor1 / tensor2) for float inputs. z?The future addcdiv behavior is just the latter implementation: z4(input + value * tensor1 / tensor2), for all dtypes.r8   r8   r8   r8   r9   r]     re   zaddcdiv.<locals>.<lambda>r  )rN   r`   rI   r$  rV   rM   r   rJ   r  r8   r8   r9   addcdiv  s   

r#  c                  C   s4  i } dD ]}t | }|D ]}|| vr|| | |< qq|  D ]y\}}t|tjjr*qt|ts1J |tjj	j
| tj| drR|t d v rQt| dq|jrVq| dv r]qd| v rjt|| qd| v rwt|| qd| v rt|| qd	| v rt|| qt|| qd S )
N)rt   post_autogradpre_autogradCompositeImplicitAutogradrt   z is a CompositeImplicitAutograd op, we shouldn't register meta function for it. Instead, we should let the decomposition run and write meta kernels for the base operators.>   aten::cloneaten::copy_aten::rot90aten::_to_copyaten::empty_stridedaten::constant_pad_ndaten::as_strided_scatterzmkldnn::zmkl::zonednn::zquantized::)r   itemsrf   rN   _opsHigherOrderOperatorr   py_impl_CDispatchKeyr0   %_dispatch_has_kernel_for_dispatch_keyr  rE  is_view2_meta_lib_dont_use_me_use_register_meta_for_mkldnnimpl/_meta_lib_dont_use_me_use_register_meta_for_mkl2_meta_lib_dont_use_me_use_register_meta_for_onednn5_meta_lib_dont_use_me_use_register_meta_for_quantized'_meta_lib_dont_use_me_use_register_meta)activate_meta_tablero   registryopoop_overloadr7   r8   r8   r9   activate_meta  sN   r@  r   )Tr  r3   )NNNFr   r   r   r  r  )r  )r  T)FF)TT)r9  )FTN)TFF)TF)r  )r  N)r2   r  )r8   r   r|  F)r8   r   FTN)Fr   FNFr   )NF)r   F)r  r  FN)NNNNN)r   NNr   )NNF)r  FFN)Nr  FFN)r  FNN)r  FN)FN)FNNNN)NNNF)Nr   FNN)NNNN)r   TT)NNr   N)r  r   r   )r   )NNNNF)r   FF)r  (  r  r   collections.abcr   enumr   	functoolsr   r   typingr   r   r   r	   typing_extensionsr
   rN   torch._prims_commonr5  rI   r   r   r   torch._decompr   r   r   r   
torch._opsr   torch._primsr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   torch._prims_common.wrappersr#   r$   r%   r&   r'   r  r(   r)   torch.fx.experimentalr*   r  torch.utilsr+   r<   r,   r-   opsr.   libraryLibraryr;  r   r  r  r  r?   rM   rX   rb   linspacelogspacer{  r   takerH  r   r   r   r   r   r   r  r   r   cummaxcumminr   r   r  r   r  r  r  _fft_c2cr  r  r  _fft_r2cr-  randpermgenerator_outr1  r   r6  randintr<  r:  low_outr=  randr>  _fft_c2rrB  r  rL  rR  
unsqueeze_rU  _sparse_semi_structured_linearr  rV   r`  _sparse_semi_structured_mmre  _sparse_semi_structured_addmmrh  _cslt_sparse_mmr{  index_reducer  index_reduce_r  index_selectr  segment_reducer  r   	unary_outr  rz   r  r"  r  r  r  r  r  _assert_asyncr  r   r  _printr  _make_dep_tokenr  r  _functional_sym_constrain_ranger  r  (_functional_sym_constrain_range_for_sizer  _functional_assert_asyncr  r   r  r   r  r  r  r  _linalg_eighr  r  _linalg_eigvalslinalg_eigvalsr  
linalg_eigr  r  r  r  r  r  r  r  linalg_inv_exr  linalg_ldl_factor_exr_   r   linalg_ldl_solver(  	linalg_lur,  linalg_lu_factor_exr.  linalg_lu_solver2  	lu_unpackr7  r@  	linalg_qrrF  rI  rG  _linalg_svdrR  r%  r  r_  rq  linalg_solve_triangularrw  rz  r  _linalg_detr  r  r  r  reflection_pad1dr  replication_pad1dr  r  reflection_pad1d_backwardr  replication_pad1d_backwardr  r  reflection_pad2dr  replication_pad2dr  reflection_pad2d_backwardr  replication_pad2d_backwardr  r  reflection_pad3dr  replication_pad3dr  reflection_pad3d_backwardreplication_pad3d_backwardr  _pdist_forwardrR   r  _pdist_backwardr  baddbmmr  	bernoullir  
bernoulli_r   r  r  poissonr  _fused_moving_avg_obs_fq_helperr  mmr  r  r!  r2  r9  miopen_batch_normrF  convolutionrM  r2  _has_mkldnnr6  rN  _convolution_pointwiserT  _linear_pointwiserW  has_mklr8  rX  _mkl_linearr[  r9  r\  qconv2d_pointwiseqconv_pointwiserh  binaryrq  qlinear_pointwiserO  ru  binary_tensorry  linear_dynamic_fp16linear_relu_dynamic_fp16rz  r:  r{  
max_pool2dr  int4mm_packed_weight_cpur  r  
avg_pool2dr  r  avg_pool2d_backwardr  
avg_pool3dr  avg_pool3d_backwardr  _adaptive_avg_pool2dr  _adaptive_avg_pool3dr  _adaptive_avg_pool2d_backwardr  _adaptive_avg_pool3d_backwardr  r  adaptive_max_pool2dr  r  r  adaptive_max_pool3dr  r  r  repeat_interleaver  rg   r  r  r  r   _unsafe_indexr  convolution_backwardr  addbmmr  randint_liker!  _fused_adam__fused_adamw_r6  _fused_adamr9  _int_mmr:  _convert_weight_to_int4packrB  #_convert_weight_to_int4pack_for_cpurD  _weight_int4pack_mmrI  _weight_int4pack_mm_for_cpurK  rO  rP  rp  _dyn_quant_pack_4bit_weightr{  _dyn_quant_matmul_4bitr  _weight_int8pack_mmr  _cdist_forwardr  _cdist_backwardr  _embedding_bagr  _embedding_bag_forward_onlyr  r  nansumr  median	nanmedianr  
dim_valuesr8  r   r  logical_not_r  repeatr  zero_r  mul_Scalardiv_logical_and_logical_or_logical_xor_r  add_sub_r  rounddecimalsr  r  
__rshift__r  
__lshift__r  zeror  rR  r  fillr  relu_r  	_add_relur  rrelu_with_noiser  rrelu_with_noise_functionalr  rrelu_with_noise_r  	index_put_unsafe_index_putr  masked_fill_r  _masked_scaler  masked_scatter_r  masked_scatterr  masked_scatter_backwardr  
index_put_r  aliasr  r   bmmr  r  r  r
  r  r  r  r  r  r   max_pool2d_with_indices_backwardr  max_pool2d_with_indicesr  fractional_max_pool2dr(  max_pool3d_with_indicesr/   max_pool3d_with_indices_backwardr0  r4  r5  r<  grid_sampler_2d_backwardrC  rH  rI  rK  r  onesrW  zerosrY  r  r]  select_scatterr`  slice_scatterrb  r   re  rk  gatherro  r|  r  r  r  r  scatter_addr  scatter_add_r  r  rI  r  value_reducer  scatter_r  #_scaled_dot_product_flash_attentionr  r  #_scaled_dot_product_cudnn_attentionr  0_scaled_dot_product_fused_attention_overrideabler  ,_scaled_dot_product_flash_attention_backwardr  +_scaled_dot_product_flash_attention_for_cpur  4_scaled_dot_product_flash_attention_for_cpu_backwardr  '_scaled_dot_product_efficient_attentionr  0_scaled_dot_product_efficient_attention_backwardr  ,_scaled_dot_product_cudnn_attention_backwardr  _flash_attention_forwardr  _flash_attention_backwardr  _efficient_attention_forwardr  _efficient_attention_backwardSymIntr  
_scaled_mmr  scatter_reducetwotwo_outr  scatter_reduce_r  multinomialr  r  r"  r'  _upsample_nearest_exact1dr.  _upsample_nearest_exact2dr3  "_upsample_nearest_exact2d_backwardr5  _upsample_nearest_exact3dr   r6  values_stabler9  rH  _thnn_fused_lstm_cellrM  rc  rl  rn  ro  rp  argminrq  rs  topkrx  _segment_reduce_backwardr{  kthvaluer~  r   r  r  r  r  pixel_shuffler  r  	bucketize
Tensor_outr  histcr  _upsample_bilinear2d_aa_upsample_bicubic2d_aar   _upsample_bilinear2d_aa_backwardr  r  r  rt  r  searchsortedr  r  embedding_dense_backwardr  _embedding_bag_backwardr  _embedding_bag_dense_backwardr  *_embedding_bag_per_sample_weights_backwardr  isinr  	polygammar  _local_scalar_denser  r  r  r  r  _grouped_mmr   _scaled_grouped_mmr  _softmaxr  constant_pad_ndr
  r  _jagged_to_padded_dense_forwardr  r  r  special_airy_aispecial_bessel_y0special_bessel_y1special_modified_bessel_i0special_modified_bessel_i1special_modified_bessel_k0special_modified_bessel_k1!special_scaled_modified_bessel_k0!special_scaled_modified_bessel_k1special_chebyshev_polynomial_tspecial_chebyshev_polynomial_uspecial_chebyshev_polynomial_vspecial_chebyshev_polynomial_w&special_shifted_chebyshev_polynomial_t&special_shifted_chebyshev_polynomial_u&special_shifted_chebyshev_polynomial_v&special_shifted_chebyshev_polynomial_wspecial_hermite_polynomial_hspecial_hermite_polynomial_hespecial_laguerre_polynomial_lspecial_legendre_polynomial_pr  r  r"  r#  lerp_addcmul_addcdiv_torch._refs.nn.functionaltorch._refs.specialr@  r8   r8   r8   r9   <module>   s  <(
	8I

D
		6

;
"
'
	
!"	
0



#
	

	











	




'



"

2
*
*
"7
(*&
%
	
:

/Z&5?'&
.



l%	,$Q,
H
XN



.


*" 
$ m
#c	







-


)
T	
\>	
6L+&
T

ge( 

	, $1	






@	+*	
		
*,	
7	
	
K	
	
/	
7 #



'7'

"
0


*


"
	


(	 
)	
(

(




E