import math
from collections.abc import Sequence
from dataclasses import dataclass
from functools import partial
from typing import Any, Callable, Optional

import torch
import torch.fx
import torch.nn as nn

from ...ops import MLP, StochasticDepth
from ...transforms._presets import VideoClassification
from ...utils import _log_api_usage_once
from .._api import register_model, Weights, WeightsEnum
from .._meta import _KINETICS400_CATEGORIES
from .._utils import _ovewrite_named_param, handle_legacy_interface


__all__ = [
    "MViT",
    "MViT_V1_B_Weights",
    "mvit_v1_b",
    "MViT_V2_S_Weights",
    "mvit_v2_s",
]


@dataclass
class MSBlockConfig:
    num_heads: int
    input_channels: int
    output_channels: int
    kernel_q: list[int]
    kernel_kv: list[int]
    stride_q: list[int]
    stride_kv: list[int]


def _prod(s: Sequence[int]) -> int:
    product = 1
    for v in s:
        product *= v
    return product


def _unsqueeze(x: torch.Tensor, target_dim: int, expand_dim: int) -> tuple[torch.Tensor, int]:
    tensor_dim = x.dim()
    if tensor_dim == target_dim - 1:
        x = x.unsqueeze(expand_dim)
    elif tensor_dim != target_dim:
        raise ValueError(f"Unsupported input dimension {x.shape}")
    return x, tensor_dim


def _squeeze(x: torch.Tensor, target_dim: int, expand_dim: int, tensor_dim: int) -> torch.Tensor:
    if tensor_dim == target_dim - 1:
        x = x.squeeze(expand_dim)
    return x


torch.fx.wrap("_unsqueeze")
torch.fx.wrap("_squeeze")


class Pool(nn.Module):
    def __init__(
        self,
        pool: nn.Module,
        norm: Optional[nn.Module],
        activation: Optional[nn.Module] = None,
        norm_before_pool: bool = False,
    ) -> None:
        super().__init__()
        self.pool = pool
        layers = []
        if norm is not None:
            layers.append(norm)
        if activation is not None:
            layers.append(activation)
        self.norm_act = nn.Sequential(*layers) if layers else None
        self.norm_before_pool = norm_before_pool

    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
        x, tensor_dim = _unsqueeze(x, 4, 1)

        # Separate the class token and reshape the input back to a 3D volume
        class_token, x = torch.tensor_split(x, indices=(1,), dim=2)
        x = x.transpose(2, 3)
        B, N, C = x.shape[:3]
        x = x.reshape((B * N, C) + thw).contiguous()

        # Normalizing prior to pooling is useful when the norm is a BatchNorm that can be absorbed at inference
        if self.norm_before_pool and self.norm_act is not None:
            x = self.norm_act(x)

        # Apply the pool on the input and add back the class token
        x = self.pool(x)
        T, H, W = x.shape[2:]
        x = x.reshape(B, N, C, -1).transpose(2, 3)
        x = torch.cat((class_token, x), dim=2)

        if not self.norm_before_pool and self.norm_act is not None:
            x = self.norm_act(x)

        x = _squeeze(x, 4, 1, tensor_dim)
        return x, (T, H, W)


def _interpolate(embedding: torch.Tensor, d: int) -> torch.Tensor:
    if embedding.shape[0] == d:
        return embedding

    return (
        nn.functional.interpolate(
            embedding.permute(1, 0).unsqueeze(0),
            size=d,
            mode="linear",
        )
        .squeeze(0)
        .permute(1, 0)
    )


def _add_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    q_thw: tuple[int, int, int],
    k_thw: tuple[int, int, int],
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    rel_pos_t: torch.Tensor,
) -> torch.Tensor:
    # Decomposed (T/H/W) relative positional embeddings, as in MViTv2.
    q_t, q_h, q_w = q_thw
    k_t, k_h, k_w = k_thw
    dh = int(2 * max(q_h, k_h) - 1)
    dw = int(2 * max(q_w, k_w) - 1)
    dt = int(2 * max(q_t, k_t) - 1)

    # Scale up rel pos if shapes for q and k are different.
    q_h_ratio = max(k_h / q_h, 1.0)
    k_h_ratio = max(q_h / k_h, 1.0)
    dist_h = torch.arange(q_h)[:, None] * q_h_ratio - torch.arange(k_h)[None, :] * k_h_ratio
    dist_h += (k_h - 1) * k_h_ratio
    q_w_ratio = max(k_w / q_w, 1.0)
    k_w_ratio = max(q_w / k_w, 1.0)
    dist_w = torch.arange(q_w)[:, None] * q_w_ratio - torch.arange(k_w)[None, :] * k_w_ratio
    dist_w += (k_w - 1) * k_w_ratio
    q_t_ratio = max(k_t / q_t, 1.0)
    k_t_ratio = max(q_t / k_t, 1.0)
    dist_t = torch.arange(q_t)[:, None] * q_t_ratio - torch.arange(k_t)[None, :] * k_t_ratio
    dist_t += (k_t - 1) * k_t_ratio

    # Interpolate rel pos tables if needed and index them with the relative distances.
    rel_pos_h = _interpolate(rel_pos_h, dh)
    rel_pos_w = _interpolate(rel_pos_w, dw)
    rel_pos_t = _interpolate(rel_pos_t, dt)
    Rh = rel_pos_h[dist_h.long()]
    Rw = rel_pos_w[dist_w.long()]
    Rt = rel_pos_t[dist_t.long()]

    B, n_head, _, dim = q.shape

    r_q = q[:, :, 1:].reshape(B, n_head, q_t, q_h, q_w, dim)
    rel_h_q = torch.einsum("bythwc,hkc->bythwk", r_q, Rh)  # [B, H, q_t, q_h, q_w, k_h]
    rel_w_q = torch.einsum("bythwc,wkc->bythwk", r_q, Rw)  # [B, H, q_t, q_h, q_w, k_w]
    # [B, H, q_t, q_h, q_w, dim] -> [q_t, B, H, q_h, q_w, dim] -> [q_t, B * H * q_h * q_w, dim]
    r_q = r_q.permute(2, 0, 1, 3, 4, 5).reshape(q_t, B * n_head * q_h * q_w, dim)
    # [q_t, B * H * q_h * q_w, dim] * [q_t, dim, k_t] -> [B * H * q_h * q_w, q_t, k_t]
    rel_q_t = torch.matmul(r_q, Rt.transpose(1, 2)).transpose(0, 1)
    # [B * H * q_h * q_w, q_t, k_t] -> [B, H, q_t, q_h, q_w, k_t]
    rel_q_t = rel_q_t.view(B, n_head, q_h, q_w, q_t, k_t).permute(0, 1, 4, 2, 3, 5)

    # Combine the three decomposed terms and add them to the attention logits of the non-class tokens.
    rel_pos = (
        rel_h_q[:, :, :, :, :, None, :, None]
        + rel_w_q[:, :, :, :, :, None, None, :]
        + rel_q_t[:, :, :, :, :, :, None, None]
    ).reshape(B, n_head, q_t * q_h * q_w, k_t * k_h * k_w)
    attn[:, :, 1:, 1:] += rel_pos

    return attn


def _add_shortcut(x: torch.Tensor, shortcut: torch.Tensor, residual_with_cls_embed: bool) -> torch.Tensor:
    if residual_with_cls_embed:
        x.add_(shortcut)
    else:
        x[:, :, 1:, :] += shortcut[:, :, 1:, :]
    return x


torch.fx.wrap("_add_rel_pos")
torch.fx.wrap("_add_shortcut")


class MultiscaleAttention(nn.Module):
    def __init__(
        self,
        input_size: list[int],
        embed_dim: int,
        output_dim: int,
        num_heads: int,
        kernel_q: list[int],
        kernel_kv: list[int],
        stride_q: list[int],
        stride_kv: list[int],
        residual_pool: bool,
        residual_with_cls_embed: bool,
        rel_pos_embed: bool,
        dropout: float = 0.0,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.output_dim = output_dim
        self.num_heads = num_heads
        self.head_dim = output_dim // num_heads
        self.scaler = 1.0 / math.sqrt(self.head_dim)
        self.residual_pool = residual_pool
        self.residual_with_cls_embed = residual_with_cls_embed

        self.qkv = nn.Linear(embed_dim, 3 * output_dim)
        layers: list[nn.Module] = [nn.Linear(output_dim, output_dim)]
        if dropout > 0.0:
            layers.append(nn.Dropout(dropout, inplace=True))
        self.project = nn.Sequential(*layers)

        self.pool_q: Optional[nn.Module] = None
        if _prod(kernel_q) > 1 or _prod(stride_q) > 1:
            padding_q = [int(q // 2) for q in kernel_q]
            self.pool_q = Pool(
                nn.Conv3d(
                    self.head_dim,
                    self.head_dim,
                    kernel_q,
                    stride=stride_q,
                    padding=padding_q,
                    groups=self.head_dim,
                    bias=False,
                ),
                norm_layer(self.head_dim),
            )

        self.pool_k: Optional[nn.Module] = None
        self.pool_v: Optional[nn.Module] = None
        if _prod(kernel_kv) > 1 or _prod(stride_kv) > 1:
            padding_kv = [int(kv // 2) for kv in kernel_kv]
            self.pool_k = Pool(
                nn.Conv3d(
                    self.head_dim,
                    self.head_dim,
                    kernel_kv,
                    stride=stride_kv,
                    padding=padding_kv,
                    groups=self.head_dim,
                    bias=False,
                ),
                norm_layer(self.head_dim),
            )
            self.pool_v = Pool(
                nn.Conv3d(
                    self.head_dim,
                    self.head_dim,
                    kernel_kv,
                    stride=stride_kv,
                    padding=padding_kv,
                    groups=self.head_dim,
                    bias=False,
                ),
                norm_layer(self.head_dim),
            )

        self.rel_pos_h: Optional[nn.Parameter] = None
        self.rel_pos_w: Optional[nn.Parameter] = None
        self.rel_pos_t: Optional[nn.Parameter] = None
        if rel_pos_embed:
            size = max(input_size[1:])
            q_size = size // stride_q[1] if len(stride_q) > 0 else size
            kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size
            spatial_dim = 2 * max(q_size, kv_size) - 1
            temporal_dim = 2 * input_size[0] - 1
            self.rel_pos_h = nn.Parameter(torch.zeros(spatial_dim, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(spatial_dim, self.head_dim))
            self.rel_pos_t = nn.Parameter(torch.zeros(temporal_dim, self.head_dim))
            nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
            nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
            nn.init.trunc_normal_(self.rel_pos_t, std=0.02)

    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
        B, N, C = x.shape
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(dim=2)

        if self.pool_k is not None:
            k, k_thw = self.pool_k(k, thw)
        else:
            k_thw = thw
        if self.pool_v is not None:
            v = self.pool_v(v, thw)[0]
        if self.pool_q is not None:
            q, thw = self.pool_q(q, thw)

        attn = torch.matmul(self.scaler * q, k.transpose(2, 3))
        if self.rel_pos_h is not None and self.rel_pos_w is not None and self.rel_pos_t is not None:
            attn = _add_rel_pos(
                attn,
                q,
                thw,
                k_thw,
                self.rel_pos_h,
                self.rel_pos_w,
                self.rel_pos_t,
            )
        attn = attn.softmax(dim=-1)

        x = torch.matmul(attn, v)
        if self.residual_pool:
            _add_shortcut(x, q, self.residual_with_cls_embed)
        x = x.transpose(1, 2).reshape(B, -1, self.output_dim)
        x = self.project(x)

        return x, thw


class MultiscaleBlock(nn.Module):
    def __init__(
        self,
        input_size: list[int],
        cnf: MSBlockConfig,
        residual_pool: bool,
        residual_with_cls_embed: bool,
        rel_pos_embed: bool,
        proj_after_attn: bool,
        dropout: float = 0.0,
        stochastic_depth_prob: float = 0.0,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.proj_after_attn = proj_after_attn

        self.pool_skip: Optional[nn.Module] = None
        if _prod(cnf.stride_q) > 1:
            kernel_skip = [s + 1 if s > 1 else s for s in cnf.stride_q]
            padding_skip = [int(k // 2) for k in kernel_skip]
            self.pool_skip = Pool(nn.MaxPool3d(kernel_skip, stride=cnf.stride_q, padding=padding_skip), None)

        attn_dim = cnf.output_channels if proj_after_attn else cnf.input_channels

        self.norm1 = norm_layer(cnf.input_channels)
        self.norm2 = norm_layer(attn_dim)
        self.needs_transposal = isinstance(self.norm1, nn.BatchNorm1d)

        self.attn = MultiscaleAttention(
            input_size,
            cnf.input_channels,
            attn_dim,
            cnf.num_heads,
            kernel_q=cnf.kernel_q,
            kernel_kv=cnf.kernel_kv,
            stride_q=cnf.stride_q,
            stride_kv=cnf.stride_kv,
            rel_pos_embed=rel_pos_embed,
            residual_pool=residual_pool,
            residual_with_cls_embed=residual_with_cls_embed,
            dropout=dropout,
            norm_layer=norm_layer,
        )
        self.mlp = MLP(
            attn_dim,
            [4 * attn_dim, cnf.output_channels],
            activation_layer=nn.GELU,
            dropout=dropout,
            inplace=None,
        )

        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")

        self.project: Optional[nn.Module] = None
        if cnf.input_channels != cnf.output_channels:
            self.project = nn.Linear(cnf.input_channels, cnf.output_channels)

    def forward(self, x: torch.Tensor, thw: tuple[int, int, int]) -> tuple[torch.Tensor, tuple[int, int, int]]:
        x_norm1 = self.norm1(x.transpose(1, 2)).transpose(1, 2) if self.needs_transposal else self.norm1(x)
        x_attn, thw_new = self.attn(x_norm1, thw)
        x = x if self.project is None or not self.proj_after_attn else self.project(x_norm1)
        x_skip = x if self.pool_skip is None else self.pool_skip(x, thw)[0]
        x = x_skip + self.stochastic_depth(x_attn)

        x_norm2 = self.norm2(x.transpose(1, 2)).transpose(1, 2) if self.needs_transposal else self.norm2(x)
        x_proj = x if self.project is None or self.proj_after_attn else self.project(x_norm2)

        return x_proj + self.stochastic_depth(self.mlp(x_norm2)), thw_new


class PositionalEncoding(nn.Module):
    def __init__(self, embed_size: int, spatial_size: tuple[int, int], temporal_size: int, rel_pos_embed: bool) -> None:
        super().__init__()
        self.spatial_size = spatial_size
        self.temporal_size = temporal_size

        self.class_token = nn.Parameter(torch.zeros(embed_size))
        self.spatial_pos: Optional[nn.Parameter] = None
        self.temporal_pos: Optional[nn.Parameter] = None
        self.class_pos: Optional[nn.Parameter] = None
        if not rel_pos_embed:
            self.spatial_pos = nn.Parameter(torch.zeros(self.spatial_size[0] * self.spatial_size[1], embed_size))
            self.temporal_pos = nn.Parameter(torch.zeros(self.temporal_size, embed_size))
            self.class_pos = nn.Parameter(torch.zeros(embed_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        class_token = self.class_token.expand(x.size(0), -1).unsqueeze(1)
        x = torch.cat((class_token, x), dim=1)

        if self.spatial_pos is not None and self.temporal_pos is not None and self.class_pos is not None:
            hw_size, embed_size = self.spatial_pos.shape
            pos_embedding = torch.repeat_interleave(self.temporal_pos, hw_size, dim=0)
            pos_embedding.add_(self.spatial_pos.unsqueeze(0).expand(self.temporal_size, -1, -1).reshape(-1, embed_size))
            pos_embedding = torch.cat((self.class_pos.unsqueeze(0), pos_embedding), dim=0)
            x.add_(pos_embedding)

        return x


class MViT(nn.Module):
    def __init__(
        self,
        spatial_size: tuple[int, int],
        temporal_size: int,
        block_setting: Sequence[MSBlockConfig],
        residual_pool: bool,
        residual_with_cls_embed: bool,
        rel_pos_embed: bool,
        proj_after_attn: bool,
        dropout: float = 0.5,
        attention_dropout: float = 0.0,
        stochastic_depth_prob: float = 0.0,
        num_classes: int = 400,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        patch_embed_kernel: tuple[int, int, int] = (3, 7, 7),
        patch_embed_stride: tuple[int, int, int] = (2, 4, 4),
        patch_embed_padding: tuple[int, int, int] = (1, 3, 3),
    ) -> None:
        """
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spatial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        """
        super().__init__()
        _log_api_usage_once(self)
        total_stage_blocks = len(block_setting)
        if total_stage_blocks == 0:
            raise ValueError("The configuration parameter can't be empty.")

        if block is None:
            block = MultiscaleBlock

        if norm_layer is None:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)

        # Patch Embedding module
        self.conv_proj = nn.Conv3d(
            in_channels=3,
            out_channels=block_setting[0].input_channels,
            kernel_size=patch_embed_kernel,
            stride=patch_embed_stride,
            padding=patch_embed_padding,
        )

        input_size = [size // stride for size, stride in zip((temporal_size,) + spatial_size, self.conv_proj.stride)]

        # Spatio-Temporal Class Positional Encoding
        self.pos_encoding = PositionalEncoding(
            embed_size=block_setting[0].input_channels,
            spatial_size=(input_size[1], input_size[2]),
            temporal_size=input_size[0],
            rel_pos_embed=rel_pos_embed,
        )

        # Encoder module
        self.blocks = nn.ModuleList()
        for stage_block_id, cnf in enumerate(block_setting):
            # adjust stochastic depth probability based on the depth of the stage block
            sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0)

            self.blocks.append(
                block(
                    input_size=input_size,
                    cnf=cnf,
                    residual_pool=residual_pool,
                    residual_with_cls_embed=residual_with_cls_embed,
                    rel_pos_embed=rel_pos_embed,
                    proj_after_attn=proj_after_attn,
                    dropout=attention_dropout,
                    stochastic_depth_prob=sd_prob,
                    norm_layer=norm_layer,
                )
            )

            if len(cnf.stride_q) > 0:
                input_size = [size // stride for size, stride in zip(input_size, cnf.stride_q)]
        self.norm = norm_layer(block_setting[-1].output_channels)

        # Classifier module
        self.head = nn.Sequential(
            nn.Dropout(dropout, inplace=True),
            nn.Linear(block_setting[-1].output_channels, num_classes),
        )

        # Weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.LayerNorm):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1.0)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, PositionalEncoding):
                for weights in m.parameters():
                    nn.init.trunc_normal_(weights, std=0.02)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # patchify and reshape: (B, C, T, H, W) -> (B, embed_channels[0], T', H', W') -> (B, THW', embed_channels[0])
        x = _unsqueeze(x, 5, 2)[0]
        x = self.conv_proj(x)
        x = x.flatten(2).transpose(1, 2)

        # add positional encoding
        x = self.pos_encoding(x)

        # pass patches through the encoder
        thw = (self.pos_encoding.temporal_size,) + self.pos_encoding.spatial_size
        for block in self.blocks:
            x, thw = block(x, thw)
        x = self.norm(x)

        # classifier "token" as used by standard language architectures
        x = x[:, 0]
        x = self.head(x)

        return x


def _mvit(
    block_setting: list[MSBlockConfig],
    stochastic_depth_prob: float,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> MViT:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
        _ovewrite_named_param(kwargs, "spatial_size", weights.meta["min_size"])
        _ovewrite_named_param(kwargs, "temporal_size", weights.meta["min_temporal_size"])
    spatial_size = kwargs.pop("spatial_size", (224, 224))
    temporal_size = kwargs.pop("temporal_size", 16)

    model = MViT(
        spatial_size=spatial_size,
        temporal_size=temporal_size,
        block_setting=block_setting,
        residual_pool=kwargs.pop("residual_pool", False),
        residual_with_cls_embed=kwargs.pop("residual_with_cls_embed", True),
        rel_pos_embed=kwargs.pop("rel_pos_embed", False),
        proj_after_attn=kwargs.pop("proj_after_attn", False),
        stochastic_depth_prob=stochastic_depth_prob,
        **kwargs,
    )

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model


class MViT_V1_B_Weights(WeightsEnum):
    KINETICS400_V1 = Weights(
        url="https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.45, 0.45, 0.45),
            std=(0.225, 0.225, 0.225),
        ),
        meta={
            "min_size": (224, 224),
            "min_temporal_size": 16,
            "categories": _KINETICS400_CATEGORIES,
            "recipe": "https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`"
            ),
            "num_params": 36610672,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 78.477,
                    "acc@5": 93.582,
                }
            },
            "_ops": 70.599,
            "_file_size": 139.764,
        },
    )
    DEFAULT = KINETICS400_V1


class MViT_V2_S_Weights(WeightsEnum):
    KINETICS400_V1 = Weights(
        url="https://download.pytorch.org/models/mvit_v2_s-ae3be167.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.45, 0.45, 0.45),
            std=(0.225, 0.225, 0.225),
        ),
        meta={
            "min_size": (224, 224),
            "min_temporal_size": 16,
            "categories": _KINETICS400_CATEGORIES,
            "recipe": "https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`"
            ),
            "num_params": 34366972,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 80.757,
                    "acc@5": 94.665,
                }
            },
            "_ops": 64.224,
            "_file_size": 131.884,
        },
    )
    DEFAULT = KINETICS400_V1


@register_model()
@handle_legacy_interface(weights=("pretrained", MViT_V1_B_Weights.KINETICS400_V1))
def mvit_v1_b(*, weights: Optional[MViT_V1_B_Weights] = None, progress: bool = True, **kwargs: Any) -> MViT:
    """
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    """
    weights = MViT_V1_B_Weights.verify(weights)

    # 16 blocks grouped into 4 stages (1 + 2 + 11 + 2); spatial pooling of Q at the first
    # block of each new stage, channel expansion in the MLP of the last block of a stage.
    config: dict[str, list] = {
        "num_heads": [1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8],
        "input_channels": [96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768],
        "output_channels": [192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768, 768],
        "kernel_q": [[], [3, 3, 3], [], [3, 3, 3], [], [], [], [], [], [], [], [], [], [], [3, 3, 3], []],
        "kernel_kv": [[3, 3, 3]] * 16,
        "stride_q": [[], [1, 2, 2], [], [1, 2, 2], [], [], [], [], [], [], [], [], [], [], [1, 2, 2], []],
        "stride_kv": [[1, 8, 8], [1, 4, 4], [1, 4, 4]] + [[1, 2, 2]] * 11 + [[1, 1, 1], [1, 1, 1]],
    }

    block_setting = []
    for i in range(len(config["num_heads"])):
        block_setting.append(
            MSBlockConfig(
                num_heads=config["num_heads"][i],
                input_channels=config["input_channels"][i],
                output_channels=config["output_channels"][i],
                kernel_q=config["kernel_q"][i],
                kernel_kv=config["kernel_kv"][i],
                stride_q=config["stride_q"][i],
                stride_kv=config["stride_kv"][i],
            )
        )

    return _mvit(
        spatial_size=(224, 224),
        temporal_size=16,
        block_setting=block_setting,
        residual_pool=False,
        residual_with_cls_embed=False,
        rel_pos_embed=False,
        proj_after_attn=False,
        stochastic_depth_prob=kwargs.pop("stochastic_depth_prob", 0.2),
        weights=weights,
        progress=progress,
        **kwargs,
    )


@register_model()
@handle_legacy_interface(weights=("pretrained", MViT_V2_S_Weights.KINETICS400_V1))
def mvit_v2_s(*, weights: Optional[MViT_V2_S_Weights] = None, progress: bool = True, **kwargs: Any) -> MViT:
    """Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
        :members:
    """
    weights = MViT_V2_S_Weights.verify(weights)

    # MViTv2-S: same 16-block / 4-stage layout as MViTv1-B, but every block pools Q and KV with a
    # 3x3x3 kernel and the channel expansion happens in the same block as the resolution pooling.
    config: dict[str, list] = {
        "num_heads": [1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8],
        "input_channels": [96, 96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768],
        "output_channels": [96, 192, 192, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 768, 768],
        "kernel_q": [[3, 3, 3]] * 16,
        "kernel_kv": [[3, 3, 3]] * 16,
        "stride_q": [[1, 1, 1], [1, 2, 2], [1, 1, 1], [1, 2, 2]] + [[1, 1, 1]] * 10 + [[1, 2, 2], [1, 1, 1]],
        "stride_kv": [[1, 8, 8], [1, 4, 4], [1, 4, 4]] + [[1, 2, 2]] * 11 + [[1, 1, 1], [1, 1, 1]],
    }

    block_setting = []
    for i in range(len(config["num_heads"])):
        block_setting.append(
            MSBlockConfig(
                num_heads=config["num_heads"][i],
                input_channels=config["input_channels"][i],
                output_channels=config["output_channels"][i],
                kernel_q=config["kernel_q"][i],
                kernel_kv=config["kernel_kv"][i],
                stride_q=config["stride_q"][i],
                stride_kv=config["stride_kv"][i],
            )
        )

    return _mvit(
        spatial_size=(224, 224),
        temporal_size=16,
        block_setting=block_setting,
        residual_pool=True,
        residual_with_cls_embed=False,
        rel_pos_embed=True,
        proj_after_attn=True,
        stochastic_depth_prob=kwargs.pop("stochastic_depth_prob", 0.2),
        weights=weights,
        progress=progress,
        **kwargs,
    )