o
    ,hn}                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlm  mZ d dlmZmZ d dlmZmZmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% g dZ&de'e(e(f de(de(de(de'e(e(f f
ddZ)de'e(e(f de(de*e'e(e(f  fddZ+de(de(dejfddZ,G dd dej-Z.G dd  d ej-Z/G d!d" d"ej-Z0G d#d$ d$ej-Z1G d%d& d&ej-Z2G d'd( d(ej-Z3G d)d* d*ej-Z4G d+d, d,ej-Z5G d-d. d.ej-Z6		/dCd0e(d1e*e( d2e*e( d3e7d4e(d5e(d6e
e d7e8d8ede6fd9d:Z9G d;d< d<eZ:e ed=e:j;fd>dd?d@d6e
e: d7e8d8ede6fdAdBZ<dS )D    N)OrderedDict)Sequence)partial)AnyCallableOptional)nnTensor)register_modelWeightsWeightsEnum)_IMAGENET_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)Conv2dNormActivationSqueezeExcitation)StochasticDepth)ImageClassificationInterpolationMode)_log_api_usage_once)MaxVitMaxVit_T_Weightsmaxvit_t
input_sizekernel_sizestridepaddingreturnc                 C   s8   | d | d|  | d | d | d|  | d fS )Nr          )r   r   r   r   r    r    T/var/www/html/scripts/venv/lib/python3.10/site-packages/torchvision/models/maxvit.py_get_conv_output_shape   s   r"   n_blocksc                 C   s<   g }t | ddd}t|D ]}t |ddd}|| q|S )zQUtil function to check that the input size is correct for a MaxVit configuration.   r   r   )r"   rangeappend)r   r#   shapesblock_input_shape_r    r    r!   _make_block_input_shapes!   s   r*   heightwidthc                 C   s   t t jt | t |gdd}t |d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | d 7  < |d d d d df  |d 7  < |d d d d df  d| d 9  < |dS )Nij)indexingr   r   r   )torchstackmeshgridarangeflattenpermute
contiguoussum)r+   r,   coordscoords_flatrelative_coordsr    r    r!   _get_relative_position_index+   s   $,""&
r;   c                       sp   e Zd ZdZ	ddedededededed	ejf d
ed	ejf deddf fddZ	de
de
fddZ  ZS )MBConva=  MBConv: Mobile Inverted Residual Bottleneck.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        p_stochastic_dropout (float): Probability of stochastic depth.
            in_channelsout_channelsexpansion_ratiosqueeze_ratior   activation_layer.
norm_layerp_stochastic_dropoutr   Nc	                    s*  t    |  |dkp||k}	|	r2tj||ddddg}
|dkr+tjd|ddg|
 }
tj|
 | _nt | _t|| }t|| }|rMt	|dd| _
nt | _
t }|||d	< t||ddd
||d d|d< t||d|d|||d d	|d< t||tjd|d< tj||ddd|d< t|| _d S )Nr   T)r   r   biasr   r$   r   r   r   rowmodepre_normr   )r   r   r   rB   rC   inplaceconv_a)r   r   r   rB   rC   groupsrK   conv_b)
activationsqueeze_excitation)r>   r?   r   rE   conv_c)super__init__r   Conv2d	AvgPool2d
SequentialprojIdentityintr   stochastic_depthr   r   r   SiLUlayers)selfr>   r?   r@   rA   r   rB   rC   rD   should_projrW   mid_channelssqz_channels_layers	__class__r    r!   rS   D   sP   





zMBConv.__init__xc                 C   s"   |  |}| | |}|| S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H / stride, W / stride].
        )rW   rZ   r\   r]   rd   resr    r    r!   forward   s   
zMBConv.forward)r=   )__name__
__module____qualname____doc__rY   floatr   r   ModulerS   r	   rg   __classcell__r    r    rb   r!   r<   6   s.    	
=r<   c                       sT   e Zd ZdZdedededdf fddZdejfd	d
ZdedefddZ	  Z
S )$RelativePositionalMultiHeadAttentionzRelative Positional Multi-Head Attention.

    Args:
        feat_dim (int): Number of input features.
        head_dim (int): Number of features per head.
        max_seq_len (int): Maximum sequence length.
    feat_dimhead_dimmax_seq_lenr   Nc                    s   t    || dkrtd| d| || | _|| _tt|| _|| _	t
|| j| j d | _|d | _t
| j| j || _t
jtjd| j d d| j d  | jftjd| _| d	t| j| j tj
jj| jd
d d S )Nr   z
feat_dim: z  must be divisible by head_dim: r$   g      r   r   )dtyperelative_position_index{Gz?std)rR   rS   
ValueErrorn_headsrq   rY   mathsqrtsizerr   r   Linearto_qkvscale_factormerge	parameter	Parameterr0   emptyfloat32relative_position_bias_tableregister_bufferr;   inittrunc_normal_)r]   rp   rq   rr   rb   r    r!   rS      s   


,z-RelativePositionalMultiHeadAttention.__init__c                 C   s@   | j d}| j| | j| jd}|ddd }|dS )Nr/   r   r   r   )rt   viewr   rr   r5   r6   	unsqueeze)r]   
bias_indexrelative_biasr    r    r!   get_relative_positional_bias   s   
zARelativePositionalMultiHeadAttention.get_relative_positional_biasrd   c                 C   s  |j \}}}}| j| j}}| |}tj|ddd\}	}
}|	|||||ddddd}	|
|||||ddddd}
||||||ddddd}|
| j }
t	d|	|
}| 
 }tj|| dd}t	d	||}|ddddd||||}| |}|S )
z
        Args:
            x (Tensor): Input tensor with expected layout of [B, G, P, D].
        Returns:
            Tensor: Output tensor with expected layout of [B, G, P, D].
        r$   r/   )dimr   r   r      z!B G H I D, B G H J D -> B G H I Jz!B G H I J, B G H J D -> B G H I D)shapery   rq   r~   r0   chunkreshaper5   r   einsumr   Fsoftmaxr   )r]   rd   BGPDHDHqkvqkvdot_prodpos_biasoutr    r    r!   rg      s   
   

z,RelativePositionalMultiHeadAttention.forward)rh   ri   rj   rk   rY   rS   r0   r	   r   rg   rn   r    r    rb   r!   ro      s    ro   c                       sD   e Zd ZdZdededdf fddZdejdejfd	d
Z  Z	S )SwapAxeszPermute the axes of a tensor.abr   Nc                    s   t    || _|| _d S N)rR   rS   r   r   )r]   r   r   rb   r    r!   rS      s   

zSwapAxes.__init__rd   c                 C   s   t || j| j}|S r   )r0   swapaxesr   r   re   r    r    r!   rg      s   zSwapAxes.forward)
rh   ri   rj   rk   rY   rS   r0   r	   rg   rn   r    r    rb   r!   r      s    r   c                       s8   e Zd ZdZd
 fddZdededefdd	Z  ZS )WindowPartitionzB
    Partition the input tensor into non-overlapping windows.
    r   Nc                       t    d S r   rR   rS   r]   rb   r    r!   rS         zWindowPartition.__init__rd   pc                 C   sf   |j \}}}}|}||||| ||| |}|dddddd}|||| ||  || |}|S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
            p (int): Number of partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, H/P, W/P, P*P, C].
        r   r   r   r$      r   r   r   r5   )r]   rd   r   r   Cr   Wr   r    r    r!   rg      s    zWindowPartition.forwardr   N	rh   ri   rj   rk   rS   r	   rY   rg   rn   r    r    rb   r!   r      s    r   c                
       s@   e Zd ZdZd fddZdededed	edef
d
dZ  ZS )WindowDepartitionzo
    Departition the input tensor of non-overlapping windows into a feature volume of layout [B, C, H, W].
    r   Nc                    r   r   r   r   rb   r    r!   rS     r   zWindowDepartition.__init__rd   r   h_partitionsw_partitionsc                 C   s`   |j \}}}}|}	||}
}|||
||	|	|}|dddddd}||||
|	 ||	 }|S )ar  
        Args:
            x (Tensor): Input tensor with expected layout of [B, (H/P * W/P), P*P, C].
            p (int): Number of partitions.
            h_partitions (int): Number of vertical partitions.
            w_partitions (int): Number of horizontal partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        r   r   r   r$   r   r   r   )r]   rd   r   r   r   r   r   PPr   r   HPWPr    r    r!   rg     s   

zWindowDepartition.forwardr   r   r    r    rb   r!   r      s    &r   c                       s   e Zd ZdZdededededeeef deded	ej	f d
ed	ej	f de
de
de
ddf fddZdedefddZ  ZS )PartitionAttentionLayera  
    Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window.

    Args:
        in_channels (int): Number of input channels.
        head_dim (int): Dimension of each attention head.
        partition_size (int): Size of the partitions.
        partition_type (str): Type of partitioning to use. Can be either "grid" or "window".
        grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into.
        mlp_ratio (int): Ratio of the  feature size expansion in the MLP layer.
        activation_layer (Callable[..., nn.Module]): Activation function to use.
        norm_layer (Callable[..., nn.Module]): Normalization function to use.
        attention_dropout (float): Dropout probability for the attention layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        p_stochastic_dropout (float): Probability of dropping out a partition.
    r>   rq   partition_sizepartition_type	grid_size	mlp_ratiorB   .rC   attention_dropoutmlp_dropoutrD   r   Nc              	      s(  t    || | _|| _|d | | _|| _|| _|dvr"td|dkr/|| j| _| _	n| j|| _| _	t
 | _t | _|dkrHtddnt | _|dkrVtddnt | _t||t|||d t|	| _tt|t||| | t|| |t|
| _t|d	d
| _d S )Nr   )gridwindowz0partition_type must be either 'grid' or 'window'r   r   r   rG   rH   )rR   rS   ry   rq   n_partitionsr   r   rx   r   gr   partition_opr   departition_opr   r   rX   partition_swapdepartition_swaprV   ro   Dropout
attn_layer	LayerNormr}   	mlp_layerr   stochastic_dropout)r]   r>   rq   r   r   r   r   rB   rC   r   r   rD   rb   r    r!   rS   -  s8   

		z PartitionAttentionLayer.__init__rd   c                 C   s   | j d | j | j d | j }}t| j d | j dko&| j d | j dkd| j | j | || j}| |}|| | | }|| | 	| }| 
|}| || j||}|S )z
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        r   r   z[Grid size must be divisible by partition size. Got grid size of {} and partition size of {})r   r   r0   _assertformatr   r   r   r   r   r   r   )r]   rd   ghgwr    r    r!   rg   g  s   "
&

zPartitionAttentionLayer.forward)rh   ri   rj   rk   rY   strtupler   r   rm   rl   rS   r	   rg   rn   r    r    rb   r!   r     s8    
	
:r   c                       s   e Zd ZdZdededededededejf d	edejf d
edededededede	eef ddf fddZ
dedefddZ  ZS )MaxVitLayera  
    MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        p_stochastic_dropout (float): Probability of stochastic depth.
        partition_size (int): Size of the partitions.
        grid_size (Tuple[int, int]): Size of the input feature grid.
    r>   r?   rA   r@   r   rC   .rB   rq   r   r   r   rD   r   r   r   Nc                    s   t    t }t||||||||d|d< t|||d||	|tj||
|d|d< t|||d||	|tj||
|d|d< t|| _d S )N)r>   r?   r@   rA   r   rB   rC   rD   MBconvr   )r>   rq   r   r   r   r   rB   rC   r   r   rD   window_attentionr   grid_attention)	rR   rS   r   r<   r   r   r   rV   r\   )r]   r>   r?   rA   r@   r   rC   rB   rq   r   r   r   rD   r   r   r\   rb   r    r!   rS     sN   



zMaxVitLayer.__init__rd   c                 C   s   |  |}|S z
        Args:
            x (Tensor): Input tensor of shape (B, C, H, W).
        Returns:
            Tensor: Output tensor of shape (B, C, H, W).
        r\   )r]   rd   r    r    r!   rg     s   
zMaxVitLayer.forward)rh   ri   rj   rk   rY   rl   r   r   rm   r   rS   r	   rg   rn   r    r    rb   r!   r     sD    	

Ar   c                       s   e Zd ZdZdedededededejf dedejf d	ed
edededede	eef dede
e ddf fddZdedefddZ  ZS )MaxVitBlocka(  
    A MaxVit block consisting of `n_layers` MaxVit layers.

     Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        p_stochastic_dropout (float): Probability of stochastic depth.
        partition_size (int): Size of the partitions.
        input_grid_size (Tuple[int, int]): Size of the input feature grid.
        n_layers (int): Number of layers in the block.
        p_stochastic (List[float]): List of probabilities for stochastic depth for each layer.
    r>   r?   rA   r@   rC   .rB   rq   r   r   r   r   input_grid_sizen_layersp_stochasticr   Nc                    s   t    t||kstd| d| dt | _t|dddd| _t	|D ]+\}}|dkr2dnd}|  jt
|dkr>|n||||||||||	|
|| j|d	g7  _q(d S )
Nz'p_stochastic must have length n_layers=z, got p_stochastic=.r$   r   r   rF   r   )r>   r?   rA   r@   r   rC   rB   rq   r   r   r   r   r   rD   )rR   rS   lenrx   r   
ModuleListr\   r"   r   	enumerater   )r]   r>   r?   rA   r@   rC   rB   rq   r   r   r   r   r   r   r   idxr   r   rb   r    r!   rS     s4   


zMaxVitBlock.__init__rd   c                 C   s   | j D ]}||}q|S r   r   )r]   rd   layerr    r    r!   rg   -  s   

zMaxVitBlock.forward)rh   ri   rj   rk   rY   rl   r   r   rm   r   listrS   r	   rg   rn   r    r    rb   r!   r     sD    	
3r   c                !       s   e Zd ZdZdejddddddfdeeef ded	ed
ee dee dede	de
edejf  dedejf de	de	dede	de	deddf  fddZdedefddZdd Z  ZS )r   ay  
    Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_ paper.
    Args:
        input_size (Tuple[int, int]): Size of the input image.
        stem_channels (int): Number of channels in the stem.
        partition_size (int): Size of the partitions.
        block_channels (List[int]): Number of channels in each block.
        block_layers (List[int]): Number of layers in each block.
        stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value.
        squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25.
        expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4.
        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.01)`).
        activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4.
        mlp_dropout (float): Dropout probability for the MLP layer. Default: 0.0.
        attention_dropout (float): Dropout probability for the attention layer. Default: 0.0.
        num_classes (int): Number of classes. Default: 1000.
    Ng      ?r   r=   i  r   stem_channelsr   block_channelsblock_layersrq   stochastic_depth_probrC   .rB   rA   r@   r   r   r   num_classesr   c                    s  t    t|  d}|d u rttjddd}t|t|}t|D ]%\}}|d | dks6|d | dkrGt	d| d| d	| d
| d	q"t
t||dd||	dd dt||ddd d dd| _t|dddd}|| _t | _|g|d d  }|}td|t| }d}t|||D ]+\}}}| jt|||
|||	|||||||||||  d | jd j}||7 }qt
tdt t|d t|d |d t tj|d |dd| _|   d S )Nr$   gMbP?g{Gz?)epsmomentumr   r   zInput size z
 of block z$ is not divisible by partition size zx. Consider changing the partition size or the input size.
Current configuration yields the following block input sizes: r   r   F)r   rC   rB   rE   rK   T)r   rC   rB   rE   rF   r/   )r>   r?   rA   r@   rC   rB   rq   r   r   r   r   r   r   r   )rE   ) rR   rS   r   r   r   BatchNorm2dr*   r   r   rx   rV   r   stemr"   r   r   blocksnplinspacer7   tolistzipr&   r   r   AdaptiveAvgPool2dFlattenr   r}   Tanh
classifier_init_weights)r]   r   r   r   r   r   rq   r   rC   rB   rA   r@   r   r   r   r   input_channelsblock_input_sizesr   block_input_sizer>   r?   r   p_idx
in_channelout_channel
num_layersrb   r    r!   rS   N  s   
 


	zMaxVit.__init__rd   c                 C   s,   |  |}| jD ]}||}q| |}|S r   )r   r   r   )r]   rd   blockr    r    r!   rg     s
   



zMaxVit.forwardc                 C   s   |   D ]P}t|tjr"tjj|jdd |jd ur!tj|j qt|tj	r9tj
|jd tj
|jd qt|tjrTtjj|jdd |jd urTtj|j qd S )Nru   rv   r   r   )modules
isinstancer   rT   r   normal_weightrE   zeros_r   	constant_r}   )r]   mr    r    r!   r     s   

zMaxVit._init_weights)rh   ri   rj   rk   r   GELUr   rY   r   rl   r   r   rm   rS   r	   rg   r   rn   r    r    rb   r!   r   9  sZ    %
	
vr   Fr   r   r   r   r   rq   weightsprogresskwargsc              
   K   s   |d ur(t |dt|jd  |jd d |jd d ksJ t |d|jd  |dd}	td| ||||||	d|}
|d urK|
|j|d	d
 |
S )Nr   
categoriesmin_sizer   r   r      r  )r   r   r   r   rq   r   r   T)r  
check_hashr    )r   r   metapopr   load_state_dictget_state_dict)r   r   r   r   r   rq   r
  r  r  r   modelr    r    r!   _maxvit  s&    r  c                   @   sH   e Zd Zedeeddejdedddddd	d
idddddZ	e	Z
dS )r   z9https://download.pytorch.org/models/maxvit_t-bc5ab103.pthr  )	crop_sizeresize_sizeinterpolationir  zLhttps://github.com/pytorch/vision/tree/main/references/classification#maxvitzImageNet-1KgT@g|?5.X@)zacc@1zacc@5gZd;@gK7]@zThese weights reproduce closely the results of the paper using a similar training recipe.
            They were trained with a BatchNorm2D momentum of 0.99 instead of the more correct 0.01.)r  
num_paramsr  recipe_metrics_ops
_file_size_docs)url
transformsr  N)rh   ri   rj   r   r   r   r   BICUBICr   IMAGENET1K_V1DEFAULTr    r    r    r!   r     s*    
r   
pretrained)r
  T)r
  r  c                 K   s2   t | } td	dg dg dddd| |d|S )
a  
    Constructs a maxvit_t architecture from
    `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_.

    Args:
        weights (:class:`~torchvision.models.MaxVit_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MaxVit_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.maxvit.MaxVit``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/maxvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.MaxVit_T_Weights
        :members:
    @   )r'        i   )r   r   r   r       g?   )r   r   r   rq   r   r   r
  r  Nr    )r   verifyr  )r
  r  r  r    r    r!   r     s   
	r   )NF)=rz   collectionsr   collections.abcr   	functoolsr   typingr   r   r   numpyr   r0   torch.nn.functionalr   
functionalr   r	   torchvision.models._apir
   r   r   torchvision.models._metar   torchvision.models._utilsr   r   torchvision.ops.miscr   r    torchvision.ops.stochastic_depthr   torchvision.transforms._presetsr   r   torchvision.utilsr   __all__r   rY   r"   r   r*   r;   rm   r<   ro   r   r   r   r   r   r   r   rl   boolr  r   r$  r   r    r    r    r!   <module>   sr    .*
WIhaU .

*.