o
    ,hWg                  &   @   s  d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ ddgZG d	d deZd
de de de de de
 d e_ dee dee dee dee dee dee dededededededededededef"d d!Zdee dee dee dee dee dee dededededededededededef"d"d#Zeed$	%		%	%	%	%d(dee dee dee dee dee dee ded&ee dedededededededededef$d'dZdS ))z'Implementation for the NAdam algorithm.    )castOptionalUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_params_doc_stack_if_compiling
_to_scalar_use_grad_for_differentiable_view_as_real	OptimizerParamsTNAdamnadamc                       s   e Zd Z						ddddddd	ed
eeef deeef dedededede	e dededef fddZ
 fddZdd ZedddZ  ZS )r   Mb`?g?g+?:0yE>r   Mbp?FN)foreachmaximize
capturabledifferentiableparamslrbetasepsweight_decaymomentum_decaydecoupled_weight_decayr   r   r   r    c                   s   t |tr| dkrtdd|kstd| d|ks%td| d|d   kr1dk s;n td|d  d|d   krGdk sQn td	|d  d|ks\td
| d|ksgtd| t|||||||	||
|d
}t || d S )Nr   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )
r"   r#   r$   r%   r&   r'   r   r   r   r    )
isinstancer   numel
ValueErrordictsuper__init__)selfr!   r"   r#   r$   r%   r&   r'   r   r   r   r    defaults	__class__ L/var/www/html/scripts/venv/lib/python3.10/site-packages/torch/optim/nadam.pyr/   !   s6   zNAdam.__init__c                    s  t  | | jD ]|}|dd |dd  |dd |dd |dd |d D ]W}| j|g }t|dkrt|d	 sat	|d	 }|d rWtj
|t |jd
ntj
|t d|d	< t|d s|d }|d rztj
|t |jd
ntj
|t d|d< q-q	d S )Nr   Fr   r   r    r'   r!   r   stepdtypedevicer8   
mu_product)r.   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   r9   )r0   r?   grouppp_statestep_valmu_prod_valr2   r4   r5   r<   L   s:   


zNAdam.__setstate__c                 C   s*  d}|d D ]}	|	j d ur|t|	O }||	 |	j jr!td||	j  | j|	 }
t|
dkrv|d r@tjdt	 |	j
dntjdt	 d	|
d
< |d rXtjdt	 |	j
dntjdt	 d	|
d< tj|	tjd|
d< tj|	tjd|
d< ||
d  ||
d  ||
d  ||
d
  q|S )NFr!   z'NAdam does not support sparse gradientsr   r   r4   r7   r(   r:   r6   r)   r;   )memory_formatexp_avg
exp_avg_sq)gradrB   
is_complexappend	is_sparseRuntimeErrorr?   rA   zerosr   r9   rE   ones
zeros_likepreserve_format)r0   rF   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepshas_complexrG   r?   r4   r4   r5   _init_groupj   s<   





zNAdam._init_groupc                 C   s   |    d}|dur!t  | }W d   n1 sw   Y  | jD ]N}g }g }g }g }g }g }	ttttf |d \}
}| |||||||	}t||||||	|
||d |d |d |d |d |d |d	 |d
 |d |d q$|S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr#   r"   r%   r&   r$   r   r'   r   r   r    )beta1beta2r"   r%   r&   r$   r   r'   r   r   r    r]   )	 _cuda_graph_capture_health_checkrB   enable_gradr=   r   tuplerD   r^   r   )r0   closurelossrF   rW   rX   rY   rZ   r[   r\   r_   r`   r]   r4   r4   r5   r6      sX   


z
NAdam.step)r   r   r   r   r   FN)__name__
__module____qualname__r   r   rD   r   rc   boolr   r/   r<   r^   r   r6   __classcell__r4   r4   r2   r5   r       sN    



+2a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}, \:\textit{maximize}             \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to decouple the weight
            decay as in AdamW to obtain NAdamW. If True, the algorithm does not
            accumulate weight decay in the momentum nor variance. (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    r!   rX   rY   rZ   r[   r\   r_   r`   r"   r%   r&   r$   r'   r   r   r    r]   c                C   sn  t j s	t|}t| D ]&\}}|s|| n||  }|| }|| }|| }|| }t |rFt |}t |}t |}t |}t j sp|rpt	 }|j
j|j
j  kr`|j
jkrhn n|j
j|v spJ d| d|d7 }|ry|}nt|}d||  }|	dkr|r|d||	   n|j||	d}|ddd||
     }|ddd|d |
     }||9 }||d|  ||j||d| d	 || }|s|r||}|| }|| d|  d|   }|| | d|   }||| ||| qt|| }|| |j||| d|  dt|  d	 |j||| | d|  d	 qd S )
NzVIf capturable=True, params, mu_products and state_steps must be on supported devices: .r   r   alphar)         ?Q?)value)rB   jitis_scriptingr   	enumeraterO   view_as_realcompileris_compilingr   r9   typer   mul_addlerp_addcmul_divsqrtaddcdiv_add_)r!   rX   rY   rZ   r[   r\   r_   r`   r"   r%   r&   r$   r'   r   r   r    r]   iparamrN   rL   rM   r;   step_tcapturable_supported_devicesr6   bias_correction2mumu_nextdenommu_product_nextr4   r4   r5   _single_tensor_nadam  sh   





$


r   c          (         s  t | dkrd S |rJ dtj s1|r1tddtfddt| ||D s1J d dtt	| |||||g}|
 D ]\\}}}}}}}ttt |}ttt |}ttt |}ttt |}ttt |}ttt |}|rt|||| |rt|}tj s|d jrtj|tjd	d
dd	d nt|d |	dkr|rt|d|	   n|rtj|||	d ntj|||	d}t||d   t| t|||d  t|}|r@t|} td| }!t|!d t|!d	 t|!  t|  td| }"t|"d t|"d	 t|"  ~ t|}#t|#d	 t|# t|# nfdd|D }# fdd|D }! fdd|D }"t||! t||# t|| ~#|rt|!d	 t|! t|d	}$t|$ t|!|$ |!}%~$t||"}$t|" t|$d	 t|"|$ |"}&~$t|%|}'t|'|&| t||'| qDt fddt||!D }%t fddt||"D }&t||||% t||||& qDd S )Nr   z#_foreach ops don't support autogradF)supports_xlac                 3   sF    | ]\}}}|j j|j j  ko|j jkn  o|j j v V  qd S rf   )r9   rx   ).0rG   mpr6   )r   r4   r5   	<genexpr>  s    $

z&_multi_tensor_nadam.<locals>.<genexpr>zWIf capturable=True, params, mu_products, and state_steps must be on supported devices: rl   r)   cpu)r9   rm   r   rp   g      c                    s    g | ]}d  t |  d qS )r   ro   r   r   r6   )r`   r4   r5   
<listcomp>  s    z'_multi_tensor_nadam.<locals>.<listcomp>c                    s(   g | ]} d ddt |     qS )r)   ro   rp   r   r   r_   r&   r4   r5   r     s    c                    s,   g | ]} d ddt |d      qS )r)   ro   rp   r   r   r   r   r4   r5   r     s    c                    s0   g | ]\}}t  d |  d t |  d qS r)   r   )r   r;   r   r"   r4   r5   r   6  s    c                    s0   g | ]\}}t  | d t ||   d qS r   r   )r   r;   r   r   r4   r5   r   <  s    )!rA   rB   rv   rw   r   allzipr   r   "_group_tensors_by_device_and_dtypevaluesr   listr   r   _foreach_negis_cpu_foreach_add_rE   _foreach_mul__foreach_add_foreach_lerp__foreach_addcmul__foreach_sqrt_foreach_mul_foreach_pow_foreach_sub__foreach_neg__foreach_sqrt__foreach_div__foreach_sub_foreach_addcdiv_r   )(r!   rX   rY   rZ   r[   r\   r_   r`   r"   r%   r&   r$   r'   r   r   r    r]   grouped_tensorsgrouped_params_grouped_grads_grouped_exp_avgs_grouped_exp_avg_sqs_grouped_mu_products_grouped_state_steps__grouped_paramsgrouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_mu_productsgrouped_state_stepsexp_avg_sq_sqrtexponentmusmu_nextsbias_correction_sqrtr   step_size_gradsstep_size_expavg	numeratorr4   )r_   r`   r   r"   r&   r5   _multi_tensor_nadamz  s  











 r   )single_tensor_fnFr   c                C   s   t dd |D stdt dd |D std|du r't| |	dd\}}|r2tj r2td	|r<tj s<t}nt}|| |||||||||||||||	|
d
 dS )zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c                 s       | ]	}t |tjV  qd S rf   r*   rB   r   r   tr4   r4   r5   r   q      znadam.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc                 s   r   rf   r   r   r4   r4   r5   r   v  r   zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizers)r_   r`   r"   r%   r&   r   r'   r$   r   r    r]   )r   rR   r   rB   rr   rs   r   r   )r!   rX   rY   rZ   r[   r\   r'   r   r   r    r]   r   r_   r`   r"   r%   r&   r$   r   funcr4   r4   r5   r   U  sH   

)FNFFFF) __doc__typingr   r   r   rB   r   	optimizerr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   r   rD   rj   r   r   r   r4   r4   r4   r5   <module>   s  H 8'C	

a	

 \
	
