o
    ,h^                     @   s   d dl Z d dlZd dlZddlmZ eejds0edejjd< edejjd< edejjd< d dlm	Z	m
Z
mZ d	d
 Zdd ZG dd dejj
ZG dd dZ	dddZdS )    N   )_dummy_type_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r   r   r   c                   C      t  S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r    r	   r	   L/var/www/html/scripts/venv/lib/python3.10/site-packages/torch/cuda/graphs.pyis_current_stream_capturing   s   r   c                   C   r   )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )r   r	   r	   r	   r
   graph_pool_handle"   s   r   c                       s   e Zd ZdZd fdd	Zd fdd	Z fd	d
Z fddZ fddZ fddZ	 fddZ
 fddZ fddZ fddZ  ZS )	CUDAGrapha/  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiatiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it wil be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    Fc                    s   t  | |S N)super__new__)cls
keep_graph	__class__r	   r
   r   I   s   zCUDAGraph.__new__Nglobalc                    s   t  j||d dS )a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )poolcapture_error_modeN)r   capture_begin)selfr   r   r   r	   r
   r   L   s   zCUDAGraph.capture_beginc                       t    dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r   capture_endr   r   r	   r
   r   _   s   	zCUDAGraph.capture_endc                    r   )a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r   instantiater   r   r	   r
   r   j   s   zCUDAGraph.instantiatec                    r   )z,Replay the CUDA work captured by this graph.N)r   replayr   r   r	   r
   r   s      zCUDAGraph.replayc                    r   )z1Delete the graph currently held by this instance.N)r   resetr   r   r	   r
   r    w   r   zCUDAGraph.resetc                    
   t   S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r   r   r   r   r	   r
   r   {   s   
zCUDAGraph.poolc                    r!   )z/Enable debugging mode for CUDAGraph.debug_dump.)r   enable_debug_moder   r   r	   r
   r"      s   
zCUDAGraph.enable_debug_modec                    s   t  |S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r   
debug_dump)r   
debug_pathr   r	   r
   r#      s   zCUDAGraph.debug_dumpc                    r!   )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r   raw_cuda_graphr   r   r	   r
   r%      s   
zCUDAGraph.raw_cuda_graph)F)Nr   )__name__
__module____qualname____doc__r   r   r   r   r   r    r   r"   r#   r%   __classcell__r	   r	   r   r
   r   .   s    	
r   c                   @   sJ   e Zd ZU dZdZejd ed< 			ddefddZ	d	d
 Z
dd ZdS )grapha  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Nztorch.cuda.Streamdefault_capture_streamr   r   c                 C   sr   | j jd u rtj | j _|d u rdn|f| _|d ur|n| j j| _| jd us)J tj| j| _|| _	|| _
d S )Nr	   )r   r,   torchcudaStreamr   capture_streamstream
stream_ctx
cuda_graphr   )r   r3   r   r1   r   r	   r	   r
   __init__   s   

zgraph.__init__c                 C   s@   t j  t  t j  | j  | jj	| j
d| ji d S )Nr   )r-   r.   synchronizegccollectempty_cacher2   	__enter__r3   r   r   r   r   r	   r	   r
   r9      s   



zgraph.__enter__c                 C   s   | j   | j||| d S r   )r3   r   r2   __exit__)r   exc_type	exc_value	tracebackr	   r	   r
   r:      s   
zgraph.__exit__)NNr   )r&   r'   r(   r)   r,   typingOptional__annotations__strr4   r9   r:   r	   r	   r	   r
   r+      s   
 
r+      Fc           '         s  t  rt  rtdd}t| tsd}| f} |f}g  t| |D ]N\}}t|t jjrUt	|j
dkrBt	|jdkrBt	|jdksFJ dtdd | D sUJ dt jjj| } t| td	d |D spJ d
q"dd  D }	dd | D  fddtt	| D }
dd tt	| D }dd tt	| D }|du rt n|}t j  t jt j \ t| ||
D ]M\}}}d\}}}t|D ]4}t jj|| }tdd |D }t	|dkrt jj|tdd |D tdd |D d|d}q|||fD ]}~q	qW d   n	1 sw   Y  t j  g }g }t| ||D ]8\}}}t jj||d || }W d   n	1 sKw   Y  t jj|\}}|t| || q.g }g }tt|
t|t|D ]\}}}tdd |D }tdd |D }d}t	|dkrt jj||d! t jj|tdd |D tdd |D d|d}W d   n	1 sw   Y  g }d} |D ]}!|!jr|dur|||   | d7 } q|d qt|}|| || qw|   |   dd }"g }#t!| D ]E\}$}|"||$ ||$ |$ |	|$ ||$ |
|$ ||$ ||$ ||$ 	}%t|t jjrOdd  }&|&||j"|%|j#|_#|#| q|#|% q|r]|#d S t|#S )!a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FTr   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s   s    | ]}|j d u V  qdS )FNrequires_grad.0br	   r	   r
   	<genexpr>@      z)make_graphed_callables.<locals>.<genexpr>zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s   s    | ]	}t |tjV  qd S r   )
isinstancer-   Tensor)rF   argr	   r	   r
   rH   G      zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S   s   g | ]}t |qS r	   )len)rF   argsr	   r	   r
   
<listcomp>N  s    z*make_graphed_callables.<locals>.<listcomp>c                 S   s*   g | ]}t |tjjrt| nd qS )r	   )rJ   r-   nnModuletuple
parameters)rF   cr	   r	   r
   rP   O  s    c                    s   g | ]
} | |  qS r	   r	   rF   iflatten_sample_argsper_callable_module_paramsr	   r
   rP   S  s    c                 S      g | ]}t j qS r	   r-   r.   r   rF   _r	   r	   r
   rP   X      c                 S   r[   r	   r\   r]   r	   r	   r
   rP   Y  r_   N)NNNc                 s       | ]}|j r|V  qd S r   rC   rF   or	   r	   r
   rH   h  rI   c                 s   r`   r   rC   rV   r	   r	   r
   rH   l  s    
c                 s   s     | ]}|j rt|V  qd S r   rD   r-   
empty_likera   r	   r	   r
   rH   o  s    
)outputsinputsgrad_outputsonly_inputsallow_unused)r   c                 s   s$    | ]}|j rt|nd V  qd S r   rc   ra   r	   r	   r
   rH         
c                 s   r`   r   rC   ra   r	   r	   r
   rH     rI   c                 s   r`   r   rC   rV   r	   r	   r
   rH     rI   c                 s   s    | ]	}|d ur|V  qd S r   r	   ra   r	   r	   r
   rH     rM      c	           
         s8   G 	fdddt jj  fdd}	|	S )Nc                       s@   e Zd ZefddZeejjj fddZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                    s`   t D ]}|  ||  kr| ||  q   tts'J tdd D S )Nc                 s   s    | ]}|  V  qd S r   detachra   r	   r	   r
   rH     s    zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)rangedata_ptrcopy_r   rJ   rS   )ctxrf   rW   )	fwd_graphlen_user_argsstatic_input_surfacestatic_outputsr	   r
   forward  s   zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardc                    sr   t |t ks
J t|D ]\}}|d ur$| | kr$|| q   tts0J tdd D S )Nc                 s   s$    | ]}|d ur|  n|V  qd S r   rl   rE   r	   r	   r
   rH     rj   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)rN   zipro   rp   r   rJ   rS   )rq   gradsggrad)	bwd_graphstatic_grad_inputsstatic_grad_outputsr	   r
   backward  s   
zXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)
r&   r'   r(   staticmethodrv   r-   autogradfunctiononce_differentiabler~   r	   )r{   rr   rs   r|   r}   rt   ru   r	   r
   Graphed  s    	r   c                     s0   t jjj|  } jt|  }t jj|S r   )r-   utils_pytreearg_tree_leavesapplyrS   tree_unflatten)	user_argsflatten_user_argsout)r   module_paramsoutput_unflatten_specr	   r
   functionalized  s   zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r-   r   Function)
rr   r{   r   rs   r   rt   ru   r}   r|   r   r	   )
r   r{   rr   rs   r   r   r|   r}   rt   ru   r
   make_graphed_autograd_function  s   $z>make_graphed_callables.<locals>.make_graphed_autograd_functionc                    s    fdd}|S )Nc                     s    j kr	|  S |  S r   )training)r   funcgraph_training_stategraphedorig_fwdr	   r
   new_fwd  s   
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdr	   )r   r   r   r   r   r	   r   r
   make_graphed_forward  s   z4make_graphed_callables.<locals>.make_graphed_forward)$r-   is_autocast_enabledis_autocast_cache_enabledRuntimeErrorrJ   rS   rw   rQ   rR   rN   _backward_hooks_forward_hooks_forward_pre_hooksallbuffersr   r   r   appendrn   r   r.   r5   r1   r/   tree_leavesr   rz   r+   tree_flattenreversedrD   reverse	enumerater   rv   )'	callablessample_argsnum_warmup_itersallow_unused_inputr   just_one_callablerU   rO   flatten_argper_callable_len_user_args"per_callable_static_input_surfaces
fwd_graphs
bwd_graphsmempoolr   rt   grad_inputsre   outputs_gradr^   vper_callable_static_outputs"per_callable_output_unflatten_specrr   flatten_outputsspec per_callable_static_grad_outputsper_callable_static_grad_inputsru   r{   r}   r|   grad_idxrL   r   retrW   r   r   r	   rX   r
   make_graphed_callables   s   E






3r   )rB   FN)r6   r>   r-   _utilsr   hasattr_C__dict__torch._Cr   r   r   r   r   r   r+   r   r	   r	   r	   r
   <module>   s"   	kK