o
    h                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZ d
dlmZ d
dlmZmZ g dZeG dd dZ dee! de!fddZ"dej#de!de!de$ej#e!f fddZ%dej#de!de!de!dej#f
ddZ&ej'(d ej'(d G dd dej)Z*dej#d e!dej#fd!d"Z+d#ej#d$ej#d%e$e!e!e!f d&e$e!e!e!f d'ej#d(ej#d)ej#dej#fd*d+Z,dej#d,ej#d-e-fd.d/Z.ej'(d+ ej'(d/ G d0d1 d1ej)Z/G d2d3 d3ej)Z0G d4d5 d5ej)Z1G d6d7 d7ej)Z2d8e3e  d9e4d:e
e d;e-d<ede2fd=d>Z5G d?d@ d@eZ6G dAdB dBeZ7e edCe6j8fdDddEdFd:e
e6 d;e-d<ede2fdGdHZ9e edCe7j8fdDddEdFd:e
e7 d;e-d<ede2fdIdJZ:dS )K    N)Sequence)	dataclass)partial)AnyCallableOptional   )MLPStochasticDepth)VideoClassification)_log_api_usage_once   )register_modelWeightsWeightsEnum)_KINETICS400_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)MViTMViT_V1_B_Weights	mvit_v1_bMViT_V2_S_Weights	mvit_v2_sc                   @   sV   e Zd ZU eed< eed< eed< ee ed< ee ed< ee ed< ee ed< dS )	MSBlockConfig	num_headsinput_channelsoutput_channelskernel_q	kernel_kvstride_q	stride_kvN)__name__
__module____qualname__int__annotations__list r'   r'   c/home/www/facesmatcher.com/frenv_anti/lib/python3.10/site-packages/torchvision/models/video/mvit.pyr      s   
 r   sreturnc                 C   s   d}| D ]}||9 }q|S N   r'   )r)   productvr'   r'   r(   _prod'   s   
r/   x
target_dim
expand_dimc                 C   sF   |   }||d kr| |} | |fS ||krtd| j | |fS )Nr,   zUnsupported input dimension )dim	unsqueeze
ValueErrorshaper0   r1   r2   
tensor_dimr'   r'   r(   
_unsqueeze.   s   
r9   r8   c                 C   s   ||d kr|  |} | S r+   )squeezer7   r'   r'   r(   _squeeze7   s   
r;   c                       s|   e Zd Z		ddejdeej deej deddf
 fdd	Zd
ej	de
eeef de
ej	e
eeef f fddZ  ZS )PoolNFpoolnorm
activationnorm_before_poolr*   c                    sV   t    || _g }|d ur|| |d ur|| |r#tj| nd | _|| _d S )N)super__init__r=   appendnn
Sequentialnorm_actr@   )selfr=   r>   r?   r@   layers	__class__r'   r(   rB   B   s   



zPool.__init__r0   thwc                 C   s   t |dd\}}tj|ddd\}}|dd}|jd d \}}}||| |f|  }| jr<| jd ur<| |}| 	|}|jdd  \}}	}
||||ddd}tj
||fdd}| jsm| jd urm| |}t|dd|}|||	|
ffS )	N   r,   r,   r   )indicesr3   r   r3   )r9   torchZtensor_split	transposer6   reshape
contiguousr@   rF   r=   catr;   )rG   r0   rK   r8   class_tokenBNCTHWr'   r'   r(   forwardS   s   


zPool.forward)NF)r!   r"   r#   rD   Moduler   boolrB   rQ   Tensortupler$   r]   __classcell__r'   r'   rI   r(   r<   A   s    >r<   	embeddingdc                 C   s@   | j d |kr	| S tjj| ddd|dddddS )Nr   r,   Zlinear)sizemode)r6   rD   Z
functionalZinterpolatepermuter4   r:   )rc   rd   r'   r'   r(   _interpolatem   s   rh   attnqq_thwk_thw	rel_pos_h	rel_pos_w	rel_pos_tc           %      C   s6  |\}}}	|\}
}}t dt|| d }t dt|	| d }t dt||
 d }t|| d}t|| d}t|d d d f | t|d d d f d|  |  }t||	 d}t|	| d}t|	d d d f | t|d d d f d|  |  }t|
| d}t||
 d}t|d d d f | t|
d d d f d|
  |  }t||}t||}t||}||  }||  }||  }|j\}}}}|d d d d dd f |||||	|} td| |}!td| |}"| 	dddddd	||| | |	 |} t
| |dddd}#|#||||	||
	dddddd	}#|!d d d d d d d d d d d d d d f |"d d d d d d d d d d d d d d f  |#d d d d d d d d d d d d d d f  |||| |	 |
| | }$| d d d d dd dd f  |$7  < | S )
Nr   r,         ?zbythwc,hkc->bythwkzbythwc,wkc->bythwkr   r   rL      )r$   maxrQ   Zarangerh   longr6   rS   Zeinsumrg   matmulrR   view)%ri   rj   rk   rl   rm   rn   ro   Zq_tZq_hZq_wZk_tZk_hZk_wZdhZdwdtZ	q_h_ratioZ	k_h_ratioZdist_hZ	q_w_ratioZ	k_w_ratioZdist_wZ	q_t_ratioZ	k_t_ratioZdist_tZRhZRwZRtrW   Zn_head_r3   Zr_qZrel_h_qZrel_w_qZrel_q_tZrel_posr'   r'   r(   _add_rel_pos|   sH   


<<<


**$...(rx   shortcutresidual_with_cls_embedc              	   C   sZ   |r	|  | | S | d d d d dd d d f  |d d d d dd d d f 7  < | S r+   )add_)r0   ry   rz   r'   r'   r(   _add_shortcut   s
   
Dr|   c                       s   e Zd Zdejfdee dedededee dee dee d	ee d
edededede	dej
f ddf fddZdejdeeeef deejeeeef f fddZ  ZS )MultiscaleAttention        
input_size	embed_dim
output_dimr   r   r   r   r    residual_poolrz   rel_pos_embeddropout
norm_layer.r*   Nc              
      sp  t    || _|| _|| _|| | _dt| j | _|	| _	|
| _
t|d| | _t||g}|dkr@|tj|dd tj| | _d | _t|dksUt|dkrrdd |D }ttj| j| j|||| jd	d
|| j| _d | _d | _t|dkst|dkrdd |D }ttj| j| j|||| jd	d
|| j| _ttj| j| j|||| jd	d
|| j| _d | _d | _d | _|r6t|dd  }t|dkr||d  n|}t|dkr||d  n|}dt|| d }d|d  d }tt|| j| _tt|| j| _tt|| j| _tj j!| jdd tj j!| jdd tj j!| jdd d S d S )Nrp   r   r~   Tinplacer,   c                 S      g | ]}t |d  qS r   r$   ).0rj   r'   r'   r(   
<listcomp>       z0MultiscaleAttention.__init__.<locals>.<listcomp>F)stridepaddinggroupsbiasc                 S   r   r   r   )r   kvr'   r'   r(   r      r   r   r   {Gz?std)"rA   rB   r   r   r   head_dimmathsqrtscalerr   rz   rD   LinearqkvrC   DropoutrE   projectpool_qr/   r<   Conv3dpool_kpool_vrm   rn   ro   rr   len	ParameterrQ   zerosinittrunc_normal_)rG   r   r   r   r   r   r   r   r    r   rz   r   r   r   rH   Z	padding_qZ
padding_kvre   Zq_sizeZkv_sizeZspatial_dimZtemporal_dimrI   r'   r(   rB      s   

			zMultiscaleAttention.__init__r0   rK   c                 C   s:  |j \}}}| |||d| j| jddjdd\}}}| jd ur-| ||\}}	n|}	| jd ur<| ||d }| j	d urI| 	||\}}t
| j| |dd}
| jd urr| jd urr| jd urrt|
|||	| j| j| j}
|
jdd}
t
|
|}| jrt||| j |dd|d| j}| |}||fS )Nr   r,   r   rP   r   rO   )r6   r   rS   r   r   rR   Zunbindr   r   r   rQ   rt   r   rm   rn   ro   rx   Zsoftmaxr   r|   rz   r   r   )rG   r0   rK   rW   rX   rY   rj   kr.   rl   ri   r'   r'   r(   r]   !  s6   2


	
zMultiscaleAttention.forward)r!   r"   r#   rD   	LayerNormr&   r$   r_   floatr   r^   rB   rQ   r`   ra   r]   rb   r'   r'   rI   r(   r}      sB    	
>\r}   c                       s   e Zd Zddejfdee dededededede	d	e	d
e
dejf ddf fddZdejdeeeef deejeeeef f fddZ  ZS )MultiscaleBlockr~   r   cnfr   rz   r   proj_after_attnr   stochastic_depth_probr   .r*   Nc
                    s  t    || _d | _t|jdkr.dd |jD }
dd |
D }ttj|
|j|dd | _|r3|j	n|j
}|	|j
| _|	|| _t| jtj| _t||j
||j|j|j|j|j|||||	d| _t|d| |j	gtj|d d| _t|d	| _d | _|j
|j	krt|j
|j	| _d S d S )
Nr,   c                 S   s    g | ]}|d kr|d  n|qS rM   r'   )r   r)   r'   r'   r(   r   V  s     z,MultiscaleBlock.__init__.<locals>.<listcomp>c                 S   r   r   r   )r   r   r'   r'   r(   r   W  r   )r   r   )	r   r   r   r    r   r   rz   r   r   rL   )Zactivation_layerr   r   row)rA   rB   r   	pool_skipr/   r   r<   rD   Z	MaxPool3dr   r   norm1norm2
isinstanceZBatchNorm1dneeds_transposalr}   r   r   r   r    ri   r	   ZGELUmlpr
   stochastic_depthr   r   )rG   r   r   r   rz   r   r   r   r   r   Zkernel_skipZpadding_skipZattn_dimrI   r'   r(   rB   E  sP   

zMultiscaleBlock.__init__r0   rK   c           	      C   s   | j r| |ddddn| |}| ||\}}| jd u s%| js'|n| |}| jd u r3|n| ||d }|| | }| j rR| |ddddn| |}| jd u s_| jra|n| |}|| | 	| |fS )Nr,   r   r   )
r   r   rR   ri   r   r   r   r   r   r   )	rG   r0   rK   Zx_norm1Zx_attnZthw_newZx_skipZx_norm2Zx_projr'   r'   r(   r]     s   **zMultiscaleBlock.forward)r!   r"   r#   rD   r   r&   r$   r   r_   r   r   r^   rB   rQ   r`   ra   r]   rb   r'   r'   rI   r(   r   D  s4    		
>:r   c                
       sP   e Zd Zdedeeef dededdf
 fddZd	ejdejfd
dZ	  Z
S )PositionalEncoding
embed_sizespatial_sizetemporal_sizer   r*   Nc                    s   t    || _|| _tt|| _d | _	d | _
d | _|sGtt| jd | jd  || _	tt| j|| _
tt|| _d S d S )Nr   r,   )rA   rB   r   r   rD   r   rQ   r   rV   spatial_postemporal_pos	class_pos)rG   r   r   r   r   rI   r'   r(   rB     s   
$zPositionalEncoding.__init__r0   c                 C   s   | j |ddd}tj||fdd}| jd ur\| jd ur\| jd ur\| jj	\}}tj
| j|dd}|| jd| jddd| tj| jd|fddd}|| |S )Nr   rO   r,   rP   )rV   expandre   r4   rQ   rU   r   r   r   r6   Zrepeat_interleaver{   r   rS   )rG   r0   rV   Zhw_sizer   Zpos_embeddingr'   r'   r(   r]     s   & 
zPositionalEncoding.forward)r!   r"   r#   r$   ra   r_   rB   rQ   r`   r]   rb   r'   r'   rI   r(   r     s    *r   c                $       s   e Zd Z									ddeeef d	ed
ee dedededededededede	e
dejf  de	e
dejf  deeeef deeeef deeeef ddf" fddZdejdejfddZ  ZS ) r         ?r~     Nr      r   r   rL   rL   r,   r   r   r   r   block_settingr   rz   r   r   r   attention_dropoutr   num_classesblock.r   patch_embed_kernelpatch_embed_stridepatch_embed_paddingr*   c                    s  t    t|  t|}|dkrtd|du rt}|du r&ttjdd}tj	d|d j
|||d| _dd	 t|f| | jjD }t|d j
|d
 |d f|d |d| _t | _t|D ]/\}}|
| |d  }| j||||||||	||d	 t|jdkrdd	 t||jD }q`||d j| _ttj|ddt|d j|| _|  D ][}t|tjrtjj|jdd t|tjr|j durtj!|j d qt|tjr|jdurtj!|jd |j durtj!|j d qt|tr
|" D ]
}tjj|dd qqdS )a  
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spacial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The Network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        r   z+The configuration parameter can't be empty.Ngư>)epsr   )Zin_channelsZout_channelsZkernel_sizer   r   c                 S      g | ]\}}|| qS r'   r'   r   re   r   r'   r'   r(   r     r   z!MViT.__init__.<locals>.<listcomp>r,   r   )r   r   r   r   rp   )	r   r   r   rz   r   r   r   r   r   c                 S   r   r'   r'   r   r'   r'   r(   r     r   rO   Tr   r   r   r~   )#rA   rB   r   r   r5   r   r   rD   r   r   r   	conv_projzipr   r   pos_encodingZ
ModuleListblocks	enumeraterC   r   r   r>   rE   r   r   headmodulesr   r   r   weightr   Z	constant_
parameters)rG   r   r   r   r   rz   r   r   r   r   r   r   r   r   r   r   r   Ztotal_stage_blocksr   Zstage_block_idr   Zsd_probmweightsrI   r'   r(   rB     s   
)


zMViT.__init__r0   c                 C   s   t |ddd }| |}|ddd}| |}| jjf| jj }| jD ]	}|||\}}q'| |}|d d df }| 	|}|S )Nrq   r   r   r,   )
r9   r   flattenrR   r   r   r   r   r>   r   )rG   r0   rK   r   r'   r'   r(   r]   "  s   




zMViT.forward)	r   r~   r~   r   NNr   r   r   )r!   r"   r#   ra   r$   r   r   r_   r   r   r   rD   r^   rB   rQ   r`   r]   rb   r'   r'   rI   r(   r     s\    

	
xr   r   r   r   progresskwargsc                 K   s   |d ur1t |dt|jd  |jd d |jd d ksJ t |d|jd  t |d|jd  |dd	}|dd
}td||| |dd|dd|dd|dd|d|}|d urk||j|dd |S )Nr   
categoriesmin_sizer   r,   r   r   min_temporal_size   r      r   Frz   Tr   r   )r   r   r   r   rz   r   r   r   )r   Z
check_hashr'   )r   r   metapopr   Zload_state_dictZget_state_dict)r   r   r   r   r   r   r   modelr'   r'   r(   _mvit9  s,    



	r   c                   @   J   e Zd Zedeedddddddedd	d
ddddiddd	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pthr      ?r   r   ?r   r   Z	crop_sizeZresize_sizemeanr   r   zShttps://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.mdThe weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`ip.Kinetics-400gJ+S@gh|?eW@zacc@1zacc@5guVQ@g rxa@	r   r   r   ZrecipeZ_docsZ
num_paramsZ_metricsZ_ops
_file_sizeurlZ
transformsr   N	r!   r"   r#   r   r   r   r   KINETICS400_V1DEFAULTr'   r'   r'   r(   r   Z  4    r   c                   @   r   )r   z:https://download.pytorch.org/models/mvit_v2_s-ae3be167.pthr   r   r   r   r   r   zChttps://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.mdr   ir   g r0T@g(\W@r   guVP@g?5^I|`@r   r   Nr   r'   r'   r'   r(   r   {  r   r   Z
pretrained)r   T)r   r   c                 K   s  t | } g dg dg dg g dg g dg g g g g g g g g g g dg gg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg g dg g dg g g g g g g g g g g dg gg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgd	}g }tt|d
 D ],}|t|d
 | |d | |d | |d | |d | |d | |d | d	 qtddd|dd|dd| |d|S )a  
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    r,   r   r   rL   rL   rL   rL   rL   rL   rL   rL   rL   rL   rL      r   `      r     r   r   r   r   r   r   r   r   r   r      r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   r   r,   r   r   r,   rL   rL   r,   r,   r,   r   r   r   r   r   r   r    r   r   r   r   r   r   r    r   r   Fr   皙?)r   r   r   r   rz   r   r   r   Nr'   )r   verifyranger   rC   r   r   r   r   r   r   configr   ir'   r'   r(   r     s   
..,







	r   c                 K   sD  t | } g dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgd	}g }tt|d
 D ],}|t|d
 | |d | |d | |d | |d | |d | |d | d	 qtddd|dddd|dd| |d
|S )aC  Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
            :members:
    r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r   r   r   r   r   r   r    r   r   TFr   r  )
r   r   r   r   rz   r   r   r   r   r   Nr'   )r   r  r	  r   rC   r   r   r   r
  r'   r'   r(   r     s   
N







r   );r   collections.abcr   dataclassesr   	functoolsr   typingr   r   r   rQ   Ztorch.fxZtorch.nnrD   opsr	   r
   Ztransforms._presetsr   utilsr   Z_apir   r   r   _metar   Z_utilsr   r   __all__r   r$   r/   r`   ra   r9   r;   Zfxwrapr^   r<   rh   rx   r_   r|   r}   r   r   r   r&   r   r   r   r   r   r   r   r'   r'   r'   r(   <module>   s    	
&"	,
< H 
!!!*`.