import math
from collections import OrderedDict
from functools import partial
from typing import Any, Callable, NamedTuple, Optional

import torch
import torch.nn as nn

from ..ops.misc import Conv2dNormActivation, MLP
from ..transforms._presets import ImageClassification, InterpolationMode
from ..utils import _log_api_usage_once
from ._api import register_model, Weights, WeightsEnum
from ._meta import _IMAGENET_CATEGORIES
from ._utils import _ovewrite_named_param, handle_legacy_interface

__all__ = [
    "VisionTransformer",
    "ViT_B_16_Weights",
    "ViT_B_32_Weights",
    "ViT_L_16_Weights",
    "ViT_L_32_Weights",
    "ViT_H_14_Weights",
    "vit_b_16",
    "vit_b_32",
    "vit_l_16",
    "vit_l_32",
    "vit_h_14",
]


class ConvStemConfig(NamedTuple):
    out_channels: int
    kernel_size: int
    stride: int
    norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d
    activation_layer: Callable[..., nn.Module] = nn.ReLU


class MLPBlock(MLP):
    """Transformer MLP block."""

    _version = 2

    def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
        super().__init__(in_dim, [mlp_dim, in_dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.normal_(m.bias, std=1e-6)

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # Remap the legacy ``linear_{1,2}`` parameter names to the indices used by the MLP base class.
            for i in range(2):
                for type in ["weight", "bias"]:
                    old_key = f"{prefix}linear_{i+1}.{type}"
                    new_key = f"{prefix}{3*i}.{type}"
                    if old_key in state_dict:
                        state_dict[new_key] = state_dict.pop(old_key)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )


class EncoderBlock(nn.Module):
    """Transformer encoder block."""

    def __init__(
        self,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        self.num_heads = num_heads

        # Attention block
        self.ln_1 = norm_layer(hidden_dim)
        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

        # MLP block
        self.ln_2 = norm_layer(hidden_dim)
        self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout)

    def forward(self, input: torch.Tensor):
        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
        x = self.ln_1(input)
        x, _ = self.self_attention(x, x, x, need_weights=False)
        x = self.dropout(x)
        x = x + input

        y = self.ln_2(x)
        y = self.mlp(y)
        return x + y


class Encoder(nn.Module):
    """Transformer Model Encoder for sequence to sequence translation."""

    def __init__(
        self,
        seq_length: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        # The batch dimension comes first because nn.MultiheadAttention is used with batch_first=True.
        self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02))  # from BERT
        self.dropout = nn.Dropout(dropout)
        layers: OrderedDict[str, nn.Module] = OrderedDict()
        for i in range(num_layers):
            layers[f"encoder_layer_{i}"] = EncoderBlock(
                num_heads, hidden_dim, mlp_dim, dropout, attention_dropout, norm_layer
            )
        self.layers = nn.Sequential(layers)
        self.ln = norm_layer(hidden_dim)

    def forward(self, input: torch.Tensor):
        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
        input = input + self.pos_embedding
        return self.ln(self.layers(self.dropout(input)))


class VisionTransformer(nn.Module):
    """Vision Transformer as per https://arxiv.org/abs/2010.11929."""

    def __init__(
        self,
        image_size: int,
        patch_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        num_classes: int = 1000,
        representation_size: Optional[int] = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
        conv_stem_configs: Optional[list[ConvStemConfig]] = None,
    ):
        super().__init__()
        _log_api_usage_once(self)
        torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!")
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_dim = hidden_dim
        self.mlp_dim = mlp_dim
        self.attention_dropout = attention_dropout
        self.dropout = dropout
        self.num_classes = num_classes
        self.representation_size = representation_size
        self.norm_layer = norm_layer

        if conv_stem_configs is not None:
            # As per https://arxiv.org/abs/2106.14881, replace the patchify stem with a small conv stem.
            seq_proj = nn.Sequential()
            prev_channels = 3
            for i, conv_stem_layer_config in enumerate(conv_stem_configs):
                seq_proj.add_module(
                    f"conv_bn_relu_{i}",
                    Conv2dNormActivation(
                        in_channels=prev_channels,
                        out_channels=conv_stem_layer_config.out_channels,
                        kernel_size=conv_stem_layer_config.kernel_size,
                        stride=conv_stem_layer_config.stride,
                        norm_layer=conv_stem_layer_config.norm_layer,
                        activation_layer=conv_stem_layer_config.activation_layer,
                    ),
                )
                prev_channels = conv_stem_layer_config.out_channels
            seq_proj.add_module("conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1))
            self.conv_proj: nn.Module = seq_proj
        else:
            self.conv_proj = nn.Conv2d(in_channels=3, out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size)

        seq_length = (image_size // patch_size) ** 2

        # Add a class token
        self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        seq_length += 1

        self.encoder = Encoder(
            seq_length, num_layers, num_heads, hidden_dim, mlp_dim, dropout, attention_dropout, norm_layer
        )
        self.seq_length = seq_length

        heads_layers: OrderedDict[str, nn.Module] = OrderedDict()
        if representation_size is None:
            heads_layers["head"] = nn.Linear(hidden_dim, num_classes)
        else:
            heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size)
            heads_layers["act"] = nn.Tanh()
            heads_layers["head"] = nn.Linear(representation_size, num_classes)

        self.heads = nn.Sequential(heads_layers)

        if isinstance(self.conv_proj, nn.Conv2d):
            # Init the patchify stem
            fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1]
            nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in))
            if self.conv_proj.bias is not None:
                nn.init.zeros_(self.conv_proj.bias)
        elif self.conv_proj.conv_last is not None and isinstance(self.conv_proj.conv_last, nn.Conv2d):
            # Init the last 1x1 conv of the conv stem
            nn.init.normal_(
                self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels)
            )
            if self.conv_proj.conv_last.bias is not None:
                nn.init.zeros_(self.conv_proj.conv_last.bias)

        if hasattr(self.heads, "pre_logits") and isinstance(self.heads.pre_logits, nn.Linear):
            fan_in = self.heads.pre_logits.in_features
            nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in))
            nn.init.zeros_(self.heads.pre_logits.bias)

        if isinstance(self.heads.head, nn.Linear):
            nn.init.zeros_(self.heads.head.weight)
            nn.init.zeros_(self.heads.head.bias)

    def _process_input(self, x: torch.Tensor) -> torch.Tensor:
        n, c, h, w = x.shape
        p = self.patch_size
        torch._assert(h == self.image_size, f"Wrong image height! Expected {self.image_size} but got {h}!")
        torch._assert(w == self.image_size, f"Wrong image width! Expected {self.image_size} but got {w}!")
        n_h = h // p
        n_w = w // p

        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, n_h * n_w)
        x = self.conv_proj(x)
        x = x.reshape(n, self.hidden_dim, n_h * n_w)

        # The self-attention layer expects inputs in the format (N, S, E),
        # where S is the sequence length and E is the embedding dimension.
        x = x.permute(0, 2, 1)

        return x

    def forward(self, x: torch.Tensor):
        # Reshape and permute the input tensor into a sequence of patch embeddings.
        x = self._process_input(x)
        n = x.shape[0]

        # Expand the class token to the full batch and prepend it to the sequence.
        batch_class_token = self.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)

        x = self.encoder(x)

        # Classifier "token" as used by standard language architectures
        x = x[:, 0]

        x = self.heads(x)

        return x


def _vision_transformer(
    patch_size: int,
    num_layers: int,
    num_heads: int,
    hidden_dim: int,
    mlp_dim: int,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> VisionTransformer:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
        _ovewrite_named_param(kwargs, "image_size", weights.meta["min_size"][0])
    image_size = kwargs.pop("image_size", 224)

    model = VisionTransformer(
        image_size=image_size,
        patch_size=patch_size,
        num_layers=num_layers,
        num_heads=num_heads,
        hidden_dim=hidden_dim,
        mlp_dim=mlp_dim,
        **kwargs,
    )

    if weights:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model


_COMMON_META: dict[str, Any] = {
    "categories": _IMAGENET_CATEGORIES,
}

_COMMON_SWAG_META = {
    **_COMMON_META,
    "recipe": "https://github.com/facebookresearch/SWAG",
    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
}
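

# --- Illustrative usage sketch (not part of the upstream module) ---------------------------------
# A minimal example of driving the VisionTransformer class defined above directly, using ViT-B/16
# hyper-parameters and random stand-in data.  The helper name `_example_plain_vit_forward` is made
# up for this sketch only.
def _example_plain_vit_forward() -> torch.Tensor:
    model = VisionTransformer(
        image_size=224, patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072
    )
    images = torch.rand(2, 3, 224, 224)  # (batch, channels, height, width)
    logits = model(images)  # shape (2, 1000): one row of class scores per image
    return logits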


class ViT_B_16_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16-c867db91.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
            "_docs": """
                These weights were trained from scratch by using a modified version of `DeIT
                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
            """,
        },
    )
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth",
        transforms=partial(ImageClassification, crop_size=384, resize_size=384, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "min_size": (384, 384),
            "_docs": """
                These weights are learnt via transfer learning by end-to-end fine-tuning the original
                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
            """,
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16_lc_swag-4e70ced5.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "min_size": (224, 224),
            "_docs": """
                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_B_32_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_32-d86f8d99.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
            "_docs": """
                These weights were trained from scratch by using a modified version of `DeIT
                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_L_16_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16-852ce7e3.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=242),
        meta={
            **_COMMON_META,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16",
            "_docs": """
                These weights were trained from scratch by using a modified version of TorchVision's
                `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16_swag-4f3808c9.pth",
        transforms=partial(ImageClassification, crop_size=512, resize_size=512, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "min_size": (512, 512),
            "_docs": """
                These weights are learnt via transfer learning by end-to-end fine-tuning the original
                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
            """,
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16_lc_swag-4d563306.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "min_size": (224, 224),
            "_docs": """
                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_L_32_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_32-c7638314.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32",
            "_docs": """
                These weights were trained from scratch by using a modified version of `DeIT
                <https://arxiv.org/abs/2012.12877>`_'s training recipe.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_H_14_Weights(WeightsEnum):
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_h_14_swag-80465313.pth",
        transforms=partial(ImageClassification, crop_size=518, resize_size=518, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "min_size": (518, 518),
            "_docs": """
                These weights are learnt via transfer learning by end-to-end fine-tuning the original
                `SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.
            """,
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "min_size": (224, 224),
            "_docs": """
                These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ trunk
                weights and a linear classifier learnt on top of them trained on ImageNet-1K data.
            """,
        },
    )
    DEFAULT = IMAGENET1K_SWAG_E2E_V1


@register_model()
@handle_legacy_interface(weights=("pretrained", ViT_B_16_Weights.IMAGENET1K_V1))
def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """
    Constructs a vit_b_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_B_16_Weights`, optional): The pretrained
            weights to use. See :class:`~torchvision.models.ViT_B_16_Weights`
            below for more details and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.ViT_B_16_Weights
        :members:
    """
    weights = ViT_B_16_Weights.verify(weights)

    return _vision_transformer(
        patch_size=16,
        num_layers=12,
        num_heads=12,
        hidden_dim=768,
        mlp_dim=3072,
        weights=weights,
        progress=progress,
        **kwargs,
    )
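

# --- Illustrative usage sketch (not part of the upstream module) ---------------------------------
# Typical pretrained-inference flow for the builder above: the weight enum supplies both the state
# dict and the matching preprocessing transform.  `_example_vit_b_16_inference` is a made-up name,
# and the random tensor stands in for a real image.
def _example_vit_b_16_inference() -> torch.Tensor:
    weights = ViT_B_16_Weights.IMAGENET1K_V1
    model = vit_b_16(weights=weights)
    model.eval()
    preprocess = weights.transforms()  # resize / center-crop / normalize, as declared in the enum
    batch = preprocess(torch.rand(3, 500, 400)).unsqueeze(0)
    with torch.inference_mode():
        logits = model(batch)
    return logits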


@register_model()
@handle_legacy_interface(weights=("pretrained", ViT_B_32_Weights.IMAGENET1K_V1))
def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """
    Constructs a vit_b_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_B_32_Weights`, optional): The pretrained
            weights to use. See :class:`~torchvision.models.ViT_B_32_Weights`
            below for more details and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.ViT_B_32_Weights
        :members:
    """
    weights = ViT_B_32_Weights.verify(weights)

    return _vision_transformer(
        patch_size=32,
        num_layers=12,
        num_heads=12,
        hidden_dim=768,
        mlp_dim=3072,
        weights=weights,
        progress=progress,
        **kwargs,
    )
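

# --- Illustrative usage sketch (not part of the upstream module) ---------------------------------
# The extra keyword arguments accepted by the builders are forwarded to VisionTransformer, so an
# untrained backbone can be reshaped for a smaller task.  The name `_example_vit_b_32_for_cifar`
# and the specific sizes below are arbitrary choices for this sketch.
def _example_vit_b_32_for_cifar() -> VisionTransformer:
    # With weights=None nothing is downloaded, and image_size may differ from the 224 default
    # as long as it stays divisible by the patch size.
    return vit_b_32(weights=None, num_classes=10, image_size=64, dropout=0.1)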


@register_model()
@handle_legacy_interface(weights=("pretrained", ViT_L_16_Weights.IMAGENET1K_V1))
def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """
    Constructs a vit_l_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_L_16_Weights`, optional): The pretrained
            weights to use. See :class:`~torchvision.models.ViT_L_16_Weights`
            below for more details and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.ViT_L_16_Weights
        :members:
    """
    weights = ViT_L_16_Weights.verify(weights)

    return _vision_transformer(
        patch_size=16,
        num_layers=24,
        num_heads=16,
        hidden_dim=1024,
        mlp_dim=4096,
        weights=weights,
        progress=progress,
        **kwargs,
    )
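

# --- Illustrative usage sketch (not part of the upstream module) ---------------------------------
# The SWAG end-to-end weights were fine-tuned at a higher resolution, so _vision_transformer reads
# the expected image size from weights.meta["min_size"] and builds a larger model to match.
# `_example_vit_l_16_swag` is a made-up name for this sketch.
def _example_vit_l_16_swag() -> VisionTransformer:
    weights = ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1
    model = vit_l_16(weights=weights)  # model.image_size == 512 here, not the usual 224
    return model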


@register_model()
@handle_legacy_interface(weights=("pretrained", ViT_L_32_Weights.IMAGENET1K_V1))
def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """
    Constructs a vit_l_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_L_32_Weights`, optional): The pretrained
            weights to use. See :class:`~torchvision.models.ViT_L_32_Weights`
            below for more details and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.ViT_L_32_Weights
        :members:
    """
    weights = ViT_L_32_Weights.verify(weights)

    return _vision_transformer(
        patch_size=32,
        num_layers=24,
        num_heads=16,
        hidden_dim=1024,
        mlp_dim=4096,
        weights=weights,
        progress=progress,
        **kwargs,
    )


@register_model()
@handle_legacy_interface(weights=("pretrained", None))
def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """
    Constructs a vit_h_14 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_H_14_Weights`, optional): The pretrained
            weights to use. See :class:`~torchvision.models.ViT_H_14_Weights`
            below for more details and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.ViT_H_14_Weights
        :members:
    """
    weights = ViT_H_14_Weights.verify(weights)

    return _vision_transformer(
        patch_size=14,
        num_layers=32,
        num_heads=16,
        hidden_dim=1280,
        mlp_dim=5120,
        weights=weights,
        progress=progress,
        **kwargs,
    )


def interpolate_embeddings(
    image_size: int,
    patch_size: int,
    model_state: "OrderedDict[str, torch.Tensor]",
    interpolation_mode: str = "bicubic",
    reset_heads: bool = False,
) -> "OrderedDict[str, torch.Tensor]":
    """This function helps interpolate positional embeddings during checkpoint loading,
    especially when you want to apply a pre-trained model on images with different resolution.

    Args:
        image_size (int): Image size of the new model.
        patch_size (int): Patch size of the new model.
        model_state (OrderedDict[str, torch.Tensor]): State dict of the pre-trained model.
        interpolation_mode (str): The algorithm used for upsampling. Default: bicubic.
        reset_heads (bool): If true, not copying the state of heads. Default: False.

    Returns:
        OrderedDict[str, torch.Tensor]: A state dict which can be loaded into the new model.
    """
    # Shape of pos_embedding is (1, seq_length, hidden_dim)
    pos_embedding = model_state["encoder.pos_embedding"]
    n, seq_length, hidden_dim = pos_embedding.shape
    if n != 1:
        raise ValueError(f"Unexpected position embedding shape: {pos_embedding.shape}")

    new_seq_length = (image_size // patch_size) ** 2 + 1

    # Need to interpolate the weights for the position embedding.
    # We do this by reshaping the positions embeddings to a 2d grid, performing
    # an interpolation in the (h, w) space and then reshaping back to a 1d grid.
    if new_seq_length != seq_length:
        # The class token embedding shouldn't be interpolated, so we split it up.
        seq_length -= 1
        new_seq_length -= 1
        pos_embedding_token = pos_embedding[:, :1, :]
        pos_embedding_img = pos_embedding[:, 1:, :]

        # (1, seq_length, hidden_dim) -> (1, hidden_dim, seq_length)
        pos_embedding_img = pos_embedding_img.permute(0, 2, 1)
        seq_length_1d = int(math.sqrt(seq_length))
        if seq_length_1d * seq_length_1d != seq_length:
            raise ValueError(
                f"seq_length is not a perfect square! Instead got seq_length_1d * seq_length_1d = "
                f"{seq_length_1d * seq_length_1d} and seq_length = {seq_length}"
            )

        # (1, hidden_dim, seq_length) -> (1, hidden_dim, seq_length_1d, seq_length_1d)
        pos_embedding_img = pos_embedding_img.reshape(1, hidden_dim, seq_length_1d, seq_length_1d)
        new_seq_length_1d = image_size // patch_size

        # Perform interpolation on the 2d grid of positional embeddings.
        new_pos_embedding_img = nn.functional.interpolate(
            pos_embedding_img,
            size=new_seq_length_1d,
            mode=interpolation_mode,
            align_corners=True,
        )

        # (1, hidden_dim, new_seq_length_1d, new_seq_length_1d) -> (1, new_seq_length, hidden_dim)
        new_pos_embedding_img = new_pos_embedding_img.reshape(1, hidden_dim, new_seq_length)
        new_pos_embedding_img = new_pos_embedding_img.permute(0, 2, 1)
        new_pos_embedding = torch.cat([pos_embedding_token, new_pos_embedding_img], dim=1)

        model_state["encoder.pos_embedding"] = new_pos_embedding

        if reset_heads:
            model_state_copy: "OrderedDict[str, torch.Tensor]" = OrderedDict()
            for k, v in model_state.items():
                if not k.startswith("heads"):
                    model_state_copy[k] = v
            model_state = model_state_copy

    return model_state