init
This commit is contained in:
14
lib/models/backbones/__init__.py
Normal file
14
lib/models/backbones/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from .swin_v2 import SwinTransformerV2MSL
|
||||
from .vit import VisionTransformerMSL
|
||||
|
||||
__all__ = [
|
||||
'SwinTransformerV2MSL', 'VisionTransformerMSL'
|
||||
]
|
||||
|
||||
type_mapping = {
|
||||
'SwinTransformerV2MSL': SwinTransformerV2MSL,
|
||||
'VisionTransformerMSL': VisionTransformerMSL
|
||||
}
|
||||
|
||||
def build_backbone(type, **kwargs):
|
||||
return type_mapping[type](**kwargs)
|
||||
702
lib/models/backbones/swin_v2.py
Normal file
702
lib/models/backbones/swin_v2.py
Normal file
@@ -0,0 +1,702 @@
|
||||
from copy import deepcopy
|
||||
from typing import Sequence
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint as cp
|
||||
from mmcv.cnn import build_norm_layer
|
||||
from mmcv.cnn.bricks.transformer import FFN, PatchEmbed
|
||||
from mmcv.cnn.utils.weight_init import trunc_normal_
|
||||
from mmcv.runner.base_module import BaseModule, ModuleList
|
||||
from mmcv.utils.parrots_wrapper import _BatchNorm
|
||||
|
||||
from mmcls.models.utils import (PatchMerging, ShiftWindowMSA, WindowMSAV2,
|
||||
resize_pos_embed, to_2tuple)
|
||||
from mmcls.models.backbones.base_backbone import BaseBackbone
|
||||
from mmcv.runner import (CheckpointLoader,
|
||||
load_state_dict)
|
||||
from mmcv.cnn.bricks.transformer import MultiheadAttention
|
||||
|
||||
class SwinBlockV2(BaseModule):
|
||||
"""Swin Transformer V2 block. Use post normalization.
|
||||
|
||||
Args:
|
||||
embed_dims (int): Number of input channels.
|
||||
num_heads (int): Number of attention heads.
|
||||
window_size (int): The height and width of the window. Defaults to 7.
|
||||
shift (bool): Shift the attention window or not. Defaults to False.
|
||||
extra_norm (bool): Whether add extra norm at the end of main branch.
|
||||
ffn_ratio (float): The expansion ratio of feedforward network hidden
|
||||
layer channels. Defaults to 4.
|
||||
drop_path (float): The drop path rate after attention and ffn.
|
||||
Defaults to 0.
|
||||
pad_small_map (bool): If True, pad the small feature map to the window
|
||||
size, which is common used in detection and segmentation. If False,
|
||||
avoid shifting window and shrink the window size to the size of
|
||||
feature map, which is common used in classification.
|
||||
Defaults to False.
|
||||
attn_cfgs (dict): The extra config of Shift Window-MSA.
|
||||
Defaults to empty dict.
|
||||
ffn_cfgs (dict): The extra config of FFN. Defaults to empty dict.
|
||||
norm_cfg (dict): The config of norm layers.
|
||||
Defaults to ``dict(type='LN')``.
|
||||
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
|
||||
memory while slowing down the training speed. Defaults to False.
|
||||
pretrained_window_size (int): Window size in pretrained.
|
||||
init_cfg (dict, optional): The extra config for initialization.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embed_dims,
|
||||
num_heads,
|
||||
window_size=8,
|
||||
shift=False,
|
||||
extra_norm=False,
|
||||
ffn_ratio=4.,
|
||||
drop_path=0.,
|
||||
pad_small_map=False,
|
||||
attn_cfgs=dict(),
|
||||
ffn_cfgs=dict(),
|
||||
norm_cfg=dict(type='LN'),
|
||||
with_cp=False,
|
||||
pretrained_window_size=0,
|
||||
init_cfg=None):
|
||||
|
||||
super(SwinBlockV2, self).__init__(init_cfg)
|
||||
self.with_cp = with_cp
|
||||
self.extra_norm = extra_norm
|
||||
|
||||
_attn_cfgs = {
|
||||
'embed_dims': embed_dims,
|
||||
'num_heads': num_heads,
|
||||
'shift_size': window_size // 2 if shift else 0,
|
||||
'window_size': window_size,
|
||||
'dropout_layer': dict(type='DropPath', drop_prob=drop_path),
|
||||
'pad_small_map': pad_small_map,
|
||||
**attn_cfgs
|
||||
}
|
||||
# use V2 attention implementation
|
||||
_attn_cfgs.update(
|
||||
window_msa=WindowMSAV2,
|
||||
msa_cfg=dict(
|
||||
pretrained_window_size=to_2tuple(pretrained_window_size)))
|
||||
self.attn = ShiftWindowMSA(**_attn_cfgs)
|
||||
self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
|
||||
|
||||
_ffn_cfgs = {
|
||||
'embed_dims': embed_dims,
|
||||
'feedforward_channels': int(embed_dims * ffn_ratio),
|
||||
'num_fcs': 2,
|
||||
'ffn_drop': 0,
|
||||
'dropout_layer': dict(type='DropPath', drop_prob=drop_path),
|
||||
'act_cfg': dict(type='GELU'),
|
||||
'add_identity': False,
|
||||
**ffn_cfgs
|
||||
}
|
||||
self.ffn = FFN(**_ffn_cfgs)
|
||||
self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
|
||||
|
||||
# add extra norm for every n blocks in huge and giant model
|
||||
if self.extra_norm:
|
||||
self.norm3 = build_norm_layer(norm_cfg, embed_dims)[1]
|
||||
|
||||
def forward(self, x, hw_shape):
|
||||
|
||||
def _inner_forward(x):
|
||||
# Use post normalization
|
||||
identity = x
|
||||
x = self.attn(x, hw_shape)
|
||||
x = self.norm1(x)
|
||||
x = x + identity
|
||||
|
||||
identity = x
|
||||
x = self.ffn(x)
|
||||
x = self.norm2(x)
|
||||
x = x + identity
|
||||
|
||||
if self.extra_norm:
|
||||
x = self.norm3(x)
|
||||
|
||||
return x
|
||||
|
||||
if self.with_cp and x.requires_grad:
|
||||
x = cp.checkpoint(_inner_forward, x)
|
||||
else:
|
||||
x = _inner_forward(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class SwinBlockV2Sequence(BaseModule):
|
||||
"""Module with successive Swin Transformer blocks and downsample layer.
|
||||
|
||||
Args:
|
||||
embed_dims (int): Number of input channels.
|
||||
depth (int): Number of successive swin transformer blocks.
|
||||
num_heads (int): Number of attention heads.
|
||||
window_size (int): The height and width of the window. Defaults to 7.
|
||||
downsample (bool): Downsample the output of blocks by patch merging.
|
||||
Defaults to False.
|
||||
downsample_cfg (dict): The extra config of the patch merging layer.
|
||||
Defaults to empty dict.
|
||||
drop_paths (Sequence[float] | float): The drop path rate in each block.
|
||||
Defaults to 0.
|
||||
block_cfgs (Sequence[dict] | dict): The extra config of each block.
|
||||
Defaults to empty dicts.
|
||||
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
|
||||
memory while slowing down the training speed. Defaults to False.
|
||||
pad_small_map (bool): If True, pad the small feature map to the window
|
||||
size, which is common used in detection and segmentation. If False,
|
||||
avoid shifting window and shrink the window size to the size of
|
||||
feature map, which is common used in classification.
|
||||
Defaults to False.
|
||||
extra_norm_every_n_blocks (int): Add extra norm at the end of main
|
||||
branch every n blocks. Defaults to 0, which means no needs for
|
||||
extra norm layer.
|
||||
pretrained_window_size (int): Window size in pretrained.
|
||||
init_cfg (dict, optional): The extra config for initialization.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embed_dims,
|
||||
depth,
|
||||
num_heads,
|
||||
window_size=8,
|
||||
downsample=False,
|
||||
downsample_cfg=dict(),
|
||||
drop_paths=0.,
|
||||
block_cfgs=dict(),
|
||||
with_cp=False,
|
||||
pad_small_map=False,
|
||||
extra_norm_every_n_blocks=0,
|
||||
pretrained_window_size=0,
|
||||
init_cfg=None):
|
||||
super().__init__(init_cfg)
|
||||
|
||||
if not isinstance(drop_paths, Sequence):
|
||||
drop_paths = [drop_paths] * depth
|
||||
|
||||
if not isinstance(block_cfgs, Sequence):
|
||||
block_cfgs = [deepcopy(block_cfgs) for _ in range(depth)]
|
||||
|
||||
if downsample:
|
||||
self.out_channels = 2 * embed_dims
|
||||
_downsample_cfg = {
|
||||
'in_channels': embed_dims,
|
||||
'out_channels': self.out_channels,
|
||||
'norm_cfg': dict(type='LN'),
|
||||
**downsample_cfg
|
||||
}
|
||||
self.downsample = PatchMerging(**_downsample_cfg)
|
||||
else:
|
||||
self.out_channels = embed_dims
|
||||
self.downsample = None
|
||||
|
||||
self.blocks = ModuleList()
|
||||
for i in range(depth):
|
||||
extra_norm = True if extra_norm_every_n_blocks and \
|
||||
(i + 1) % extra_norm_every_n_blocks == 0 else False
|
||||
_block_cfg = {
|
||||
'embed_dims': self.out_channels,
|
||||
'num_heads': num_heads,
|
||||
'window_size': window_size,
|
||||
'shift': False if i % 2 == 0 else True,
|
||||
'extra_norm': extra_norm,
|
||||
'drop_path': drop_paths[i],
|
||||
'with_cp': with_cp,
|
||||
'pad_small_map': pad_small_map,
|
||||
'pretrained_window_size': pretrained_window_size,
|
||||
**block_cfgs[i]
|
||||
}
|
||||
block = SwinBlockV2(**_block_cfg)
|
||||
self.blocks.append(block)
|
||||
|
||||
def forward(self, x, in_shape):
|
||||
if self.downsample:
|
||||
x, out_shape = self.downsample(x, in_shape)
|
||||
else:
|
||||
out_shape = in_shape
|
||||
|
||||
for block in self.blocks:
|
||||
x = block(x, out_shape)
|
||||
|
||||
return x, out_shape
|
||||
|
||||
|
||||
class SwinTransformerV2(BaseBackbone):
|
||||
"""Swin Transformer V2.
|
||||
|
||||
A PyTorch implement of : `Swin Transformer V2:
|
||||
Scaling Up Capacity and Resolution
|
||||
<https://arxiv.org/abs/2111.09883>`_
|
||||
|
||||
Inspiration from
|
||||
https://github.com/microsoft/Swin-Transformer
|
||||
|
||||
Args:
|
||||
arch (str | dict): Swin Transformer architecture. If use string, choose
|
||||
from 'tiny', 'small', 'base' and 'large'. If use dict, it should
|
||||
have below keys:
|
||||
|
||||
- **embed_dims** (int): The dimensions of embedding.
|
||||
- **depths** (List[int]): The number of blocks in each stage.
|
||||
- **num_heads** (List[int]): The number of heads in attention
|
||||
modules of each stage.
|
||||
- **extra_norm_every_n_blocks** (int): Add extra norm at the end
|
||||
of main branch every n blocks.
|
||||
|
||||
Defaults to 'tiny'.
|
||||
img_size (int | tuple): The expected input image shape. Because we
|
||||
support dynamic input shape, just set the argument to the most
|
||||
common input image shape. Defaults to 224.
|
||||
patch_size (int | tuple): The patch size in patch embedding.
|
||||
Defaults to 4.
|
||||
in_channels (int): The num of input channels. Defaults to 3.
|
||||
window_size (int | Sequence): The height and width of the window.
|
||||
Defaults to 7.
|
||||
drop_rate (float): Dropout rate after embedding. Defaults to 0.
|
||||
drop_path_rate (float): Stochastic depth rate. Defaults to 0.1.
|
||||
use_abs_pos_embed (bool): If True, add absolute position embedding to
|
||||
the patch embedding. Defaults to False.
|
||||
interpolate_mode (str): Select the interpolate mode for absolute
|
||||
position embeding vector resize. Defaults to "bicubic".
|
||||
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
|
||||
memory while slowing down the training speed. Defaults to False.
|
||||
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
|
||||
-1 means not freezing any parameters. Defaults to -1.
|
||||
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
||||
freeze running stats (mean and var). Note: Effect on Batch Norm
|
||||
and its variants only. Defaults to False.
|
||||
pad_small_map (bool): If True, pad the small feature map to the window
|
||||
size, which is common used in detection and segmentation. If False,
|
||||
avoid shifting window and shrink the window size to the size of
|
||||
feature map, which is common used in classification.
|
||||
Defaults to False.
|
||||
norm_cfg (dict): Config dict for normalization layer for all output
|
||||
features. Defaults to ``dict(type='LN')``
|
||||
stage_cfgs (Sequence[dict] | dict): Extra config dict for each
|
||||
stage. Defaults to an empty dict.
|
||||
patch_cfg (dict): Extra config dict for patch embedding.
|
||||
Defaults to an empty dict.
|
||||
pretrained_window_sizes (tuple(int)): Pretrained window sizes of
|
||||
each layer.
|
||||
init_cfg (dict, optional): The Config for initialization.
|
||||
Defaults to None.
|
||||
|
||||
Examples:
|
||||
>>> from mmcls.models import SwinTransformerV2
|
||||
>>> import torch
|
||||
>>> extra_config = dict(
|
||||
>>> arch='tiny',
|
||||
>>> stage_cfgs=dict(downsample_cfg={'kernel_size': 3,
|
||||
>>> 'padding': 'same'}))
|
||||
>>> self = SwinTransformerV2(**extra_config)
|
||||
>>> inputs = torch.rand(1, 3, 224, 224)
|
||||
>>> output = self.forward(inputs)
|
||||
>>> print(output.shape)
|
||||
(1, 2592, 4)
|
||||
"""
|
||||
arch_zoo = {
|
||||
**dict.fromkeys(['t', 'tiny'],
|
||||
{'embed_dims': 96,
|
||||
'depths': [2, 2, 6, 2],
|
||||
'num_heads': [3, 6, 12, 24],
|
||||
'extra_norm_every_n_blocks': 0}),
|
||||
**dict.fromkeys(['s', 'small'],
|
||||
{'embed_dims': 96,
|
||||
'depths': [2, 2, 18, 2],
|
||||
'num_heads': [3, 6, 12, 24],
|
||||
'extra_norm_every_n_blocks': 0}),
|
||||
**dict.fromkeys(['b', 'base'],
|
||||
{'embed_dims': 128,
|
||||
'depths': [2, 2, 18, 2],
|
||||
'num_heads': [4, 8, 16, 32],
|
||||
'extra_norm_every_n_blocks': 0}),
|
||||
**dict.fromkeys(['l', 'large'],
|
||||
{'embed_dims': 192,
|
||||
'depths': [2, 2, 18, 2],
|
||||
'num_heads': [6, 12, 24, 48],
|
||||
'extra_norm_every_n_blocks': 0}),
|
||||
# head count not certain for huge, and is employed for another
|
||||
# parallel study about self-supervised learning.
|
||||
**dict.fromkeys(['h', 'huge'],
|
||||
{'embed_dims': 352,
|
||||
'depths': [2, 2, 18, 2],
|
||||
'num_heads': [8, 16, 32, 64],
|
||||
'extra_norm_every_n_blocks': 6}),
|
||||
**dict.fromkeys(['g', 'giant'],
|
||||
{'embed_dims': 512,
|
||||
'depths': [2, 2, 42, 4],
|
||||
'num_heads': [16, 32, 64, 128],
|
||||
'extra_norm_every_n_blocks': 6}),
|
||||
} # yapf: disable
|
||||
|
||||
_version = 1
|
||||
num_extra_tokens = 0
|
||||
|
||||
def __init__(self,
|
||||
arch='tiny',
|
||||
img_size=256,
|
||||
patch_size=4,
|
||||
in_channels=3,
|
||||
vocabulary_size=128,
|
||||
window_size=8,
|
||||
drop_rate=0.,
|
||||
drop_path_rate=0.1,
|
||||
out_indices=(3, ),
|
||||
use_abs_pos_embed=False,
|
||||
interpolate_mode='bicubic',
|
||||
with_cp=False,
|
||||
frozen_stages=-1,
|
||||
norm_eval=False,
|
||||
pad_small_map=False,
|
||||
norm_cfg=dict(type='LN'),
|
||||
stage_cfgs=dict(downsample_cfg=dict(is_post_norm=True)),
|
||||
patch_cfg=dict(),
|
||||
pretrained_window_sizes=[0, 0, 0, 0],
|
||||
init_cfg=None):
|
||||
super(SwinTransformerV2, self).__init__(init_cfg=init_cfg)
|
||||
|
||||
if isinstance(arch, str):
|
||||
arch = arch.lower()
|
||||
assert arch in set(self.arch_zoo), \
|
||||
f'Arch {arch} is not in default archs {set(self.arch_zoo)}'
|
||||
self.arch_settings = self.arch_zoo[arch]
|
||||
else:
|
||||
essential_keys = {
|
||||
'embed_dims', 'depths', 'num_heads',
|
||||
'extra_norm_every_n_blocks'
|
||||
}
|
||||
assert isinstance(arch, dict) and set(arch) == essential_keys, \
|
||||
f'Custom arch needs a dict with keys {essential_keys}'
|
||||
self.arch_settings = arch
|
||||
|
||||
self.vocabulary_size = vocabulary_size + 1 # 增加ignore类别
|
||||
self.embed_dims = self.arch_settings['embed_dims']
|
||||
self.depths = self.arch_settings['depths']
|
||||
self.num_heads = self.arch_settings['num_heads']
|
||||
self.extra_norm_every_n_blocks = self.arch_settings[
|
||||
'extra_norm_every_n_blocks']
|
||||
self.num_layers = len(self.depths)
|
||||
self.out_indices = out_indices
|
||||
self.use_abs_pos_embed = use_abs_pos_embed
|
||||
self.interpolate_mode = interpolate_mode
|
||||
self.frozen_stages = frozen_stages
|
||||
|
||||
if isinstance(window_size, int):
|
||||
self.window_sizes = [window_size for _ in range(self.num_layers)]
|
||||
elif isinstance(window_size, Sequence):
|
||||
assert len(window_size) == self.num_layers, \
|
||||
f'Length of window_sizes {len(window_size)} is not equal to '\
|
||||
f'length of stages {self.num_layers}.'
|
||||
self.window_sizes = window_size
|
||||
else:
|
||||
raise TypeError('window_size should be a Sequence or int.')
|
||||
|
||||
_patch_cfg = dict(
|
||||
in_channels=in_channels,
|
||||
input_size=img_size,
|
||||
embed_dims=self.embed_dims,
|
||||
conv_type='Conv2d',
|
||||
kernel_size=patch_size,
|
||||
stride=patch_size,
|
||||
norm_cfg=dict(type='LN'),
|
||||
)
|
||||
_patch_cfg.update(patch_cfg)
|
||||
self.patch_embed = PatchEmbed(**_patch_cfg)
|
||||
self.patch_resolution = self.patch_embed.init_out_size
|
||||
self.patch_size = patch_size
|
||||
|
||||
if self.use_abs_pos_embed:
|
||||
num_patches = self.patch_resolution[0] * self.patch_resolution[1]
|
||||
self.absolute_pos_embed = nn.Parameter(
|
||||
torch.zeros(1, num_patches, self.embed_dims))
|
||||
self._register_load_state_dict_pre_hook(
|
||||
self._prepare_abs_pos_embed)
|
||||
|
||||
self._register_load_state_dict_pre_hook(self._delete_reinit_params)
|
||||
|
||||
self.drop_after_pos = nn.Dropout(p=drop_rate)
|
||||
self.norm_eval = norm_eval
|
||||
|
||||
# stochastic depth
|
||||
total_depth = sum(self.depths)
|
||||
dpr = [
|
||||
x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
|
||||
] # stochastic depth decay rule
|
||||
|
||||
self.stages = ModuleList()
|
||||
embed_dims = [self.embed_dims]
|
||||
for i, (depth, num_heads) in enumerate(zip(self.depths,
|
||||
self.num_heads)):
|
||||
if isinstance(stage_cfgs, Sequence):
|
||||
stage_cfg = stage_cfgs[i]
|
||||
else:
|
||||
stage_cfg = deepcopy(stage_cfgs)
|
||||
downsample = True if i > 0 else False
|
||||
_stage_cfg = {
|
||||
'embed_dims': embed_dims[-1],
|
||||
'depth': depth,
|
||||
'num_heads': num_heads,
|
||||
'window_size': self.window_sizes[i],
|
||||
'downsample': downsample,
|
||||
'drop_paths': dpr[:depth],
|
||||
'with_cp': with_cp,
|
||||
'pad_small_map': pad_small_map,
|
||||
'extra_norm_every_n_blocks': self.extra_norm_every_n_blocks,
|
||||
'pretrained_window_size': pretrained_window_sizes[i],
|
||||
**stage_cfg
|
||||
}
|
||||
|
||||
stage = SwinBlockV2Sequence(**_stage_cfg)
|
||||
self.stages.append(stage)
|
||||
|
||||
dpr = dpr[depth:]
|
||||
embed_dims.append(stage.out_channels)
|
||||
|
||||
for i in out_indices:
|
||||
if norm_cfg is not None:
|
||||
norm_layer = build_norm_layer(norm_cfg, embed_dims[i + 1])[1]
|
||||
else:
|
||||
norm_layer = nn.Identity()
|
||||
|
||||
self.add_module(f'norm{i}', norm_layer)
|
||||
|
||||
def init_weights(self):
|
||||
if (isinstance(self.init_cfg, dict)
|
||||
and self.init_cfg['type'] == 'Pretrained'):
|
||||
# Suppress default init if use pretrained model.
|
||||
from mmcls.utils import get_root_logger
|
||||
logger = get_root_logger()
|
||||
checkpoint = CheckpointLoader.load_checkpoint(
|
||||
self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
|
||||
|
||||
if 'state_dict' in checkpoint:
|
||||
state_dict = checkpoint['state_dict']
|
||||
else:
|
||||
state_dict = checkpoint
|
||||
# print(self.state_dict().keys())
|
||||
# print('---')
|
||||
# print(state_dict.keys())
|
||||
# import pdb; pdb.set_trace()
|
||||
load_state_dict(self, state_dict, strict=False, logger=logger)
|
||||
return
|
||||
else:
|
||||
super(SwinTransformerV2, self).init_weights()
|
||||
if self.use_abs_pos_embed:
|
||||
trunc_normal_(self.absolute_pos_embed, std=0.02)
|
||||
|
||||
def forward(self, x):
|
||||
x, hw_shape = self.patch_embed(x)
|
||||
|
||||
if self.use_abs_pos_embed:
|
||||
x = x + resize_pos_embed(
|
||||
self.absolute_pos_embed, self.patch_resolution, hw_shape,
|
||||
self.interpolate_mode, self.num_extra_tokens)
|
||||
x = self.drop_after_pos(x)
|
||||
|
||||
outs = []
|
||||
for i, stage in enumerate(self.stages):
|
||||
x, hw_shape = stage(x, hw_shape)
|
||||
if i in self.out_indices:
|
||||
norm_layer = getattr(self, f'norm{i}')
|
||||
out = norm_layer(x)
|
||||
out = out.view(-1, *hw_shape,
|
||||
stage.out_channels).permute(0, 3, 1,
|
||||
2).contiguous()
|
||||
outs.append(out)
|
||||
|
||||
return outs
|
||||
|
||||
def _freeze_stages(self):
|
||||
if self.frozen_stages >= 0:
|
||||
self.patch_embed.eval()
|
||||
for param in self.patch_embed.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
for i in range(0, self.frozen_stages + 1):
|
||||
m = self.stages[i]
|
||||
m.eval()
|
||||
for param in m.parameters():
|
||||
param.requires_grad = False
|
||||
for i in self.out_indices:
|
||||
if i <= self.frozen_stages:
|
||||
for param in getattr(self, f'norm{i}').parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def train(self, mode=True):
|
||||
super(SwinTransformerV2, self).train(mode)
|
||||
self._freeze_stages()
|
||||
if mode and self.norm_eval:
|
||||
for m in self.modules():
|
||||
# trick: eval have effect on BatchNorm only
|
||||
if isinstance(m, _BatchNorm):
|
||||
m.eval()
|
||||
|
||||
def _prepare_abs_pos_embed(self, state_dict, prefix, *args, **kwargs):
|
||||
name = prefix + 'absolute_pos_embed'
|
||||
if name not in state_dict.keys():
|
||||
return
|
||||
|
||||
ckpt_pos_embed_shape = state_dict[name].shape
|
||||
if self.absolute_pos_embed.shape != ckpt_pos_embed_shape:
|
||||
from mmcls.utils import get_root_logger
|
||||
logger = get_root_logger()
|
||||
logger.info(
|
||||
'Resize the absolute_pos_embed shape from '
|
||||
f'{ckpt_pos_embed_shape} to {self.absolute_pos_embed.shape}.')
|
||||
|
||||
ckpt_pos_embed_shape = to_2tuple(
|
||||
int(np.sqrt(ckpt_pos_embed_shape[1] - self.num_extra_tokens)))
|
||||
pos_embed_shape = self.patch_embed.init_out_size
|
||||
|
||||
state_dict[name] = resize_pos_embed(state_dict[name],
|
||||
ckpt_pos_embed_shape,
|
||||
pos_embed_shape,
|
||||
self.interpolate_mode,
|
||||
self.num_extra_tokens)
|
||||
|
||||
def _delete_reinit_params(self, state_dict, prefix, *args, **kwargs):
|
||||
# delete relative_position_index since we always re-init it
|
||||
relative_position_index_keys = [
|
||||
k for k in state_dict.keys() if 'relative_position_index' in k
|
||||
]
|
||||
for k in relative_position_index_keys:
|
||||
del state_dict[k]
|
||||
|
||||
# delete relative_coords_table since we always re-init it
|
||||
relative_position_index_keys = [
|
||||
k for k in state_dict.keys() if 'relative_coords_table' in k
|
||||
]
|
||||
for k in relative_position_index_keys:
|
||||
del state_dict[k]
|
||||
|
||||
class Proj_MHSA(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dims,
|
||||
proj_dims,
|
||||
num_heads=16,
|
||||
batch_first=True,
|
||||
bias = True
|
||||
):
|
||||
super().__init__()
|
||||
self.proj_in = nn.Linear(in_features=embed_dims, out_features=proj_dims)
|
||||
self.attn = MultiheadAttention(
|
||||
embed_dims=proj_dims,
|
||||
num_heads=num_heads,
|
||||
batch_first=batch_first,
|
||||
bias=bias
|
||||
)
|
||||
self.proj_out = nn.Linear(in_features=proj_dims, out_features=embed_dims)
|
||||
def forward(self, x):
|
||||
x = self.proj_in(x)
|
||||
x = self.attn(x, x, x)
|
||||
x = self.proj_out(x)
|
||||
return x
|
||||
|
||||
class SwinTransformerV2MSL(SwinTransformerV2):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if 'use_attn' in kwargs:
|
||||
self.use_attn = kwargs.pop('use_attn')
|
||||
else:
|
||||
self.use_attn = False
|
||||
if 'merge_stage' in kwargs:
|
||||
self.merge_stage = kwargs.pop('merge_stage')
|
||||
else:
|
||||
self.merge_stage = 0
|
||||
if 'with_cls_pos' in kwargs:
|
||||
self.with_cls_pos = kwargs.pop('with_cls_pos')
|
||||
else:
|
||||
self.with_cls_pos = False
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
|
||||
#self.vocabulary_token = nn.Parameter(torch.zeros(1, 1, 1, self.vocabulary_size, self.embed_dims))
|
||||
self.vocabulary_token = nn.Parameter(torch.zeros(self.vocabulary_size, self.embed_dims))
|
||||
self.vocabulary_weight = nn.Parameter(torch.zeros(1, self.patch_size * self.patch_size))
|
||||
trunc_normal_(self.mask_token, mean=0., std=.02)
|
||||
trunc_normal_(self.vocabulary_token, mean=0., std=.02)
|
||||
|
||||
if self.use_attn:
|
||||
self.attn1 = Proj_MHSA(embed_dims=352, proj_dims=256, num_heads=16, batch_first=True, bias = True)
|
||||
self.attn2 = Proj_MHSA(embed_dims=704, proj_dims=512, num_heads=16, batch_first=True, bias = True)
|
||||
self.attn3 = Proj_MHSA( embed_dims=1408, proj_dims=1024, num_heads=16, batch_first=True, bias = True)
|
||||
self.attention_blocks = [self.attn1, self.attn2, self.attn3]
|
||||
self.norm_attn = build_norm_layer(dict(type='LN'), 1408)[1]
|
||||
|
||||
def create_ann_token(self, anno_img):
|
||||
B, H, W = anno_img.shape
|
||||
ann_token = torch.index_select(self.vocabulary_token, 0, anno_img.reshape(-1)).reshape(B, H, W, -1)
|
||||
assert H % self.patch_size == 0 and W % self.patch_size == 0
|
||||
nph, npw = H // self.patch_size, W // self.patch_size
|
||||
weight = F.softmax(self.vocabulary_weight, dim=1) * self.patch_size * self.patch_size
|
||||
weight = weight.reshape(1, 1, self.patch_size, 1, self.patch_size).repeat(1, nph, 1, npw, 1).reshape(1, H, W, 1)
|
||||
ann_token = ann_token * weight
|
||||
ann_token = F.avg_pool2d(torch.einsum('BHWC->BCHW', ann_token), self.patch_size, self.patch_size)
|
||||
ann_token = torch.einsum('BCHW->BHWC', ann_token).reshape(B, nph * npw, self.embed_dims) # shape B, L, C
|
||||
return ann_token
|
||||
|
||||
def forward(self, hr_img, anno_img, mask=None):
|
||||
x, hw_shape = self.patch_embed(hr_img)
|
||||
y = self.create_ann_token(anno_img)
|
||||
assert x.shape == y.shape
|
||||
B, L, C = y.shape
|
||||
if mask is not None:
|
||||
mask_tokens = self.mask_token.expand(B, L, -1)
|
||||
w = mask.flatten(1).unsqueeze(-1).type_as(mask_tokens)
|
||||
y = y * (1. - w) + mask_tokens * w
|
||||
|
||||
if self.merge_stage == 0:
|
||||
x = (x + y) * 0.5
|
||||
else:
|
||||
x = x.reshape(B, *hw_shape, C)
|
||||
y = y.reshape(B, *hw_shape, C)
|
||||
x = torch.cat((x, y), dim=2)
|
||||
hw_shape = (hw_shape[0], hw_shape[1] * 2)
|
||||
x = x.reshape(B, -1, C)
|
||||
if self.use_abs_pos_embed:
|
||||
x = x + resize_pos_embed(
|
||||
self.absolute_pos_embed, self.patch_resolution, hw_shape,
|
||||
self.interpolate_mode, self.num_extra_tokens)
|
||||
if self.with_cls_pos:
|
||||
hw_shape_half = [hw_shape[0], hw_shape[1] // 2]
|
||||
x = x.reshape(B, *hw_shape, C)
|
||||
x1 = x[:, :, :x.shape[2]//2, :].reshape(B, -1, C)
|
||||
x2 = x[:, :, x.shape[2]//2:, :].reshape(B, -1, C)
|
||||
x1 = x1 + resize_pos_embed(
|
||||
self.absolute_pos_embed, self.patch_resolution, hw_shape_half,
|
||||
self.interpolate_mode, self.num_extra_tokens)
|
||||
x2 = x2 + resize_pos_embed(
|
||||
self.absolute_pos_embed, self.patch_resolution, hw_shape_half,
|
||||
self.interpolate_mode, self.num_extra_tokens)
|
||||
x1 = x1.reshape(B, *hw_shape_half, C)
|
||||
x2 = x2.reshape(B, *hw_shape_half, C)
|
||||
x = torch.cat((x1, x2), dim=2).reshape(B, -1, C)
|
||||
x = self.drop_after_pos(x)
|
||||
outs = []
|
||||
merge_idx = self.merge_stage - 1
|
||||
for i, stage in enumerate(self.stages):
|
||||
x, hw_shape = stage(x, hw_shape)
|
||||
if i == merge_idx:
|
||||
x = x.reshape(x.shape[0], *hw_shape, x.shape[-1]) # b,l,c -> b, h, w, c
|
||||
x = (x[:, :, :x.shape[2]//2] + x[:, :, x.shape[2]//2:]) * 0.5
|
||||
x = x.reshape(x.shape[0], -1, x.shape[-1])
|
||||
hw_shape = (hw_shape[0], hw_shape[1] // 2)
|
||||
if self.use_attn:
|
||||
if i <= len(self.attention_blocks) - 1:
|
||||
x = x + self.attention_blocks[i](x)
|
||||
if i == len(self.attention_blocks) - 1:
|
||||
x = self.norm_attn(x)
|
||||
if i in self.out_indices:
|
||||
norm_layer = getattr(self, f'norm{i}')
|
||||
out = norm_layer(x)
|
||||
out = out.view(-1, *hw_shape, stage.out_channels).permute(0, 3, 1, 2).contiguous()
|
||||
outs.append(out)
|
||||
return outs
|
||||
611
lib/models/backbones/vit.py
Normal file
611
lib/models/backbones/vit.py
Normal file
@@ -0,0 +1,611 @@
|
||||
# Copyright (c) Ant Group. All rights reserved.
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint as cp
|
||||
from mmcv.cnn import build_norm_layer
|
||||
from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
|
||||
from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init,
|
||||
trunc_normal_)
|
||||
from mmcv.runner import (BaseModule, CheckpointLoader, ModuleList,
|
||||
load_state_dict)
|
||||
from torch.nn.modules.batchnorm import _BatchNorm
|
||||
from torch.nn.modules.utils import _pair as to_2tuple
|
||||
|
||||
from mmseg.ops import resize
|
||||
from mmseg.utils import get_root_logger
|
||||
from mmseg.models.utils.embed import PatchEmbed
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
class TransformerEncoderLayer(BaseModule):
|
||||
"""Implements one encoder layer in Vision Transformer.
|
||||
Args:
|
||||
embed_dims (int): The feature dimension.
|
||||
num_heads (int): Parallel attention heads.
|
||||
feedforward_channels (int): The hidden dimension for FFNs.
|
||||
drop_rate (float): Probability of an element to be zeroed
|
||||
after the feed forward layer. Default: 0.0.
|
||||
attn_drop_rate (float): The drop out rate for attention layer.
|
||||
Default: 0.0.
|
||||
drop_path_rate (float): stochastic depth rate. Default 0.0.
|
||||
num_fcs (int): The number of fully-connected layers for FFNs.
|
||||
Default: 2.
|
||||
qkv_bias (bool): enable bias for qkv if True. Default: True
|
||||
act_cfg (dict): The activation config for FFNs.
|
||||
Default: dict(type='GELU').
|
||||
norm_cfg (dict): Config dict for normalization layer.
|
||||
Default: dict(type='LN').
|
||||
batch_first (bool): Key, Query and Value are shape of
|
||||
(batch, n, embed_dim)
|
||||
or (n, batch, embed_dim). Default: True.
|
||||
with_cp (bool): Use checkpoint or not. Using checkpoint will save
|
||||
some memory while slowing down the training speed. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embed_dims,
|
||||
num_heads,
|
||||
feedforward_channels,
|
||||
drop_rate=0.,
|
||||
attn_drop_rate=0.,
|
||||
drop_path_rate=0.,
|
||||
num_fcs=2,
|
||||
qkv_bias=True,
|
||||
act_cfg=dict(type='GELU'),
|
||||
norm_cfg=dict(type='LN'),
|
||||
batch_first=True,
|
||||
attn_cfg=dict(),
|
||||
ffn_cfg=dict(),
|
||||
with_cp=False):
|
||||
super(TransformerEncoderLayer, self).__init__()
|
||||
|
||||
self.norm1_name, norm1 = build_norm_layer(norm_cfg,
|
||||
embed_dims,
|
||||
postfix=1)
|
||||
self.add_module(self.norm1_name, norm1)
|
||||
|
||||
attn_cfg.update(
|
||||
dict(embed_dims=embed_dims,
|
||||
num_heads=num_heads,
|
||||
attn_drop=attn_drop_rate,
|
||||
proj_drop=drop_rate,
|
||||
batch_first=batch_first,
|
||||
bias=qkv_bias))
|
||||
|
||||
self.build_attn(attn_cfg)
|
||||
|
||||
self.norm2_name, norm2 = build_norm_layer(norm_cfg,
|
||||
embed_dims,
|
||||
postfix=2)
|
||||
self.add_module(self.norm2_name, norm2)
|
||||
|
||||
ffn_cfg.update(
|
||||
dict(embed_dims=embed_dims,
|
||||
feedforward_channels=feedforward_channels,
|
||||
num_fcs=num_fcs,
|
||||
ffn_drop=drop_rate,
|
||||
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate)
|
||||
if drop_path_rate > 0 else None,
|
||||
act_cfg=act_cfg))
|
||||
self.build_ffn(ffn_cfg)
|
||||
self.with_cp = with_cp
|
||||
|
||||
def build_attn(self, attn_cfg):
|
||||
self.attn = MultiheadAttention(**attn_cfg)
|
||||
|
||||
def build_ffn(self, ffn_cfg):
|
||||
self.ffn = FFN(**ffn_cfg)
|
||||
|
||||
@property
|
||||
def norm1(self):
|
||||
return getattr(self, self.norm1_name)
|
||||
|
||||
@property
|
||||
def norm2(self):
|
||||
return getattr(self, self.norm2_name)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
def _inner_forward(x):
|
||||
x = self.attn(self.norm1(x), identity=x)
|
||||
x = self.ffn(self.norm2(x), identity=x)
|
||||
return x
|
||||
|
||||
if self.with_cp and x.requires_grad:
|
||||
x = cp.checkpoint(_inner_forward, x)
|
||||
else:
|
||||
x = _inner_forward(x)
|
||||
return x
|
||||
|
||||
|
||||
class VisionTransformer(BaseModule):
|
||||
"""Vision Transformer.
|
||||
This backbone is the implementation of `An Image is Worth 16x16 Words:
|
||||
Transformers for Image Recognition at
|
||||
Scale <https://arxiv.org/abs/2010.11929>`_.
|
||||
Args:
|
||||
img_size (int | tuple): Input image size. Default: 224.
|
||||
patch_size (int): The patch size. Default: 16.
|
||||
in_channels (int): Number of input channels. Default: 3.
|
||||
embed_dims (int): embedding dimension. Default: 768.
|
||||
num_layers (int): depth of transformer. Default: 12.
|
||||
num_heads (int): number of attention heads. Default: 12.
|
||||
mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
|
||||
Default: 4.
|
||||
out_indices (list | tuple | int): Output from which stages.
|
||||
Default: -1.
|
||||
qkv_bias (bool): enable bias for qkv if True. Default: True.
|
||||
drop_rate (float): Probability of an element to be zeroed.
|
||||
Default 0.0
|
||||
attn_drop_rate (float): The drop out rate for attention layer.
|
||||
Default 0.0
|
||||
drop_path_rate (float): stochastic depth rate. Default 0.0
|
||||
with_cls_token (bool): Whether concatenating class token into image
|
||||
tokens as transformer input. Default: True.
|
||||
output_cls_token (bool): Whether output the cls_token. If set True,
|
||||
`with_cls_token` must be True. Default: False.
|
||||
norm_cfg (dict): Config dict for normalization layer.
|
||||
Default: dict(type='LN')
|
||||
act_cfg (dict): The activation config for FFNs.
|
||||
Default: dict(type='GELU').
|
||||
patch_norm (bool): Whether to add a norm in PatchEmbed Block.
|
||||
Default: False.
|
||||
final_norm (bool): Whether to add a additional layer to normalize
|
||||
final feature map. Default: False.
|
||||
interpolate_mode (str): Select the interpolate mode for position
|
||||
embeding vector resize. Default: bicubic.
|
||||
num_fcs (int): The number of fully-connected layers for FFNs.
|
||||
Default: 2.
|
||||
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
||||
freeze running stats (mean and var). Note: Effect on Batch Norm
|
||||
and its variants only. Default: False.
|
||||
with_cp (bool): Use checkpoint or not. Using checkpoint will save
|
||||
some memory while slowing down the training speed. Default: False.
|
||||
pretrained (str, optional): model pretrained path. Default: None.
|
||||
init_cfg (dict or list[dict], optional): Initialization config dict.
|
||||
Default: None.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
img_size=224,
|
||||
patch_size=16,
|
||||
in_channels=3,
|
||||
embed_dims=768,
|
||||
num_layers=12,
|
||||
num_heads=12,
|
||||
mlp_ratio=4,
|
||||
out_indices=-1,
|
||||
qkv_bias=True,
|
||||
drop_rate=0.,
|
||||
attn_drop_rate=0.,
|
||||
drop_path_rate=0.,
|
||||
with_cls_token=True,
|
||||
output_cls_token=False,
|
||||
norm_cfg=dict(type='LN'),
|
||||
act_cfg=dict(type='GELU'),
|
||||
patch_norm=False,
|
||||
final_norm=False,
|
||||
interpolate_mode='bicubic',
|
||||
num_fcs=2,
|
||||
norm_eval=False,
|
||||
with_cp=False,
|
||||
use_ccd=False,
|
||||
ccd_num=0,
|
||||
pretrained=None,
|
||||
init_cfg=None):
|
||||
super(VisionTransformer, self).__init__(init_cfg=init_cfg)
|
||||
|
||||
if isinstance(img_size, int):
|
||||
img_size = to_2tuple(img_size)
|
||||
elif isinstance(img_size, tuple):
|
||||
if len(img_size) == 1:
|
||||
img_size = to_2tuple(img_size[0])
|
||||
assert len(img_size) == 2, \
|
||||
f'The size of image should have length 1 or 2, ' \
|
||||
f'but got {len(img_size)}'
|
||||
|
||||
if output_cls_token:
|
||||
assert with_cls_token is True, f'with_cls_token must be True if' \
|
||||
f'set output_cls_token to True, but got {with_cls_token}'
|
||||
|
||||
assert not (init_cfg and pretrained), \
|
||||
'init_cfg and pretrained cannot be set at the same time'
|
||||
if isinstance(pretrained, str):
|
||||
warnings.warn('DeprecationWarning: pretrained is deprecated, '
|
||||
'please use "init_cfg" instead')
|
||||
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
|
||||
elif pretrained is not None:
|
||||
raise TypeError('pretrained must be a str or None')
|
||||
|
||||
self.img_size = img_size
|
||||
self.patch_size = patch_size
|
||||
self.interpolate_mode = interpolate_mode
|
||||
self.norm_eval = norm_eval
|
||||
self.with_cp = with_cp
|
||||
self.pretrained = pretrained
|
||||
self.embed_dims = embed_dims
|
||||
self.patch_embed = PatchEmbed(
|
||||
in_channels=in_channels,
|
||||
embed_dims=embed_dims,
|
||||
conv_type='Conv2d',
|
||||
kernel_size=patch_size,
|
||||
stride=patch_size,
|
||||
padding='corner',
|
||||
norm_cfg=norm_cfg if patch_norm else None,
|
||||
init_cfg=None,
|
||||
)
|
||||
|
||||
self.use_ccd = use_ccd
|
||||
self.ccd_num = ccd_num
|
||||
if self.use_ccd:
|
||||
self.ccd_embed = nn.Parameter(
|
||||
torch.rand(1, self.ccd_num, embed_dims))
|
||||
|
||||
num_patches = (img_size[0] // patch_size) * \
|
||||
(img_size[1] // patch_size)
|
||||
|
||||
self.with_cls_token = with_cls_token
|
||||
self.output_cls_token = output_cls_token
|
||||
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
|
||||
# self.pos_embed = nn.Parameter(
|
||||
# torch.zeros(1, num_patches, embed_dims))
|
||||
# 原来是
|
||||
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches+1, embed_dims))
|
||||
self.drop_after_pos = nn.Dropout(p=drop_rate)
|
||||
|
||||
if isinstance(out_indices, int):
|
||||
if out_indices == -1:
|
||||
out_indices = num_layers - 1
|
||||
self.out_indices = [out_indices]
|
||||
elif isinstance(out_indices, list) or isinstance(out_indices, tuple):
|
||||
self.out_indices = out_indices
|
||||
else:
|
||||
raise TypeError('out_indices must be type of int, list or tuple')
|
||||
|
||||
dpr = [
|
||||
x.item() for x in torch.linspace(0, drop_path_rate, num_layers)
|
||||
] # stochastic depth decay rule
|
||||
|
||||
self.layers = ModuleList()
|
||||
for i in range(num_layers):
|
||||
self.layers.append(
|
||||
TransformerEncoderLayer(embed_dims=embed_dims,
|
||||
num_heads=num_heads,
|
||||
feedforward_channels=mlp_ratio *
|
||||
embed_dims,
|
||||
attn_drop_rate=attn_drop_rate,
|
||||
drop_rate=drop_rate,
|
||||
drop_path_rate=dpr[i],
|
||||
num_fcs=num_fcs,
|
||||
qkv_bias=qkv_bias,
|
||||
act_cfg=act_cfg,
|
||||
norm_cfg=norm_cfg,
|
||||
with_cp=with_cp,
|
||||
batch_first=True))
|
||||
|
||||
self.final_norm = final_norm
|
||||
if final_norm:
|
||||
self.norm1_name, norm1 = build_norm_layer(norm_cfg,
|
||||
embed_dims,
|
||||
postfix=1)
|
||||
self.add_module(self.norm1_name, norm1)
|
||||
|
||||
@property
|
||||
def norm1(self):
|
||||
return getattr(self, self.norm1_name)
|
||||
|
||||
def init_weights(self):
|
||||
if (isinstance(self.init_cfg, dict)
|
||||
and self.init_cfg.get('type') == 'Pretrained'):
|
||||
logger = get_root_logger()
|
||||
checkpoint = CheckpointLoader.load_checkpoint(
|
||||
self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
|
||||
|
||||
if 'state_dict' in checkpoint:
|
||||
state_dict = checkpoint['state_dict']
|
||||
else:
|
||||
state_dict = checkpoint
|
||||
|
||||
if 'pos_embed' in state_dict.keys():
|
||||
if self.pos_embed.shape != state_dict['pos_embed'].shape:
|
||||
logger.info(msg=f'Resize the pos_embed shape from '
|
||||
f'{state_dict["pos_embed"].shape} to '
|
||||
f'{self.pos_embed.shape}')
|
||||
h, w = self.img_size
|
||||
pos_size = int(
|
||||
math.sqrt(state_dict['pos_embed'].shape[1] - 1))
|
||||
state_dict['pos_embed'] = self.resize_pos_embed(
|
||||
state_dict['pos_embed'],
|
||||
(h // self.patch_size, w // self.patch_size),
|
||||
(pos_size, pos_size), self.interpolate_mode)
|
||||
|
||||
load_state_dict(self, state_dict, strict=False, logger=logger)
|
||||
elif self.init_cfg is not None:
|
||||
super(VisionTransformer, self).init_weights()
|
||||
else:
|
||||
# We only implement the 'jax_impl' initialization implemented at
|
||||
# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501
|
||||
trunc_normal_(self.pos_embed, std=.02)
|
||||
trunc_normal_(self.cls_token, std=.02)
|
||||
if self.use_ccd:
|
||||
trunc_normal_(self.ccd_embed, std=0.02)
|
||||
for n, m in self.named_modules():
|
||||
if isinstance(m, nn.Linear):
|
||||
trunc_normal_(m.weight, std=.02)
|
||||
if m.bias is not None:
|
||||
if 'ffn' in n:
|
||||
nn.init.normal_(m.bias, mean=0., std=1e-6)
|
||||
else:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.Conv2d):
|
||||
kaiming_init(m, mode='fan_in', bias=0.)
|
||||
elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):
|
||||
constant_init(m, val=1.0, bias=0.)
|
||||
|
||||
def _pos_embeding(self, patched_img, hw_shape, pos_embed):
|
||||
"""Positioning embeding method.
|
||||
Resize the pos_embed, if the input image size doesn't match
|
||||
the training size.
|
||||
Args:
|
||||
patched_img (torch.Tensor): The patched image, it should be
|
||||
shape of [B, L1, C].
|
||||
hw_shape (tuple): The downsampled image resolution.
|
||||
pos_embed (torch.Tensor): The pos_embed weighs, it should be
|
||||
shape of [B, L2, c].
|
||||
Return:
|
||||
torch.Tensor: The pos encoded image feature.
|
||||
"""
|
||||
assert patched_img.ndim == 3 and pos_embed.ndim == 3, \
|
||||
'the shapes of patched_img and pos_embed must be [B, L, C]'
|
||||
x_len, pos_len = patched_img.shape[1], pos_embed.shape[1]
|
||||
if x_len != pos_len:
|
||||
if pos_len == (self.img_size[0] // self.patch_size) * (
|
||||
self.img_size[1] // self.patch_size) + 1:
|
||||
pos_h = self.img_size[0] // self.patch_size
|
||||
pos_w = self.img_size[1] // self.patch_size
|
||||
else:
|
||||
raise ValueError(
|
||||
'Unexpected shape of pos_embed, got {}.'.format(
|
||||
pos_embed.shape))
|
||||
pos_embed = self.resize_pos_embed(pos_embed, hw_shape,
|
||||
(pos_h, pos_w),
|
||||
self.interpolate_mode)
|
||||
return self.drop_after_pos(patched_img + pos_embed)
|
||||
|
||||
@staticmethod
|
||||
def resize_pos_embed(pos_embed, input_shpae, pos_shape, mode):
|
||||
"""Resize pos_embed weights.
|
||||
Resize pos_embed using bicubic interpolate method.
|
||||
Args:
|
||||
pos_embed (torch.Tensor): Position embedding weights.
|
||||
input_shpae (tuple): Tuple for (downsampled input image height,
|
||||
downsampled input image width).
|
||||
pos_shape (tuple): The resolution of downsampled origin training
|
||||
image.
|
||||
mode (str): Algorithm used for upsampling:
|
||||
``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
|
||||
``'trilinear'``. Default: ``'nearest'``
|
||||
Return:
|
||||
torch.Tensor: The resized pos_embed of shape [B, L_new, C]
|
||||
"""
|
||||
assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
|
||||
pos_h, pos_w = pos_shape
|
||||
# keep dim for easy deployment
|
||||
cls_token_weight = pos_embed[:, 0:1]
|
||||
pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
|
||||
pos_embed_weight = pos_embed_weight.reshape(
|
||||
1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
|
||||
pos_embed_weight = resize(pos_embed_weight,
|
||||
size=input_shpae,
|
||||
align_corners=False,
|
||||
mode=mode)
|
||||
pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
|
||||
pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
|
||||
return pos_embed
|
||||
|
||||
def forward(self, inputs, ccd_index=None):
|
||||
B = inputs.shape[0]
|
||||
|
||||
x, hw_shape = self.patch_embed(inputs)
|
||||
|
||||
if self.use_ccd:
|
||||
_ccd_idx = np.concatenate(ccd_index, axis=0)
|
||||
_ccd_embed = self.ccd_embed[:, _ccd_idx, :].permute(1, 0, 2)
|
||||
x = x + _ccd_embed
|
||||
|
||||
# stole cls_tokens impl from Phil Wang, thanks
|
||||
cls_tokens = self.cls_token.expand(B, -1, -1)
|
||||
x = torch.cat((cls_tokens, x), dim=1)
|
||||
x = self._pos_embeding(x, hw_shape, self.pos_embed)
|
||||
|
||||
if not self.with_cls_token:
|
||||
# Remove class token for transformer encoder input
|
||||
x = x[:, 1:]
|
||||
|
||||
outs = []
|
||||
for i, layer in enumerate(self.layers):
|
||||
x = layer(x)
|
||||
if i == len(self.layers) - 1:
|
||||
if self.final_norm:
|
||||
x = self.norm1(x)
|
||||
if i in self.out_indices:
|
||||
if self.with_cls_token:
|
||||
# Remove class token and reshape token for decoder heads
|
||||
out = x[:, 1:]
|
||||
else:
|
||||
out = x
|
||||
B, _, C = out.shape
|
||||
out = out.reshape(B, hw_shape[0], hw_shape[1],
|
||||
C).permute(0, 3, 1, 2).contiguous()
|
||||
if self.output_cls_token:
|
||||
out = [out, x[:, 0]]
|
||||
outs.append(out)
|
||||
|
||||
return tuple(outs)
|
||||
|
||||
def train(self, mode=True):
|
||||
super(VisionTransformer, self).train(mode)
|
||||
if mode and self.norm_eval:
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.LayerNorm):
|
||||
m.eval()
|
||||
|
||||
|
||||
class VisionTransformerMSL(VisionTransformer):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if 'use_attn' in kwargs:
|
||||
self.use_attn = kwargs.pop('use_attn')
|
||||
else:
|
||||
self.use_attn = False
|
||||
if 'merge_stage' in kwargs:
|
||||
self.merge_stage = kwargs.pop('merge_stage')
|
||||
else:
|
||||
self.merge_stage = 0
|
||||
if 'with_cls_pos' in kwargs:
|
||||
self.with_cls_pos = kwargs.pop('with_cls_pos')
|
||||
else:
|
||||
self.with_cls_pos = False
|
||||
|
||||
self.vocabulary_size = kwargs.pop('vocabulary_size') + 1 # 增加ignore类别
|
||||
super().__init__(**kwargs)
|
||||
self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
|
||||
img_size = kwargs.pop('img_size')
|
||||
patch_size = kwargs.pop('patch_size')
|
||||
num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
|
||||
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, self.embed_dims))
|
||||
|
||||
self.vocabulary_token = nn.Parameter(torch.zeros(self.vocabulary_size, self.embed_dims))
|
||||
self.vocabulary_weight = nn.Parameter(torch.zeros(1, self.patch_size * self.patch_size))
|
||||
trunc_normal_(self.mask_token, mean=0., std=.02)
|
||||
trunc_normal_(self.vocabulary_token, mean=0., std=.02)
|
||||
|
||||
if self.use_attn:
|
||||
self.attn1 = MultiheadAttention(embed_dims=1024, num_heads=16, batch_first=True, bias = True)
|
||||
self.attn2 = MultiheadAttention(embed_dims=1024, num_heads=16, batch_first=True, bias = True)
|
||||
self.attn3 = MultiheadAttention(embed_dims=1024, num_heads=16, batch_first=True, bias = True)
|
||||
self.attention_blocks = [self.attn1, self.attn2, self.attn3]
|
||||
self.norm_attn = build_norm_layer(dict(type='LN'), 1024)[1]
|
||||
|
||||
def create_ann_token(self, anno_img):
|
||||
B, H, W = anno_img.shape
|
||||
ann_token = torch.index_select(self.vocabulary_token, 0, anno_img.reshape(-1)).reshape(B, H, W, -1)
|
||||
assert H % self.patch_size == 0 and W % self.patch_size == 0
|
||||
nph, npw = H // self.patch_size, W // self.patch_size
|
||||
weight = F.softmax(self.vocabulary_weight, dim=1) * self.patch_size * self.patch_size
|
||||
weight = weight.reshape(1, 1, self.patch_size, 1, self.patch_size).repeat(1, nph, 1, npw, 1).reshape(1, H, W, 1)
|
||||
ann_token = ann_token * weight
|
||||
ann_token = F.avg_pool2d(torch.einsum('BHWC->BCHW', ann_token), self.patch_size, self.patch_size)
|
||||
ann_token = torch.einsum('BCHW->BHWC', ann_token).reshape(B, nph * npw, self.embed_dims) # shape B, L, C
|
||||
return ann_token
|
||||
def _pos_embeding(self, patched_img, hw_shape, pos_embed):
|
||||
"""Positioning embeding method.
|
||||
Resize the pos_embed, if the input image size doesn't match
|
||||
the training size.
|
||||
Args:
|
||||
patched_img (torch.Tensor): The patched image, it should be
|
||||
shape of [B, L1, C].
|
||||
hw_shape (tuple): The downsampled image resolution.
|
||||
pos_embed (torch.Tensor): The pos_embed weighs, it should be
|
||||
shape of [B, L2, c].
|
||||
Return:
|
||||
torch.Tensor: The pos encoded image feature.
|
||||
"""
|
||||
assert patched_img.ndim == 3 and pos_embed.ndim == 3, \
|
||||
'the shapes of patched_img and pos_embed must be [B, L, C]'
|
||||
x_len, pos_len = patched_img.shape[1], pos_embed.shape[1]
|
||||
if x_len != pos_len:
|
||||
if pos_len == (self.img_size[0] // self.patch_size) * (
|
||||
self.img_size[1] // self.patch_size):
|
||||
pos_h = self.img_size[0] // self.patch_size
|
||||
pos_w = self.img_size[1] // self.patch_size
|
||||
else:
|
||||
raise ValueError(
|
||||
'Unexpected shape of pos_embed, got {}.'.format(
|
||||
pos_embed.shape))
|
||||
pos_embed = self.resize_pos_embed(pos_embed, hw_shape,
|
||||
(pos_h, pos_w),
|
||||
self.interpolate_mode)
|
||||
return self.drop_after_pos(patched_img + pos_embed)
|
||||
|
||||
@staticmethod
|
||||
def resize_pos_embed(pos_embed, input_shpae, pos_shape, mode):
|
||||
"""Resize pos_embed weights.
|
||||
Resize pos_embed using bicubic interpolate method.
|
||||
Args:
|
||||
pos_embed (torch.Tensor): Position embedding weights.
|
||||
input_shpae (tuple): Tuple for (downsampled input image height,
|
||||
downsampled input image width).
|
||||
pos_shape (tuple): The resolution of downsampled origin training
|
||||
image.
|
||||
mode (str): Algorithm used for upsampling:
|
||||
``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
|
||||
``'trilinear'``. Default: ``'nearest'``
|
||||
Return:
|
||||
torch.Tensor: The resized pos_embed of shape [B, L_new, C]
|
||||
"""
|
||||
assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
|
||||
pos_h, pos_w = pos_shape
|
||||
pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
|
||||
pos_embed_weight = pos_embed_weight.reshape(
|
||||
1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
|
||||
pos_embed_weight = resize(pos_embed_weight,
|
||||
size=input_shpae,
|
||||
align_corners=False,
|
||||
mode=mode)
|
||||
pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
|
||||
pos_embed = pos_embed_weight # torch.cat((cls_token_weight, pos_embed_weight), dim=1)
|
||||
return pos_embed
|
||||
|
||||
def forward(self, x, y, mask=None):
|
||||
x, hw_shape = self.patch_embed(x)
|
||||
y = self.create_ann_token(y)
|
||||
assert x.shape == y.shape
|
||||
B, L, C = y.shape
|
||||
if mask is not None:
|
||||
mask_tokens = self.mask_token.expand(B, L, -1)
|
||||
w = mask.flatten(1).unsqueeze(-1).type_as(mask_tokens)
|
||||
y = y * (1. - w) + mask_tokens * w
|
||||
if self.merge_stage == 0:
|
||||
x = (x + y) * 0.5
|
||||
else:
|
||||
x = x.reshape(B, *hw_shape, C)
|
||||
y = y.reshape(B, *hw_shape, C)
|
||||
x = torch.cat((x, y), dim=2)
|
||||
hw_shape = (hw_shape[0], hw_shape[1] * 2)
|
||||
x = x.reshape(B, -1, C)
|
||||
x = self._pos_embeding(x, hw_shape, self.pos_embed)
|
||||
merge_idx = self.merge_stage - 1
|
||||
outs = []
|
||||
for i, layer in enumerate(self.layers):
|
||||
x = layer(x)
|
||||
if i == merge_idx:
|
||||
x = x.reshape(x.shape[0], *hw_shape, x.shape[-1]) # b,l,c -> b, h, w, c
|
||||
x = (x[:, :, :x.shape[2]//2] + x[:, :, x.shape[2]//2:]) * 0.5
|
||||
x = x.reshape(x.shape[0], -1, x.shape[-1])
|
||||
hw_shape = (hw_shape[0], hw_shape[1] // 2)
|
||||
if self.use_attn:
|
||||
if i <= len(self.attention_blocks) - 1:
|
||||
x = x + self.attention_blocks[i](x)
|
||||
if i == len(self.attention_blocks) - 1:
|
||||
x = self.norm_attn(x) # 会不会有冲突
|
||||
if (not self.use_attn) and (i == len(self.layers) - 1):
|
||||
if self.final_norm:
|
||||
x = self.norm1(x) # 会不会有冲突
|
||||
if i in self.out_indices:
|
||||
if self.with_cls_token:
|
||||
# Remove class token and reshape token for decoder heads
|
||||
out = x[:, 1:]
|
||||
else:
|
||||
out = x
|
||||
B, _, C = out.shape
|
||||
out = out.reshape(B, hw_shape[0], hw_shape[1],
|
||||
C).permute(0, 3, 1, 2).contiguous()
|
||||
if self.output_cls_token:
|
||||
out = [out, x[:, 0]]
|
||||
outs.append(out)
|
||||
|
||||
return tuple(outs)
|
||||
Reference in New Issue
Block a user