init
finetune/mmseg/models/utils/embed.py (new file, 330 lines)
@@ -0,0 +1,330 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Sequence

import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmengine.model import BaseModule
from mmengine.utils import to_2tuple


class AdaptivePadding(nn.Module):
    """Applies padding to the input (if needed) so that the input can be
    fully covered by the specified filter. It supports two modes, "same" and
    "corner". The "same" mode is the same as the "SAME" padding mode in
    TensorFlow and pads zeros evenly around the input; the "corner" mode pads
    zeros to the bottom right of the input.

    Args:
        kernel_size (int | tuple): Size of the kernel. Default: 1.
        stride (int | tuple): Stride of the filter. Default: 1.
        dilation (int | tuple): Spacing between kernel elements.
            Default: 1.
        padding (str): Support "same" and "corner". The "corner" mode
            pads zeros to the bottom right, and the "same" mode pads
            zeros around the input. Default: "corner".
    Example:
        >>> kernel_size = 16
        >>> stride = 16
        >>> dilation = 1
        >>> input = torch.rand(1, 1, 15, 17)
        >>> adap_pad = AdaptivePadding(
        >>>     kernel_size=kernel_size,
        >>>     stride=stride,
        >>>     dilation=dilation,
        >>>     padding="corner")
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
        >>> input = torch.rand(1, 1, 16, 17)
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
    """

    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):

        super().__init__()

        assert padding in ('same', 'corner')

        kernel_size = to_2tuple(kernel_size)
        stride = to_2tuple(stride)
        dilation = to_2tuple(dilation)

        self.padding = padding
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation

    def get_pad_shape(self, input_shape):
        input_h, input_w = input_shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        output_h = math.ceil(input_h / stride_h)
        output_w = math.ceil(input_w / stride_w)
        pad_h = max((output_h - 1) * stride_h +
                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
        pad_w = max((output_w - 1) * stride_w +
                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
        return pad_h, pad_w

    def forward(self, x):
        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
        if pad_h > 0 or pad_w > 0:
            if self.padding == 'corner':
                x = F.pad(x, [0, pad_w, 0, pad_h])
            elif self.padding == 'same':
                x = F.pad(x, [
                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
                    pad_h - pad_h // 2
                ])
        return x
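
# Worked example: for the docstring case above (kernel_size=16, stride=16,
# dilation=1, input 15x17), `get_pad_shape` gives
#   output_h = ceil(15 / 16) = 1, output_w = ceil(17 / 16) = 2
#   pad_h = max((1 - 1) * 16 + (16 - 1) * 1 + 1 - 15, 0) = 1
#   pad_w = max((2 - 1) * 16 + (16 - 1) * 1 + 1 - 17, 0) = 15
# so the 15x17 input is padded to 16x32 before the 16x16 kernel is applied.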


class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    We use a conv layer to implement PatchEmbed.

    Args:
        in_channels (int): The num of input channels. Default: 3.
        embed_dims (int): The dimensions of embedding. Default: 768.
        conv_type (str): The type of convolution layer used for the
            embedding. Default: "Conv2d".
        kernel_size (int): The kernel_size of the embedding conv. Default: 16.
        stride (int, optional): The stride of the embedding conv.
            Default: None (would be set to `kernel_size`).
        padding (int | tuple | string): The padding length of the
            embedding conv. When it is a string, it means the mode
            of adaptive padding; "same" and "corner" are supported.
            Default: "corner".
        dilation (int): The dilation rate of the embedding conv. Default: 1.
        bias (bool): Bias of the embedding conv. Default: True.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: None.
        input_size (int | tuple | None): The size of input, which will be
            used to calculate the out size. Only works when `dynamic_size`
            is False. Default: None.
        init_cfg (`mmengine.ConfigDict`, optional): The Config for
            initialization. Default: None.
    """

    def __init__(self,
                 in_channels=3,
                 embed_dims=768,
                 conv_type='Conv2d',
                 kernel_size=16,
                 stride=None,
                 padding='corner',
                 dilation=1,
                 bias=True,
                 norm_cfg=None,
                 input_size=None,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        self.embed_dims = embed_dims
        if stride is None:
            stride = kernel_size

        kernel_size = to_2tuple(kernel_size)
        stride = to_2tuple(stride)
        dilation = to_2tuple(dilation)

        if isinstance(padding, str):
            self.adap_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
            # disable the padding of conv
            padding = 0
        else:
            self.adap_padding = None
        padding = to_2tuple(padding)

        self.projection = build_conv_layer(
            dict(type=conv_type),
            in_channels=in_channels,
            out_channels=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        if norm_cfg is not None:
            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
        else:
            self.norm = None

        if input_size:
            input_size = to_2tuple(input_size)
            # `init_out_size` is used outside this module to calculate
            # `num_patches` when `use_abs_pos_embed` is enabled
            self.init_input_size = input_size
            if self.adap_padding:
                pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
                input_h, input_w = input_size
                input_h = input_h + pad_h
                input_w = input_w + pad_w
                input_size = (input_h, input_w)

            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
                     (kernel_size[0] - 1) - 1) // stride[0] + 1
            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
                     (kernel_size[1] - 1) - 1) // stride[1] + 1
            self.init_out_size = (h_out, w_out)
        else:
            self.init_input_size = None
            self.init_out_size = None

    def forward(self, x):
        """
        Args:
            x (Tensor): Has shape (B, C, H, W). In most cases, C is 3.

        Returns:
            tuple: Contains merged results and its spatial shape.

            - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
            - out_size (tuple[int]): Spatial shape of x, arranged as
              (out_h, out_w).
        """

        if self.adap_padding:
            x = self.adap_padding(x)

        x = self.projection(x)
        out_size = (x.shape[2], x.shape[3])
        x = x.flatten(2).transpose(1, 2)
        if self.norm is not None:
            x = self.norm(x)
        return x, out_size
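
# A minimal usage sketch (illustrative values, assuming a ViT-style
# patchification of a 224x224 RGB image into 16x16 patches):
#   >>> import torch
#   >>> patch_embed = PatchEmbed(
#   ...     in_channels=3, embed_dims=768, kernel_size=16, stride=16,
#   ...     padding='corner', norm_cfg=dict(type='LN'), input_size=224)
#   >>> tokens, hw = patch_embed(torch.rand(1, 3, 224, 224))
#   >>> tokens.shape     # torch.Size([1, 196, 768])
#   >>> hw               # (14, 14), also exposed as patch_embed.init_out_size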


class PatchMerging(BaseModule):
    """Merge patch feature map.

    This layer groups the feature map by kernel_size, and applies norm and
    linear layers to the grouped feature map. Our implementation uses
    `nn.Unfold` to merge patches, which is about 25% faster than the original
    implementation, but pretrained models need to be modified for
    compatibility.

    Args:
        in_channels (int): The num of input channels.
        out_channels (int): The num of output channels.
        kernel_size (int | tuple, optional): The kernel size in the unfold
            layer. Default: 2.
        stride (int | tuple, optional): The stride of the sliding blocks in
            the unfold layer. Default: None (would be set to `kernel_size`).
        padding (int | tuple | string): The padding length of the unfold
            layer. When it is a string, it means the mode of adaptive
            padding; "same" and "corner" are supported. Default: "corner".
        dilation (int | tuple, optional): Dilation parameter in the unfold
            layer. Default: 1.
        bias (bool, optional): Whether to add bias in the linear layer.
            Default: False.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=2,
                 stride=None,
                 padding='corner',
                 dilation=1,
                 bias=False,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.out_channels = out_channels
        if not stride:
            stride = kernel_size

        kernel_size = to_2tuple(kernel_size)
        stride = to_2tuple(stride)
        dilation = to_2tuple(dilation)

        if isinstance(padding, str):
            self.adap_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
            # disable the padding of unfold
            padding = 0
        else:
            self.adap_padding = None

        padding = to_2tuple(padding)
        self.sampler = nn.Unfold(
            kernel_size=kernel_size,
            dilation=dilation,
            padding=padding,
            stride=stride)

        sample_dim = kernel_size[0] * kernel_size[1] * in_channels

        if norm_cfg is not None:
            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
        else:
            self.norm = None

        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)

    def forward(self, x, input_size):
        """
        Args:
            x (Tensor): Has shape (B, H*W, C_in).
            input_size (tuple[int]): The spatial shape of x, arranged as
                (H, W).

        Returns:
            tuple: Contains merged results and its spatial shape.

            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
            - out_size (tuple[int]): Spatial shape of x, arranged as
              (Merged_H, Merged_W).
        """
        B, L, C = x.shape
        assert isinstance(input_size, Sequence), \
            f'Expect input_size to be a `Sequence`, but got {input_size}'

        H, W = input_size
        assert L == H * W, 'input feature has wrong size'

        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
        # Use nn.Unfold to merge patches. About 25% faster than the original
        # method, but the pretrained model needs to be modified for
        # compatibility.

        if self.adap_padding:
            x = self.adap_padding(x)
            H, W = x.shape[-2:]

        x = self.sampler(x)
        # if kernel_size=2 and stride=2, x should have shape (B, 4*C, H/2*W/2)

        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
                 (self.sampler.kernel_size[0] - 1) -
                 1) // self.sampler.stride[0] + 1
        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
                 (self.sampler.kernel_size[1] - 1) -
                 1) // self.sampler.stride[1] + 1

        output_size = (out_h, out_w)
        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
        x = self.norm(x) if self.norm else x
        x = self.reduction(x)
        return x, output_size
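
# A minimal usage sketch (illustrative values, assuming a Swin-style 2x
# downsampling of a 56x56 feature map with 96 channels):
#   >>> import torch
#   >>> merge = PatchMerging(in_channels=96, out_channels=192, kernel_size=2)
#   >>> out, out_size = merge(torch.rand(1, 56 * 56, 96), (56, 56))
#   >>> out.shape        # torch.Size([1, 784, 192])
#   >>> out_size         # (28, 28)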