init

2025-12-08 22:16:31 +08:00
commit 01adcfdf60
305 changed files with 50879 additions and 0 deletions
--- a/finetune/mmseg/models/segmentors/init.py
+++ b/finetune/mmseg/models/segmentors/init.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import BaseSegmentor
+from .cascade_encoder_decoder import CascadeEncoderDecoder
+from .depth_estimator import DepthEstimator
+from .encoder_decoder import EncoderDecoder
+from .multimodal_encoder_decoder import MultimodalEncoderDecoder
+from .seg_tta import SegTTAModel
+
+__all__ = [
+    'BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder', 'SegTTAModel',
+    'MultimodalEncoderDecoder', 'DepthEstimator'
+]
--- a/finetune/mmseg/models/segmentors/base.py
+++ b/finetune/mmseg/models/segmentors/base.py
@@ -0,0 +1,200 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Tuple
+
+from mmengine.model import BaseModel
+from mmengine.structures import PixelData
+from torch import Tensor
+
+from mmseg.structures import SegDataSample
+from mmseg.utils import (ForwardResults, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList)
+from ..utils import resize
+
+
+class BaseSegmentor(BaseModel, metaclass=ABCMeta):
+    """Base class for segmentors.
+
+    Args:
+        data_preprocessor (dict, optional): Model preprocessing config
+            for processing the input data. it usually includes
+            ``to_rgb``, ``pad_size_divisor``, ``pad_val``,
+            ``mean`` and ``std``. Default to None.
+       init_cfg (dict, optional): the config to control the
+           initialization. Default to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+    @property
+    def with_neck(self) -> bool:
+        """bool: whether the segmentor has neck"""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_auxiliary_head(self) -> bool:
+        """bool: whether the segmentor has auxiliary head"""
+        return hasattr(self,
+                       'auxiliary_head') and self.auxiliary_head is not None
+
+    @property
+    def with_decode_head(self) -> bool:
+        """bool: whether the segmentor has decode head"""
+        return hasattr(self, 'decode_head') and self.decode_head is not None
+
+    @abstractmethod
+    def extract_feat(self, inputs: Tensor) -> bool:
+        """Placeholder for extract features from images."""
+        pass
+
+    @abstractmethod
+    def encode_decode(self, inputs: Tensor, batch_data_samples: SampleList):
+        """Placeholder for encode images with backbone and decode into a
+        semantic segmentation map of the same size as input."""
+        pass
+
+    def forward(self,
+                inputs: Tensor,
+                data_samples: OptSampleList = None,
+                mode: str = 'tensor') -> ForwardResults:
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+        tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+        processed to a list of :obj:`SegDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+        inputs and data samples.
+
+        Note that this method doesn't handle neither back propagation nor
+        optimizer updating, which are done in the :meth:`train_step`.
+
+        Args:
+            inputs (torch.Tensor): The input tensor with shape (N, C, ...) in
+                general.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_sem_seg`. Default to None.
+            mode (str): Return what kind of value. Defaults to 'tensor'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of :obj:`DetDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if mode == 'loss':
+            return self.loss(inputs, data_samples)
+        elif mode == 'predict':
+            return self.predict(inputs, data_samples)
+        elif mode == 'tensor':
+            return self._forward(inputs, data_samples)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    @abstractmethod
+    def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples."""
+        pass
+
+    @abstractmethod
+    def predict(self,
+                inputs: Tensor,
+                data_samples: OptSampleList = None) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing."""
+        pass
+
+    @abstractmethod
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        pass
+
+    def postprocess_result(self,
+                           seg_logits: Tensor,
+                           data_samples: OptSampleList = None) -> SampleList:
+        """ Convert results list to `SegDataSample`.
+        Args:
+            seg_logits (Tensor): The segmentation results, seg_logits from
+                model of each input image.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_sem_seg`. Default to None.
+        Returns:
+            list[:obj:`SegDataSample`]: Segmentation results of the
+            input images. Each SegDataSample usually contain:
+
+            - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation.
+            - ``seg_logits``(PixelData): Predicted logits of semantic
+                segmentation before normalization.
+        """
+        batch_size, C, H, W = seg_logits.shape
+
+        if data_samples is None:
+            data_samples = [SegDataSample() for _ in range(batch_size)]
+            only_prediction = True
+        else:
+            only_prediction = False
+
+        for i in range(batch_size):
+            if not only_prediction:
+                img_meta = data_samples[i].metainfo
+                # remove padding area
+                if 'img_padding_size' not in img_meta:
+                    padding_size = img_meta.get('padding_size', [0] * 4)
+                else:
+                    padding_size = img_meta['img_padding_size']
+                padding_left, padding_right, padding_top, padding_bottom =\
+                    padding_size
+                # i_seg_logits shape is 1, C, H, W after remove padding
+                i_seg_logits = seg_logits[i:i + 1, :,
+                                          padding_top:H - padding_bottom,
+                                          padding_left:W - padding_right]
+
+                flip = img_meta.get('flip', None)
+                if flip:
+                    flip_direction = img_meta.get('flip_direction', None)
+                    assert flip_direction in ['horizontal', 'vertical']
+                    if flip_direction == 'horizontal':
+                        i_seg_logits = i_seg_logits.flip(dims=(3, ))
+                    else:
+                        i_seg_logits = i_seg_logits.flip(dims=(2, ))
+
+                # resize as original shape
+                i_seg_logits = resize(
+                    i_seg_logits,
+                    size=img_meta['ori_shape'],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    warning=False).squeeze(0)
+            else:
+                i_seg_logits = seg_logits[i]
+
+            if C > 1:
+                i_seg_pred = i_seg_logits.argmax(dim=0, keepdim=True)
+            else:
+                i_seg_logits = i_seg_logits.sigmoid()
+                i_seg_pred = (i_seg_logits >
+                              self.decode_head.threshold).to(i_seg_logits)
+            data_samples[i].set_data({
+                'seg_logits':
+                PixelData(**{'data': i_seg_logits}),
+                'pred_sem_seg':
+                PixelData(**{'data': i_seg_pred})
+            })
+
+        return data_samples
--- a/finetune/mmseg/models/segmentors/cascade_encoder_decoder.py
+++ b/finetune/mmseg/models/segmentors/cascade_encoder_decoder.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+from torch import Tensor, nn
+
+from mmseg.registry import MODELS
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from .encoder_decoder import EncoderDecoder
+
+
+@MODELS.register_module()
+class CascadeEncoderDecoder(EncoderDecoder):
+    """Cascade Encoder Decoder segmentors.
+
+    CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of
+    CascadeEncoderDecoder are cascaded. The output of previous decoder_head
+    will be the input of next decoder_head.
+
+    Args:
+
+        num_stages (int): How many stages will be cascaded.
+        backbone (ConfigType): The config for the backnone of segmentor.
+        decode_head (ConfigType): The config for the decode head of segmentor.
+        neck (OptConfigType): The config for the neck of segmentor.
+            Defaults to None.
+        auxiliary_head (OptConfigType): The config for the auxiliary head of
+            segmentor. Defaults to None.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
+    """
+
+    def __init__(self,
+                 num_stages: int,
+                 backbone: ConfigType,
+                 decode_head: ConfigType,
+                 neck: OptConfigType = None,
+                 auxiliary_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 pretrained: Optional[str] = None,
+                 init_cfg: OptMultiConfig = None):
+        self.num_stages = num_stages
+        super().__init__(
+            backbone=backbone,
+            decode_head=decode_head,
+            neck=neck,
+            auxiliary_head=auxiliary_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
+
+    def _init_decode_head(self, decode_head: ConfigType) -> None:
+        """Initialize ``decode_head``"""
+        assert isinstance(decode_head, list)
+        assert len(decode_head) == self.num_stages
+        self.decode_head = nn.ModuleList()
+        for i in range(self.num_stages):
+            self.decode_head.append(MODELS.build(decode_head[i]))
+        self.align_corners = self.decode_head[-1].align_corners
+        self.num_classes = self.decode_head[-1].num_classes
+        self.out_channels = self.decode_head[-1].out_channels
+
+    def encode_decode(self, inputs: Tensor,
+                      batch_img_metas: List[dict]) -> Tensor:
+        """Encode images with backbone and decode into a semantic segmentation
+        map of the same size as input."""
+        x = self.extract_feat(inputs)
+        out = self.decode_head[0].forward(x)
+        for i in range(1, self.num_stages - 1):
+            out = self.decode_head[i].forward(x, out)
+        seg_logits_list = self.decode_head[-1].predict(x, out, batch_img_metas,
+                                                       self.test_cfg)
+
+        return seg_logits_list
+
+    def _decode_head_forward_train(self, inputs: Tensor,
+                                   data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for decode head in
+        training."""
+        losses = dict()
+
+        loss_decode = self.decode_head[0].loss(inputs, data_samples,
+                                               self.train_cfg)
+
+        losses.update(add_prefix(loss_decode, 'decode_0'))
+        # get batch_img_metas
+        batch_size = len(data_samples)
+        batch_img_metas = []
+        for batch_index in range(batch_size):
+            metainfo = data_samples[batch_index].metainfo
+            batch_img_metas.append(metainfo)
+
+        for i in range(1, self.num_stages):
+            # forward test again, maybe unnecessary for most methods.
+            if i == 1:
+                prev_outputs = self.decode_head[0].forward(inputs)
+            else:
+                prev_outputs = self.decode_head[i - 1].forward(
+                    inputs, prev_outputs)
+            loss_decode = self.decode_head[i].loss(inputs, prev_outputs,
+                                                   data_samples,
+                                                   self.train_cfg)
+            losses.update(add_prefix(loss_decode, f'decode_{i}'))
+
+        return losses
+
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_semantic_seg`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(inputs)
+
+        out = self.decode_head[0].forward(x)
+        for i in range(1, self.num_stages):
+            # TODO support PointRend tensor mode
+            out = self.decode_head[i].forward(x, out)
+
+        return out
--- a/finetune/mmseg/models/segmentors/depth_estimator.py
+++ b/finetune/mmseg/models/segmentors/depth_estimator.py
@@ -0,0 +1,392 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.logging import print_log
+from mmengine.structures import PixelData
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.structures import SegDataSample
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from ..utils import resize
+from .encoder_decoder import EncoderDecoder
+
+
+@MODELS.register_module()
+class DepthEstimator(EncoderDecoder):
+    """Encoder Decoder depth estimator.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+
+    1. The ``loss`` method is used to calculate the loss of model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+     loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
+     _decode_head_forward_train(): decode_head.loss()
+     _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
+
+    2. The ``predict`` method is used to predict depth estimation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    depth (2) Call post-processing function to obtain list of
+    ``SegDataSample`` including ``pred_depth_map``.
+
+    .. code:: text
+
+     predict(): inference() -> postprocess_result()
+     inference(): whole_inference()/slide_inference()
+     whole_inference()/slide_inference(): encoder_decoder()
+     encoder_decoder(): extract_feat() -> decode_head.predict()
+
+    3. The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2)Call the decode head forward function to forward decode head model.
+
+    .. code:: text
+
+     _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+
+        backbone (ConfigType): The config for the backnone of depth estimator.
+        decode_head (ConfigType): The config for the decode head of depth estimator.
+        neck (OptConfigType): The config for the neck of depth estimator.
+            Defaults to None.
+        auxiliary_head (OptConfigType): The config for the auxiliary head of
+            depth estimator. Defaults to None.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
+    """  # noqa: E501
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 decode_head: ConfigType,
+                 neck: OptConfigType = None,
+                 auxiliary_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 pretrained: Optional[str] = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            decode_head=decode_head,
+            neck=neck,
+            auxiliary_head=auxiliary_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            pretrained=pretrained,
+            init_cfg=init_cfg)
+
+    def extract_feat(self,
+                     inputs: Tensor,
+                     batch_img_metas: Optional[List[dict]] = None) -> Tensor:
+        """Extract features from images."""
+
+        if getattr(self.backbone, 'class_embed_select', False) and \
+                isinstance(batch_img_metas, list) and \
+                'category_id' in batch_img_metas[0]:
+            cat_ids = [meta['category_id'] for meta in batch_img_metas]
+            cat_ids = torch.tensor(cat_ids).to(inputs.device)
+            inputs = (inputs, cat_ids)
+
+        x = self.backbone(inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def encode_decode(self, inputs: Tensor,
+                      batch_img_metas: List[dict]) -> Tensor:
+        """Encode images with backbone and decode into a depth map of the same
+        size as input."""
+        x = self.extract_feat(inputs, batch_img_metas)
+        depth = self.decode_head.predict(x, batch_img_metas, self.test_cfg)
+
+        return depth
+
+    def _decode_head_forward_train(self, inputs: List[Tensor],
+                                   data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for decode head in
+        training."""
+        losses = dict()
+        loss_decode = self.decode_head.loss(inputs, data_samples,
+                                            self.train_cfg)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def _auxiliary_head_forward_train(self, inputs: List[Tensor],
+                                      data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for auxiliary head in
+        training."""
+        losses = dict()
+        if isinstance(self.auxiliary_head, nn.ModuleList):
+            for idx, aux_head in enumerate(self.auxiliary_head):
+                loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg)
+                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+        else:
+            loss_aux = self.auxiliary_head.loss(inputs, data_samples,
+                                                self.train_cfg)
+            losses.update(add_prefix(loss_aux, 'aux'))
+
+        return losses
+
+    def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Tensor): Input images.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_depth_map`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        if data_samples is not None:
+            batch_img_metas = [
+                data_sample.metainfo for data_sample in data_samples
+            ]
+        else:
+            batch_img_metas = [
+                dict(
+                    ori_shape=inputs.shape[2:],
+                    img_shape=inputs.shape[2:],
+                    pad_shape=inputs.shape[2:],
+                    padding_size=[0, 0, 0, 0])
+            ] * inputs.shape[0]
+
+        x = self.extract_feat(inputs, batch_img_metas)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(x, data_samples)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(x, data_samples)
+            losses.update(loss_aux)
+
+        return losses
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: OptSampleList = None) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`], optional): The seg data
+                samples. It usually includes information such as `metainfo`
+                and `gt_depth_map`.
+
+        Returns:
+            list[:obj:`SegDataSample`]: Depth estimation results of the
+            input images. Each SegDataSample usually contain:
+
+            - ``pred_depth_max``(PixelData): Prediction of depth estimation.
+        """
+        if data_samples is not None:
+            batch_img_metas = [
+                data_sample.metainfo for data_sample in data_samples
+            ]
+        else:
+            batch_img_metas = [
+                dict(
+                    ori_shape=inputs.shape[2:],
+                    img_shape=inputs.shape[2:],
+                    pad_shape=inputs.shape[2:],
+                    padding_size=[0, 0, 0, 0])
+            ] * inputs.shape[0]
+
+        depth = self.inference(inputs, batch_img_metas)
+
+        return self.postprocess_result(depth, data_samples)
+
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_depth_map`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(inputs)
+        return self.decode_head.forward(x)
+
+    def slide_flip_inference(self, inputs: Tensor,
+                             batch_img_metas: List[dict]) -> Tensor:
+        """Inference by sliding-window with overlap and flip.
+
+        If h_crop > h_img or w_crop > w_img, the small patch will be used to
+        decode without padding.
+
+        Args:
+            inputs (tensor): the tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The depth estimation results.
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = inputs.size()
+        out_channels = self.out_channels
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+        count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = inputs[:, :, y1:y2, x1:x2]
+                # change the image shape to patch shape
+                batch_img_metas[0]['img_shape'] = crop_img.shape[2:]
+                # the output of encode_decode is depth tensor map
+                # with shape [N, C, H, W]
+                crop_depth_map = self.encode_decode(crop_img, batch_img_metas)
+
+                # average out the original and flipped prediction
+                crop_depth_map_flip = self.encode_decode(
+                    crop_img.flip(dims=(3, )), batch_img_metas)
+                crop_depth_map_flip = crop_depth_map_flip.flip(dims=(3, ))
+                crop_depth_map = (crop_depth_map + crop_depth_map_flip) / 2.0
+
+                preds += F.pad(crop_depth_map,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        depth = preds / count_mat
+
+        return depth
+
+    def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor:
+        """Inference with slide/whole style.
+
+        Args:
+            inputs (Tensor): The input image of shape (N, 3, H, W).
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', 'pad_shape', and 'padding_size'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The depth estimation results.
+        """
+        assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole',
+                                                      'slide_flip'], \
+            f'Only "slide", "slide_flip" or "whole" test mode are ' \
+            f'supported, but got {self.test_cfg["mode"]}.'
+        ori_shape = batch_img_metas[0]['ori_shape']
+        if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas):
+            print_log(
+                'Image shapes are different in the batch.',
+                logger='current',
+                level=logging.WARN)
+        if self.test_cfg.mode == 'slide':
+            depth_map = self.slide_inference(inputs, batch_img_metas)
+        if self.test_cfg.mode == 'slide_flip':
+            depth_map = self.slide_flip_inference(inputs, batch_img_metas)
+        else:
+            depth_map = self.whole_inference(inputs, batch_img_metas)
+
+        return depth_map
+
+    def postprocess_result(self,
+                           depth: Tensor,
+                           data_samples: OptSampleList = None) -> SampleList:
+        """ Convert results list to `SegDataSample`.
+        Args:
+            depth (Tensor): The depth estimation results.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_depth_map`. Default to None.
+        Returns:
+            list[:obj:`SegDataSample`]: Depth estomation results of the
+            input images. Each SegDataSample usually contain:
+
+            - ``pred_depth_map``(PixelData): Prediction of depth estimation.
+        """
+        batch_size, C, H, W = depth.shape
+
+        if data_samples is None:
+            data_samples = [SegDataSample() for _ in range(batch_size)]
+            only_prediction = True
+        else:
+            only_prediction = False
+
+        for i in range(batch_size):
+            if not only_prediction:
+                img_meta = data_samples[i].metainfo
+                # remove padding area
+                if 'img_padding_size' not in img_meta:
+                    padding_size = img_meta.get('padding_size', [0] * 4)
+                else:
+                    padding_size = img_meta['img_padding_size']
+                padding_left, padding_right, padding_top, padding_bottom =\
+                    padding_size
+                # i_depth shape is 1, C, H, W after remove padding
+                i_depth = depth[i:i + 1, :, padding_top:H - padding_bottom,
+                                padding_left:W - padding_right]
+
+                flip = img_meta.get('flip', None)
+                if flip:
+                    flip_direction = img_meta.get('flip_direction', None)
+                    assert flip_direction in ['horizontal', 'vertical']
+                    if flip_direction == 'horizontal':
+                        i_depth = i_depth.flip(dims=(3, ))
+                    else:
+                        i_depth = i_depth.flip(dims=(2, ))
+
+                # resize as original shape
+                i_depth = resize(
+                    i_depth,
+                    size=img_meta['ori_shape'],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    warning=False).squeeze(0)
+            else:
+                i_depth = depth[i]
+
+            data_samples[i].set_data(
+                {'pred_depth_map': PixelData(**{'data': i_depth})})
+
+        return data_samples
--- a/finetune/mmseg/models/segmentors/encoder_decoder.py
+++ b/finetune/mmseg/models/segmentors/encoder_decoder.py
@@ -0,0 +1,364 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import List, Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.logging import print_log
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from .base import BaseSegmentor
+
+
+@MODELS.register_module()
+class EncoderDecoder(BaseSegmentor):
+    """Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+
+    1. The ``loss`` method is used to calculate the loss of model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+     loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
+     _decode_head_forward_train(): decode_head.loss()
+     _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
+
+    2. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    seg_logits (2) Call post-processing function to obtain list of
+    ``SegDataSample`` including ``pred_sem_seg`` and ``seg_logits``.
+
+    .. code:: text
+
+     predict(): inference() -> postprocess_result()
+     infercen(): whole_inference()/slide_inference()
+     whole_inference()/slide_inference(): encoder_decoder()
+     encoder_decoder(): extract_feat() -> decode_head.predict()
+
+    3. The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2)Call the decode head forward function to forward decode head model.
+
+    .. code:: text
+
+     _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+
+        backbone (ConfigType): The config for the backnone of segmentor.
+        decode_head (ConfigType): The config for the decode head of segmentor.
+        neck (OptConfigType): The config for the neck of segmentor.
+            Defaults to None.
+        auxiliary_head (OptConfigType): The config for the auxiliary head of
+            segmentor. Defaults to None.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
+    """  # noqa: E501
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 decode_head: ConfigType,
+                 neck: OptConfigType = None,
+                 auxiliary_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 pretrained: Optional[str] = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        if pretrained is not None:
+            assert backbone.get('pretrained') is None, \
+                'both backbone and segmentor set pretrained weight'
+            backbone.pretrained = pretrained
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        self._init_decode_head(decode_head)
+        self._init_auxiliary_head(auxiliary_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        assert self.with_decode_head
+
+    def _init_decode_head(self, decode_head: ConfigType) -> None:
+        """Initialize ``decode_head``"""
+        self.decode_head = MODELS.build(decode_head)
+        self.align_corners = self.decode_head.align_corners
+        self.num_classes = self.decode_head.num_classes
+        self.out_channels = self.decode_head.out_channels
+
+    def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None:
+        """Initialize ``auxiliary_head``"""
+        if auxiliary_head is not None:
+            if isinstance(auxiliary_head, list):
+                self.auxiliary_head = nn.ModuleList()
+                for head_cfg in auxiliary_head:
+                    self.auxiliary_head.append(MODELS.build(head_cfg))
+            else:
+                self.auxiliary_head = MODELS.build(auxiliary_head)
+
+    def extract_feat(self, inputs: Tensor) -> List[Tensor]:
+        """Extract features from images."""
+        x = self.backbone(inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def encode_decode(self, inputs: Tensor,
+                      batch_img_metas: List[dict]) -> Tensor:
+        """Encode images with backbone and decode into a semantic segmentation
+        map of the same size as input."""
+        x = self.extract_feat(inputs)
+        seg_logits = self.decode_head.predict(x, batch_img_metas,
+                                              self.test_cfg)
+
+        return seg_logits
+
+    def _decode_head_forward_train(self, inputs: List[Tensor],
+                                   data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for decode head in
+        training."""
+        losses = dict()
+        loss_decode = self.decode_head.loss(inputs, data_samples,
+                                            self.train_cfg)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def _auxiliary_head_forward_train(self, inputs: List[Tensor],
+                                      data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for auxiliary head in
+        training."""
+        losses = dict()
+        if isinstance(self.auxiliary_head, nn.ModuleList):
+            for idx, aux_head in enumerate(self.auxiliary_head):
+                loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg)
+                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+        else:
+            loss_aux = self.auxiliary_head.loss(inputs, data_samples,
+                                                self.train_cfg)
+            losses.update(add_prefix(loss_aux, 'aux'))
+
+        return losses
+
+    def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Tensor): Input images.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+
+        x = self.extract_feat(inputs)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(x, data_samples)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(x, data_samples)
+            losses.update(loss_aux)
+
+        return losses
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: OptSampleList = None) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`], optional): The seg data
+                samples. It usually includes information such as `metainfo`
+                and `gt_sem_seg`.
+
+        Returns:
+            list[:obj:`SegDataSample`]: Segmentation results of the
+            input images. Each SegDataSample usually contain:
+
+            - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation.
+            - ``seg_logits``(PixelData): Predicted logits of semantic
+                segmentation before normalization.
+        """
+        if data_samples is not None:
+            batch_img_metas = [
+                data_sample.metainfo for data_sample in data_samples
+            ]
+        else:
+            batch_img_metas = [
+                dict(
+                    ori_shape=inputs.shape[2:],
+                    img_shape=inputs.shape[2:],
+                    pad_shape=inputs.shape[2:],
+                    padding_size=[0, 0, 0, 0])
+            ] * inputs.shape[0]
+
+        seg_logits = self.inference(inputs, batch_img_metas)
+
+        return self.postprocess_result(seg_logits, data_samples)
+
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_sem_seg`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(inputs)
+        return self.decode_head.forward(x)
+
+    def slide_inference(self, inputs: Tensor,
+                        batch_img_metas: List[dict]) -> Tensor:
+        """Inference by sliding-window with overlap.
+
+        If h_crop > h_img or w_crop > w_img, the small patch will be used to
+        decode without padding.
+
+        Args:
+            inputs (tensor): the tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = inputs.size()
+        out_channels = self.out_channels
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+        count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = inputs[:, :, y1:y2, x1:x2]
+                # change the image shape to patch shape
+                batch_img_metas[0]['img_shape'] = crop_img.shape[2:]
+                # the output of encode_decode is seg logits tensor map
+                # with shape [N, C, H, W]
+                crop_seg_logit = self.encode_decode(crop_img, batch_img_metas)
+                preds += F.pad(crop_seg_logit,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        seg_logits = preds / count_mat
+
+        return seg_logits
+
+    def whole_inference(self, inputs: Tensor,
+                        batch_img_metas: List[dict]) -> Tensor:
+        """Inference with full image.
+
+        Args:
+            inputs (Tensor): The tensor should have a shape NxCxHxW, which
+                contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+
+        seg_logits = self.encode_decode(inputs, batch_img_metas)
+
+        return seg_logits
+
+    def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor:
+        """Inference with slide/whole style.
+
+        Args:
+            inputs (Tensor): The input image of shape (N, 3, H, W).
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', 'pad_shape', and 'padding_size'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+        assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole'], \
+            f'Only "slide" or "whole" test mode are supported, but got ' \
+            f'{self.test_cfg["mode"]}.'
+        ori_shape = batch_img_metas[0]['ori_shape']
+        if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas):
+            print_log(
+                'Image shapes are different in the batch.',
+                logger='current',
+                level=logging.WARN)
+        if self.test_cfg.mode == 'slide':
+            seg_logit = self.slide_inference(inputs, batch_img_metas)
+        else:
+            seg_logit = self.whole_inference(inputs, batch_img_metas)
+
+        return seg_logit
+
+    def aug_test(self, inputs, batch_img_metas, rescale=True):
+        """Test with augmentations.
+
+        Only rescale=True is supported.
+        """
+        # aug_test rescale all imgs back to ori_shape for now
+        assert rescale
+        # to save memory, we get augmented seg logit inplace
+        seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale)
+        for i in range(1, len(inputs)):
+            cur_seg_logit = self.inference(inputs[i], batch_img_metas[i],
+                                           rescale)
+            seg_logit += cur_seg_logit
+        seg_logit /= len(inputs)
+        seg_pred = seg_logit.argmax(dim=1)
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
--- a/finetune/mmseg/models/segmentors/multimodal_encoder_decoder.py
+++ b/finetune/mmseg/models/segmentors/multimodal_encoder_decoder.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from .base import BaseSegmentor
+
+
+@MODELS.register_module()
+class MultimodalEncoderDecoder(BaseSegmentor):
+    """Multimodal Encoder-Decoder segmentors.
+
+    Multimodal segmentation architecture is used for open-vocabulary
+    semantic segmentation with combining the visual and language
+    pretrain models. It consists of a image_encoder (backbone) to extract
+    visual feature, a text encoder to extract text feature, and a decode
+    head to generate semantic maps.
+    Note that the deep supervision during training is implemented in decode head.
+
+    1. The ``loss`` method is used to calculate the loss of model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+     loss(): extract_feat() -> _decode_head_forward_train()
+     _decode_head_forward_train(): decode_head.loss()
+
+    2. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    seg_logits (2) Call post-processing function to obtain list of
+    ``SegDataSampel`` including ``pred_sem_seg`` and ``seg_logits``.
+
+    .. code:: text
+
+     predict(): inference() -> postprocess_result()
+     inference(): whole_inference()/slide_inference()
+     whole_inference()/slide_inference(): encoder_decoder()
+     encoder_decoder(): extract_feat() -> decode_head.predict()
+
+    3. The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2)Call the decode head forward function to forward decode head model.
+
+    .. code:: text
+
+     _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+
+        image_encoder (ConfigType): The config for the visual encoder of segmentor.
+        text_encoder ((ConfigType): The config for the text encoder of segmentor.
+        decode_head (ConfigType): The config for the decode head of segmentor.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        asymetric_input (bool): whether to use different size of input for image encoder
+            and decode head. Defaults to False.
+        encoder_resolution (float): resize scale of input images for image encoder.
+            Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
+    """  # noqa: E501
+
+    def __init__(self,
+                 image_encoder: ConfigType,
+                 text_encoder: ConfigType,
+                 decode_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 pretrained: Optional[str] = None,
+                 asymetric_input: bool = True,
+                 encoder_resolution: float = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        if pretrained is not None:
+            image_encoder.init_cfg = dict(
+                type='Pretrained_Part', checkpoint=pretrained)
+            text_encoder.init_cfg = dict(
+                type='Pretrained_Part', checkpoint=pretrained)
+            decode_head.init_cfg = dict(
+                type='Pretrained_Part', checkpoint=pretrained)
+
+        if asymetric_input:
+            assert encoder_resolution is not None, \
+                'if asymetric_input set True, ' \
+                'clip_resolution must be a certain value'
+        self.asymetric_input = asymetric_input
+        self.encoder_resolution = encoder_resolution
+        self.image_encoder = MODELS.build(image_encoder)
+        self.text_encoder = MODELS.build(text_encoder)
+        self._init_decode_head(decode_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        assert self.with_decode_head
+
+    def _init_decode_head(self, decode_head: ConfigType) -> None:
+        """Initialize ``decode_head``"""
+        self.decode_head = MODELS.build(decode_head)
+        self.align_corners = self.decode_head.align_corners
+        self.num_classes = self.decode_head.num_classes
+        self.out_channels = self.decode_head.out_channels
+
+    def extract_feat(self, inputs: Tensor) -> List[Tensor]:
+        """Extract visual features from images."""
+        x = self.image_encoder(inputs)
+        return x
+
+    def encode_decode(self, inputs: Tensor,
+                      batch_img_metas: List[dict]) -> Tensor:
+        """Encode the name of classes with text_encoder and encode images with
+        image_encoder.
+
+        Then decode the class embedding and visual feature into a semantic
+        segmentation map of the same size as input.
+        """
+        classifier_embeds = self.text_encoder()
+        clip_inputs = inputs
+        if self.asymetric_input:
+            clip_inputs = F.interpolate(
+                inputs, scale_factor=self.encoder_resolution, mode='bilinear')
+        x = self.image_encoder(clip_inputs)
+        seg_logits = self.decode_head.predict([inputs, x, classifier_embeds],
+                                              batch_img_metas, self.test_cfg)
+
+        return seg_logits
+
+    def _decode_head_forward_train(self, inputs: List[Tensor],
+                                   data_samples: SampleList) -> dict:
+        """Run forward function and calculate loss for decode head in
+        training."""
+        losses = dict()
+        loss_decode = self.decode_head.loss(inputs, data_samples,
+                                            self.train_cfg)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Tensor): Input images.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        classifier_embeds = self.text_encoder()
+        clip_inputs = inputs
+        if self.asymetric_input:
+            clip_inputs = F.interpolate(
+                inputs, scale_factor=self.encoder_resolution, mode='bilinear')
+        x = self.image_encoder(clip_inputs)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(
+            [inputs, x, classifier_embeds], data_samples)
+        losses.update(loss_decode)
+
+        return losses
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: OptSampleList = None) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`], optional): The seg data
+                samples. It usually includes information such as `metainfo`
+                and `gt_sem_seg`.
+
+        Returns:
+            list[:obj:`SegDataSample`]: Segmentation results of the
+            input images. Each SegDataSample usually contain:
+
+            - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation.
+            - ``seg_logits``(PixelData): Predicted logits of semantic
+                segmentation before normalization.
+        """
+        if data_samples is not None:
+            batch_img_metas = [
+                data_sample.metainfo for data_sample in data_samples
+            ]
+        else:
+            batch_img_metas = [
+                dict(
+                    ori_shape=inputs.shape[2:],
+                    img_shape=inputs.shape[2:],
+                    pad_shape=inputs.shape[2:],
+                    padding_size=[0, 0, 0, 0])
+            ] * inputs.shape[0]
+
+        seg_logits = self.inference(inputs, batch_img_metas)
+
+        return self.postprocess_result(seg_logits, data_samples)
+
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_sem_seg`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(inputs)
+        return self.decode_head.forward(x)
+
+    def slide_inference(self, inputs: Tensor,
+                        batch_img_metas: List[dict]) -> Tensor:
+        """Inference by sliding-window with overlap.
+
+        If h_crop > h_img or w_crop > w_img, the small patch will be used to
+        decode without padding.
+
+        Args:
+            inputs (tensor): the tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = inputs.size()
+        out_channels = self.out_channels
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+        count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = inputs[:, :, y1:y2, x1:x2]
+                # change the image shape to patch shape
+                batch_img_metas[0]['img_shape'] = crop_img.shape[2:]
+                # the output of encode_decode is seg logits tensor map
+                # with shape [N, C, H, W]
+                crop_seg_logit = self.encode_decode(crop_img, batch_img_metas)
+                preds += F.pad(crop_seg_logit,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        seg_logits = preds / count_mat
+
+        return seg_logits
+
+    def whole_inference(self, inputs: Tensor,
+                        batch_img_metas: List[dict]) -> Tensor:
+        """Inference with full image.
+
+        Args:
+            inputs (Tensor): The tensor should have a shape NxCxHxW, which
+                contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+
+        seg_logits = self.encode_decode(inputs, batch_img_metas)
+
+        return seg_logits
+
+    def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor:
+        """Inference with slide/whole style.
+
+        Args:
+            inputs (Tensor): The input image of shape (N, 3, H, W).
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', 'pad_shape', and 'padding_size'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The segmentation results, seg_logits from model of each
+                input image.
+        """
+
+        assert self.test_cfg.mode in ['slide', 'whole']
+        ori_shape = batch_img_metas[0]['ori_shape']
+        assert all(_['ori_shape'] == ori_shape for _ in batch_img_metas)
+        if self.test_cfg.mode == 'slide':
+            seg_logit = self.slide_inference(inputs, batch_img_metas)
+        else:
+            seg_logit = self.whole_inference(inputs, batch_img_metas)
+
+        return seg_logit
+
+    def aug_test(self, inputs, batch_img_metas, rescale=True):
+        """Test with augmentations.
+
+        Only rescale=True is supported.
+        """
+        # aug_test rescale all imgs back to ori_shape for now
+        assert rescale
+        # to save memory, we get augmented seg logit inplace
+        seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale)
+        for i in range(1, len(inputs)):
+            cur_seg_logit = self.inference(inputs[i], batch_img_metas[i],
+                                           rescale)
+            seg_logit += cur_seg_logit
+        seg_logit /= len(inputs)
+        seg_pred = seg_logit.argmax(dim=1)
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
--- a/finetune/mmseg/models/segmentors/seg_tta.py
+++ b/finetune/mmseg/models/segmentors/seg_tta.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmengine.model import BaseTTAModel
+from mmengine.structures import PixelData
+
+from mmseg.registry import MODELS
+from mmseg.utils import SampleList
+
+
+@MODELS.register_module()
+class SegTTAModel(BaseTTAModel):
+
+    def merge_preds(self, data_samples_list: List[SampleList]) -> SampleList:
+        """Merge predictions of enhanced data to one prediction.
+
+        Args:
+            data_samples_list (List[SampleList]): List of predictions
+                of all enhanced data.
+
+        Returns:
+            SampleList: Merged prediction.
+        """
+        predictions = []
+        for data_samples in data_samples_list:
+            seg_logits = data_samples[0].seg_logits.data
+            logits = torch.zeros(seg_logits.shape).to(seg_logits)
+            for data_sample in data_samples:
+                seg_logit = data_sample.seg_logits.data
+                if self.module.out_channels > 1:
+                    logits += seg_logit.softmax(dim=0)
+                else:
+                    logits += seg_logit.sigmoid()
+            logits /= len(data_samples)
+            if self.module.out_channels == 1:
+                seg_pred = (logits > self.module.decode_head.threshold
+                            ).to(logits).squeeze(1)
+            else:
+                seg_pred = logits.argmax(dim=0)
+            data_sample.set_data({'pred_sem_seg': PixelData(data=seg_pred)})
+            if hasattr(data_samples[0], 'gt_sem_seg'):
+                data_sample.set_data(
+                    {'gt_sem_seg': data_samples[0].gt_sem_seg})
+            data_sample.set_metainfo({'img_path': data_samples[0].img_path})
+            predictions.append(data_sample)
+        return predictions