# Training configuration: task `segmentation`, dataset `pretraining_loader`,
# model `SkySensePP`.
#
# NOTE(review): the nesting in this file was reconstructed from key semantics.
# Placements that could not be determined unambiguously carry their own
# NOTE(review) comment; confirm them against the code that reads this config.
#
# Parenthesised values such as (512, 512) are plain YAML scalars, i.e. they
# load as strings. Presumably the consumer converts them to tuples; verify.
---
task_attributes:
  segmentation:
    dataset_attributes:
      pretraining_loader:
        data_root_dir: 'pretrain_datasets/'
        # One annotation JSON per dataset for the train split.
        train_json_path_list:
          - 'dynamic-mm/dynamic-mm_train.json'
          - 'pastis-mm/pastis-mm_train.json'
          - 'vaihingen/vaihingen_train.json'
          - 'deepglobe/deepglobe_train.json'
          - 'potsdam/potsdam_train.json'
          - 'fbp/fbp_train.json'
          - 'loveda/loveda_train.json'
          - 'isaid/isaid_train.json'
          - 'jl16-mm/jl16-mm_train.json'
          - 'flair-mm/flair-mm_train.json'
          - 's2naip-mm/s2naip-mm_train.json'
          - 'dfc20-mm/dfc20-mm_train.json'
          - 'c2segab-mm/c2segab-mm_train.json'
        # Same datasets, same order, val split.
        val_json_path_list:
          - 'dynamic-mm/dynamic-mm_val.json'
          - 'pastis-mm/pastis-mm_val.json'
          - 'vaihingen/vaihingen_val.json'
          - 'deepglobe/deepglobe_val.json'
          - 'potsdam/potsdam_val.json'
          - 'fbp/fbp_val.json'
          - 'loveda/loveda_val.json'
          - 'isaid/isaid_val.json'
          - 'jl16-mm/jl16-mm_val.json'
          - 'flair-mm/flair-mm_val.json'
          - 's2naip-mm/s2naip-mm_val.json'
          - 'dfc20-mm/dfc20-mm_val.json'
          - 'c2segab-mm/c2segab-mm_val.json'
        use_multi_pairs: True
        seq_len: 1
        half_mask_ratio: 0.3
        min_random_scale: 0.3
        cls_repeat_cnt: 2000
        # Per-source sizes; the keys hr/s2/s1 match `sources` in the model
        # section below.
        image_size:
          hr: (512, 512)
          s2: (16, 16)
          s1: (16, 16)
          anno: (512, 512)
        # NOTE(review): `mim` is placed as a sibling of `image_size` under the
        # dataset entry; confirm it is not expected one level higher.
        mim:
          input_size: (1024, 512)
          patch_size: 128
          mask_ratio: 0.5

model_attributes:
  SkySensePP:
    sources: ['hr', 's2', 's1']
    use_modal_vae: True
    use_ctpe: False
    use_cls_token_uper_head: False
    # NOTE(review): key is spelled `upsacle_results` (not "upscale"). Kept
    # as-is because the consumer presumably looks up this exact name.
    upsacle_results: True
    calendar_time: 365
    vocabulary_size: 64

    backbone_hr:
      type: 'SwinTransformerV2MSL'
      arch: 'huge'
      use_attn: True
      merge_stage: 2
      vocabulary_size: 64
      img_size: 224
      patch_size: 4
      in_channels: 3
      window_size: 8
      drop_rate: 0.
      drop_path_rate: 0.2
      out_indices: (0,1,2,3)
      use_abs_pos_embed: False
      interpolate_mode: 'bicubic'
      with_cp: True
      frozen_stages: -1
      norm_eval: False
      pad_small_map: False
      pretrained_window_sizes: [0, 0, 0, 0]
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_backbone_hr.pth'

    backbone_s2:
      type: 'VisionTransformerMSL'
      img_size: (16, 16)
      use_attn: False
      merge_stage: 4
      vocabulary_size: 64
      patch_size: 4
      in_channels: 10
      embed_dims: 1024
      num_layers: 24
      num_heads: 16
      mlp_ratio: 4
      out_indices: (5,11,17,23)
      qkv_bias: True
      drop_rate: 0.
      attn_drop_rate: 0.
      drop_path_rate: 0.3
      with_cls_token: False
      output_cls_token: False
      act_cfg:
        type: 'GELU'
      norm_cfg:
        type: 'LN'
        # NOTE(review): PyYAML's default resolver reads `1e-6` (no decimal
        # point) as a string, not a float. Confirm the loader handles this,
        # or write 1.0e-6.
        eps: 1e-6
      with_cp: True
      interpolate_mode: 'bicubic'
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_backbone_s2.pth'

    head_s2:
      type: 'UPHead'
      in_dim: 1024
      out_dim: 2816  # 2816
      up_scale: 4
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_head_s2.pth'

    # Same settings as backbone_s2 except in_channels and the checkpoint.
    backbone_s1:
      type: 'VisionTransformerMSL'
      img_size: (16, 16)
      use_attn: False
      merge_stage: 4
      vocabulary_size: 64
      patch_size: 4
      in_channels: 2
      embed_dims: 1024
      num_layers: 24
      num_heads: 16
      mlp_ratio: 4
      out_indices: (5,11,17,23)
      qkv_bias: True
      drop_rate: 0.
      attn_drop_rate: 0.
      drop_path_rate: 0.3
      with_cls_token: False
      output_cls_token: False
      act_cfg:
        type: 'GELU'
      norm_cfg:
        type: 'LN'
        # NOTE(review): see the note on backbone_s2.norm_cfg.eps regarding
        # exponent notation without a decimal point.
        eps: 1e-6
      with_cp: True
      interpolate_mode: 'bicubic'
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_backbone_s1.pth'

    head_s1:
      type: 'UPHead'
      in_dim: 1024
      out_dim: 2816  # 2816
      up_scale: 4
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_head_s1.pth'

    rec_head_hr:
      type: 'UPerHead'
      # Five input feature maps; in_index selects them in order.
      in_channels: [704, 704, 1408, 2816, 1024]
      in_index: [0, 1, 2, 3, 4]
      pool_scales: (1, 2, 3, 6)
      channels: 512
      dropout_ratio: 0.1
      # NOTE(review): 65 = vocabulary_size (64) + 1; presumably the extra
      # class is background (cf. `use_bg` in RecLoss) — confirm.
      num_classes: 65
      norm_cfg:
        type: 'SyncBN'
        requires_grad: true
      align_corners: false

    necks:
      type: 'TransformerEncoder'
      input_dims: 2816
      embed_dims: 1024
      num_layers: 24
      num_heads: 16
      mlp_ratio: 4
      qkv_bias: True
      drop_rate: 0.
      attn_drop_rate: 0.
      drop_path_rate: 0.3
      with_cls_token: True
      output_cls_token: True
      norm_cfg:
        type: 'LN'
      act_cfg:
        type: 'GELU'
      num_fcs: 2
      norm_eval: False
      with_cp: True
      init_cfg:
        type: Pretrained
        checkpoint: 'pretrain/skysense_model_fusion.pth'

    modality_vae:
      type: 'ModalityCompletion'
      input_shape_hr: [2816, 32, 16]
      input_shape_s2: [2816, 32, 16]
      input_shape_s1: [2816, 32, 16]
      conv_dim: 256
      z_dim: 256
      n_codebook: 8192

    metrics:
      - type: 'sem_metric'

    losses:
      - type: 'RecLoss'
        params:
          weight: 1.0
          patch_size: 4
          balance: True
          use_all_patch: True
          vocabulary_size: 64
          feature_merged: True
          pred_key: 'logits_hr'
          mask_key: 'mask_hr'
          target_key: 'mapped_targets'
          use_bg: True
      - type: 'ModalityVAELoss'
        params:
          weight: 1.0

optimizer_attributes:
  type: AdamW
  params:
    # NOTE(review): PyYAML's default resolver reads `2e-04` (no decimal
    # point) as a string, not a float. Confirm the loader handles this, or
    # write 2.0e-04.
    lr: 2e-04
    betas: (0.9, 0.999)
    weight_decay: 0.04
  # NOTE(review): `lr_parameters` is placed under `optimizer_attributes`
  # because it directly follows `params`; confirm it is not a top-level key.
  lr_parameters:
    layer_decay: 0.7
    frozen_blocks: 12
    frozen_fusion_blocks_start: 3

training_parameters:
  trainer: 'seg_trainer'
  run_type: train
  seed: 24042301
  pin_memory: True
  batch_size: 256
  test_batch_size: 128
  num_workers: 16
  max_iterations: 30000
  num_warmup_steps: 1000
  log_interval: 50
  snapshot_interval: 3000
  cos_lr: False
  clip_norm_mode: all
  clip_gradients: true
  max_grad_l2_norm: 5
  enable_tf32: False
  enable_amp: True
  find_unused_parameters: True
  synchronized_loss: True
  static_graph: True
  replace_speedup_op: True
  ema: False
  # NOTE(review): `distributed_batch_sampler` is nested here because it does
  # not follow the *_attributes / *_parameters naming of the top-level
  # sections; confirm against the trainer. Its `batch_size` is a separate key
  # from `training_parameters.batch_size` above.
  distributed_batch_sampler:
    batch_size: 8

# NOTE(review): placed at top level by analogy with the other *_attributes
# sections; confirm it is not expected inside `training_parameters`.
amp_attributes:
  amp_escapes: Conv2d
  opt_level: O1
  init_scale: 1