init

2025-12-08 21:38:53 +08:00
commit 71118fc649
22 changed files with 4780 additions and 0 deletions
--- a/train.py
+++ b/train.py
@@ -0,0 +1,480 @@
+# -------------------------------------#
+#       对数据集进行训练
+# -------------------------------------#
+import datetime
+import os
+import random
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+from data.dataloader_for_IRSTD_UAV import seqDataset, dataset_collate
+from model.TDCNet.TDCNetwork import TDCNetwork
+from model.nets.yolo_training import (ModelEMA, YOLOLoss, get_lr_scheduler, set_optimizer_lr, weights_init)
+from utils.callbacks import EvalCallback, LossHistory
+from utils.utils import get_classes, show_config
+from utils.utils_fit import fit_one_epoch
+
+if __name__ == "__main__":
+    # Default to GPU 0, but keep any CUDA_VISIBLE_DEVICES already exported by
+    # the launching shell (e.g. `CUDA_VISIBLE_DEVICES=0,1 python train.py`).
+    # An unconditional assignment here would override that selection and make
+    # the multi-GPU DP/DDP launch commands ineffective.
+    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
+    # ---------------------------------#
+    #   num_frame   number of input frames
+    # ---------------------------------#
+    num_frame = 5
+    # ---------------------------------#
+    #   Cuda    whether to use CUDA
+    #           set to False when no GPU is available
+    # ---------------------------------#
+    Cuda = True
+    # ---------------------------------------------------------------------#
+    #   distributed     whether to run single-machine multi-GPU distributed training
+    #                   The terminal commands below only work on Ubuntu, where
+    #                   CUDA_VISIBLE_DEVICES selects the GPUs to use.
+    #                   On Windows, DP mode uses all GPUs by default; DDP is not supported.
+    #   DP mode:
+    #       set             distributed = False
+    #       run             CUDA_VISIBLE_DEVICES=0,1 python train.py
+    #   DDP mode:
+    #       set             distributed = True
+    #       run             CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
+    # ---------------------------------------------------------------------#
+    distributed = False
+    # ---------------------------------------------------------------------#
+    #   sync_bn     whether to use sync_bn; available for multi-GPU DDP mode
+    # ---------------------------------------------------------------------#
+    sync_bn = False
+    # ---------------------------------------------------------------------#
+    #   fp16        whether to use mixed-precision training
+    #               can cut GPU memory roughly in half; requires pytorch 1.7.1 or later
+    # ---------------------------------------------------------------------#
+    fp16 = False
+    # ---------------------------------------------------------------------#
+    #   classes_path    points to the txt under model_data, tied to your own dataset
+    #                   be sure to update classes_path before training so it matches your dataset
+    #   model_path      pretrained weights file; an empty string skips weight loading
+    #   input_shape     input size; input_shape[0] is the value handed to seqDataset
+    # ---------------------------------------------------------------------#
+    classes_path = 'model_data/classes.txt'
+    model_path = ''
+    input_shape = [640, 640]
+    # ----------------------------------------------------------------------------------------------------------------------------#
+    #   Training has two stages: a frozen stage and an unfrozen stage. The frozen stage exists for machines with limited resources.
+    #   Frozen training needs less GPU memory; on a very weak GPU, set Freeze_Epoch equal to UnFreeze_Epoch and Freeze_Train = True
+    #   so that only frozen training is performed.
+    #
+    #   Suggested settings (adjust them to your own needs):
+    #   (1) Training from pretrained weights of the whole model:
+    #       Adam:
+    #           Init_Epoch = 0, Freeze_Epoch = 50, UnFreeze_Epoch = 100, Freeze_Train = True, optimizer_type = 'adam', Init_lr = 1e-3, weight_decay = 0. (frozen)
+    #           Init_Epoch = 0, UnFreeze_Epoch = 100, Freeze_Train = False, optimizer_type = 'adam', Init_lr = 1e-3, weight_decay = 0. (not frozen)
+    #       SGD:
+    #           Init_Epoch = 0, Freeze_Epoch = 50, UnFreeze_Epoch = 300, Freeze_Train = True, optimizer_type = 'sgd', Init_lr = 1e-2, weight_decay = 5e-4. (frozen)
+    #           Init_Epoch = 0, UnFreeze_Epoch = 300, Freeze_Train = False, optimizer_type = 'sgd', Init_lr = 1e-2, weight_decay = 5e-4. (not frozen)
+    #       Note: UnFreeze_Epoch can be tuned between 100 and 300.
+    #   (2) Training from scratch:
+    #       Init_Epoch = 0, UnFreeze_Epoch >= 300, Unfreeze_batch_size >= 16, Freeze_Train = False (no frozen training)
+    #       Note: UnFreeze_Epoch should preferably be at least 300. optimizer_type = 'sgd', Init_lr = 1e-2, mosaic = True.
+    #   (3) Choosing batch_size:
+    #       As large as the GPU can handle. Running out of memory is unrelated to dataset size; on OOM / CUDA out of memory, reduce batch_size.
+    #       Because of the BatchNorm layers, batch_size must be at least 2; it cannot be 1.
+    #       Normally Freeze_batch_size should be 1-2x Unfreeze_batch_size. Avoid a large gap, since it affects the automatic learning-rate adjustment.
+    # ----------------------------------------------------------------------------------------------------------------------------#
+    # ------------------------------------------------------------------#
+    #   Frozen-stage training parameters
+    #   The backbone is frozen here, so the feature extractor does not change;
+    #   less GPU memory is used and the network is only fine-tuned.
+    #   Init_Epoch          epoch the run starts from; it may exceed Freeze_Epoch, e.g.
+    #                       Init_Epoch = 60, Freeze_Epoch = 50, UnFreeze_Epoch = 100
+    #                       skips the frozen stage, starts at epoch 60 and adjusts the learning rate accordingly.
+    #                       (used when resuming from a checkpoint)
+    #   Freeze_Epoch        number of epochs of frozen training
+    #                       (ignored when Freeze_Train=False)
+    #   Freeze_batch_size   batch_size during frozen training
+    #                       (ignored when Freeze_Train=False)
+    # ------------------------------------------------------------------#
+    Init_Epoch = 0
+    Freeze_Epoch = 100
+    Freeze_batch_size = 4
+    # ------------------------------------------------------------------#
+    #   Unfrozen-stage training parameters
+    #   The backbone is no longer frozen, so the feature extractor changes;
+    #   more GPU memory is used and all network parameters are updated.
+    #   UnFreeze_Epoch          total number of training epochs
+    #                           SGD takes longer to converge, so use a larger UnFreeze_Epoch
+    #                           Adam can use a relatively smaller UnFreeze_Epoch
+    #   Unfreeze_batch_size     batch_size after unfreezing
+    # ------------------------------------------------------------------#
+    UnFreeze_Epoch = 100
+    Unfreeze_batch_size = 4
+    # ------------------------------------------------------------------#
+    #   Freeze_Train    whether to perform frozen training
+    #                   when enabled, the backbone is trained frozen first and then unfrozen.
+    # ------------------------------------------------------------------#
+    Freeze_Train = False
+
+    # ------------------------------------------------------------------#
+    #   Other training parameters: learning rate, optimizer, lr decay
+    # ------------------------------------------------------------------#
+    # ------------------------------------------------------------------#
+    #   Init_lr         maximum learning rate of the model
+    #   Min_lr          minimum learning rate, by default 0.01 of the maximum
+    # ------------------------------------------------------------------#
+    Init_lr = 1e-3
+    Min_lr = Init_lr * 0.01
+    # ------------------------------------------------------------------#
+    #   optimizer_type  optimizer to use; options are adam and sgd
+    #                   with Adam, Init_lr=1e-3 is recommended
+    #                   with SGD,  Init_lr=1e-2 is recommended
+    #   momentum        momentum parameter used inside the optimizer
+    #   weight_decay    weight decay, helps against overfitting
+    #                   adam mishandles weight_decay; 0 is recommended when using adam.
+    #   NOTE(review): optimizer_type is "adam" while weight_decay is 1e-4, which
+    #   contradicts the recommendation above - confirm this is intentional.
+    # ------------------------------------------------------------------#
+    optimizer_type = "adam"
+    momentum = 0.937
+    weight_decay = 1e-4
+    # ------------------------------------------------------------------#
+    #   lr_decay_type   learning-rate decay schedule; options are step and cos
+    # ------------------------------------------------------------------#
+    lr_decay_type = "cos"
+    # ------------------------------------------------------------------#
+    #   save_period     save the weights every save_period epochs
+    # ------------------------------------------------------------------#
+    save_period = 10
+    # ------------------------------------------------------------------#
+    #   save_dir        folder where weights and log files are saved
+    # ------------------------------------------------------------------#
+    save_dir = f'logs/TDCNet_epoch_{UnFreeze_Epoch}_batch_{Unfreeze_batch_size}_optim_{optimizer_type}_lr_{Init_lr}_T_{num_frame}'
+    # ------------------------------------------------------------------#
+    #   eval_flag       whether to evaluate on the validation set during training
+    #                   installing pycocotools gives a better evaluation experience.
+    #   eval_period     evaluate every eval_period epochs; frequent evaluation is discouraged
+    #                   evaluation is slow, so evaluating often makes training very slow
+    #   The mAP obtained here differs from the one produced by get_map.py for two reasons:
+    #   (1) the mAP here is computed on the validation set.
+    #   (2) the evaluation parameters here are conservative, to speed evaluation up.
+    #   NOTE(review): eval_period (200) is larger than UnFreeze_Epoch (100); if
+    #   EvalCallback only fires every `period` epochs, evaluation may never run - confirm.
+    # ------------------------------------------------------------------#
+    eval_flag = True
+    eval_period = 200
+    # ------------------------------------------------------------------#
+    #   num_workers     number of worker processes used to read data
+    #                   speeds up data loading but uses more memory
+    #                   machines with little memory can set this to 2 or 0
+    # ------------------------------------------------------------------#
+    num_workers = 8
+
+    # ----------------------------------------------------#
+    #   Image paths and annotation files
+    # ----------------------------------------------------#
+
+    DATA_PATH = "/Dataset/IRSTD-UAV/"
+    train_annotation_path = "/Dataset/IRSTD-UAV/train.txt"
+    val_annotation_path = "/Dataset/IRSTD-UAV/val.txt"
+
+    # ------------------------------------------------------#
+    #   Select the device(s) to train on
+    # ------------------------------------------------------#
+    ngpus_per_node = torch.cuda.device_count()
+    if not distributed:
+        # Single-process run: rank identifiers are fixed at 0.
+        if torch.cuda.is_available():
+            device = torch.device('cuda')
+        else:
+            device = torch.device('cpu')
+        local_rank = 0
+        rank = 0
+    else:
+        # DDP run: the launcher provides LOCAL_RANK / RANK in the environment.
+        dist.init_process_group(backend="nccl")
+        local_rank = int(os.environ["LOCAL_RANK"])
+        rank = int(os.environ["RANK"])
+        device = torch.device("cuda", local_rank)
+        if local_rank == 0:
+            print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
+            print("Gpu Device Count : ", ngpus_per_node)
+
+    # ------------------------------------------------------#
+    #   Seed every random number generator used by the run
+    # ------------------------------------------------------#
+    seed = 42
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    # Ask cuDNN for deterministic algorithms.
+    cudnn.deterministic = True
+
+    class_names, num_classes = get_classes(classes_path)
+
+    # Build the network from the class count read out of classes_path so it
+    # agrees with YOLOLoss and EvalCallback, which are both built from
+    # num_classes (previously hard-coded to 1 here).
+    model = TDCNetwork(num_classes=num_classes, num_frame=num_frame)
+
+    weights_init(model)
+    if model_path != '':
+        if local_rank == 0:
+            print('Load weights {}.'.format(model_path))
+
+        # ------------------------------------------------------#
+        #   Load only the pretrained entries whose key exists in the
+        #   model and whose shape matches; everything else is reported
+        #   as not loaded.
+        # ------------------------------------------------------#
+        model_dict = model.state_dict()
+        pretrained_dict = torch.load(model_path, map_location=device)
+        load_key, no_load_key, temp_dict = [], [], {}
+        for k, v in pretrained_dict.items():
+            if k in model_dict and np.shape(model_dict[k]) == np.shape(v):
+                temp_dict[k] = v
+                load_key.append(k)
+            else:
+                no_load_key.append(k)
+        model_dict.update(temp_dict)
+        model.load_state_dict(model_dict)
+        # ------------------------------------------------------#
+        #   Report which keys were matched and which were not
+        #   (printed on local rank 0 only; lists truncated to 500 chars)
+        # ------------------------------------------------------#
+        if local_rank == 0:
+            print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
+            print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
+            print("\n\033[1;33;44m温馨提示，head部分没有载入是正常现象，Backbone部分没有载入是错误的。\033[0m")
+
+    # ----------------------#
+    #   Loss function
+    # ----------------------#
+
+    yolo_loss = YOLOLoss(num_classes, fp16, strides=[8])
+    # ----------------------#
+    #   Loss logging
+    # ----------------------#
+    # log_dir is computed on every rank because it is later passed to
+    # fit_one_epoch by all ranks; defining it only on rank 0 raises NameError on
+    # the other ranks in DDP mode. Only rank 0 builds a LossHistory from it.
+    time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
+    log_dir = os.path.join(save_dir, "loss_" + str(time_str))
+    if local_rank == 0:
+        loss_history = LossHistory(log_dir, model, input_shape=input_shape)
+    else:
+        loss_history = None
+
+    # ------------------------------------------------------------------#
+    #   torch 1.2 does not support amp; use torch 1.7.1 or later for fp16.
+    #   (On torch 1.2 the import below shows "could not be resolve".)
+    # ------------------------------------------------------------------#
+    scaler = None
+    if fp16:
+        from torch.cuda.amp import GradScaler
+
+        scaler = GradScaler()
+
+    model_train = model.train()
+    # ----------------------------#
+    #   Multi-GPU synchronized BN
+    # ----------------------------#
+    if sync_bn:
+        if ngpus_per_node > 1 and distributed:
+            model_train = nn.SyncBatchNorm.convert_sync_batchnorm(model_train)
+        else:
+            print("Sync_bn is not support in one gpu or not distributed.")
+
+    # ----------------------------#
+    #   Wrap the model for (multi-)GPU execution
+    # ----------------------------#
+    if Cuda and distributed:
+        # DDP: one process per GPU, model placed on this process's device.
+        model_train = nn.parallel.DistributedDataParallel(
+            model_train.cuda(local_rank),
+            device_ids=[local_rank],
+            find_unused_parameters=True,
+        )
+    elif Cuda:
+        # DP: a single process wrapping the base model.
+        # NOTE(review): the seeding section sets cudnn.deterministic = True;
+        # enabling benchmark here may still affect run-to-run reproducibility
+        # - confirm this is intended.
+        cudnn.benchmark = True
+        model_train = nn.DataParallel(model).cuda()
+
+    # ----------------------------#
+    #   Exponential moving average of the weights
+    # ----------------------------#
+    ema = ModelEMA(model_train)
+
+    # ---------------------------#
+    #   Read the dataset annotation txt files
+    # ---------------------------#
+    with open(train_annotation_path, encoding='utf-8') as train_file:
+        train_lines = train_file.readlines()
+    with open(val_annotation_path, encoding='utf-8') as val_file:
+        val_lines = val_file.readlines()
+    num_train, num_val = len(train_lines), len(val_lines)
+
+    if local_rank == 0:
+        show_config(
+            classes_path=classes_path,
+            model_path=model_path,
+            input_shape=input_shape,
+            Init_Epoch=Init_Epoch,
+            Freeze_Epoch=Freeze_Epoch,
+            UnFreeze_Epoch=UnFreeze_Epoch,
+            Freeze_batch_size=Freeze_batch_size,
+            Unfreeze_batch_size=Unfreeze_batch_size,
+            Freeze_Train=Freeze_Train,
+            Init_lr=Init_lr,
+            Min_lr=Min_lr,
+            optimizer_type=optimizer_type,
+            momentum=momentum,
+            lr_decay_type=lr_decay_type,
+            save_period=save_period,
+            save_dir=log_dir,
+            num_workers=num_workers,
+            num_train=num_train,
+            num_val=num_val,
+        )
+        # ---------------------------------------------------------#
+        #   Total epochs   = number of full passes over the data.
+        #   Total steps    = number of gradient-descent updates.
+        #   Each epoch consists of several steps, one update per step.
+        #   Only a minimum is suggested here (no upper bound), and only
+        #   the unfrozen stage is taken into account.
+        # ----------------------------------------------------------#
+        if optimizer_type == "sgd":
+            wanted_step = 5e4
+        else:
+            wanted_step = 1.5e4
+        steps_per_epoch = num_train // Unfreeze_batch_size
+        total_step = steps_per_epoch * UnFreeze_Epoch
+        if total_step <= wanted_step:
+            if steps_per_epoch == 0:
+                raise ValueError('数据集过小，无法进行训练，请扩充数据集。')
+            wanted_epoch = wanted_step // steps_per_epoch + 1
+            print("\n\033[1;33;44m[Warning] 使用%s优化器时，建议将训练总步长设置到%d以上。\033[0m" % (optimizer_type, wanted_step))
+            print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%d，Unfreeze_batch_size为%d，共训练%d个Epoch，计算出总训练步长为%d。\033[0m" % (num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step))
+            print("\033[1;33;44m[Warning] 由于总训练步长为%d，小于建议总步长%d，建议设置总世代为%d。\033[0m" % (total_step, wanted_step, wanted_epoch))
+
+    # ------------------------------------------------------#
+    #   Backbone features are generic, so frozen training speeds things up
+    #   and also protects the weights early in training.
+    #   Init_Epoch       starting epoch
+    #   Freeze_Epoch     number of frozen-training epochs
+    #   UnFreeze_Epoch   total number of training epochs
+    #   On OOM / insufficient GPU memory, reduce the batch size.
+    # ------------------------------------------------------#
+    if True:
+        UnFreeze_flag = False
+        # -------------------------------------------------------------------#
+        #   With frozen training: freeze both backbones and start at
+        #   Freeze_batch_size. Otherwise start directly at Unfreeze_batch_size.
+        # -------------------------------------------------------------------#
+        if Freeze_Train:
+            for frozen_part in (model.backbone, model.backbone_3d):
+                for param in frozen_part.parameters():
+                    param.requires_grad = False
+            batch_size = Freeze_batch_size
+        else:
+            batch_size = Unfreeze_batch_size
+
+        # -------------------------------------------------------------------#
+        #   Scale the learning rates by batch_size relative to the nominal
+        #   batch size nbs, clamped to optimizer-specific bounds.
+        # -------------------------------------------------------------------#
+        nbs = 64
+        if optimizer_type == 'adam':
+            lr_limit_max, lr_limit_min = 1e-3, 1e-5
+        else:
+            lr_limit_max, lr_limit_min = 5e-2, 5e-4
+        lr_scale = batch_size / nbs
+        Init_lr_fit = min(max(lr_scale * Init_lr, lr_limit_min), lr_limit_max)
+        Min_lr_fit = min(max(lr_scale * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
+
+        # ---------------------------------------#
+        #   Build the optimizer selected by optimizer_type.
+        #   Parameters are split into three groups:
+        #     pg0  weights of BatchNorm2d modules / modules with "bn" in their name (no weight decay)
+        #     pg1  all other weights (weight_decay applied)
+        #     pg2  biases (no weight decay)
+        #   NOTE(review): only nn.BatchNorm2d is matched by type, so other
+        #   norm layers (e.g. BatchNorm3d) land in pg1 unless their name
+        #   contains "bn" - confirm this is intended.
+        # ---------------------------------------#
+        pg0, pg1, pg2 = [], [], []
+        for k, v in model.named_modules():
+            if isinstance(getattr(v, "bias", None), nn.Parameter):
+                pg2.append(v.bias)
+            weight = getattr(v, "weight", None)
+            if not isinstance(weight, nn.Parameter):
+                # Containers whose name merely contains "bn", or norm layers
+                # without affine parameters, have no weight to optimize.
+                continue
+            if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+                pg0.append(weight)
+            else:
+                pg1.append(weight)
+        # Construct only the requested optimizer instead of building both.
+        if optimizer_type == 'adam':
+            optimizer = optim.Adam(pg0, Init_lr_fit, betas=(momentum, 0.999))
+        elif optimizer_type == 'sgd':
+            optimizer = optim.SGD(pg0, Init_lr_fit, momentum=momentum, nesterov=True)
+        else:
+            raise ValueError("Unsupported optimizer_type: %r (expected 'adam' or 'sgd')" % (optimizer_type,))
+        optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay})
+        optimizer.add_param_group({"params": pg2})
+
+        # ---------------------------------------#
+        #   Learning-rate schedule function
+        # ---------------------------------------#
+        lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
+
+        # ---------------------------------------#
+        #   Number of steps per epoch (train / val)
+        # ---------------------------------------#
+        epoch_step = num_train // batch_size
+        epoch_step_val = num_val // batch_size
+
+        if not epoch_step or not epoch_step_val:
+            raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
+
+        # Start the EMA update counter at the step matching Init_Epoch.
+        if ema:
+            ema.updates = epoch_step * Init_Epoch
+
+        train_dataset = seqDataset(train_annotation_path, input_shape[0], num_frame, 'train')
+        val_dataset = seqDataset(val_annotation_path, input_shape[0], num_frame, 'val')
+
+        if not distributed:
+            train_sampler = None
+            val_sampler = None
+            shuffle = True
+        else:
+            # Under DDP the samplers do the shuffling and the batch is
+            # divided by the number of GPUs on this node.
+            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True)
+            val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False)
+            batch_size = batch_size // ngpus_per_node
+            shuffle = False
+
+        # Both loaders share every option except the dataset and sampler.
+        loader_kwargs = dict(
+            shuffle=shuffle,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            pin_memory=True,
+            drop_last=True,
+            collate_fn=dataset_collate,
+        )
+        gen = DataLoader(train_dataset, sampler=train_sampler, **loader_kwargs)
+        gen_val = DataLoader(val_dataset, sampler=val_sampler, **loader_kwargs)
+
+        # ----------------------#
+        #   Evaluation callback that records the mAP curve (local rank 0 only)
+        # ----------------------#
+        eval_callback = (
+            EvalCallback(model, input_shape, class_names, num_classes, val_lines, log_dir, Cuda,
+                         eval_flag=eval_flag, period=eval_period)
+            if local_rank == 0 else None
+        )
+
+        # ---------------------------------------#
+        #   Training loop
+        # ---------------------------------------#
+        for epoch in range(Init_Epoch, UnFreeze_Epoch):
+            # ---------------------------------------#
+            #   One-time switch from the frozen stage to the unfrozen
+            #   stage: recompute the batch-dependent settings, unfreeze
+            #   the backbones and rebuild the data loaders.
+            # ---------------------------------------#
+            if Freeze_Train and not UnFreeze_flag and epoch >= Freeze_Epoch:
+                batch_size = Unfreeze_batch_size
+
+                # Re-fit the learning rates to the new batch size.
+                nbs = 64
+                if optimizer_type == 'adam':
+                    lr_limit_max, lr_limit_min = 1e-3, 1e-5
+                else:
+                    lr_limit_max, lr_limit_min = 5e-2, 5e-4
+                lr_scale = batch_size / nbs
+                Init_lr_fit = min(max(lr_scale * Init_lr, lr_limit_min), lr_limit_max)
+                Min_lr_fit = min(max(lr_scale * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
+                # Rebuild the learning-rate schedule function.
+                lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
+
+                # Make both backbones trainable again.
+                for thawed_part in (model.backbone, model.backbone_3d):
+                    for param in thawed_part.parameters():
+                        param.requires_grad = True
+
+                epoch_step = num_train // batch_size
+                epoch_step_val = num_val // batch_size
+
+                if not epoch_step or not epoch_step_val:
+                    raise ValueError("数据集过小，无法继续进行训练，请扩充数据集。")
+
+                if distributed:
+                    batch_size = batch_size // ngpus_per_node
+
+                if ema:
+                    ema.updates = epoch_step * epoch
+
+                # Recreate the loaders with the unfrozen-stage batch size.
+                unfreeze_loader_kwargs = dict(
+                    shuffle=shuffle,
+                    batch_size=batch_size,
+                    num_workers=num_workers,
+                    pin_memory=True,
+                    drop_last=True,
+                    collate_fn=dataset_collate,
+                )
+                gen = DataLoader(train_dataset, sampler=train_sampler, **unfreeze_loader_kwargs)
+                gen_val = DataLoader(val_dataset, sampler=val_sampler, **unfreeze_loader_kwargs)
+
+                UnFreeze_flag = True
+
+            # Tell both datasets which epoch is running.
+            for loader in (gen, gen_val):
+                loader.dataset.epoch_now = epoch
+
+            if distributed:
+                train_sampler.set_epoch(epoch)
+
+            set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
+
+            fit_one_epoch(
+                model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer,
+                epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch,
+                Cuda, fp16, scaler, save_period, log_dir, local_rank,
+            )
+
+            # Keep all DDP processes in step between epochs.
+            if distributed:
+                dist.barrier()
+
+        # Close the loss writer on the logging rank once training is done.
+        if local_rank == 0:
+            loss_history.writer.close()