From 532f83c39445bb0221b040441158d94fe19c35eb Mon Sep 17 00:00:00 2001
From: PinkPanther-ny <alvincny529@gmail.com>
Date: Mon, 21 Mar 2022 17:01:46 +0800
Subject: [PATCH] Refactor framework structure

---
 main.py | 104 ++++++++++++++++++++++++--------------------------------
 1 file changed, 44 insertions(+), 60 deletions(-)

diff --git a/main.py b/main.py
index 04d421b..865d604 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 from src.models import ini_model
 from src.preprocess import Preprocessor
 from src.settings import configs
-from src.utils import Timer, find_best3, eval_total
+from src.utils import Timer, find_best_n_model, eval_total, remove_bad_models
 
 # OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py
 
@@ -19,13 +19,10 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 
 
 def train():
+    # DDP backend initialization
     if configs.DDP_ON:
-        # DDP backend initialization
-        configs.LOCAL_RANK = int(os.environ["LOCAL_RANK"])
-        torch.cuda.set_device(configs.LOCAL_RANK)
+        torch.cuda.set_device(configs._LOCAL_RANK)
         dist.init_process_group(backend='nccl')
-    else:
-        configs.LOCAL_RANK = 0
 
     model = ini_model()
     trainloader, testloader = Preprocessor().get_loader()
@@ -33,89 +30,76 @@ def train():
     # Start timer from here
     timer = Timer()
     timer.timeit()
-    if configs.LOAD_MODEL and configs.LOCAL_RANK == 0:
-        print(f"\nVerifying loaded model ({configs.MODEL_NAME})'s accuracy as its name suggested...")
-        eval_total(model, testloader, timer)
 
-    if configs.LOCAL_RANK == 0:
+    if configs._LOCAL_RANK == 0:
+        if configs._LOAD_SUCCESS:
+            print(f"\nVerifying loaded model ({configs.MODEL_NAME})'s accuracy as its name suggested...")
+            eval_total(model, testloader, timer)
         print(f"Start training! Total {configs.TOTAL_EPOCHS} epochs.\n")
-    return
 
     # Define loss function and optimizer for the following training process
     criterion = nn.CrossEntropyLoss()
-    opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-    opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
-    opts = [opt2, opt1]
-    opt_use_adam = configs.OPT_USE_ADAM
+    optimizer = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.9, nesterov=True, weight_decay=0.0001)
+    # optimizer = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
 
-    # Mixed precision for speed up
+    # Mixed precision for massive speed up
     # https://zhuanlan.zhihu.com/p/165152789
-    scalar = torch.cuda.amp.GradScaler()
+    scalar = None
+    if configs.MIX_PRECISION:
+        scalar = torch.cuda.amp.GradScaler()
 
     # ========================== Train =============================
     for epoch in range(configs.TOTAL_EPOCHS):
-        if epoch%configs.LEARNING_RATE_UPDATE_EPOCH == configs.LEARNING_RATE_UPDATE_EPOCH - 1:
-            configs.LEARNING_RATE *= configs.LEARNING_RATE_UPDATE_RATE
-            if configs.LEARNING_RATE <= configs.LEARNING_RATE_END:
-                configs.LEARNING_RATE = configs.LEARNING_RATE_END
-            print(f"Learning rate updated to {configs.LEARNING_RATE}\n")
-            opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-            opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
-        # To avoid duplicated data sent to multi-gpu
-        trainloader.sampler.set_epoch(epoch)
+        # Just for removing bad models
+        remove_bad_models()
 
-        # Just for removing worst models
-        if epoch % configs.EPOCH_TO_LOAD_BEST == 0:
-            remove_bad_models()
-
-        # By my stategy, chose optimizer dynamically
-        optimizer = opts[int(opt_use_adam)]
+        if configs.DDP_ON:
+            # To avoid duplicated data sent to multi-gpu
+            trainloader.sampler.set_epoch(epoch)
 
         # Counter for printing information during training
        count_log = 0 if configs.N_LOGS_PER_EPOCH == 0 else int(len(trainloader) / configs.N_LOGS_PER_EPOCH)
 
-        running_loss = 0.0
         for i, data in enumerate(trainloader, 0):
             inputs, labels = data
 
             # zero the parameter gradients
             optimizer.zero_grad()
-
-            # Speed up with half precision
-            with torch.cuda.amp.autocast():
-                # forward + backward + optimize
-                outputs = model(inputs.to(device))
-                loss = criterion(outputs, labels.to(device))
-
-            # Scale the gradient
-            scalar.scale(loss).backward()
-            scalar.step(optimizer)
-            scalar.update()
-            # print statistics
-            running_loss += loss.item() * inputs.shape[0]
-
-            if count_log != 0 and local_rank == 0 and i % count_log == count_log - 1:
-                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)] loss: {running_loss / count_log:.3f}')
-                running_loss = 0.0
-
-        # Switch to another optimizer after some epochs
-        if configs.ADAM_SGD_SWITCH:
-            if epoch % configs.EPOCHS_PER_SWITCH == configs.EPOCHS_PER_SWITCH - 1:
-                opt_use_adam = not opt_use_adam
-                print(f"Epoch {epoch + 1}: Opt switched to {'Adam' if opt_use_adam else 'SGD'}")
+            # Speed up with half precision
+            if configs.MIX_PRECISION:
+                with torch.cuda.amp.autocast():
+                    outputs = model(inputs.to(configs._DEVICE))
+                    loss = criterion(outputs, labels.to(configs._DEVICE))
+
+                # Scale the gradient
+                scalar.scale(loss).backward()
+                scalar.step(optimizer)
+                scalar.update()
+            else:
+                outputs = model(inputs.to(configs._DEVICE))
+                loss = criterion(outputs, labels.to(configs._DEVICE))
+                loss.backward()
+                optimizer.step()
+
+            if count_log != 0 and configs._LOCAL_RANK == 0 and i % count_log == count_log - 1:
+                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)]')
 
-        # Evaluate model on main GPU after some epochs
-        if local_rank == 0 and epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
-            eval_total(model, testloader, timer, device, epoch)
+        # Evaluate model on main GPU after EPOCHS_PER_EVAL epochs
+        if configs._LOCAL_RANK == 0:
+            # Time current epoch training duration
+            t = timer.timeit()
+            print(f"Epoch delta time: {t[0]}, Already: {t[1]}\n")
+            if epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
+                eval_total(model, testloader, timer, epoch)
 
     print(f'Training Finished! ({str(datetime.timedelta(seconds=int(timer.timeit())))})')
 
 
 if __name__ == '__main__':
     try:
-        # gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
         configs.reset_working_dir(__file__)
         train()
-- 
GitLab
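
Note: the MIX_PRECISION branch introduced in this patch follows the standard torch.cuda.amp pattern (autocast for the forward pass, GradScaler for the scaled backward, plain fp32 otherwise). Below is a minimal, self-contained sketch of that conditional step; the toy model, fake batch, and the USE_AMP flag are illustrative stand-ins for configs.MIX_PRECISION, configs._DEVICE, and the real dataloader, not code from this repository.

import torch
import torch.nn as nn
import torch.optim as optim

# Illustrative stand-ins for configs.MIX_PRECISION and configs._DEVICE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_AMP = torch.cuda.is_available()  # autocast/GradScaler only pay off on CUDA

model = nn.Linear(32, 10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                      nesterov=True, weight_decay=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

inputs, labels = torch.randn(8, 32), torch.randint(0, 10, (8,))  # fake batch

optimizer.zero_grad()
if USE_AMP:
    # Forward in mixed precision; backward on the scaled loss to avoid fp16 underflow
    with torch.cuda.amp.autocast():
        outputs = model(inputs.to(device))
        loss = criterion(outputs, labels.to(device))
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
else:
    # Plain fp32 step when mixed precision is disabled
    outputs = model(inputs.to(device))
    loss = criterion(outputs, labels.to(device))
    loss.backward()
    optimizer.step()
print(f"loss: {loss.item():.3f}")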
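
Similarly, the DDP path kept by this patch (one process per GPU launched by torch.distributed.run, torch.cuda.set_device on the local rank, an NCCL process group, and DistributedSampler.set_epoch each epoch so shards are reshuffled rather than duplicated) can be tried in isolation with the sketch below. The tiny model, synthetic dataset, and the gloo fallback for CPU-only machines are assumptions for illustration, not part of this repository.

import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

def main():
    # torchrun / torch.distributed.run sets LOCAL_RANK for every worker process
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(local_rank)
    # nccl for GPUs (as in the patch); gloo lets the sketch run on CPU
    dist.init_process_group(backend="nccl" if use_cuda else "gloo")

    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
    model = DDP(nn.Linear(16, 2).to(device),
                device_ids=[local_rank] if use_cuda else None)

    dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
    sampler = DistributedSampler(dataset)   # shards the data across ranks
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)

    for epoch in range(2):
        sampler.set_epoch(epoch)  # reshuffle shards every epoch, as in train()
        for x, y in loader:
            loss = F.cross_entropy(model(x.to(device)), y.to(device))
            loss.backward()       # DDP all-reduces gradients across ranks here
            model.zero_grad(set_to_none=True)
    dist.destroy_process_group()

if __name__ == "__main__":
    # Example launch: torchrun --nproc_per_node 2 ddp_sketch.py
    main()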