Commit 532f83c3 authored by PinkPanther-ny

Refactor framework structure

parent 8c03e305
Branches main
from src.models import ini_model
from src.preprocess import Preprocessor
from src.settings import configs
-from src.utils import Timer, find_best3, eval_total
+from src.utils import Timer, find_best_n_model, eval_total, remove_bad_models
# OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py
@@ -19,13 +19,10 @@ from torch.nn.parallel import DistributedDataParallel as DDP
def train():
-    if configs.DDP_ON:
-        # DDP backend initialization
-        configs.LOCAL_RANK = int(os.environ["LOCAL_RANK"])
-        torch.cuda.set_device(configs.LOCAL_RANK)
+    if configs.DDP_ON:
+        torch.cuda.set_device(configs._LOCAL_RANK)
        dist.init_process_group(backend='nccl')
-    else:
-        configs.LOCAL_RANK = 0

    model = ini_model()
    trainloader, testloader = Preprocessor().get_loader()
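
(Aside, not part of the commit: a minimal sketch of the DDP bring-up this script relies on, assuming it is launched as in the comment near the imports, OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py. configs._LOCAL_RANK is presumably filled the same way inside src.settings after this refactor; the Linear model here is only a stand-in for ini_model().)

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# torch.distributed.run starts one process per GPU (--nproc_per_node 4) and exports
# LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for each of them.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)              # bind this process to its own GPU
dist.init_process_group(backend='nccl')        # rendezvous via the env:// defaults

model = torch.nn.Linear(128, 10).cuda(local_rank)
model = DDP(model, device_ids=[local_rank])    # gradients are all-reduced across ranks
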
@@ -33,49 +30,37 @@ def train():
    # Start timer from here
    timer = Timer()
    timer.timeit()

-    if configs.LOAD_MODEL and configs.LOCAL_RANK == 0:
+    if configs._LOCAL_RANK == 0:
+        if configs._LOAD_SUCCESS:
            print(f"\nVerifying loaded model ({configs.MODEL_NAME})'s accuracy as its name suggested...")
            eval_total(model, testloader, timer)
-    if configs.LOCAL_RANK == 0:
        print(f"Start training! Total {configs.TOTAL_EPOCHS} epochs.\n")
-        return
    # Define loss function and optimizer for the following training process
    criterion = nn.CrossEntropyLoss()
-    opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-    opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
-    opts = [opt2, opt1]
-    opt_use_adam = configs.OPT_USE_ADAM
+    optimizer = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.9, nesterov=True, weight_decay=0.0001)
+    # optimizer = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-    # Mixed precision for speed up
+    # Mixed precision for massive speed up
    # https://zhuanlan.zhihu.com/p/165152789
    scalar = None
    if configs.MIX_PRECISION:
        scalar = torch.cuda.amp.GradScaler()
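
(Aside, not part of the commit: the torch.cuda.amp autocast/GradScaler pattern the loop below follows, shown as a self-contained sketch; the Linear model, dummy batches and hyper-parameters are placeholders, not values taken from configs.)

import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device('cuda')
model = nn.Linear(128, 10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scaler = torch.cuda.amp.GradScaler()           # scales the loss to avoid fp16 underflow

for _ in range(10):                            # dummy batches instead of the real loader
    inputs = torch.randn(32, 128, device=device)
    labels = torch.randint(0, 10, (32,), device=device)

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():            # forward pass runs in mixed precision
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()              # backward on the scaled loss
    scaler.step(optimizer)                     # unscales gradients, then optimizer.step()
    scaler.update()                            # adjust the scale factor for the next step
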
    # ========================== Train =============================
    for epoch in range(configs.TOTAL_EPOCHS):
-        if epoch%configs.LEARNING_RATE_UPDATE_EPOCH == configs.LEARNING_RATE_UPDATE_EPOCH - 1:
-            configs.LEARNING_RATE *= configs.LEARNING_RATE_UPDATE_RATE
-            if configs.LEARNING_RATE <= configs.LEARNING_RATE_END:
-                configs.LEARNING_RATE = configs.LEARNING_RATE_END
-            print(f"Learning rate updated to {configs.LEARNING_RATE}\n")
-            opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-            opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
+        # To avoid duplicated data sent to multi-gpu
+        trainloader.sampler.set_epoch(epoch)
-        # Just for removing worst models
        if epoch % configs.EPOCH_TO_LOAD_BEST == 0:
+            # Just for removing bad models
            remove_bad_models()
-        # By my stategy, chose optimizer dynamically
-        optimizer = opts[int(opt_use_adam)]

-        if configs.DDP_ON:
-            # To avoid duplicated data sent to multi-gpu
-            trainloader.sampler.set_epoch(epoch)

        # Counter for printing information during training
        count_log = 0 if configs.N_LOGS_PER_EPOCH == 0 else int(len(trainloader) / configs.N_LOGS_PER_EPOCH)

        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
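
(Aside, not part of the commit: what trainloader.sampler.set_epoch(epoch) above is for. With a DistributedSampler each rank draws a disjoint shard of the dataset, and set_epoch reseeds the shuffle so the ordering and sharding change every epoch; a minimal sketch with a toy dataset, assuming the process group is already initialized as in the earlier aside.)

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.randn(1024, 128), torch.randint(0, 10, (1024,)))
sampler = DistributedSampler(dataset, shuffle=True)   # partitions indices across ranks
loader = DataLoader(dataset, batch_size=32, sampler=sampler)

for epoch in range(5):
    sampler.set_epoch(epoch)       # reseed the shuffle; without this, every epoch replays
    for inputs, labels in loader:  # the same per-rank ordering
        pass                       # training step goes here
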
@@ -83,39 +68,38 @@ def train():
            optimizer.zero_grad()

            # Speed up with half precision
            if configs.MIX_PRECISION:
                with torch.cuda.amp.autocast():
                    # forward + backward + optimize
-                    outputs = model(inputs.to(device))
-                    loss = criterion(outputs, labels.to(device))
+                    outputs = model(inputs.to(configs._DEVICE))
+                    loss = criterion(outputs, labels.to(configs._DEVICE))

                # Scale the gradient
                scalar.scale(loss).backward()
                scalar.step(optimizer)
                scalar.update()
-            # print statistics
-            running_loss += loss.item() * inputs.shape[0]
-            if count_log != 0 and local_rank == 0 and i % count_log == count_log - 1:
-                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)] loss: {running_loss / count_log:.3f}')
-                running_loss = 0.0

-        # Switch to another optimizer after some epochs
-        if configs.ADAM_SGD_SWITCH:
-            if epoch % configs.EPOCHS_PER_SWITCH == configs.EPOCHS_PER_SWITCH - 1:
-                opt_use_adam = not opt_use_adam
-                print(f"Epoch {epoch + 1}: Opt switched to {'Adam' if opt_use_adam else 'SGD'}")

-        # Evaluate model on main GPU after some epochs
-        if local_rank == 0 and epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
-            eval_total(model, testloader, timer, device, epoch)
+            else:
+                outputs = model(inputs.to(configs._DEVICE))
+                loss = criterion(outputs, labels.to(configs._DEVICE))
+                loss.backward()
+                optimizer.step()

+            if count_log != 0 and configs._LOCAL_RANK == 0 and i % count_log == count_log - 1:
+                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)]')

+        # Evaluate model on main GPU after EPOCHS_PER_EVAL epochs
+        if configs._LOCAL_RANK == 0:
+            # Time current epoch training duration
+            t = timer.timeit()
+            print(f"Epoch delta time: {t[0]}, Already: {t[1]}\n")
+            if epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
+                eval_total(model, testloader, timer, epoch)

    print(f'Training Finished! ({str(datetime.timedelta(seconds=int(timer.timeit())))})')
if __name__ == '__main__':
    try:
-        # gc.collect()
+        gc.collect()
        torch.cuda.empty_cache()

        configs.reset_working_dir(__file__)
        train()
......