NUOYANC / bmlf · Commits

Commit 532f83c3
Authored 3 years ago by PinkPanther-ny
Refactor framework structure
Parent: 8c03e305
Branch: main
No related tags found
No related merge requests found
Changes (1): main.py — 44 additions, 60 deletions

main.py @ 532f83c3
 from src.models import ini_model
 from src.preprocess import Preprocessor
 from src.settings import configs
-from src.utils import Timer, find_best3, eval_total
+from src.utils import Timer, find_best_n_model, eval_total, remove_bad_models

 # OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py
...
...
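For context, the launch command in the comment above uses torch.distributed.run (torchrun), which starts one worker per GPU and hands each worker its rank through environment variables such as LOCAL_RANK. Below is a minimal sketch of how a settings object like src.settings.configs could derive the _LOCAL_RANK and _DEVICE attributes used later in this diff; the class body is an assumption for illustration, not this repo's actual code.

# Sketch only: resolving rank and device from the environment torchrun sets.
# Attribute names (_LOCAL_RANK, _DEVICE, DDP_ON) mirror this diff; the real
# src.settings implementation may differ.
import os
import torch

class Configs:
    def __init__(self):
        # torchrun exports LOCAL_RANK for every worker it spawns
        self.DDP_ON = "LOCAL_RANK" in os.environ
        self._LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0))
        self._DEVICE = (
            torch.device(f"cuda:{self._LOCAL_RANK}")
            if torch.cuda.is_available()
            else torch.device("cpu")
        )

configs = Configs()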
@@ -19,13 +19,10 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 def train():
-    if configs.DDP_ON:
-        # DDP backend initialization
-        configs.LOCAL_RANK = int(os.environ["LOCAL_RANK"])
-        torch.cuda.set_device(configs.LOCAL_RANK)
+    if configs.DDP_ON:
+        torch.cuda.set_device(configs._LOCAL_RANK)
         dist.init_process_group(backend='nccl')
-    else:
-        configs.LOCAL_RANK = 0

     model = ini_model()
     trainloader, testloader = Preprocessor().get_loader()
...
...
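The hunk above binds each process to its GPU and joins the NCCL process group before ini_model() builds the network; the DistributedDataParallel wrapping itself (the import is visible in the hunk header) presumably happens inside ini_model. A generic sketch of that standard PyTorch pattern follows; the ddp_wrap helper is hypothetical and not part of this repo.

# Sketch of the usual DDP setup this hunk relies on (generic pattern only).
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def ddp_wrap(model, local_rank):
    # Bind this worker to its GPU and join the NCCL process group
    # (torchrun supplies MASTER_ADDR/PORT, RANK and WORLD_SIZE via env vars).
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")
    # Replicate the model on this GPU; gradients are all-reduced across ranks.
    model = model.cuda(local_rank)
    return DDP(model, device_ids=[local_rank])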
@@ -33,49 +30,37 @@ def train():
     # Start timer from here
     timer = Timer()
     timer.timeit()

-    if configs.LOAD_MODEL and configs.LOCAL_RANK == 0:
+    if configs._LOCAL_RANK == 0:
+        if configs._LOAD_SUCCESS:
             print(f"\nVerifying loaded model ({configs.MODEL_NAME})'s accuracy as its name suggested...")
             eval_total(model, testloader, timer)
-    if configs.LOCAL_RANK == 0:
         print(f"Start training! Total {configs.TOTAL_EPOCHS} epochs.\n")
-        return

     # Define loss function and optimizer for the following training process
     criterion = nn.CrossEntropyLoss()
-    opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-    opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
-    opts = [opt2, opt1]
-    opt_use_adam = configs.OPT_USE_ADAM
+    optimizer = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.9, nesterov=True, weight_decay=0.0001)
+    # optimizer = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)

-    # Mixed precision for speed up
+    # Mixed precision for massive speed up
     # https://zhuanlan.zhihu.com/p/165152789
     scalar = None
     if configs.MIX_PRECISION:
         scalar = torch.cuda.amp.GradScaler()

     # ========================== Train =============================
     for epoch in range(configs.TOTAL_EPOCHS):
-        if epoch % configs.LEARNING_RATE_UPDATE_EPOCH == configs.LEARNING_RATE_UPDATE_EPOCH - 1:
-            configs.LEARNING_RATE *= configs.LEARNING_RATE_UPDATE_RATE
-            if configs.LEARNING_RATE <= configs.LEARNING_RATE_END:
-                configs.LEARNING_RATE = configs.LEARNING_RATE_END
-            print(f"Learning rate updated to {configs.LEARNING_RATE}\n")
-            opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
-            opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90)
+        # To avoid duplicated data sent to multi-gpu
+        trainloader.sampler.set_epoch(epoch)
+
+        # Just for removing worst models
+        if epoch % configs.EPOCH_TO_LOAD_BEST == 0:
+            # Just for removing bad models
+            remove_bad_models()

-        # By my stategy, chose optimizer dynamically
-        optimizer = opts[int(opt_use_adam)]
-        if configs.DDP_ON:
-            # To avoid duplicated data sent to multi-gpu
-            trainloader.sampler.set_epoch(epoch)

         # Counter for printing information during training
         count_log = 0 if configs.N_LOGS_PER_EPOCH == 0 else int(len(trainloader) / configs.N_LOGS_PER_EPOCH)
         running_loss = 0.0

         for i, data in enumerate(trainloader, 0):
             inputs, labels = data
...
...
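The new loop body calls trainloader.sampler.set_epoch(epoch), which only works if Preprocessor().get_loader() builds the DataLoader around a DistributedSampler: the sampler shards the dataset across ranks, and set_epoch reseeds its shuffle so every epoch yields a fresh, non-overlapping split per GPU. A sketch of such a loader under that assumption; the dataset, batch size, and worker count below are placeholders.

# Sketch: DataLoader backed by DistributedSampler, as set_epoch() requires.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_loader(dataset, batch_size=128):
    # Requires the process group to be initialized already
    # (otherwise pass num_replicas/rank explicitly).
    sampler = DistributedSampler(dataset, shuffle=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                      num_workers=4, pin_memory=True)

# Per-epoch usage, as done in this diff:
# for epoch in range(total_epochs):
#     loader.sampler.set_epoch(epoch)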
@@ -83,39 +68,38 @@ def train():
             optimizer.zero_grad()

             # Speed up with half precision
             if configs.MIX_PRECISION:
                 with torch.cuda.amp.autocast():
                     # forward + backward + optimize
-                    outputs = model(inputs.to(device))
-                    loss = criterion(outputs, labels.to(device))
+                    outputs = model(inputs.to(configs._DEVICE))
+                    loss = criterion(outputs, labels.to(configs._DEVICE))

                 # Scale the gradient
                 scalar.scale(loss).backward()
                 scalar.step(optimizer)
                 scalar.update()

                 # print statistics
                 running_loss += loss.item() * inputs.shape[0]
-                if count_log != 0 and local_rank == 0 and i % count_log == count_log - 1:
-                    print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)] loss: {running_loss / count_log:.3f}')
-                    running_loss = 0.0
-
-            # Switch to another optimizer after some epochs
-            if configs.ADAM_SGD_SWITCH:
-                if epoch % configs.EPOCHS_PER_SWITCH == configs.EPOCHS_PER_SWITCH - 1:
-                    opt_use_adam = not opt_use_adam
-                    print(f"Epoch {epoch + 1}: Opt switched to {'Adam' if opt_use_adam else 'SGD'}")
-
-            # Evaluate model on main GPU after some epochs
-            if local_rank == 0 and epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
-                eval_total(model, testloader, timer, device, epoch)
+            else:
+                outputs = model(inputs.to(configs._DEVICE))
+                loss = criterion(outputs, labels.to(configs._DEVICE))
+                loss.backward()
+                optimizer.step()

+            if count_log != 0 and configs._LOCAL_RANK == 0 and i % count_log == count_log - 1:
+                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)]')

+        # Evaluate model on main GPU after EPOCHS_PER_EVAL epochs
+        if configs._LOCAL_RANK == 0:
+            # Time current epoch training duration
+            t = timer.timeit()
+            print(f"Epoch delta time: {t[0]}, Already: {t[1]}\n")
+            if epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
+                eval_total(model, testloader, timer, epoch)

     print(f'Training Finished! ({str(datetime.timedelta(seconds=int(timer.timeit())))})')


 if __name__ == '__main__':
     try:
-        # gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
         configs.reset_working_dir(__file__)
         train()
...
...
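eval_total(model, testloader, timer, epoch) is the project's own helper from src.utils and its internals are not shown in this diff. A hypothetical sketch of the kind of no-grad accuracy pass it presumably performs; the function name, signature, and output format below are illustrative only.

# Hypothetical shape of an eval_total-style helper (not the repo's real code):
# run the test set without gradients and report top-1 accuracy.
import torch

def eval_total_sketch(model, testloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            preds = model(inputs.to(device)).argmax(dim=1)
            correct += (preds == labels.to(device)).sum().item()
            total += labels.size(0)
    model.train()
    print(f"Test accuracy: {100.0 * correct / total:.2f}%")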