From 09e8aa1ce56151b0cb864f73f1425260e089864a Mon Sep 17 00:00:00 2001
From: nuoyanc <alvincny529@gmail.com>
Date: Fri, 18 Mar 2022 19:38:16 +0800
Subject: [PATCH] Half way there, and reload gitignore.

---
 .gitignore                                   | 249 +++++++++++++++++++
 .idea/.gitignore                             |   8 +
 .idea/bmlf.iml                               |  21 ++
 .idea/inspectionProfiles/Project_Default.xml |  12 +
 .idea/misc.xml                               |  24 ++
 .idea/modules.xml                            |   8 +
 .idea/vcs.xml                                |   6 +
 .vscode/settings.json                        |   5 +
 main.py                                      | 124 +++++++++
 src/models/__init__.py                       |   1 +
 src/models/aresnet.py                        | 103 ++++++++
 src/models/load_model.py                     |  41 +++
 src/preprocess/__init__.py                   |   1 +
 src/preprocess/preprocess.py                 |  83 +++++++
 src/settings/__init__.py                     |   3 +
 src/settings/configs.py                      |  83 +++++++
 src/settings/model_initialize.py             |   3 +
 src/train/train.py                           |   0
 src/utils/__init__.py                        |   1 +
 src/utils/utils.py                           |  86 +++++++
 20 files changed, 862 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/bmlf.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 .vscode/settings.json
 create mode 100644 main.py
 create mode 100644 src/models/__init__.py
 create mode 100644 src/models/aresnet.py
 create mode 100644 src/models/load_model.py
 create mode 100644 src/preprocess/__init__.py
 create mode 100644 src/preprocess/preprocess.py
 create mode 100644 src/settings/__init__.py
 create mode 100644 src/settings/configs.py
 create mode 100644 src/settings/model_initialize.py
 create mode 100644 src/train/train.py
 create mode 100644 src/utils/__init__.py
 create mode 100644 src/utils/utils.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b39824d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,249 @@
+data/
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+
+
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/bmlf.iml b/.idea/bmlf.iml
new file mode 100644
index 0000000..cf6c9c8
--- /dev/null
+++ b/.idea/bmlf.iml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python">
+      <configuration sdkName="" />
+    </facet>
+  </component>
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyNamespacePackagesService">
+    <option name="namespacePackageFolders">
+      <list>
+        <option value="$MODULE_DIR$/src" />
+      </list>
+    </option>
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..75a9f69
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,12 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="E722" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..3dbf0c3
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" project-jdk-name="Python 3.9 (base)" project-jdk-type="Python SDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+  <component name="UnattendedHostPersistenceState">
+    <option name="openedFilesInfos">
+      <list>
+        <OpenedFileInfo>
+          <option name="caretOffset" value="2427" />
+          <option name="fileUrl" value="file://$PROJECT_DIR$/src/preprocess/preprocess.py" />
+        </OpenedFileInfo>
+        <OpenedFileInfo>
+          <option name="caretOffset" value="5280" />
+          <option name="fileUrl" value="file://$PROJECT_DIR$/main.py" />
+        </OpenedFileInfo>
+        <OpenedFileInfo>
+          <option name="caretOffset" value="2434" />
+          <option name="fileUrl" value="file://$PROJECT_DIR$/src/settings/configs.py" />
+        </OpenedFileInfo>
+      </list>
+    </option>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..27e9261
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/bmlf.iml" filepath="$PROJECT_DIR$/.idea/bmlf.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..af7446d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.analysis.extraPaths": [ + "./src" + ] +} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..04d421b --- /dev/null +++ b/main.py @@ -0,0 +1,124 @@ +from src.models import ini_model +from src.preprocess import Preprocessor +from src.settings import configs +from src.utils import Timer, find_best3, eval_total + +# OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py + +import datetime +import os +import torch +import torch.nn as nn +import torch.optim as optim +import gc +from subprocess import call + +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + + + +def train(): + if configs.DDP_ON: + # DDP backend initialization + configs.LOCAL_RANK = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(configs.LOCAL_RANK) + dist.init_process_group(backend='nccl') + else: + configs.LOCAL_RANK = 0 + + model = ini_model() + trainloader, testloader = Preprocessor().get_loader() + + # Start timer from here + timer = Timer() + timer.timeit() + if configs.LOAD_MODEL and configs.LOCAL_RANK == 0: + print(f"\nVerifying loaded model ({configs.MODEL_NAME})'s accuracy as its name suggested...") + eval_total(model, testloader, timer) + + if configs.LOCAL_RANK == 0: + print(f"Start training! Total {configs.TOTAL_EPOCHS} epochs.\n") + + return + # Define loss function and optimizer for the following training process + criterion = nn.CrossEntropyLoss() + opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE) + opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90) + opts = [opt2, opt1] + opt_use_adam = configs.OPT_USE_ADAM + + # Mixed precision for speed up + # https://zhuanlan.zhihu.com/p/165152789 + scalar = torch.cuda.amp.GradScaler() + + # ========================== Train ============================= + for epoch in range(configs.TOTAL_EPOCHS): + if epoch%configs.LEARNING_RATE_UPDATE_EPOCH == configs.LEARNING_RATE_UPDATE_EPOCH - 1: + configs.LEARNING_RATE *= configs.LEARNING_RATE_UPDATE_RATE + if configs.LEARNING_RATE <= configs.LEARNING_RATE_END: + configs.LEARNING_RATE = configs.LEARNING_RATE_END + print(f"Learning rate updated to {configs.LEARNING_RATE}\n") + opt1 = optim.Adam(model.parameters(), lr=configs.LEARNING_RATE) + opt2 = optim.SGD(model.parameters(), lr=configs.LEARNING_RATE, momentum=0.90) + + # To avoid duplicated data sent to multi-gpu + trainloader.sampler.set_epoch(epoch) + + # Just for removing worst models + if epoch % configs.EPOCH_TO_LOAD_BEST == 0: + remove_bad_models() + + # By my stategy, chose optimizer dynamically + optimizer = opts[int(opt_use_adam)] + + # Counter for printing information during training + count_log = 0 if configs.N_LOGS_PER_EPOCH == 0 else int(len(trainloader) / configs.N_LOGS_PER_EPOCH) + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + inputs, labels = data + + # zero the parameter gradients + optimizer.zero_grad() + + # Speed up with half precision + with torch.cuda.amp.autocast(): + # forward + backward + optimize + outputs = model(inputs.to(device)) + loss = criterion(outputs, labels.to(device)) + + # Scale the gradient + 
+            scaler.scale(loss).backward()
+            scaler.step(optimizer)
+            scaler.update()
+
+            # print statistics
+            running_loss += loss.item() * inputs.shape[0]
+
+            if count_log != 0 and local_rank == 0 and i % count_log == count_log - 1:
+                print(f'[{epoch + 1}(Epochs), {i + 1:5d}(batches)] loss: {running_loss / count_log:.3f}')
+                running_loss = 0.0
+
+        # Switch to another optimizer after some epochs
+        if configs.ADAM_SGD_SWITCH:
+            if epoch % configs.EPOCHS_PER_SWITCH == configs.EPOCHS_PER_SWITCH - 1:
+                opt_use_adam = not opt_use_adam
+                print(f"Epoch {epoch + 1}: Opt switched to {'Adam' if opt_use_adam else 'SGD'}")
+
+        # Evaluate model on main GPU after some epochs
+        if local_rank == 0 and epoch % configs.EPOCHS_PER_EVAL == configs.EPOCHS_PER_EVAL - 1:
+            eval_total(model, testloader, timer, epoch)
+
+    print(f'Training Finished! ({timer.timeit()[1]})')
+
+
+if __name__ == '__main__':
+    try:
+        # gc.collect()
+        torch.cuda.empty_cache()
+        configs.reset_working_dir(__file__)
+        train()
+    except KeyboardInterrupt:
+        print("Exit!")
+
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..1127652
--- /dev/null
+++ b/src/models/__init__.py
@@ -0,0 +1 @@
+from .load_model import ini_model
\ No newline at end of file
diff --git a/src/models/aresnet.py b/src/models/aresnet.py
new file mode 100644
index 0000000..b60aa36
--- /dev/null
+++ b/src/models/aresnet.py
@@ -0,0 +1,103 @@
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torchvision.models as models
+
+class Net(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+        def make_sequence(in_channels, increase_channels=False):
+
+            middle_channels = in_channels*2 if increase_channels else in_channels
+            first_stride = 2 if increase_channels else 1
+            return nn.Sequential(
+                nn.Conv2d(in_channels=in_channels, out_channels=middle_channels, dilation=1, kernel_size=3, padding=1, stride=first_stride),
+                nn.BatchNorm2d(num_features=middle_channels, eps=0.000001, momentum=0.9),
+                nn.ReLU(),
+                nn.Conv2d(in_channels=middle_channels, out_channels=middle_channels, dilation=1, kernel_size=3, padding=1, stride=1),
+                nn.BatchNorm2d(num_features=middle_channels, eps=0.000001, momentum=0.9),
+            )
+
+        def make_sequence_left(in_channels):
+
+            return nn.Sequential(
+                nn.Conv2d(in_channels=in_channels, out_channels=in_channels*2, dilation=1, kernel_size=1, padding=0, stride=2),
+                nn.BatchNorm2d(num_features=in_channels*2, eps=0.000001, momentum=0.9),
+            )
+
+        self.activation_func = F.leaky_relu
+
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1, stride=1)
+        self.bn1 = nn.BatchNorm2d(num_features=64, eps=0.000001, momentum=0.9)
+        # self.pool = nn.MaxPool2d(kernel_size=3, padding=1, stride=2)
+
+        self.seq1_r = make_sequence(in_channels=64)
+
+        self.seq2_l = make_sequence_left(64)
+        self.seq2_r = make_sequence(in_channels=64, increase_channels=True)
+
+        self.seq3_r = make_sequence(in_channels=128)
+
+        self.seq4_l = make_sequence_left(128)
+        self.seq4_r = make_sequence(in_channels=128, increase_channels=True)
+
+        self.seq5_r = make_sequence(in_channels=256)
+
+        self.seq6_l = make_sequence_left(256)
+        self.seq6_r = make_sequence(in_channels=256, increase_channels=True)
+
+        self.seq7_r = make_sequence(in_channels=512)
+
+        self.seq_end = nn.Sequential(
+            nn.AdaptiveAvgPool2d((1, 1)),
+            nn.Flatten(),
+            nn.Linear(512, 10)
+        )
+
+        # self.resnet18 = models.resnet50(num_classes=10)
+        # self.resnet18._modules['conv1'] = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1, stride=1)
+        # self.resnet18._modules['maxpool'] = nn.MaxPool2d(kernel_size=1)
+
+    def forward(self, x):
+        # return self.resnet18(x)
+
+        # Before the first block
+        x = self.activation_func(self.bn1(self.conv1(x)))
+        # x = self.pool(x)
+
+        # block 1_1
+        x = x + self.seq1_r(x)
+        x = self.activation_func(x)
+        # block 1_2
+        x = x + self.seq1_r(x)
+        x = self.activation_func(x)
+
+        # block 2_1
+        x = self.seq2_l(x) + self.seq2_r(x)
+        x = self.activation_func(x)
+
+        # block 3_1
+        x = x + self.seq3_r(x)
+        x = self.activation_func(x)
+
+        # block 4
+        x = self.seq4_l(x) + self.seq4_r(x)
+        x = self.activation_func(x)
+
+        # block 5
+        x = x + self.seq5_r(x)
+        x = self.activation_func(x)
+
+        # block 6
+        x = self.seq6_l(x) + self.seq6_r(x)
+        x = self.activation_func(x)
+
+        # block 7
+        x = x + self.seq7_r(x)
+        x = self.activation_func(x)
+
+        # end
+        x = self.seq_end(x)
+
+        return x
\ No newline at end of file
diff --git a/src/models/load_model.py b/src/models/load_model.py
new file mode 100644
index 0000000..22b15d0
--- /dev/null
+++ b/src/models/load_model.py
@@ -0,0 +1,41 @@
+# OMP_NUM_THREADS=2 python -m torch.distributed.run --nproc_per_node 4 90plus.py
+
+import datetime
+import os
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import gc
+from subprocess import call
+
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from ..settings import configs, model
+from ..utils import Timer, find_best3, eval_total
+
+def ini_model():
+    global model
+    local_rank = configs.LOCAL_RANK
+    # Load model to gpu
+    device = torch.device("cuda", local_rank)
+    configs.DEVICE = device
+    # Check if load specific model or load best model in model folder
+    if configs.LOAD_MODEL:
+        if configs.LOAD_BEST:
+            configs.MODEL_NAME = find_best3(local_rank)
+        try:
+            print(configs.MODEL_DIR + configs.MODEL_NAME)
+            model.load_state_dict(torch.load(configs.MODEL_DIR + configs.MODEL_NAME))
+
+        except (FileNotFoundError, IsADirectoryError):
+            print(f"{configs.MODEL_NAME} Model not found!")
+
+    # Move loaded model with parameters to gpus
+    # Then wrap with DDP, reducer will be constructed too.
+    model.to(device)
+    if configs.DDP_ON:
+        model = DDP(model, device_ids=[local_rank], output_device=local_rank)
+
+
+    return model
+
\ No newline at end of file
diff --git a/src/preprocess/__init__.py b/src/preprocess/__init__.py
new file mode 100644
index 0000000..46fe515
--- /dev/null
+++ b/src/preprocess/__init__.py
@@ -0,0 +1 @@
+from .preprocess import Preprocessor
\ No newline at end of file
diff --git a/src/preprocess/preprocess.py b/src/preprocess/preprocess.py
new file mode 100644
index 0000000..3a1d29a
--- /dev/null
+++ b/src/preprocess/preprocess.py
@@ -0,0 +1,83 @@
+import random
+from typing import Tuple
+
+import torchvision.transforms as transforms
+from matplotlib import pyplot as plt
+import numpy as np
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from torchvision.datasets import CIFAR10
+from ..settings.configs import configs
+import math
+
+
+class Preprocessor:
+
+    # Official data augmentation for CIFAR10 dataset
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(32, padding=4, padding_mode="constant"),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+    ])
+
+    transform_test = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+    ])
+
+    def __init__(self,
+                 trans_train=transform_train,
+                 trans_test=transform_test) -> None:
+        self.trans_train = trans_train
+        self.trans_test = trans_test
+        self.loader = None
+
+    def get_loader(self) -> Tuple[DataLoader, DataLoader]:
+
+        if self.loader is not None:
+            return self.loader
+
+        data_dir = configs.DATA_DIR
+        batch_size = configs.BATCH_SIZE
+        n_workers = configs.NUM_WORKERS
+
+        train_set = CIFAR10(root=data_dir, train=True,
+                            download=True, transform=self.trans_train)
+        test_set = CIFAR10(root=data_dir, train=False,
+                           download=True, transform=self.trans_test)
+        if configs.DDP_ON:
+            train_sampler = DistributedSampler(train_set)
+            train_loader = DataLoader(train_set, batch_size=batch_size,
+                                      sampler=train_sampler, num_workers=n_workers)
+        else:
+            train_loader = DataLoader(train_set, batch_size=batch_size,
+                                      shuffle=True, num_workers=n_workers)
+
+        # Test with whole test set, no need for distributed sampler
+        test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=n_workers)
+
+        # Return two iterables which contain data in blocks, block size equals to batch size
+        self.loader = (train_loader, test_loader)
+        return self.loader
+
+    def visualize_data(self, n=9, train=True, rand=True) -> None:
+        loader = self.get_loader()[int(not train)]
+        wid = int(math.floor(math.sqrt(n)))
+        if wid * wid < n:
+            wid += 1
+        fig = plt.figure(figsize=(2 * wid, 2 * wid))
+
+        for i in range(n):
+            if rand:
+                index = random.randint(0, len(loader.dataset) - 1)
+            else:
+                index = i
+
+            # Add subplot to corresponding position
+            fig.add_subplot(wid, wid, i + 1)
+            plt.imshow((np.transpose(loader.dataset[index][0].numpy(), (1, 2, 0))))
+            plt.axis('off')
+            plt.title(configs.CLASSES[loader.dataset[index][1]])
+
+        fig.show()
diff --git a/src/settings/__init__.py b/src/settings/__init__.py
new file mode 100644
index 0000000..b8ccbba
--- /dev/null
+++ b/src/settings/__init__.py
@@ -0,0 +1,3 @@
+from .configs import configs
+from .model_initialize import model
+
diff --git a/src/settings/configs.py b/src/settings/configs.py
new file mode 100644
index 0000000..048d36c
--- /dev/null
+++ b/src/settings/configs.py
@@ -0,0 +1,83 @@
+import os
+import json
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, [0, 1, 2, 3, 4, 5, 6, 7])) + + +class Config: + def __init__(self, *dict_config) -> None: + # ============================================== + # GLOBAL SETTINGS + self.DDP_ON: bool = True + + self.BATCH_SIZE: int = 512 + self.LEARNING_RATE: float = 1e-3 + self.LEARNING_RATE_UPDATE_EPOCH: int = 30 + self.LEARNING_RATE_UPDATE_RATE: float = 0.12 + self.LEARNING_RATE_END: float = 1e-5 + self.TOTAL_EPOCHS: int = 5000 + + self.OPT_USE_ADAM: bool = True + + self.LOAD_MODEL: bool = True + self.MODEL_NAME: str = "10X92.pth" + self.LOAD_BEST: bool = False + self.EPOCH_TO_LOAD_BEST: int = 15 + + self.MODEL_SAVE_THRESHOLD: float = 0 + + self.NUM_WORKERS: int = 4 + self.N_LOGS_PER_EPOCH: int = 0 + + # ============================================== + # SPECIAL SETTINGS + self.EPOCHS_PER_EVAL: int = 2 + + self.ADAM_SGD_SWITCH: bool = True + self.EPOCHS_PER_SWITCH: int = 30 + + # ============================================== + # NOT SUPPOSED TO BE CHANGED OFTEN + + self.WORKING_DIR: str = os.path.dirname(os.path.realpath(__file__)) + self.MODEL_DIR: str = self.WORKING_DIR + "/models_v100/" + self.DATA_DIR: str = self.WORKING_DIR + '/data/' + self.CLASSES: tuple = ('plane', 'car', 'bird', 'cat', 'deer', + 'dog', 'frog', 'horse', 'ship', 'truck') + + self.DEVICE = None + self.LOCAL_RANK = None + + if len(dict_config) != 0: + d = eval(dict_config[0]) + for k in dict(d): + setattr(self, k, d[k]) + + def reset_working_dir(self, main_dir): + self.WORKING_DIR: str = os.path.dirname(os.path.realpath(main_dir)) + self.MODEL_DIR: str = self.WORKING_DIR + "/models_v100/" + self.DATA_DIR: str = self.WORKING_DIR + '/data/' + + if not os.path.exists(self.MODEL_DIR): + os.makedirs(self.MODEL_DIR) + + def save(self, fn='/config.json'): + with open(self.WORKING_DIR + fn, 'w') as fp: + json.dump(str(self.__dict__), fp, indent=4) + + def load(self, fn='/config.json'): + try: + + with open(self.WORKING_DIR + fn, 'r') as fp: + dict_config = json.load(fp) + d = eval(dict_config) + for k in dict(d): + setattr(self, k, d[k]) + print("Config file loaded successfully!") + except: + print("Config file does not exits, use default value instead!") + + +configs = Config() +# configs.load() +# configs.save() diff --git a/src/settings/model_initialize.py b/src/settings/model_initialize.py new file mode 100644 index 0000000..2a8faed --- /dev/null +++ b/src/settings/model_initialize.py @@ -0,0 +1,3 @@ +from ..models import aresnet + +model = aresnet.Net() \ No newline at end of file diff --git a/src/train/train.py b/src/train/train.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..90f60fd --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..90d314b --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,86 @@ +import time +import torch +from ..settings import configs +from random import randrange +import os +from os import walk + +class Timer: + def __init__(self): + self.ini = time.time() + self.last = 0 + self.curr = 0 + + def timeit(self)->float: + if self.last == 0 and self.curr == 0: + self.last = time.time() + self.curr = time.time() + return 0, 0 + else: + self.last = self.curr + self.curr = time.time() + return time.strftime("%H:%M:%S",time.gmtime(round(self.curr - self.last, 2))), time.strftime("%H:%M:%S",time.gmtime(round(self.curr - self.ini, 2))) 
+
+
+def eval_total(model, testloader, timer, epoch=-1):
+    # Only necessary to evaluate model on one gpu
+    if configs.LOCAL_RANK != 0:
+        return
+    device = configs.DEVICE
+    model.eval()
+    correct = 0
+    total = 0
+    # since we're not training, we don't need to calculate the
+    # gradients for our outputs
+    with torch.no_grad():
+        for data in testloader:
+            images, labels = data
+            # calculate outputs by running images through the network
+            outputs = model(images.to(device))
+            # the class with the highest energy is what we choose as prediction
+            _, predicted = torch.max(outputs.cpu().data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+
+    save_model = 100 * correct / total >= configs.MODEL_SAVE_THRESHOLD
+    prefix = '' if epoch == -1 else f'Epoch {epoch}: '
+    print(f"{prefix}Accuracy of the network on the {total} test images: {100 * correct / float(total)} % ({'saved' if save_model else 'discarded'})")
+    t = timer.timeit()
+    print(f"Delta time: {t[0]}, Already: {t[1]}\n")
+    model.train()
+    if save_model:
+        if configs.DDP_ON:
+            torch.save(model.module.state_dict(), configs.MODEL_DIR + f"{100 * correct / total}".replace('.', '_') + '.pth')
+        else:
+            torch.save(model.state_dict(), configs.MODEL_DIR + f"{100 * correct / total}".replace('.', '_') + '.pth')
+
+
+def find_best3(local_rank, rand=False):
+    files = next(walk(configs.MODEL_DIR), (None, None, []))[2]
+    if len(files) == 0:
+        return ''
+    acc = sorted([float(i.split('.')[0].replace('_', '.')) for i in files], reverse=True)
+    best_acc = acc[:3]
+
+    for i in acc[3:]:
+        try:
+            os.remove(configs.MODEL_DIR + "/" + str(i).replace('.', '_') + ".pth")
+        except OSError:
+            continue
+
+    model_name = str(best_acc[randrange(3) if (rand and len(acc[:3]) == 3) else 0]).replace('.', '_') + ".pth"
+    if local_rank == 0:
+        print(f"Loading one of the top 3 best models: {model_name}\n")
+    return "/" + model_name
+
+
+def remove_bad_models():
+    files = next(walk(configs.MODEL_DIR), (None, None, []))[2]
+    if len(files) == 0:
+        return
+    acc = sorted([float(i.split('.')[0].replace('_', '.')) for i in files], reverse=True)
+    for i in acc[3:]:
+        try:
+            os.remove(configs.MODEL_DIR + "/" + str(i).replace('.', '_') + ".pth")
+        except OSError:
+            continue
-- 
GitLab
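Note: the trickiest part of `main.py` in this patch is the mixed-precision step ordering (forward under `autocast`, then `scale -> step -> update` on the `GradScaler`). Below is a minimal, self-contained sketch of that pattern only; the `nn.Linear` model, optimizer settings, and random batch are placeholders for illustration and are not part of this repository.

```python
# Standalone sketch of the torch.cuda.amp pattern used in main.py.
import torch
import torch.nn as nn

def amp_step_demo():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = nn.Linear(32, 10).to(device)              # stand-in for aresnet.Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)   # no-op on CPU

    inputs = torch.randn(8, 32, device=device)        # fake batch
    labels = torch.randint(0, 10, (8,), device=device)

    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=use_cuda):
        loss = criterion(model(inputs), labels)       # forward in mixed precision
    scaler.scale(loss).backward()                     # backward on the scaled loss
    scaler.step(optimizer)                            # unscales gradients, then steps
    scaler.update()                                   # adjusts the loss scale
    return loss.item()

if __name__ == "__main__":
    print(amp_step_demo())
```

Under DDP the same step runs once per rank, and `trainloader.sampler.set_epoch(epoch)` (as in the patched training loop) reshuffles the `DistributedSampler` each epoch so ranks do not see duplicated shards.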