From 485e6ca93b5db960c9c9ee0fe1465eacc29786ea Mon Sep 17 00:00:00 2001
From: Byeongman Lee <bmlee@nota.ai>
Date: Thu, 27 Jun 2024 10:47:35 +0900
Subject: [PATCH] #255 Update v0.2.2 version of netspresso_trainer (#259)

---
 netspresso/trainer/augmentations/__init__.py  |   18 +-
 .../trainer/augmentations/augmentation.py     |  138 ++-
 netspresso/trainer/models/__init__.py         |   74 +-
 netspresso/trainer/models/base.py             |   35 +
 netspresso/trainer/models/efficientformer.py  |  125 ++
 netspresso/trainer/models/mixnet.py           |  475 ++++++++
 netspresso/trainer/models/mobilenetv3.py      |  218 ++++
 netspresso/trainer/models/mobilevit.py        |   89 ++
 netspresso/trainer/models/model.py            | 1029 -----------------
 netspresso/trainer/models/pidnet.py           |   30 +
 netspresso/trainer/models/resnet.py           |  201 ++++
 netspresso/trainer/models/rtmpose.py          |   33 +
 netspresso/trainer/models/segformer.py        |   73 ++
 netspresso/trainer/models/vit.py              |   47 +
 netspresso/trainer/models/yolox.py            |  188 +++
 netspresso/trainer/trainer.py                 |   56 +-
 netspresso/trainer/training/environment.py    |    1 +
 netspresso/trainer/training/logging.py        |    8 +-
 netspresso/trainer/training/training.py       |    8 +-
 requirements.txt                              |    2 +-
 20 files changed, 1722 insertions(+), 1126 deletions(-)
 create mode 100644 netspresso/trainer/models/base.py
 create mode 100644 netspresso/trainer/models/efficientformer.py
 create mode 100644 netspresso/trainer/models/mixnet.py
 create mode 100644 netspresso/trainer/models/mobilenetv3.py
 create mode 100644 netspresso/trainer/models/mobilevit.py
 delete mode 100644 netspresso/trainer/models/model.py
 create mode 100644 netspresso/trainer/models/pidnet.py
 create mode 100644 netspresso/trainer/models/resnet.py
 create mode 100644 netspresso/trainer/models/rtmpose.py
 create mode 100644 netspresso/trainer/models/segformer.py
 create mode 100644 netspresso/trainer/models/vit.py
 create mode 100644 netspresso/trainer/models/yolox.py

diff --git a/netspresso/trainer/augmentations/__init__.py b/netspresso/trainer/augmentations/__init__.py
index 9fa351cb..9b235b04 100644
--- a/netspresso/trainer/augmentations/__init__.py
+++ b/netspresso/trainer/augmentations/__init__.py
@@ -1,19 +1,24 @@
 from netspresso.trainer.augmentations.augmentation import (
     AugmentationConfig,
+    CenterCrop,
     ClassificationAugmentationConfig,
     ColorJitter,
     DetectionAugmentationConfig,
-    Inference,
+    HSVJitter,
+    Mixing,
+    MosaicDetection,
     Pad,
+    PoseTopDownAffine,
     RandomCrop,
     RandomCutmix,
+    RandomErasing,
     RandomHorizontalFlip,
     RandomMixup,
+    RandomResize,
     RandomResizedCrop,
     RandomVerticalFlip,
     Resize,
     SegmentationAugmentationConfig,
-    Train,
     Transform,
     TrivialAugmentWide,
 )
@@ -26,6 +31,13 @@
 
 
 __all__ = [
+    "CenterCrop",
+    "HSVJitter",
+    "Mixing",
+    "MosaicDetection",
+    "PoseTopDownAffine",
+    "RandomErasing",
+    "RandomResize",
     "ColorJitter",
     "Pad",
     "RandomCrop",
@@ -36,8 +48,6 @@
     "TrivialAugmentWide",
     "RandomMixup",
     "RandomCutmix",
-    "Inference",
-    "Train",
     "Transform",
     "AugmentationConfig",
     "AUGMENTATION_CONFIG_TYPE",
diff --git a/netspresso/trainer/augmentations/augmentation.py b/netspresso/trainer/augmentations/augmentation.py
index 85a1ae21..06775fad 100644
--- a/netspresso/trainer/augmentations/augmentation.py
+++ b/netspresso/trainer/augmentations/augmentation.py
@@ -13,21 +13,16 @@ class Transform:
 
 
 @dataclass
-class Train:
-    transforms: Optional[List] = None
-    mix_transforms: Optional[List] = None
-
-
-@dataclass
-class Inference:
-    transforms: Optional[List] = None
+class AugmentationConfig:
+    img_size: int = DEFAULT_IMG_SIZE
+    train: Optional[List] = None
+    inference: Optional[List] = None
 
 
 @dataclass
-class AugmentationConfig:
-    img_size: int = DEFAULT_IMG_SIZE
-    train: Train = field(default_factory=lambda: Train())
-    inference: Inference = field(default_factory=lambda: Inference())
+class CenterCrop(Transform):
+    name: str = 'centercrop'
+    size: int = DEFAULT_IMG_SIZE
 
 
 @dataclass
@@ -40,6 +35,38 @@ class ColorJitter(Transform):
     p: Optional[float] = 0.5
 
 
+@dataclass
+class HSVJitter(Transform):
+    name: str = "hsvjitter"
+    h_mag: int = 5
+    s_mag: int = 30
+    v_mag: int = 30
+
+
+@dataclass
+class Mixing(Transform):
+    name: str = "mixing"
+    mixup: Optional[List[float]] = field(default=None)
+    cutmix: Optional[List[float]] = field(default=None)
+    inplace: bool = False
+
+
+@dataclass
+class MosaicDetection(Transform):
+    name: str = "mosaicdetection"
+    size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE])
+    mosaic_prob: float = 1.0
+    affine_scale: List = field(default_factory=lambda: [0.5, 1.5])
+    degrees: float = 10.0
+    translate: float = 0.1
+    shear: float = 2.0
+    enable_mixup: bool = True
+    mixup_prob: float = 1.0
+    mixup_scale: List = field(default_factory=lambda: [0.5, 1.5])
+    fill: int = 114
+    mosaic_off_epoch: int = 10
+
+
 @dataclass
 class Pad(Transform):
     name: str = 'pad'
@@ -48,6 +75,18 @@ class Pad(Transform):
     padding_mode: str = 'constant'
 
 
+@dataclass
+class PoseTopDownAffine(Transform):
+    name: str = "posetopdownaffine"
+    scale: List = field(default_factory=lambda: [0.75, 1.25])
+    scale_prob: float = 1.
+    translate: float = 0.1
+    translate_prob: float = 1.
+    rotation: int = 60
+    rotation_prob: float = 1.
+    size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE])
+
+
 @dataclass
 class RandomCrop(Transform):
     name: str = 'randomcrop'
@@ -55,12 +94,13 @@ class RandomCrop(Transform):
 
 
 @dataclass
-class RandomResizedCrop(Transform):
-    name: str = 'randomresizedcrop'
-    size: int = DEFAULT_IMG_SIZE
-    scale: List = field(default_factory=lambda: [0.08, 1.0])
-    ratio: List = field(default_factory=lambda: [0.75, 1.33])
-    interpolation: Optional[str] = 'bilinear'
+class RandomErasing(Transform):
+    name: str = "randomerasing"
+    p: float = 0.5
+    scale: List = field(default_factory=lambda: [0.02, 0.33])
+    ratio: List = field(default_factory=lambda: [0.3, 3.3])  # own field; was declared as a second `scale`, overriding the default above
+    value: Optional[int] = 0
+    inplace: bool = False
 
 
 @dataclass
@@ -69,6 +109,24 @@ class RandomHorizontalFlip(Transform):
     p: float = 0.5
 
 
+@dataclass
+class RandomResize(Transform):
+    name: str = "randomresize"
+    base_size: List = field(default_factory=lambda: [256, 256])
+    stride: int = 32
+    random_range: int = 4
+    interpolation: str = "bilinear"
+
+
+@dataclass
+class RandomResizedCrop(Transform):
+    name: str = 'randomresizedcrop'
+    size: int = DEFAULT_IMG_SIZE
+    scale: List = field(default_factory=lambda: [0.08, 1.0])
+    ratio: List = field(default_factory=lambda: [0.75, 1.33])
+    interpolation: Optional[str] = 'bilinear'
+
+
 @dataclass
 class RandomVerticalFlip(Transform):
     name: str = 'randomverticalflip'
@@ -81,6 +139,7 @@ class Resize(Transform):
     size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE])
     interpolation: Optional[str] = 'bilinear'
     max_size: Optional[int] =  None
+    resize_criteria: Optional[int] =  None
 
 
 class TrivialAugmentWide(Transform):
@@ -109,34 +168,35 @@ class RandomCutmix(Transform):
 @dataclass
 class ClassificationAugmentationConfig(AugmentationConfig):
     img_size: int = 256
-    train: Train = field(default_factory=lambda: Train(
-        transforms=[RandomResizedCrop(size=256), RandomHorizontalFlip()],
-        mix_transforms=[RandomCutmix()]
-    ))
-    inference: Inference = field(default_factory=lambda: Inference(
-        transforms=[Resize(size=[256, 256])]
-    ))
+    train: Optional[List] = field(default_factory=lambda: [
+        RandomResizedCrop(size=256),
+        RandomHorizontalFlip(),
+        Mixing(mixup=[0.25, 1.0])
+    ])
+    inference: Optional[List] = field(default_factory=lambda: [
+        Resize(size=[256, 256])
+    ])
 
 
 @dataclass
 class SegmentationAugmentationConfig(AugmentationConfig):
     img_size: int = 512
-    train: Train = field(default_factory=lambda: Train(
-        transforms=[RandomResizedCrop(size=512), RandomHorizontalFlip(), ColorJitter()],
-        mix_transforms=None
-    ))
-    inference: Inference = field(default_factory=lambda: Inference(
-        transforms=[Resize(size=[512, 512])]
-    ))
+    train: Optional[List] = field(default_factory=lambda: [
+        RandomResizedCrop(size=512),
+        RandomHorizontalFlip(),
+        ColorJitter()
+    ])
+    inference: Optional[List] = field(default_factory=lambda: [
+        Resize(size=[512, 512])
+    ])
 
 
 @dataclass
 class DetectionAugmentationConfig(AugmentationConfig):
     img_size: int = 512
-    train: Train = field(default_factory=lambda: Train(
-        transforms=[Resize(size=[512, 512])],
-        mix_transforms=None
-    ))
-    inference: Inference = field(default_factory=lambda: Inference(
-        transforms=[Resize(size=[512, 512])],
-    ))
+    train: Optional[List] = field(default_factory=lambda: [
+        Resize(size=[512, 512])
+    ])
+    inference: Optional[List] = field(default_factory=lambda: [
+        Resize(size=[512, 512])
+    ])
diff --git a/netspresso/trainer/models/__init__.py b/netspresso/trainer/models/__init__.py
index 581fc87d..78c6e932 100644
--- a/netspresso/trainer/models/__init__.py
+++ b/netspresso/trainer/models/__init__.py
@@ -1,37 +1,54 @@
-from netspresso.trainer.models.model import (
-    CheckpointConfig,
+from netspresso.trainer.models.base import CheckpointConfig, ModelConfig
+from netspresso.trainer.models.efficientformer import (
     ClassificationEfficientFormerModelConfig,
+    DetectionEfficientFormerModelConfig,
+    SegmentationEfficientFormerModelConfig,
+)
+from netspresso.trainer.models.mixnet import (
     ClassificationMixNetLargeModelConfig,
     ClassificationMixNetMediumModelConfig,
     ClassificationMixNetSmallModelConfig,
-    ClassificationMobileNetV3ModelConfig,
-    ClassificationMobileViTModelConfig,
-    ClassificationResNetModelConfig,
-    ClassificationViTModelConfig,
-    DetectionEfficientFormerModelConfig,
     DetectionMixNetLargeModelConfig,
     DetectionMixNetMediumModelConfig,
     DetectionMixNetSmallModelConfig,
-    DetectionMobileNetV3ModelConfig,
-    DetectionResNetModelConfig,
-    DetectionYoloXModelConfig,
-    ModelConfig,
-    PIDNetModelConfig,
-    SegmentationEfficientFormerModelConfig,
     SegmentationMixNetLargeModelConfig,
     SegmentationMixNetMediumModelConfig,
     SegmentationMixNetSmallModelConfig,
-    SegmentationMobileNetV3ModelConfig,
-    SegmentationResNetModelConfig,
-    SegmentationSegFormerModelConfig,
+)
+from netspresso.trainer.models.mobilenetv3 import (
+    ClassificationMobileNetV3LargeModelConfig,
+    ClassificationMobileNetV3SmallModelConfig,
+    DetectionMobileNetV3SmallModelConfig,
+    SegmentationMobileNetV3SmallModelConfig,
+)
+from netspresso.trainer.models.mobilevit import ClassificationMobileViTModelConfig
+from netspresso.trainer.models.pidnet import PIDNetModelConfig
+from netspresso.trainer.models.resnet import (
+    ClassificationResNet18ModelConfig,
+    ClassificationResNet34ModelConfig,
+    ClassificationResNet50ModelConfig,
+    DetectionResNet50ModelConfig,
+    SegmentationResNet50ModelConfig,
+)
+from netspresso.trainer.models.rtmpose import PoseEstimationMobileNetV3SmallModelConfig
+from netspresso.trainer.models.segformer import SegmentationSegFormerB0ModelConfig
+from netspresso.trainer.models.vit import ClassificationViTTinyModelConfig
+from netspresso.trainer.models.yolox import (
+    DetectionYoloXLModelConfig,
+    DetectionYoloXMModelConfig,
+    DetectionYoloXSModelConfig,
+    DetectionYoloXXModelConfig,
 )
 
 CLASSIFICATION_MODELS = {
     "EfficientFormer": ClassificationEfficientFormerModelConfig,
-    "MobileNetV3": ClassificationMobileNetV3ModelConfig,
+    "MobileNetV3_Small": ClassificationMobileNetV3SmallModelConfig,
+    "MobileNetV3_Large": ClassificationMobileNetV3LargeModelConfig,
     "MobileViT": ClassificationMobileViTModelConfig,
-    "ResNet": ClassificationResNetModelConfig,
-    "ViT": ClassificationViTModelConfig,
+    "ResNet18": ClassificationResNet18ModelConfig,
+    "ResNet34": ClassificationResNet34ModelConfig,
+    "ResNet50": ClassificationResNet50ModelConfig,
+    "ViT_Tiny": ClassificationViTTinyModelConfig,
     "MixNetS": ClassificationMixNetSmallModelConfig,
     "MixNetM": ClassificationMixNetMediumModelConfig,
     "MixNetL": ClassificationMixNetLargeModelConfig,
@@ -39,9 +56,12 @@
 
 DETECTION_MODELS = {
     "EfficientFormer": DetectionEfficientFormerModelConfig,
-    "YOLOX-S": DetectionYoloXModelConfig,
-    "ResNet": DetectionResNetModelConfig,
-    "MobileNetV3": DetectionMobileNetV3ModelConfig,
+    "MobileNetV3_Small": DetectionMobileNetV3SmallModelConfig,
+    "YOLOX-S": DetectionYoloXSModelConfig,
+    "YOLOX-M": DetectionYoloXMModelConfig,
+    "YOLOX-L": DetectionYoloXLModelConfig,
+    "YOLOX-X": DetectionYoloXXModelConfig,
+    "ResNet50": DetectionResNet50ModelConfig,
     "MixNetL": DetectionMixNetLargeModelConfig,
     "MixNetM": DetectionMixNetMediumModelConfig,
     "MixNetS": DetectionMixNetSmallModelConfig,
@@ -49,15 +69,19 @@
 
 SEGMENTATION_MODELS = {
     "EfficientFormer": SegmentationEfficientFormerModelConfig,
-    "MobileNetV3": SegmentationMobileNetV3ModelConfig,
-    "ResNet": SegmentationResNetModelConfig,
-    "SegFormer": SegmentationSegFormerModelConfig,
+    "MobileNetV3_Small": SegmentationMobileNetV3SmallModelConfig,
+    "ResNet50": SegmentationResNet50ModelConfig,
+    "SegFormer-B0": SegmentationSegFormerB0ModelConfig,
     "MixNetS": SegmentationMixNetSmallModelConfig,
     "MixNetM": SegmentationMixNetMediumModelConfig,
     "MixNetL": SegmentationMixNetLargeModelConfig,
     "PIDNet": PIDNetModelConfig,
 }
 
+POSEESTIMATION_MODELS = {
+    "MobileNetV3_Small": PoseEstimationMobileNetV3SmallModelConfig,
+}
+
 
 __all__ = [
     "CLASSIFICATION_MODELS",
diff --git a/netspresso/trainer/models/base.py b/netspresso/trainer/models/base.py
new file mode 100644
index 00000000..ae4242d4
--- /dev/null
+++ b/netspresso/trainer/models/base.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from omegaconf import MISSING
+
+
+@dataclass
+class ArchitectureConfig:  # Optional per-part config dicts; __post_init__ requires exactly one of full/backbone to be truthy.
+    full: Optional[Dict[str, Any]] = None
+    backbone: Optional[Dict[str, Any]] = None
+    neck: Optional[Dict[str, Any]] = None
+    head: Optional[Dict[str, Any]] = None
+
+    def __post_init__(self):  # runs after the dataclass-generated __init__
+        assert bool(self.full) != bool(self.backbone), "Only one of full or backbone should be given."  # truthiness XOR: fails when both or neither are set (None / empty dict count as unset); not checked under `python -O`
+
+
+@dataclass
+class CheckpointConfig:
+    use_pretrained: bool = True
+    load_head: bool = False
+    path: Optional[Union[Path, str]] = None
+    fx_model_path: Optional[Union[Path, str]] = None
+    optimizer_path: Optional[Union[Path, str]] = None
+
+
+@dataclass
+class ModelConfig:  # Base model config; `task` and `name` default to the omegaconf MISSING placeholder.
+    task: str = MISSING
+    name: str = MISSING
+    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig())
+    freeze_backbone: bool = False
+    architecture: ArchitectureConfig = field(default_factory=lambda: ArchitectureConfig())  # NOTE(review): a bare ArchitectureConfig() has full=None and backbone=None, which fails its __post_init__ assert; subclasses in this patch override this field
+    losses: Optional[List[Dict[str, Any]]] = None
diff --git a/netspresso/trainer/models/efficientformer.py b/netspresso/trainer/models/efficientformer.py
new file mode 100644
index 00000000..2ddd780c
--- /dev/null
+++ b/netspresso/trainer/models/efficientformer.py
@@ -0,0 +1,125 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class EfficientFormerArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "efficientformer",
+            "params": {
+                "num_attention_heads": 8,
+                "attention_channels": 256,
+                "attention_dropout_prob": 0.0,
+                "attention_value_expansion_ratio": 4,
+                "ffn_intermediate_ratio": 4,
+                "ffn_dropout_prob": 0.0,
+                "ffn_act_type": "gelu",
+                "vit_num": 1,
+            },
+            "stage_params": [
+                {"num_blocks": 3, "channels": 48},
+                {"num_blocks": 2, "channels": 96},
+                {"num_blocks": 6, "channels": 224},
+                {"num_blocks": 4, "channels": 448},
+            ],
+        }
+    )
+
+
+@dataclass
+class ClassificationEfficientFormerModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "efficientformer_l1"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: EfficientFormerArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationEfficientFormerModelConfig(ModelConfig):
+    task: str = "segmentation"
+    name: str = "efficientformer_l1"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: EfficientFormerArchitectureConfig(
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionEfficientFormerModelConfig(ModelConfig):
+    task: str = "detection"
+    name: str = "efficientformer_l1"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: EfficientFormerArchitectureConfig(
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
diff --git a/netspresso/trainer/models/mixnet.py b/netspresso/trainer/models/mixnet.py
new file mode 100644
index 00000000..6f74d4a7
--- /dev/null
+++ b/netspresso/trainer/models/mixnet.py
@@ -0,0 +1,475 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class MixNetSmallArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "mixnet",
+            "params": {
+                "stem_channels": 16,
+                "wid_mul": 1.0,
+                "dep_mul": 1.0,
+                "dropout_rate": 0.0,
+            },
+            "stage_params": [
+                {
+                    "expansion_ratio": [1, 6, 3],
+                    "out_channels": [16, 24, 24],
+                    "num_blocks": [1, 1, 1],
+                    "kernel_sizes": [[3], [3], [3]],
+                    "num_exp_groups": [1, 2, 2],
+                    "num_poi_groups": [1, 2, 2],
+                    "stride": [1, 2, 1],
+                    "act_type": ["relu", "relu", "relu"],
+                    "se_reduction_ratio": [None, None, None],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [40, 40],
+                    "num_blocks": [1, 3],
+                    "kernel_sizes": [[3, 5, 7], [3, 5]],
+                    "num_exp_groups": [1, 2],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6, 6, 3],
+                    "out_channels": [80, 80, 120, 120],
+                    "num_blocks": [1, 2, 1, 2],
+                    "kernel_sizes": [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 1, 2, 2],
+                    "num_poi_groups": [2, 2, 2, 2],
+                    "stride": [2, 1, 1, 1],
+                    "act_type": ["swish", "swish", "swish", "swish"],
+                    "se_reduction_ratio": [4, 4, 2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [200, 200],
+                    "num_blocks": [1, 2],
+                    "kernel_sizes": [[3, 5, 7, 9, 11], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 1],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class MixNetMediumArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "mixnet",
+            "params": {
+                "stem_channels": 24,
+                "wid_mul": 1.0,
+                "dep_mul": 1.0,
+                "dropout_rate": 0.0,
+            },
+            "stage_params": [
+                {
+                    "expansion_ratio": [1, 6, 3],
+                    "out_channels": [24, 32, 32],
+                    "num_blocks": [1, 1, 1],
+                    "kernel_sizes": [[3], [3, 5, 7], [3]],
+                    "num_exp_groups": [1, 2, 2],
+                    "num_poi_groups": [1, 2, 2],
+                    "stride": [1, 2, 1],
+                    "act_type": ["relu", "relu", "relu"],
+                    "se_reduction_ratio": [None, None, None],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [40, 40],
+                    "num_blocks": [1, 3],
+                    "kernel_sizes": [[3, 5, 7, 9], [3, 5]],
+                    "num_exp_groups": [1, 2],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6, 6, 3],
+                    "out_channels": [80, 80, 120, 120],
+                    "num_blocks": [1, 3, 1, 3],
+                    "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 2, 1, 2],
+                    "num_poi_groups": [1, 2, 1, 2],
+                    "stride": [2, 1, 1, 1],
+                    "act_type": ["swish", "swish", "swish", "swish"],
+                    "se_reduction_ratio": [4, 4, 2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [200, 200],
+                    "num_blocks": [1, 3],
+                    "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 1],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class MixNetLargeArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "mixnet",
+            "params": {
+                "stem_channels": 24,
+                "wid_mul": 1.3,
+                "dep_mul": 1.0,
+                "dropout_rate": 0.0,
+            },
+            "stage_params": [
+                {
+                    "expansion_ratio": [1, 6, 3],
+                    "out_channels": [24, 32, 32],
+                    "num_blocks": [1, 1, 1],
+                    "kernel_sizes": [[3], [3, 5, 7], [3]],
+                    "num_exp_groups": [1, 2, 2],
+                    "num_poi_groups": [1, 2, 2],
+                    "stride": [1, 2, 1],
+                    "act_type": ["relu", "relu", "relu"],
+                    "se_reduction_ratio": [None, None, None],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [40, 40],
+                    "num_blocks": [1, 3],
+                    "kernel_sizes": [[3, 5, 7, 9], [3, 5]],
+                    "num_exp_groups": [1, 2],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6, 6, 3],
+                    "out_channels": [80, 80, 120, 120],
+                    "num_blocks": [1, 3, 1, 3],
+                    "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 2, 1, 2],
+                    "num_poi_groups": [1, 2, 1, 2],
+                    "stride": [2, 1, 1, 1],
+                    "act_type": ["swish", "swish", "swish", "swish"],
+                    "se_reduction_ratio": [4, 4, 2, 2],
+                },
+                {
+                    "expansion_ratio": [6, 6],
+                    "out_channels": [200, 200],
+                    "num_blocks": [1, 3],
+                    "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]],
+                    "num_exp_groups": [1, 1],
+                    "num_poi_groups": [1, 2],
+                    "stride": [2, 1],
+                    "act_type": ["swish", "swish"],
+                    "se_reduction_ratio": [2, 2],
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class ClassificationMixNetSmallModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "mixnet_s"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetSmallArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationMixNetSmallModelConfig(ModelConfig):
+    task: str = "segmentation"
+    name: str = "mixnet_s"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetSmallArchitectureConfig(
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionMixNetSmallModelConfig(ModelConfig):
+    task: str = "detection"
+    name: str = "mixnet_s"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetSmallArchitectureConfig(
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
+
+
+@dataclass
+class ClassificationMixNetMediumModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "mixnet_m"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetMediumArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationMixNetMediumModelConfig(ModelConfig):  # segmentation preset named "mixnet_m"
+    task: str = "segmentation"
+    name: str = "mixnet_m"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetMediumArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "seg_cross_entropy" with ignore_index 255
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionMixNetMediumModelConfig(ModelConfig):  # detection preset named "mixnet_m": fpn neck + anchor head
+    task: str = "detection"
+    name: str = "mixnet_m"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetMediumArchitectureConfig(  # factory builds a fresh config per instance
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,  # same count as the anchor_sizes entries below; presumably linked - TODO confirm
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [  # four single-element lists: 32, 64, 128, 256
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "retinanet_loss"
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
+
+
+@dataclass
+class ClassificationMixNetLargeModelConfig(ModelConfig):  # classification preset named "mixnet_l"
+    task: str = "classification"
+    name: str = "mixnet_l"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetLargeArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,  # presumably unused with a single layer - TODO confirm
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: cross-entropy with label smoothing 0.1
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationMixNetLargeModelConfig(ModelConfig):  # segmentation preset named "mixnet_l"
+    task: str = "segmentation"
+    name: str = "mixnet_l"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetLargeArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "seg_cross_entropy" with ignore_index 255
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionMixNetLargeModelConfig(ModelConfig):  # detection preset named "mixnet_l": fpn neck + anchor head
+    task: str = "detection"
+    name: str = "mixnet_l"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MixNetLargeArchitectureConfig(  # factory builds a fresh config per instance
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,  # same count as the anchor_sizes entries below; presumably linked - TODO confirm
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [  # four single-element lists: 32, 64, 128, 256
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "retinanet_loss"
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
diff --git a/netspresso/trainer/models/mobilenetv3.py b/netspresso/trainer/models/mobilenetv3.py
new file mode 100644
index 00000000..814b5588
--- /dev/null
+++ b/netspresso/trainer/models/mobilenetv3.py
@@ -0,0 +1,218 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class MobileNetV3SmallArchitectureConfig(ArchitectureConfig):  # default "mobilenetv3" backbone, 4 stages
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so each instance gets its own dict
+            "name": "mobilenetv3",
+            "params": None,
+            "stage_params": [  # per stage: equal-length lists (1, 2, 5 and 3 entries respectively)
+                {
+                    "in_channels": [16],
+                    "kernel_sizes": [3],
+                    "expanded_channels": [16],
+                    "out_channels": [16],
+                    "use_se": [True],
+                    "act_type": ["relu"],
+                    "stride": [2],
+                },
+                {
+                    "in_channels": [16, 24],
+                    "kernel_sizes": [3, 3],
+                    "expanded_channels": [72, 88],
+                    "out_channels": [24, 24],
+                    "use_se": [False, False],
+                    "act_type": ["relu", "relu"],
+                    "stride": [2, 1],
+                },
+                {
+                    "in_channels": [24, 40, 40, 40, 48],  # each entry equals the previous out_channels entry
+                    "kernel_sizes": [5, 5, 5, 5, 5],
+                    "expanded_channels": [96, 240, 240, 120, 144],
+                    "out_channels": [40, 40, 40, 48, 48],
+                    "use_se": [True, True, True, True, True],
+                    "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"],
+                    "stride": [2, 1, 1, 1, 1],
+                },
+                {
+                    "in_channels": [48, 96, 96],
+                    "kernel_sizes": [5, 5, 5],
+                    "expanded_channels": [288, 576, 576],
+                    "out_channels": [96, 96, 96],
+                    "use_se": [True, True, True],
+                    "act_type": ["hard_swish", "hard_swish", "hard_swish"],
+                    "stride": [2, 1, 1],
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class MobileNetV3LargeArchitectureConfig(ArchitectureConfig):  # default "mobilenetv3" backbone, 4 stages
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so each instance gets its own dict
+            "name": "mobilenetv3",
+            "params": None,
+            "stage_params": [  # per stage: equal-length lists (3, 3, 6 and 3 entries respectively)
+                {
+                    "in_channels": [16, 16, 24],  # each entry equals the previous out_channels entry
+                    "kernel_sizes": [3, 3, 3],
+                    "expanded_channels": [16, 64, 72],
+                    "out_channels": [16, 24, 24],
+                    "use_se": [False, False, False],
+                    "act_type": ["relu", "relu", "relu"],
+                    "stride": [1, 2, 1],
+                },
+                {
+                    "in_channels": [24, 40, 40],
+                    "kernel_sizes": [5, 5, 5],
+                    "expanded_channels": [72, 120, 120],
+                    "out_channels": [40, 40, 40],
+                    "use_se": [True, True, True],
+                    "act_type": ["relu", "relu", "relu"],
+                    "stride": [2, 1, 1],
+                },
+                {
+                    "in_channels": [40, 80, 80, 80, 80, 112],
+                    "kernel_sizes": [3, 3, 3, 3, 3, 3],
+                    "expanded_channels": [240, 200, 184, 184, 480, 672],
+                    "out_channels": [80, 80, 80, 80, 112, 112],
+                    "use_se": [False, False, False, False, True, True],
+                    "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"],
+                    "stride": [2, 1, 1, 1, 1, 1],
+                },
+                {
+                    "in_channels": [112, 160, 160],
+                    "kernel_sizes": [5, 5, 5],
+                    "expanded_channels": [672, 960, 960],
+                    "out_channels": [160, 160, 160],
+                    "use_se": [True, True, True],
+                    "act_type": ["hard_swish", "hard_swish", "hard_swish"],
+                    "stride": [2, 1, 1],
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class ClassificationMobileNetV3LargeModelConfig(ModelConfig):  # classification preset "mobilenet_v3_large"
+    task: str = "classification"
+    name: str = "mobilenet_v3_large"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileNetV3LargeArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 2,  # the only "fc" head in this view with more than one layer
+                    "intermediate_channels": 1200,
+                    "act_type": "hard_swish",
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: cross-entropy with label smoothing 0.1
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class ClassificationMobileNetV3SmallModelConfig(ModelConfig):  # classification preset "mobilenet_v3_small"
+    task: str = "classification"
+    name: str = "mobilenet_v3_small"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileNetV3SmallArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,  # presumably unused with a single layer - TODO confirm
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: cross-entropy with label smoothing 0.1
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationMobileNetV3SmallModelConfig(ModelConfig):  # segmentation preset "mobilenet_v3_small"
+    task: str = "segmentation"
+    name: str = "mobilenet_v3_small"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileNetV3SmallArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "seg_cross_entropy" with ignore_index 255
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionMobileNetV3SmallModelConfig(ModelConfig):  # detection preset "mobilenet_v3_small": fpn + anchor head
+    task: str = "detection"
+    name: str = "mobilenet_v3_small"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileNetV3SmallArchitectureConfig(  # factory builds a fresh config per instance
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,  # same count as the anchor_sizes entries below; presumably linked - TODO confirm
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [  # four single-element lists: 32, 64, 128, 256
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: "retinanet_loss"
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
diff --git a/netspresso/trainer/models/mobilevit.py b/netspresso/trainer/models/mobilevit.py
new file mode 100644
index 00000000..03361477
--- /dev/null
+++ b/netspresso/trainer/models/mobilevit.py
@@ -0,0 +1,89 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class MobileViTArchitectureConfig(ArchitectureConfig):  # default "mobilevit" backbone, 5 stages
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so each instance gets its own dict
+            "name": "mobilevit",
+            "params": {
+                "patch_size": 2,
+                "num_attention_heads": 4,
+                "attention_dropout_prob": 0.1,
+                "ffn_dropout_prob": 0.0,
+                "output_expansion_ratio": 4,
+                "use_fusion_layer": True,
+            },
+            "stage_params": [  # two "mv2" stages followed by three "mobilevit" stages
+                {
+                    "out_channels": 32,
+                    "block_type": "mv2",
+                    "num_blocks": 1,
+                    "stride": 1,
+                    "ir_expansion_ratio": 4,  # [mv2_exp_mult] * 4
+                },
+                {
+                    "block_type": "mv2",
+                    "out_channels": 64,
+                    "num_blocks": 3,
+                    "stride": 2,
+                    "ir_expansion_ratio": 4,  # [mv2_exp_mult] * 4
+                },
+                {
+                    "block_type": "mobilevit",
+                    "out_channels": 96,
+                    "num_blocks": 2,
+                    "stride": 2,
+                    "hidden_size": 144,
+                    "intermediate_size": 288,  # 2 x hidden_size, as in every "mobilevit" stage here
+                    "dilate": False,
+                    "ir_expansion_ratio": 4,  # [mv2_exp_mult] * 4
+                },
+                {
+                    "block_type": "mobilevit",
+                    "out_channels": 128,
+                    "num_blocks": 4,
+                    "stride": 2,
+                    "hidden_size": 192,
+                    "intermediate_size": 384,
+                    "dilate": False,
+                    "ir_expansion_ratio": 4,  # [mv2_exp_mult] * 4
+                },
+                {
+                    "block_type": "mobilevit",
+                    "out_channels": 160,
+                    "num_blocks": 3,
+                    "stride": 2,
+                    "hidden_size": 240,
+                    "intermediate_size": 480,
+                    "dilate": False,
+                    "ir_expansion_ratio": 4,  # [mv2_exp_mult] * 4
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class ClassificationMobileViTModelConfig(ModelConfig):  # classification preset named "mobilevit_s"
+    task: str = "classification"
+    name: str = "mobilevit_s"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileViTArchitectureConfig(  # factory builds a fresh config per instance
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,  # presumably unused with a single layer - TODO confirm
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(  # one loss entry: cross-entropy with label smoothing 0.1
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
diff --git a/netspresso/trainer/models/model.py b/netspresso/trainer/models/model.py
deleted file mode 100644
index f30d7982..00000000
--- a/netspresso/trainer/models/model.py
+++ /dev/null
@@ -1,1029 +0,0 @@
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
-
-from omegaconf import MISSING, MissingMandatoryValue
-
-__all__ = [
-    "ModelConfig",
-    "ClassificationEfficientFormerModelConfig",
-    "SegmentationEfficientFormerModelConfig",
-    "DetectionEfficientFormerModelConfig",
-    "ClassificationMobileNetV3ModelConfig",
-    "SegmentationMobileNetV3ModelConfig",
-    "DetectionMobileNetV3ModelConfig",
-    "ClassificationMobileViTModelConfig",
-    "PIDNetModelConfig",
-    "ClassificationResNetModelConfig",
-    "SegmentationResNetModelConfig",
-    "DetectionResNetModelConfig",
-    "SegmentationSegFormerModelConfig",
-    "ClassificationViTModelConfig",
-    "DetectionYoloXModelConfig",
-    "ClassificationMixNetSmallModelConfig",
-    "ClassificationMixNetMediumModelConfig",
-    "ClassificationMixNetLargeModelConfig",
-    "SegmentationMixNetSmallModelConfig",
-    "SegmentationMixNetMediumModelConfig",
-    "SegmentationMixNetLargeModelConfig",
-    "DetectionMixNetSmallModelConfig",
-    "DetectionMixNetMediumModelConfig",
-    "DetectionMixNetLargeModelConfig",
-]
-
-
-@dataclass
-class ArchitectureConfig:
-    full: Optional[Dict[str, Any]] = None
-    backbone: Optional[Dict[str, Any]] = None
-    neck: Optional[Dict[str, Any]] = None
-    head: Optional[Dict[str, Any]] = None
-
-    def __post_init__(self):
-        assert bool(self.full) != bool(self.backbone), "Only one of full or backbone should be given."
-
-@dataclass
-class CheckpointConfig:
-    use_pretrained: bool = True
-    load_head: bool = False
-    path: Optional[Union[Path, str]] = None
-    fx_model_path: Optional[Union[Path, str]] = None
-    optimizer_path: Optional[Union[Path, str]] = None
-
-@dataclass
-class ModelConfig:
-    task: str = MISSING
-    name: str = MISSING
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig())
-    load_checkpoint_head: bool = False
-    fx_model_checkpoint: Optional[Union[Path, str]] = None
-    resume_optimizer_checkpoint: Optional[Union[Path, str]] = None
-    freeze_backbone: bool = False
-    architecture: ArchitectureConfig = field(default_factory=lambda: ArchitectureConfig())
-    losses: Optional[List[Dict[str, Any]]] = None
-
-
-@dataclass
-class EfficientFormerArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "efficientformer",
-        "params": {
-            "num_attention_heads": 8,
-            "attention_channels": 256,
-            "attention_dropout_prob": 0.,
-            "attention_value_expansion_ratio": 4,
-            "ffn_intermediate_ratio": 4,
-            "ffn_dropout_prob": 0.,
-            "ffn_act_type": 'gelu',
-            "vit_num": 1,
-        },
-        "stage_params": [
-            {"num_blocks": 3, "channels": 48},
-            {"num_blocks": 2, "channels": 96},
-            {"num_blocks": 6, "channels": 224},
-            {"num_blocks": 4, "channels": 448},
-        ],
-    })
-
-
-@dataclass
-class MobileNetV3ArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mobilenetv3",
-        "params": None,
-        "stage_params": [
-            {
-                "in_channels": [16],
-                "kernel_sizes": [3],
-                "expanded_channels": [16],
-                "out_channels": [16],
-                "use_se": [True],
-                "act_type": ["relu"],
-                "stride": [2],
-            },
-            {
-                "in_channels": [16, 24],
-                "kernel_sizes": [3, 3],
-                "expanded_channels": [72, 88],
-                "out_channels": [24, 24],
-                "use_se": [False, False],
-                "act_type": ["relu", "relu"],
-                "stride": [2, 1],
-            },
-            {
-                "in_channels": [24, 40, 40, 40, 48],
-                "kernel_sizes": [5, 5, 5, 5, 5],
-                "expanded_channels": [96, 240, 240, 120, 144],
-                "out_channels": [40, 40, 40, 48, 48],
-                "use_se": [True, True, True, True, True],
-                "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"],
-                "stride": [2, 1, 1, 1, 1],
-            },
-            {
-                "in_channels": [48, 96, 96],
-                "kernel_sizes": [5, 5, 5],
-                "expanded_channels": [288, 576, 576],
-                "out_channels": [96, 96, 96],
-                "use_se": [True, True, True],
-                "act_type": ["hard_swish", "hard_swish", "hard_swish"],
-                "stride": [2, 1, 1],
-            },
-        ],
-    })
-
-
-@dataclass
-class MobileViTArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mobilevit",
-        "params": {
-            "patch_size": 2,
-            "num_attention_heads": 4,
-            "attention_dropout_prob": 0.1,
-            "ffn_dropout_prob": 0.0,
-            "output_expansion_ratio": 4,
-            "use_fusion_layer": True,
-        },
-        "stage_params": [
-            {
-                "out_channels": 32,
-                "block_type": "mv2",
-                "num_blocks": 1,
-                "stride": 1,
-                "ir_expansion_ratio": 4,
-            },
-            {
-                "out_channels": 64,
-                "block_type": "mv2",
-                "num_blocks": 3,
-                "stride": 2,
-                "ir_expansion_ratio": 4,
-            },
-            {
-                "out_channels": 96,
-                "block_type": "mobilevit",
-                "num_blocks": 2,
-                "stride": 2,
-                "hidden_size": 144,
-                "intermediate_size": 288,
-                "dilate": False,
-                "ir_expansion_ratio": 4,
-            },
-            {
-                "out_channels": 128,
-                "block_type": "mobilevit",
-                "num_blocks": 4,
-                "stride": 2,
-                "hidden_size": 192,
-                "intermediate_size": 384,
-                "dilate": False,
-                "ir_expansion_ratio": 4,
-            },
-            {
-                "out_channels": 160,
-                "block_type": "mobilevit",
-                "num_blocks": 3,
-                "stride": 2,
-                "hidden_size": 240,
-                "intermediate_size": 480,
-                "dilate": False,
-                "ir_expansion_ratio": 4,
-            },
-        ]
-    })
-
-
-@dataclass
-class PIDNetArchitectureConfig(ArchitectureConfig):
-    full: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "pidnet",
-        "m": 2,
-        "n": 3,
-        "channels": 32,
-        "ppm_channels": 96,
-        "head_channels": 128,
-    })
-
-
-@dataclass
-class ResNetArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "resnet",
-        "params": {
-            "block_type": "bottleneck",
-            "norm_type": "batch_norm",
-        },
-        "stage_params": [
-            {"channels": 64, "num_blocks": 3},
-            {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False},
-            {"channels": 256, "num_blocks": 6, "replace_stride_with_dilation": False},
-            {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False},
-        ],
-    })
-
-
-@dataclass
-class SegFormerArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mixtransformer",
-        "params": {
-            "ffn_intermediate_expansion_ratio": 4,
-            "ffn_act_type": "gelu",
-            "ffn_dropout_prob": 0.0,
-            "attention_dropout_prob": 0.0,
-        },
-        "stage_params": [
-            {
-                "num_blocks": 2,
-                "sequence_reduction_ratio": 8,
-                "attention_chananels": 32,
-                "embedding_patch_sizes": 7,
-                "embedding_strides": 4,
-                "num_attention_heads": 1,
-            },
-            {
-                "num_blocks": 2,
-                "sequence_reduction_ratio": 4,
-                "attention_chananels": 64,
-                "embedding_patch_sizes": 3,
-                "num_attention_heads": 2,
-            },
-            {
-                "num_blocks": 2,
-                "sequence_reduction_ratio": 2,
-                "attention_chananels": 160,
-                "embedding_patch_sizes": 3,
-                "embedding_strides": 2,
-                "num_attention_heads": 5,
-            },
-            {
-                "num_blocks": 2,
-                "sequence_reduction_ratio": 1,
-                "attention_chananels": 256,
-                "embedding_patch_sizes": 3,
-                "embedding_strides": 2,
-                "num_attention_heads": 8,
-            },
-        ],
-    })
-
-
-@dataclass
-class ViTArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "vit",
-        "params": {
-            "patch_size": 16,
-            "attention_channels": 192,
-            "num_blocks": 12,
-            "num_attention_heads": 3,
-            "attention_dropout_prob": 0.0,
-            "ffn_intermediate_channels": 768,
-            "ffn_dropout_prob": 0.1,
-            "use_cls_token": True,
-            "vocab_size": 1000,
-        },
-        "stage_params": None,
-    })
-
-
-@dataclass
-class MixNetSmallArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mixnet",
-        "params": {
-            "stem_channels": 16,
-            "wid_mul": 1.0,
-            "dep_mul": 1.0,
-            "dropout_rate": 0.,
-        },
-        "stage_params":  [
-            {
-                "expansion_ratio": [1, 6, 3],
-                "out_channels": [16, 24, 24],
-                "num_blocks": [1, 1, 1],
-                "kernel_sizes": [[3], [3], [3]],
-                "num_exp_groups": [1, 2, 2],
-                "num_poi_groups": [1, 2, 2],
-                "stride": [1, 2, 1],
-                "act_type": ["relu", "relu", "relu"],
-                "se_reduction_ratio": [None, None, None],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [40, 40],
-                "num_blocks": [1, 3],
-                "kernel_sizes": [[3, 5, 7], [3, 5]],
-                "num_exp_groups": [1, 2],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6, 6, 3],
-                "out_channels": [80, 80, 120, 120],
-                "num_blocks": [1, 2, 1, 2],
-                "kernel_sizes": [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 1, 2, 2],
-                "num_poi_groups": [2, 2, 2, 2],
-                "stride": [2, 1, 1, 1],
-                "act_type": ["swish", "swish", "swish", "swish"],
-                "se_reduction_ratio": [4, 4, 2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [200, 200],
-                "num_blocks": [1, 2],
-                "kernel_sizes": [[3, 5, 7, 9, 11], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 1],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-        ],
-    })
-
-
-@dataclass
-class MixNetMediumArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mixnet",
-        "params": {
-            "stem_channels": 24,
-            "wid_mul": 1.0,
-            "dep_mul": 1.0,
-            "dropout_rate": 0.,
-        },
-        "stage_params":  [
-            {
-                "expansion_ratio": [1, 6, 3],
-                "out_channels": [24, 32, 32],
-                "num_blocks": [1, 1, 1],
-                "kernel_sizes": [[3], [3, 5, 7], [3]],
-                "num_exp_groups": [1, 2, 2],
-                "num_poi_groups": [1, 2, 2],
-                "stride": [1, 2, 1],
-                "act_type": ["relu", "relu", "relu"],
-                "se_reduction_ratio": [None, None, None],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [40, 40],
-                "num_blocks": [1, 3],
-                "kernel_sizes": [[3, 5, 7, 9], [3, 5]],
-                "num_exp_groups": [1, 2],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6, 6, 3],
-                "out_channels": [80, 80, 120, 120],
-                "num_blocks": [1, 3, 1, 3],
-                "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 2, 1, 2],
-                "num_poi_groups": [1, 2, 1, 2],
-                "stride": [2, 1, 1, 1],
-                "act_type": ["swish", "swish", "swish", "swish"],
-                "se_reduction_ratio": [4, 4, 2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [200, 200],
-                "num_blocks": [1, 3],
-                "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 1],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-        ],
-    })
-
-
-@dataclass
-class MixNetLargeArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "mixnet",
-        "params": {
-            "stem_channels": 24,
-            "wid_mul": 1.3,
-            "dep_mul": 1.0,
-            "dropout_rate": 0.,
-        },
-        "stage_params":  [
-            {
-                "expansion_ratio": [1, 6, 3],
-                "out_channels": [24, 32, 32],
-                "num_blocks": [1, 1, 1],
-                "kernel_sizes": [[3], [3, 5, 7], [3]],
-                "num_exp_groups": [1, 2, 2],
-                "num_poi_groups": [1, 2, 2],
-                "stride": [1, 2, 1],
-                "act_type": ["relu", "relu", "relu"],
-                "se_reduction_ratio": [None, None, None],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [40, 40],
-                "num_blocks": [1, 3],
-                "kernel_sizes": [[3, 5, 7, 9], [3, 5]],
-                "num_exp_groups": [1, 2],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6, 6, 3],
-                "out_channels": [80, 80, 120, 120],
-                "num_blocks": [1, 3, 1, 3],
-                "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 2, 1, 2],
-                "num_poi_groups": [1, 2, 1, 2],
-                "stride": [2, 1, 1, 1],
-                "act_type": ["swish", "swish", "swish", "swish"],
-                "se_reduction_ratio": [4, 4, 2, 2],
-            },
-            {
-                "expansion_ratio": [6, 6],
-                "out_channels": [200, 200],
-                "num_blocks": [1, 3],
-                "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]],
-                "num_exp_groups": [1, 1],
-                "num_poi_groups": [1, 2],
-                "stride": [2, 1],
-                "act_type": ["swish", "swish"],
-                "se_reduction_ratio": [2, 2],
-            },
-        ],
-    })
-
-
-@dataclass
-class CSPDarkNetSmallArchitectureConfig(ArchitectureConfig):
-    backbone: Dict[str, Any] = field(default_factory=lambda: {
-        "name": "cspdarknet",
-        "params": {
-            "dep_mul": 0.33,
-            "wid_mul": 0.5,
-            "act_type": "silu",
-        },
-        "stage_params": None,
-    })
-
-
-@dataclass
-class ClassificationEfficientFormerModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "efficientformer_l1"
-    architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationEfficientFormerModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "efficientformer_l1"
-    architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionEfficientFormerModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "efficientformer_l1"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
-
-
-@dataclass
-class ClassificationMobileNetV3ModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "mobilenet_v3_small"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationMobileNetV3ModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "mobilenet_v3_small"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionMobileNetV3ModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "mobilenet_v3_small"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
-
-
-@dataclass
-class ClassificationMobileViTModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "mobilevit_s"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MobileViTArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class PIDNetModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "pidnet_s"
-    architecture: ArchitectureConfig = field(default_factory=lambda: PIDNetArchitectureConfig())
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "pidnet_loss", "ignore_index": 255, "weight": None},
-    ])
-
-
-@dataclass
-class ClassificationResNetModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "resnet50"
-    architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationResNetModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "resnet50"
-    architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionResNetModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "resnet50"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
-
-
-@dataclass
-class SegmentationSegFormerModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "segformer"
-    architecture: ArchitectureConfig = field(default_factory=lambda: SegFormerArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class ClassificationViTModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "vit_tiny"
-    architecture: ArchitectureConfig = field(default_factory=lambda: ViTArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionYoloXModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "yolox_s"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: CSPDarkNetSmallArchitectureConfig(
-        neck={
-            "name": "yolopafpn",
-            "params": {
-                "dep_mul": 0.33,
-                "act_type": "silu",
-            },
-        },
-        head={
-            "name": "anchor_free_decoupled_head",
-            "params": {
-                "act_type": "silu",
-                # postprocessor - decode
-                "score_thresh": 0.7,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "yolox_loss", "weight": None}
-    ])
-
-
-@dataclass
-class ClassificationMixNetSmallModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "mixnet_s"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationMixNetSmallModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "mixnet_s"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionMixNetSmallModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "mixnet_s"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
-
-
-@dataclass
-class ClassificationMixNetMediumModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "mixnet_m"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationMixNetMediumModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "mixnet_m"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionMixNetMediumModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "mixnet_m"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
-
-
-@dataclass
-class ClassificationMixNetLargeModelConfig(ModelConfig):
-    task: str = "classification"
-    name: str = "mixnet_l"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig(
-        head={
-            "name": "fc",
-            "params": {
-                "intermediate_channels": 1024,
-                "num_layers": 1,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}
-    ])
-
-
-@dataclass
-class SegmentationMixNetLargeModelConfig(ModelConfig):
-    task: str = "segmentation"
-    name: str = "mixnet_l"
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig(
-        head={
-            "name": "all_mlp_decoder",
-            "params": {
-                "intermediate_channels": 256,
-                "classifier_dropout_prob": 0.,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "cross_entropy", "ignore_index": 255, "weight": None}
-    ])
-
-
-@dataclass
-class DetectionMixNetLargeModelConfig(ModelConfig):
-    task: str = "detection"
-    name: str = "mixnet_l"
-    checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig(
-        load_head=True
-    ))
-    architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig(
-        neck={
-            "name": "fpn",
-            "params": {
-                "num_outs": 4,
-                "start_level": 0,
-                "end_level": -1,
-                "add_extra_convs": False,
-                "relu_before_extra_convs": False,
-            },
-        },
-        head={
-            "name": "anchor_decoupled_head",
-            "params": {
-                # Anchor parameters
-                "anchor_sizes": [[32,], [64,], [128,], [256,]],
-                "aspect_ratios": [0.5, 1.0, 2.0],
-                "num_layers": 1,
-                "norm_type": "batch_norm",
-                # postprocessor - decode
-                "topk_candidates": 1000,
-                "score_thresh": 0.05,
-                # postprocessor - nms
-                "nms_thresh": 0.45,
-                "class_agnostic": False,
-            }
-        }
-    ))
-    losses: List[Dict[str, Any]] = field(default_factory=lambda: [
-        {"criterion": "retinanet_loss", "weight": None},
-    ])
diff --git a/netspresso/trainer/models/pidnet.py b/netspresso/trainer/models/pidnet.py
new file mode 100644
index 00000000..534b5445
--- /dev/null
+++ b/netspresso/trainer/models/pidnet.py
@@ -0,0 +1,30 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig
+
+
+@dataclass
+class PIDNetArchitectureConfig(ArchitectureConfig):
+    full: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "pidnet",
+            "m": 2,
+            "n": 3,
+            "channels": 32,
+            "ppm_channels": 96,
+            "head_channels": 128,
+        }
+    )
+
+
+@dataclass
+class PIDNetModelConfig(ModelConfig):
+    task: str = "segmentation"
+    name: str = "pidnet_s"
+    architecture: ArchitectureConfig = field(default_factory=lambda: PIDNetArchitectureConfig())
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [
+            {"criterion": "pidnet_loss", "ignore_index": 255, "weight": None},
+        ]
+    )
diff --git a/netspresso/trainer/models/resnet.py b/netspresso/trainer/models/resnet.py
new file mode 100644
index 00000000..f9d62db3
--- /dev/null
+++ b/netspresso/trainer/models/resnet.py
@@ -0,0 +1,201 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class ResNet18ArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "resnet",
+            "params": {
+                "block_type": "basicblock",
+                "norm_type": "batch_norm",
+            },
+            "stage_params": [
+                {"channels": 64, "num_blocks": 2},
+                {"channels": 128, "num_blocks": 2, "replace_stride_with_dilation": False},
+                {"channels": 256, "num_blocks": 2, "replace_stride_with_dilation": False},
+                {"channels": 512, "num_blocks": 2, "replace_stride_with_dilation": False},
+            ],
+        }
+    )
+
+
+@dataclass
+class ResNet34ArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "resnet",
+            "params": {
+                "block_type": "basicblock",
+                "norm_type": "batch_norm",
+            },
+            "stage_params": [
+                {"channels": 64, "num_blocks": 3},
+                {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False},
+                {"channels": 256, "num_blocks": 6, "replace_stride_with_dilation": False},
+                {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False},
+            ],
+        }
+    )
+
+
+@dataclass
+class ResNet50ArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "resnet",
+            "params": {
+                "block_type": "bottleneck",
+                "norm_type": "batch_norm",
+            },
+            "stage_params": [
+                {"channels": 64, "num_blocks": 3},
+                {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False},
+                {"channels": 256, "num_blocks": 6, "replace_stride_with_dilation": False},
+                {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False},
+            ],
+        }
+    )
+
+
+@dataclass
+class ClassificationResNet18ModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "resnet18"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ResNet18ArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class ClassificationResNet34ModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "resnet34"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ResNet34ArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class ClassificationResNet50ModelConfig(ModelConfig):
+    task: str = "classification"
+    name: str = "resnet50"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ResNet50ArchitectureConfig(
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]
+    )
+
+
+@dataclass
+class SegmentationResNet50ModelConfig(ModelConfig):
+    task: str = "segmentation"
+    name: str = "resnet50"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ResNet50ArchitectureConfig(
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
+
+
+@dataclass
+class DetectionResNet50ModelConfig(ModelConfig):
+    task: str = "detection"
+    name: str = "resnet50"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ResNet50ArchitectureConfig(
+            neck={
+                "name": "fpn",
+                "params": {
+                    "num_outs": 4,
+                    "start_level": 0,
+                    "end_level": -1,
+                    "add_extra_convs": False,
+                    "relu_before_extra_convs": False,
+                },
+            },
+            head={
+                "name": "anchor_decoupled_head",
+                "params": {
+                    # Anchor parameters
+                    "anchor_sizes": [
+                        [
+                            32,
+                        ],
+                        [
+                            64,
+                        ],
+                        [
+                            128,
+                        ],
+                        [
+                            256,
+                        ],
+                    ],
+                    "aspect_ratios": [0.5, 1.0, 2.0],
+                    "num_layers": 1,
+                    "norm_type": "batch_norm",
+                    # postprocessor - decode
+                    "topk_candidates": 1000,
+                    "score_thresh": 0.05,
+                    # postprocessor - nms
+                    "nms_thresh": 0.45,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [
+            {"criterion": "retinanet_loss", "weight": None},
+        ]
+    )
diff --git a/netspresso/trainer/models/rtmpose.py b/netspresso/trainer/models/rtmpose.py
new file mode 100644
index 00000000..8f650b5a
--- /dev/null
+++ b/netspresso/trainer/models/rtmpose.py
@@ -0,0 +1,33 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig
+from netspresso.trainer.models.mobilenetv3 import MobileNetV3SmallArchitectureConfig
+
+
+@dataclass
+class PoseEstimationMobileNetV3SmallModelConfig(ModelConfig):
+    task: str = "pose_estimation"
+    name: str = "mobilenet_v3_small"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: MobileNetV3SmallArchitectureConfig(
+            head={
+                "name": "rtmcc",
+                "params": {
+                    "conv_kernel": 7,
+                    "attention_channels": 256,
+                    "attention_act_type": "silu",
+                    "attention_pos_enc": False,
+                    "s": 128,
+                    "expansion_factor": 2,
+                    "dropout_rate": 0.0,
+                    "drop_path": 0.0,
+                    "use_rel_bias": False,
+                    "simcc_split_ratio": 2.0,
+                    "target_size": [256, 256],
+                    "backbone_stride": 32,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(default_factory=lambda: [{"criterion": "rtmcc_loss", "weight": None}])
diff --git a/netspresso/trainer/models/segformer.py b/netspresso/trainer/models/segformer.py
new file mode 100644
index 00000000..57235a07
--- /dev/null
+++ b/netspresso/trainer/models/segformer.py
@@ -0,0 +1,73 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig
+
+
+@dataclass
+class SegFormerB0ArchitectureConfig(ArchitectureConfig):
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "name": "mixtransformer",
+            "params": {
+                "ffn_intermediate_expansion_ratio": 4,
+                "ffn_act_type": "gelu",
+                "ffn_dropout_prob": 0.0,
+                "attention_dropout_prob": 0.0,
+            },
+            "stage_params": [
+                {
+                    "num_blocks": 2,
+                    "sequence_reduction_ratio": 8,
+                    "attention_chananels": 32,
+                    "embedding_patch_sizes": 7,
+                    "embedding_strides": 4,
+                    "num_attention_heads": 1,
+                },
+                {
+                    "num_blocks": 2,
+                    "sequence_reduction_ratio": 4,
+                    "attention_chananels": 64,
+                    "embedding_patch_sizes": 3,
+                    "num_attention_heads": 2,
+                },
+                {
+                    "num_blocks": 2,
+                    "sequence_reduction_ratio": 2,
+                    "attention_chananels": 160,
+                    "embedding_patch_sizes": 3,
+                    "embedding_strides": 2,
+                    "num_attention_heads": 5,
+                },
+                {
+                    "num_blocks": 2,
+                    "sequence_reduction_ratio": 1,
+                    "attention_chananels": 256,
+                    "embedding_patch_sizes": 3,
+                    "embedding_strides": 2,
+                    "num_attention_heads": 8,
+                },
+            ],
+        }
+    )
+
+
+@dataclass
+class SegmentationSegFormerB0ModelConfig(ModelConfig):
+    task: str = "segmentation"
+    name: str = "segformer_b0"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: SegFormerB0ArchitectureConfig(
+            head={
+                "name": "all_mlp_decoder",
+                "params": {
+                    "intermediate_channels": 256,
+                    "classifier_dropout_prob": 0.0,
+                    "resize_output": [512, 512],
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}]
+    )
diff --git a/netspresso/trainer/models/vit.py b/netspresso/trainer/models/vit.py
new file mode 100644
index 00000000..1f605d49
--- /dev/null
+++ b/netspresso/trainer/models/vit.py
@@ -0,0 +1,47 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig
+
+
+@dataclass
+class ViTTinyArchitectureConfig(ArchitectureConfig):  # Backbone defaults for the "vit" backbone; the head is passed in by the caller.
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so every instance gets its own dict
+            "name": "vit",
+            "params": {
+                "patch_size": 16,
+                "attention_channels": 192,
+                "num_blocks": 12,
+                "num_attention_heads": 3,
+                "attention_dropout_prob": 0.0,
+                "ffn_intermediate_channels": 768,
+                "ffn_dropout_prob": 0.1,
+                "use_cls_token": True,
+                "vocab_size": 1000,  # NOTE(review): meaning for an image backbone is not visible here - confirm in netspresso_trainer
+            },
+            "stage_params": None,  # no per-stage parameter list is set for this backbone
+        }
+    )
+
+
+@dataclass
+class ClassificationViTTinyModelConfig(ModelConfig):  # Default classification model config named "vit_tiny".
+    task: str = "classification"
+    name: str = "vit_tiny"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: ViTTinyArchitectureConfig(  # backbone comes from the class defaults, head is given here
+            head={
+                "name": "fc",
+                "params": {
+                    "num_layers": 1,
+                    "intermediate_channels": None,
+                    "act_type": None,
+                    "dropout_prob": 0.0,
+                },
+            }
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}]  # one loss entry
+    )
diff --git a/netspresso/trainer/models/yolox.py b/netspresso/trainer/models/yolox.py
new file mode 100644
index 00000000..6b3f1a31
--- /dev/null
+++ b/netspresso/trainer/models/yolox.py
@@ -0,0 +1,188 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig
+
+
+@dataclass
+class CSPDarkNetXArchitectureConfig(ArchitectureConfig):  # "cspdarknet" backbone defaults used by DetectionYoloXXModelConfig.
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so every instance gets its own dict
+            "name": "cspdarknet",
+            "params": {
+                "dep_mul": 1.33,  # NOTE(review): presumably the depth multiplier - confirm in netspresso_trainer
+                "wid_mul": 1.25,  # NOTE(review): presumably the width multiplier - confirm in netspresso_trainer
+                "act_type": "silu",
+            },
+            "stage_params": None,  # no per-stage parameter list is set for this backbone
+        }
+    )
+
+
+@dataclass
+class CSPDarkNetLArchitectureConfig(ArchitectureConfig):  # "cspdarknet" backbone defaults used by DetectionYoloXLModelConfig.
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so every instance gets its own dict
+            "name": "cspdarknet",
+            "params": {
+                "dep_mul": 1.0,  # NOTE(review): presumably the depth multiplier - confirm in netspresso_trainer
+                "wid_mul": 1.0,  # NOTE(review): presumably the width multiplier - confirm in netspresso_trainer
+                "act_type": "silu",
+            },
+            "stage_params": None,  # no per-stage parameter list is set for this backbone
+        }
+    )
+
+
+@dataclass
+class CSPDarkNetMArchitectureConfig(ArchitectureConfig):  # "cspdarknet" backbone defaults used by DetectionYoloXMModelConfig.
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so every instance gets its own dict
+            "name": "cspdarknet",
+            "params": {
+                "dep_mul": 0.67,  # NOTE(review): presumably the depth multiplier - confirm in netspresso_trainer
+                "wid_mul": 0.75,  # NOTE(review): presumably the width multiplier - confirm in netspresso_trainer
+                "act_type": "silu",
+            },
+            "stage_params": None,  # no per-stage parameter list is set for this backbone
+        }
+    )
+
+
+@dataclass
+class CSPDarkNetSArchitectureConfig(ArchitectureConfig):  # "cspdarknet" backbone defaults used by DetectionYoloXSModelConfig.
+    backbone: Dict[str, Any] = field(
+        default_factory=lambda: {  # factory so every instance gets its own dict
+            "name": "cspdarknet",
+            "params": {
+                "dep_mul": 0.33,  # NOTE(review): presumably the depth multiplier - confirm in netspresso_trainer
+                "wid_mul": 0.5,  # NOTE(review): presumably the width multiplier - confirm in netspresso_trainer
+                "act_type": "silu",
+            },
+            "stage_params": None,  # no per-stage parameter list is set for this backbone
+        }
+    )
+
+
+@dataclass
+class DetectionYoloXXModelConfig(ModelConfig):  # Default detection model config named "yolox_x".
+    task: str = "detection"
+    name: str = "yolox_x"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: CSPDarkNetXArchitectureConfig(  # backbone from the class defaults; neck and head given here
+            neck={
+                "name": "yolopafpn",
+                "params": {
+                    "dep_mul": 1.33,  # same value as dep_mul in CSPDarkNetXArchitectureConfig's backbone
+                    "act_type": "silu",
+                },
+            },
+            head={
+                "name": "anchor_free_decoupled_head",
+                "params": {
+                    "act_type": "silu",
+                    # postprocessor - decode
+                    "score_thresh": 0.01,
+                    # postprocessor - nms
+                    "nms_thresh": 0.65,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}]  # one loss entry
+    )
+
+
+@dataclass
+class DetectionYoloXLModelConfig(ModelConfig):  # Default detection model config named "yolox_l".
+    task: str = "detection"
+    name: str = "yolox_l"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: CSPDarkNetLArchitectureConfig(  # backbone from the class defaults; neck and head given here
+            neck={
+                "name": "yolopafpn",
+                "params": {
+                    "dep_mul": 1.0,  # same value as dep_mul in CSPDarkNetLArchitectureConfig's backbone
+                    "act_type": "silu",
+                },
+            },
+            head={
+                "name": "anchor_free_decoupled_head",
+                "params": {
+                    "act_type": "silu",
+                    # postprocessor - decode
+                    "score_thresh": 0.01,
+                    # postprocessor - nms
+                    "nms_thresh": 0.65,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}]  # one loss entry
+    )
+
+
+@dataclass
+class DetectionYoloXMModelConfig(ModelConfig):  # Default detection model config named "yolox_m".
+    task: str = "detection"
+    name: str = "yolox_m"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: CSPDarkNetMArchitectureConfig(  # backbone from the class defaults; neck and head given here
+            neck={
+                "name": "yolopafpn",
+                "params": {
+                    "dep_mul": 0.67,  # same value as dep_mul in CSPDarkNetMArchitectureConfig's backbone
+                    "act_type": "silu",
+                },
+            },
+            head={
+                "name": "anchor_free_decoupled_head",
+                "params": {
+                    "act_type": "silu",
+                    # postprocessor - decode
+                    "score_thresh": 0.01,
+                    # postprocessor - nms
+                    "nms_thresh": 0.65,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}]  # one loss entry
+    )
+
+
+@dataclass
+class DetectionYoloXSModelConfig(ModelConfig):  # Default detection model config named "yolox_s".
+    task: str = "detection"
+    name: str = "yolox_s"
+    architecture: ArchitectureConfig = field(
+        default_factory=lambda: CSPDarkNetSArchitectureConfig(  # backbone from the class defaults; neck and head given here
+            neck={
+                "name": "yolopafpn",
+                "params": {
+                    "dep_mul": 0.33,  # same value as dep_mul in CSPDarkNetSArchitectureConfig's backbone
+                    "act_type": "silu",
+                },
+            },
+            head={
+                "name": "anchor_free_decoupled_head",
+                "params": {
+                    "act_type": "silu",
+                    # postprocessor - decode
+                    "score_thresh": 0.01,
+                    # postprocessor - nms
+                    "nms_thresh": 0.65,
+                    "class_agnostic": False,
+                },
+            },
+        )
+    )
+    losses: List[Dict[str, Any]] = field(
+        default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}]  # one loss entry
+    )
diff --git a/netspresso/trainer/trainer.py b/netspresso/trainer/trainer.py
index 8fb70dd8..26db399b 100644
--- a/netspresso/trainer/trainer.py
+++ b/netspresso/trainer/trainer.py
@@ -5,7 +5,7 @@
 from omegaconf import OmegaConf
 
 from netspresso.enums import Status, Task, TaskType
-from netspresso.trainer.augmentations import AUGMENTATION_CONFIG_TYPE, AugmentationConfig, Inference, Train, Transform
+from netspresso.trainer.augmentations import AUGMENTATION_CONFIG_TYPE, AugmentationConfig, Transform
 from netspresso.trainer.data import DATA_CONFIG_TYPE, ImageLabelPathConfig, PathConfig
 from netspresso.trainer.models import (
     CLASSIFICATION_MODELS,
@@ -131,9 +131,9 @@ def set_dataset_config(
         root_path: str,
         train_image: str = "images/train",
         train_label: str = "labels/train",
-        valid_image: str = "images/val",
-        valid_label: str = "labels/val",
-        id_mapping: Optional[Union[List[str], Dict[str, str]]] = None,
+        valid_image: str = "images/valid",
+        valid_label: str = "labels/valid",
+        id_mapping: Optional[Union[List[str], Dict[str, str], str]] = None,
     ):
         """Set the dataset configuration for the Trainer.
 
@@ -158,6 +158,33 @@ def set_dataset_config(
         }
         self.data = DATA_CONFIG_TYPE[self.task](**common_config)
 
+    def check_paths_exist(self, base_path):  # Verify the fixed dataset layout under base_path; raises FileNotFoundError.
+        paths = [  # entries required under base_path, checked in this order
+            "images/train",
+            "labels/train",
+            "images/valid",
+            "labels/valid",
+            "id_mapping.json",
+        ]
+
+        for relative_path in paths:
+            path = Path(base_path) / relative_path
+            if not path.exists():  # stop at the first missing entry
+                if path.suffix:  # a suffix (".json") only selects the "file" wording; the entry type is not checked
+                    raise FileNotFoundError(f"The required file '{relative_path}' does not exist. Please check and make sure it is in the correct location.")
+                else:
+                    raise FileNotFoundError(f"The required directory '{relative_path}' does not exist. Please check and make sure it is in the correct location.")
+
+    def set_dataset(self, dataset_root_path: str):  # Validate dataset_root_path, then configure the dataset from it.
+        dataset_name = Path(dataset_root_path).name  # final path component is used as the dataset name
+
+        self.check_paths_exist(dataset_root_path)  # raises FileNotFoundError before any config is written
+        self.set_dataset_config(  # image/label sub-paths are left at set_dataset_config's defaults
+            name=dataset_name,
+            root_path=dataset_root_path,
+            id_mapping="id_mapping.json",  # same file name that check_paths_exist requires under the root
+        )
+
     def set_model_config(
         self,
         model_name: str,
@@ -235,33 +262,26 @@ def set_training_config(
 
         self.training = ScheduleConfig(
             epochs=epochs,
-            batch_size=batch_size,
             optimizer=optimizer.asdict(),
             scheduler=scheduler.asdict(),
         )
+        self.environment.batch_size = batch_size
 
     def set_augmentation_config(
         self,
         train_transforms: Optional[List] = None,
-        train_mix_transforms: Optional[List] = None,
         inference_transforms: Optional[List] = None,
     ):
         """Set the augmentation configuration for training.
 
         Args:
             train_transforms (List, optional): List of transforms for training. Defaults to None.
-            train_mix_transforms (List, optional): List of mix transforms for training. Defaults to None.
             inference_transforms (List, optional): List of transforms for inference. Defaults to None.
         """
 
         self.augmentation = AugmentationConfig(
-            train=Train(
-                transforms=train_transforms,
-                mix_transforms=train_mix_transforms,
-            ),
-            inference=Inference(
-                transforms=inference_transforms,
-            ),
+            train=train_transforms,
+            inference=inference_transforms,
         )
 
     def set_logging_config(
@@ -345,9 +365,8 @@ def _apply_img_size(self):
         """
 
         self.augmentation.img_size = self.img_size
-        self.augmentation.train.transforms = self._change_transforms(self.augmentation.train.transforms)
-        self.augmentation.train.mix_transforms = self._change_transforms(self.augmentation.train.mix_transforms)
-        self.augmentation.inference.transforms = self._change_transforms(self.augmentation.inference.transforms)
+        self.augmentation.train = self._change_transforms(self.augmentation.train)
+        self.augmentation.inference = self._change_transforms(self.augmentation.inference)
 
     def train(self, gpus: str, project_name: str) -> Dict:
         """Train the model with the specified configuration.
@@ -369,6 +388,7 @@ def train(self, gpus: str, project_name: str) -> Dict:
         destination_folder = FileHandler.create_unique_folder(folder_path=destination_folder)
         metadata = MetadataHandler.init_metadata(folder_path=destination_folder, task_type=TaskType.TRAIN)
         self.logging.project_id = Path(destination_folder).name
+        self.environment.gpus = gpus
 
         configs = TrainerConfigs(
             self.data,
@@ -414,7 +434,7 @@ def train(self, gpus: str, project_name: str) -> Dict:
             dataset=self.data.name,
             input_shapes=[InputShape(batch=1, channel=3, dimension=[self.img_size, self.img_size])],
         )
-        metadata.update_training_info(epoch=self.training.epochs, batch_size=self.training.batch_size)
+        metadata.update_training_info(epoch=self.training.epochs, batch_size=self.environment.batch_size)
         metadata.update_training_result(training_summary=training_summary)
         metadata.update_logging_dir(logging_dir=destination_folder.as_posix())
         metadata.update_hparams(hparams=hparams_path.as_posix())
diff --git a/netspresso/trainer/training/environment.py b/netspresso/trainer/training/environment.py
index 04ec85be..71600639 100644
--- a/netspresso/trainer/training/environment.py
+++ b/netspresso/trainer/training/environment.py
@@ -6,3 +6,4 @@ class EnvironmentConfig:
     seed: int = 1
     num_workers: int = 4
     gpus: str = "0"
+    batch_size: int = 8
diff --git a/netspresso/trainer/training/logging.py b/netspresso/trainer/training/logging.py
index ef7643d1..b6446c81 100644
--- a/netspresso/trainer/training/logging.py
+++ b/netspresso/trainer/training/logging.py
@@ -1,8 +1,6 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional, Union
-
-from omegaconf import MISSING, MissingMandatoryValue
+from typing import List, Optional, Union
 
 
 @dataclass
@@ -10,10 +8,10 @@ class LoggingConfig:
     project_id: Optional[str] = None
     output_dir: Union[Path, str] = "./outputs"
     tensorboard: bool = True
-    csv: bool = False
     image: bool = True
     stdout: bool = True
     save_optimizer_state: bool = True
+    onnx_input_size: List = field(default_factory=lambda: [512, 512])
     validation_epoch: int = 10
     save_checkpoint_epoch: Optional[int] = None
 
diff --git a/netspresso/trainer/training/training.py b/netspresso/trainer/training/training.py
index 9f2e8411..667ca1e8 100644
--- a/netspresso/trainer/training/training.py
+++ b/netspresso/trainer/training/training.py
@@ -1,13 +1,11 @@
 from dataclasses import dataclass, field
-from typing import Dict
-
-from omegaconf import MISSING, MissingMandatoryValue
+from typing import Dict, Optional
 
 
 @dataclass
 class ScheduleConfig:
     epochs: int = 3
-    batch_size: int = 8
+    ema: Optional[Dict] = field(default=None)
     optimizer: Dict = field(default_factory=lambda: {
         "name": "adamw",
         "lr": 6e-5,
@@ -24,7 +22,7 @@ class ScheduleConfig:
 
 @dataclass
 class ClassificationScheduleConfig(ScheduleConfig):
-    batch_size: int = 32
+    pass
 
 
 @dataclass
diff --git a/requirements.txt b/requirements.txt
index 14e415e4..fc755032 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,6 @@ requests>=2.30.0
 email-validator==2.0.0
 pytz>=2023.3
 typing_extensions==4.5.0
-netspresso_trainer==0.1.2
+netspresso_trainer==0.2.2
 PyGithub>=2.1.1
 matplotlib>=3.7.4