From 485e6ca93b5db960c9c9ee0fe1465eacc29786ea Mon Sep 17 00:00:00 2001 From: Byeongman Lee Date: Thu, 27 Jun 2024 10:47:35 +0900 Subject: [PATCH] #255 Update v0.2.2 version of netspresso_trainer (#259) --- netspresso/trainer/augmentations/__init__.py | 18 +- .../trainer/augmentations/augmentation.py | 138 ++- netspresso/trainer/models/__init__.py | 74 +- netspresso/trainer/models/base.py | 35 + netspresso/trainer/models/efficientformer.py | 125 ++ netspresso/trainer/models/mixnet.py | 475 ++++++++ netspresso/trainer/models/mobilenetv3.py | 218 ++++ netspresso/trainer/models/mobilevit.py | 89 ++ netspresso/trainer/models/model.py | 1029 ----------------- netspresso/trainer/models/pidnet.py | 30 + netspresso/trainer/models/resnet.py | 201 ++++ netspresso/trainer/models/rtmpose.py | 33 + netspresso/trainer/models/segformer.py | 73 ++ netspresso/trainer/models/vit.py | 47 + netspresso/trainer/models/yolox.py | 188 +++ netspresso/trainer/trainer.py | 56 +- netspresso/trainer/training/environment.py | 1 + netspresso/trainer/training/logging.py | 8 +- netspresso/trainer/training/training.py | 8 +- requirements.txt | 2 +- 20 files changed, 1722 insertions(+), 1126 deletions(-) create mode 100644 netspresso/trainer/models/base.py create mode 100644 netspresso/trainer/models/efficientformer.py create mode 100644 netspresso/trainer/models/mixnet.py create mode 100644 netspresso/trainer/models/mobilenetv3.py create mode 100644 netspresso/trainer/models/mobilevit.py delete mode 100644 netspresso/trainer/models/model.py create mode 100644 netspresso/trainer/models/pidnet.py create mode 100644 netspresso/trainer/models/resnet.py create mode 100644 netspresso/trainer/models/rtmpose.py create mode 100644 netspresso/trainer/models/segformer.py create mode 100644 netspresso/trainer/models/vit.py create mode 100644 netspresso/trainer/models/yolox.py diff --git a/netspresso/trainer/augmentations/__init__.py b/netspresso/trainer/augmentations/__init__.py index 9fa351cb..9b235b04 100644 
--- a/netspresso/trainer/augmentations/__init__.py +++ b/netspresso/trainer/augmentations/__init__.py @@ -1,19 +1,24 @@ from netspresso.trainer.augmentations.augmentation import ( AugmentationConfig, + CenterCrop, ClassificationAugmentationConfig, ColorJitter, DetectionAugmentationConfig, - Inference, + HSVJitter, + Mixing, + MosaicDetection, Pad, + PoseTopDownAffine, RandomCrop, RandomCutmix, + RandomErasing, RandomHorizontalFlip, RandomMixup, + RandomResize, RandomResizedCrop, RandomVerticalFlip, Resize, SegmentationAugmentationConfig, - Train, Transform, TrivialAugmentWide, ) @@ -26,6 +31,13 @@ __all__ = [ + "CenterCrop", + "HSVJitter", + "Mixing", + "MosaicDetection", + "PoseTopDownAffine", + "RandomErasing", + "RandomResize", "ColorJitter", "Pad", "RandomCrop", @@ -36,8 +48,6 @@ "TrivialAugmentWide", "RandomMixup", "RandomCutmix", - "Inference", - "Train", "Transform", "AugmentationConfig", "AUGMENTATION_CONFIG_TYPE", diff --git a/netspresso/trainer/augmentations/augmentation.py b/netspresso/trainer/augmentations/augmentation.py index 85a1ae21..06775fad 100644 --- a/netspresso/trainer/augmentations/augmentation.py +++ b/netspresso/trainer/augmentations/augmentation.py @@ -13,21 +13,16 @@ class Transform: @dataclass -class Train: - transforms: Optional[List] = None - mix_transforms: Optional[List] = None - - -@dataclass -class Inference: - transforms: Optional[List] = None +class AugmentationConfig: + img_size: int = DEFAULT_IMG_SIZE + train: Optional[List] = None + inference: Optional[List] = None @dataclass -class AugmentationConfig: - img_size: int = DEFAULT_IMG_SIZE - train: Train = field(default_factory=lambda: Train()) - inference: Inference = field(default_factory=lambda: Inference()) +class CenterCrop(Transform): + name: str = 'centercrop' + size: int = DEFAULT_IMG_SIZE @dataclass @@ -40,6 +35,38 @@ class ColorJitter(Transform): p: Optional[float] = 0.5 +@dataclass +class HSVJitter(Transform): + name: str = "hsvjitter" + h_mag: int = 5 + s_mag: int = 30 
+ v_mag: int = 30 + + +@dataclass +class Mixing(Transform): + name: str = "mixing" + mixup: Optional[List[float]] = field(default=None) + cutmix: Optional[List[float]] = field(default=None) + inplace: bool = False + + +@dataclass +class MosaicDetection(Transform): + name: str = "mosaicdetection" + size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE]) + mosaic_prob: float = 1.0 + affine_scale: List = field(default_factory=lambda: [0.5, 1.5]) + degrees: float = 10.0 + translate: float = 0.1 + shear: float = 2.0 + enable_mixup: bool = True + mixup_prob: float = 1.0 + mixup_scale: List = field(default_factory=lambda: [0.5, 1.5]) + fill: int = 114 + mosaic_off_epoch: int = 10 + + @dataclass class Pad(Transform): name: str = 'pad' @@ -48,6 +75,18 @@ class Pad(Transform): padding_mode: str = 'constant' +@dataclass +class PoseTopDownAffine(Transform): + name: str = "posetopdownaffine" + scale: List = field(default_factory=lambda: [0.75, 1.25]) + scale_prob: float = 1. + translate: float = 0.1 + translate_prob: float = 1. + rotation: int = 60 + rotation_prob: float = 1. 
+ size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE]) + + @dataclass class RandomCrop(Transform): name: str = 'randomcrop' @@ -55,12 +94,13 @@ class RandomCrop(Transform): @dataclass -class RandomResizedCrop(Transform): - name: str = 'randomresizedcrop' - size: int = DEFAULT_IMG_SIZE - scale: List = field(default_factory=lambda: [0.08, 1.0]) - ratio: List = field(default_factory=lambda: [0.75, 1.33]) - interpolation: Optional[str] = 'bilinear' +class RandomErasing(Transform): + name: str = "randomerasing" + p: float = 0.5 + scale: List = field(default_factory=lambda: [0.02, 0.33]) + ratio: List = field(default_factory=lambda: [0.3, 3.3]) + value: Optional[int] = 0 + inplace: bool = False @dataclass @@ -69,6 +109,24 @@ class RandomHorizontalFlip(Transform): p: float = 0.5 +@dataclass +class RandomResize(Transform): + name: str = "randomresize" + base_size: List = field(default_factory=lambda: [256, 256]) + stride: int = 32 + random_range: int = 4 + interpolation: str = "bilinear" + + +@dataclass +class RandomResizedCrop(Transform): + name: str = 'randomresizedcrop' + size: int = DEFAULT_IMG_SIZE + scale: List = field(default_factory=lambda: [0.08, 1.0]) + ratio: List = field(default_factory=lambda: [0.75, 1.33]) + interpolation: Optional[str] = 'bilinear' + + @dataclass class RandomVerticalFlip(Transform): name: str = 'randomverticalflip' @@ -81,6 +139,7 @@ class Resize(Transform): size: List = field(default_factory=lambda: [DEFAULT_IMG_SIZE, DEFAULT_IMG_SIZE]) interpolation: Optional[str] = 'bilinear' max_size: Optional[int] = None + resize_criteria: Optional[int] = None class TrivialAugmentWide(Transform): @@ -109,34 +168,35 @@ class RandomCutmix(Transform): @dataclass class ClassificationAugmentationConfig(AugmentationConfig): img_size: int = 256 - train: Train = field(default_factory=lambda: Train( - transforms=[RandomResizedCrop(size=256), RandomHorizontalFlip()], - mix_transforms=[RandomCutmix()] - )) - inference: Inference = 
field(default_factory=lambda: Inference( - transforms=[Resize(size=[256, 256])] - )) + train: Optional[List] = field(default_factory=lambda: [ + RandomResizedCrop(size=256), + RandomHorizontalFlip(), + Mixing(mixup=[0.25, 1.0]) + ]) + inference: Optional[List] = field(default_factory=lambda: [ + Resize(size=[256, 256]) + ]) @dataclass class SegmentationAugmentationConfig(AugmentationConfig): img_size: int = 512 - train: Train = field(default_factory=lambda: Train( - transforms=[RandomResizedCrop(size=512), RandomHorizontalFlip(), ColorJitter()], - mix_transforms=None - )) - inference: Inference = field(default_factory=lambda: Inference( - transforms=[Resize(size=[512, 512])] - )) + train: Optional[List] = field(default_factory=lambda: [ + RandomResizedCrop(size=512), + RandomHorizontalFlip(), + ColorJitter() + ]) + inference: Optional[List] = field(default_factory=lambda: [ + Resize(size=[512, 512]) + ]) @dataclass class DetectionAugmentationConfig(AugmentationConfig): img_size: int = 512 - train: Train = field(default_factory=lambda: Train( - transforms=[Resize(size=[512, 512])], - mix_transforms=None - )) - inference: Inference = field(default_factory=lambda: Inference( - transforms=[Resize(size=[512, 512])], - )) + train: Optional[List] = field(default_factory=lambda: [ + Resize(size=[512, 512]) + ]) + inference: Optional[List] = field(default_factory=lambda: [ + Resize(size=[512, 512]) + ]) diff --git a/netspresso/trainer/models/__init__.py b/netspresso/trainer/models/__init__.py index 581fc87d..78c6e932 100644 --- a/netspresso/trainer/models/__init__.py +++ b/netspresso/trainer/models/__init__.py @@ -1,37 +1,54 @@ -from netspresso.trainer.models.model import ( - CheckpointConfig, +from netspresso.trainer.models.base import CheckpointConfig, ModelConfig +from netspresso.trainer.models.efficientformer import ( ClassificationEfficientFormerModelConfig, + DetectionEfficientFormerModelConfig, + SegmentationEfficientFormerModelConfig, +) +from 
netspresso.trainer.models.mixnet import ( ClassificationMixNetLargeModelConfig, ClassificationMixNetMediumModelConfig, ClassificationMixNetSmallModelConfig, - ClassificationMobileNetV3ModelConfig, - ClassificationMobileViTModelConfig, - ClassificationResNetModelConfig, - ClassificationViTModelConfig, - DetectionEfficientFormerModelConfig, DetectionMixNetLargeModelConfig, DetectionMixNetMediumModelConfig, DetectionMixNetSmallModelConfig, - DetectionMobileNetV3ModelConfig, - DetectionResNetModelConfig, - DetectionYoloXModelConfig, - ModelConfig, - PIDNetModelConfig, - SegmentationEfficientFormerModelConfig, SegmentationMixNetLargeModelConfig, SegmentationMixNetMediumModelConfig, SegmentationMixNetSmallModelConfig, - SegmentationMobileNetV3ModelConfig, - SegmentationResNetModelConfig, - SegmentationSegFormerModelConfig, +) +from netspresso.trainer.models.mobilenetv3 import ( + ClassificationMobileNetV3LargeModelConfig, + ClassificationMobileNetV3SmallModelConfig, + DetectionMobileNetV3SmallModelConfig, + SegmentationMobileNetV3SmallModelConfig, +) +from netspresso.trainer.models.mobilevit import ClassificationMobileViTModelConfig +from netspresso.trainer.models.pidnet import PIDNetModelConfig +from netspresso.trainer.models.resnet import ( + ClassificationResNet18ModelConfig, + ClassificationResNet34ModelConfig, + ClassificationResNet50ModelConfig, + DetectionResNet50ModelConfig, + SegmentationResNet50ModelConfig, +) +from netspresso.trainer.models.rtmpose import PoseEstimationMobileNetV3SmallModelConfig +from netspresso.trainer.models.segformer import SegmentationSegFormerB0ModelConfig +from netspresso.trainer.models.vit import ClassificationViTTinyModelConfig +from netspresso.trainer.models.yolox import ( + DetectionYoloXLModelConfig, + DetectionYoloXMModelConfig, + DetectionYoloXSModelConfig, + DetectionYoloXXModelConfig, ) CLASSIFICATION_MODELS = { "EfficientFormer": ClassificationEfficientFormerModelConfig, - "MobileNetV3": ClassificationMobileNetV3ModelConfig, + 
"MobileNetV3_Small": ClassificationMobileNetV3SmallModelConfig, + "MobileNetV3_Large": ClassificationMobileNetV3LargeModelConfig, "MobileViT": ClassificationMobileViTModelConfig, - "ResNet": ClassificationResNetModelConfig, - "ViT": ClassificationViTModelConfig, + "ResNet18": ClassificationResNet18ModelConfig, + "ResNet34": ClassificationResNet34ModelConfig, + "ResNet50": ClassificationResNet50ModelConfig, + "ViT_Tiny": ClassificationViTTinyModelConfig, "MixNetS": ClassificationMixNetSmallModelConfig, "MixNetM": ClassificationMixNetMediumModelConfig, "MixNetL": ClassificationMixNetLargeModelConfig, @@ -39,9 +56,12 @@ DETECTION_MODELS = { "EfficientFormer": DetectionEfficientFormerModelConfig, - "YOLOX-S": DetectionYoloXModelConfig, - "ResNet": DetectionResNetModelConfig, - "MobileNetV3": DetectionMobileNetV3ModelConfig, + "MobileNetV3_Small": DetectionMobileNetV3SmallModelConfig, + "YOLOX-S": DetectionYoloXSModelConfig, + "YOLOX-M": DetectionYoloXMModelConfig, + "YOLOX-L": DetectionYoloXLModelConfig, + "YOLOX-X": DetectionYoloXXModelConfig, + "ResNet50": DetectionResNet50ModelConfig, "MixNetL": DetectionMixNetLargeModelConfig, "MixNetM": DetectionMixNetMediumModelConfig, "MixNetS": DetectionMixNetSmallModelConfig, @@ -49,15 +69,19 @@ SEGMENTATION_MODELS = { "EfficientFormer": SegmentationEfficientFormerModelConfig, - "MobileNetV3": SegmentationMobileNetV3ModelConfig, - "ResNet": SegmentationResNetModelConfig, - "SegFormer": SegmentationSegFormerModelConfig, + "MobileNetV3_Small": SegmentationMobileNetV3SmallModelConfig, + "ResNet50": SegmentationResNet50ModelConfig, + "SegFormer-B0": SegmentationSegFormerB0ModelConfig, "MixNetS": SegmentationMixNetSmallModelConfig, "MixNetM": SegmentationMixNetMediumModelConfig, "MixNetL": SegmentationMixNetLargeModelConfig, "PIDNet": PIDNetModelConfig, } +POSEESTIMATION_MODELS = { + "MobileNetV3_Small": PoseEstimationMobileNetV3SmallModelConfig, +} + __all__ = [ "CLASSIFICATION_MODELS", diff --git 
a/netspresso/trainer/models/base.py b/netspresso/trainer/models/base.py new file mode 100644 index 00000000..ae4242d4 --- /dev/null +++ b/netspresso/trainer/models/base.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from omegaconf import MISSING + + +@dataclass +class ArchitectureConfig: + full: Optional[Dict[str, Any]] = None + backbone: Optional[Dict[str, Any]] = None + neck: Optional[Dict[str, Any]] = None + head: Optional[Dict[str, Any]] = None + + def __post_init__(self): + assert bool(self.full) != bool(self.backbone), "Only one of full or backbone should be given." + + +@dataclass +class CheckpointConfig: + use_pretrained: bool = True + load_head: bool = False + path: Optional[Union[Path, str]] = None + fx_model_path: Optional[Union[Path, str]] = None + optimizer_path: Optional[Union[Path, str]] = None + + +@dataclass +class ModelConfig: + task: str = MISSING + name: str = MISSING + checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig()) + freeze_backbone: bool = False + architecture: ArchitectureConfig = field(default_factory=lambda: ArchitectureConfig()) + losses: Optional[List[Dict[str, Any]]] = None diff --git a/netspresso/trainer/models/efficientformer.py b/netspresso/trainer/models/efficientformer.py new file mode 100644 index 00000000..2ddd780c --- /dev/null +++ b/netspresso/trainer/models/efficientformer.py @@ -0,0 +1,125 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class EfficientFormerArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "efficientformer", + "params": { + "num_attention_heads": 8, + "attention_channels": 256, + "attention_dropout_prob": 0.0, + "attention_value_expansion_ratio": 4, + 
"ffn_intermediate_ratio": 4, + "ffn_dropout_prob": 0.0, + "ffn_act_type": "gelu", + "vit_num": 1, + }, + "stage_params": [ + {"num_blocks": 3, "channels": 48}, + {"num_blocks": 2, "channels": 96}, + {"num_blocks": 6, "channels": 224}, + {"num_blocks": 4, "channels": 448}, + ], + } + ) + + +@dataclass +class ClassificationEfficientFormerModelConfig(ModelConfig): + task: str = "classification" + name: str = "efficientformer_l1" + architecture: ArchitectureConfig = field( + default_factory=lambda: EfficientFormerArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationEfficientFormerModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "efficientformer_l1" + architecture: ArchitectureConfig = field( + default_factory=lambda: EfficientFormerArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionEfficientFormerModelConfig(ModelConfig): + task: str = "detection" + name: str = "efficientformer_l1" + architecture: ArchitectureConfig = field( + default_factory=lambda: EfficientFormerArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": 
"batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) diff --git a/netspresso/trainer/models/mixnet.py b/netspresso/trainer/models/mixnet.py new file mode 100644 index 00000000..6f74d4a7 --- /dev/null +++ b/netspresso/trainer/models/mixnet.py @@ -0,0 +1,475 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class MixNetSmallArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_channels": 16, + "wid_mul": 1.0, + "dep_mul": 1.0, + "dropout_rate": 0.0, + }, + "stage_params": [ + { + "expansion_ratio": [1, 6, 3], + "out_channels": [16, 24, 24], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3], [3]], + "num_exp_groups": [1, 2, 2], + "num_poi_groups": [1, 2, 2], + "stride": [1, 2, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5]], + "num_exp_groups": [1, 2], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expansion_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 2, 1, 2], + "kernel_sizes": [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]], + "num_exp_groups": [1, 1, 2, 2], + "num_poi_groups": [2, 2, 2, 2], + "stride": [2, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 2], + 
"kernel_sizes": [[3, 5, 7, 9, 11], [3, 5, 7, 9]], + "num_exp_groups": [1, 1], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + } + ) + + +@dataclass +class MixNetMediumArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_channels": 24, + "wid_mul": 1.0, + "dep_mul": 1.0, + "dropout_rate": 0.0, + }, + "stage_params": [ + { + "expansion_ratio": [1, 6, 3], + "out_channels": [24, 32, 32], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3, 5, 7], [3]], + "num_exp_groups": [1, 2, 2], + "num_poi_groups": [1, 2, 2], + "stride": [1, 2, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5]], + "num_exp_groups": [1, 2], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expansion_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 3, 1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], + "num_exp_groups": [1, 2, 1, 2], + "num_poi_groups": [1, 2, 1, 2], + "stride": [2, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], + "num_exp_groups": [1, 1], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + } + ) + + +@dataclass +class MixNetLargeArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mixnet", + "params": { + "stem_channels": 24, + "wid_mul": 1.3, + "dep_mul": 1.0, + "dropout_rate": 0.0, + }, + 
"stage_params": [ + { + "expansion_ratio": [1, 6, 3], + "out_channels": [24, 32, 32], + "num_blocks": [1, 1, 1], + "kernel_sizes": [[3], [3, 5, 7], [3]], + "num_exp_groups": [1, 2, 2], + "num_poi_groups": [1, 2, 2], + "stride": [1, 2, 1], + "act_type": ["relu", "relu", "relu"], + "se_reduction_ratio": [None, None, None], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [40, 40], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5]], + "num_exp_groups": [1, 2], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + { + "expansion_ratio": [6, 6, 6, 3], + "out_channels": [80, 80, 120, 120], + "num_blocks": [1, 3, 1, 3], + "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], + "num_exp_groups": [1, 2, 1, 2], + "num_poi_groups": [1, 2, 1, 2], + "stride": [2, 1, 1, 1], + "act_type": ["swish", "swish", "swish", "swish"], + "se_reduction_ratio": [4, 4, 2, 2], + }, + { + "expansion_ratio": [6, 6], + "out_channels": [200, 200], + "num_blocks": [1, 3], + "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], + "num_exp_groups": [1, 1], + "num_poi_groups": [1, 2], + "stride": [2, 1], + "act_type": ["swish", "swish"], + "se_reduction_ratio": [2, 2], + }, + ], + } + ) + + +@dataclass +class ClassificationMixNetSmallModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_s" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetSmallArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationMixNetSmallModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_s" + architecture: ArchitectureConfig = field( + default_factory=lambda: 
MixNetSmallArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionMixNetSmallModelConfig(ModelConfig): + task: str = "detection" + name: str = "mixnet_s" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetSmallArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": "batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) + + +@dataclass +class ClassificationMixNetMediumModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_m" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetMediumArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationMixNetMediumModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_m" + architecture: ArchitectureConfig = field( + default_factory=lambda: 
MixNetMediumArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionMixNetMediumModelConfig(ModelConfig): + task: str = "detection" + name: str = "mixnet_m" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetMediumArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": "batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) + + +@dataclass +class ClassificationMixNetLargeModelConfig(ModelConfig): + task: str = "classification" + name: str = "mixnet_l" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetLargeArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationMixNetLargeModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mixnet_l" + architecture: ArchitectureConfig = field( + default_factory=lambda: 
MixNetLargeArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionMixNetLargeModelConfig(ModelConfig): + task: str = "detection" + name: str = "mixnet_l" + architecture: ArchitectureConfig = field( + default_factory=lambda: MixNetLargeArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": "batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) diff --git a/netspresso/trainer/models/mobilenetv3.py b/netspresso/trainer/models/mobilenetv3.py new file mode 100644 index 00000000..814b5588 --- /dev/null +++ b/netspresso/trainer/models/mobilenetv3.py @@ -0,0 +1,218 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class MobileNetV3SmallArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mobilenetv3", + "params": None, + "stage_params": [ + { + "in_channels": [16], + "kernel_sizes": [3], + "expanded_channels": [16], + "out_channels": [16], + "use_se": [True], + "act_type": ["relu"], + 
"stride": [2], + }, + { + "in_channels": [16, 24], + "kernel_sizes": [3, 3], + "expanded_channels": [72, 88], + "out_channels": [24, 24], + "use_se": [False, False], + "act_type": ["relu", "relu"], + "stride": [2, 1], + }, + { + "in_channels": [24, 40, 40, 40, 48], + "kernel_sizes": [5, 5, 5, 5, 5], + "expanded_channels": [96, 240, 240, 120, 144], + "out_channels": [40, 40, 40, 48, 48], + "use_se": [True, True, True, True, True], + "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1, 1, 1], + }, + { + "in_channels": [48, 96, 96], + "kernel_sizes": [5, 5, 5], + "expanded_channels": [288, 576, 576], + "out_channels": [96, 96, 96], + "use_se": [True, True, True], + "act_type": ["hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1], + }, + ], + } + ) + + +@dataclass +class MobileNetV3LargeArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mobilenetv3", + "params": None, + "stage_params": [ + { + "in_channels": [16, 16, 24], + "kernel_sizes": [3, 3, 3], + "expanded_channels": [16, 64, 72], + "out_channels": [16, 24, 24], + "use_se": [False, False, False], + "act_type": ["relu", "relu", "relu"], + "stride": [1, 2, 1], + }, + { + "in_channels": [24, 40, 40], + "kernel_sizes": [5, 5, 5], + "expanded_channels": [72, 120, 120], + "out_channels": [40, 40, 40], + "use_se": [True, True, True], + "act_type": ["relu", "relu", "relu"], + "stride": [2, 1, 1], + }, + { + "in_channels": [40, 80, 80, 80, 80, 112], + "kernel_sizes": [3, 3, 3, 3, 3, 3], + "expanded_channels": [240, 200, 184, 184, 480, 672], + "out_channels": [80, 80, 80, 80, 112, 112], + "use_se": [False, False, False, False, True, True], + "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1, 1, 1, 1], + }, + { + "in_channels": [112, 160, 160], + "kernel_sizes": [5, 5, 5], + "expanded_channels": [672, 960, 960], + 
"out_channels": [160, 160, 160], + "use_se": [True, True, True], + "act_type": ["hard_swish", "hard_swish", "hard_swish"], + "stride": [2, 1, 1], + }, + ], + } + ) + + +@dataclass +class ClassificationMobileNetV3LargeModelConfig(ModelConfig): + task: str = "classification" + name: str = "mobilenet_v3_large" + architecture: ArchitectureConfig = field( + default_factory=lambda: MobileNetV3LargeArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 2, + "intermediate_channels": 1200, + "act_type": "hard_swish", + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class ClassificationMobileNetV3SmallModelConfig(ModelConfig): + task: str = "classification" + name: str = "mobilenet_v3_small" + architecture: ArchitectureConfig = field( + default_factory=lambda: MobileNetV3SmallArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationMobileNetV3SmallModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "mobilenet_v3_small" + architecture: ArchitectureConfig = field( + default_factory=lambda: MobileNetV3SmallArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionMobileNetV3SmallModelConfig(ModelConfig): + task: str = "detection" + name: str = "mobilenet_v3_small" + architecture: ArchitectureConfig = field( + default_factory=lambda: 
MobileNetV3SmallArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": "batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) diff --git a/netspresso/trainer/models/mobilevit.py b/netspresso/trainer/models/mobilevit.py new file mode 100644 index 00000000..03361477 --- /dev/null +++ b/netspresso/trainer/models/mobilevit.py @@ -0,0 +1,89 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class MobileViTArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mobilevit", + "params": { + "patch_size": 2, + "num_attention_heads": 4, + "attention_dropout_prob": 0.1, + "ffn_dropout_prob": 0.0, + "output_expansion_ratio": 4, + "use_fusion_layer": True, + }, + "stage_params": [ + { + "out_channels": 32, + "block_type": "mv2", + "num_blocks": 1, + "stride": 1, + "ir_expansion_ratio": 4, # [mv2_exp_mult] * 4 + }, + { + "block_type": "mv2", + "out_channels": 64, + "num_blocks": 3, + "stride": 2, + "ir_expansion_ratio": 4, # [mv2_exp_mult] * 4 + }, + { + "block_type": "mobilevit", + "out_channels": 96, + "num_blocks": 2, + "stride": 2, + "hidden_size": 144, + "intermediate_size": 288, + "dilate": False, + "ir_expansion_ratio": 4, # [mv2_exp_mult] * 4 + }, + { + 
"block_type": "mobilevit", + "out_channels": 128, + "num_blocks": 4, + "stride": 2, + "hidden_size": 192, + "intermediate_size": 384, + "dilate": False, + "ir_expansion_ratio": 4, # [mv2_exp_mult] * 4 + }, + { + "block_type": "mobilevit", + "out_channels": 160, + "num_blocks": 3, + "stride": 2, + "hidden_size": 240, + "intermediate_size": 480, + "dilate": False, + "ir_expansion_ratio": 4, # [mv2_exp_mult] * 4 + }, + ], + } + ) + + +@dataclass +class ClassificationMobileViTModelConfig(ModelConfig): + task: str = "classification" + name: str = "mobilevit_s" + architecture: ArchitectureConfig = field( + default_factory=lambda: MobileViTArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) diff --git a/netspresso/trainer/models/model.py b/netspresso/trainer/models/model.py deleted file mode 100644 index f30d7982..00000000 --- a/netspresso/trainer/models/model.py +++ /dev/null @@ -1,1029 +0,0 @@ -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -from omegaconf import MISSING, MissingMandatoryValue - -__all__ = [ - "ModelConfig", - "ClassificationEfficientFormerModelConfig", - "SegmentationEfficientFormerModelConfig", - "DetectionEfficientFormerModelConfig", - "ClassificationMobileNetV3ModelConfig", - "SegmentationMobileNetV3ModelConfig", - "DetectionMobileNetV3ModelConfig", - "ClassificationMobileViTModelConfig", - "PIDNetModelConfig", - "ClassificationResNetModelConfig", - "SegmentationResNetModelConfig", - "DetectionResNetModelConfig", - "SegmentationSegFormerModelConfig", - "ClassificationViTModelConfig", - "DetectionYoloXModelConfig", - "ClassificationMixNetSmallModelConfig", - "ClassificationMixNetMediumModelConfig", - 
"ClassificationMixNetLargeModelConfig", - "SegmentationMixNetSmallModelConfig", - "SegmentationMixNetMediumModelConfig", - "SegmentationMixNetLargeModelConfig", - "DetectionMixNetSmallModelConfig", - "DetectionMixNetMediumModelConfig", - "DetectionMixNetLargeModelConfig", -] - - -@dataclass -class ArchitectureConfig: - full: Optional[Dict[str, Any]] = None - backbone: Optional[Dict[str, Any]] = None - neck: Optional[Dict[str, Any]] = None - head: Optional[Dict[str, Any]] = None - - def __post_init__(self): - assert bool(self.full) != bool(self.backbone), "Only one of full or backbone should be given." - -@dataclass -class CheckpointConfig: - use_pretrained: bool = True - load_head: bool = False - path: Optional[Union[Path, str]] = None - fx_model_path: Optional[Union[Path, str]] = None - optimizer_path: Optional[Union[Path, str]] = None - -@dataclass -class ModelConfig: - task: str = MISSING - name: str = MISSING - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig()) - load_checkpoint_head: bool = False - fx_model_checkpoint: Optional[Union[Path, str]] = None - resume_optimizer_checkpoint: Optional[Union[Path, str]] = None - freeze_backbone: bool = False - architecture: ArchitectureConfig = field(default_factory=lambda: ArchitectureConfig()) - losses: Optional[List[Dict[str, Any]]] = None - - -@dataclass -class EfficientFormerArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "efficientformer", - "params": { - "num_attention_heads": 8, - "attention_channels": 256, - "attention_dropout_prob": 0., - "attention_value_expansion_ratio": 4, - "ffn_intermediate_ratio": 4, - "ffn_dropout_prob": 0., - "ffn_act_type": 'gelu', - "vit_num": 1, - }, - "stage_params": [ - {"num_blocks": 3, "channels": 48}, - {"num_blocks": 2, "channels": 96}, - {"num_blocks": 6, "channels": 224}, - {"num_blocks": 4, "channels": 448}, - ], - }) - - -@dataclass -class 
MobileNetV3ArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mobilenetv3", - "params": None, - "stage_params": [ - { - "in_channels": [16], - "kernel_sizes": [3], - "expanded_channels": [16], - "out_channels": [16], - "use_se": [True], - "act_type": ["relu"], - "stride": [2], - }, - { - "in_channels": [16, 24], - "kernel_sizes": [3, 3], - "expanded_channels": [72, 88], - "out_channels": [24, 24], - "use_se": [False, False], - "act_type": ["relu", "relu"], - "stride": [2, 1], - }, - { - "in_channels": [24, 40, 40, 40, 48], - "kernel_sizes": [5, 5, 5, 5, 5], - "expanded_channels": [96, 240, 240, 120, 144], - "out_channels": [40, 40, 40, 48, 48], - "use_se": [True, True, True, True, True], - "act_type": ["hard_swish", "hard_swish", "hard_swish", "hard_swish", "hard_swish"], - "stride": [2, 1, 1, 1, 1], - }, - { - "in_channels": [48, 96, 96], - "kernel_sizes": [5, 5, 5], - "expanded_channels": [288, 576, 576], - "out_channels": [96, 96, 96], - "use_se": [True, True, True], - "act_type": ["hard_swish", "hard_swish", "hard_swish"], - "stride": [2, 1, 1], - }, - ], - }) - - -@dataclass -class MobileViTArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mobilevit", - "params": { - "patch_size": 2, - "num_attention_heads": 4, - "attention_dropout_prob": 0.1, - "ffn_dropout_prob": 0.0, - "output_expansion_ratio": 4, - "use_fusion_layer": True, - }, - "stage_params": [ - { - "out_channels": 32, - "block_type": "mv2", - "num_blocks": 1, - "stride": 1, - "ir_expansion_ratio": 4, - }, - { - "out_channels": 64, - "block_type": "mv2", - "num_blocks": 3, - "stride": 2, - "ir_expansion_ratio": 4, - }, - { - "out_channels": 96, - "block_type": "mobilevit", - "num_blocks": 2, - "stride": 2, - "hidden_size": 144, - "intermediate_size": 288, - "dilate": False, - "ir_expansion_ratio": 4, - }, - { - "out_channels": 128, - "block_type": "mobilevit", - "num_blocks": 4, - 
"stride": 2, - "hidden_size": 192, - "intermediate_size": 384, - "dilate": False, - "ir_expansion_ratio": 4, - }, - { - "out_channels": 160, - "block_type": "mobilevit", - "num_blocks": 3, - "stride": 2, - "hidden_size": 240, - "intermediate_size": 480, - "dilate": False, - "ir_expansion_ratio": 4, - }, - ] - }) - - -@dataclass -class PIDNetArchitectureConfig(ArchitectureConfig): - full: Dict[str, Any] = field(default_factory=lambda: { - "name": "pidnet", - "m": 2, - "n": 3, - "channels": 32, - "ppm_channels": 96, - "head_channels": 128, - }) - - -@dataclass -class ResNetArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "resnet", - "params": { - "block_type": "bottleneck", - "norm_type": "batch_norm", - }, - "stage_params": [ - {"channels": 64, "num_blocks": 3}, - {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False}, - {"channels": 256, "num_blocks": 6, "replace_stride_with_dilation": False}, - {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False}, - ], - }) - - -@dataclass -class SegFormerArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mixtransformer", - "params": { - "ffn_intermediate_expansion_ratio": 4, - "ffn_act_type": "gelu", - "ffn_dropout_prob": 0.0, - "attention_dropout_prob": 0.0, - }, - "stage_params": [ - { - "num_blocks": 2, - "sequence_reduction_ratio": 8, - "attention_chananels": 32, - "embedding_patch_sizes": 7, - "embedding_strides": 4, - "num_attention_heads": 1, - }, - { - "num_blocks": 2, - "sequence_reduction_ratio": 4, - "attention_chananels": 64, - "embedding_patch_sizes": 3, - "num_attention_heads": 2, - }, - { - "num_blocks": 2, - "sequence_reduction_ratio": 2, - "attention_chananels": 160, - "embedding_patch_sizes": 3, - "embedding_strides": 2, - "num_attention_heads": 5, - }, - { - "num_blocks": 2, - "sequence_reduction_ratio": 1, - "attention_chananels": 256, - 
"embedding_patch_sizes": 3, - "embedding_strides": 2, - "num_attention_heads": 8, - }, - ], - }) - - -@dataclass -class ViTArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "vit", - "params": { - "patch_size": 16, - "attention_channels": 192, - "num_blocks": 12, - "num_attention_heads": 3, - "attention_dropout_prob": 0.0, - "ffn_intermediate_channels": 768, - "ffn_dropout_prob": 0.1, - "use_cls_token": True, - "vocab_size": 1000, - }, - "stage_params": None, - }) - - -@dataclass -class MixNetSmallArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mixnet", - "params": { - "stem_channels": 16, - "wid_mul": 1.0, - "dep_mul": 1.0, - "dropout_rate": 0., - }, - "stage_params": [ - { - "expansion_ratio": [1, 6, 3], - "out_channels": [16, 24, 24], - "num_blocks": [1, 1, 1], - "kernel_sizes": [[3], [3], [3]], - "num_exp_groups": [1, 2, 2], - "num_poi_groups": [1, 2, 2], - "stride": [1, 2, 1], - "act_type": ["relu", "relu", "relu"], - "se_reduction_ratio": [None, None, None], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [40, 40], - "num_blocks": [1, 3], - "kernel_sizes": [[3, 5, 7], [3, 5]], - "num_exp_groups": [1, 2], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - "se_reduction_ratio": [2, 2], - }, - { - "expansion_ratio": [6, 6, 6, 3], - "out_channels": [80, 80, 120, 120], - "num_blocks": [1, 2, 1, 2], - "kernel_sizes": [[3, 5, 7], [3, 5], [3, 5, 7], [3, 5, 7, 9]], - "num_exp_groups": [1, 1, 2, 2], - "num_poi_groups": [2, 2, 2, 2], - "stride": [2, 1, 1, 1], - "act_type": ["swish", "swish", "swish", "swish"], - "se_reduction_ratio": [4, 4, 2, 2], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [200, 200], - "num_blocks": [1, 2], - "kernel_sizes": [[3, 5, 7, 9, 11], [3, 5, 7, 9]], - "num_exp_groups": [1, 1], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - 
"se_reduction_ratio": [2, 2], - }, - ], - }) - - -@dataclass -class MixNetMediumArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mixnet", - "params": { - "stem_channels": 24, - "wid_mul": 1.0, - "dep_mul": 1.0, - "dropout_rate": 0., - }, - "stage_params": [ - { - "expansion_ratio": [1, 6, 3], - "out_channels": [24, 32, 32], - "num_blocks": [1, 1, 1], - "kernel_sizes": [[3], [3, 5, 7], [3]], - "num_exp_groups": [1, 2, 2], - "num_poi_groups": [1, 2, 2], - "stride": [1, 2, 1], - "act_type": ["relu", "relu", "relu"], - "se_reduction_ratio": [None, None, None], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [40, 40], - "num_blocks": [1, 3], - "kernel_sizes": [[3, 5, 7, 9], [3, 5]], - "num_exp_groups": [1, 2], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - "se_reduction_ratio": [2, 2], - }, - { - "expansion_ratio": [6, 6, 6, 3], - "out_channels": [80, 80, 120, 120], - "num_blocks": [1, 3, 1, 3], - "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], - "num_exp_groups": [1, 2, 1, 2], - "num_poi_groups": [1, 2, 1, 2], - "stride": [2, 1, 1, 1], - "act_type": ["swish", "swish", "swish", "swish"], - "se_reduction_ratio": [4, 4, 2, 2], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [200, 200], - "num_blocks": [1, 3], - "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], - "num_exp_groups": [1, 1], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - "se_reduction_ratio": [2, 2], - }, - ], - }) - - -@dataclass -class MixNetLargeArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "mixnet", - "params": { - "stem_channels": 24, - "wid_mul": 1.3, - "dep_mul": 1.0, - "dropout_rate": 0., - }, - "stage_params": [ - { - "expansion_ratio": [1, 6, 3], - "out_channels": [24, 32, 32], - "num_blocks": [1, 1, 1], - "kernel_sizes": [[3], [3, 5, 7], [3]], - "num_exp_groups": [1, 2, 2], 
- "num_poi_groups": [1, 2, 2], - "stride": [1, 2, 1], - "act_type": ["relu", "relu", "relu"], - "se_reduction_ratio": [None, None, None], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [40, 40], - "num_blocks": [1, 3], - "kernel_sizes": [[3, 5, 7, 9], [3, 5]], - "num_exp_groups": [1, 2], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - "se_reduction_ratio": [2, 2], - }, - { - "expansion_ratio": [6, 6, 6, 3], - "out_channels": [80, 80, 120, 120], - "num_blocks": [1, 3, 1, 3], - "kernel_sizes": [[3, 5, 7], [3, 5, 7, 9], [3], [3, 5, 7, 9]], - "num_exp_groups": [1, 2, 1, 2], - "num_poi_groups": [1, 2, 1, 2], - "stride": [2, 1, 1, 1], - "act_type": ["swish", "swish", "swish", "swish"], - "se_reduction_ratio": [4, 4, 2, 2], - }, - { - "expansion_ratio": [6, 6], - "out_channels": [200, 200], - "num_blocks": [1, 3], - "kernel_sizes": [[3, 5, 7, 9], [3, 5, 7, 9]], - "num_exp_groups": [1, 1], - "num_poi_groups": [1, 2], - "stride": [2, 1], - "act_type": ["swish", "swish"], - "se_reduction_ratio": [2, 2], - }, - ], - }) - - -@dataclass -class CSPDarkNetSmallArchitectureConfig(ArchitectureConfig): - backbone: Dict[str, Any] = field(default_factory=lambda: { - "name": "cspdarknet", - "params": { - "dep_mul": 0.33, - "wid_mul": 0.5, - "act_type": "silu", - }, - "stage_params": None, - }) - - -@dataclass -class ClassificationEfficientFormerModelConfig(ModelConfig): - task: str = "classification" - name: str = "efficientformer_l1" - architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationEfficientFormerModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "efficientformer_l1" - architecture: 
ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionEfficientFormerModelConfig(ModelConfig): - task: str = "detection" - name: str = "efficientformer_l1" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: EfficientFormerArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) - - -@dataclass -class ClassificationMobileNetV3ModelConfig(ModelConfig): - task: str = "classification" - name: str = "mobilenet_v3_small" - architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationMobileNetV3ModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "mobilenet_v3_small" - 
architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionMobileNetV3ModelConfig(ModelConfig): - task: str = "detection" - name: str = "mobilenet_v3_small" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: MobileNetV3ArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) - - -@dataclass -class ClassificationMobileViTModelConfig(ModelConfig): - task: str = "classification" - name: str = "mobilevit_s" - architecture: ArchitectureConfig = field(default_factory=lambda: MobileViTArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class PIDNetModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "pidnet_s" - architecture: ArchitectureConfig = 
field(default_factory=lambda: PIDNetArchitectureConfig()) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "pidnet_loss", "ignore_index": 255, "weight": None}, - ]) - - -@dataclass -class ClassificationResNetModelConfig(ModelConfig): - task: str = "classification" - name: str = "resnet50" - architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationResNetModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "resnet50" - architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionResNetModelConfig(ModelConfig): - task: str = "detection" - name: str = "resnet50" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: ResNetArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } 
- )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) - - -@dataclass -class SegmentationSegFormerModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "segformer" - architecture: ArchitectureConfig = field(default_factory=lambda: SegFormerArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class ClassificationViTModelConfig(ModelConfig): - task: str = "classification" - name: str = "vit_tiny" - architecture: ArchitectureConfig = field(default_factory=lambda: ViTArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class DetectionYoloXModelConfig(ModelConfig): - task: str = "detection" - name: str = "yolox_s" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: CSPDarkNetSmallArchitectureConfig( - neck={ - "name": "yolopafpn", - "params": { - "dep_mul": 0.33, - "act_type": "silu", - }, - }, - head={ - "name": "anchor_free_decoupled_head", - "params": { - "act_type": "silu", - # postprocessor - decode - "score_thresh": 0.7, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "yolox_loss", "weight": None} - ]) - - -@dataclass -class ClassificationMixNetSmallModelConfig(ModelConfig): - task: str = "classification" - name: str = "mixnet_s" - architecture: ArchitectureConfig = 
field(default_factory=lambda: MixNetSmallArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationMixNetSmallModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "mixnet_s" - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionMixNetSmallModelConfig(ModelConfig): - task: str = "detection" - name: str = "mixnet_s" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetSmallArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) - - -@dataclass -class ClassificationMixNetMediumModelConfig(ModelConfig): - task: str = "classification" - name: str = "mixnet_m" - architecture: ArchitectureConfig = field(default_factory=lambda: 
MixNetMediumArchitectureConfig( - head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationMixNetMediumModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "mixnet_m" - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionMixNetMediumModelConfig(ModelConfig): - task: str = "detection" - name: str = "mixnet_m" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetMediumArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) - - -@dataclass -class ClassificationMixNetLargeModelConfig(ModelConfig): - task: str = "classification" - name: str = "mixnet_l" - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig( - 
head={ - "name": "fc", - "params": { - "intermediate_channels": 1024, - "num_layers": 1, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None} - ]) - - -@dataclass -class SegmentationMixNetLargeModelConfig(ModelConfig): - task: str = "segmentation" - name: str = "mixnet_l" - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig( - head={ - "name": "all_mlp_decoder", - "params": { - "intermediate_channels": 256, - "classifier_dropout_prob": 0., - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "cross_entropy", "ignore_index": 255, "weight": None} - ]) - - -@dataclass -class DetectionMixNetLargeModelConfig(ModelConfig): - task: str = "detection" - name: str = "mixnet_l" - checkpoint: CheckpointConfig = field(default_factory=lambda: CheckpointConfig( - load_head=True - )) - architecture: ArchitectureConfig = field(default_factory=lambda: MixNetLargeArchitectureConfig( - neck={ - "name": "fpn", - "params": { - "num_outs": 4, - "start_level": 0, - "end_level": -1, - "add_extra_convs": False, - "relu_before_extra_convs": False, - }, - }, - head={ - "name": "anchor_decoupled_head", - "params": { - # Anchor parameters - "anchor_sizes": [[32,], [64,], [128,], [256,]], - "aspect_ratios": [0.5, 1.0, 2.0], - "num_layers": 1, - "norm_type": "batch_norm", - # postprocessor - decode - "topk_candidates": 1000, - "score_thresh": 0.05, - # postprocessor - nms - "nms_thresh": 0.45, - "class_agnostic": False, - } - } - )) - losses: List[Dict[str, Any]] = field(default_factory=lambda: [ - {"criterion": "retinanet_loss", "weight": None}, - ]) diff --git a/netspresso/trainer/models/pidnet.py b/netspresso/trainer/models/pidnet.py new file mode 100644 index 00000000..534b5445 --- /dev/null +++ b/netspresso/trainer/models/pidnet.py @@ -0,0 +1,30 @@ +from dataclasses import dataclass, field +from typing import 
Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig + + +@dataclass +class PIDNetArchitectureConfig(ArchitectureConfig): + full: Dict[str, Any] = field( + default_factory=lambda: { + "name": "pidnet", + "m": 2, + "n": 3, + "channels": 32, + "ppm_channels": 96, + "head_channels": 128, + } + ) + + +@dataclass +class PIDNetModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "pidnet_s" + architecture: ArchitectureConfig = field(default_factory=lambda: PIDNetArchitectureConfig()) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "pidnet_loss", "ignore_index": 255, "weight": None}, + ] + ) diff --git a/netspresso/trainer/models/resnet.py b/netspresso/trainer/models/resnet.py new file mode 100644 index 00000000..f9d62db3 --- /dev/null +++ b/netspresso/trainer/models/resnet.py @@ -0,0 +1,201 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class ResNet18ArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "resnet", + "params": { + "block_type": "basicblock", + "norm_type": "batch_norm", + }, + "stage_params": [ + {"channels": 64, "num_blocks": 2}, + {"channels": 128, "num_blocks": 2, "replace_stride_with_dilation": False}, + {"channels": 256, "num_blocks": 2, "replace_stride_with_dilation": False}, + {"channels": 512, "num_blocks": 2, "replace_stride_with_dilation": False}, + ], + } + ) + + +@dataclass +class ResNet34ArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "resnet", + "params": { + "block_type": "basicblock", + "norm_type": "batch_norm", + }, + "stage_params": [ + {"channels": 64, "num_blocks": 3}, + {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False}, + {"channels": 256, "num_blocks": 
6, "replace_stride_with_dilation": False}, + {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False}, + ], + } + ) + + +@dataclass +class ResNet50ArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "resnet", + "params": { + "block_type": "bottleneck", + "norm_type": "batch_norm", + }, + "stage_params": [ + {"channels": 64, "num_blocks": 3}, + {"channels": 128, "num_blocks": 4, "replace_stride_with_dilation": False}, + {"channels": 256, "num_blocks": 6, "replace_stride_with_dilation": False}, + {"channels": 512, "num_blocks": 3, "replace_stride_with_dilation": False}, + ], + } + ) + + +@dataclass +class ClassificationResNet18ModelConfig(ModelConfig): + task: str = "classification" + name: str = "resnet18" + architecture: ArchitectureConfig = field( + default_factory=lambda: ResNet18ArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class ClassificationResNet34ModelConfig(ModelConfig): + task: str = "classification" + name: str = "resnet34" + architecture: ArchitectureConfig = field( + default_factory=lambda: ResNet34ArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class ClassificationResNet50ModelConfig(ModelConfig): + task: str = "classification" + name: str = "resnet50" + architecture: ArchitectureConfig = field( + default_factory=lambda: ResNet50ArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + 
"intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) + + +@dataclass +class SegmentationResNet50ModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "resnet50" + architecture: ArchitectureConfig = field( + default_factory=lambda: ResNet50ArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) + + +@dataclass +class DetectionResNet50ModelConfig(ModelConfig): + task: str = "detection" + name: str = "resnet50" + architecture: ArchitectureConfig = field( + default_factory=lambda: ResNet50ArchitectureConfig( + neck={ + "name": "fpn", + "params": { + "num_outs": 4, + "start_level": 0, + "end_level": -1, + "add_extra_convs": False, + "relu_before_extra_convs": False, + }, + }, + head={ + "name": "anchor_decoupled_head", + "params": { + # Anchor parameters + "anchor_sizes": [ + [ + 32, + ], + [ + 64, + ], + [ + 128, + ], + [ + 256, + ], + ], + "aspect_ratios": [0.5, 1.0, 2.0], + "num_layers": 1, + "norm_type": "batch_norm", + # postprocessor - decode + "topk_candidates": 1000, + "score_thresh": 0.05, + # postprocessor - nms + "nms_thresh": 0.45, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [ + {"criterion": "retinanet_loss", "weight": None}, + ] + ) diff --git a/netspresso/trainer/models/rtmpose.py b/netspresso/trainer/models/rtmpose.py new file mode 100644 index 00000000..8f650b5a --- /dev/null +++ b/netspresso/trainer/models/rtmpose.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import 
ArchitectureConfig, ModelConfig +from netspresso.trainer.models.mobilenetv3 import MobileNetV3SmallArchitectureConfig + + +@dataclass +class PoseEstimationMobileNetV3SmallModelConfig(ModelConfig): + task: str = "pose_estimation" + name: str = "mobilenet_v3_small" + architecture: ArchitectureConfig = field( + default_factory=lambda: MobileNetV3SmallArchitectureConfig( + head={ + "name": "rtmcc", + "params": { + "conv_kernel": 7, + "attention_channels": 256, + "attention_act_type": "silu", + "attention_pos_enc": False, + "s": 128, + "expansion_factor": 2, + "dropout_rate": 0.0, + "drop_path": 0.0, + "use_rel_bias": False, + "simcc_split_ratio": 2.0, + "target_size": [256, 256], + "backbone_stride": 32, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field(default_factory=lambda: [{"criterion": "rtmcc_loss", "weight": None}]) diff --git a/netspresso/trainer/models/segformer.py b/netspresso/trainer/models/segformer.py new file mode 100644 index 00000000..57235a07 --- /dev/null +++ b/netspresso/trainer/models/segformer.py @@ -0,0 +1,73 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig + + +@dataclass +class SegFormerB0ArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "mixtransformer", + "params": { + "ffn_intermediate_expansion_ratio": 4, + "ffn_act_type": "gelu", + "ffn_dropout_prob": 0.0, + "attention_dropout_prob": 0.0, + }, + "stage_params": [ + { + "num_blocks": 2, + "sequence_reduction_ratio": 8, + "attention_chananels": 32, + "embedding_patch_sizes": 7, + "embedding_strides": 4, + "num_attention_heads": 1, + }, + { + "num_blocks": 2, + "sequence_reduction_ratio": 4, + "attention_chananels": 64, + "embedding_patch_sizes": 3, + "num_attention_heads": 2, + }, + { + "num_blocks": 2, + "sequence_reduction_ratio": 2, + "attention_chananels": 160, + "embedding_patch_sizes": 3, + 
"embedding_strides": 2, + "num_attention_heads": 5, + }, + { + "num_blocks": 2, + "sequence_reduction_ratio": 1, + "attention_chananels": 256, + "embedding_patch_sizes": 3, + "embedding_strides": 2, + "num_attention_heads": 8, + }, + ], + } + ) + + +@dataclass +class SegmentationSegFormerB0ModelConfig(ModelConfig): + task: str = "segmentation" + name: str = "segformer_b0" + architecture: ArchitectureConfig = field( + default_factory=lambda: SegFormerB0ArchitectureConfig( + head={ + "name": "all_mlp_decoder", + "params": { + "intermediate_channels": 256, + "classifier_dropout_prob": 0.0, + "resize_output": [512, 512], + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "seg_cross_entropy", "ignore_index": 255, "weight": None}] + ) diff --git a/netspresso/trainer/models/vit.py b/netspresso/trainer/models/vit.py new file mode 100644 index 00000000..1f605d49 --- /dev/null +++ b/netspresso/trainer/models/vit.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, ModelConfig + + +@dataclass +class ViTTinyArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "vit", + "params": { + "patch_size": 16, + "attention_channels": 192, + "num_blocks": 12, + "num_attention_heads": 3, + "attention_dropout_prob": 0.0, + "ffn_intermediate_channels": 768, + "ffn_dropout_prob": 0.1, + "use_cls_token": True, + "vocab_size": 1000, + }, + "stage_params": None, + } + ) + + +@dataclass +class ClassificationViTTinyModelConfig(ModelConfig): + task: str = "classification" + name: str = "vit_tiny" + architecture: ArchitectureConfig = field( + default_factory=lambda: ViTTinyArchitectureConfig( + head={ + "name": "fc", + "params": { + "num_layers": 1, + "intermediate_channels": None, + "act_type": None, + "dropout_prob": 0.0, + }, + } + ) + ) + losses: List[Dict[str, Any]] = field( + 
default_factory=lambda: [{"criterion": "cross_entropy", "label_smoothing": 0.1, "weight": None}] + ) diff --git a/netspresso/trainer/models/yolox.py b/netspresso/trainer/models/yolox.py new file mode 100644 index 00000000..6b3f1a31 --- /dev/null +++ b/netspresso/trainer/models/yolox.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List + +from netspresso.trainer.models.base import ArchitectureConfig, CheckpointConfig, ModelConfig + + +@dataclass +class CSPDarkNetXArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "cspdarknet", + "params": { + "dep_mul": 1.33, + "wid_mul": 1.25, + "act_type": "silu", + }, + "stage_params": None, + } + ) + + +@dataclass +class CSPDarkNetLArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "cspdarknet", + "params": { + "dep_mul": 1.0, + "wid_mul": 1.0, + "act_type": "silu", + }, + "stage_params": None, + } + ) + + +@dataclass +class CSPDarkNetMArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "cspdarknet", + "params": { + "dep_mul": 0.67, + "wid_mul": 0.75, + "act_type": "silu", + }, + "stage_params": None, + } + ) + + +@dataclass +class CSPDarkNetSArchitectureConfig(ArchitectureConfig): + backbone: Dict[str, Any] = field( + default_factory=lambda: { + "name": "cspdarknet", + "params": { + "dep_mul": 0.33, + "wid_mul": 0.5, + "act_type": "silu", + }, + "stage_params": None, + } + ) + + +@dataclass +class DetectionYoloXXModelConfig(ModelConfig): + task: str = "detection" + name: str = "yolox_x" + architecture: ArchitectureConfig = field( + default_factory=lambda: CSPDarkNetXArchitectureConfig( + neck={ + "name": "yolopafpn", + "params": { + "dep_mul": 1.33, + "act_type": "silu", + }, + }, + head={ + "name": "anchor_free_decoupled_head", + "params": { + "act_type": "silu", + # postprocessor - decode + 
"score_thresh": 0.01, + # postprocessor - nms + "nms_thresh": 0.65, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}] + ) + + +@dataclass +class DetectionYoloXLModelConfig(ModelConfig): + task: str = "detection" + name: str = "yolox_l" + architecture: ArchitectureConfig = field( + default_factory=lambda: CSPDarkNetLArchitectureConfig( + neck={ + "name": "yolopafpn", + "params": { + "dep_mul": 1.0, + "act_type": "silu", + }, + }, + head={ + "name": "anchor_free_decoupled_head", + "params": { + "act_type": "silu", + # postprocessor - decode + "score_thresh": 0.01, + # postprocessor - nms + "nms_thresh": 0.65, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}] + ) + + +@dataclass +class DetectionYoloXMModelConfig(ModelConfig): + task: str = "detection" + name: str = "yolox_m" + architecture: ArchitectureConfig = field( + default_factory=lambda: CSPDarkNetMArchitectureConfig( + neck={ + "name": "yolopafpn", + "params": { + "dep_mul": 0.67, + "act_type": "silu", + }, + }, + head={ + "name": "anchor_free_decoupled_head", + "params": { + "act_type": "silu", + # postprocessor - decode + "score_thresh": 0.01, + # postprocessor - nms + "nms_thresh": 0.65, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}] + ) + + +@dataclass +class DetectionYoloXSModelConfig(ModelConfig): + task: str = "detection" + name: str = "yolox_s" + architecture: ArchitectureConfig = field( + default_factory=lambda: CSPDarkNetSArchitectureConfig( + neck={ + "name": "yolopafpn", + "params": { + "dep_mul": 0.33, + "act_type": "silu", + }, + }, + head={ + "name": "anchor_free_decoupled_head", + "params": { + "act_type": 
"silu", + # postprocessor - decode + "score_thresh": 0.01, + # postprocessor - nms + "nms_thresh": 0.65, + "class_agnostic": False, + }, + }, + ) + ) + losses: List[Dict[str, Any]] = field( + default_factory=lambda: [{"criterion": "yolox_loss", "weight": None, "l1_activate_epoch": 1}] + ) diff --git a/netspresso/trainer/trainer.py b/netspresso/trainer/trainer.py index 8fb70dd8..26db399b 100644 --- a/netspresso/trainer/trainer.py +++ b/netspresso/trainer/trainer.py @@ -5,7 +5,7 @@ from omegaconf import OmegaConf from netspresso.enums import Status, Task, TaskType -from netspresso.trainer.augmentations import AUGMENTATION_CONFIG_TYPE, AugmentationConfig, Inference, Train, Transform +from netspresso.trainer.augmentations import AUGMENTATION_CONFIG_TYPE, AugmentationConfig, Transform from netspresso.trainer.data import DATA_CONFIG_TYPE, ImageLabelPathConfig, PathConfig from netspresso.trainer.models import ( CLASSIFICATION_MODELS, @@ -131,9 +131,9 @@ def set_dataset_config( root_path: str, train_image: str = "images/train", train_label: str = "labels/train", - valid_image: str = "images/val", - valid_label: str = "labels/val", - id_mapping: Optional[Union[List[str], Dict[str, str]]] = None, + valid_image: str = "images/valid", + valid_label: str = "labels/valid", + id_mapping: Optional[Union[List[str], Dict[str, str], str]] = None, ): """Set the dataset configuration for the Trainer. @@ -158,6 +158,33 @@ def set_dataset_config( } self.data = DATA_CONFIG_TYPE[self.task](**common_config) + def check_paths_exist(self, base_path): + paths = [ + "images/train", + "labels/train", + "images/valid", + "labels/valid", + "id_mapping.json", + ] + + for relative_path in paths: + path = Path(base_path) / relative_path + if not path.exists(): + if path.suffix: + raise FileNotFoundError(f"The required file '{relative_path}' does not exist. 
Please check and make sure it is in the correct location.") + else: + raise FileNotFoundError(f"The required directory '{relative_path}' does not exist. Please check and make sure it is in the correct location.") + + def set_dataset(self, dataset_root_path: str): + dataset_name = Path(dataset_root_path).name + + self.check_paths_exist(dataset_root_path) + self.set_dataset_config( + name=dataset_name, + root_path=dataset_root_path, + id_mapping="id_mapping.json", + ) + def set_model_config( self, model_name: str, @@ -235,33 +262,26 @@ def set_training_config( self.training = ScheduleConfig( epochs=epochs, - batch_size=batch_size, optimizer=optimizer.asdict(), scheduler=scheduler.asdict(), ) + self.environment.batch_size = batch_size def set_augmentation_config( self, train_transforms: Optional[List] = None, - train_mix_transforms: Optional[List] = None, inference_transforms: Optional[List] = None, ): """Set the augmentation configuration for training. Args: train_transforms (List, optional): List of transforms for training. Defaults to None. - train_mix_transforms (List, optional): List of mix transforms for training. Defaults to None. inference_transforms (List, optional): List of transforms for inference. Defaults to None. 
""" self.augmentation = AugmentationConfig( - train=Train( - transforms=train_transforms, - mix_transforms=train_mix_transforms, - ), - inference=Inference( - transforms=inference_transforms, - ), + train=train_transforms, + inference=inference_transforms, ) def set_logging_config( @@ -345,9 +365,8 @@ def _apply_img_size(self): """ self.augmentation.img_size = self.img_size - self.augmentation.train.transforms = self._change_transforms(self.augmentation.train.transforms) - self.augmentation.train.mix_transforms = self._change_transforms(self.augmentation.train.mix_transforms) - self.augmentation.inference.transforms = self._change_transforms(self.augmentation.inference.transforms) + self.augmentation.train = self._change_transforms(self.augmentation.train) + self.augmentation.inference = self._change_transforms(self.augmentation.inference) def train(self, gpus: str, project_name: str) -> Dict: """Train the model with the specified configuration. @@ -369,6 +388,7 @@ def train(self, gpus: str, project_name: str) -> Dict: destination_folder = FileHandler.create_unique_folder(folder_path=destination_folder) metadata = MetadataHandler.init_metadata(folder_path=destination_folder, task_type=TaskType.TRAIN) self.logging.project_id = Path(destination_folder).name + self.environment.gpus = gpus configs = TrainerConfigs( self.data, @@ -414,7 +434,7 @@ def train(self, gpus: str, project_name: str) -> Dict: dataset=self.data.name, input_shapes=[InputShape(batch=1, channel=3, dimension=[self.img_size, self.img_size])], ) - metadata.update_training_info(epoch=self.training.epochs, batch_size=self.training.batch_size) + metadata.update_training_info(epoch=self.training.epochs, batch_size=self.environment.batch_size) metadata.update_training_result(training_summary=training_summary) metadata.update_logging_dir(logging_dir=destination_folder.as_posix()) metadata.update_hparams(hparams=hparams_path.as_posix()) diff --git a/netspresso/trainer/training/environment.py 
b/netspresso/trainer/training/environment.py index 04ec85be..71600639 100644 --- a/netspresso/trainer/training/environment.py +++ b/netspresso/trainer/training/environment.py @@ -6,3 +6,4 @@ class EnvironmentConfig: seed: int = 1 num_workers: int = 4 gpus: str = "0" + batch_size: int = 8 diff --git a/netspresso/trainer/training/logging.py b/netspresso/trainer/training/logging.py index ef7643d1..b6446c81 100644 --- a/netspresso/trainer/training/logging.py +++ b/netspresso/trainer/training/logging.py @@ -1,8 +1,6 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, Union - -from omegaconf import MISSING, MissingMandatoryValue +from typing import List, Optional, Union @dataclass @@ -10,10 +8,10 @@ class LoggingConfig: project_id: Optional[str] = None output_dir: Union[Path, str] = "./outputs" tensorboard: bool = True - csv: bool = False image: bool = True stdout: bool = True save_optimizer_state: bool = True + onnx_input_size: List = field(default_factory=lambda: [512, 512]) validation_epoch: int = 10 save_checkpoint_epoch: Optional[int] = None diff --git a/netspresso/trainer/training/training.py b/netspresso/trainer/training/training.py index 9f2e8411..667ca1e8 100644 --- a/netspresso/trainer/training/training.py +++ b/netspresso/trainer/training/training.py @@ -1,13 +1,11 @@ from dataclasses import dataclass, field -from typing import Dict - -from omegaconf import MISSING, MissingMandatoryValue +from typing import Dict, Optional @dataclass class ScheduleConfig: epochs: int = 3 - batch_size: int = 8 + ema: Optional[Dict] = field(default=None) optimizer: Dict = field(default_factory=lambda: { "name": "adamw", "lr": 6e-5, @@ -24,7 +22,7 @@ class ScheduleConfig: @dataclass class ClassificationScheduleConfig(ScheduleConfig): - batch_size: int = 32 + pass @dataclass diff --git a/requirements.txt b/requirements.txt index 14e415e4..fc755032 100644 --- a/requirements.txt +++ b/requirements.txt 
@@ -6,6 +6,6 @@ requests>=2.30.0 email-validator==2.0.0 pytz>=2023.3 typing_extensions==4.5.0 -netspresso_trainer==0.1.2 +netspresso_trainer==0.2.2 PyGithub>=2.1.1 matplotlib>=3.7.4