diff --git a/docs/conf.py b/docs/conf.py
index 82583c6b..4cc70a6b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -100,7 +100,6 @@ def get_version():
     "timm",
     "cv2",
     "PIL",
-    "pretrainedmodels",
     "torchvision",
     "segmentation_models_pytorch.encoders",
     "segmentation_models_pytorch.utils",
diff --git a/licenses/LICENSES.md b/licenses/LICENSES.md
index 06f36241..e51ad8d0 100644
--- a/licenses/LICENSES.md
+++ b/licenses/LICENSES.md
@@ -13,14 +13,18 @@ The majority of the code is licensed under the [MIT License](LICENSE). However,
   * [segmentation_models_pytorch/encoders/mix_transformer.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/mix_transformer.py)
   * [LICENSE_nvidia](LICENSE_nvidia.md)
 
-
 - Apple License
   * Applies to the MobileOne encoder
   * [segmentation_models_pytorch/encoders/mobileone.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/mobileone.py)
   * [LICENSE_apple](LICENSE_apple.md)
 
 - BSD 3-Clause License
-  * Applies to the DeepLabV3 decoder
+  * Applies to several encoders and the DeepLabV3 decoder
+  * [segmentation_models_pytorch/encoders/_dpn.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_dpn.py)
+  * [segmentation_models_pytorch/encoders/_inceptionresnetv2.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_inceptionresnetv2.py)
+  * [segmentation_models_pytorch/encoders/_inceptionv4.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_inceptionv4.py)
+  * [segmentation_models_pytorch/encoders/_senet.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_senet.py)
+  * [segmentation_models_pytorch/encoders/_xception.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_xception.py)
   * [segmentation_models_pytorch/decoders/deeplabv3/decoder.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/decoders/deeplabv3/decoder.py)
 
 - Apache-2.0 License
diff --git a/pyproject.toml b/pyproject.toml
index 492f7ef9..f3e55a96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,9 +20,7 @@ dependencies = [
     'huggingface-hub>=0.24',
     'numpy>=1.19.3',
     'pillow>=8',
-    'pretrainedmodels>=0.7.1',
     'safetensors>=0.3.1',
-    'six>=1.5',
     'timm>=0.9',
     'torch>=1.8',
     'torchvision>=0.9',
diff --git a/requirements/minimum.old b/requirements/minimum.old
index 1adb97f5..678f83f4 100644
--- a/requirements/minimum.old
+++ b/requirements/minimum.old
@@ -1,9 +1,7 @@
 huggingface-hub==0.24.0
 numpy==1.19.3
 pillow==8.0.0
-pretrainedmodels==0.7.1
 safetensors==0.3.1
-six==1.5.0
 timm==0.9.0
 torch==1.9.0
 torchvision==0.10.0
diff --git a/requirements/required.txt b/requirements/required.txt
index 6864e1f6..cf3db498 100644
--- a/requirements/required.txt
+++ b/requirements/required.txt
@@ -1,9 +1,7 @@
 huggingface_hub==0.27.1
 numpy==2.2.1
 pillow==11.1.0
-pretrainedmodels==0.7.4
 safetensors==0.5.2
-six==1.17.0
 timm==1.0.13
 torch==2.5.1
 torchvision==0.20.1
diff --git a/segmentation_models_pytorch/__init__.py b/segmentation_models_pytorch/__init__.py
index f1807836..8a1e17fe 100644
--- a/segmentation_models_pytorch/__init__.py
+++ b/segmentation_models_pytorch/__init__.py
@@ -1,5 +1,3 @@
-import warnings
-
 from . import datasets
 from . import encoders
 from . import decoders
@@ -24,12 +22,6 @@
 from typing import Optional as _Optional
 import torch as _torch
 
-# Suppress the specific SyntaxWarning for `pretrainedmodels`
-warnings.filterwarnings("ignore", message="is with a literal", category=SyntaxWarning)
-warnings.filterwarnings(
-    "ignore", message=r'"is" with \'str\' literal.*', category=SyntaxWarning
-)  # for python >= 3.12
-
 _MODEL_ARCHITECTURES = [
     Unet,
     UnetPlusPlus,
diff --git a/segmentation_models_pytorch/encoders/_dpn.py b/segmentation_models_pytorch/encoders/_dpn.py
new file mode 100644
index 00000000..db3cb29a
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_dpn.py
@@ -0,0 +1,610 @@
+"""PyTorch implementation of DualPathNetworks
+Ported to PyTorch by [Ross Wightman](https://github.com/rwightman/pytorch-dpn-pretrained)
+
+Based on original MXNet implementation https://github.com/cypw/DPNs with
+many ideas from another PyTorch implementation https://github.com/oyam/pytorch-DPNs.
+
+This implementation is compatible with the pretrained weights
+from cypw's MXNet implementation.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+from collections import OrderedDict
+
+__all__ = ["DPN", "dpn68", "dpn68b", "dpn92", "dpn98", "dpn131", "dpn107"]
+
+pretrained_settings = {  # architecture name -> weights tag -> checkpoint metadata
+    "dpn68": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn68-4af7d88d2.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+    "dpn68b": {
+        "imagenet+5k": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn68b_extra-363ab9c19.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+    "dpn92": {  # only "imagenet+5k" is active; the "imagenet" entry below is disabled
+        # 'imagenet': {
+        #     'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn68-66bebafa7.pth',  # NOTE(review): dpn68 file name under dpn92 - confirm before re-enabling
+        #     'input_space': 'RGB',
+        #     'input_size': [3, 224, 224],
+        #     'input_range': [0, 1],
+        #     'mean': [124 / 255, 117 / 255, 104 / 255],
+        #     'std': [1 / (.0167 * 255)] * 3,
+        #     'num_classes': 1000
+        # },
+        "imagenet+5k": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn92_extra-fda993c95.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+    "dpn98": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn98-722954780.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+    "dpn131": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn131-7af84be88.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+    "dpn107": {
+        "imagenet+5k": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/dpn107_extra-b7f9f4cc9.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [124 / 255, 117 / 255, 104 / 255],
+            "std": [1 / (0.0167 * 255)] * 3,
+            "num_classes": 1000,
+        }
+    },
+}
+
+
+def dpn68(num_classes=1000, pretrained="imagenet"):
+    """Build the small DPN-68 network, loading `pretrained` weights when truthy."""
+    net = DPN(
+        small=True,
+        num_init_features=10,
+        k_r=128,
+        groups=32,
+        k_sec=(3, 4, 12, 3),
+        inc_sec=(16, 32, 32, 64),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn68"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+def dpn68b(num_classes=1000, pretrained="imagenet+5k"):
+    """Build the small DPN-68b network (`b=True`); load `pretrained` weights if truthy."""
+    net = DPN(
+        small=True,
+        num_init_features=10,
+        k_r=128,
+        groups=32,
+        b=True,
+        k_sec=(3, 4, 12, 3),
+        inc_sec=(16, 32, 32, 64),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn68b"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+def dpn92(num_classes=1000, pretrained="imagenet+5k"):
+    """Build the DPN-92 network, loading `pretrained` weights when truthy."""
+    net = DPN(
+        num_init_features=64,
+        k_r=96,
+        groups=32,
+        k_sec=(3, 4, 20, 3),
+        inc_sec=(16, 32, 24, 128),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn92"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+def dpn98(num_classes=1000, pretrained="imagenet"):
+    """Build the DPN-98 network, loading `pretrained` weights when truthy."""
+    net = DPN(
+        num_init_features=96,
+        k_r=160,
+        groups=40,
+        k_sec=(3, 6, 20, 3),
+        inc_sec=(16, 32, 32, 128),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn98"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+def dpn131(num_classes=1000, pretrained="imagenet"):
+    """Build the DPN-131 network, loading `pretrained` weights when truthy."""
+    net = DPN(
+        num_init_features=128,
+        k_r=160,
+        groups=40,
+        k_sec=(4, 8, 28, 3),
+        inc_sec=(16, 32, 32, 128),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn131"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+def dpn107(num_classes=1000, pretrained="imagenet+5k"):
+    """Build the DPN-107 network, loading `pretrained` weights when truthy."""
+    net = DPN(
+        num_init_features=128,
+        k_r=200,
+        groups=50,
+        k_sec=(4, 8, 20, 3),
+        inc_sec=(20, 64, 64, 128),
+        num_classes=num_classes,
+        test_time_pool=True,
+    )
+    if not pretrained:
+        return net
+
+    cfg = pretrained_settings["dpn107"][pretrained]
+    expected = cfg["num_classes"]
+    # The requested head size must equal the entry's declared `num_classes`.
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    net.load_state_dict(model_zoo.load_url(cfg["url"]))
+    # Copy the input/normalization fields of the entry onto the model object.
+    for field in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(net, field, cfg[field])
+    return net
+
+
+class CatBnAct(nn.Module):
+    """Concat a tuple input on dim 1, then BatchNorm2d + activation (own ReLU if None)."""
+    def __init__(self, in_chs, activation_fn=None):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
+        self.act = nn.ReLU(inplace=True) if activation_fn is None else activation_fn
+
+    def forward(self, x):
+        return self.act(self.bn(torch.cat(x, dim=1) if isinstance(x, tuple) else x))
+
+
+class BnActConv2d(nn.Module):
+    """Pre-activation unit: BatchNorm2d -> activation -> bias-free Conv2d."""
+
+    def __init__(
+        self, in_chs, out_chs, kernel_size, stride, padding=0, groups=1,
+        activation_fn=None,
+    ):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
+        # Build the default ReLU here so instances do not share one module object
+        # (a module created in the signature is evaluated once and reused).
+        self.act = nn.ReLU(inplace=True) if activation_fn is None else activation_fn
+        self.conv = nn.Conv2d(
+            in_chs, out_chs, kernel_size, stride, padding, groups=groups, bias=False
+        )
+
+    def forward(self, x):
+        """Normalize, activate, then convolve `x`."""
+        normed = self.bn(x)
+        return self.conv(self.act(normed))
+
+
+class InputBlock(nn.Module):
+    """Stem: stride-2 conv on a 3-channel input, BatchNorm2d, activation, max-pool."""
+
+    def __init__(
+        self,
+        num_init_features,
+        kernel_size=7,
+        padding=3,
+        activation_fn=None,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            3,
+            num_init_features,
+            kernel_size=kernel_size,
+            stride=2,
+            padding=padding,
+            bias=False,
+        )
+        self.bn = nn.BatchNorm2d(num_init_features, eps=0.001)
+        # Default ReLU is created per instance instead of once in the signature.
+        self.act = nn.ReLU(inplace=True) if activation_fn is None else activation_fn
+        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        """Run `x` through conv, batch norm, activation and pooling, in that order."""
+        return self.pool(self.act(self.bn(self.conv(x))))
+
+
+class DualPathBlock(nn.Module):
+    """Dual-path block mapping its input to a `(resid, dense)` pair of tensors.
+
+    `block_type` selects the shortcut: "proj" and "down" add a 1x1 projection
+    (stride 1 and 2 respectively), while "normal" reuses the incoming pair.
+    With `b` set, the last stage uses two separate 1x1 convs for the outputs.
+    """
+
+    def __init__(
+        self,
+        in_chs,
+        num_1x1_a,
+        num_3x3_b,
+        num_1x1_c,
+        inc,
+        groups,
+        block_type="normal",
+        b=False,
+    ):
+        super().__init__()
+        self.num_1x1_c = num_1x1_c
+        self.inc = inc
+        self.b = b
+        assert block_type in ("proj", "down", "normal")
+        self.key_stride = 2 if block_type == "down" else 1
+        self.has_proj = block_type in ("proj", "down")
+
+        if self.has_proj:
+            projection = BnActConv2d(
+                in_chs=in_chs,
+                out_chs=num_1x1_c + 2 * inc,
+                kernel_size=1,
+                stride=self.key_stride,
+            )
+            # Distinct member names per stride make parameter keys easier to match
+            # when converting weights.
+            if self.key_stride == 2:
+                self.c1x1_w_s2 = projection
+            else:
+                self.c1x1_w_s1 = projection
+        self.c1x1_a = BnActConv2d(
+            in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1
+        )
+        self.c3x3_b = BnActConv2d(
+            in_chs=num_1x1_a,
+            out_chs=num_3x3_b,
+            kernel_size=3,
+            stride=self.key_stride,
+            padding=1,
+            groups=groups,
+        )
+        if not b:
+            self.c1x1_c = BnActConv2d(
+                in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1
+            )
+        else:
+            self.c1x1_c = CatBnAct(in_chs=num_3x3_b)
+            self.c1x1_c1 = nn.Conv2d(num_3x3_b, num_1x1_c, kernel_size=1, bias=False)
+            self.c1x1_c2 = nn.Conv2d(num_3x3_b, inc, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        """Return `(resid, dense)` for `x`, a tensor or a tuple of tensors."""
+        merged = torch.cat(x, dim=1) if isinstance(x, tuple) else x
+        if self.has_proj:
+            shortcut = self.c1x1_w_s2 if self.key_stride == 2 else self.c1x1_w_s1
+            projected = shortcut(merged)
+            x_s1 = projected[:, : self.num_1x1_c, :, :]
+            x_s2 = projected[:, self.num_1x1_c :, :, :]
+        else:
+            # No projection: the input itself supplies both shortcut tensors.
+            x_s1, x_s2 = x[0], x[1]
+        hidden = self.c3x3_b(self.c1x1_a(merged))
+        hidden = self.c1x1_c(hidden)
+        if self.b:
+            out1, out2 = self.c1x1_c1(hidden), self.c1x1_c2(hidden)
+        else:
+            # One conv produced both parts; split them at `num_1x1_c` channels.
+            out1 = hidden[:, : self.num_1x1_c, :, :]
+            out2 = hidden[:, self.num_1x1_c :, :, :]
+        resid = x_s1 + out1
+        dense = torch.cat([x_s2, out2], dim=1)
+        return resid, dense
+
+
+class DPN(nn.Module):
+    """Dual Path Network: a stem, four stages of dual-path blocks and a 1x1-conv head.
+
+    `features` is an `nn.Sequential` built from an ordered mapping whose keys are
+    "conv1_1", "conv<stage>_<i>" for stages 2 to 5, and "conv5_bn_ac".
+
+    Args:
+        small: use a 3x3 stem conv and a width factor of 1 (otherwise 7x7 and 4).
+        num_init_features: number of output channels of the stem.
+        k_r: scales `r`, which every block receives as `num_1x1_a` and `num_3x3_b`.
+        groups: `groups` argument forwarded to every block.
+        b: `b` flag forwarded to every block.
+        k_sec: number of blocks in each of the four stages.
+        inc_sec: `inc` value used by the blocks of each of the four stages.
+        num_classes: number of output channels of the classifier conv.
+        test_time_pool: in eval mode, classify a stride-1 7x7 average-pooled map
+            and reduce the result with "avgmax" pooling.
+    """
+
+    def __init__(
+        self,
+        small=False,
+        num_init_features=64,
+        k_r=96,
+        groups=32,
+        b=False,
+        k_sec=(3, 4, 20, 3),
+        inc_sec=(16, 32, 24, 128),
+        num_classes=1000,
+        test_time_pool=False,
+    ):
+        super().__init__()
+        self.test_time_pool = test_time_pool
+        self.b = b
+        # Stage widths are 64/128/256/512 multiplied by this factor.
+        bw_factor = 1 if small else 4
+
+        # Insertion order fixes both the execution order and the submodule names.
+        blocks = OrderedDict()
+
+        # conv1: the stem only differs in its kernel size and padding.
+        stem_kernel, stem_padding = (3, 1) if small else (7, 3)
+        blocks["conv1_1"] = InputBlock(
+            num_init_features, kernel_size=stem_kernel, padding=stem_padding
+        )
+
+        # conv2 .. conv5: each stage opens with a projecting block ("proj" for
+        # conv2, "down" afterwards) followed by `k_sec[idx] - 1` "normal" blocks.
+        in_chs = num_init_features
+        for idx, base_width in enumerate((64, 128, 256, 512)):
+            prefix = "conv%d_" % (idx + 2)
+            bw = base_width * bw_factor
+            inc = inc_sec[idx]
+            r = (k_r * bw) // (64 * bw_factor)
+            opener = "proj" if idx == 0 else "down"
+            blocks[prefix + "1"] = DualPathBlock(
+                in_chs, r, r, bw, inc, groups, opener, b
+            )
+            # The opener resets the channel count; each later block adds `inc`.
+            in_chs = bw + 3 * inc
+            for i in range(2, k_sec[idx] + 1):
+                blocks[prefix + str(i)] = DualPathBlock(
+                    in_chs, r, r, bw, inc, groups, "normal", b
+                )
+                in_chs += inc
+
+        # Final batch norm + activation over the concatenated block outputs.
+        blocks["conv5_bn_ac"] = CatBnAct(in_chs)
+
+        self.features = nn.Sequential(blocks)
+
+        # Using 1x1 conv for the FC layer to allow the extra pooling scheme
+        self.last_linear = nn.Conv2d(in_chs, num_classes, kernel_size=1, bias=True)
+
+    def logits(self, features):
+        """Turn a feature map into a `(batch, -1)` tensor of class scores.
+
+        In training mode, or when `test_time_pool` is off, the features are
+        globally average-pooled before the classifier conv. Otherwise the
+        classifier runs on a 7x7 stride-1 average-pooled map and its output is
+        reduced with "avgmax" pooling.
+        """
+        if self.training or not self.test_time_pool:
+            pooled = adaptive_avgmax_pool2d(features, pool_type="avg")
+            out = self.last_linear(pooled)
+        else:
+            pooled = F.avg_pool2d(features, kernel_size=7, stride=1)
+            out = self.last_linear(pooled)
+            # The extra test time pool should be pooling an img_size//32 - 6 size patch
+            out = adaptive_avgmax_pool2d(out, pool_type="avgmax")
+        return out.view(out.size(0), -1)
+
+    def forward(self, input):
+        """Run the feature extractor on `input`, then the classifier head."""
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+
+""" PyTorch selectable adaptive pooling
+Adaptive pooling with the ability to select the type of pooling from:
+    * 'avg' - Average pooling
+    * 'max' - Max pooling
+    * 'avgmax' - Sum of average and max pooling re-scaled by 0.5
+    * 'avgmaxc' - Concatenation of average and max pooling along feature dim, doubles feature dim
+
+Both a functional and an nn.Module version of the pooling are provided.
+
+Author: Ross Wightman (rwightman)
+"""
+
+
+def pooling_factor(pool_type="avg"):
+    return 1 if pool_type != "avgmaxc" else 2  # only "avgmaxc" concatenates two maps
+
+
+def adaptive_avgmax_pool2d(x, pool_type="avg", padding=0, count_include_pad=False):
+    """Selectable global pooling function with dynamic input kernel size.
+
+    The pooling window always spans the full `(x.size(2), x.size(3))` extent.
+
+    Args:
+        x: input tensor; dims 2 and 3 are pooled.
+        pool_type: "avg", "max", "avgmax" (0.5 * (avg + max)) or "avgmaxc"
+            (avg and max concatenated on dim 1). Any other value prints a
+            notice and falls back to "avg".
+        padding: forwarded to both `F.avg_pool2d` and `F.max_pool2d`.
+        count_include_pad: forwarded to `F.avg_pool2d` only.
+
+    Returns:
+        The pooled tensor.
+    """
+    window = (x.size(2), x.size(3))
+
+    def _avg():
+        return F.avg_pool2d(
+            x, kernel_size=window, padding=padding, count_include_pad=count_include_pad
+        )
+
+    def _max():
+        return F.max_pool2d(x, kernel_size=window, padding=padding)
+
+    # Local helpers keep each of the four branches down to a single expression.
+    if pool_type == "avgmaxc":
+        return torch.cat([_avg(), _max()], dim=1)
+    if pool_type == "avgmax":
+        return 0.5 * (_avg() + _max())
+    if pool_type == "max":
+        return _max()
+    if pool_type != "avg":
+        print(
+            "Invalid pool type %s specified. Defaulting to average pooling."
+            % pool_type
+        )
+    return _avg()
+
+
+class AdaptiveAvgMaxPool2d(torch.nn.Module):
+    """Selectable global pooling layer with dynamic input kernel size.
+
+    `pool_type` is "avg", "max", "avgmax" (mean of both) or "avgmaxc" (both
+    concatenated on dim 1); any other value prints a notice and uses "avg".
+    """
+
+    def __init__(self, output_size=1, pool_type="avg"):
+        super().__init__()
+        self.output_size = output_size
+        self.pool_type = pool_type
+        if pool_type in ("avgmaxc", "avgmax"):
+            self.pool = nn.ModuleList(
+                [nn.AdaptiveAvgPool2d(output_size), nn.AdaptiveMaxPool2d(output_size)]
+            )
+        elif pool_type == "max":
+            self.pool = nn.AdaptiveMaxPool2d(output_size)
+        else:
+            if pool_type != "avg":
+                print(
+                    "Invalid pool type %s specified. Defaulting to average pooling."
+                    % pool_type
+                )
+            self.pool = nn.AdaptiveAvgPool2d(output_size)
+
+    def forward(self, x):
+        """Pool `x`, keeping its batch dimension for every pool type."""
+        if self.pool_type == "avgmaxc":
+            return torch.cat([p(x) for p in self.pool], dim=1)
+        if self.pool_type == "avgmax":
+            # Average the two maps directly. Summing a stack and then squeezing
+            # dim 0 would drop the batch dimension of single-sample batches,
+            # unlike the functional `adaptive_avgmax_pool2d`.
+            x_avg, x_max = (p(x) for p in self.pool)
+            return 0.5 * (x_avg + x_max)
+        return self.pool(x)
+
+    def factor(self):
+        """Return `pooling_factor` for this layer's pool type."""
+        return pooling_factor(self.pool_type)
+
+    def __repr__(self):
+        return "{} (output_size={}, pool_type={})".format(
+            self.__class__.__name__, self.output_size, self.pool_type
+        )
diff --git a/segmentation_models_pytorch/encoders/_inceptionresnetv2.py b/segmentation_models_pytorch/encoders/_inceptionresnetv2.py
new file mode 100644
index 00000000..425d4261
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_inceptionresnetv2.py
@@ -0,0 +1,381 @@
+from __future__ import print_function, division, absolute_import
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+
+__all__ = ["InceptionResNetV2", "inceptionresnetv2"]
+
+pretrained_settings = {  # architecture name -> weights tag -> checkpoint metadata
+    "inceptionresnetv2": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1000,
+        },
+        "imagenet+background": {  # same url as "imagenet", declared with 1001 classes
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1001,
+        },
+    }
+}
+
+
+class BasicConv2d(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and a non-inplace ReLU.
+
+    Args:
+        in_planes: input channels of the convolution.
+        out_planes: output channels of the convolution; also the batch-norm size.
+        kernel_size, stride: forwarded to `nn.Conv2d`.
+        padding: forwarded to `nn.Conv2d`; defaults to 0.
+    """
+
+    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
+        super().__init__()
+        conv_args = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        self.conv = nn.Conv2d(
+            in_planes, out_planes, bias=False, **conv_args
+        )  # verify bias false
+        # eps is the value found in tensorflow; momentum is the default pytorch value
+        self.bn = nn.BatchNorm2d(out_planes, eps=0.001, momentum=0.1, affine=True)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        """Apply the convolution, batch norm and ReLU to `x`, in that order."""
+        normed = self.bn(self.conv(x))
+        return self.relu(normed)
+
+
+class Mixed_5b(nn.Module):
+    """Four parallel branches over a 192-channel input, concatenated on dim 1."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(192, 48, kernel_size=1, stride=1),
+            BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(192, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1),
+        )
+
+        # Pooling branch: a stride-1 average pool feeding a 1x1 conv.
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(192, 64, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        """Feed `x` to every branch and concatenate the results (96+64+96+64 ch)."""
+        paths = (self.branch0, self.branch1, self.branch2, self.branch3)
+        return torch.cat([path(x) for path in paths], 1)
+
+
+class Block35(nn.Module):
+    """Residual block on 320 channels: three branches, 1x1 conv, scaled add, ReLU."""
+
+    def __init__(self, scale=1.0):
+        super().__init__()
+
+        # Multiplier applied to the residual before it is added to the input.
+        self.scale = scale
+
+        self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(320, 32, kernel_size=1, stride=1),
+            BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(320, 32, kernel_size=1, stride=1),
+            BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1),
+        )
+
+        # Maps the 128 concatenated channels (32 + 32 + 64) back to 320.
+        self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        """Return `relu(conv2d(cat(branches)) * scale + x)`."""
+        mixed = torch.cat([self.branch0(x), self.branch1(x), self.branch2(x)], 1)
+        residual = self.conv2d(mixed) * self.scale
+        return self.relu(residual + x)
+
+
+class Mixed_6a(nn.Module):
+    """Stride-2 reduction over 320 channels: two conv branches plus a max pool."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(320, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(256, 384, kernel_size=3, stride=2),
+        )
+
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        """Concatenate the outputs of the three branches on dim 1."""
+        paths = (self.branch0, self.branch1, self.branch2)
+        return torch.cat([path(x) for path in paths], 1)
+
+
+class Block17(nn.Module):
+    """Residual block on 1088 channels: two branches, 1x1 conv, scaled add, ReLU."""
+
+    def __init__(self, scale=1.0):
+        super().__init__()
+
+        self.scale = scale
+
+        self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1)
+
+        # A 1x1 conv followed by a (1, 7) kernel and then a (7, 1) kernel.
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1088, 128, kernel_size=1, stride=1),
+            BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+        )
+
+        self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        """Return `relu(conv2d(cat(branches)) * scale + x)`."""
+        mixed = torch.cat([self.branch0(x), self.branch1(x)], 1)
+        residual = self.conv2d(mixed) * self.scale
+        return self.relu(residual + x)
+
+
+class Mixed_7a(nn.Module):
+    """Stride-2 reduction over 1088 channels: three conv branches plus a max pool."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.branch0 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 384, kernel_size=3, stride=2),
+        )
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 288, kernel_size=3, stride=2),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(288, 320, kernel_size=3, stride=2),
+        )
+
+        # Pooling branch: no convolution, so the channel count is left as-is.
+        self.branch3 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        """Concatenate the outputs of the four branches on dim 1."""
+        paths = (self.branch0, self.branch1, self.branch2, self.branch3)
+        return torch.cat([path(x) for path in paths], 1)
+
+
+class Block8(nn.Module):
+    """Residual block on 2080 channels; the final ReLU is skipped when `noReLU`."""
+
+    def __init__(self, scale=1.0, noReLU=False):
+        super().__init__()
+
+        self.scale = scale
+        self.noReLU = noReLU
+
+        self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(2080, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1)),
+            BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)),
+        )
+
+        # Maps the 448 concatenated channels (192 + 256) back to 2080.
+        self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1)
+        # The activation module only exists when it will actually be applied.
+        if not self.noReLU:
+            self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        """Return `conv2d(cat(branches)) * scale + x`, through ReLU unless `noReLU`."""
+        mixed = torch.cat([self.branch0(x), self.branch1(x)], 1)
+        out = self.conv2d(mixed) * self.scale + x
+        return out if self.noReLU else self.relu(out)
+
+
+class InceptionResNetV2(nn.Module):
+    def __init__(self, num_classes=1001):
+        super(InceptionResNetV2, self).__init__()
+        # Special attributes (overwritten by inceptionresnetv2() when pretrained weights are loaded)
+        self.input_space = None
+        self.input_size = (299, 299, 3)
+        self.mean = None
+        self.std = None
+        # Modules (attribute names are the keys used by load_state_dict)
+        self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2)
+        self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1)
+        self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1)
+        self.maxpool_3a = nn.MaxPool2d(3, stride=2)
+        self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
+        self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
+        self.maxpool_5a = nn.MaxPool2d(3, stride=2)
+        self.mixed_5b = Mixed_5b()
+        self.repeat = nn.Sequential(  # 10 x Block35
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+        )
+        self.mixed_6a = Mixed_6a()
+        self.repeat_1 = nn.Sequential(  # 20 x Block17
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+        )
+        self.mixed_7a = Mixed_7a()
+        self.repeat_2 = nn.Sequential(  # 9 x Block8
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+        )
+        self.block8 = Block8(noReLU=True)  # final block: default scale, no ReLU
+        self.conv2d_7b = BasicConv2d(2080, 1536, kernel_size=1, stride=1)
+        self.avgpool_1a = nn.AvgPool2d(8, count_include_pad=False)  # fixed 8x8 window
+        self.last_linear = nn.Linear(1536, num_classes)
+
+    def features(self, input):
+        x = self.conv2d_1a(input)
+        x = self.conv2d_2a(x)
+        x = self.conv2d_2b(x)
+        x = self.maxpool_3a(x)
+        x = self.conv2d_3b(x)
+        x = self.conv2d_4a(x)
+        x = self.maxpool_5a(x)
+        x = self.mixed_5b(x)
+        x = self.repeat(x)
+        x = self.mixed_6a(x)
+        x = self.repeat_1(x)
+        x = self.mixed_7a(x)
+        x = self.repeat_2(x)
+        x = self.block8(x)
+        x = self.conv2d_7b(x)
+        return x
+
+    def logits(self, features):
+        x = self.avgpool_1a(features)
+        x = x.view(x.size(0), -1)  # flatten everything after the batch dim
+        x = self.last_linear(x)
+        return x
+
+    def forward(self, input):
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+
+def inceptionresnetv2(num_classes=1000, pretrained="imagenet"):
+    r"""Build an InceptionResNetV2, optionally loading pretrained weights.
+
+    Architecture from the `"InceptionV4, Inception-ResNet..."
+    <https://arxiv.org/abs/1602.07261>`_ paper. A falsy ``pretrained``
+    returns a model with ``num_classes`` outputs and no weights loaded.
+    Raises ``AssertionError`` when ``num_classes`` differs from the value
+    recorded for the selected pretrained setting.
+    """
+    if not pretrained:
+        return InceptionResNetV2(num_classes=num_classes)
+
+    cfg = pretrained_settings["inceptionresnetv2"][pretrained]
+    expected = cfg["num_classes"]
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+
+    # both 'imagenet'&'imagenet+background' are loaded from same parameters
+    model = InceptionResNetV2(num_classes=1001)
+    model.load_state_dict(model_zoo.load_url(cfg["url"]))
+    if pretrained == "imagenet":
+        # Drop row 0 of the 1001-way head to get the 1000-way classifier.
+        head = nn.Linear(1536, 1000)
+        head.weight.data = model.last_linear.weight.data[1:]
+        head.bias.data = model.last_linear.bias.data[1:]
+        model.last_linear = head
+
+    for key in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(model, key, cfg[key])
+    return model
+
+
+"""
+TEST
+Run this code with:
+```
+cd <root of the segmentation_models.pytorch checkout>
+python -m segmentation_models_pytorch.encoders._inceptionresnetv2
+```
+"""
+if __name__ == "__main__":
+    assert inceptionresnetv2(num_classes=10, pretrained=None)
+    print("success")
+    assert inceptionresnetv2(num_classes=1000, pretrained="imagenet")
+    print("success")
+    assert inceptionresnetv2(num_classes=1001, pretrained="imagenet+background")
+    print("success")
+
+    # expected to fail with AssertionError: num_classes must match the pretrained setting
+    assert inceptionresnetv2(num_classes=1001, pretrained="imagenet")
diff --git a/segmentation_models_pytorch/encoders/_inceptionv4.py b/segmentation_models_pytorch/encoders/_inceptionv4.py
new file mode 100644
index 00000000..14b8eadb
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_inceptionv4.py
@@ -0,0 +1,367 @@
+from __future__ import print_function, division, absolute_import
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+
+__all__ = ["InceptionV4", "inceptionv4"]
+
+pretrained_settings = {
+    "inceptionv4": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionv4-8e4777a0.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1000,
+        },
+        "imagenet+background": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionv4-8e4777a0.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1001,
+        },
+    }
+}
+
+
+class BasicConv2d(nn.Module):
+    """Conv2d -> BatchNorm2d -> ReLU building block.
+
+    The convolution has no bias term, batch norm uses ``eps=0.001`` and the
+    ReLU is applied in place.
+    """
+
+    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
+        super(BasicConv2d, self).__init__()
+        conv_kwargs = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        # NOTE: the original port flagged "verify bias false" on this layer.
+        self.conv = nn.Conv2d(in_planes, out_planes, bias=False, **conv_kwargs)
+        self.bn = nn.BatchNorm2d(
+            out_planes,
+            eps=0.001,  # value found in tensorflow
+            momentum=0.1,  # default pytorch value
+            affine=True,
+        )
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        convolved = self.conv(x)
+        normalized = self.bn(convolved)
+        return self.relu(normalized)
+
+
+class Mixed_3a(nn.Module):
+    """Concatenate a stride-2 max-pool and a stride-2 3x3 conv of the input."""
+
+    def __init__(self):
+        super(Mixed_3a, self).__init__()
+        self.maxpool = nn.MaxPool2d(3, stride=2)
+        self.conv = BasicConv2d(64, 96, kernel_size=3, stride=2)
+
+    def forward(self, x):
+        pooled, convolved = self.maxpool(x), self.conv(x)
+        return torch.cat((pooled, convolved), 1)
+
+
+class Mixed_4a(nn.Module):
+    """Two conv stacks over a 160-channel input, joined on the channel axis."""
+
+    def __init__(self):
+        super(Mixed_4a, self).__init__()
+        self.branch0 = nn.Sequential(
+            BasicConv2d(160, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1),
+        )
+        self.branch1 = nn.Sequential(
+            BasicConv2d(160, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(64, 96, kernel_size=(3, 3), stride=1),
+        )
+
+    def forward(self, x):
+        # branch0 first, then branch1: the concat order fixes the channel layout
+        short_path = self.branch0(x)
+        long_path = self.branch1(x)
+        return torch.cat((short_path, long_path), 1)
+
+
+class Mixed_5a(nn.Module):
+    """Concatenate a stride-2 3x3 conv and a stride-2 max-pool of the input."""
+
+    def __init__(self):
+        super(Mixed_5a, self).__init__()
+        self.conv = BasicConv2d(192, 192, kernel_size=3, stride=2)
+        self.maxpool = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        convolved, pooled = self.conv(x), self.maxpool(x)
+        return torch.cat((convolved, pooled), 1)
+
+
+class Inception_A(nn.Module):
+    """Four parallel branches on a 384-channel input, concatenated on dim 1."""
+
+    def __init__(self):
+        super(Inception_A, self).__init__()
+        self.branch0 = BasicConv2d(384, 96, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(384, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(384, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1),
+        )
+
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(384, 96, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        # Apply the branches in declaration order; that order also fixes
+        # the channel layout of the concatenated output.
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        return torch.cat([branch(x) for branch in branches], 1)
+
+
+class Reduction_A(nn.Module):
+    """Stride-2 block: two conv branches and a max-pool, concatenated on dim 1."""
+
+    def __init__(self):
+        super(Reduction_A, self).__init__()
+        self.branch0 = BasicConv2d(384, 384, kernel_size=3, stride=2)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(384, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(224, 256, kernel_size=3, stride=2),
+        )
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        direct = self.branch0(x)
+        stacked = self.branch1(x)
+        pooled = self.branch2(x)
+        return torch.cat((direct, stacked, pooled), 1)
+
+
+class Inception_B(nn.Module):
+    """Four parallel branches on a 1024-channel input, concatenated on dim 1."""
+
+    def __init__(self):
+        super(Inception_B, self).__init__()
+        self.branch0 = BasicConv2d(1024, 384, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(224, 256, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(224, 224, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(224, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+        )
+
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(1024, 128, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        # Apply the branches in declaration order; that order also fixes
+        # the channel layout of the concatenated output.
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        return torch.cat([branch(x) for branch in branches], 1)
+
+
+class Reduction_B(nn.Module):
+    """Stride-2 block on a 1024-channel input: two conv stacks plus a max-pool."""
+
+    def __init__(self):
+        super(Reduction_B, self).__init__()
+        self.branch0 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 192, kernel_size=3, stride=2),
+        )
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1024, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(256, 320, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(320, 320, kernel_size=3, stride=2),
+        )
+
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        short_stack = self.branch0(x)
+        long_stack = self.branch1(x)
+        pooled = self.branch2(x)
+        return torch.cat((short_stack, long_stack, pooled), 1)
+
+
+class Inception_C(nn.Module):
+    """Inception block on 1536-channel maps with two forked paths.
+
+    The ``branch1_*`` and ``branch2_*`` paths each end in a (1, 3) / (3, 1) conv
+    pair applied to the same tensor; all results are concatenated along dim 1.
+    """
+
+    def __init__(self):
+        super(Inception_C, self).__init__()
+        self.branch0 = BasicConv2d(1536, 256, kernel_size=1, stride=1)
+
+        self.branch1_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
+        self.branch1_1a = BasicConv2d(
+            384, 256, kernel_size=(1, 3), stride=1, padding=(0, 1)
+        )
+        self.branch1_1b = BasicConv2d(
+            384, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)
+        )
+
+        self.branch2_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
+        self.branch2_1 = BasicConv2d(
+            384, 448, kernel_size=(3, 1), stride=1, padding=(1, 0)
+        )
+        self.branch2_2 = BasicConv2d(
+            448, 512, kernel_size=(1, 3), stride=1, padding=(0, 1)
+        )
+        self.branch2_3a = BasicConv2d(
+            512, 256, kernel_size=(1, 3), stride=1, padding=(0, 1)
+        )
+        self.branch2_3b = BasicConv2d(
+            512, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)
+        )
+
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(1536, 256, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        x0 = self.branch0(x)
+
+        # branch1: shared 1x1 stem feeding the (1, 3) and (3, 1) convs
+        stem1 = self.branch1_0(x)
+        x1 = torch.cat((self.branch1_1a(stem1), self.branch1_1b(stem1)), 1)
+
+        # branch2: three stacked convs, then the same kind of fork as branch1
+        stem2 = self.branch2_2(self.branch2_1(self.branch2_0(x)))
+        x2 = torch.cat((self.branch2_3a(stem2), self.branch2_3b(stem2)), 1)
+
+        x3 = self.branch3(x)
+
+        return torch.cat((x0, x1, x2, x3), 1)
+
+
+class InceptionV4(nn.Module):
+    """Inception-v4 classifier: ``features`` conv stack plus a linear head."""
+
+    def __init__(self, num_classes=1001):
+        super(InceptionV4, self).__init__()
+        # Special attributes
+        self.input_space = None
+        self.input_size = (299, 299, 3)
+        self.mean = None
+        self.std = None
+        # Modules
+        self.features = nn.Sequential(
+            BasicConv2d(3, 32, kernel_size=3, stride=2),
+            BasicConv2d(32, 32, kernel_size=3, stride=1),
+            BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            Mixed_3a(),
+            Mixed_4a(),
+            Mixed_5a(),
+            Inception_A(),
+            Inception_A(),
+            Inception_A(),
+            Inception_A(),
+            Reduction_A(),  # Mixed_6a
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Reduction_B(),  # Mixed_7a
+            Inception_C(),
+            Inception_C(),
+            Inception_C(),
+        )
+        self.last_linear = nn.Linear(1536, num_classes)
+
+    def logits(self, features):
+        # Average over the whole (H, W) extent, not an H x H window, so that
+        # non-square inputs also reduce to one 1536-vector per sample.
+        x = F.avg_pool2d(features, kernel_size=tuple(features.shape[2:]))
+        x = x.view(x.size(0), -1)
+        x = self.last_linear(x)
+        return x
+
+    def forward(self, input):
+        return self.logits(self.features(input))
+
+
+def inceptionv4(num_classes=1000, pretrained="imagenet"):
+    """Build an InceptionV4, optionally loading pretrained weights.
+
+    A falsy ``pretrained`` returns a model with ``num_classes`` outputs and
+    no weights loaded; otherwise ``num_classes`` must match the setting.
+    """
+    if not pretrained:
+        return InceptionV4(num_classes=num_classes)
+
+    cfg = pretrained_settings["inceptionv4"][pretrained]
+    expected = cfg["num_classes"]
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+
+    # both 'imagenet'&'imagenet+background' are loaded from same parameters
+    model = InceptionV4(num_classes=1001)
+    model.load_state_dict(model_zoo.load_url(cfg["url"]))
+    if pretrained == "imagenet":
+        # Drop row 0 of the 1001-way head to get the 1000-way classifier.
+        head = nn.Linear(1536, 1000)
+        head.weight.data = model.last_linear.weight.data[1:]
+        head.bias.data = model.last_linear.bias.data[1:]
+        model.last_linear = head
+    for key in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(model, key, cfg[key])
+    return model
+
+
+"""
+TEST
+Run this code with:
+```
+cd <root of the segmentation_models.pytorch checkout>
+python -m segmentation_models_pytorch.encoders._inceptionv4
+```
+"""
+if __name__ == "__main__":
+    assert inceptionv4(num_classes=10, pretrained=None)
+    print("success")
+    assert inceptionv4(num_classes=1000, pretrained="imagenet")
+    print("success")
+    assert inceptionv4(num_classes=1001, pretrained="imagenet+background")
+    print("success")
+
+    # expected to fail with AssertionError: the "imagenet" setting declares 1000 classes
+    assert inceptionv4(num_classes=1001, pretrained="imagenet")
diff --git a/segmentation_models_pytorch/encoders/_senet.py b/segmentation_models_pytorch/encoders/_senet.py
new file mode 100644
index 00000000..1e555ca1
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_senet.py
@@ -0,0 +1,542 @@
+"""
+ResNet code gently borrowed from
+https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+"""
+
+from __future__ import print_function, division, absolute_import
+from collections import OrderedDict
+import math
+
+import torch.nn as nn
+from torch.utils import model_zoo
+
+__all__ = [
+    "SENet",
+    "senet154",
+    "se_resnet50",
+    "se_resnet101",
+    "se_resnet152",
+    "se_resnext50_32x4d",
+    "se_resnext101_32x4d",
+]
+
+pretrained_settings = {
+    "senet154": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+    "se_resnet50": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet50-ce0d4300.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+    "se_resnet101": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet101-7e38fcc6.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+    "se_resnet152": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet152-d17c99b7.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+    "se_resnext50_32x4d": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+    "se_resnext101_32x4d": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext101_32x4d-3b2fe3d8.pth",
+            "input_space": "RGB",
+            "input_size": [3, 224, 224],
+            "input_range": [0, 1],
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225],
+            "num_classes": 1000,
+        }
+    },
+}
+
+
+class SEModule(nn.Module):
+    """Squeeze-and-Excitation gate: rescales each channel by a learned factor."""
+
+    def __init__(self, channels, reduction):
+        super(SEModule, self).__init__()
+        squeezed = channels // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc1 = nn.Conv2d(channels, squeezed, kernel_size=1, padding=0)
+        self.relu = nn.ReLU(inplace=True)
+        self.fc2 = nn.Conv2d(squeezed, channels, kernel_size=1, padding=0)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        # Global pool -> bottleneck 1x1 convs -> sigmoid gives per-channel gates.
+        hidden = self.relu(self.fc1(self.avg_pool(x)))
+        gates = self.sigmoid(self.fc2(hidden))
+        return x * gates
+
+
+class Bottleneck(nn.Module):
+    """
+    Base class for bottlenecks that implements `forward()` method.
+
+    `forward()` reads `conv1`..`conv3`, `bn1`..`bn3`, `relu`, `se_module`
+    and `downsample` from the instance, so subclasses have to set them.
+    """
+
+    def forward(self, x):
+        # Stage 1 and 2: conv -> batch norm -> ReLU.
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+
+        # Stage 3: conv -> batch norm; the activation comes after the sum.
+        out = self.bn3(self.conv3(out))
+
+        # Identity shortcut unless a projection module was supplied.
+        if self.downsample is None:
+            shortcut = x
+        else:
+            shortcut = self.downsample(x)
+
+        # Gate the residual branch, add the shortcut, then apply the ReLU.
+        gated = self.se_module(out)
+
+        return self.relu(gated + shortcut)
+
+
+class SEBottleneck(Bottleneck):
+    """
+    Bottleneck for SENet154.
+
+    conv1 widens to ``2 * planes`` channels, the grouped 3x3 conv2 (the only
+    strided layer) to ``4 * planes``; ``downsample``, if given, is stored for
+    the shortcut path used by ``Bottleneck.forward``.
+    """
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None):
+        super(SEBottleneck, self).__init__()
+        mid = planes * 2
+        wide = planes * 4
+        self.conv1 = nn.Conv2d(inplanes, mid, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid)
+        grouped_3x3 = dict(kernel_size=3, stride=stride, padding=1, groups=groups)
+        self.conv2 = nn.Conv2d(mid, wide, bias=False, **grouped_3x3)
+        self.bn2 = nn.BatchNorm2d(wide)
+        self.conv3 = nn.Conv2d(wide, wide, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(wide)
+        self.relu = nn.ReLU(inplace=True)
+        # The SE gate operates on the widened 4 * planes feature maps.
+        self.se_module = SEModule(wide, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNetBottleneck(Bottleneck):
+    """
+    ResNet bottleneck with a Squeeze-and-Excitation module. Following the
+    Caffe implementation, `stride=stride` is applied in `conv1` rather than
+    in `conv2` (the torchvision ResNet implementation strides `conv2`).
+    """
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None):
+        super(SEResNetBottleneck, self).__init__()
+        out_planes = planes * 4
+        # conv1 is the strided layer here (see the class docstring).
+        strided_1x1 = dict(kernel_size=1, bias=False, stride=stride)
+        self.conv1 = nn.Conv2d(inplanes, planes, **strided_1x1)
+        self.bn1 = nn.BatchNorm2d(planes)
+        grouped_3x3 = dict(kernel_size=3, padding=1, groups=groups, bias=False)
+        self.conv2 = nn.Conv2d(planes, planes, **grouped_3x3)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(out_planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNeXtBottleneck(Bottleneck):
+    """
+    ResNeXt bottleneck type C with a Squeeze-and-Excitation module.
+
+    The inner width is ``floor(planes * base_width / 64) * groups``; the
+    grouped 3x3 ``conv2`` carries the stride and ``conv3`` expands to
+    ``4 * planes`` channels.
+    """
+
+    expansion = 4
+
+    def __init__(
+        self,
+        inplanes,
+        planes,
+        groups,
+        reduction,
+        stride=1,
+        downsample=None,
+        base_width=4,
+    ):
+        super(SEResNeXtBottleneck, self).__init__()
+        # e.g. planes=64, base_width=4, groups=32 -> width 128
+        width = math.floor(planes * (base_width / 64)) * groups
+        out_planes = planes * 4
+        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False, stride=1)
+        self.bn1 = nn.BatchNorm2d(width)
+        # Grouped 3x3 convolution; the only strided layer of the block.
+        grouped_3x3 = dict(kernel_size=3, stride=stride, padding=1, groups=groups)
+        self.conv2 = nn.Conv2d(width, width, bias=False, **grouped_3x3)
+        self.bn2 = nn.BatchNorm2d(width)
+        self.conv3 = nn.Conv2d(width, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(out_planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SENet(nn.Module):
+    def __init__(
+        self,
+        block,
+        layers,
+        groups,
+        reduction,
+        dropout_p=0.2,
+        inplanes=128,
+        input_3x3=True,
+        downsample_kernel_size=3,
+        downsample_padding=1,
+        num_classes=1000,
+    ):
+        """
+        Parameters
+        ----------
+        block (nn.Module): Bottleneck class.
+            - For SENet154: SEBottleneck
+            - For SE-ResNet models: SEResNetBottleneck
+            - For SE-ResNeXt models:  SEResNeXtBottleneck
+        layers (list of ints): Number of residual blocks for 4 layers of the
+            network (layer1...layer4).
+        groups (int): Number of groups for the 3x3 convolution in each
+            bottleneck block.
+            - For SENet154: 64
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models:  32
+        reduction (int): Reduction ratio for Squeeze-and-Excitation modules.
+            - For all models: 16
+        dropout_p (float or None): Drop probability for the Dropout layer.
+            If `None` the Dropout layer is not used.
+            - For SENet154: 0.2
+            - For SE-ResNet models: None
+            - For SE-ResNeXt models: None
+        inplanes (int):  Number of input channels for layer1.
+            - For SENet154: 128
+            - For SE-ResNet models: 64
+            - For SE-ResNeXt models: 64
+        input_3x3 (bool): If `True`, use three 3x3 convolutions instead of
+            a single 7x7 convolution in layer0.
+            - For SENet154: True
+            - For SE-ResNet models: False
+            - For SE-ResNeXt models: False
+        downsample_kernel_size (int): Kernel size for downsampling convolutions
+            in layer2, layer3 and layer4.
+            - For SENet154: 3
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models: 1
+        downsample_padding (int): Padding for downsampling convolutions in
+            layer2, layer3 and layer4.
+            - For SENet154: 1
+            - For SE-ResNet models: 0
+            - For SE-ResNeXt models: 0
+        num_classes (int): Number of outputs in `last_linear` layer.
+            - For all models: 1000
+        """
+        super(SENet, self).__init__()
+        self.inplanes = inplanes  # running channel count, updated by _make_layer
+        if input_3x3:
+            layer0_modules = [
+                ("conv1", nn.Conv2d(3, 64, 3, stride=2, padding=1, bias=False)),
+                ("bn1", nn.BatchNorm2d(64)),
+                ("relu1", nn.ReLU(inplace=True)),
+                ("conv2", nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)),
+                ("bn2", nn.BatchNorm2d(64)),
+                ("relu2", nn.ReLU(inplace=True)),
+                ("conv3", nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)),
+                ("bn3", nn.BatchNorm2d(inplanes)),
+                ("relu3", nn.ReLU(inplace=True)),
+            ]
+        else:
+            layer0_modules = [
+                (
+                    "conv1",
+                    nn.Conv2d(
+                        3, inplanes, kernel_size=7, stride=2, padding=3, bias=False
+                    ),
+                ),
+                ("bn1", nn.BatchNorm2d(inplanes)),
+                ("relu1", nn.ReLU(inplace=True)),
+            ]
+        # To preserve compatibility with Caffe weights `ceil_mode=True`
+        # is used instead of `padding=1`.
+        layer0_modules.append(("pool", nn.MaxPool2d(3, stride=2, ceil_mode=True)))
+        self.layer0 = nn.Sequential(OrderedDict(layer0_modules))
+        self.layer1 = self._make_layer(
+            block,
+            planes=64,
+            blocks=layers[0],
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=1,
+            downsample_padding=0,
+        )
+        self.layer2 = self._make_layer(
+            block,
+            planes=128,
+            blocks=layers[1],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding,
+        )
+        self.layer3 = self._make_layer(
+            block,
+            planes=256,
+            blocks=layers[2],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding,
+        )
+        self.layer4 = self._make_layer(
+            block,
+            planes=512,
+            blocks=layers[3],
+            stride=2,
+            groups=groups,
+            reduction=reduction,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding,
+        )
+        self.avg_pool = nn.AvgPool2d(7, stride=1)  # fixed 7x7 pooling window
+        self.dropout = nn.Dropout(dropout_p) if dropout_p is not None else None
+        self.last_linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(
+        self,
+        block,
+        planes,
+        blocks,
+        groups,
+        reduction,
+        stride=1,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    ):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:  # shortcut needs a projection
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    kernel_size=downsample_kernel_size,
+                    stride=stride,
+                    padding=downsample_padding,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, groups, reduction, stride, downsample)
+        )
+        self.inplanes = planes * block.expansion  # input width of the remaining blocks
+        for i in range(1, blocks):  # the first block was appended above
+            layers.append(block(self.inplanes, planes, groups, reduction))
+
+        return nn.Sequential(*layers)
+
+    def features(self, x):
+        x = self.layer0(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        return x
+
+    def logits(self, x):
+        x = self.avg_pool(x)
+        if self.dropout is not None:
+            x = self.dropout(x)
+        x = x.view(x.size(0), -1)  # flatten everything after the batch dim
+        x = self.last_linear(x)
+        return x
+
+    def forward(self, x):
+        x = self.features(x)
+        x = self.logits(x)
+        return x
+
+
+def initialize_pretrained_model(model, num_classes, settings):
+    """Load the weights at ``settings["url"]`` into ``model`` and copy the
+    input metadata from ``settings`` onto it; ``num_classes`` must match.
+    """
+    expected = settings["num_classes"]
+    assert num_classes == expected, (
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    )
+    model.load_state_dict(model_zoo.load_url(settings["url"]))
+    # The preprocessing metadata is attached to the model as plain attributes.
+    for key in ("input_space", "input_size", "input_range", "mean", "std"):
+        setattr(model, key, settings[key])
+
+
+def senet154(num_classes=1000, pretrained="imagenet"):
+    """Build the ``senet154`` model; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=64,
+        reduction=16,
+        dropout_p=0.2,
+    )
+    model = SENet(SEBottleneck, [3, 8, 36, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["senet154"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
+
+
+def se_resnet50(num_classes=1000, pretrained="imagenet"):
+    """Build the ``se_resnet50`` model; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=1,
+        reduction=16,
+        dropout_p=None,
+        inplanes=64,
+        input_3x3=False,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    )
+    model = SENet(SEResNetBottleneck, [3, 4, 6, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["se_resnet50"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
+
+
+def se_resnet101(num_classes=1000, pretrained="imagenet"):
+    """Build the ``se_resnet101`` model; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=1,
+        reduction=16,
+        dropout_p=None,
+        inplanes=64,
+        input_3x3=False,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    )
+    model = SENet(SEResNetBottleneck, [3, 4, 23, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["se_resnet101"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
+
+
+def se_resnet152(num_classes=1000, pretrained="imagenet"):
+    """Build the ``se_resnet152`` model; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=1,
+        reduction=16,
+        dropout_p=None,
+        inplanes=64,
+        input_3x3=False,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    )
+    model = SENet(SEResNetBottleneck, [3, 8, 36, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["se_resnet152"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
+
+
+def se_resnext50_32x4d(num_classes=1000, pretrained="imagenet"):
+    """Build ``se_resnext50_32x4d``; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=32,
+        reduction=16,
+        dropout_p=None,
+        inplanes=64,
+        input_3x3=False,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    )
+    model = SENet(SEResNeXtBottleneck, [3, 4, 6, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["se_resnext50_32x4d"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
+
+
+def se_resnext101_32x4d(num_classes=1000, pretrained="imagenet"):
+    """Build ``se_resnext101_32x4d``; load weights unless ``pretrained`` is None."""
+    arch = dict(
+        groups=32,
+        reduction=16,
+        dropout_p=None,
+        inplanes=64,
+        input_3x3=False,
+        downsample_kernel_size=1,
+        downsample_padding=0,
+    )
+    model = SENet(SEResNeXtBottleneck, [3, 4, 23, 3], num_classes=num_classes, **arch)
+    if pretrained is None:
+        return model
+    settings = pretrained_settings["se_resnext101_32x4d"][pretrained]
+    initialize_pretrained_model(model, num_classes, settings)
+    return model
diff --git a/segmentation_models_pytorch/encoders/_xception.py b/segmentation_models_pytorch/encoders/_xception.py
new file mode 100644
index 00000000..5c4420f6
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_xception.py
@@ -0,0 +1,275 @@
+"""
+Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
+
+@author: tstandley
+Adapted by cadene
+
+Creates an Xception Model as defined in:
+
+Francois Chollet
+Xception: Deep Learning with Depthwise Separable Convolutions
+https://arxiv.org/pdf/1610.02357.pdf
+
+These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
+
+Loss:0.9173 Prec@1:78.892 Prec@5:94.292
+
+REMEMBER to set your image size to 3x299x299 for both test and validation
+
+normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                  std=[0.5, 0.5, 0.5])
+
+The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+"""
+
+from __future__ import print_function, division, absolute_import
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+
+__all__ = ["xception"]
+
+pretrained_settings = {  # model name -> weight-set name -> checkpoint URL plus input description
+    "xception": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1000,
+            "scale": 0.8975,  # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+        }
+    }
+}
+
+
+class SeparableConv2d(nn.Module):
+    """Depthwise convolution (``conv1``) followed by a 1x1 convolution (``pointwise``)."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=1,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=False,
+    ):
+        super().__init__()
+        # groups == in_channels: every input channel is filtered on its own.
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
+
+    def forward(self, x):
+        return self.pointwise(self.conv1(x))
+
+
+class Block(nn.Module):
+    """Residual unit built from ReLU -> SeparableConv2d -> BatchNorm2d triples.
+
+    Args:
+        in_filters: channels of the input.
+        out_filters: channels of the output.
+        reps: number of separable-convolution triples in the main path.
+        strides: stride of the skip convolution and of the trailing max-pool;
+            ``1`` adds no pooling.
+        start_with_relu: when False the leading ReLU is dropped.
+        grow_first: widen to ``out_filters`` in the first triple (True) or in
+            the last one (False).
+    """
+
+    def __init__(
+        self,
+        in_filters,
+        out_filters,
+        reps,
+        strides=1,
+        start_with_relu=True,
+        grow_first=True,
+    ):
+        super().__init__()
+
+        # A 1x1 conv + batch norm adapts the shortcut whenever the main path
+        # changes the channel count or the spatial size.
+        if in_filters == out_filters and strides == 1:
+            self.skip = None
+        else:
+            self.skip = nn.Conv2d(
+                in_filters, out_filters, 1, stride=strides, bias=False
+            )
+            self.skipbn = nn.BatchNorm2d(out_filters)
+
+        # (input, output) channel pair of every triple, in execution order.
+        widen = (in_filters, out_filters)
+        inner = out_filters if grow_first else in_filters
+        plan = [(inner, inner)] * (reps - 1)
+        plan = [widen] + plan if grow_first else plan + [widen]
+
+        layers = []
+        for c_in, c_out in plan:
+            layers.append(nn.ReLU(inplace=True))
+            layers.append(
+                SeparableConv2d(c_in, c_out, 3, stride=1, padding=1, bias=False)
+            )
+            layers.append(nn.BatchNorm2d(c_out))
+
+        # An in-place ReLU at the front would overwrite ``inp`` before the skip
+        # path reads it, so the leading activation is made non-inplace or dropped.
+        if start_with_relu:
+            layers[0] = nn.ReLU(inplace=False)
+        else:
+            del layers[0]
+
+        if strides != 1:
+            layers.append(nn.MaxPool2d(3, strides, 1))
+        self.rep = nn.Sequential(*layers)
+
+    def forward(self, inp):
+        out = self.rep(inp)
+        if self.skip is None:
+            shortcut = inp
+        else:
+            shortcut = self.skipbn(self.skip(inp))
+
+        out += shortcut
+        return out
+
+
+class Xception(nn.Module):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    The classifier is registered as ``fc``. The ``xception`` factory renames it
+    to ``last_linear`` after construction; ``logits`` works with either name.
+    """
+
+    def __init__(self, num_classes=1000):
+        """Constructor
+        Args:
+            num_classes: number of classes
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+
+        # Stem: two unpadded 3x3 convolutions, the first with stride 2.
+        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        # block1 runs right after relu2, so it is built without a leading ReLU.
+        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
+        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
+        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)
+
+        self.block4 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block5 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block6 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block7 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+
+        self.block8 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block9 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block10 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block11 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+
+        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)
+
+        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = nn.BatchNorm2d(1536)
+        self.relu3 = nn.ReLU(inplace=True)
+
+        # No ReLU module after bn4: ``logits`` applies that activation.
+        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(2048)
+
+        self.fc = nn.Linear(2048, num_classes)
+
+    def features(self, input):
+        """Run all convolutional stages and return the ``bn4`` output."""
+        x = self.conv1(input)
+        x = self.bn1(x)
+        x = self.relu1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+
+        x = self.conv4(x)
+        x = self.bn4(x)
+        return x
+
+    def logits(self, features):
+        """Apply ReLU (in place), global average pooling and the classifier."""
+        x = F.relu(features, inplace=True)
+
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        # ``fc`` may have been renamed to ``last_linear`` (see ``xception``); use
+        # whichever exists so a directly constructed model can run ``forward`` too.
+        classifier = self.last_linear if hasattr(self, "last_linear") else self.fc
+        x = classifier(x)
+        return x
+
+    def forward(self, input):
+        """Return the classifier output computed from ``features(input)``."""
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+
+def xception(num_classes=1000, pretrained="imagenet"):
+    """Build an Xception classifier, optionally loading pretrained weights.
+
+    Args:
+        num_classes: width of the final linear layer; asserted to equal the
+            checkpoint's ``num_classes`` when ``pretrained`` is truthy.
+        pretrained: key into ``pretrained_settings["xception"]``; falsy skips loading.
+    """
+    model = Xception(num_classes=num_classes)
+    if pretrained:
+        settings = pretrained_settings["xception"][pretrained]
+        assert num_classes == settings["num_classes"], (
+            "num_classes should be {}, but is {}".format(
+                settings["num_classes"], num_classes
+            )
+        )
+        model.load_state_dict(model_zoo.load_url(settings["url"]))
+        for name in ("input_space", "input_size", "input_range", "mean", "std"):
+            setattr(model, name, settings[name])
+    # Expose the classifier as ``last_linear``, the name ``Xception.logits`` looks up.
+    model.last_linear = model.fc
+    del model.fc
+    return model
diff --git a/segmentation_models_pytorch/encoders/dpn.py b/segmentation_models_pytorch/encoders/dpn.py
index 4fe84328..b5226d4d 100644
--- a/segmentation_models_pytorch/encoders/dpn.py
+++ b/segmentation_models_pytorch/encoders/dpn.py
@@ -27,9 +27,8 @@
 import torch.nn.functional as F
 from typing import List, Dict, Sequence
 
-from pretrainedmodels.models.dpn import DPN
-
 from ._base import EncoderMixin
+from ._dpn import DPN
 
 
 class DPNEncoder(DPN, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/inceptionresnetv2.py b/segmentation_models_pytorch/encoders/inceptionresnetv2.py
index 15bf6502..d7f83f9d 100644
--- a/segmentation_models_pytorch/encoders/inceptionresnetv2.py
+++ b/segmentation_models_pytorch/encoders/inceptionresnetv2.py
@@ -26,9 +26,9 @@
 import torch
 import torch.nn as nn
 from typing import List
-from pretrainedmodels.models.inceptionresnetv2 import InceptionResNetV2
 
 from ._base import EncoderMixin
+from ._inceptionresnetv2 import InceptionResNetV2
 
 
 class InceptionResNetV2Encoder(InceptionResNetV2, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/inceptionv4.py b/segmentation_models_pytorch/encoders/inceptionv4.py
index 12a7cc1b..3c335042 100644
--- a/segmentation_models_pytorch/encoders/inceptionv4.py
+++ b/segmentation_models_pytorch/encoders/inceptionv4.py
@@ -27,9 +27,9 @@
 import torch.nn as nn
 
 from typing import List
-from pretrainedmodels.models.inceptionv4 import InceptionV4
 
 from ._base import EncoderMixin
+from ._inceptionv4 import InceptionV4
 
 
 class InceptionV4Encoder(InceptionV4, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/senet.py b/segmentation_models_pytorch/encoders/senet.py
index 18dbfd91..da509f5a 100644
--- a/segmentation_models_pytorch/encoders/senet.py
+++ b/segmentation_models_pytorch/encoders/senet.py
@@ -26,13 +26,13 @@
 import torch
 from typing import List, Dict, Sequence
 
-from pretrainedmodels.models.senet import (
+from ._base import EncoderMixin
+from ._senet import (
     SENet,
     SEBottleneck,
     SEResNetBottleneck,
     SEResNeXtBottleneck,
 )
-from ._base import EncoderMixin
 
 
 class SENetEncoder(SENet, EncoderMixin):
@@ -99,76 +99,6 @@ def load_state_dict(self, state_dict, **kwargs):
         super().load_state_dict(state_dict, **kwargs)
 
 
-pretrained_settings = {
-    "senet154": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet50": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet50-ce0d4300.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet101": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet101-7e38fcc6.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet152": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet152-d17c99b7.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnext50_32x4d": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnext101_32x4d": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext101_32x4d-3b2fe3d8.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-}
-
-
 senet_encoders = {
     "senet154": {
         "encoder": SENetEncoder,
diff --git a/segmentation_models_pytorch/encoders/xception.py b/segmentation_models_pytorch/encoders/xception.py
index 594636a4..af3a26d4 100644
--- a/segmentation_models_pytorch/encoders/xception.py
+++ b/segmentation_models_pytorch/encoders/xception.py
@@ -1,7 +1,7 @@
 from typing import List
-from pretrainedmodels.models.xception import Xception
 
 from ._base import EncoderMixin
+from ._xception import Xception
 
 
 class XceptionEncoder(Xception, EncoderMixin):
@@ -42,10 +42,10 @@ def forward(self, x):
         if self._depth >= 1:
             x = self.conv1(x)
             x = self.bn1(x)
-            x = self.relu(x)
+            x = self.relu1(x)
             x = self.conv2(x)
             x = self.bn2(x)
-            x = self.relu(x)
+            x = self.relu2(x)
             features.append(x)
 
         if self._depth >= 2:
@@ -72,7 +72,7 @@ def forward(self, x):
             x = self.block12(x)
             x = self.conv3(x)
             x = self.bn3(x)
-            x = self.relu(x)
+            x = self.relu3(x)
             x = self.conv4(x)
             x = self.bn4(x)
             features.append(x)