diff --git a/docs/conf.py b/docs/conf.py
index 82583c6b..4cc70a6b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -100,7 +100,6 @@ def get_version():
     "timm",
     "cv2",
     "PIL",
-    "pretrainedmodels",
     "torchvision",
     "segmentation_models_pytorch.encoders",
     "segmentation_models_pytorch.utils",
diff --git a/licenses/LICENSES.md b/licenses/LICENSES.md
index 06f36241..e51ad8d0 100644
--- a/licenses/LICENSES.md
+++ b/licenses/LICENSES.md
@@ -13,14 +13,18 @@ The majority of the code is licensed under the [MIT License](LICENSE). However,
   * [segmentation_models_pytorch/encoders/mix_transformer.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/mix_transformer.py)
   * [LICENSE_nvidia](LICENSE_nvidia.md)
 
-
 - Apple License
   * Applies to the MobileOne encoder
   * [segmentation_models_pytorch/encoders/mobileone.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/mobileone.py)
   * [LICENSE_apple](LICENSE_apple.md)
 
 - BSD 3-Clause License
-  * Applies to the DeepLabV3 decoder
+  * Applies to several encoders and the DeepLabV3 decoder
+  * [segmentation_models_pytorch/encoders/_dpn.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_dpn.py)
+  * [segmentation_models_pytorch/encoders/_inceptionresnetv2.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_inceptionresnetv2.py)
+  * [segmentation_models_pytorch/encoders/_inceptionv4.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_inceptionv4.py)
+  * [segmentation_models_pytorch/encoders/_senet.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_senet.py)
+  * [segmentation_models_pytorch/encoders/_xception.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/encoders/_xception.py)
   * [segmentation_models_pytorch/decoders/deeplabv3/decoder.py](https://github.com/qubvel/segmentation_models.pytorch/blob/main/segmentation_models_pytorch/decoders/deeplabv3/decoder.py)
 
 - Apache-2.0 License
diff --git a/pyproject.toml b/pyproject.toml
index 492f7ef9..f3e55a96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,9 +20,7 @@ dependencies = [
     'huggingface-hub>=0.24',
     'numpy>=1.19.3',
     'pillow>=8',
-    'pretrainedmodels>=0.7.1',
     'safetensors>=0.3.1',
-    'six>=1.5',
     'timm>=0.9',
     'torch>=1.8',
     'torchvision>=0.9',
diff --git a/requirements/minimum.old b/requirements/minimum.old
index 1adb97f5..678f83f4 100644
--- a/requirements/minimum.old
+++ b/requirements/minimum.old
@@ -1,9 +1,7 @@
 huggingface-hub==0.24.0
 numpy==1.19.3
 pillow==8.0.0
-pretrainedmodels==0.7.1
 safetensors==0.3.1
-six==1.5.0
 timm==0.9.0
 torch==1.9.0
 torchvision==0.10.0
diff --git a/requirements/required.txt b/requirements/required.txt
index 6864e1f6..cf3db498 100644
--- a/requirements/required.txt
+++ b/requirements/required.txt
@@ -1,9 +1,7 @@
 huggingface_hub==0.27.1
 numpy==2.2.1
 pillow==11.1.0
-pretrainedmodels==0.7.4
 safetensors==0.5.2
-six==1.17.0
 timm==1.0.13
 torch==2.5.1
 torchvision==0.20.1
diff --git a/segmentation_models_pytorch/__init__.py b/segmentation_models_pytorch/__init__.py
index f1807836..8a1e17fe 100644
--- a/segmentation_models_pytorch/__init__.py
+++ b/segmentation_models_pytorch/__init__.py
@@ -1,5 +1,3 @@
-import warnings
-
 from . import datasets
 from . import encoders
 from . import decoders
@@ -24,12 +22,6 @@
 from typing import Optional as _Optional
 import torch as _torch
 
-# Suppress the specific SyntaxWarning for `pretrainedmodels`
-warnings.filterwarnings("ignore", message="is with a literal", category=SyntaxWarning)
-warnings.filterwarnings(
-    "ignore", message=r'"is" with \'str\' literal.*', category=SyntaxWarning
-)  # for python >= 3.12
-
 _MODEL_ARCHITECTURES = [
     Unet,
     UnetPlusPlus,
diff --git a/segmentation_models_pytorch/encoders/_dpn.py b/segmentation_models_pytorch/encoders/_dpn.py
new file mode 100644
index 00000000..5ab965ca
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_dpn.py
@@ -0,0 +1,461 @@
+""" PyTorch implementation of DualPathNetworks
+Ported to PyTorch by [Ross Wightman](https://github.com/rwightman/pytorch-dpn-pretrained)
+
+Based on original MXNet implementation https://github.com/cypw/DPNs with
+many ideas from another PyTorch implementation https://github.com/oyam/pytorch-DPNs.
+
+This implementation is compatible with the pretrained weights
+from cypw's MXNet implementation.
+"""
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+from collections import OrderedDict
+
+__all__ = ['DPN', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107']
+
+pretrained_settings = {
+    'dpn68': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn68-4af7d88d2.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    },
+    'dpn68b': {
+        'imagenet+5k': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn68b_extra-363ab9c19.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    },
+    'dpn92': {
+        # 'imagenet': {
+        #     'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn68-66bebafa7.pth',
+        #     'input_space': 'RGB',
+        #     'input_size': [3, 224, 224],
+        #     'input_range': [0, 1],
+        #     'mean': [124 / 255, 117 / 255, 104 / 255],
+        #     'std': [1 / (.0167 * 255)] * 3,
+        #     'num_classes': 1000
+        # },
+        'imagenet+5k': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn92_extra-fda993c95.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    },
+    'dpn98': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn98-722954780.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    },
+    'dpn131': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn131-7af84be88.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    },
+    'dpn107': {
+        'imagenet+5k': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/dpn107_extra-b7f9f4cc9.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [124 / 255, 117 / 255, 104 / 255],
+            'std': [1 / (.0167 * 255)] * 3,
+            'num_classes': 1000
+        }
+    }
+}
+
+def dpn68(num_classes=1000, pretrained='imagenet'):
+    """Build the `dpn68` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        small=True, num_init_features=10, k_r=128, groups=32,
+        k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn68'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+def dpn68b(num_classes=1000, pretrained='imagenet+5k'):
+    """Build the `dpn68b` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        small=True, num_init_features=10, k_r=128, groups=32,
+        b=True, k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn68b'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+def dpn92(num_classes=1000, pretrained='imagenet+5k'):
+    """Build the `dpn92` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        num_init_features=64, k_r=96, groups=32,
+        k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn92'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+def dpn98(num_classes=1000, pretrained='imagenet'):
+    """Build the `dpn98` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        num_init_features=96, k_r=160, groups=40,
+        k_sec=(3, 6, 20, 3), inc_sec=(16, 32, 32, 128),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn98'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+def dpn131(num_classes=1000, pretrained='imagenet'):
+    """Build the `dpn131` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        num_init_features=128, k_r=160, groups=40,
+        k_sec=(4, 8, 28, 3), inc_sec=(16, 32, 32, 128),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn131'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+def dpn107(num_classes=1000, pretrained='imagenet+5k'):
+    """Build the `dpn107` DPN; if `pretrained` is truthy, load that weight set."""
+    net = DPN(
+        num_init_features=128, k_r=200, groups=50,
+        k_sec=(4, 8, 20, 3), inc_sec=(20, 64, 64, 128),
+        num_classes=num_classes, test_time_pool=True)
+    if not pretrained:
+        return net
+    cfg = pretrained_settings['dpn107'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+    # Load the checkpoint from its URL, then copy the preprocessing metadata over.
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+
+class CatBnAct(nn.Module):
+    """Concatenate a tuple input on dim 1, then apply BatchNorm2d and the activation."""
+    def __init__(self, in_chs, activation_fn=nn.ReLU(inplace=True)):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
+        self.act = activation_fn
+
+    def forward(self, x):
+        return self.act(self.bn(torch.cat(x, dim=1) if isinstance(x, tuple) else x))
+
+
+class BnActConv2d(nn.Module):
+    """Pre-activation unit: BatchNorm2d, then the activation, then a bias-free Conv2d."""
+    def __init__(self, in_chs, out_chs, kernel_size, stride,
+                 padding=0, groups=1, activation_fn=nn.ReLU(inplace=True)):
+        super().__init__()
+        self.bn, self.act = nn.BatchNorm2d(in_chs, eps=0.001), activation_fn
+        self.conv = nn.Conv2d(in_chs, out_chs, kernel_size, stride, padding, groups=groups, bias=False)
+
+    def forward(self, x):
+        return self.conv(self.act(self.bn(x)))
+
+
+class InputBlock(nn.Module):
+    """Stem: stride-2 conv on a 3-channel input, BatchNorm2d, activation, 3x3 stride-2 max-pool."""
+    def __init__(self, num_init_features, kernel_size=7,
+                 padding=3, activation_fn=nn.ReLU(inplace=True)):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            3, num_init_features, kernel_size=kernel_size, stride=2, padding=padding, bias=False)
+        self.bn = nn.BatchNorm2d(num_init_features, eps=0.001)
+        self.act = activation_fn
+        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        # conv -> bn -> act -> pool, in that order.
+        for stage in (self.conv, self.bn, self.act, self.pool):
+            x = stage(x)
+        return x
+
+
+class DualPathBlock(nn.Module):
+    """Dual-path block: a residual path (summed) and a dense path (concatenated).
+    `block_type`: 'proj'/'down' add a 1x1 projection (stride 1/2) of the input;
+    'normal' adds none, so forward() then needs a (resid, dense) tuple."""
+    def __init__(
+            self, in_chs, num_1x1_a, num_3x3_b, num_1x1_c, inc, groups, block_type='normal', b=False):
+        super(DualPathBlock, self).__init__()
+        self.num_1x1_c = num_1x1_c
+        self.inc = inc
+        self.b = b
+        # Compare by value: `is` on a str literal tests identity and warns on Python 3.8+.
+        if block_type == 'proj':
+            self.key_stride, self.has_proj = 1, True
+        elif block_type == 'down':
+            self.key_stride, self.has_proj = 2, True
+        else:
+            assert block_type == 'normal'
+            self.key_stride, self.has_proj = 1, False
+
+        if self.has_proj:
+            # Using different member names here to allow easier parameter key matching for conversion
+            proj = BnActConv2d(
+                in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=self.key_stride)
+            if self.key_stride == 2:
+                self.c1x1_w_s2 = proj
+            else:
+                self.c1x1_w_s1 = proj
+        self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1)
+        self.c3x3_b = BnActConv2d(
+            in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3,
+            stride=self.key_stride, padding=1, groups=groups)
+        if b:
+            self.c1x1_c = CatBnAct(in_chs=num_3x3_b)
+            self.c1x1_c1 = nn.Conv2d(num_3x3_b, num_1x1_c, kernel_size=1, bias=False)
+            self.c1x1_c2 = nn.Conv2d(num_3x3_b, inc, kernel_size=1, bias=False)
+        else:
+            self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1)
+
+    def forward(self, x):
+        """Return the updated (resid, dense) tuple for a tensor or tuple input `x`."""
+        x_in = torch.cat(x, dim=1) if isinstance(x, tuple) else x
+        if self.has_proj:
+            proj = self.c1x1_w_s2 if self.key_stride == 2 else self.c1x1_w_s1
+            x_s = proj(x_in)
+            x_s1 = x_s[:, :self.num_1x1_c, :, :]
+            x_s2 = x_s[:, self.num_1x1_c:, :, :]
+        else:
+            x_s1 = x[0]
+            x_s2 = x[1]
+        x_in = self.c1x1_a(x_in)
+        x_in = self.c3x3_b(x_in)
+        if self.b:
+            x_in = self.c1x1_c(x_in)
+            out1 = self.c1x1_c1(x_in)
+            out2 = self.c1x1_c2(x_in)
+        else:
+            x_in = self.c1x1_c(x_in)
+            out1 = x_in[:, :self.num_1x1_c, :, :]
+            out2 = x_in[:, self.num_1x1_c:, :, :]
+        resid = x_s1 + out1
+        dense = torch.cat([x_s2, out2], dim=1)
+        return resid, dense
+
+
+class DPN(nn.Module):
+    """Dual Path Network classifier.
+
+    `features` is an nn.Sequential holding an InputBlock ('conv1_1'), four stages of
+    DualPathBlocks ('conv2_*' .. 'conv5_*') and a closing CatBnAct ('conv5_bn_ac').
+    `last_linear` is a 1x1 conv that plays the role of the fully connected layer.
+    The block names become state_dict key prefixes, so they are kept verbatim.
+
+    Args:
+        small: use a 3x3 stem and a bandwidth factor of 1 (instead of a 7x7 stem
+            and a factor of 4).
+        num_init_features: output channels of the stem.
+        k_r: multiplier used to derive the bottleneck width `r` of each stage.
+        groups: forwarded to every DualPathBlock as its `groups` argument.
+        b: forwarded to every DualPathBlock as its `b` flag.
+        k_sec: number of blocks in each of the four stages.
+        inc_sec: per-stage `inc` value handed to the stage's DualPathBlocks.
+        num_classes: output channels of `last_linear`.
+        test_time_pool: when set, eval-mode `logits` uses a 7x7 average pool
+            before the classifier and an 'avgmax' pool after it.
+    """
+    def __init__(self, small=False, num_init_features=64, k_r=96, groups=32,
+                 b=False, k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128),
+                 num_classes=1000, test_time_pool=False):
+        super().__init__()
+        self.test_time_pool = test_time_pool
+        self.b = b
+        bw_factor = 1 if small else 4
+
+        # conv1: the two stem variants differ only in kernel size and padding.
+        stem_kernel, stem_pad = (3, 1) if small else (7, 3)
+        blocks = OrderedDict()
+        blocks['conv1_1'] = InputBlock(num_init_features, kernel_size=stem_kernel, padding=stem_pad)
+
+        # conv2..conv5: base widths 64/128/256/512. conv2 opens with a 'proj' block,
+        # every later stage with a 'down' block; the remaining blocks are 'normal'.
+        in_chs = num_init_features
+        for idx, base_width in enumerate((64, 128, 256, 512)):
+            prefix = 'conv' + str(idx + 2) + '_'
+            bw = base_width * bw_factor
+            inc = inc_sec[idx]
+            r = (k_r * bw) // (64 * bw_factor)
+            opener = 'proj' if idx == 0 else 'down'
+            blocks[prefix + '1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, opener, b)
+            # The opening block leaves bw + 3 * inc channels; each 'normal' block
+            # that follows adds another `inc`.
+            in_chs = bw + 3 * inc
+            for i in range(2, k_sec[idx] + 1):
+                blocks[prefix + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b)
+                in_chs += inc
+        blocks['conv5_bn_ac'] = CatBnAct(in_chs)
+
+        self.features = nn.Sequential(blocks)
+
+        # Using 1x1 conv for the FC layer to allow the extra pooling scheme
+        self.last_linear = nn.Conv2d(in_chs, num_classes, kernel_size=1, bias=True)
+
+    def logits(self, features):
+        """Pool `features`, apply `last_linear`, and flatten to (batch, -1).
+
+        In eval mode with `test_time_pool` set, a 7x7 stride-1 average pool runs
+        before the classifier and an 'avgmax' global pool after it; otherwise a
+        global average pool runs before the classifier.
+        """
+        if self.test_time_pool and not self.training:
+            pooled = F.avg_pool2d(features, kernel_size=7, stride=1)
+            scores = self.last_linear(pooled)
+            # The extra test time pool should be pooling an img_size//32 - 6 size patch
+            scores = adaptive_avgmax_pool2d(scores, pool_type='avgmax')
+        else:
+            pooled = adaptive_avgmax_pool2d(features, pool_type='avg')
+            scores = self.last_linear(pooled)
+        return scores.view(scores.size(0), -1)
+
+    def forward(self, input):
+        """Run the feature extractor, then the classification head."""
+        feats = self.features(input)
+        return self.logits(feats)
+
+""" PyTorch selectable adaptive pooling
+Adaptive pooling with the ability to select the type of pooling from:
+    * 'avg' - Average pooling
+    * 'max' - Max pooling
+    * 'avgmax' - Sum of average and max pooling re-scaled by 0.5
+    * 'avgmaxc' - Concatenation of average and max pooling along feature dim, doubles feature dim
+
+Both a functional and a nn.Module version of the pooling is provided.
+
+Author: Ross Wightman (rwightman)
+"""
+
+def pooling_factor(pool_type='avg'):
+    return (1, 2)[pool_type == 'avgmaxc']  # 2 only for the concatenating 'avgmaxc' mode
+
+
+def adaptive_avgmax_pool2d(x, pool_type='avg', padding=0, count_include_pad=False):
+    """Pool `x` with a single window covering its full dim-2 x dim-3 extent.
+
+    `pool_type` chooses 'avg', 'max', 'avgmax' (mean of both) or 'avgmaxc' (both,
+    concatenated on dim 1); any other value prints a notice and falls back to 'avg'.
+    """
+    window = (x.size(2), x.size(3))
+    def _avg():
+        return F.avg_pool2d(x, kernel_size=window, padding=padding, count_include_pad=count_include_pad)
+
+    def _max():
+        return F.max_pool2d(x, kernel_size=window, padding=padding)
+
+    if pool_type == 'avgmaxc':
+        return torch.cat([_avg(), _max()], dim=1)
+    if pool_type == 'avgmax':
+        return 0.5 * (_avg() + _max())
+    if pool_type == 'max':
+        return _max()
+    if pool_type != 'avg':
+        print('Invalid pool type %s specified. Defaulting to average pooling.' % pool_type)
+    return _avg()
+
+
+class AdaptiveAvgMaxPool2d(torch.nn.Module):
+    """Selectable adaptive pooling layer: 'avg', 'max', 'avgmax' (mean of both) or
+    'avgmaxc' (both, concatenated on dim 1); other values print a notice and use 'avg'."""
+    def __init__(self, output_size=1, pool_type='avg'):
+        super(AdaptiveAvgMaxPool2d, self).__init__()
+        self.output_size = output_size
+        self.pool_type = pool_type
+        if pool_type == 'avgmaxc' or pool_type == 'avgmax':
+            self.pool = nn.ModuleList([nn.AdaptiveAvgPool2d(output_size), nn.AdaptiveMaxPool2d(output_size)])
+        elif pool_type == 'max':
+            self.pool = nn.AdaptiveMaxPool2d(output_size)
+        else:
+            if pool_type != 'avg':
+                print('Invalid pool type %s specified. Defaulting to average pooling.' % pool_type)
+            self.pool = nn.AdaptiveAvgPool2d(output_size)
+
+    def forward(self, x):
+        """Apply the configured pooling; the batch dimension is always preserved."""
+        if self.pool_type == 'avgmaxc':
+            return torch.cat([p(x) for p in self.pool], dim=1)
+        if self.pool_type == 'avgmax':
+            # No squeeze here: squeezing dim 0 dropped the batch dim for batch size 1.
+            return 0.5 * (self.pool[0](x) + self.pool[1](x))
+        return self.pool(x)
+
+    def factor(self):
+        return pooling_factor(self.pool_type)
+
+    def __repr__(self):
+        return self.__class__.__name__ + ' (' \
+               + 'output_size=' + str(self.output_size) \
+               + ', pool_type=' + self.pool_type + ')'
diff --git a/segmentation_models_pytorch/encoders/_inceptionresnetv2.py b/segmentation_models_pytorch/encoders/_inceptionresnetv2.py
new file mode 100644
index 00000000..8f55bb0b
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_inceptionresnetv2.py
@@ -0,0 +1,380 @@
+from __future__ import print_function, division, absolute_import
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+import os
+import sys
+
+__all__ = ['InceptionResNetV2', 'inceptionresnetv2']
+
+pretrained_settings = {
+    'inceptionresnetv2': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1000
+        },
+        'imagenet+background': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1001
+        }
+    }
+}
+
+
+class BasicConv2d(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and a non-inplace ReLU."""
+
+    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_planes, out_planes, kernel_size=kernel_size,
+            stride=stride, padding=padding, bias=False)  # verify bias false
+        # eps: value found in tensorflow; momentum: default pytorch value
+        self.bn = nn.BatchNorm2d(
+            out_planes, eps=0.001, momentum=0.1, affine=True)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        # conv -> bn -> relu
+        for layer in (self.conv, self.bn, self.relu):
+            x = layer(x)
+        return x
+
+
+class Mixed_5b(nn.Module):
+    """Four parallel branches over a 192-channel input, concatenated on dim 1."""
+
+    def __init__(self):
+        super().__init__()
+        # 1x1 branch
+        self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1)
+
+        # 1x1 -> 5x5 branch
+        self.branch1 = nn.Sequential(
+            BasicConv2d(192, 48, kernel_size=1, stride=1),
+            BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2),
+        )
+
+        # 1x1 -> 3x3 -> 3x3 branch
+        self.branch2 = nn.Sequential(
+            BasicConv2d(192, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1),
+        )
+
+        # average-pool -> 1x1 branch
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(192, 64, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        return torch.cat([branch(x) for branch in branches], 1)
+
+
+class Block35(nn.Module):
+    """Residual block on 320 channels: relu(x + scale * conv2d(cat(branches(x))))."""
+
+    def __init__(self, scale=1.0):
+        super().__init__()
+        self.scale = scale
+
+        # The three branches emit 32 + 32 + 64 = 128 channels in total.
+        self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(320, 32, kernel_size=1, stride=1),
+            BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(320, 32, kernel_size=1, stride=1),
+            BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1),
+        )
+
+        # 1x1 conv maps the concatenation back to the 320-channel input width.
+        self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        branches = (self.branch0, self.branch1, self.branch2)
+        mixed = torch.cat([branch(x) for branch in branches], 1)
+        # Scale the branch mix before adding the identity path.
+        residual = self.conv2d(mixed) * self.scale
+        out = residual + x
+        return self.relu(out)
+
+
+class Mixed_6a(nn.Module):
+    """Reduction block on 320 channels: two stride-2 conv branches plus a stride-2 max-pool."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2)
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(320, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(256, 384, kernel_size=3, stride=2),
+        )
+
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        # All three branches end in a 3x3 stride-2 op; outputs are joined on dim 1.
+        branches = (self.branch0, self.branch1, self.branch2)
+        outs = [branch(x) for branch in branches]
+        return torch.cat(outs, 1)
+
+
+class Block17(nn.Module):
+    """Residual block on 1088 channels: relu(x + scale * conv2d(cat(branches(x))))."""
+
+    def __init__(self, scale=1.0):
+        super().__init__()
+        self.scale = scale
+
+        self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1)
+
+        # 1x1 followed by a 1x7 / 7x1 pair.
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1088, 128, kernel_size=1, stride=1),
+            BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+        )
+
+        # 1x1 conv maps the 192 + 192 concatenation back to 1088 channels.
+        self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1)
+        self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        mixed = torch.cat([self.branch0(x), self.branch1(x)], 1)
+        # Scale the branch mix before adding the identity path.
+        residual = self.conv2d(mixed) * self.scale
+        out = residual + x
+        return self.relu(out)
+
+
+class Mixed_7a(nn.Module):
+    """Reduction block on 1088 channels: three conv branches ending in stride 2 plus a max-pool."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.branch0 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 384, kernel_size=3, stride=2),
+        )
+
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 288, kernel_size=3, stride=2),
+        )
+
+        self.branch2 = nn.Sequential(
+            BasicConv2d(1088, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(288, 320, kernel_size=3, stride=2),
+        )
+
+        # Pooling branch: passes the input channels through unchanged.
+        self.branch3 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        # Outputs are joined on dim 1 in branch order.
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        outs = [branch(x) for branch in branches]
+        return torch.cat(outs, 1)
+
+
+class Block8(nn.Module):
+    """Residual block on 2080 channels; the final ReLU is skipped when `noReLU` is set."""
+
+    def __init__(self, scale=1.0, noReLU=False):
+        super().__init__()
+        self.scale = scale
+        self.noReLU = noReLU
+
+        self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1)
+
+        # 1x1 followed by a 1x3 / 3x1 pair.
+        self.branch1 = nn.Sequential(
+            BasicConv2d(2080, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1)),
+            BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)),
+        )
+
+        # 1x1 conv maps the 192 + 256 concatenation back to 2080 channels.
+        self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1)
+        if not self.noReLU:
+            self.relu = nn.ReLU(inplace=False)
+
+    def forward(self, x):
+        mixed = torch.cat([self.branch0(x), self.branch1(x)], 1)
+        # Scaled branch mix plus the identity path.
+        out = self.conv2d(mixed) * self.scale + x
+        if self.noReLU:
+            return out
+        return self.relu(out)
+
+
+class InceptionResNetV2(nn.Module):
+    """Inception-ResNet-v2: stem, 10x Block35, 20x Block17, 9x Block8 plus a ReLU-free Block8, linear head."""
+    def __init__(self, num_classes=1001):
+        super(InceptionResNetV2, self).__init__()
+        # Special attributes
+        self.input_space = None
+        self.input_size = (299, 299, 3)  # NOTE(review): (H, W, C) order here, but pretrained_settings lists [3, 299, 299] - confirm
+        self.mean = None
+        self.std = None
+        # Modules
+        self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2)
+        self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1)
+        self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1)
+        self.maxpool_3a = nn.MaxPool2d(3, stride=2)
+        self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
+        self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
+        self.maxpool_5a = nn.MaxPool2d(3, stride=2)
+        self.mixed_5b = Mixed_5b()
+        self.repeat = nn.Sequential(  # 10 x Block35
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17),
+            Block35(scale=0.17)
+        )
+        self.mixed_6a = Mixed_6a()
+        self.repeat_1 = nn.Sequential(  # 20 x Block17
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10),
+            Block17(scale=0.10)
+        )
+        self.mixed_7a = Mixed_7a()
+        self.repeat_2 = nn.Sequential(  # 9 x Block8
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20),
+            Block8(scale=0.20)
+        )
+        self.block8 = Block8(noReLU=True)  # last residual block: default scale, no ReLU
+        self.conv2d_7b = BasicConv2d(2080, 1536, kernel_size=1, stride=1)
+        self.avgpool_1a = nn.AvgPool2d(8, count_include_pad=False)  # fixed 8x8 window; presumably the feature-map size for 299x299 input - TODO confirm
+        self.last_linear = nn.Linear(1536, num_classes)
+
+    def features(self, input):  # convolutional trunk; returns the conv2d_7b output
+        x = self.conv2d_1a(input)
+        x = self.conv2d_2a(x)
+        x = self.conv2d_2b(x)
+        x = self.maxpool_3a(x)
+        x = self.conv2d_3b(x)
+        x = self.conv2d_4a(x)
+        x = self.maxpool_5a(x)
+        x = self.mixed_5b(x)
+        x = self.repeat(x)
+        x = self.mixed_6a(x)
+        x = self.repeat_1(x)
+        x = self.mixed_7a(x)
+        x = self.repeat_2(x)
+        x = self.block8(x)
+        x = self.conv2d_7b(x)
+        return x
+
+    def logits(self, features):  # average-pool, flatten to (batch, -1), then last_linear
+        x = self.avgpool_1a(features)
+        x = x.view(x.size(0), -1)
+        x = self.last_linear(x)
+        return x
+
+    def forward(self, input):  # features() followed by logits()
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+def inceptionresnetv2(num_classes=1000, pretrained='imagenet'):
+    r"""InceptionResNetV2 model architecture from the
+    `"InceptionV4, Inception-ResNet..." <https://arxiv.org/abs/1602.07261>`_ paper.
+
+    With `pretrained` falsy an untrained model with `num_classes` outputs is built;
+    otherwise `pretrained` selects the entry of `pretrained_settings` to load.
+    """
+    if not pretrained:
+        return InceptionResNetV2(num_classes=num_classes)
+    cfg = pretrained_settings['inceptionresnetv2'][pretrained]
+    expected = cfg['num_classes']
+    assert num_classes == expected, \
+        "num_classes should be {}, but is {}".format(expected, num_classes)
+
+    # both 'imagenet'&'imagenet+background' are loaded from same parameters
+    net = InceptionResNetV2(num_classes=1001)
+    net.load_state_dict(model_zoo.load_url(cfg['url']))
+
+    if pretrained == 'imagenet':
+        # Drop row 0 of the 1001-way classifier to obtain the 1000-way head.
+        head = nn.Linear(1536, 1000)
+        head.weight.data = net.last_linear.weight.data[1:]
+        head.bias.data = net.last_linear.bias.data[1:]
+        net.last_linear = head
+
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(net, key, cfg[key])
+    return net
+
+'''
+TEST
+Run this code with:
+```
+cd $HOME/segmentation_models.pytorch
+python -m segmentation_models_pytorch.encoders._inceptionresnetv2
+```
+'''
+if __name__ == '__main__':
+    # Smoke test; the pretrained cases call model_zoo.load_url on the checkpoint URL.
+    assert inceptionresnetv2(num_classes=10, pretrained=None)
+    print('success')
+    assert inceptionresnetv2(num_classes=1000, pretrained='imagenet')
+    print('success')
+    assert inceptionresnetv2(num_classes=1001, pretrained='imagenet+background')
+    print('success')
+
+    # fail: the 'imagenet' settings require num_classes == 1000, so the factory's assert trips
+    assert inceptionresnetv2(num_classes=1001, pretrained='imagenet')
\ No newline at end of file
diff --git a/segmentation_models_pytorch/encoders/_inceptionv4.py b/segmentation_models_pytorch/encoders/_inceptionv4.py
new file mode 100644
index 00000000..d48f7b77
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_inceptionv4.py
@@ -0,0 +1,358 @@
+from __future__ import print_function, division, absolute_import
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+import os
+import sys
+
+__all__ = ['InceptionV4', 'inceptionv4']
+# Weight sets accepted by inceptionv4(): model name -> `pretrained` value -> metadata.
+pretrained_settings = {
+    'inceptionv4': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionv4-8e4777a0.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1000
+        },
+        'imagenet+background': {  # same checkpoint URL; only num_classes differs
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/inceptionv4-8e4777a0.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1001
+        }
+    }
+}
+
+
+class BasicConv2d(nn.Module):
+    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an in-place ``ReLU``."""
+
+    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
+        """Create the three sub-modules.
+
+        ``kernel_size``, ``stride`` and ``padding`` are forwarded to ``nn.Conv2d``.
+        """
+        super(BasicConv2d, self).__init__()
+        conv_args = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        self.conv = nn.Conv2d(in_planes, out_planes, bias=False, **conv_args)
+        # eps=0.001 is the value found in tensorflow; momentum=0.1 is the pytorch default
+        self.bn = nn.BatchNorm2d(out_planes, eps=0.001, momentum=0.1, affine=True)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Apply convolution, batch normalisation and ReLU, in that order."""
+        return self.relu(self.bn(self.conv(x)))
+
+
+class Mixed_3a(nn.Module):
+    """Joins a stride-2 3x3 max-pool with a stride-2 3x3 conv: 64 -> 64 + 96 channels."""
+
+    def __init__(self):
+        super(Mixed_3a, self).__init__()
+        self.maxpool = nn.MaxPool2d(3, stride=2)
+        self.conv = BasicConv2d(64, 96, kernel_size=3, stride=2)
+
+    def forward(self, x):
+        """Return the pooled and the convolved input concatenated along dim 1."""
+        halves = (self.maxpool(x), self.conv(x))
+        return torch.cat(halves, 1)
+
+
+class Mixed_4a(nn.Module):
+    """Two parallel conv stacks over a 160-channel input, concatenated along dim 1."""
+
+    def __init__(self):
+        super(Mixed_4a, self).__init__()
+        # 1x1 reduction followed by an unpadded 3x3
+        self.branch0 = nn.Sequential(
+            BasicConv2d(160, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1),
+        )
+        # same ends as branch0, with a padded 1x7 and 7x1 inserted before the 3x3
+        self.branch1 = nn.Sequential(
+            BasicConv2d(160, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(64, 96, kernel_size=(3, 3), stride=1),
+        )
+
+    def forward(self, x):
+        """Run both branches on ``x`` and concatenate their outputs (96 + 96 channels)."""
+        outputs = [branch(x) for branch in (self.branch0, self.branch1)]
+        return torch.cat(outputs, 1)
+
+
+class Mixed_5a(nn.Module):
+    """Joins a stride-2 3x3 conv (192 -> 192) with a stride-2 3x3 max-pool of the input."""
+
+    def __init__(self):
+        super(Mixed_5a, self).__init__()
+        self.conv = BasicConv2d(192, 192, kernel_size=3, stride=2)
+        self.maxpool = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        """Return conv and pool outputs concatenated along dim 1 (conv channels first)."""
+        halves = (self.conv(x), self.maxpool(x))
+        return torch.cat(halves, 1)
+
+
+class Inception_A(nn.Module):
+    """Four parallel branches over a 384-channel input, each emitting 96 channels."""
+
+    def __init__(self):
+        super(Inception_A, self).__init__()
+        # 1x1 only
+        self.branch0 = BasicConv2d(384, 96, kernel_size=1, stride=1)
+        # 1x1 -> 3x3
+        self.branch1 = nn.Sequential(
+            BasicConv2d(384, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+        )
+        # 1x1 -> 3x3 -> 3x3
+        self.branch2 = nn.Sequential(
+            BasicConv2d(384, 64, kernel_size=1, stride=1),
+            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1),
+        )
+        # 3x3 average pool (padding excluded from the mean) -> 1x1
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(384, 96, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        """Concatenate the four branch outputs along dim 1 (4 * 96 = 384 channels)."""
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        outputs = [branch(x) for branch in branches]
+        return torch.cat(outputs, 1)
+
+
+class Reduction_A(nn.Module):
+    """Stride-2 block: 384 input channels -> 384 + 256 + 384 = 1024 output channels."""
+
+    def __init__(self):
+        super(Reduction_A, self).__init__()
+        self.branch0 = BasicConv2d(384, 384, kernel_size=3, stride=2)
+        # 1x1 -> padded 3x3 -> stride-2 3x3
+        self.branch1 = nn.Sequential(
+            BasicConv2d(384, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=3, stride=1, padding=1),
+            BasicConv2d(224, 256, kernel_size=3, stride=2),
+        )
+        # parameter-free branch: the 384 input channels go through a max-pool
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        """Concatenate the conv, conv-stack and max-pool outputs along dim 1."""
+        branches = (self.branch0, self.branch1, self.branch2)
+        outputs = [branch(x) for branch in branches]
+        return torch.cat(outputs, 1)
+
+
+class Inception_B(nn.Module):
+    """Four parallel branches over 1024 channels: 384 + 256 + 256 + 128 = 1024 out."""
+
+    def __init__(self):
+        super(Inception_B, self).__init__()
+        # 1x1 only
+        self.branch0 = BasicConv2d(1024, 384, kernel_size=1, stride=1)
+        # 1x1 -> 1x7 -> 7x1
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(224, 256, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+        )
+        # 1x1 -> 7x1 -> 1x7 -> 7x1 -> 1x7
+        self.branch2 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(224, 224, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(224, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+        )
+        # 3x3 average pool (padding excluded from the mean) -> 1x1
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(1024, 128, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        """Concatenate the four branch outputs along dim 1, in branch order."""
+        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
+        outputs = [branch(x) for branch in branches]
+        return torch.cat(outputs, 1)
+
+
+class Reduction_B(nn.Module):
+    """Stride-2 block: 1024 input channels -> 192 + 320 + 1024 = 1536 output channels."""
+
+    def __init__(self):
+        super(Reduction_B, self).__init__()
+        # 1x1 -> stride-2 3x3
+        self.branch0 = nn.Sequential(
+            BasicConv2d(1024, 192, kernel_size=1, stride=1),
+            BasicConv2d(192, 192, kernel_size=3, stride=2),
+        )
+        # 1x1 -> 1x7 -> 7x1 -> stride-2 3x3
+        self.branch1 = nn.Sequential(
+            BasicConv2d(1024, 256, kernel_size=1, stride=1),
+            BasicConv2d(256, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)),
+            BasicConv2d(256, 320, kernel_size=(7, 1), stride=1, padding=(3, 0)),
+            BasicConv2d(320, 320, kernel_size=3, stride=2),
+        )
+        # parameter-free branch: the 1024 input channels go through a max-pool
+        self.branch2 = nn.MaxPool2d(3, stride=2)
+
+    def forward(self, x):
+        """Concatenate the two conv stacks and the max-pool output along dim 1."""
+        branches = (self.branch0, self.branch1, self.branch2)
+        outputs = [branch(x) for branch in branches]
+        return torch.cat(outputs, 1)
+
+
+class Inception_C(nn.Module):
+    """Four-branch block over a 1536-channel input that also emits 1536 channels.
+
+    Branches 0 and 3 contribute 256 channels each. Branches 1 and 2 each finish
+    with a 1x3 and a 3x1 convolution applied in parallel to the same tensor, so
+    each of them contributes 2 * 256 channels.
+    """
+
+    def __init__(self):
+        super(Inception_C, self).__init__()
+        # branch 0: 1x1 only
+        self.branch0 = BasicConv2d(1536, 256, kernel_size=1, stride=1)
+        # branch 1: 1x1, then parallel 1x3 / 3x1
+        self.branch1_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
+        self.branch1_1a = BasicConv2d(384, 256, kernel_size=(1, 3), stride=1, padding=(0, 1))
+        self.branch1_1b = BasicConv2d(384, 256, kernel_size=(3, 1), stride=1, padding=(1, 0))
+        # branch 2: 1x1 -> 3x1 -> 1x3, then parallel 1x3 / 3x1
+        self.branch2_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
+        self.branch2_1 = BasicConv2d(384, 448, kernel_size=(3, 1), stride=1, padding=(1, 0))
+        self.branch2_2 = BasicConv2d(448, 512, kernel_size=(1, 3), stride=1, padding=(0, 1))
+        self.branch2_3a = BasicConv2d(512, 256, kernel_size=(1, 3), stride=1, padding=(0, 1))
+        self.branch2_3b = BasicConv2d(512, 256, kernel_size=(3, 1), stride=1, padding=(1, 0))
+        # branch 3: 3x3 average pool (padding excluded from the mean) -> 1x1
+        self.branch3 = nn.Sequential(
+            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
+            BasicConv2d(1536, 256, kernel_size=1, stride=1),
+        )
+
+    def forward(self, x):
+        """Concatenate the branch outputs along dim 1, in branch order."""
+        out0 = self.branch0(x)
+
+        stem1 = self.branch1_0(x)
+        out1 = torch.cat((self.branch1_1a(stem1), self.branch1_1b(stem1)), 1)
+
+        stem2 = self.branch2_2(self.branch2_1(self.branch2_0(x)))
+        out2 = torch.cat((self.branch2_3a(stem2), self.branch2_3b(stem2)), 1)
+
+        out3 = self.branch3(x)
+
+        return torch.cat((out0, out1, out2, out3), 1)
+
+
+class InceptionV4(nn.Module):
+    """Inception-v4 classifier: a 22-module ``features`` stack plus ``last_linear``."""
+
+    def __init__(self, num_classes=1001):
+        super(InceptionV4, self).__init__()
+        # Special attributes; inceptionv4() overwrites them for pretrained weights
+        self.input_space = None
+        self.input_size = (299, 299, 3)  # NOTE(review): channel-last, unlike pretrained_settings
+        self.input_range = None
+        self.mean = None
+        self.std = None
+        # Modules
+        self.features = nn.Sequential(
+            BasicConv2d(3, 32, kernel_size=3, stride=2),
+            BasicConv2d(32, 32, kernel_size=3, stride=1),
+            BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            Mixed_3a(),
+            Mixed_4a(),
+            Mixed_5a(),
+            Inception_A(),
+            Inception_A(),
+            Inception_A(),
+            Inception_A(),
+            Reduction_A(), # Mixed_6a
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Inception_B(),
+            Reduction_B(), # Mixed_7a
+            Inception_C(),
+            Inception_C(),
+            Inception_C()
+        )
+        self.last_linear = nn.Linear(1536, num_classes)
+
+    def logits(self, features):
+        """Globally average-pool ``features`` (N, 1536, H, W) and classify the result."""
+        # Pool over the full H x W extent so non-square feature maps are handled too
+        x = F.avg_pool2d(features, kernel_size=tuple(features.shape[2:]))
+        x = x.view(x.size(0), -1)
+        return self.last_linear(x)
+
+    def forward(self, input):
+        """Return class logits for the image batch ``input``."""
+        return self.logits(self.features(input))
+
+
+def inceptionv4(num_classes=1000, pretrained='imagenet'):
+    """Build InceptionV4, loading the `pretrained` weight set when it is truthy.
+
+    The requested `num_classes` must equal the value recorded for that weight set.
+    """
+    if not pretrained:
+        return InceptionV4(num_classes=num_classes)
+
+    settings = pretrained_settings['inceptionv4'][pretrained]
+    assert num_classes == settings['num_classes'], \
+        "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+
+    # the checkpoint holds 1001 outputs for both 'imagenet' and 'imagenet+background'
+    model = InceptionV4(num_classes=1001)
+    model.load_state_dict(model_zoo.load_url(settings['url']))
+    if pretrained == 'imagenet':
+        # drop output 0 so that 1000 outputs remain
+        head = nn.Linear(1536, 1000)
+        head.weight.data = model.last_linear.weight.data[1:]
+        head.bias.data = model.last_linear.bias.data[1:]
+        model.last_linear = head
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(model, key, settings[key])
+    return model
+
+
+'''
+TEST
+Run this code from the repository root with:
+```
+cd $HOME/segmentation_models.pytorch
+python -m segmentation_models_pytorch.encoders._inceptionv4
+```
+'''
+if __name__ == '__main__':
+    # Smoke test: three valid configurations, then one that must be rejected.
+    for n_out, weights in ((10, None), (1000, 'imagenet'), (1001, 'imagenet+background')):
+        assert inceptionv4(num_classes=n_out, pretrained=weights)
+        print('success')
+    try:
+        inceptionv4(num_classes=1001, pretrained='imagenet')
+    except AssertionError:
+        print('success')  # 1001 outputs with the 1000-class settings was refused
+    else:
+        raise SystemExit('fail: num_classes mismatch was accepted')
diff --git a/segmentation_models_pytorch/encoders/_senet.py b/segmentation_models_pytorch/encoders/_senet.py
new file mode 100644
index 00000000..20bd122f
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_senet.py
@@ -0,0 +1,442 @@
+"""
+ResNet code gently borrowed from
+https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+"""
+from __future__ import print_function, division, absolute_import
+from collections import OrderedDict
+import math
+
+import torch.nn as nn
+from torch.utils import model_zoo
+
+__all__ = ['SENet', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152',
+           'se_resnext50_32x4d', 'se_resnext101_32x4d']
+# Checkpoint URL and preprocessing metadata: model name -> `pretrained` value -> dict.
+pretrained_settings = {
+    'senet154': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+    'se_resnet50': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/se_resnet50-ce0d4300.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+    'se_resnet101': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/se_resnet101-7e38fcc6.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+    'se_resnet152': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/se_resnet152-d17c99b7.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+    'se_resnext50_32x4d': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+    'se_resnext101_32x4d': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/se_resnext101_32x4d-3b2fe3d8.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.485, 0.456, 0.406],
+            'std': [0.229, 0.224, 0.225],
+            'num_classes': 1000
+        }
+    },
+}
+
+
+class SEModule(nn.Module):
+    """Rescales each channel of the input by a gate computed from its global average."""
+
+    def __init__(self, channels, reduction):
+        super(SEModule, self).__init__()
+        squeezed = channels // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        # the two "fc" layers are 1x1 convolutions acting on the pooled 1x1 map
+        self.fc1 = nn.Conv2d(channels, squeezed, kernel_size=1, padding=0)
+        self.relu = nn.ReLU(inplace=True)
+        self.fc2 = nn.Conv2d(squeezed, channels, kernel_size=1, padding=0)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        """Return ``x`` multiplied by its sigmoid gate (broadcast over H and W)."""
+        gate = self.avg_pool(x)
+        gate = self.fc1(gate)
+        gate = self.relu(gate)
+        gate = self.sigmoid(self.fc2(gate))
+        return x * gate
+
+
+class Bottleneck(nn.Module):
+    """
+    Base class for bottlenecks that implements `forward()` method.
+
+    Subclasses are expected to set `conv1`/`bn1`, `conv2`/`bn2`, `conv3`/`bn3`,
+    `relu`, `se_module` and `downsample` (a module or `None`) in `__init__`.
+    """
+    def forward(self, x):
+        """Return `relu(se_module(main_path(x)) + shortcut(x))`.
+
+        The shortcut is `x` itself unless a `downsample` module is set.
+        """
+        # three conv/bn stages; no activation after the last one
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+
+        if self.downsample is None:
+            shortcut = x
+        else:
+            shortcut = self.downsample(x)
+
+        # apply `se_module` to the main path, then add the shortcut
+        out = self.se_module(out) + shortcut
+        return self.relu(out)
+
+
+class SEBottleneck(Bottleneck):
+    """
+    Bottleneck for SENet154: `conv1` widens to `2 * planes`; the grouped 3x3
+    `conv2` (which carries the stride) and `conv3` both emit `4 * planes`.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1,
+                 downsample=None):
+        super(SEBottleneck, self).__init__()
+        mid_planes, out_planes = planes * 2, planes * 4
+        self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes)
+        self.conv2 = nn.Conv2d(mid_planes, out_planes, kernel_size=3, stride=stride,
+                               padding=1, groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_planes)
+        self.conv3 = nn.Conv2d(out_planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(out_planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNetBottleneck(Bottleneck):
+    """
+    ResNet bottleneck with a Squeeze-and-Excitation module. The stride is placed
+    on the 1x1 `conv1` (Caffe convention) rather than on the 3x3 `conv2` as in
+    the torchvision implementation of ResNet.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1,
+                 downsample=None):
+        super(SEResNetBottleneck, self).__init__()
+        out_planes = planes * 4
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, groups=groups,
+                               bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(out_planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SEResNeXtBottleneck(Bottleneck):
+    """
+    ResNeXt type-C bottleneck with an SE module; the grouped 3x3 conv is strided.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, groups, reduction, stride=1,
+                 downsample=None, base_width=4):
+        super(SEResNeXtBottleneck, self).__init__()
+        width = math.floor(planes * (base_width / 64)) * groups
+        out_planes = planes * 4
+        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, stride=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width)
+        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1,
+                               groups=groups, bias=False)
+        self.bn2 = nn.BatchNorm2d(width)
+        self.conv3 = nn.Conv2d(width, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se_module = SEModule(out_planes, reduction=reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+
+class SENet(nn.Module):
+    """Squeeze-and-Excitation network assembled from a pluggable bottleneck class.
+
+    Sub-modules, in registration order: `layer0` (input convolutions and
+    max-pool), `layer1`..`layer4` (bottleneck stages), `avg_pool`, `dropout`
+    (a module only when `dropout_p` is not None) and `last_linear`.
+    """
+
+    def __init__(self, block, layers, groups, reduction, dropout_p=0.2,
+                 inplanes=128, input_3x3=True, downsample_kernel_size=3,
+                 downsample_padding=1, num_classes=1000):
+        """
+        Parameters
+        ----------
+        block (nn.Module): Bottleneck class used for every residual block.
+            - For SENet154: SEBottleneck
+            - For SE-ResNet models: SEResNetBottleneck
+            - For SE-ResNeXt models:  SEResNeXtBottleneck
+        layers (list of ints): How many residual blocks each of the 4 stages
+            (layer1...layer4) contains.
+        groups (int): Group count of the 3x3 convolution inside each
+            bottleneck block.
+            - For SENet154: 64
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models:  32
+        reduction (int): Reduction ratio of the Squeeze-and-Excitation modules.
+            - For all models: 16
+        dropout_p (float or None): Drop probability of the Dropout layer.
+            With `None` no Dropout layer is created.
+            - For SENet154: 0.2
+            - For SE-ResNet models: None
+            - For SE-ResNeXt models: None
+        inplanes (int):  Channel count fed into layer1.
+            - For SENet154: 128
+            - For SE-ResNet models: 64
+            - For SE-ResNeXt models: 64
+        input_3x3 (bool): When `True`, layer0 uses three 3x3 convolutions
+            rather than a single 7x7 convolution.
+            - For SENet154: True
+            - For SE-ResNet models: False
+            - For SE-ResNeXt models: False
+        downsample_kernel_size (int): Kernel size of the downsampling
+            convolutions in layer2, layer3 and layer4.
+            - For SENet154: 3
+            - For SE-ResNet models: 1
+            - For SE-ResNeXt models: 1
+        downsample_padding (int): Padding of the downsampling convolutions
+            in layer2, layer3 and layer4.
+            - For SENet154: 1
+            - For SE-ResNet models: 0
+            - For SE-ResNeXt models: 0
+        num_classes (int): Output size of the `last_linear` layer.
+            - For all models: 1000
+        """
+        super(SENet, self).__init__()
+        self.inplanes = inplanes
+
+        stem = OrderedDict()
+        if input_3x3:
+            stem['conv1'] = nn.Conv2d(3, 64, 3, stride=2, padding=1, bias=False)
+            stem['bn1'] = nn.BatchNorm2d(64)
+            stem['relu1'] = nn.ReLU(inplace=True)
+            stem['conv2'] = nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)
+            stem['bn2'] = nn.BatchNorm2d(64)
+            stem['relu2'] = nn.ReLU(inplace=True)
+            stem['conv3'] = nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)
+            stem['bn3'] = nn.BatchNorm2d(inplanes)
+            stem['relu3'] = nn.ReLU(inplace=True)
+        else:
+            stem['conv1'] = nn.Conv2d(3, inplanes, kernel_size=7, stride=2,
+                                      padding=3, bias=False)
+            stem['bn1'] = nn.BatchNorm2d(inplanes)
+            stem['relu1'] = nn.ReLU(inplace=True)
+        # To preserve compatibility with Caffe weights `ceil_mode=True`
+        # is used instead of `padding=1`.
+        stem['pool'] = nn.MaxPool2d(3, stride=2, ceil_mode=True)
+        self.layer0 = nn.Sequential(stem)
+
+        shared = dict(groups=groups, reduction=reduction)
+        strided = dict(
+            shared,
+            stride=2,
+            downsample_kernel_size=downsample_kernel_size,
+            downsample_padding=downsample_padding,
+        )
+        # layer1 keeps stride 1 and always uses a 1x1, unpadded shortcut conv
+        self.layer1 = self._make_layer(
+            block, planes=64, blocks=layers[0],
+            downsample_kernel_size=1, downsample_padding=0, **shared
+        )
+        self.layer2 = self._make_layer(
+            block, planes=128, blocks=layers[1], **strided
+        )
+        self.layer3 = self._make_layer(
+            block, planes=256, blocks=layers[2], **strided
+        )
+        self.layer4 = self._make_layer(
+            block, planes=512, blocks=layers[3], **strided
+        )
+
+        # The 7x7 window collapses only a 7x7 `layer4` map to 1x1, which is what
+        # the flattened size expected by `last_linear` requires (e.g. for the
+        # 224x224 input size listed in `pretrained_settings`).
+        self.avg_pool = nn.AvgPool2d(7, stride=1)
+        if dropout_p is None:
+            self.dropout = None
+        else:
+            self.dropout = nn.Dropout(dropout_p)
+        self.last_linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, blocks, groups, reduction, stride=1,
+                    downsample_kernel_size=1, downsample_padding=0):
+        """Build one stage of `blocks` bottlenecks and advance `self.inplanes`.
+
+        Only the first block receives `stride` and, when the stride is not 1 or
+        the channel count changes, a conv + batch-norm shortcut projection.
+        """
+        out_planes = planes * block.expansion
+        downsample = None
+        if stride != 1 or self.inplanes != out_planes:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, out_planes,
+                          kernel_size=downsample_kernel_size, stride=stride,
+                          padding=downsample_padding, bias=False),
+                nn.BatchNorm2d(out_planes),
+            )
+
+        stage = [block(self.inplanes, planes, groups, reduction, stride, downsample)]
+        self.inplanes = out_planes
+        stage.extend(block(self.inplanes, planes, groups, reduction)
+                     for _ in range(1, blocks))
+        return nn.Sequential(*stage)
+
+    def features(self, x):
+        """Run `layer0` through `layer4` in sequence.
+
+        Returns the output of `layer4`.
+        """
+        stages = (self.layer0, self.layer1, self.layer2, self.layer3, self.layer4)
+        for stage in stages:
+            x = stage(x)
+        return x
+
+    def logits(self, x):
+        """Classify a feature map produced by `features()`.
+
+        Applies `avg_pool`, then `dropout` when present, flattens and feeds `last_linear`.
+        """
+        x = self.avg_pool(x)
+        if self.dropout is not None:
+            x = self.dropout(x)
+        flat = x.view(x.size(0), -1)
+        return self.last_linear(flat)
+
+    def forward(self, x):
+        """Return class logits for the input batch `x`.
+
+        Equivalent to `logits(features(x))`.
+        """
+        return self.logits(self.features(x))
+
+
+def initialize_pretrained_model(model, num_classes, settings):
+    """Load the weights at `settings['url']` into `model` in place.
+
+    Also copies the preprocessing metadata from `settings` onto the model.
+    """
+    assert num_classes == settings['num_classes'], \
+        'num_classes should be {}, but is {}'.format(settings['num_classes'], num_classes)
+    model.load_state_dict(model_zoo.load_url(settings['url']))
+    for key in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+        setattr(model, key, settings[key])
+
+
+def senet154(num_classes=1000, pretrained='imagenet'):
+    """Build SENet154; load the `pretrained` weight set unless it is None."""
+    net = SENet(SEBottleneck, [3, 8, 36, 3], groups=64, reduction=16,
+                dropout_p=0.2, num_classes=num_classes)
+    if pretrained is not None:
+        initialize_pretrained_model(net, num_classes, pretrained_settings['senet154'][pretrained])
+    return net
+
+
+def se_resnet50(num_classes=1000, pretrained='imagenet'):
+    """Build SE-ResNet-50; load the `pretrained` weight set unless it is None."""
+    net = SENet(SEResNetBottleneck, [3, 4, 6, 3], groups=1, reduction=16,
+                dropout_p=None, inplanes=64, input_3x3=False, num_classes=num_classes,
+                downsample_kernel_size=1, downsample_padding=0)
+    if pretrained is not None:
+        weights = pretrained_settings['se_resnet50'][pretrained]
+        initialize_pretrained_model(net, num_classes, weights)
+    return net
+
+
+def se_resnet101(num_classes=1000, pretrained='imagenet'):
+    """Build SE-ResNet-101; load the `pretrained` weight set unless it is None."""
+    net = SENet(SEResNetBottleneck, [3, 4, 23, 3], groups=1, reduction=16,
+                dropout_p=None, inplanes=64, input_3x3=False, num_classes=num_classes,
+                downsample_kernel_size=1, downsample_padding=0)
+    if pretrained is not None:
+        weights = pretrained_settings['se_resnet101'][pretrained]
+        initialize_pretrained_model(net, num_classes, weights)
+    return net
+
+
+def se_resnet152(num_classes=1000, pretrained='imagenet'):
+    """Build SE-ResNet-152; load the `pretrained` weight set unless it is None."""
+    net = SENet(SEResNetBottleneck, [3, 8, 36, 3], groups=1, reduction=16,
+                dropout_p=None, inplanes=64, input_3x3=False, num_classes=num_classes,
+                downsample_kernel_size=1, downsample_padding=0)
+    if pretrained is not None:
+        weights = pretrained_settings['se_resnet152'][pretrained]
+        initialize_pretrained_model(net, num_classes, weights)
+    return net
+
+
+def se_resnext50_32x4d(num_classes=1000, pretrained='imagenet'):
+    """Build SE-ResNeXt-50 (32x4d); load the `pretrained` weight set unless it is None."""
+    net = SENet(SEResNeXtBottleneck, [3, 4, 6, 3], groups=32, reduction=16,
+                dropout_p=None, inplanes=64, input_3x3=False, num_classes=num_classes,
+                downsample_kernel_size=1, downsample_padding=0)
+    if pretrained is not None:
+        weights = pretrained_settings['se_resnext50_32x4d'][pretrained]
+        initialize_pretrained_model(net, num_classes, weights)
+    return net
+
+
+def se_resnext101_32x4d(num_classes=1000, pretrained='imagenet'):
+    """Build SE-ResNeXt-101 (32x4d); load the `pretrained` weight set unless it is None."""
+    net = SENet(SEResNeXtBottleneck, [3, 4, 23, 3], groups=32, reduction=16,
+                dropout_p=None, inplanes=64, input_3x3=False, num_classes=num_classes,
+                downsample_kernel_size=1, downsample_padding=0)
+    if pretrained is not None:
+        weights = pretrained_settings['se_resnext101_32x4d'][pretrained]
+        initialize_pretrained_model(net, num_classes, weights)
+    return net
diff --git a/segmentation_models_pytorch/encoders/_xception.py b/segmentation_models_pytorch/encoders/_xception.py
new file mode 100644
index 00000000..7783c477
--- /dev/null
+++ b/segmentation_models_pytorch/encoders/_xception.py
@@ -0,0 +1,235 @@
+"""
+Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
+
+@author: tstandley
+Adapted by cadene
+
+Creates an Xception Model as defined in:
+
+Francois Chollet
+Xception: Deep Learning with Depthwise Separable Convolutions
+https://arxiv.org/pdf/1610.02357.pdf
+
+These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
+
+Loss:0.9173 Prec@1:78.892 Prec@5:94.292
+
+REMEMBER to set your image size to 3x299x299 for both test and validation
+
+normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                  std=[0.5, 0.5, 0.5])
+
+The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+"""
+from __future__ import print_function, division, absolute_import
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+from torch.nn import init
+
+__all__ = ['xception']
+
+pretrained_settings = {  # architecture name -> weight-set name -> checkpoint metadata
+    "xception": {
+        "imagenet": {
+            "url": "http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth",
+            "input_space": "RGB",
+            "input_size": [3, 299, 299],
+            "input_range": [0, 1],
+            "mean": [0.5, 0.5, 0.5],
+            "std": [0.5, 0.5, 0.5],
+            "num_classes": 1000,
+            "scale": 0.8975,  # validation transform: resize to 333, then center crop at 299x299
+        },
+    },
+}
+
+
+class SeparableConv2d(nn.Module):
+    """Depthwise convolution (groups=in_channels) followed by a 1x1 pointwise convolution."""
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                 padding=0, dilation=1, bias=False):
+        super(SeparableConv2d, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride=stride,
+                               padding=padding, dilation=dilation, groups=in_channels, bias=bias)
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
+
+    def forward(self, x):
+        return self.pointwise(self.conv1(x))
+
+
+class Block(nn.Module):
+    """Residual stack of ReLU -> SeparableConv2d -> BatchNorm2d units.
+
+    Args:
+        in_filters, out_filters: channel counts of the input and of the output.
+        reps: number of units in the main branch.
+        strides: stride of the skip projection; if not 1, the main branch ends with MaxPool2d.
+        start_with_relu: if False, the leading ReLU of the first unit is dropped.
+        grow_first: put the in_filters -> out_filters unit first (True) or last (False).
+    """
+    def __init__(self, in_filters, out_filters, reps, strides=1,
+                 start_with_relu=True, grow_first=True):
+        super(Block, self).__init__()
+        # The skip path is projected (1x1 conv + BN) only when channels or stride change.
+        if out_filters == in_filters and strides == 1:
+            self.skip = None
+        else:
+            self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False)
+            self.skipbn = nn.BatchNorm2d(out_filters)
+
+        def unit(c_in, c_out):
+            return [nn.ReLU(inplace=True),
+                    SeparableConv2d(c_in, c_out, 3, stride=1, padding=1, bias=False),
+                    nn.BatchNorm2d(c_out)]
+
+        width = out_filters if grow_first else in_filters
+        layers = unit(in_filters, out_filters) if grow_first else []
+        for _ in range(reps - 1):
+            layers.extend(unit(width, width))
+        if not grow_first:
+            layers.extend(unit(in_filters, out_filters))
+
+        if start_with_relu:
+            # Not in-place: forward() reads `inp` again on the skip path.
+            layers[0] = nn.ReLU(inplace=False)
+        else:
+            layers = layers[1:]
+        if strides != 1:
+            layers.append(nn.MaxPool2d(3, strides, 1))
+        self.rep = nn.Sequential(*layers)
+
+    def forward(self, inp):
+        x = self.rep(inp)
+        if self.skip is None:
+            skip = inp
+        else:
+            skip = self.skipbn(self.skip(inp))
+        x += skip
+        return x
+
+
+class Xception(nn.Module):
+    """Xception classification network, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    The classifier is created as ``fc``; the ``xception`` factory in this module
+    renames it to ``last_linear``, so ``logits`` accepts either attribute.
+    """
+
+    def __init__(self, num_classes=1000):
+        """Create the layers.
+
+        Args:
+            num_classes: number of outputs of the final linear layer
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+
+        # Stem: two unpadded 3x3 convolutions, the first one with stride 2.
+        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        # Three stride-2 blocks widening the features from 64 to 728 channels.
+        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
+        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
+        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)
+
+        # Eight stride-1 blocks at a constant 728 channels.
+        self.block4 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block5 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block6 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block7 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block8 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block9 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block10 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block11 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+
+        # Final stride-2 block; grow_first=False puts the 728 -> 1024 unit last.
+        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)
+
+        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = nn.BatchNorm2d(1536)
+        self.relu3 = nn.ReLU(inplace=True)
+
+        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(2048)
+
+        # Classifier head; see the class docstring for the fc / last_linear naming.
+        self.fc = nn.Linear(2048, num_classes)
+
+    def features(self, input):
+        """Run all convolutional layers; ends at bn4 (the following ReLU is applied in logits)."""
+        x = self.conv1(input)
+        x = self.bn1(x)
+        x = self.relu1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+
+        x = self.conv4(x)
+        x = self.bn4(x)
+        return x
+
+    def logits(self, features):
+        """Map a feature map to class scores: in-place ReLU, global average pool, linear layer."""
+        x = nn.ReLU(inplace=True)(features)
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        # __init__ only creates `fc`; fall back to it so a directly constructed model
+        # does not raise AttributeError when `last_linear` has not been set.
+        classifier = self.last_linear if hasattr(self, 'last_linear') else self.fc
+        return classifier(x)
+
+    def forward(self, input):
+        """Apply `features` and then `logits` to `input`."""
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+
+def xception(num_classes=1000, pretrained='imagenet'):
+    """Build an Xception model, optionally loading the weights named by `pretrained`.
+
+    A falsy `pretrained` skips the download. Otherwise `num_classes` must equal the
+    `num_classes` of the selected settings, whose preprocessing fields are copied onto
+    the model. In both cases the classifier `fc` is re-exposed as `last_linear`.
+    """
+    model = Xception(num_classes=num_classes)  # built once; weights are loaded into it
+    if pretrained:
+        settings = pretrained_settings['xception'][pretrained]
+        assert num_classes == settings['num_classes'], \
+            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        model.load_state_dict(model_zoo.load_url(settings['url']))
+        for attr in ('input_space', 'input_size', 'input_range', 'mean', 'std'):
+            setattr(model, attr, settings[attr])
+
+    # Rename the classifier: `Xception.logits` looks for `last_linear`.
+    model.last_linear = model.fc
+    del model.fc
+    return model
diff --git a/segmentation_models_pytorch/encoders/dpn.py b/segmentation_models_pytorch/encoders/dpn.py
index 4fe84328..b5226d4d 100644
--- a/segmentation_models_pytorch/encoders/dpn.py
+++ b/segmentation_models_pytorch/encoders/dpn.py
@@ -27,9 +27,8 @@
 import torch.nn.functional as F
 from typing import List, Dict, Sequence
 
-from pretrainedmodels.models.dpn import DPN
-
 from ._base import EncoderMixin
+from ._dpn import DPN
 
 
 class DPNEncoder(DPN, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/inceptionresnetv2.py b/segmentation_models_pytorch/encoders/inceptionresnetv2.py
index 15bf6502..d7f83f9d 100644
--- a/segmentation_models_pytorch/encoders/inceptionresnetv2.py
+++ b/segmentation_models_pytorch/encoders/inceptionresnetv2.py
@@ -26,9 +26,9 @@
 import torch
 import torch.nn as nn
 from typing import List
-from pretrainedmodels.models.inceptionresnetv2 import InceptionResNetV2
 
 from ._base import EncoderMixin
+from ._inceptionresnetv2 import InceptionResNetV2
 
 
 class InceptionResNetV2Encoder(InceptionResNetV2, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/inceptionv4.py b/segmentation_models_pytorch/encoders/inceptionv4.py
index 12a7cc1b..3c335042 100644
--- a/segmentation_models_pytorch/encoders/inceptionv4.py
+++ b/segmentation_models_pytorch/encoders/inceptionv4.py
@@ -27,9 +27,9 @@
 import torch.nn as nn
 
 from typing import List
-from pretrainedmodels.models.inceptionv4 import InceptionV4
 
 from ._base import EncoderMixin
+from ._inceptionv4 import InceptionV4
 
 
 class InceptionV4Encoder(InceptionV4, EncoderMixin):
diff --git a/segmentation_models_pytorch/encoders/senet.py b/segmentation_models_pytorch/encoders/senet.py
index 18dbfd91..03cf0820 100644
--- a/segmentation_models_pytorch/encoders/senet.py
+++ b/segmentation_models_pytorch/encoders/senet.py
@@ -26,13 +26,13 @@
 import torch
 from typing import List, Dict, Sequence
 
-from pretrainedmodels.models.senet import (
+from ._base import EncoderMixin
+from ._senet import (
     SENet,
     SEBottleneck,
     SEResNetBottleneck,
     SEResNeXtBottleneck,
 )
-from ._base import EncoderMixin
 
 
 class SENetEncoder(SENet, EncoderMixin):
@@ -97,205 +97,3 @@ def load_state_dict(self, state_dict, **kwargs):
         state_dict.pop("last_linear.bias", None)
         state_dict.pop("last_linear.weight", None)
         super().load_state_dict(state_dict, **kwargs)
-
-
-pretrained_settings = {
-    "senet154": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet50": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet50-ce0d4300.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet101": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet101-7e38fcc6.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnet152": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnet152-d17c99b7.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnext50_32x4d": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-    "se_resnext101_32x4d": {
-        "imagenet": {
-            "url": "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext101_32x4d-3b2fe3d8.pth",
-            "input_space": "RGB",
-            "input_size": [3, 224, 224],
-            "input_range": [0, 1],
-            "mean": [0.485, 0.456, 0.406],
-            "std": [0.229, 0.224, 0.225],
-            "num_classes": 1000,
-        }
-    },
-}
-
-
-senet_encoders = {
-    "senet154": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/senet154.imagenet",
-                "revision": "249f45efc9881ba560a0c480128edbc34ab87e40",
-            }
-        },
-        "params": {
-            "out_channels": [3, 128, 256, 512, 1024, 2048],
-            "block": SEBottleneck,
-            "dropout_p": 0.2,
-            "groups": 64,
-            "layers": [3, 8, 36, 3],
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-    "se_resnet50": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/se_resnet50.imagenet",
-                "revision": "e6b4bc2dc85226c3d3474544410724a485455459",
-            }
-        },
-        "params": {
-            "out_channels": [3, 64, 256, 512, 1024, 2048],
-            "block": SEResNetBottleneck,
-            "layers": [3, 4, 6, 3],
-            "downsample_kernel_size": 1,
-            "downsample_padding": 0,
-            "dropout_p": None,
-            "groups": 1,
-            "inplanes": 64,
-            "input_3x3": False,
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-    "se_resnet101": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/se_resnet101.imagenet",
-                "revision": "71fe95cc0a27f444cf83671f354de02dc741b18b",
-            }
-        },
-        "params": {
-            "out_channels": [3, 64, 256, 512, 1024, 2048],
-            "block": SEResNetBottleneck,
-            "layers": [3, 4, 23, 3],
-            "downsample_kernel_size": 1,
-            "downsample_padding": 0,
-            "dropout_p": None,
-            "groups": 1,
-            "inplanes": 64,
-            "input_3x3": False,
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-    "se_resnet152": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/se_resnet152.imagenet",
-                "revision": "e79fc3d9d76f197bd76a2593c2054edf1083fe32",
-            }
-        },
-        "params": {
-            "out_channels": [3, 64, 256, 512, 1024, 2048],
-            "block": SEResNetBottleneck,
-            "layers": [3, 8, 36, 3],
-            "downsample_kernel_size": 1,
-            "downsample_padding": 0,
-            "dropout_p": None,
-            "groups": 1,
-            "inplanes": 64,
-            "input_3x3": False,
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-    "se_resnext50_32x4d": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/se_resnext50_32x4d.imagenet",
-                "revision": "73246406d879a2b0e3fdfe6fddd56347d38f38ae",
-            }
-        },
-        "params": {
-            "out_channels": [3, 64, 256, 512, 1024, 2048],
-            "block": SEResNeXtBottleneck,
-            "layers": [3, 4, 6, 3],
-            "downsample_kernel_size": 1,
-            "downsample_padding": 0,
-            "dropout_p": None,
-            "groups": 32,
-            "inplanes": 64,
-            "input_3x3": False,
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-    "se_resnext101_32x4d": {
-        "encoder": SENetEncoder,
-        "pretrained_settings": {
-            "imagenet": {
-                "repo_id": "smp-hub/se_resnext101_32x4d.imagenet",
-                "revision": "18808a4276f46421d358a9de554e0b93c2795df4",
-            }
-        },
-        "params": {
-            "out_channels": [3, 64, 256, 512, 1024, 2048],
-            "block": SEResNeXtBottleneck,
-            "layers": [3, 4, 23, 3],
-            "downsample_kernel_size": 1,
-            "downsample_padding": 0,
-            "dropout_p": None,
-            "groups": 32,
-            "inplanes": 64,
-            "input_3x3": False,
-            "num_classes": 1000,
-            "reduction": 16,
-        },
-    },
-}
diff --git a/segmentation_models_pytorch/encoders/xception.py b/segmentation_models_pytorch/encoders/xception.py
index 594636a4..5ed70c70 100644
--- a/segmentation_models_pytorch/encoders/xception.py
+++ b/segmentation_models_pytorch/encoders/xception.py
@@ -1,7 +1,7 @@
 from typing import List
-from pretrainedmodels.models.xception import Xception
 
 from ._base import EncoderMixin
+from ._xception import Xception
 
 
 class XceptionEncoder(Xception, EncoderMixin):