add yolox tutorial for imx500

sony · Oct 6, 2024 · 91baa46 · 91baa46
1 parent 508e8fa
commit 91baa46
Show file tree

Hide file tree

Showing 7 changed files with 1,948 additions and 0 deletions.
diff --git a/tutorials/mct_model_garden/models_pytorch/yolox/LICENSE b/tutorials/mct_model_garden/models_pytorch/yolox/LICENSE
diff --git a/tutorials/mct_model_garden/models_pytorch/yolox/__init__.py b/tutorials/mct_model_garden/models_pytorch/yolox/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
diff --git a/tutorials/mct_model_garden/models_pytorch/yolox/darknet.py b/tutorials/mct_model_garden/models_pytorch/yolox/darknet.py
@@ -0,0 +1,374 @@
+# ===========================================================================================
+# The following code was adopted from https://github.com/Megvii-BaseDetection/YOLOX
+# ===========================================================================================
+
+import torch
+from torch import nn
+
+
+def get_activation(name="silu", inplace=True):
+    if name == "silu":
+        module = nn.SiLU(inplace=inplace)
+    elif name == "relu":
+        module = nn.ReLU(inplace=inplace)
+    elif name == "lrelu":
+        module = nn.LeakyReLU(0.1, inplace=inplace)
+    else:
+        raise AttributeError("Unsupported act type: {}".format(name))
+    return module
+
+
+class BaseConv(nn.Module):
+    """A Conv2d -> Batchnorm -> silu/leaky relu block"""
+
+    def __init__(
+        self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
+    ):
+        super().__init__()
+        # same padding
+        pad = (ksize - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias=bias,
+        )
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+    def fuseforward(self, x):
+        return self.act(self.conv(x))
+
+
+class DWConv(nn.Module):
+    """Depthwise Conv + Conv"""
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
+        super().__init__()
+        self.dconv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,
+            act=act,
+        )
+        self.pconv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act
+        )
+
+    def forward(self, x):
+        x = self.dconv(x)
+        return self.pconv(x)
+
+
+class Bottleneck(nn.Module):
+    # Standard bottleneck
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act="silu",
+    ):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        Conv = DWConv if depthwise else BaseConv
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        if self.use_add:
+            y = y + x
+        return y
+
+
+class ResLayer(nn.Module):
+    "Residual layer with `in_channels` inputs."
+
+    def __init__(self, in_channels: int):
+        super().__init__()
+        mid_channels = in_channels // 2
+        self.layer1 = BaseConv(
+            in_channels, mid_channels, ksize=1, stride=1, act="lrelu"
+        )
+        self.layer2 = BaseConv(
+            mid_channels, in_channels, ksize=3, stride=1, act="lrelu"
+        )
+
+    def forward(self, x):
+        out = self.layer2(self.layer1(x))
+        return x + out
+
+
+class SPPBottleneck(nn.Module):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+    def __init__(
+        self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
+    ):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
+        self.m = nn.ModuleList(
+            [
+                nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+                for ks in kernel_sizes
+            ]
+        )
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = torch.cat([x] + [m(x) for m in self.m], dim=1)
+        x = self.conv2(x)
+        return x
+
+
+class CSPLayer(nn.Module):
+    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act="silu",
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
+        module_list = [
+            Bottleneck(
+                hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
+            )
+            for _ in range(n)
+        ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        x_1 = self.m(x_1)
+        x = torch.cat((x_1, x_2), dim=1)
+        return self.conv3(x)
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space."""
+
+    def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
+        super().__init__()
+        self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
+        patch_top_left = x[..., ::2, ::2]
+        patch_top_right = x[..., ::2, 1::2]
+        patch_bot_left = x[..., 1::2, ::2]
+        patch_bot_right = x[..., 1::2, 1::2]
+        x = torch.cat(
+            (
+                patch_top_left,
+                patch_bot_left,
+                patch_top_right,
+                patch_bot_right,
+            ),
+            dim=1,
+        )
+        return self.conv(x)
+
+
+class Darknet(nn.Module):
+    # number of blocks from dark2 to dark5.
+    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
+
+    def __init__(
+        self,
+        depth,
+        in_channels=3,
+        stem_out_channels=32,
+        out_features=("dark3", "dark4", "dark5"),
+    ):
+        """
+        Args:
+            depth (int): depth of darknet used in model, usually use [21, 53] for this param.
+            in_channels (int): number of input channels, for example, use 3 for RGB image.
+            stem_out_channels (int): number of output channels of darknet stem.
+                It decides channels of darknet layer2 to layer5.
+            out_features (Tuple[str]): desired output layer name.
+        """
+        super().__init__()
+        assert out_features, "please provide output features of Darknet"
+        self.out_features = out_features
+        self.stem = nn.Sequential(
+            BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
+            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+        )
+        in_channels = stem_out_channels * 2  # 64
+
+        num_blocks = Darknet.depth2blocks[depth]
+        # create darknet with `stem_out_channels` and `num_blocks` layers.
+        # to make model structure more clear, we don't use `for` statement in python.
+        self.dark2 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[0], stride=2)
+        )
+        in_channels *= 2  # 128
+        self.dark3 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[1], stride=2)
+        )
+        in_channels *= 2  # 256
+        self.dark4 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[2], stride=2)
+        )
+        in_channels *= 2  # 512
+
+        self.dark5 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[3], stride=2),
+            *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
+        )
+
+    def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
+        "starts with conv layer then has `num_blocks` `ResLayer`"
+        return [
+            BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
+            *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)],
+        ]
+
+    def make_spp_block(self, filters_list, in_filters):
+        m = nn.Sequential(
+            *[
+                BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
+                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+                SPPBottleneck(
+                    in_channels=filters_list[1],
+                    out_channels=filters_list[0],
+                    activation="lrelu",
+                ),
+                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+                BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
+            ]
+        )
+        return m
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs["stem"] = x
+        x = self.dark2(x)
+        outputs["dark2"] = x
+        x = self.dark3(x)
+        outputs["dark3"] = x
+        x = self.dark4(x)
+        outputs["dark4"] = x
+        x = self.dark5(x)
+        outputs["dark5"] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}
+
+
+class CSPDarknet(nn.Module):
+    def __init__(
+        self,
+        dep_mul,
+        wid_mul,
+        out_features=("dark3", "dark4", "dark5"),
+        depthwise=False,
+        act="silu",
+    ):
+        super().__init__()
+        assert out_features, "please provide output features of Darknet"
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        base_channels = int(wid_mul * 64)  # 64
+        base_depth = max(round(dep_mul * 3), 1)  # 3
+
+        # stem
+        self.stem = Focus(3, base_channels, ksize=3, act=act)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, base_channels * 2, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 2,
+                base_channels * 2,
+                n=base_depth,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark3
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 4,
+                base_channels * 4,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark4
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 8,
+                base_channels * 8,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+        # dark5
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
+            SPPBottleneck(base_channels * 16, base_channels * 16, activation=act),
+            CSPLayer(
+                base_channels * 16,
+                base_channels * 16,
+                n=base_depth,
+                shortcut=False,
+                depthwise=depthwise,
+                act=act,
+            ),
+        )
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs["stem"] = x
+        x = self.dark2(x)
+        outputs["dark2"] = x
+        x = self.dark3(x)
+        outputs["dark3"] = x
+        x = self.dark4(x)
+        outputs["dark4"] = x
+        x = self.dark5(x)
+        outputs["dark5"] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}