diff --git a/src/yomitoku/models/layers/activate.py b/src/yomitoku/models/layers/activate.py
index 12ea09b..66d9507 100644
--- a/src/yomitoku/models/layers/activate.py
+++ b/src/yomitoku/models/layers/activate.py
@@ -1,3 +1,16 @@
+# Copyright(c) 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import torch.nn as nn
 
 
diff --git a/src/yomitoku/models/layers/rtdetr_backbone.py b/src/yomitoku/models/layers/rtdetr_backbone.py
index 7bb8720..d7a60c3 100644
--- a/src/yomitoku/models/layers/rtdetr_backbone.py
+++ b/src/yomitoku/models/layers/rtdetr_backbone.py
@@ -1,4 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from collections import OrderedDict
 
@@ -47,7 +59,9 @@ def forward(self, x):
 class BasicBlock(nn.Module):
     expansion = 1
 
-    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
+    def __init__(
+        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
+    ):
         super().__init__()
 
         self.shortcut = shortcut
@@ -86,7 +100,9 @@ def forward(self, x):
 class BottleNeck(nn.Module):
     expansion = 4
 
-    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
+    def __init__(
+        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
+    ):
         super().__init__()
 
         if variant == "a":
@@ -109,13 +125,17 @@ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
                             ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                             (
                                 "conv",
-                                ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
+                                ConvNormLayer(
+                                    ch_in, ch_out * self.expansion, 1, 1
+                                ),
                             ),
                         ]
                     )
                 )
             else:
-                self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
+                self.short = ConvNormLayer(
+                    ch_in, ch_out * self.expansion, 1, stride
+                )
 
         self.act = nn.Identity() if act is None else get_activation(act)
 
@@ -136,7 +156,9 @@ def forward(self, x):
 
 
 class Blocks(nn.Module):
-    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
+    def __init__(
+        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
+    ):
         super().__init__()
 
         self.blocks = nn.ModuleList()
diff --git a/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py b/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py
index 1d6c4c5..addaef2 100644
--- a/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py
+++ b/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py
@@ -1,4 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 from collections import OrderedDict
@@ -240,7 +252,9 @@ def __init__(
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
+                    nn.Conv2d(
+                        in_channel, hidden_dim, kernel_size=1, bias=False
+                    ),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -276,7 +290,9 @@ def __init__(
 
         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
+                TransformerEncoder(
+                    copy.deepcopy(encoder_layer), num_encoder_layers
+                )
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -331,7 +347,9 @@ def _reset_parameters(self):
                 # self.register_buffer(f'pos_embed{idx}', pos_embed)
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+    def build_2d_sincos_position_embedding(
+        w, h, embed_dim=256, temperature=10000.0
+    ):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -369,7 +387,9 @@ def forward(self, feats):
                         src_flatten.device
                     )
 
-                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                memory: torch.Tensor = self.encoder[i](
+                    src_flatten, pos_embed=pos_embed
+                )
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)
@@ -381,9 +401,13 @@ def forward(self, feats):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                feat_heigh
+            )
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
+            upsample_feat = F.interpolate(
+                feat_heigh, scale_factor=2.0, mode="nearest"
+            )
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
             )
diff --git a/src/yomitoku/models/layers/rtdetrv2_decoder.py b/src/yomitoku/models/layers/rtdetrv2_decoder.py
index 2532218..f7db4d7 100644
--- a/src/yomitoku/models/layers/rtdetrv2_decoder.py
+++ b/src/yomitoku/models/layers/rtdetrv2_decoder.py
@@ -1,4 +1,17 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Scene Text Recognition Model Hub
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 import functools
@@ -27,7 +40,9 @@ def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
 
 
 class MLP(nn.Module):
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"):
+    def __init__(
+        self, input_dim, hidden_dim, output_dim, num_layers, act="relu"
+    ):
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
@@ -178,7 +193,9 @@ def forward(
         elif reference_points.shape[-1] == 4:
             # reference_points [8, 480, None, 1, 4]
             # sampling_offsets [8, 480, 8, 12, 2]
-            num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
+            num_points_scale = self.num_points_scale.to(
+                dtype=query.dtype
+            ).unsqueeze(-1)
             offset = (
                 sampling_offsets
                 * num_points_scale
@@ -313,7 +330,9 @@ def deformable_attention_core_func_v2(
     _, Len_q, _, _, _ = sampling_locations.shape
 
     split_shape = [h * w for h, w in value_spatial_shapes]
-    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
+    value_list = (
+        value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
+    )
 
     # sampling_offsets [8, 480, 8, 12, 2]
     if method == "default":
@@ -342,7 +361,8 @@ def deformable_attention_core_func_v2(
         elif method == "discrete":
             # n * m, seq, n, 2
             sampling_coord = (
-                sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
+                sampling_grid_l * torch.tensor([[w, h]], device=value.device)
+                + 0.5
             ).to(torch.int64)
 
             # FIX ME? for rectangle input
@@ -369,7 +389,9 @@ def deformable_attention_core_func_v2(
     attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(
         bs * n_head, 1, Len_q, sum(num_points_list)
     )
-    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
+    weighted_sample_locs = (
+        torch.concat(sampling_value_list, dim=-1) * attn_weights
+    )
     output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)
 
     return output.permute(0, 2, 1)
@@ -584,7 +606,9 @@ def _build_input_proj_layer(self, feat_channels):
                         [
                             (
                                 "conv",
-                                nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False),
+                                nn.Conv2d(
+                                    in_channels, self.hidden_dim, 1, bias=False
+                                ),
                             ),
                             (
                                 "norm",
@@ -665,9 +689,13 @@ def _generate_anchors(
                 torch.arange(h), torch.arange(w), indexing="ij"
             )
             grid_xy = torch.stack([grid_x, grid_y], dim=-1)
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor(
+                [w, h], dtype=dtype
+            )
             wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
-            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
+            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(
+                -1, h * w, 4
+            )
             anchors.append(lvl_anchors)
 
         anchors = torch.concat(anchors, dim=1).to(device)
@@ -701,18 +729,22 @@ def _get_decoder_input(
         )
 
         enc_topk_bboxes_list, enc_topk_logits_list = [], []
-        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = self._select_topk(
-            output_memory,
-            enc_outputs_logits,
-            enc_outputs_coord_unact,
-            self.num_queries,
+        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = (
+            self._select_topk(
+                output_memory,
+                enc_outputs_logits,
+                enc_outputs_coord_unact,
+                self.num_queries,
+            )
         )
 
         # if self.num_select_queries != self.num_queries:
         #     raise NotImplementedError('')
 
         if self.learn_query_content:
-            content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
+            content = self.tgt_embed.weight.unsqueeze(0).tile(
+                [memory.shape[0], 1, 1]
+            )
         else:
             content = enc_topk_memory.detach()
 
@@ -739,7 +771,9 @@ def _select_topk(
         topk: int,
     ):
         if self.query_select_method == "default":
-            _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)
+            _, topk_ind = torch.topk(
+                outputs_logits.max(-1).values, topk, dim=-1
+            )
 
         elif self.query_select_method == "one2many":
             _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)
@@ -752,12 +786,16 @@ def _select_topk(
 
         topk_coords = outputs_coords_unact.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]),
+            index=topk_ind.unsqueeze(-1).repeat(
+                1, 1, outputs_coords_unact.shape[-1]
+            ),
         )
 
         topk_logits = outputs_logits.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]),
+            index=topk_ind.unsqueeze(-1).repeat(
+                1, 1, outputs_logits.shape[-1]
+            ),
         )
 
         topk_memory = memory.gather(
diff --git a/src/yomitoku/postprocessor/rtdetr_postprocessor.py b/src/yomitoku/postprocessor/rtdetr_postprocessor.py
index e6472e1..33894b7 100644
--- a/src/yomitoku/postprocessor/rtdetr_postprocessor.py
+++ b/src/yomitoku/postprocessor/rtdetr_postprocessor.py
@@ -1,4 +1,17 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 
 import torch
 import torch.nn as nn
@@ -41,12 +54,16 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
         # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
 
-        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
+        bbox_pred = torchvision.ops.box_convert(
+            boxes, in_fmt="cxcywh", out_fmt="xyxy"
+        )
         bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
 
         if self.use_focal_loss:
             scores = F.sigmoid(logits)
-            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+            scores, index = torch.topk(
+                scores.flatten(1), self.num_top_queries, dim=-1
+            )
             # TODO for older tensorrt
             # labels = index % self.num_classes
             labels = mod(index, self.num_classes)
@@ -60,7 +77,9 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
             scores = F.softmax(logits)[:, :, :-1]
             scores, labels = scores.max(dim=-1)
             if scores.shape[1] > self.num_top_queries:
-                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                scores, index = torch.topk(
+                    scores, self.num_top_queries, dim=-1
+                )
                 labels = torch.gather(labels, dim=1, index=index)
                 boxes = torch.gather(
                     boxes,
@@ -78,7 +97,10 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
 
             labels = (
                 torch.tensor(
-                    [mscoco_label2category[int(x.item())] for x in labels.flatten()]
+                    [
+                        mscoco_label2category[int(x.item())]
+                        for x in labels.flatten()
+                    ]
                 )
                 .to(boxes.device)
                 .reshape(labels.shape)