diff --git a/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100644 index 0000000..d3571ab --- /dev/null +++ b/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA-Kinetics_SLOWFAST_R101_ACAR_HR2O_DEPTH1.pth.tar + +result_path: experiments/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. + - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100755 index 0000000..34d21fb --- /dev/null +++ b/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,109 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R101_K700.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 1. + - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 5 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [4.6, 4.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
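A note on the two normalization conventions used across these configs: the R101 configs pair `norm_value: 1.` with channel statistics in raw 0-255 pixel units, while the R50 configs further down pair `norm_value: 255.` with Kinetics-style `mean=0.45, std=0.225`. A minimal sketch of how the two conventions line up, assuming `ToTensor` divides pixel values by `norm_value` before `Normalize` applies `(x - mean) / std` per channel (the transform implementations themselves are not part of this patch):

```python
import numpy as np

pixel = np.array([128.0, 128.0, 128.0])  # one mid-gray RGB pixel in 0-255 range

# norm_value 1. : keep the 0-255 range, normalize with raw-pixel statistics
mean_255 = np.array([110.63666788, 103.16065604, 96.29023126])
std_255 = np.array([38.7568578, 37.88248729, 40.02898126])
out_r101 = (pixel / 1.0 - mean_255) / std_255

# norm_value 255. : rescale to [0, 1], normalize with 0.45 / 0.225
out_r50 = (pixel / 255.0 - 0.45) / 0.225

print(out_r101, out_r50)  # same idea, different unit conventions
```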
+ - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml b/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml new file mode 100755 index 0000000..f08c26a --- /dev/null +++ b/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml @@ -0,0 +1,111 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R50_ACAR_HR2O +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R50_K400.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 2 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 6 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [5.6, 5.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. 
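The `train` section above describes a warmup-then-step learning-rate policy with fractional milestone epochs. The scheduler itself is not included in this patch, so the following is only a sketch of the schedule those fields appear to encode (linear warmup from `base_lr` to `warmup_lr` over `warmup_epochs`, then an `lr_mults` decay at each milestone); treat the exact semantics as an assumption:

```python
def lr_at(epoch, base_lr=0.008, warmup_lr=0.064, warmup_epochs=1,
          milestone_epochs=(4.6, 4.8), lr_mults=(0.1, 0.1)):
    """Hypothetical LR curve implied by the scheduler fields above."""
    if epoch < warmup_epochs:
        # linear ramp from base_lr up to warmup_lr
        return base_lr + (warmup_lr - base_lr) * epoch / warmup_epochs
    lr = warmup_lr
    for milestone, mult in zip(milestone_epochs, lr_mults):
        if epoch >= milestone:
            lr *= mult  # step decay at each (fractional) milestone epoch
    return lr

for e in (0.0, 0.5, 1.0, 4.6, 4.8):
    print(e, lr_at(e))  # 0.008 -> 0.064, then x0.1 at epochs 4.6 and 4.8
```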
+ - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R50_baseline.yaml b/configs/AVA/SLOWFAST_R50_baseline.yaml new file mode 100755 index 0000000..91ca7e6 --- /dev/null +++ b/configs/AVA/SLOWFAST_R50_baseline.yaml @@ -0,0 +1,110 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R50_baseline +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R50_K400.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: linear + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 6 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [5.6, 5.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100755 index 0000000..4c0f5b2 --- /dev/null +++ b/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.pth.tar + +result_path: experiments/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
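The baseline config above swaps the head by name (`type: linear` here, `type: acar` in the other configs). The `heads`, `necks`, and `backbones` packages introduced later in this patch all resolve such names the same way, via a `model_entry` helper that looks the string up in module globals and forwards `kwargs`. A self-contained toy version of that registry pattern:

```python
def linear(**kwargs):            # stand-ins for the real head constructors
    return ('LinearHead', kwargs)

def acar(**kwargs):
    return ('ACARHead', kwargs)

def model_entry(config):
    # same lookup used by models/heads/__init__.py and models/necks/__init__.py
    return globals()[config['type']](**config['kwargs'])

head_cfg = {'type': 'linear', 'kwargs': {'width': 2304, 'num_classes': 60}}
print(model_entry(head_cfg))     # ('LinearHead', {'width': 2304, 'num_classes': 60})
```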
+ - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml b/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml new file mode 100755 index 0000000..374a7cc --- /dev/null +++ b/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml @@ -0,0 +1,69 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA_SLOWFAST_R50_ACAR_HR2O.pth.tar + +result_path: experiments/AVA/eval_SLOWFAST_R50_ACAR_HR2O +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 2 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/ROAD.yml b/configs/ROAD.yml new file mode 100644 index 0000000..020c0eb --- /dev/null +++ b/configs/ROAD.yml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: kinetics-pt/SLOWFAST_R101_K700.pth.tar + +result_path: experiments/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 2 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
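`configs/ROAD.yml`, which begins above, is consumed differently from the other configs: the modified `resnetFPN.py` later in this patch loads it with PyYAML and wraps it in an `EasyDict` so that nested keys become attributes. A short sketch of that access pattern:

```python
import yaml
from easydict import EasyDict

# same loading code used in models/resnetFPN.py below
with open('configs/ROAD.yml') as f:
    opt = EasyDict(yaml.load(f, Loader=yaml.FullLoader))

print(opt.model.backbone.arch)          # slowfast101
print(opt.model.backbone.kwargs.alpha)  # 2 in ROAD.yml (the AVA configs use 4)
```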
+      - type: Normalize
+        kwargs:
+          mean: [110.63666788, 103.16065604, 96.29023126]
+          std: [38.7568578, 37.88248729, 40.02898126]
+    temporal:
+      type: TemporalCenterCrop
+      kwargs:
+        size: 64
+        step: 2
+
+  with_label: False
+  eval_mAP:
+    labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt
+    groundtruth: annotations/ava_val_v2.2.csv
+    exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv
diff --git a/main.py b/main.py
index 8f65e46..2753e3a 100644
--- a/main.py
+++ b/main.py
@@ -225,7 +225,7 @@ def main():
         full_test = True #args.MODE != 'train'
         args.skip_beggning = 0
         args.skip_ending = 0
-        if args.MODEL_TYPE == 'I3D':
+        if args.MODEL_TYPE == 'I3D' or args.MODEL_TYPE == 'SlowFast':
             args.skip_beggning = 2
             args.skip_ending = 2
         elif args.MODEL_TYPE != 'C2D':
diff --git a/models/__init__ .py b/models/__init__ .py
new file mode 100755
index 0000000..3eedcb8
--- /dev/null
+++ b/models/__init__ .py
@@ -0,0 +1,106 @@
+import torch
+import torch.nn as nn
+
+from .backbones import AVA_backbone
+from .necks import AVA_neck
+from .heads import AVA_head
+
+
+class AVA_model(nn.Module):
+    def __init__(self, config):
+        super(AVA_model, self).__init__()
+        self.config = config
+
+        self.backbone = AVA_backbone(config.backbone)
+        self.neck = AVA_neck(config.neck)
+        self.head = AVA_head(config.head)
+
+    def forward(self, data, evaluate=False):
+        if not evaluate:  # train mode
+            i_b = {'clips': data['clips']}
+            o_b = self.backbone(i_b)
+
+            i_n = {'aug_info': data['aug_info'], 'labels': data['labels'],
+                   'filenames': data['filenames'], 'mid_times': data['mid_times']}
+            o_n = self.neck(i_n)
+
+            if o_n['num_rois'] == 0:
+                return {'outputs': None, 'targets': o_n['targets'],
+                        'num_rois': 0, 'filenames': o_n['filenames'],
+                        'mid_times': o_n['mid_times'], 'bboxes': o_n['bboxes']}
+
+            i_h = {'features': o_b['features'], 'rois': o_n['rois'],
+                   'num_rois': o_n['num_rois'], 'roi_ids': o_n['roi_ids'],
+                   'sizes_before_padding': o_n['sizes_before_padding']}
+            o_h = self.head(i_h)
+
+            return {'outputs': o_h['outputs'], 'targets': o_n['targets'],
+                    'num_rois': o_n['num_rois'], 'filenames': o_n['filenames'],
+                    'mid_times': o_n['mid_times'], 'bboxes': o_n['bboxes']}
+
+        # eval mode
+        assert not self.training
+
+        noaug_info = [{'crop_box': [0., 0., 1., 1.], 'flip': False, 'pad_ratio': [1., 1.]}] * len(data['labels'])
+        i_n = {'aug_info': noaug_info, 'labels': data['labels'],
+               'filenames': data['filenames'], 'mid_times': data['mid_times']}
+        o = self.neck(i_n)
+
+        output_list = [None] * len(o['filenames'])
+        cnt_list = [0] * len(o['filenames'])
+
+        for no in range(len(data['clips'])):
+            i_b = {'clips': data['clips'][no]}
+            o_b = self.backbone(i_b)
+
+            i_n = {'aug_info': data['aug_info'][no], 'labels': data['labels'],
+                   'filenames': data['filenames'], 'mid_times': data['mid_times']}
+            o_n = self.neck(i_n)
+
+            if o_n['num_rois'] == 0:
+                continue
+            ids = o_n['bbox_ids']
+
+            i_h = {'features': o_b['features'], 'rois': o_n['rois'],
+                   'num_rois': o_n['num_rois'], 'roi_ids': o_n['roi_ids'],
+                   'sizes_before_padding': o_n['sizes_before_padding']}
+            o_h = self.head(i_h)
+
+            outputs = o_h['outputs']
+            for idx in range(o_n['num_rois']):
+                if cnt_list[ids[idx]] == 0:
+                    output_list[ids[idx]] = outputs[idx]
+                else:
+                    output_list[ids[idx]] += outputs[idx]
+                cnt_list[ids[idx]] += 1
+
+        num_rois, filenames, mid_times, bboxes, targets, outputs = 0, [], [], [], [], []
+        for idx in range(len(o['filenames'])):
+            if cnt_list[idx] == 0:
+                continue
+            num_rois += 1
+            filenames.append(o['filenames'][idx])
+            mid_times.append(o['mid_times'][idx])
+
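+            # multi-crop fusion: output_list[idx] was summed over every
+            # augmented clip in which this box produced a prediction, so the
+            # division by cnt_list[idx] below recovers the mean per box;
+            # boxes with cnt 0 were never scored and are skipped above.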
bboxes.append(o['bboxes'][idx]) + targets.append(o['targets'][idx]) + outputs.append(output_list[idx] / float(cnt_list[idx])) + + if num_rois == 0: + return {'outputs': None, 'targets': None, 'num_rois': 0, + 'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes} + + final_outputs = torch.stack(outputs, dim=0) + final_targets = torch.stack(targets, dim=0) + return {'outputs': final_outputs, 'targets': final_targets, 'num_rois': num_rois, + 'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes} + + def train(self, mode=True): + super(AVA_model, self).train(mode) + + if mode and self.config.get('freeze_bn', False): + def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + self.backbone.apply(set_bn_eval) diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/backbone_models.py b/models/backbone_models.py index 19faad2..c9826cd 100644 --- a/models/backbone_models.py +++ b/models/backbone_models.py @@ -30,9 +30,8 @@ def backbone_models(args): model.identity_state_dict() if MODEL_TYPE.startswith('RCGRU') or MODEL_TYPE.startswith('RCLSTM'): model.recurrent_conv_zero_state() - - load_dict = torch.load(args.MODEL_PATH) - - model.load_my_state_dict(load_dict) + if not MODEL_TYPE.startswith('SlowFast'): + load_dict = torch.load(args.MODEL_PATH) + model.load_my_state_dict(load_dict) return model diff --git a/models/backbones/__init__.py b/models/backbones/__init__.py new file mode 100755 index 0000000..1a8fa4e --- /dev/null +++ b/models/backbones/__init__.py @@ -0,0 +1,31 @@ +import torch.nn as nn + +from .slowfast import * +from utils import load_pretrain + + +def model_entry(config): + return globals()[config['arch']](**config['kwargs']) + + +class AVA_backbone(nn.Module): + def __init__(self, config): + super(AVA_backbone, self).__init__() + + self.config = config + self.module = model_entry(config.model.backbone) + print(config.get('pretrain', None)) + if config.get('pretrain', None) is not None: + load_pretrain(config.pretrain, self.module) + + if not config.get('learnable', True): + self.module.requires_grad_(False) + + # data: clips + # returns: features + def forward(self, data): + # inputs = data['clips'] + inputs = data + inputs = inputs.cuda() + features = self.module(inputs) + return features diff --git a/models/backbones/slowfast.py b/models/backbones/slowfast.py new file mode 100755 index 0000000..e6518ec --- /dev/null +++ b/models/backbones/slowfast.py @@ -0,0 +1,260 @@ +""" +References: +[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), +[PySlowFast](https://github.com/facebookresearch/slowfast). 
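+
+Two hyper-parameters control the two pathways (see SlowPath/FastPath below):
+`alpha` is the frame-rate ratio (the slow pathway reads every alpha-th frame
+of the fast pathway's input), and `beta` is the channel ratio (the fast
+pathway uses beta times the slow pathway's channel width).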
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +BN = nn.BatchNorm3d + +__all__ = ['slowfast50', 'slowfast101', 'slowfast152', 'slowfast200'] + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1, head_conv=1): + super(Bottleneck, self).__init__() + if head_conv == 1: + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BN(planes) + elif head_conv == 3: + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(3, 1, 1), bias=False, padding=(1, 0, 0)) + self.bn1 = BN(planes) + else: + raise ValueError("Unsupported head_conv!") + self.conv2 = nn.Conv3d( + planes, planes, kernel_size=(1, 3, 3), stride=(1, stride, stride), + padding=(0, dilation, dilation), dilation=(1, dilation, dilation), bias=False) + self.bn2 = BN(planes) + self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BN(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + if downsample is not None: + self.downsample_bn = BN(planes * 4) + self.stride = stride + # self.alpha = 1 + + def forward(self, x): + res = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + res = self.downsample(x) + res = self.downsample_bn(res) + + out = out + res + out = self.relu(out) + + return out + + +class SlowFast(nn.Module): + def __init__(self, block, layers, alpha=8, beta=0.125, fuse_only_conv=True, fuse_kernel_size=5, slow_full_span=False): + super(SlowFast, self).__init__() + + self.alpha = alpha + self.beta = beta + self.slow_full_span = slow_full_span + + '''Fast Network''' + self.fast_inplanes = int(64 * beta) + self.fast_conv1 = nn.Conv3d(3, self.fast_inplanes, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False) + self.fast_bn1 = BN(self.fast_inplanes) + self.fast_relu = nn.ReLU(inplace=True) + self.fast_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + self.fast_res1 = self._make_layer_fast(block, int(64 * beta), layers[0], head_conv=3) + self.fast_res2 = self._make_layer_fast(block, int(128 * beta), layers[1], stride=2, head_conv=3) + self.fast_res3 = self._make_layer_fast(block, int(256 * beta), layers[2], stride=2, head_conv=3) + self.fast_res4 = self._make_layer_fast(block, int(512 * beta), layers[3], head_conv=3, dilation=2) + + '''Slow Network''' + self.slow_inplanes = 64 + self.slow_conv1 = nn.Conv3d(3, self.slow_inplanes, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False) + self.slow_bn1 = BN(self.slow_inplanes) + self.slow_relu = nn.ReLU(inplace=True) + self.slow_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + self.slow_res1 = self._make_layer_slow(block, 64, layers[0], head_conv=1) + self.slow_res2 = self._make_layer_slow(block, 128, layers[1], stride=2, head_conv=1) + self.slow_res3 = self._make_layer_slow(block, 256, layers[2], stride=2, head_conv=3) + self.slow_res4 = self._make_layer_slow(block, 512, layers[3], head_conv=3, dilation=2) + + + + '''Lateral Connections''' + fuse_padding = fuse_kernel_size // 2 + fuse_kwargs = {'kernel_size': (fuse_kernel_size, 1, 1), 'stride': (alpha, 1, 1), 'padding': (fuse_padding, 0, 0), 'bias': False} + if fuse_only_conv: + def fuse_func(in_channels, out_channels): + return nn.Conv3d(in_channels, out_channels, 
**fuse_kwargs) + else: + def fuse_func(in_channels, out_channels): + return nn.Sequential( + nn.Conv3d(in_channels, out_channels, **fuse_kwargs), + BN(out_channels), + nn.ReLU(inplace=True) + ) + self.Tconv1 = fuse_func(int(64 * beta), int(128 * beta)) + self.Tconv2 = fuse_func(int(256 * beta), int(512 * beta)) + self.Tconv3 = fuse_func(int(512 * beta), int(1024 * beta)) + self.Tconv4 = fuse_func(int(1024 * beta), int(2048 * beta)) + # for input in []: + # self.slow_conv1_1 = nn.Conv3d(3, self.fast_inplanes, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False) + self.pool2 = nn.MaxPool3d(kernel_size=( + 2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + def _upsample(self, x, y): + _, _, t, h, w = y.size() + # print('spatial', x.shape, y.shape) + x_upsampled = F.interpolate(x, [t, h, w], mode='nearest') + + return x_upsampled + + + def forward(self, input): + fast, Tc = self.FastPath(input) + # print('alpha',self.alpha) + + + if self.slow_full_span: + slow_input = torch.index_select( + input, + 2, + torch.linspace( + 0, + input.shape[2] - 1, + input.shape[2] // self.alpha, + ).long().cuda(), + ) + else: + slow_input = input[:, :, ::self.alpha, :, :] + slow = self.SlowPath(slow_input, Tc) + + + fast[0] = self.pool2(fast[0]) + fast[1] = self.pool2(fast[1]) + fast[2] = self.pool2(fast[2]) + + outFeat = [] + for sitem,fitem in zip(slow,fast): + outFeat.append(torch.cat((sitem,fitem),1)) + # print(outFeat[-1].shape) + return outFeat + + def SlowPath(self, input, Tc): + # print('slowinpdi',input.shape) + x = self.slow_conv1(input) + x = self.slow_bn1(x) + x = self.slow_relu(x) + x = self.slow_maxpool(x) + # print('x',x.shape) + x = torch.cat([x, Tc[0]], dim=1) + x = self.slow_res1(x) + x = torch.cat([x, Tc[1]], dim=1) + c3 = self.slow_res2(x) + x = torch.cat([c3, Tc[2]], dim=1) + c4 = self.slow_res3(x) + x = torch.cat([c4, Tc[3]], dim=1) + c5 = self.slow_res4(x) + + return [c3,c4,c5] + + def FastPath(self, input): + x = self.fast_conv1(input) + x = self.fast_bn1(x) + x = self.fast_relu(x) + x = self.fast_maxpool(x) + Tc1 = self.Tconv1(x) + x = self.fast_res1(x) + Tc2 = self.Tconv2(x) + c3 = self.fast_res2(x) + Tc3 = self.Tconv3(c3) + c4 = self.fast_res3(c3) + Tc4 = self.Tconv4(c4) + c5 = self.fast_res4(c4) + return [c3,c4,c5], [Tc1, Tc2, Tc3, Tc4] + + def _make_layer_fast(self, block, planes, blocks, stride=1, head_conv=1, dilation=1): + downsample = None + if stride != 1 or self.fast_inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv3d( + self.fast_inplanes, + planes * block.expansion, + kernel_size=1, + stride=(1, stride, stride), + bias=False + ) + ) + + layers = [] + layers.append(block(self.fast_inplanes, planes, stride, downsample, dilation=dilation, head_conv=head_conv)) + self.fast_inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.fast_inplanes, planes, dilation=dilation, head_conv=head_conv)) + + return nn.Sequential(*layers) + + def _make_layer_slow(self, block, planes, blocks, stride=1, head_conv=1, dilation=1): + downsample = None + fused_inplanes = self.slow_inplanes + int(self.slow_inplanes * self.beta) * 2 + if stride != 1 or fused_inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv3d( + fused_inplanes, + planes * block.expansion, + kernel_size=1, + stride=(1, stride, stride), + bias=False + ) + ) + + layers = [] + layers.append(block(fused_inplanes, planes, stride, downsample, dilation=dilation, head_conv=head_conv)) + self.slow_inplanes = planes * block.expansion + for 
i in range(1, blocks): + layers.append(block(self.slow_inplanes, planes, dilation=dilation, head_conv=head_conv)) + + return nn.Sequential(*layers) + + +def slowfast50(**kwargs): + """Constructs a SlowFast-50 model. + """ + model = SlowFast(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def slowfast101(**kwargs): + """Constructs a SlowFast-101 model. + """ + model = SlowFast(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def slowfast152(**kwargs): + """Constructs a SlowFast-152 model. + """ + model = SlowFast(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + +def slowfast200(**kwargs): + """Constructs a SlowFast-200 model. + """ + model = SlowFast(Bottleneck, [3, 24, 36, 3], **kwargs) + return model diff --git a/models/heads/__init__.py b/models/heads/__init__.py new file mode 100755 index 0000000..1009d70 --- /dev/null +++ b/models/heads/__init__.py @@ -0,0 +1,17 @@ +import torch.nn as nn + +from .linear import * +from .acar import * + + +def model_entry(config): + return globals()[config['type']](**config['kwargs']) + + +class AVA_head(nn.Module): + def __init__(self, config): + super(AVA_head, self).__init__() + self.module = model_entry(config) + + def forward(self, data): + return self.module(data) diff --git a/models/heads/acar.py b/models/heads/acar.py new file mode 100755 index 0000000..d35a423 --- /dev/null +++ b/models/heads/acar.py @@ -0,0 +1,151 @@ +import math + +import torch +import torch.nn as nn +import torchvision + +__all__ = ['acar'] + + +class HR2O_NL(nn.Module): + def __init__(self, hidden_dim=512, kernel_size=3, mlp_1x1=False): + super(HR2O_NL, self).__init__() + + self.hidden_dim = hidden_dim + + padding = kernel_size // 2 + self.conv_q = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + self.conv_k = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + self.conv_v = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + + self.conv = nn.Conv2d( + hidden_dim, hidden_dim, + 1 if mlp_1x1 else kernel_size, + padding=0 if mlp_1x1 else padding, + bias=False + ) + self.norm = nn.GroupNorm(1, hidden_dim, affine=True) + self.dp = nn.Dropout(0.2) + + def forward(self, x): + query = self.conv_q(x).unsqueeze(1) + key = self.conv_k(x).unsqueeze(0) + att = (query * key).sum(2) / (self.hidden_dim ** 0.5) + att = nn.Softmax(dim=1)(att) + value = self.conv_v(x) + virt_feats = (att.unsqueeze(2) * value).sum(1) + + virt_feats = self.norm(virt_feats) + virt_feats = nn.functional.relu(virt_feats) + virt_feats = self.conv(virt_feats) + virt_feats = self.dp(virt_feats) + + x = x + virt_feats + return x + + +class ACARHead(nn.Module): + def __init__(self, width, roi_spatial=7, num_classes=60, dropout=0., bias=False, + reduce_dim=1024, hidden_dim=512, downsample='max2x2', depth=2, + kernel_size=3, mlp_1x1=False): + super(ACARHead, self).__init__() + + self.roi_spatial = roi_spatial + self.roi_maxpool = nn.MaxPool2d(roi_spatial) + + # actor-context feature encoder + self.conv_reduce = nn.Conv2d(width, reduce_dim, 1, bias=False) + + self.conv1 = nn.Conv2d(reduce_dim * 2, hidden_dim, 1, bias=False) + self.conv2 = nn.Conv2d(hidden_dim, hidden_dim, 3, bias=False) + + # down-sampling before HR2O + assert downsample in ['none', 'max2x2'] + if downsample == 'none': + self.downsample = nn.Identity() + elif downsample == 'max2x2': + self.downsample = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # high-order relation reasoning operator (HR2O_NL) + layers = [] + for _ in range(depth): + 
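+            # each HR2O_NL block (defined above) builds query/key/value maps
+            # with small convolutions, attends across the rois of a clip via
+            # softmax(QK / sqrt(hidden_dim)), and adds the attended "virtual"
+            # features back to its input; stacking `depth` blocks gives
+            # higher-order actor-actor relations.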
layers.append(HR2O_NL(hidden_dim, kernel_size, mlp_1x1)) + self.hr2o = nn.Sequential(*layers) + + # classification + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Linear(reduce_dim, hidden_dim, bias=False) + self.fc2 = nn.Linear(hidden_dim * 2, num_classes, bias=bias) + + if dropout > 0: + self.dp = nn.Dropout(dropout) + else: + self.dp = None + + # data: features, rois, num_rois, roi_ids, sizes_before_padding + # returns: outputs + def forward(self, data): + if not isinstance(data['features'], list): + feats = [data['features']] + else: + feats = data['features'] + + # temporal average pooling + h, w = feats[0].shape[3:] + # requires all features have the same spatial dimensions + feats = [nn.AdaptiveAvgPool3d((1, h, w))(f).view(-1, f.shape[1], h, w) for f in feats] + feats = torch.cat(feats, dim=1) + + feats = self.conv_reduce(feats) + + rois = data['rois'] + rois[:, 1] = rois[:, 1] * w + rois[:, 2] = rois[:, 2] * h + rois[:, 3] = rois[:, 3] * w + rois[:, 4] = rois[:, 4] * h + rois = rois.detach() + roi_feats = torchvision.ops.roi_align(feats, rois, (self.roi_spatial, self.roi_spatial)) + roi_feats = self.roi_maxpool(roi_feats).view(data['num_rois'], -1) + + roi_ids = data['roi_ids'] + sizes_before_padding = data['sizes_before_padding'] + high_order_feats = [] + for idx in range(feats.shape[0]): # iterate over mini-batch + n_rois = roi_ids[idx+1] - roi_ids[idx] + if n_rois == 0: + continue + + eff_h, eff_w = math.ceil(h * sizes_before_padding[idx][1]), math.ceil(w * sizes_before_padding[idx][0]) + bg_feats = feats[idx][:, :eff_h, :eff_w] + bg_feats = bg_feats.unsqueeze(0).repeat((n_rois, 1, 1, 1)) + actor_feats = roi_feats[roi_ids[idx]:roi_ids[idx+1]] + tiled_actor_feats = actor_feats.unsqueeze(2).unsqueeze(2).expand_as(bg_feats) + interact_feats = torch.cat([bg_feats, tiled_actor_feats], dim=1) + + interact_feats = self.conv1(interact_feats) + interact_feats = nn.functional.relu(interact_feats) + interact_feats = self.conv2(interact_feats) + interact_feats = nn.functional.relu(interact_feats) + + interact_feats = self.downsample(interact_feats) + + interact_feats = self.hr2o(interact_feats) + interact_feats = self.gap(interact_feats) + high_order_feats.append(interact_feats) + + high_order_feats = torch.cat(high_order_feats, dim=0).view(data['num_rois'], -1) + + outputs = self.fc1(roi_feats) + outputs = nn.functional.relu(outputs) + outputs = torch.cat([outputs, high_order_feats], dim=1) + + if self.dp is not None: + outputs = self.dp(outputs) + outputs = self.fc2(outputs) + + return {'outputs': outputs} + + +def acar(**kwargs): + model = ACARHead(**kwargs) + return model diff --git a/models/heads/linear.py b/models/heads/linear.py new file mode 100755 index 0000000..846cf8c --- /dev/null +++ b/models/heads/linear.py @@ -0,0 +1,57 @@ +import torch +import torch.nn as nn +import torchvision + +__all__ = ['linear'] + + +class LinearHead(nn.Module): + def __init__(self, width, roi_spatial=7, num_classes=60, dropout=0., bias=False): + super(LinearHead, self).__init__() + + self.roi_spatial = roi_spatial + self.roi_maxpool = nn.MaxPool2d(roi_spatial) + + self.fc = nn.Linear(width, num_classes, bias=bias) + + if dropout > 0: + self.dp = nn.Dropout(dropout) + else: + self.dp = None + + # data: features, rois + # returns: outputs + def forward(self, data): + if not isinstance(data['features'], list): + features = [data['features']] + else: + features = data['features'] + + roi_features = [] + for f in features: + sp = f.shape + h, w = sp[3:] + feats = nn.AdaptiveAvgPool3d((1, h, 
w))(f).view(-1, sp[1], h, w) + + rois = data['rois'].clone() + rois[:, 1] = rois[:, 1] * w + rois[:, 2] = rois[:, 2] * h + rois[:, 3] = rois[:, 3] * w + rois[:, 4] = rois[:, 4] * h + rois = rois.detach() + roi_feats = torchvision.ops.roi_align(feats, rois, (self.roi_spatial, self.roi_spatial)) + roi_feats = self.roi_maxpool(roi_feats).view(-1, sp[1]) + + roi_features.append(roi_feats) + + roi_features = torch.cat(roi_features, dim=1) + if self.dp is not None: + roi_features = self.dp(roi_features) + outputs = self.fc(roi_features) + + return {'outputs': outputs} + + +def linear(**kwargs): + model = LinearHead(**kwargs) + return model \ No newline at end of file diff --git a/models/necks/__init__.py b/models/necks/__init__.py new file mode 100755 index 0000000..b1ddc43 --- /dev/null +++ b/models/necks/__init__.py @@ -0,0 +1,16 @@ +import torch.nn as nn + +from .basic import * + + +def model_entry(config): + return globals()[config['type']](**config['kwargs']) + + +class AVA_neck(nn.Module): + def __init__(self, config): + super(AVA_neck, self).__init__() + self.module = model_entry(config) + + def forward(self, data): + return self.module(data) diff --git a/models/necks/basic.py b/models/necks/basic.py new file mode 100755 index 0000000..49a67c2 --- /dev/null +++ b/models/necks/basic.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn + +from .utils import bbox_jitter, get_bbox_after_aug + +__all__ = ['basic'] + + +class BasicNeck(nn.Module): + def __init__(self, aug_threshold=0., bbox_jitter=None, num_classes=60, multi_class=True): + super(BasicNeck, self).__init__() + + # threshold on preserved ratio of bboxes after cropping augmentation + self.aug_threshold = aug_threshold + # config for bbox jittering + self.bbox_jitter = bbox_jitter + + self.num_classes = num_classes + self.multi_class = multi_class + + # data: aug_info, labels, filenames, mid_times + # returns: num_rois, rois, roi_ids, targets, sizes_before_padding, filenames, mid_times, bboxes, bbox_ids + def forward(self, data): + rois, roi_ids, targets, sizes_before_padding, filenames, mid_times = [], [0], [], [], [], [] + bboxes, bbox_ids = [], [] # used for multi-crop fusion + + cur_bbox_id = -1 # record current bbox no. + for idx in range(len(data['aug_info'])): + aug_info = data['aug_info'][idx] + pad_ratio = aug_info['pad_ratio'] + sizes_before_padding.append([1. / pad_ratio[0], 1. 
/ pad_ratio[1]])
+
+            for label in data['labels'][idx]:
+                cur_bbox_id += 1
+                if self.training and self.bbox_jitter is not None:
+                    bbox_list = bbox_jitter(label['bounding_box'],
+                                            self.bbox_jitter.get('num', 1),
+                                            self.bbox_jitter.scale)
+                else:
+                    # no bbox jittering during evaluation
+                    bbox_list = [label['bounding_box']]
+
+                for b in bbox_list:
+                    bbox = get_bbox_after_aug(aug_info, b, self.aug_threshold)
+                    if bbox is None:
+                        continue
+                    rois.append([idx] + bbox)
+
+                    filenames.append(data['filenames'][idx])
+                    mid_times.append(data['mid_times'][idx])
+                    bboxes.append(label['bounding_box'])
+                    bbox_ids.append(cur_bbox_id)
+
+                    if self.multi_class:
+                        ret = torch.zeros(self.num_classes)
+                        ret.put_(torch.LongTensor(label['label']),
+                                 torch.ones(len(label['label'])))
+                    else:
+                        ret = torch.LongTensor(label['label'])
+                    targets.append(ret)
+
+            roi_ids.append(len(rois))
+
+        num_rois = len(rois)
+        if num_rois == 0:
+            return {'num_rois': 0, 'rois': None, 'roi_ids': roi_ids, 'targets': None,
+                    'sizes_before_padding': sizes_before_padding,
+                    'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes, 'bbox_ids': bbox_ids}
+
+        rois = torch.FloatTensor(rois).cuda()
+        targets = torch.stack(targets, dim=0).cuda()
+        return {'num_rois': num_rois, 'rois': rois, 'roi_ids': roi_ids, 'targets': targets,
+                'sizes_before_padding': sizes_before_padding,
+                'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes, 'bbox_ids': bbox_ids}
+
+
+def basic(**kwargs):
+    model = BasicNeck(**kwargs)
+    return model
diff --git a/models/necks/utils.py b/models/necks/utils.py
new file mode 100755
index 0000000..87ea964
--- /dev/null
+++ b/models/necks/utils.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+
+def bbox_jitter(bbox, num, delta):
+    w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
+
+    if num == 1:
+        jitter = np.random.uniform(-delta, delta, 4)
+        # clamp the two min-coordinates to >= 0 and the two max-coordinates to <= 1
+        bboxes = [[max(bbox[0] + jitter[0] * w, 0.), max(bbox[1] + jitter[1] * h, 0.),
+                   min(bbox[2] + jitter[2] * w, 1.), min(bbox[3] + jitter[3] * h, 1.)]]
+
+        return bboxes
+
+    bboxes = [bbox]
+    jitter = np.random.uniform(-delta, delta, [num - 1, 4])
+    for i in range(num - 1):
+        bboxes.append([max(bbox[0] + jitter[i][0] * w, 0.), max(bbox[1] + jitter[i][1] * h, 0.),
+                       min(bbox[2] + jitter[i][2] * w, 1.), min(bbox[3] + jitter[i][3] * h, 1.)])
+    return bboxes
+
+
+def get_bbox_after_aug(aug_info, bbox, aug_threshold=0.3):
+    if aug_info is None:
+        return bbox
+
+    cbox = aug_info['crop_box']
+    w = cbox[2] - cbox[0]
+    h = cbox[3] - cbox[1]
+
+    l = max(min(bbox[0], cbox[2]), cbox[0])
+    r = max(min(bbox[2], cbox[2]), cbox[0])
+    t = max(min(bbox[1], cbox[3]), cbox[1])
+    b = max(min(bbox[3], cbox[3]), cbox[1])
+
+    if (b-t) * (r-l) <= (bbox[3]-bbox[1]) * (bbox[2]-bbox[0]) * aug_threshold:
+        return None
+    ret = [(l-cbox[0]) / w, (t-cbox[1]) / h, (r-cbox[0]) / w, (b-cbox[1]) / h]
+
+    if aug_info['flip']:
+        ret = [1. - ret[2], ret[1], 1.
- ret[0], ret[3]] + + pad_ratio = aug_info['pad_ratio'] + ret = [ret[0] / pad_ratio[0], ret[1] / pad_ratio[1], ret[2] / pad_ratio[0], ret[3] / pad_ratio[1]] + + return ret diff --git a/models/resnetFPN.py b/models/resnetFPN.py index 6b294ed..079f424 100644 --- a/models/resnetFPN.py +++ b/models/resnetFPN.py @@ -8,6 +8,12 @@ import modules.utils as lutils +import yaml +from easydict import EasyDict +from .backbones import AVA_backbone + + + logger = lutils.get_logger(__name__) ### Download weights from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py @@ -24,8 +30,21 @@ def conv1x1(in_channel, out_channel): class ResNetFPN(nn.Module): def __init__(self, block, args): + self.inplanes = 64 super(ResNetFPN, self).__init__() + + + if args.MODEL_TYPE.startswith('SlowFast'): + with open('configs/ROAD.yml') as f: + config = yaml.load(f, Loader=yaml.FullLoader) + opt = EasyDict(config) + print(opt) + self.inplanes = 64 + super(ResNetFPN, self).__init__() + self.backbone = AVA_backbone(opt) + + self.MODEL_TYPE = args.MODEL_TYPE num_blocks = args.model_perms non_local_inds = args.non_local_inds @@ -60,19 +79,34 @@ def __init__(self, block, args): #self.avgpool = nn.AvgPool2d(7, stride=1) #self.fc = nn.Linear(512 * block.expansion, num_classes) - self.conv6 = conv3x3(512 * block.expansion, 256, stride=2, padding=1) # P6 - self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 + if self.MODEL_TYPE == 'SlowFast': + self.conv6 = conv3x3(2304, 256, stride=2, padding=1) # P6 + self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 - self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) - self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) + self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) - self.lateral_layer1 = conv1x1(512 * block.expansion, 256) - self.lateral_layer2 = conv1x1(256 * block.expansion, 256) - self.lateral_layer3 = conv1x1(128 * block.expansion, 256) - - self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 - self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 - self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 + self.lateral_layer1 = conv1x1(2304, 256) + self.lateral_layer2 = conv1x1(1152, 256) + self.lateral_layer3 = conv1x1(576, 256) + + self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 + else: + self.conv6 = conv3x3(512 * block.expansion, 256, stride=2, padding=1) # P6 + self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 + + self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) + self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + + self.lateral_layer1 = conv1x1(512 * block.expansion, 256) + self.lateral_layer2 = conv1x1(256 * block.expansion, 256) + self.lateral_layer3 = conv1x1(128 * block.expansion, 256) + + self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 for m in self.modules(): @@ -139,49 +173,59 @@ def _make_layer(self, block, planes, num_blocks, stride=1, temp_kernals=[], nl_i return nn.Sequential(*layers) def forward(self, x): - # pdb.set_trace() - # print('input shape', x.shape) - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.pool1(x) - # print('p1 ', x.shape) - x = self.layer1(x) - if 
self.pool2 is not None: - x = self.pool2(x) - # print('p2 shape ', x.shape) - c3 = self.layer2(x) - c4 = self.layer3(c3) - c5 = self.layer4(c4) - - # ego_feat = self.ego_lateral(c5) - # print(sources[-1].shape) - - - p5 = self.lateral_layer1(c5) - p5_upsampled = self._upsample(p5, c4) - p5 = self.corr_layer1(p5) - - p4 = self.lateral_layer2(c4) - p4 = p5_upsampled + p4 - p4_upsampled = self._upsample(p4, c3) - p4 = self.corr_layer2(p4) - - p3 = self.lateral_layer3(c3) - p3 = p4_upsampled + p3 - p3 = self.corr_layer3(p3) - - p6 = self.conv6(c5) - p7 = self.conv7(F.relu(p6)) - features = [p3, p4, p5, p6, p7] - - ego_feat = self.avg_pool(p7) - if self.pool2 is not None: - for i in range(len(features)): - features[i] = self._upsample_time(features[i]) - ego_feat = self._upsample_time(ego_feat) - + if self.MODEL_TYPE.startswith('SlowFast'): + ff = self.backbone(x) + c3 = ff[0] + c4 = ff[1] + c5 = ff[2] + p5 = self.lateral_layer1(c5) + p5_upsampled = self._upsample(p5, c4) + p5 = self.corr_layer1(p5) + p4 = self.lateral_layer2(c4) + p4 = p5_upsampled + p4 + p4_upsampled = self._upsample(p4, c3) + p4 = self.corr_layer2(p4) + p3 = self.lateral_layer3(c3) + p3 = p4_upsampled + p3 + p3 = self.corr_layer3(p3) + p6 = self.conv6(c5) + p7 = self.conv7(F.relu(p6)) + features = [p3, p4, p5, p6, p7] + ego_feat = self.avg_pool(p7) + if self.pool2 is not None: + for i in range(len(features)): + features[i] = self._upsample_time(features[i]) + ego_feat = self._upsample_time(ego_feat) + else: + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.pool1(x) + x = self.layer1(x) + if self.pool2 is not None: + x = self.pool2(x) + c3 = self.layer2(x) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + p5 = self.lateral_layer1(c5) + p5_upsampled = self._upsample(p5, c4) + p5 = self.corr_layer1(p5) + p4 = self.lateral_layer2(c4) + p4 = p5_upsampled + p4 + p4_upsampled = self._upsample(p4, c3) + p4 = self.corr_layer2(p4) + p3 = self.lateral_layer3(c3) + p3 = p4_upsampled + p3 + p3 = self.corr_layer3(p3) + p6 = self.conv6(c5) + p7 = self.conv7(F.relu(p6)) + features = [p3, p4, p5, p6, p7] + ego_feat = self.avg_pool(p7) + if self.pool2 is not None: + for i in range(len(features)): + features[i] = self._upsample_time(features[i]) + ego_feat = self._upsample_time(ego_feat) return features, ego_feat @@ -290,5 +334,7 @@ def resnetfpn(args): return ResNetFPN(BottleneckRCLSTM, args) elif model_type.startswith('RCGRU'): return ResNetFPN(BottleneckRCGRU, args) + elif model_type.startswith('SlowFast'): + return ResNetFPN(BottleneckI3D, args) else: raise RuntimeError('Define the model type correctly:: ' + model_type) \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100755 index 0000000..4085bef --- /dev/null +++ b/utils.py @@ -0,0 +1,208 @@ +import os +import csv +import logging +import math +import random + +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + +_LOGGER = None + + +def get_rank(): + return dist.get_rank() + + +def get_world_size(): + return dist.get_world_size() + + +def mkdir(path): + os.makedirs(path, exist_ok=True) + + +def random_seed(seed_value): + np.random.seed(seed_value) + torch.manual_seed(seed_value) + random.seed(seed_value) + os.environ['PYTHONHASHSEED'] = str(seed_value) + torch.cuda.manual_seed(seed_value) + torch.cuda.manual_seed_all(seed_value) + + +def parameters_string(module): + lines = [ + "", + "List of model parameters:", + "=" * 105, + ] + + row_format = "{name:<60} {shape:>27} 
={total_size:>15,d}"
+    params = list(module.named_parameters())
+    for name, param in params:
+        lines.append(row_format.format(
+            name=name,
+            shape=" * ".join(str(p) for p in param.size()),
+            total_size=param.numel()
+        ))
+    lines.append("=" * 105)
+    lines.append(row_format.format(
+        name="all parameters",
+        shape="sum of above",
+        total_size=sum(int(param.numel()) for name, param in params)
+    ))
+    lines.append("")
+    return "\n".join(lines)
+
+
+def create_logger(log_file, level=logging.INFO):
+    global _LOGGER
+    if _LOGGER is not None:
+        return _LOGGER
+    l = logging.getLogger('global')
+    formatter = logging.Formatter('[%(asctime)s][%(filename)15s][line:%(lineno)4d][%(levelname)8s] %(message)s')
+    fh = logging.FileHandler(log_file)
+    fh.setFormatter(formatter)
+    sh = logging.StreamHandler()
+    sh.setFormatter(formatter)
+    l.setLevel(level)
+    l.addHandler(fh)
+    l.addHandler(sh)
+    l.propagate = False
+    _LOGGER = l
+    return l
+
+
+def get_logger():
+    return _LOGGER
+
+
+class Logger(object):
+
+    def __init__(self, path, header):
+        self.log_file = open(path, 'w')
+        self.logger = csv.writer(self.log_file, delimiter='\t')
+
+        self.logger.writerow(header)
+        self.header = header
+
+    def __del__(self):
+        self.log_file.close()
+
+    def log(self, values):
+        write_values = []
+        for col in self.header:
+            assert col in values
+            write_values.append(values[col])
+
+        self.logger.writerow(write_values)
+        self.log_file.flush()
+
+
+class AverageMeter(object):
+    def __init__(self, length=0):
+        self.length = length
+        self.reset()
+
+    def reset(self):
+        if self.length > 0:
+            self.history, self.history_num = [], []
+        else:
+            self.count = 0
+            self.sum = 0.0
+        self.val = 0.0
+        self.avg = 0.0
+
+    def update(self, val, num=1):
+        assert num > 0
+        if self.length > 0:
+            self.history.append(val * num)
+            self.history_num.append(num)
+            if len(self.history) > self.length:
+                del self.history[0]
+                del self.history_num[0]
+
+            self.val = val
+            self.avg = np.sum(self.history) / np.sum(self.history_num)
+        else:
+            self.val = val
+            self.sum += val * num
+            self.count += num
+            self.avg = self.sum / self.count
+
+
+class DistributedSampler(Sampler):
+    def __init__(self, dataset, world_size=None, rank=None, round_down=False):
+        if world_size is None:
+            world_size = get_world_size()
+        if rank is None:
+            rank = get_rank()
+        self.dataset = dataset
+        self.world_size = world_size
+        self.rank = rank
+        self.round_down = round_down
+        self.epoch = 0
+
+        self.total_size = len(self.dataset)
+        if self.round_down:
+            self.num_samples = int(math.floor(len(self.dataset) / self.world_size))
+        else:
+            self.num_samples = int(math.ceil(len(self.dataset) / self.world_size))
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        indices = list(torch.randperm(len(self.dataset), generator=g))
+
+        assert len(indices) == self.total_size
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        if self.round_down:
+            assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+def load_pretrain(pretrain_opt, net):
+    checkpoint = torch.load(pretrain_opt.path, map_location=lambda storage, loc: storage.cuda())
+    if pretrain_opt.get('state_dict_key', None) is not None:
+        checkpoint = checkpoint[pretrain_opt.state_dict_key]
+
+    if pretrain_opt.get('delete_prefix', None):
+        keys = set(checkpoint.keys())
+        for k in keys:
+            if
k.startswith(pretrain_opt.delete_prefix): + checkpoint.pop(k) + if pretrain_opt.get('replace_prefix', None) is not None: + keys = set(checkpoint.keys()) + for k in keys: + if k.startswith(pretrain_opt.replace_prefix): + new_k = pretrain_opt.get('replace_to', '') + k[len(pretrain_opt.replace_prefix):] + checkpoint[new_k] = checkpoint.pop(k) + net.load_state_dict(checkpoint, strict=False) + + # if get_rank() == 0: + # ckpt_keys = set(checkpoint.keys()) + # own_keys = set(net.state_dict().keys()) + # missing_keys = own_keys - ckpt_keys + # ignore_keys = ckpt_keys - own_keys + # loaded_keys = own_keys - missing_keys + + # logger = get_logger() + # for k in missing_keys: + # logger.info('Caution: missing key {}'.format(k)) + # for k in ignore_keys: + # logger.info('Caution: redundant key {}'.format(k)) + # logger.info('Loaded {} key(s) from pre-trained model at {}'.format(len(loaded_keys), pretrain_opt.path))
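A closing note on `load_pretrain`'s key-rewriting options: a typical use is stripping the `module.` prefix that `DistributedDataParallel` adds to checkpoint keys. The snippet below is a hypothetical usage sketch (the path and prefix are illustrative, not from this patch); the call itself is commented out because `load_pretrain` maps tensors onto the GPU:

```python
import torch
import torch.nn as nn
from easydict import EasyDict

# a checkpoint saved from a DistributedDataParallel wrapper carries
# 'module.'-prefixed keys that a bare module cannot load directly
net = nn.Linear(4, 2)
ckpt = {'module.weight': torch.zeros(2, 4), 'module.bias': torch.zeros(2)}
torch.save(ckpt, '/tmp/ddp_ckpt.pth.tar')

pretrain_opt = EasyDict({
    'path': '/tmp/ddp_ckpt.pth.tar',
    'replace_prefix': 'module.',
    'replace_to': '',  # rewrites 'module.weight' -> 'weight'
})
# load_pretrain(pretrain_opt, net)  # as defined above; requires CUDA
```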