diff --git a/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100644 index 0000000..d3571ab --- /dev/null +++ b/configs/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA-Kinetics_SLOWFAST_R101_ACAR_HR2O_DEPTH1.pth.tar + +result_path: experiments/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. + - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100755 index 0000000..34d21fb --- /dev/null +++ b/configs/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,109 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R101_K700.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 1. + - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 5 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [4.6, 4.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
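A note on the two normalization conventions used across these configs: the R101 configs pair `norm_value: 1.` with channel statistics in raw 0-255 pixel units, while the R50 configs further down pair `norm_value: 255.` with Kinetics-style `mean=0.45, std=0.225`. A minimal sketch of how the two conventions line up, assuming `ToTensor` divides pixel values by `norm_value` before `Normalize` applies `(x - mean) / std` per channel (the transform implementations themselves are not part of this patch):

```python
import numpy as np

pixel = np.array([128.0, 128.0, 128.0])  # one mid-gray RGB pixel in 0-255 range

# norm_value 1. : keep the 0-255 range, normalize with raw-pixel statistics
mean_255 = np.array([110.63666788, 103.16065604, 96.29023126])
std_255 = np.array([38.7568578, 37.88248729, 40.02898126])
out_r101 = (pixel / 1.0 - mean_255) / std_255

# norm_value 255. : rescale to [0, 1], normalize with 0.45 / 0.225
out_r50 = (pixel / 255.0 - 0.45) / 0.225

print(out_r101, out_r50)  # same idea, different unit conventions
```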
+ - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml b/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml new file mode 100755 index 0000000..f08c26a --- /dev/null +++ b/configs/AVA/SLOWFAST_R50_ACAR_HR2O.yaml @@ -0,0 +1,111 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R50_ACAR_HR2O +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R50_K400.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 2 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 6 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [5.6, 5.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. 
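The `train` section above describes a warmup-then-step learning-rate policy with fractional milestone epochs. The scheduler itself is not included in this patch, so the following is only a sketch of the schedule those fields appear to encode (linear warmup from `base_lr` to `warmup_lr` over `warmup_epochs`, then an `lr_mults` decay at each milestone); treat the exact semantics as an assumption:

```python
def lr_at(epoch, base_lr=0.008, warmup_lr=0.064, warmup_epochs=1,
          milestone_epochs=(4.6, 4.8), lr_mults=(0.1, 0.1)):
    """Hypothetical LR curve implied by the scheduler fields above."""
    if epoch < warmup_epochs:
        # linear ramp from base_lr up to warmup_lr
        return base_lr + (warmup_lr - base_lr) * epoch / warmup_epochs
    lr = warmup_lr
    for milestone, mult in zip(milestone_epochs, lr_mults):
        if epoch >= milestone:
            lr *= mult  # step decay at each (fractional) milestone epoch
    return lr

for e in (0.0, 0.5, 1.0, 4.6, 4.8):
    print(e, lr_at(e))  # 0.008 -> 0.064, then x0.1 at epochs 4.6 and 4.8
```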
+ - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/SLOWFAST_R50_baseline.yaml b/configs/AVA/SLOWFAST_R50_baseline.yaml new file mode 100755 index 0000000..91ca7e6 --- /dev/null +++ b/configs/AVA/SLOWFAST_R50_baseline.yaml @@ -0,0 +1,110 @@ +evaluate: False + +result_path: experiments/AVA/SLOWFAST_R50_baseline +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + pretrain: + path: pretrained/SLOWFAST_R50_K400.pth.tar + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: linear + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +train: + root_path: data + annotation_path: annotations/ava_train_v2.2_with_fair_0.9.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: RandomHorizontalFlip + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + n_epochs: 6 + val_freq: 1 + save_freq: 1 + + optimizer: + type: SGD + kwargs: + momentum: 0.9 + weight_decay: 0.0000001 + nesterov: True + + scheduler: + type: step + milestone_epochs: [5.6, 5.8] + lr_mults: [0.1, 0.1] + base_lr: 0.008 + warmup_lr: 0.064 + warmup_epochs: 1 + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml b/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml new file mode 100755 index 0000000..4c0f5b2 --- /dev/null +++ b/configs/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1.yaml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1.pth.tar + +result_path: experiments/AVA/eval_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
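The baseline config above swaps the head by name (`type: linear` here, `type: acar` in the other configs). The `heads`, `necks`, and `backbones` packages introduced later in this patch all resolve such names the same way, via a `model_entry` helper that looks the string up in module globals and forwards `kwargs`. A self-contained toy version of that registry pattern:

```python
def linear(**kwargs):            # stand-ins for the real head constructors
    return ('LinearHead', kwargs)

def acar(**kwargs):
    return ('ACARHead', kwargs)

def model_entry(config):
    # same lookup used by models/heads/__init__.py and models/necks/__init__.py
    return globals()[config['type']](**config['kwargs'])

head_cfg = {'type': 'linear', 'kwargs': {'width': 2304, 'num_classes': 60}}
print(model_entry(head_cfg))     # ('LinearHead', {'width': 2304, 'num_classes': 60})
```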
+ - type: Normalize + kwargs: + mean: [110.63666788, 103.16065604, 96.29023126] + std: [38.7568578, 37.88248729, 40.02898126] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml b/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml new file mode 100755 index 0000000..374a7cc --- /dev/null +++ b/configs/AVA/eval_SLOWFAST_R50_ACAR_HR2O.yaml @@ -0,0 +1,69 @@ +evaluate: True + +pretrain: + path: model_zoo/AVA_SLOWFAST_R50_ACAR_HR2O.pth.tar + +result_path: experiments/AVA/eval_SLOWFAST_R50_ACAR_HR2O +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast50 + learnable: True + kwargs: + alpha: 4 + beta: 0.125 + fuse_only_conv: False + fuse_kernel_size: 7 + slow_full_span: True + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 2 + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 255. + - type: Normalize + kwargs: + mean: [0.450, 0.450, 0.450] + std: [0.225, 0.225, 0.225] + temporal: + type: TemporalCenterCrop + kwargs: + size: 64 + step: 2 + + with_label: False + eval_mAP: + labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt + groundtruth: annotations/ava_val_v2.2.csv + exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv diff --git a/configs/ROAD.yml b/configs/ROAD.yml new file mode 100644 index 0000000..020c0eb --- /dev/null +++ b/configs/ROAD.yml @@ -0,0 +1,67 @@ +evaluate: True + +pretrain: + path: kinetics-pt/SLOWFAST_R101_K700.pth.tar + +result_path: experiments/AVA-Kinetics/evalAVA_SLOWFAST_R101_ACAR_HR2O_DEPTH1 +manual_seed: 1 +print_freq: 20 + +model: + freeze_bn: True + backbone: + arch: slowfast101 + learnable: True + kwargs: + alpha: 2 + beta: 0.125 + neck: + type: basic + kwargs: + bbox_jitter: + num: 1 + scale: 0.075 + num_classes: 60 + multi_class: True + head: + type: acar + kwargs: + width: 2304 + roi_spatial: 7 + num_classes: 60 + depth: 1 + mlp_1x1: True + +loss: + type: ava_criterion + kwargs: + pose_softmax: True + +val: + root_path: data + annotation_path: annotations/ava_val_v2.2_fair_0.85.pkl + batch_size: 1 + + augmentation: + spatial: + - type: Scale + kwargs: + resize: 256 + - type: ToTensor + kwargs: + norm_value: 1. 
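`configs/ROAD.yml`, which begins above, is consumed differently from the other configs: the modified `resnetFPN.py` later in this patch loads it with PyYAML and wraps it in an `EasyDict` so that nested keys become attributes. A short sketch of that access pattern:

```python
import yaml
from easydict import EasyDict

# same loading code used in models/resnetFPN.py below
with open('configs/ROAD.yml') as f:
    opt = EasyDict(yaml.load(f, Loader=yaml.FullLoader))

print(opt.model.backbone.arch)          # slowfast101
print(opt.model.backbone.kwargs.alpha)  # 2 in ROAD.yml (the AVA configs use 4)
```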
+      - type: Normalize
+        kwargs:
+          mean: [110.63666788, 103.16065604, 96.29023126]
+          std: [38.7568578, 37.88248729, 40.02898126]
+    temporal:
+      type: TemporalCenterCrop
+      kwargs:
+        size: 64
+        step: 2
+
+  with_label: False
+  eval_mAP:
+    labelmap: annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt
+    groundtruth: annotations/ava_val_v2.2.csv
+    exclusions: annotations/ava_val_excluded_timestamps_v2.2.csv
diff --git a/main.py b/main.py
index 8f65e46..2753e3a 100644
--- a/main.py
+++ b/main.py
@@ -225,7 +225,7 @@ def main():
         full_test = True #args.MODE != 'train'
         args.skip_beggning = 0
         args.skip_ending = 0
-        if args.MODEL_TYPE == 'I3D':
+        if args.MODEL_TYPE == 'I3D' or args.MODEL_TYPE == 'SlowFast':
             args.skip_beggning = 2
             args.skip_ending = 2
         elif args.MODEL_TYPE != 'C2D':
diff --git a/models/__init__ .py b/models/__init__ .py
new file mode 100755
index 0000000..3eedcb8
--- /dev/null
+++ b/models/__init__ .py
@@ -0,0 +1,106 @@
+import torch
+import torch.nn as nn
+
+from .backbones import AVA_backbone
+from .necks import AVA_neck
+from .heads import AVA_head
+
+
+class AVA_model(nn.Module):
+    def __init__(self, config):
+        super(AVA_model, self).__init__()
+        self.config = config
+
+        self.backbone = AVA_backbone(config.backbone)
+        self.neck = AVA_neck(config.neck)
+        self.head = AVA_head(config.head)
+
+    def forward(self, data, evaluate=False):
+        if not evaluate:  # train mode
+            i_b = {'clips': data['clips']}
+            o_b = self.backbone(i_b)
+
+            i_n = {'aug_info': data['aug_info'], 'labels': data['labels'],
+                   'filenames': data['filenames'], 'mid_times': data['mid_times']}
+            o_n = self.neck(i_n)
+
+            if o_n['num_rois'] == 0:
+                return {'outputs': None, 'targets': o_n['targets'],
+                        'num_rois': 0, 'filenames': o_n['filenames'],
+                        'mid_times': o_n['mid_times'], 'bboxes': o_n['bboxes']}
+
+            i_h = {'features': o_b['features'], 'rois': o_n['rois'],
+                   'num_rois': o_n['num_rois'], 'roi_ids': o_n['roi_ids'],
+                   'sizes_before_padding': o_n['sizes_before_padding']}
+            o_h = self.head(i_h)
+
+            return {'outputs': o_h['outputs'], 'targets': o_n['targets'],
+                    'num_rois': o_n['num_rois'], 'filenames': o_n['filenames'],
+                    'mid_times': o_n['mid_times'], 'bboxes': o_n['bboxes']}
+
+        # eval mode
+        assert not self.training
+
+        noaug_info = [{'crop_box': [0., 0., 1., 1.], 'flip': False, 'pad_ratio': [1., 1.]}] * len(data['labels'])
+        i_n = {'aug_info': noaug_info, 'labels': data['labels'],
+               'filenames': data['filenames'], 'mid_times': data['mid_times']}
+        o = self.neck(i_n)
+
+        output_list = [None] * len(o['filenames'])
+        cnt_list = [0] * len(o['filenames'])
+
+        for no in range(len(data['clips'])):
+            i_b = {'clips': data['clips'][no]}
+            o_b = self.backbone(i_b)
+
+            i_n = {'aug_info': data['aug_info'][no], 'labels': data['labels'],
+                   'filenames': data['filenames'], 'mid_times': data['mid_times']}
+            o_n = self.neck(i_n)
+
+            if o_n['num_rois'] == 0:
+                continue
+            ids = o_n['bbox_ids']
+
+            i_h = {'features': o_b['features'], 'rois': o_n['rois'],
+                   'num_rois': o_n['num_rois'], 'roi_ids': o_n['roi_ids'],
+                   'sizes_before_padding': o_n['sizes_before_padding']}
+            o_h = self.head(i_h)
+
+            outputs = o_h['outputs']
+            for idx in range(o_n['num_rois']):
+                if cnt_list[ids[idx]] == 0:
+                    output_list[ids[idx]] = outputs[idx]
+                else:
+                    output_list[ids[idx]] += outputs[idx]
+                cnt_list[ids[idx]] += 1
+
+        num_rois, filenames, mid_times, bboxes, targets, outputs = 0, [], [], [], [], []
+        for idx in range(len(o['filenames'])):
+            if cnt_list[idx] == 0:
+                continue
+            num_rois += 1
+            filenames.append(o['filenames'][idx])
+            mid_times.append(o['mid_times'][idx])
+
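+            # multi-crop fusion: output_list[idx] was summed over every
+            # augmented clip in which this box produced a prediction, so the
+            # division by cnt_list[idx] below recovers the mean per box;
+            # boxes with cnt 0 were never scored and are skipped above.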
bboxes.append(o['bboxes'][idx]) + targets.append(o['targets'][idx]) + outputs.append(output_list[idx] / float(cnt_list[idx])) + + if num_rois == 0: + return {'outputs': None, 'targets': None, 'num_rois': 0, + 'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes} + + final_outputs = torch.stack(outputs, dim=0) + final_targets = torch.stack(targets, dim=0) + return {'outputs': final_outputs, 'targets': final_targets, 'num_rois': num_rois, + 'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes} + + def train(self, mode=True): + super(AVA_model, self).train(mode) + + if mode and self.config.get('freeze_bn', False): + def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + self.backbone.apply(set_bn_eval) diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/backbone_models.py b/models/backbone_models.py index 19faad2..c9826cd 100644 --- a/models/backbone_models.py +++ b/models/backbone_models.py @@ -30,9 +30,8 @@ def backbone_models(args): model.identity_state_dict() if MODEL_TYPE.startswith('RCGRU') or MODEL_TYPE.startswith('RCLSTM'): model.recurrent_conv_zero_state() - - load_dict = torch.load(args.MODEL_PATH) - - model.load_my_state_dict(load_dict) + if not MODEL_TYPE.startswith('SlowFast'): + load_dict = torch.load(args.MODEL_PATH) + model.load_my_state_dict(load_dict) return model diff --git a/models/backbones/__init__.py b/models/backbones/__init__.py new file mode 100755 index 0000000..1a8fa4e --- /dev/null +++ b/models/backbones/__init__.py @@ -0,0 +1,31 @@ +import torch.nn as nn + +from .slowfast import * +from utils import load_pretrain + + +def model_entry(config): + return globals()[config['arch']](**config['kwargs']) + + +class AVA_backbone(nn.Module): + def __init__(self, config): + super(AVA_backbone, self).__init__() + + self.config = config + self.module = model_entry(config.model.backbone) + print(config.get('pretrain', None)) + if config.get('pretrain', None) is not None: + load_pretrain(config.pretrain, self.module) + + if not config.get('learnable', True): + self.module.requires_grad_(False) + + # data: clips + # returns: features + def forward(self, data): + # inputs = data['clips'] + inputs = data + inputs = inputs.cuda() + features = self.module(inputs) + return features diff --git a/models/backbones/slowfast.py b/models/backbones/slowfast.py new file mode 100755 index 0000000..e6518ec --- /dev/null +++ b/models/backbones/slowfast.py @@ -0,0 +1,260 @@ +""" +References: +[SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982), +[PySlowFast](https://github.com/facebookresearch/slowfast). 
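+
+Two hyper-parameters control the two pathways (see SlowPath/FastPath below):
+`alpha` is the frame-rate ratio (the slow pathway reads every alpha-th frame
+of the fast pathway's input), and `beta` is the channel ratio (the fast
+pathway uses beta times the slow pathway's channel width).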
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +BN = nn.BatchNorm3d + +__all__ = ['slowfast50', 'slowfast101', 'slowfast152', 'slowfast200'] + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1, head_conv=1): + super(Bottleneck, self).__init__() + if head_conv == 1: + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BN(planes) + elif head_conv == 3: + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(3, 1, 1), bias=False, padding=(1, 0, 0)) + self.bn1 = BN(planes) + else: + raise ValueError("Unsupported head_conv!") + self.conv2 = nn.Conv3d( + planes, planes, kernel_size=(1, 3, 3), stride=(1, stride, stride), + padding=(0, dilation, dilation), dilation=(1, dilation, dilation), bias=False) + self.bn2 = BN(planes) + self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BN(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + if downsample is not None: + self.downsample_bn = BN(planes * 4) + self.stride = stride + # self.alpha = 1 + + def forward(self, x): + res = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + res = self.downsample(x) + res = self.downsample_bn(res) + + out = out + res + out = self.relu(out) + + return out + + +class SlowFast(nn.Module): + def __init__(self, block, layers, alpha=8, beta=0.125, fuse_only_conv=True, fuse_kernel_size=5, slow_full_span=False): + super(SlowFast, self).__init__() + + self.alpha = alpha + self.beta = beta + self.slow_full_span = slow_full_span + + '''Fast Network''' + self.fast_inplanes = int(64 * beta) + self.fast_conv1 = nn.Conv3d(3, self.fast_inplanes, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False) + self.fast_bn1 = BN(self.fast_inplanes) + self.fast_relu = nn.ReLU(inplace=True) + self.fast_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + self.fast_res1 = self._make_layer_fast(block, int(64 * beta), layers[0], head_conv=3) + self.fast_res2 = self._make_layer_fast(block, int(128 * beta), layers[1], stride=2, head_conv=3) + self.fast_res3 = self._make_layer_fast(block, int(256 * beta), layers[2], stride=2, head_conv=3) + self.fast_res4 = self._make_layer_fast(block, int(512 * beta), layers[3], head_conv=3, dilation=2) + + '''Slow Network''' + self.slow_inplanes = 64 + self.slow_conv1 = nn.Conv3d(3, self.slow_inplanes, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False) + self.slow_bn1 = BN(self.slow_inplanes) + self.slow_relu = nn.ReLU(inplace=True) + self.slow_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + self.slow_res1 = self._make_layer_slow(block, 64, layers[0], head_conv=1) + self.slow_res2 = self._make_layer_slow(block, 128, layers[1], stride=2, head_conv=1) + self.slow_res3 = self._make_layer_slow(block, 256, layers[2], stride=2, head_conv=3) + self.slow_res4 = self._make_layer_slow(block, 512, layers[3], head_conv=3, dilation=2) + + + + '''Lateral Connections''' + fuse_padding = fuse_kernel_size // 2 + fuse_kwargs = {'kernel_size': (fuse_kernel_size, 1, 1), 'stride': (alpha, 1, 1), 'padding': (fuse_padding, 0, 0), 'bias': False} + if fuse_only_conv: + def fuse_func(in_channels, out_channels): + return nn.Conv3d(in_channels, out_channels, 
**fuse_kwargs) + else: + def fuse_func(in_channels, out_channels): + return nn.Sequential( + nn.Conv3d(in_channels, out_channels, **fuse_kwargs), + BN(out_channels), + nn.ReLU(inplace=True) + ) + self.Tconv1 = fuse_func(int(64 * beta), int(128 * beta)) + self.Tconv2 = fuse_func(int(256 * beta), int(512 * beta)) + self.Tconv3 = fuse_func(int(512 * beta), int(1024 * beta)) + self.Tconv4 = fuse_func(int(1024 * beta), int(2048 * beta)) + # for input in []: + # self.slow_conv1_1 = nn.Conv3d(3, self.fast_inplanes, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False) + self.pool2 = nn.MaxPool3d(kernel_size=( + 2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + def _upsample(self, x, y): + _, _, t, h, w = y.size() + # print('spatial', x.shape, y.shape) + x_upsampled = F.interpolate(x, [t, h, w], mode='nearest') + + return x_upsampled + + + def forward(self, input): + fast, Tc = self.FastPath(input) + # print('alpha',self.alpha) + + + if self.slow_full_span: + slow_input = torch.index_select( + input, + 2, + torch.linspace( + 0, + input.shape[2] - 1, + input.shape[2] // self.alpha, + ).long().cuda(), + ) + else: + slow_input = input[:, :, ::self.alpha, :, :] + slow = self.SlowPath(slow_input, Tc) + + + fast[0] = self.pool2(fast[0]) + fast[1] = self.pool2(fast[1]) + fast[2] = self.pool2(fast[2]) + + outFeat = [] + for sitem,fitem in zip(slow,fast): + outFeat.append(torch.cat((sitem,fitem),1)) + # print(outFeat[-1].shape) + return outFeat + + def SlowPath(self, input, Tc): + # print('slowinpdi',input.shape) + x = self.slow_conv1(input) + x = self.slow_bn1(x) + x = self.slow_relu(x) + x = self.slow_maxpool(x) + # print('x',x.shape) + x = torch.cat([x, Tc[0]], dim=1) + x = self.slow_res1(x) + x = torch.cat([x, Tc[1]], dim=1) + c3 = self.slow_res2(x) + x = torch.cat([c3, Tc[2]], dim=1) + c4 = self.slow_res3(x) + x = torch.cat([c4, Tc[3]], dim=1) + c5 = self.slow_res4(x) + + return [c3,c4,c5] + + def FastPath(self, input): + x = self.fast_conv1(input) + x = self.fast_bn1(x) + x = self.fast_relu(x) + x = self.fast_maxpool(x) + Tc1 = self.Tconv1(x) + x = self.fast_res1(x) + Tc2 = self.Tconv2(x) + c3 = self.fast_res2(x) + Tc3 = self.Tconv3(c3) + c4 = self.fast_res3(c3) + Tc4 = self.Tconv4(c4) + c5 = self.fast_res4(c4) + return [c3,c4,c5], [Tc1, Tc2, Tc3, Tc4] + + def _make_layer_fast(self, block, planes, blocks, stride=1, head_conv=1, dilation=1): + downsample = None + if stride != 1 or self.fast_inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv3d( + self.fast_inplanes, + planes * block.expansion, + kernel_size=1, + stride=(1, stride, stride), + bias=False + ) + ) + + layers = [] + layers.append(block(self.fast_inplanes, planes, stride, downsample, dilation=dilation, head_conv=head_conv)) + self.fast_inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.fast_inplanes, planes, dilation=dilation, head_conv=head_conv)) + + return nn.Sequential(*layers) + + def _make_layer_slow(self, block, planes, blocks, stride=1, head_conv=1, dilation=1): + downsample = None + fused_inplanes = self.slow_inplanes + int(self.slow_inplanes * self.beta) * 2 + if stride != 1 or fused_inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv3d( + fused_inplanes, + planes * block.expansion, + kernel_size=1, + stride=(1, stride, stride), + bias=False + ) + ) + + layers = [] + layers.append(block(fused_inplanes, planes, stride, downsample, dilation=dilation, head_conv=head_conv)) + self.slow_inplanes = planes * block.expansion + for 
i in range(1, blocks): + layers.append(block(self.slow_inplanes, planes, dilation=dilation, head_conv=head_conv)) + + return nn.Sequential(*layers) + + +def slowfast50(**kwargs): + """Constructs a SlowFast-50 model. + """ + model = SlowFast(Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +def slowfast101(**kwargs): + """Constructs a SlowFast-101 model. + """ + model = SlowFast(Bottleneck, [3, 4, 23, 3], **kwargs) + return model + + +def slowfast152(**kwargs): + """Constructs a SlowFast-152 model. + """ + model = SlowFast(Bottleneck, [3, 8, 36, 3], **kwargs) + return model + + +def slowfast200(**kwargs): + """Constructs a SlowFast-200 model. + """ + model = SlowFast(Bottleneck, [3, 24, 36, 3], **kwargs) + return model diff --git a/models/heads/__init__.py b/models/heads/__init__.py new file mode 100755 index 0000000..1009d70 --- /dev/null +++ b/models/heads/__init__.py @@ -0,0 +1,17 @@ +import torch.nn as nn + +from .linear import * +from .acar import * + + +def model_entry(config): + return globals()[config['type']](**config['kwargs']) + + +class AVA_head(nn.Module): + def __init__(self, config): + super(AVA_head, self).__init__() + self.module = model_entry(config) + + def forward(self, data): + return self.module(data) diff --git a/models/heads/acar.py b/models/heads/acar.py new file mode 100755 index 0000000..d35a423 --- /dev/null +++ b/models/heads/acar.py @@ -0,0 +1,151 @@ +import math + +import torch +import torch.nn as nn +import torchvision + +__all__ = ['acar'] + + +class HR2O_NL(nn.Module): + def __init__(self, hidden_dim=512, kernel_size=3, mlp_1x1=False): + super(HR2O_NL, self).__init__() + + self.hidden_dim = hidden_dim + + padding = kernel_size // 2 + self.conv_q = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + self.conv_k = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + self.conv_v = nn.Conv2d(hidden_dim, hidden_dim, kernel_size, padding=padding, bias=False) + + self.conv = nn.Conv2d( + hidden_dim, hidden_dim, + 1 if mlp_1x1 else kernel_size, + padding=0 if mlp_1x1 else padding, + bias=False + ) + self.norm = nn.GroupNorm(1, hidden_dim, affine=True) + self.dp = nn.Dropout(0.2) + + def forward(self, x): + query = self.conv_q(x).unsqueeze(1) + key = self.conv_k(x).unsqueeze(0) + att = (query * key).sum(2) / (self.hidden_dim ** 0.5) + att = nn.Softmax(dim=1)(att) + value = self.conv_v(x) + virt_feats = (att.unsqueeze(2) * value).sum(1) + + virt_feats = self.norm(virt_feats) + virt_feats = nn.functional.relu(virt_feats) + virt_feats = self.conv(virt_feats) + virt_feats = self.dp(virt_feats) + + x = x + virt_feats + return x + + +class ACARHead(nn.Module): + def __init__(self, width, roi_spatial=7, num_classes=60, dropout=0., bias=False, + reduce_dim=1024, hidden_dim=512, downsample='max2x2', depth=2, + kernel_size=3, mlp_1x1=False): + super(ACARHead, self).__init__() + + self.roi_spatial = roi_spatial + self.roi_maxpool = nn.MaxPool2d(roi_spatial) + + # actor-context feature encoder + self.conv_reduce = nn.Conv2d(width, reduce_dim, 1, bias=False) + + self.conv1 = nn.Conv2d(reduce_dim * 2, hidden_dim, 1, bias=False) + self.conv2 = nn.Conv2d(hidden_dim, hidden_dim, 3, bias=False) + + # down-sampling before HR2O + assert downsample in ['none', 'max2x2'] + if downsample == 'none': + self.downsample = nn.Identity() + elif downsample == 'max2x2': + self.downsample = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # high-order relation reasoning operator (HR2O_NL) + layers = [] + for _ in range(depth): + 
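+            # each HR2O_NL block (defined above) builds query/key/value maps
+            # with small convolutions, attends across the rois of a clip via
+            # softmax(QK / sqrt(hidden_dim)), and adds the attended "virtual"
+            # features back to its input; stacking `depth` blocks gives
+            # higher-order actor-actor relations.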
layers.append(HR2O_NL(hidden_dim, kernel_size, mlp_1x1)) + self.hr2o = nn.Sequential(*layers) + + # classification + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Linear(reduce_dim, hidden_dim, bias=False) + self.fc2 = nn.Linear(hidden_dim * 2, num_classes, bias=bias) + + if dropout > 0: + self.dp = nn.Dropout(dropout) + else: + self.dp = None + + # data: features, rois, num_rois, roi_ids, sizes_before_padding + # returns: outputs + def forward(self, data): + if not isinstance(data['features'], list): + feats = [data['features']] + else: + feats = data['features'] + + # temporal average pooling + h, w = feats[0].shape[3:] + # requires all features have the same spatial dimensions + feats = [nn.AdaptiveAvgPool3d((1, h, w))(f).view(-1, f.shape[1], h, w) for f in feats] + feats = torch.cat(feats, dim=1) + + feats = self.conv_reduce(feats) + + rois = data['rois'] + rois[:, 1] = rois[:, 1] * w + rois[:, 2] = rois[:, 2] * h + rois[:, 3] = rois[:, 3] * w + rois[:, 4] = rois[:, 4] * h + rois = rois.detach() + roi_feats = torchvision.ops.roi_align(feats, rois, (self.roi_spatial, self.roi_spatial)) + roi_feats = self.roi_maxpool(roi_feats).view(data['num_rois'], -1) + + roi_ids = data['roi_ids'] + sizes_before_padding = data['sizes_before_padding'] + high_order_feats = [] + for idx in range(feats.shape[0]): # iterate over mini-batch + n_rois = roi_ids[idx+1] - roi_ids[idx] + if n_rois == 0: + continue + + eff_h, eff_w = math.ceil(h * sizes_before_padding[idx][1]), math.ceil(w * sizes_before_padding[idx][0]) + bg_feats = feats[idx][:, :eff_h, :eff_w] + bg_feats = bg_feats.unsqueeze(0).repeat((n_rois, 1, 1, 1)) + actor_feats = roi_feats[roi_ids[idx]:roi_ids[idx+1]] + tiled_actor_feats = actor_feats.unsqueeze(2).unsqueeze(2).expand_as(bg_feats) + interact_feats = torch.cat([bg_feats, tiled_actor_feats], dim=1) + + interact_feats = self.conv1(interact_feats) + interact_feats = nn.functional.relu(interact_feats) + interact_feats = self.conv2(interact_feats) + interact_feats = nn.functional.relu(interact_feats) + + interact_feats = self.downsample(interact_feats) + + interact_feats = self.hr2o(interact_feats) + interact_feats = self.gap(interact_feats) + high_order_feats.append(interact_feats) + + high_order_feats = torch.cat(high_order_feats, dim=0).view(data['num_rois'], -1) + + outputs = self.fc1(roi_feats) + outputs = nn.functional.relu(outputs) + outputs = torch.cat([outputs, high_order_feats], dim=1) + + if self.dp is not None: + outputs = self.dp(outputs) + outputs = self.fc2(outputs) + + return {'outputs': outputs} + + +def acar(**kwargs): + model = ACARHead(**kwargs) + return model diff --git a/models/heads/linear.py b/models/heads/linear.py new file mode 100755 index 0000000..846cf8c --- /dev/null +++ b/models/heads/linear.py @@ -0,0 +1,57 @@ +import torch +import torch.nn as nn +import torchvision + +__all__ = ['linear'] + + +class LinearHead(nn.Module): + def __init__(self, width, roi_spatial=7, num_classes=60, dropout=0., bias=False): + super(LinearHead, self).__init__() + + self.roi_spatial = roi_spatial + self.roi_maxpool = nn.MaxPool2d(roi_spatial) + + self.fc = nn.Linear(width, num_classes, bias=bias) + + if dropout > 0: + self.dp = nn.Dropout(dropout) + else: + self.dp = None + + # data: features, rois + # returns: outputs + def forward(self, data): + if not isinstance(data['features'], list): + features = [data['features']] + else: + features = data['features'] + + roi_features = [] + for f in features: + sp = f.shape + h, w = sp[3:] + feats = nn.AdaptiveAvgPool3d((1, h, 
w))(f).view(-1, sp[1], h, w) + + rois = data['rois'].clone() + rois[:, 1] = rois[:, 1] * w + rois[:, 2] = rois[:, 2] * h + rois[:, 3] = rois[:, 3] * w + rois[:, 4] = rois[:, 4] * h + rois = rois.detach() + roi_feats = torchvision.ops.roi_align(feats, rois, (self.roi_spatial, self.roi_spatial)) + roi_feats = self.roi_maxpool(roi_feats).view(-1, sp[1]) + + roi_features.append(roi_feats) + + roi_features = torch.cat(roi_features, dim=1) + if self.dp is not None: + roi_features = self.dp(roi_features) + outputs = self.fc(roi_features) + + return {'outputs': outputs} + + +def linear(**kwargs): + model = LinearHead(**kwargs) + return model \ No newline at end of file diff --git a/models/necks/__init__.py b/models/necks/__init__.py new file mode 100755 index 0000000..b1ddc43 --- /dev/null +++ b/models/necks/__init__.py @@ -0,0 +1,16 @@ +import torch.nn as nn + +from .basic import * + + +def model_entry(config): + return globals()[config['type']](**config['kwargs']) + + +class AVA_neck(nn.Module): + def __init__(self, config): + super(AVA_neck, self).__init__() + self.module = model_entry(config) + + def forward(self, data): + return self.module(data) diff --git a/models/necks/basic.py b/models/necks/basic.py new file mode 100755 index 0000000..49a67c2 --- /dev/null +++ b/models/necks/basic.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn + +from .utils import bbox_jitter, get_bbox_after_aug + +__all__ = ['basic'] + + +class BasicNeck(nn.Module): + def __init__(self, aug_threshold=0., bbox_jitter=None, num_classes=60, multi_class=True): + super(BasicNeck, self).__init__() + + # threshold on preserved ratio of bboxes after cropping augmentation + self.aug_threshold = aug_threshold + # config for bbox jittering + self.bbox_jitter = bbox_jitter + + self.num_classes = num_classes + self.multi_class = multi_class + + # data: aug_info, labels, filenames, mid_times + # returns: num_rois, rois, roi_ids, targets, sizes_before_padding, filenames, mid_times, bboxes, bbox_ids + def forward(self, data): + rois, roi_ids, targets, sizes_before_padding, filenames, mid_times = [], [0], [], [], [], [] + bboxes, bbox_ids = [], [] # used for multi-crop fusion + + cur_bbox_id = -1 # record current bbox no. + for idx in range(len(data['aug_info'])): + aug_info = data['aug_info'][idx] + pad_ratio = aug_info['pad_ratio'] + sizes_before_padding.append([1. / pad_ratio[0], 1. 
/ pad_ratio[1]])
+
+            for label in data['labels'][idx]:
+                cur_bbox_id += 1
+                if self.training and self.bbox_jitter is not None:
+                    bbox_list = bbox_jitter(label['bounding_box'],
+                                            self.bbox_jitter.get('num', 1),
+                                            self.bbox_jitter.scale)
+                else:
+                    # no bbox jittering during evaluation
+                    bbox_list = [label['bounding_box']]
+
+                for b in bbox_list:
+                    bbox = get_bbox_after_aug(aug_info, b, self.aug_threshold)
+                    if bbox is None:
+                        continue
+                    rois.append([idx] + bbox)
+
+                    filenames.append(data['filenames'][idx])
+                    mid_times.append(data['mid_times'][idx])
+                    bboxes.append(label['bounding_box'])
+                    bbox_ids.append(cur_bbox_id)
+
+                    if self.multi_class:
+                        ret = torch.zeros(self.num_classes)
+                        ret.put_(torch.LongTensor(label['label']),
+                                 torch.ones(len(label['label'])))
+                    else:
+                        ret = torch.LongTensor(label['label'])
+                    targets.append(ret)
+
+            roi_ids.append(len(rois))
+
+        num_rois = len(rois)
+        if num_rois == 0:
+            return {'num_rois': 0, 'rois': None, 'roi_ids': roi_ids, 'targets': None,
+                    'sizes_before_padding': sizes_before_padding,
+                    'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes, 'bbox_ids': bbox_ids}
+
+        rois = torch.FloatTensor(rois).cuda()
+        targets = torch.stack(targets, dim=0).cuda()
+        return {'num_rois': num_rois, 'rois': rois, 'roi_ids': roi_ids, 'targets': targets,
+                'sizes_before_padding': sizes_before_padding,
+                'filenames': filenames, 'mid_times': mid_times, 'bboxes': bboxes, 'bbox_ids': bbox_ids}
+
+
+def basic(**kwargs):
+    model = BasicNeck(**kwargs)
+    return model
diff --git a/models/necks/utils.py b/models/necks/utils.py
new file mode 100755
index 0000000..87ea964
--- /dev/null
+++ b/models/necks/utils.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+
+def bbox_jitter(bbox, num, delta):
+    w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
+
+    if num == 1:
+        jitter = np.random.uniform(-delta, delta, 4)
+        # clamp the two min-coordinates to >= 0 and the two max-coordinates to <= 1
+        bboxes = [[max(bbox[0] + jitter[0] * w, 0.), max(bbox[1] + jitter[1] * h, 0.),
+                   min(bbox[2] + jitter[2] * w, 1.), min(bbox[3] + jitter[3] * h, 1.)]]
+
+        return bboxes
+
+    bboxes = [bbox]
+    jitter = np.random.uniform(-delta, delta, [num - 1, 4])
+    for i in range(num - 1):
+        bboxes.append([max(bbox[0] + jitter[i][0] * w, 0.), max(bbox[1] + jitter[i][1] * h, 0.),
+                       min(bbox[2] + jitter[i][2] * w, 1.), min(bbox[3] + jitter[i][3] * h, 1.)])
+    return bboxes
+
+
+def get_bbox_after_aug(aug_info, bbox, aug_threshold=0.3):
+    if aug_info is None:
+        return bbox
+
+    cbox = aug_info['crop_box']
+    w = cbox[2] - cbox[0]
+    h = cbox[3] - cbox[1]
+
+    l = max(min(bbox[0], cbox[2]), cbox[0])
+    r = max(min(bbox[2], cbox[2]), cbox[0])
+    t = max(min(bbox[1], cbox[3]), cbox[1])
+    b = max(min(bbox[3], cbox[3]), cbox[1])
+
+    if (b-t) * (r-l) <= (bbox[3]-bbox[1]) * (bbox[2]-bbox[0]) * aug_threshold:
+        return None
+    ret = [(l-cbox[0]) / w, (t-cbox[1]) / h, (r-cbox[0]) / w, (b-cbox[1]) / h]
+
+    if aug_info['flip']:
+        ret = [1. - ret[2], ret[1], 1.
- ret[0], ret[3]] + + pad_ratio = aug_info['pad_ratio'] + ret = [ret[0] / pad_ratio[0], ret[1] / pad_ratio[1], ret[2] / pad_ratio[0], ret[3] / pad_ratio[1]] + + return ret diff --git a/models/resnetFPN.py b/models/resnetFPN.py index 6b294ed..079f424 100644 --- a/models/resnetFPN.py +++ b/models/resnetFPN.py @@ -8,6 +8,12 @@ import modules.utils as lutils +import yaml +from easydict import EasyDict +from .backbones import AVA_backbone + + + logger = lutils.get_logger(__name__) ### Download weights from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py @@ -24,8 +30,21 @@ def conv1x1(in_channel, out_channel): class ResNetFPN(nn.Module): def __init__(self, block, args): + self.inplanes = 64 super(ResNetFPN, self).__init__() + + + if args.MODEL_TYPE.startswith('SlowFast'): + with open('configs/ROAD.yml') as f: + config = yaml.load(f, Loader=yaml.FullLoader) + opt = EasyDict(config) + print(opt) + self.inplanes = 64 + super(ResNetFPN, self).__init__() + self.backbone = AVA_backbone(opt) + + self.MODEL_TYPE = args.MODEL_TYPE num_blocks = args.model_perms non_local_inds = args.non_local_inds @@ -60,19 +79,34 @@ def __init__(self, block, args): #self.avgpool = nn.AvgPool2d(7, stride=1) #self.fc = nn.Linear(512 * block.expansion, num_classes) - self.conv6 = conv3x3(512 * block.expansion, 256, stride=2, padding=1) # P6 - self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 + if self.MODEL_TYPE == 'SlowFast': + self.conv6 = conv3x3(2304, 256, stride=2, padding=1) # P6 + self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 - self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) - self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) + self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) - self.lateral_layer1 = conv1x1(512 * block.expansion, 256) - self.lateral_layer2 = conv1x1(256 * block.expansion, 256) - self.lateral_layer3 = conv1x1(128 * block.expansion, 256) - - self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 - self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 - self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 + self.lateral_layer1 = conv1x1(2304, 256) + self.lateral_layer2 = conv1x1(1152, 256) + self.lateral_layer3 = conv1x1(576, 256) + + self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 + else: + self.conv6 = conv3x3(512 * block.expansion, 256, stride=2, padding=1) # P6 + self.conv7 = conv3x3(256, 256, stride=2, padding=1) # P7 + + self.ego_lateral = conv3x3(512 * block.expansion, 256, stride=2, padding=0) + self.avg_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) + + self.lateral_layer1 = conv1x1(512 * block.expansion, 256) + self.lateral_layer2 = conv1x1(256 * block.expansion, 256) + self.lateral_layer3 = conv1x1(128 * block.expansion, 256) + + self.corr_layer1 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer2 = conv3x3(256, 256, stride=1, padding=1) # P4 + self.corr_layer3 = conv3x3(256, 256, stride=1, padding=1) # P3 for m in self.modules(): @@ -139,49 +173,59 @@ def _make_layer(self, block, planes, num_blocks, stride=1, temp_kernals=[], nl_i return nn.Sequential(*layers) def forward(self, x): - # pdb.set_trace() - # print('input shape', x.shape) - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.pool1(x) - # print('p1 ', x.shape) - x = self.layer1(x) - if 
self.pool2 is not None: - x = self.pool2(x) - # print('p2 shape ', x.shape) - c3 = self.layer2(x) - c4 = self.layer3(c3) - c5 = self.layer4(c4) - - # ego_feat = self.ego_lateral(c5) - # print(sources[-1].shape) - - - p5 = self.lateral_layer1(c5) - p5_upsampled = self._upsample(p5, c4) - p5 = self.corr_layer1(p5) - - p4 = self.lateral_layer2(c4) - p4 = p5_upsampled + p4 - p4_upsampled = self._upsample(p4, c3) - p4 = self.corr_layer2(p4) - - p3 = self.lateral_layer3(c3) - p3 = p4_upsampled + p3 - p3 = self.corr_layer3(p3) - - p6 = self.conv6(c5) - p7 = self.conv7(F.relu(p6)) - features = [p3, p4, p5, p6, p7] - - ego_feat = self.avg_pool(p7) - if self.pool2 is not None: - for i in range(len(features)): - features[i] = self._upsample_time(features[i]) - ego_feat = self._upsample_time(ego_feat) - + if self.MODEL_TYPE.startswith('SlowFast'): + ff = self.backbone(x) + c3 = ff[0] + c4 = ff[1] + c5 = ff[2] + p5 = self.lateral_layer1(c5) + p5_upsampled = self._upsample(p5, c4) + p5 = self.corr_layer1(p5) + p4 = self.lateral_layer2(c4) + p4 = p5_upsampled + p4 + p4_upsampled = self._upsample(p4, c3) + p4 = self.corr_layer2(p4) + p3 = self.lateral_layer3(c3) + p3 = p4_upsampled + p3 + p3 = self.corr_layer3(p3) + p6 = self.conv6(c5) + p7 = self.conv7(F.relu(p6)) + features = [p3, p4, p5, p6, p7] + ego_feat = self.avg_pool(p7) + if self.pool2 is not None: + for i in range(len(features)): + features[i] = self._upsample_time(features[i]) + ego_feat = self._upsample_time(ego_feat) + else: + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.pool1(x) + x = self.layer1(x) + if self.pool2 is not None: + x = self.pool2(x) + c3 = self.layer2(x) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + p5 = self.lateral_layer1(c5) + p5_upsampled = self._upsample(p5, c4) + p5 = self.corr_layer1(p5) + p4 = self.lateral_layer2(c4) + p4 = p5_upsampled + p4 + p4_upsampled = self._upsample(p4, c3) + p4 = self.corr_layer2(p4) + p3 = self.lateral_layer3(c3) + p3 = p4_upsampled + p3 + p3 = self.corr_layer3(p3) + p6 = self.conv6(c5) + p7 = self.conv7(F.relu(p6)) + features = [p3, p4, p5, p6, p7] + ego_feat = self.avg_pool(p7) + if self.pool2 is not None: + for i in range(len(features)): + features[i] = self._upsample_time(features[i]) + ego_feat = self._upsample_time(ego_feat) return features, ego_feat @@ -290,5 +334,7 @@ def resnetfpn(args): return ResNetFPN(BottleneckRCLSTM, args) elif model_type.startswith('RCGRU'): return ResNetFPN(BottleneckRCGRU, args) + elif model_type.startswith('SlowFast'): + return ResNetFPN(BottleneckI3D, args) else: raise RuntimeError('Define the model type correctly:: ' + model_type) \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100755 index 0000000..4085bef --- /dev/null +++ b/utils.py @@ -0,0 +1,208 @@ +import os +import csv +import logging +import math +import random + +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + +_LOGGER = None + + +def get_rank(): + return dist.get_rank() + + +def get_world_size(): + return dist.get_world_size() + + +def mkdir(path): + os.makedirs(path, exist_ok=True) + + +def random_seed(seed_value): + np.random.seed(seed_value) + torch.manual_seed(seed_value) + random.seed(seed_value) + os.environ['PYTHONHASHSEED'] = str(seed_value) + torch.cuda.manual_seed(seed_value) + torch.cuda.manual_seed_all(seed_value) + + +def parameters_string(module): + lines = [ + "", + "List of model parameters:", + "=" * 105, + ] + + row_format = "{name:<60} {shape:>27} 
={total_size:>15,d}"
+    params = list(module.named_parameters())
+    for name, param in params:
+        lines.append(row_format.format(
+            name=name,
+            shape=" * ".join(str(p) for p in param.size()),
+            total_size=param.numel()
+        ))
+    lines.append("=" * 105)
+    lines.append(row_format.format(
+        name="all parameters",
+        shape="sum of above",
+        total_size=sum(int(param.numel()) for name, param in params)
+    ))
+    lines.append("")
+    return "\n".join(lines)
+
+
+def create_logger(log_file, level=logging.INFO):
+    global _LOGGER
+    if _LOGGER is not None:
+        return _LOGGER
+    l = logging.getLogger('global')
+    formatter = logging.Formatter('[%(asctime)s][%(filename)15s][line:%(lineno)4d][%(levelname)8s] %(message)s')
+    fh = logging.FileHandler(log_file)
+    fh.setFormatter(formatter)
+    sh = logging.StreamHandler()
+    sh.setFormatter(formatter)
+    l.setLevel(level)
+    l.addHandler(fh)
+    l.addHandler(sh)
+    l.propagate = False
+    _LOGGER = l
+    return l
+
+
+def get_logger():
+    return _LOGGER
+
+
+class Logger(object):
+
+    def __init__(self, path, header):
+        self.log_file = open(path, 'w')
+        self.logger = csv.writer(self.log_file, delimiter='\t')
+
+        self.logger.writerow(header)
+        self.header = header
+
+    def __del__(self):
+        self.log_file.close()
+
+    def log(self, values):
+        write_values = []
+        for col in self.header:
+            assert col in values
+            write_values.append(values[col])
+
+        self.logger.writerow(write_values)
+        self.log_file.flush()
+
+
+class AverageMeter(object):
+    def __init__(self, length=0):
+        self.length = length
+        self.reset()
+
+    def reset(self):
+        if self.length > 0:
+            self.history, self.history_num = [], []
+        else:
+            self.count = 0
+            self.sum = 0.0
+        self.val = 0.0
+        self.avg = 0.0
+
+    def update(self, val, num=1):
+        assert num > 0
+        if self.length > 0:
+            self.history.append(val * num)
+            self.history_num.append(num)
+            if len(self.history) > self.length:
+                del self.history[0]
+                del self.history_num[0]
+
+            self.val = val
+            self.avg = np.sum(self.history) / np.sum(self.history_num)
+        else:
+            self.val = val
+            self.sum += val * num
+            self.count += num
+            self.avg = self.sum / self.count
+
+
+class DistributedSampler(Sampler):
+    def __init__(self, dataset, world_size=None, rank=None, round_down=False):
+        if world_size is None:
+            world_size = get_world_size()
+        if rank is None:
+            rank = get_rank()
+        self.dataset = dataset
+        self.world_size = world_size
+        self.rank = rank
+        self.round_down = round_down
+        self.epoch = 0
+
+        self.total_size = len(self.dataset)
+        if self.round_down:
+            self.num_samples = int(math.floor(len(self.dataset) / self.world_size))
+        else:
+            self.num_samples = int(math.ceil(len(self.dataset) / self.world_size))
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        indices = list(torch.randperm(len(self.dataset), generator=g))
+
+        assert len(indices) == self.total_size
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        if self.round_down:
+            assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+def load_pretrain(pretrain_opt, net):
+    checkpoint = torch.load(pretrain_opt.path, map_location=lambda storage, loc: storage.cuda())
+    if pretrain_opt.get('state_dict_key', None) is not None:
+        checkpoint = checkpoint[pretrain_opt.state_dict_key]
+
+    if pretrain_opt.get('delete_prefix', None):
+        keys = set(checkpoint.keys())
+        for k in keys:
+            if
k.startswith(pretrain_opt.delete_prefix): + checkpoint.pop(k) + if pretrain_opt.get('replace_prefix', None) is not None: + keys = set(checkpoint.keys()) + for k in keys: + if k.startswith(pretrain_opt.replace_prefix): + new_k = pretrain_opt.get('replace_to', '') + k[len(pretrain_opt.replace_prefix):] + checkpoint[new_k] = checkpoint.pop(k) + net.load_state_dict(checkpoint, strict=False) + + # if get_rank() == 0: + # ckpt_keys = set(checkpoint.keys()) + # own_keys = set(net.state_dict().keys()) + # missing_keys = own_keys - ckpt_keys + # ignore_keys = ckpt_keys - own_keys + # loaded_keys = own_keys - missing_keys + + # logger = get_logger() + # for k in missing_keys: + # logger.info('Caution: missing key {}'.format(k)) + # for k in ignore_keys: + # logger.info('Caution: redundant key {}'.format(k)) + # logger.info('Loaded {} key(s) from pre-trained model at {}'.format(len(loaded_keys), pretrain_opt.path))
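A closing note on `load_pretrain`'s key-rewriting options: a typical use is stripping the `module.` prefix that `DistributedDataParallel` adds to checkpoint keys. The snippet below is a hypothetical usage sketch (the path and prefix are illustrative, not from this patch); the call itself is commented out because `load_pretrain` maps tensors onto the GPU:

```python
import torch
import torch.nn as nn
from easydict import EasyDict

# a checkpoint saved from a DistributedDataParallel wrapper carries
# 'module.'-prefixed keys that a bare module cannot load directly
net = nn.Linear(4, 2)
ckpt = {'module.weight': torch.zeros(2, 4), 'module.bias': torch.zeros(2)}
torch.save(ckpt, '/tmp/ddp_ckpt.pth.tar')

pretrain_opt = EasyDict({
    'path': '/tmp/ddp_ckpt.pth.tar',
    'replace_prefix': 'module.',
    'replace_to': '',  # rewrites 'module.weight' -> 'weight'
})
# load_pretrain(pretrain_opt, net)  # as defined above; requires CUDA
```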