From 362ca0a199d9d511fbc8caea9dbf371589cedc9b Mon Sep 17 00:00:00 2001
From: Hugo Serrat <hu.serrat@gmail.com>
Date: Tue, 17 May 2016 13:28:18 +0200
Subject: [PATCH 01/14] Fast and Faster R-CNN

This commit is a port from the following [fork](https://github.com/rbgirshick/caffe-fast-rcnn/tree/0dcd397b29507b8314e252e850518c5695efbb83)

It adds:
 - smooth l1 loss layer
 - roi pooling layer
 - dropout scaling at test time (needed for MSRA-trained ZF network)

LICENSE :
Faster R-CNN

The MIT License (MIT)

Copyright (c) 2015 Microsoft Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
---
 include/caffe/layer.hpp                       |   2 +
 include/caffe/layers/dropout_layer.hpp        |   1 +
 include/caffe/layers/roi_pooling_layer.hpp    |  58 ++++++
 include/caffe/layers/smooth_l1_loss_layer.hpp |  65 +++++++
 python/caffe/__init__.py                      |   2 +-
 python/caffe/_caffe.cpp                       |   1 +
 src/caffe/layers/dropout_layer.cpp            |  27 ++-
 src/caffe/layers/dropout_layer.cu             |  35 +++-
 src/caffe/layers/roi_pooling_layer.cpp        | 136 +++++++++++++
 src/caffe/layers/roi_pooling_layer.cu         | 184 ++++++++++++++++++
 src/caffe/layers/smooth_l1_loss_layer.cpp     |  65 +++++++
 src/caffe/layers/smooth_l1_loss_layer.cu      | 116 +++++++++++
 src/caffe/proto/caffe.proto                   |  21 ++
 src/caffe/test/test_roi_pooling_layer.cpp     |  93 +++++++++
 src/caffe/test/test_smooth_l1_loss_layer.cpp  |  85 ++++++++
 15 files changed, 879 insertions(+), 12 deletions(-)
 create mode 100644 include/caffe/layers/roi_pooling_layer.hpp
 create mode 100644 include/caffe/layers/smooth_l1_loss_layer.hpp
 create mode 100644 src/caffe/layers/roi_pooling_layer.cpp
 create mode 100644 src/caffe/layers/roi_pooling_layer.cu
 create mode 100644 src/caffe/layers/smooth_l1_loss_layer.cpp
 create mode 100644 src/caffe/layers/smooth_l1_loss_layer.cu
 create mode 100644 src/caffe/test/test_roi_pooling_layer.cpp
 create mode 100644 src/caffe/test/test_smooth_l1_loss_layer.cpp
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 10f353f94f9..ddf882348f7 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -316,6 +316,8 @@ class Layer {
     param_propagate_down_[param_id] = value;
   }
 
+  /// @brief Returns the phase (e.g. TRAIN or TEST) stored in phase_.
+  inline Phase phase() const { return phase_; }
 
  protected:
   /** The protobuf that stores the layer parameters */
diff --git a/include/caffe/layers/dropout_layer.hpp b/include/caffe/layers/dropout_layer.hpp
index e83143bc3cc..fc19a392101 100644
--- a/include/caffe/layers/dropout_layer.hpp
+++ b/include/caffe/layers/dropout_layer.hpp
@@ -73,6 +73,7 @@ class DropoutLayer : public NeuronLayer<Dtype> {
   /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
   Dtype scale_;
   unsigned int uint_thres_;
+  bool scale_train_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/roi_pooling_layer.hpp b/include/caffe/layers/roi_pooling_layer.hpp
new file mode 100644
index 00000000000..e9e39ad4f26
--- /dev/null
+++ b/include/caffe/layers/roi_pooling_layer.hpp
@@ -0,0 +1,58 @@
+#ifndef CAFFE_ROI_POOLING_LAYER_HPP_
+#define CAFFE_ROI_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief ROIPoolingLayer - Region of Interest Pooling Layer.
+ *
+ * Max-pools every ROI in bottom[1] (rows of [batch_index x1 y1 x2 y2]) from
+ * the feature map bottom[0] into a pooled_h x pooled_w grid per channel.
+ * From Fast R-CNN, written by Ross Girshick.
+ */
+template <typename Dtype>
+class ROIPoolingLayer : public Layer<Dtype> {
+ public:
+  explicit ROIPoolingLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "ROIPooling"; }
+
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int channels_;         // channels of bottom[0]
+  int height_;           // height of bottom[0]
+  int width_;            // width of bottom[0]
+  int pooled_height_;    // pooled_h from ROIPoolingParameter
+  int pooled_width_;     // pooled_w from ROIPoolingParameter
+  Dtype spatial_scale_;  // multiplies ROI coordinates before rounding
+  Blob<int> max_idx_;    // per-output argmax (h * width + w), -1 if none
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ROI_POOLING_LAYER_HPP_
diff --git a/include/caffe/layers/smooth_l1_loss_layer.hpp b/include/caffe/layers/smooth_l1_loss_layer.hpp
new file mode 100644
index 00000000000..44082558077
--- /dev/null
+++ b/include/caffe/layers/smooth_l1_loss_layer.hpp
@@ -0,0 +1,65 @@
+#ifndef CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_
+#define CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief SmoothL1LossLayer - smooth L1 loss between bottom[0] and bottom[1].
+ *
+ * Optional bottom[2] / bottom[3] are element-wise "inside" / "outside"
+ * weights (both or neither). From Fast R-CNN, written by Ross Girshick.
+ */
+template <typename Dtype>
+class SmoothL1LossLayer : public LossLayer<Dtype> {
+ public:
+  explicit SmoothL1LossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), diff_() {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "SmoothL1Loss"; }
+
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 4; }
+
+  /**
+   * Unlike most loss layers, in the SmoothL1LossLayer we can backpropagate
+   * to both inputs -- override to return true and always allow force_backward.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return true;
+  }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> diff_;    // w_in * (b0 - b1); overwritten by the GPU backward
+  Blob<Dtype> errors_;  // per-element loss values
+  Blob<Dtype> ones_;    // all ones; dotted with errors_ to sum them
+  bool has_weights_;    // true when the weight bottoms are present
+  Dtype sigma2_;        // sigma squared, from SmoothL1LossParameter
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index e2881b89c1b..35868a403a3 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,5 @@
 from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
-from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list
+from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed
 from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index dfcfd0db0c4..9ca9378c197 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -284,6 +284,7 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::def("set_mode_cpu", &set_mode_cpu);
   bp::def("set_mode_gpu", &set_mode_gpu);
   bp::def("set_device", &Caffe::SetDevice);
+  bp::def("set_random_seed", &Caffe::set_random_seed);
 
   bp::def("layer_type_list", &LayerRegistry<Dtype>::LayerTypeList);
 
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 533ab26c04d..cedd4a14f4c 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -16,6 +16,7 @@ void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   DCHECK(threshold_ < 1.);
   scale_ = 1. / (1. - threshold_);
   uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
+  scale_train_ = this->layer_param_.dropout_param().scale_train();
 }
 
 template <typename Dtype>
@@ -37,11 +38,20 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   if (this->phase_ == TRAIN) {
     // Create random numbers
     caffe_rng_bernoulli(count, 1. - threshold_, mask);
-    for (int i = 0; i < count; ++i) {
-      top_data[i] = bottom_data[i] * mask[i] * scale_;
+    if (scale_train_) {
+      for (int i = 0; i < count; ++i) {
+        top_data[i] = bottom_data[i] * mask[i] * scale_;
+      }
+    } else {
+      for (int i = 0; i < count; ++i) {
+        top_data[i] = bottom_data[i] * mask[i];
+      }
     }
   } else {
     caffe_copy(bottom[0]->count(), bottom_data, top_data);
+    if (!scale_train_) {
+      caffe_scal<Dtype>(count, 1. / scale_, top_data);
+    }
   }
 }
 
@@ -55,11 +65,20 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     if (this->phase_ == TRAIN) {
       const unsigned int* mask = rand_vec_.cpu_data();
       const int count = bottom[0]->count();
-      for (int i = 0; i < count; ++i) {
-        bottom_diff[i] = top_diff[i] * mask[i] * scale_;
+      if (scale_train_) {
+        for (int i = 0; i < count; ++i) {
+          bottom_diff[i] = top_diff[i] * mask[i] * scale_;
+        }
+      } else {
+        for (int i = 0; i < count; ++i) {
+          bottom_diff[i] = top_diff[i] * mask[i];
+        }
       }
     } else {
       caffe_copy(top[0]->count(), top_diff, bottom_diff);
+      if (!scale_train_) {
+        caffe_scal<Dtype>(top[0]->count(), 1. / scale_, bottom_diff);
+      }
     }
   }
 }
diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu
index 186c10ca489..f9f93536c5b 100644
--- a/src/caffe/layers/dropout_layer.cu
+++ b/src/caffe/layers/dropout_layer.cu
@@ -25,12 +25,23 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
         static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
     caffe_gpu_rng_uniform(count, mask);
     // set thresholds
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, mask, uint_thres_, scale_, top_data);
+    if (scale_train_) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count),
+        CAFFE_CUDA_NUM_THREADS>>>(
+          count, bottom_data, mask, uint_thres_, scale_, top_data);
+    } else {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count),
+        CAFFE_CUDA_NUM_THREADS>>>(
+          count, bottom_data, mask, uint_thres_, 1.f, top_data);
+    }
     CUDA_POST_KERNEL_CHECK;
   } else {
     caffe_copy(count, bottom_data, top_data);
+    if (!scale_train_) {
+      caffe_gpu_scal<Dtype>(count, 1. / scale_, top_data);
+    }
   }
 }
 
@@ -54,13 +65,23 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const unsigned int* mask =
           static_cast<const unsigned int*>(rand_vec_.gpu_data());
       const int count = bottom[0]->count();
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-          count, top_diff, mask, uint_thres_, scale_, bottom_diff);
+      if (scale_train_) {
+        // NOLINT_NEXT_LINE(whitespace/operators)
+        DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
+          CAFFE_CUDA_NUM_THREADS>>>(
+            count, top_diff, mask, uint_thres_, scale_, bottom_diff);
+      } else {
+        // NOLINT_NEXT_LINE(whitespace/operators)
+        DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
+          CAFFE_CUDA_NUM_THREADS>>>(
+           count, top_diff, mask, uint_thres_, 1.f, bottom_diff);
+      }
       CUDA_POST_KERNEL_CHECK;
     } else {
       caffe_copy(top[0]->count(), top_diff, bottom_diff);
+      if (!scale_train_) {
+        caffe_gpu_scal<Dtype>(top[0]->count(), 1. / scale_, bottom_diff);
+      }
     }
   }
 }
diff --git a/src/caffe/layers/roi_pooling_layer.cpp b/src/caffe/layers/roi_pooling_layer.cpp
new file mode 100644
index 00000000000..3447dbf65ae
--- /dev/null
+++ b/src/caffe/layers/roi_pooling_layer.cpp
@@ -0,0 +1,136 @@
+#include <algorithm>
+#include <cfloat>
+#include <vector>
+
+#include "caffe/layers/roi_pooling_layer.hpp"
+
+using std::max;
+using std::min;
+using std::floor;
+using std::ceil;
+
+namespace caffe {
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // Pull the pooled output size and the ROI coordinate scale out of the
+  // layer's ROIPoolingParameter; both output dimensions must be positive.
+  const ROIPoolingParameter& param = this->layer_param_.roi_pooling_param();
+  CHECK_GT(param.pooled_h(), 0) << "pooled_h must be > 0";
+  CHECK_GT(param.pooled_w(), 0) << "pooled_w must be > 0";
+  pooled_height_ = param.pooled_h();
+  pooled_width_ = param.pooled_w();
+  spatial_scale_ = param.spatial_scale();
+  LOG(INFO) << "Spatial scale: " << spatial_scale_;
+}
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // Cache the feature map geometry; one pooled map is produced per ROI.
+  const int num_rois = bottom[1]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  top[0]->Reshape(num_rois, channels_, pooled_height_, pooled_width_);
+  max_idx_.Reshape(num_rois, channels_, pooled_height_, pooled_width_);
+}
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* bottom_rois = bottom[1]->cpu_data();
+  // Number of ROIs
+  int num_rois = bottom[1]->num();
+  int batch_size = bottom[0]->num();
+  int top_count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  caffe_set(top_count, Dtype(-FLT_MAX), top_data);  // seed of the running max
+  int* argmax_data = max_idx_.mutable_cpu_data();
+  caffe_set(top_count, -1, argmax_data);  // -1 means "no argmax recorded"
+
+  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
+  for (int n = 0; n < num_rois; ++n) {
+    int roi_batch_ind = bottom_rois[0];
+    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
+    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
+    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
+    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
+    CHECK_GE(roi_batch_ind, 0);
+    CHECK_LT(roi_batch_ind, batch_size);
+
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);  // at least 1
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);   // at least 1
+    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
+                             / static_cast<Dtype>(pooled_height_);
+    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
+                             / static_cast<Dtype>(pooled_width_);
+
+    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
+
+    for (int c = 0; c < channels_; ++c) {
+      for (int ph = 0; ph < pooled_height_; ++ph) {
+        for (int pw = 0; pw < pooled_width_; ++pw) {
+          // Compute pooling region for this output unit:
+          //  start (included) = floor(ph * roi_height / pooled_height_)
+          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
+          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
+                                              * bin_size_h));
+          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
+                                              * bin_size_w));
+          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
+                                           * bin_size_h));
+          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
+                                           * bin_size_w));
+
+          hstart = min(max(hstart + roi_start_h, 0), height_);  // shift, clip
+          hend = min(max(hend + roi_start_h, 0), height_);
+          wstart = min(max(wstart + roi_start_w, 0), width_);
+          wend = min(max(wend + roi_start_w, 0), width_);
+
+          bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+          const int pool_index = ph * pooled_width_ + pw;  // within channel
+          if (is_empty) {
+            top_data[pool_index] = 0;  // empty bins pool to zero
+            argmax_data[pool_index] = -1;
+          }
+
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int index = h * width_ + w;  // within channel
+              if (batch_data[index] > top_data[pool_index]) {
+                top_data[pool_index] = batch_data[index];
+                argmax_data[pool_index] = index;
+              }
+            }
+          }
+        }
+      }
+      // Increment all data pointers by one channel
+      batch_data += bottom[0]->offset(0, 1);
+      top_data += top[0]->offset(0, 1);
+      argmax_data += max_idx_.offset(0, 1);
+    }
+    // Increment ROI data pointer
+    bottom_rois += bottom[1]->offset(1);
+  }
+}
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  NOT_IMPLEMENTED;  // no CPU backward pass is provided for this layer
+}
+
+
+#ifdef CPU_ONLY
+STUB_GPU(ROIPoolingLayer);
+#endif
+
+INSTANTIATE_CLASS(ROIPoolingLayer);
+REGISTER_LAYER_CLASS(ROIPooling);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/roi_pooling_layer.cu b/src/caffe/layers/roi_pooling_layer.cu
new file mode 100644
index 00000000000..ea699a2d1d4
--- /dev/null
+++ b/src/caffe/layers/roi_pooling_layer.cu
@@ -0,0 +1,184 @@
+#include <algorithm>
+#include <cfloat>
+#include <vector>
+
+#include "caffe/layers/roi_pooling_layer.hpp"
+
+
+using std::max;
+using std::min;
+
+namespace caffe {
+
+// Max-pools each ROI ([batch_index x1 y1 x2 y2] in bottom_rois) per channel.
+template <typename Dtype>
+__global__ void ROIPoolForward(const int nthreads, const Dtype* bottom_data,
+    const Dtype spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const Dtype* bottom_rois, Dtype* top_data, int* argmax_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    // Offset from the base pointer instead of advancing it, so that nothing
+    // accumulates if this loop body runs more than once in the same thread.
+    const Dtype* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    Dtype bin_size_h = static_cast<Dtype>(roi_height)
+                       / static_cast<Dtype>(pooled_height);
+    Dtype bin_size_w = static_cast<Dtype>(roi_width)
+                       / static_cast<Dtype>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<Dtype>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<Dtype>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_size_w));
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    // Define an empty pooling region to be zero
+    Dtype maxval = is_empty ? 0 : -FLT_MAX;
+    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+    int maxidx = -1;
+    const Dtype* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int bottom_index = h * width + w;
+        if (offset_bottom_data[bottom_index] > maxval) {
+          maxval = offset_bottom_data[bottom_index];
+          maxidx = bottom_index;
+        }
+      }
+    }
+    top_data[index] = maxval;
+    argmax_data[index] = maxidx;
+  }
+}
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* features = bottom[0]->gpu_data();
+  const Dtype* rois = bottom[1]->gpu_data();
+  Dtype* pooled = top[0]->mutable_gpu_data();
+  int* argmax = max_idx_.mutable_gpu_data();
+  const int total = top[0]->count();  // one kernel index per pooled element
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ROIPoolForward<Dtype><<<CAFFE_GET_BLOCKS(total), CAFFE_CUDA_NUM_THREADS>>>(
+      total, features, spatial_scale_, channels_, height_, width_,
+      pooled_height_, pooled_width_, rois, pooled, argmax);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// For every bottom element, sums the top gradients of the bins that chose it.
+template <typename Dtype>
+__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
+    const int* argmax_data, const int num_rois, const Dtype spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, Dtype* bottom_diff,
+    const Dtype* bottom_rois) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // (n, c, h, w) coords in bottom data
+    int w = index % width;
+    int h = (index / width) % height;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+
+    Dtype gradient = 0;  // stays 0 when no ROI bin selected this element
+    // Accumulate gradient over all ROIs that pooled this element
+    for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
+      const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
+      int roi_batch_ind = offset_bottom_rois[0];
+      // Skip if ROI's batch index doesn't match n
+      if (n != roi_batch_ind) {
+        continue;
+      }
+
+      int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+      int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+      int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+      int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+      // Skip if ROI doesn't include (h, w)
+      const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
+                           h >= roi_start_h && h <= roi_end_h);
+      if (!in_roi) {
+        continue;
+      }
+
+      int offset = (roi_n * channels + c) * pooled_height * pooled_width;
+      const Dtype* offset_top_diff = top_diff + offset;
+      const int* offset_argmax_data = argmax_data + offset;
+
+      // Find the pooled units that could have pooled this bottom unit
+
+      // Force malformed ROIs to be 1x1
+      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+
+      Dtype bin_size_h = static_cast<Dtype>(roi_height)
+                         / static_cast<Dtype>(pooled_height);
+      Dtype bin_size_w = static_cast<Dtype>(roi_width)
+                         / static_cast<Dtype>(pooled_width);
+
+      int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
+      int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
+      int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
+      int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);
+
+      phstart = min(max(phstart, 0), pooled_height);
+      phend = min(max(phend, 0), pooled_height);
+      pwstart = min(max(pwstart, 0), pooled_width);
+      pwend = min(max(pwend, 0), pooled_width);
+
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
+            gradient += offset_top_diff[ph * pooled_width + pw];
+          }
+        }
+      }
+    }
+    bottom_diff[index] = gradient;  // assigned, not accumulated
+  }
+}
+
+template <typename Dtype>
+void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // Gradients flow only to the feature map (bottom[0]), never to the ROIs.
+  if (!propagate_down[0]) { return; }
+  const Dtype* rois = bottom[1]->gpu_data();
+  const Dtype* out_grad = top[0]->gpu_diff();
+  Dtype* in_grad = bottom[0]->mutable_gpu_diff();
+  const int total = bottom[0]->count();
+  caffe_gpu_set(total, Dtype(0.), in_grad);
+  const int* argmax = max_idx_.gpu_data();
+  const int num_rois = top[0]->num();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(total), CAFFE_CUDA_NUM_THREADS>>>(
+      total, out_grad, argmax, num_rois, spatial_scale_, channels_,
+      height_, width_, pooled_height_, pooled_width_, in_grad, rois);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(ROIPoolingLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
new file mode 100644
index 00000000000..6ddbc6edaf2
--- /dev/null
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -0,0 +1,65 @@
+#include <vector>
+
+#include "caffe/layers/smooth_l1_loss_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::LayerSetUp(
+  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // The shared LossLayer setup provides the default loss_weight of 1.
+  LossLayer<Dtype>::LayerSetUp(bottom, top);
+  const float sigma = this->layer_param_.smooth_l1_loss_param().sigma();
+  sigma2_ = sigma * sigma;
+  has_weights_ = (bottom.size() >= 3);
+  CHECK(!has_weights_ || bottom.size() == 4)
+      << "If weights are used, must specify both inside and outside weights";
+}
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::Reshape(
+  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer<Dtype>::Reshape(bottom, top);
+  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
+  CHECK_EQ(bottom[0]->height(), bottom[1]->height());
+  CHECK_EQ(bottom[0]->width(), bottom[1]->width());
+  if (has_weights_) {
+    // Weights multiply bottom[0]-sized buffers, so num must match as well.
+    CHECK_EQ(bottom[0]->num(), bottom[2]->num());
+    CHECK_EQ(bottom[0]->channels(), bottom[2]->channels());
+    CHECK_EQ(bottom[0]->height(), bottom[2]->height());
+    CHECK_EQ(bottom[0]->width(), bottom[2]->width());
+    CHECK_EQ(bottom[0]->num(), bottom[3]->num());
+    CHECK_EQ(bottom[0]->channels(), bottom[3]->channels());
+    CHECK_EQ(bottom[0]->height(), bottom[3]->height());
+    CHECK_EQ(bottom[0]->width(), bottom[3]->width());
+  }
+  diff_.ReshapeLike(*bottom[0]);    // same element count as bottom[0]
+  errors_.ReshapeLike(*bottom[0]);  // same element count as bottom[0]
+  // vector of ones used to sum
+  ones_.ReshapeLike(*bottom[0]);
+  for (int i = 0; i < bottom[0]->count(); ++i) {
+    ones_.mutable_cpu_data()[i] = Dtype(1);
+  }
+}
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  NOT_IMPLEMENTED;  // no CPU forward pass is provided for this layer
+}
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  NOT_IMPLEMENTED;  // no CPU backward pass is provided for this layer
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(SmoothL1LossLayer);
+#endif
+
+INSTANTIATE_CLASS(SmoothL1LossLayer);
+REGISTER_LAYER_CLASS(SmoothL1Loss);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/smooth_l1_loss_layer.cu b/src/caffe/layers/smooth_l1_loss_layer.cu
new file mode 100644
index 00000000000..9ddc9183539
--- /dev/null
+++ b/src/caffe/layers/smooth_l1_loss_layer.cu
@@ -0,0 +1,116 @@
+#include <vector>
+
+#include "caffe/layers/smooth_l1_loss_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void SmoothL1Forward(const int n, const Dtype* in, Dtype* out,
+    Dtype sigma2) {
+  // Element-wise smooth L1 of x = in[index], where sigma2 = sigma * sigma:
+  //   out[index] = 0.5 * sigma2 * x * x    if |x| < 1 / sigma2
+  //   out[index] = |x| - 0.5 / sigma2      otherwise
+  CUDA_KERNEL_LOOP(index, n) {
+    const Dtype x = in[index];
+    const Dtype magnitude = abs(x);
+    // Pick the quadratic zone near zero, the linear zone elsewhere.
+    const bool quadratic = magnitude < 1.0 / sigma2;
+    out[index] = quadratic ? 0.5 * x * x * sigma2
+                           : magnitude - 0.5 / sigma2;
+  }
+}
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  int count = bottom[0]->count();
+  caffe_gpu_sub(
+      count,
+      bottom[0]->gpu_data(),
+      bottom[1]->gpu_data(),
+      diff_.mutable_gpu_data());    // d := b0 - b1
+  if (has_weights_) {
+    // apply "inside" weights
+    caffe_gpu_mul(
+        count,
+        bottom[2]->gpu_data(),
+        diff_.gpu_data(),
+        diff_.mutable_gpu_data());  // d := w_in * (b0 - b1)
+  }
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SmoothL1Forward<Dtype><<<CAFFE_GET_BLOCKS(count),
+    CAFFE_CUDA_NUM_THREADS>>>(
+      count, diff_.gpu_data(), errors_.mutable_gpu_data(), sigma2_);
+  CUDA_POST_KERNEL_CHECK;  // e := SmoothL1(d), element-wise
+
+  if (has_weights_) {
+    // apply "outside" weights
+    caffe_gpu_mul(
+        count,
+        bottom[3]->gpu_data(),
+        errors_.gpu_data(),
+        errors_.mutable_gpu_data());  // e := w_out * SmoothL1(w_in * (b0 - b1))
+  }
+
+  Dtype loss;  // sum of all elements of e, via a dot product with ones_
+  caffe_gpu_dot(count, ones_.gpu_data(), errors_.gpu_data(), &loss);
+  top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();  // divide by num
+}
+
+template <typename Dtype>
+__global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out,
+    Dtype sigma2) {
+  // Element-wise derivative of smooth L1 at x = in[index] (sigma2 = sigma^2):
+  //   out[index] = sigma2 * x    if |x| < 1 / sigma2
+  //   out[index] = sign(x)       otherwise
+  CUDA_KERNEL_LOOP(index, n) {
+    // x is read before out[index] is written, so in and out may alias.
+    const Dtype x = in[index];
+    const Dtype magnitude = abs(x);
+    const bool quadratic = magnitude < 1.0 / sigma2;
+    out[index] = quadratic ? sigma2 * x
+                           : Dtype((Dtype(0) < x) - (x < Dtype(0)));
+  }
+}
+
+template <typename Dtype>
+void SmoothL1LossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  // after forwards, diff_ holds w_in * (b0 - b1)
+  int count = diff_.count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SmoothL1Backward<Dtype><<<CAFFE_GET_BLOCKS(count),
+    CAFFE_CUDA_NUM_THREADS >>>(
+      count, diff_.gpu_data(), diff_.mutable_gpu_data(), sigma2_);
+  CUDA_POST_KERNEL_CHECK;  // diff_ is overwritten in place with f'(diff_)
+  for (int i = 0; i < 2; ++i) {  // only bottoms 0 and 1 receive gradients
+    if (propagate_down[i]) {
+      const Dtype sign = (i == 0) ? 1 : -1;  // d(b0 - b1) / d(b_i)
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+      caffe_gpu_axpby(
+          count,                           // count
+          alpha,                           // alpha
+          diff_.gpu_data(),                // x
+          Dtype(0),                        // beta
+          bottom[i]->mutable_gpu_diff());  // y
+      if (has_weights_) {
+        // Scale by "inside" weight
+        caffe_gpu_mul(
+            count,
+            bottom[2]->gpu_data(),
+            bottom[i]->gpu_diff(),
+            bottom[i]->mutable_gpu_diff());
+        // Scale by "outside" weight
+        caffe_gpu_mul(
+            count,
+            bottom[3]->gpu_data(),
+            bottom[i]->gpu_diff(),
+            bottom[i]->mutable_gpu_diff());
+      }
+    }
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(SmoothL1LossLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 1556781cbc2..cf759a57efc 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -394,8 +394,10 @@ message LayerParameter {
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
+  optional ROIPoolingParameter roi_pooling_param = 146;
   optional ScaleParameter scale_param = 142;
   optional SigmoidParameter sigmoid_param = 124;
+  optional SmoothL1LossParameter smooth_l1_loss_param = 147;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
   optional SliceParameter slice_param = 126;
@@ -660,6 +662,7 @@ message DataParameter {
 
 message DropoutParameter {
   optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
+  optional bool scale_train = 2 [default = true];  // scale train or test phase
 }
 
 // DummyDataLayer fills any number of arbitrarily shaped blobs with random
@@ -1057,6 +1060,17 @@ message ReshapeParameter {
   optional int32 num_axes = 3 [default = -1];
 }
 
+// Message that stores parameters used by ROIPoolingLayer
+message ROIPoolingParameter {
+  // The pooled output size is given separately as a height (pooled_h) and
+  // a width (pooled_w).
+  optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
+  optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
+  // Multiplicative spatial scale factor to translate ROI coords from their
+  // input scale to the scale used when pooling
+  optional float spatial_scale = 3 [default = 1];
+}
+
 message ScaleParameter {
   // The first axis of bottom[0] (the first input Blob) along which to apply
   // bottom[1] (the second input Blob).  May be negative to index from the end
@@ -1114,6 +1128,13 @@ message SliceParameter {
   optional uint32 slice_dim = 1 [default = 1];
 }
 
+message SmoothL1LossParameter {
+  // SmoothL1Loss(x) =
+  //   0.5 * (sigma * x) ** 2    -- if |x| < 1.0 / sigma / sigma
+  //   |x| - 0.5 / sigma / sigma -- otherwise
+  optional float sigma = 1 [default = 1];
+}
+
 // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
 message SoftmaxParameter {
   enum Engine {
diff --git a/src/caffe/test/test_roi_pooling_layer.cpp b/src/caffe/test/test_roi_pooling_layer.cpp
new file mode 100644
index 00000000000..70d4d939a0b
--- /dev/null
+++ b/src/caffe/test/test_roi_pooling_layer.cpp
@@ -0,0 +1,93 @@
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "boost/scoped_ptr.hpp"
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/roi_pooling_layer.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+using boost::scoped_ptr;
+
+namespace caffe {
+
+#ifndef CPU_ONLY
+template <typename Dtype>
+class ROIPoolingLayerTest : public GPUDeviceTest<Dtype> {
+ protected:
+  ROIPoolingLayerTest()
+      : blob_bottom_data_(new Blob<Dtype>(4, 3, 12, 8)),
+        blob_bottom_rois_(new Blob<Dtype>(4, 5, 1, 1)),
+        blob_top_data_(new Blob<Dtype>()) {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_std(10);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_data_);
+
+    blob_bottom_vec_.push_back(blob_bottom_data_);
+
+    int i = 0;
+    // caffe_rng_rand() % 4;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 1;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 6;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 6;  // y2 < 12
+    i = 1;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 2;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 6;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 2;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 11;  // y2 < 12
+    i = 2;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 5;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 10;  // y2 < 12
+    i = 3;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 3;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 3;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 3;  // y2 < 12
+
+    blob_bottom_vec_.push_back(blob_bottom_rois_);
+    blob_top_vec_.push_back(blob_top_data_);
+  }
+  virtual ~ROIPoolingLayerTest() {
+    delete blob_bottom_data_;
+    delete blob_bottom_rois_;
+    delete blob_top_data_;
+  }
+  Blob<Dtype>* const blob_bottom_data_;
+  Blob<Dtype>* const blob_bottom_rois_;
+  Blob<Dtype>* const blob_top_data_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+// Forward and Backward CPU not implemented yet
+TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypes);
+
+TYPED_TEST(ROIPoolingLayerTest, TestGradient) {
+  LayerParameter layer_param;
+  ROIPoolingParameter* roi_pooling_param =
+      layer_param.mutable_roi_pooling_param();
+  roi_pooling_param->set_pooled_h(6);
+  roi_pooling_param->set_pooled_w(6);
+  ROIPoolingLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-4, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+}
+#endif
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_smooth_l1_loss_layer.cpp b/src/caffe/test/test_smooth_l1_loss_layer.cpp
new file mode 100644
index 00000000000..7f60a8d3a35
--- /dev/null
+++ b/src/caffe/test/test_smooth_l1_loss_layer.cpp
@@ -0,0 +1,85 @@
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/smooth_l1_loss_layer.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+#ifndef CPU_ONLY
+template <typename Dtype>
+class SmoothL1LossLayerTest : public GPUDeviceTest<Dtype> {
+ protected:
+  SmoothL1LossLayerTest()
+      : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
+        blob_bottom_label_(new Blob<Dtype>(10, 5, 1, 1)),
+        blob_bottom_inside_weights_(new Blob<Dtype>(10, 5, 1, 1)),
+        blob_bottom_outside_weights_(new Blob<Dtype>(10, 5, 1, 1)),
+        blob_top_loss_(new Blob<Dtype>()) {
+    // fill the values
+    FillerParameter const_filler_param;
+    const_filler_param.set_value(-1.);
+    ConstantFiller<Dtype> const_filler(const_filler_param);
+    FillerParameter filler_param;
+    GaussianFiller<Dtype> filler(filler_param);
+
+    filler.Fill(this->blob_bottom_data_);
+    blob_bottom_vec_.push_back(blob_bottom_data_);
+
+    filler.Fill(this->blob_bottom_label_);
+    blob_bottom_vec_.push_back(blob_bottom_label_);
+
+    filler.Fill(this->blob_bottom_inside_weights_);
+    blob_bottom_vec_.push_back(blob_bottom_inside_weights_);
+
+    filler.Fill(this->blob_bottom_outside_weights_);
+    blob_bottom_vec_.push_back(blob_bottom_outside_weights_);
+
+    blob_top_vec_.push_back(blob_top_loss_);
+  }
+  virtual ~SmoothL1LossLayerTest() {
+    delete blob_bottom_data_;
+    delete blob_bottom_label_;
+    delete blob_bottom_inside_weights_;
+    delete blob_bottom_outside_weights_;
+    delete blob_top_loss_;
+  }
+
+  Blob<Dtype>* const blob_bottom_data_;
+  Blob<Dtype>* const blob_bottom_label_;
+  Blob<Dtype>* const blob_bottom_inside_weights_;
+  Blob<Dtype>* const blob_bottom_outside_weights_;
+  Blob<Dtype>* const blob_top_loss_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypes);
+
+TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {
+  LayerParameter layer_param;
+  SmoothL1LossParameter* loss_param =
+      layer_param.mutable_smooth_l1_loss_param();
+  loss_param->set_sigma(2.4);
+
+  const TypeParam kLossWeight = 3.7;
+  layer_param.add_loss_weight(kLossWeight);
+  SmoothL1LossLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  GradientChecker<TypeParam> checker(1e-2, 1e-2, 1701);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 1);
+}
+#endif
+
+}  // namespace caffe

From 3546e877ddf1cd6c0fc9cf63c4f43891bdf97263 Mon Sep 17 00:00:00 2001
From: Hugo Serrat <hu.serrat@gmail.com>
Date: Fri, 27 May 2016 10:04:24 +0200
Subject: [PATCH 02/14] Added Backward CPU support for ROI Pooling Layer Added
 tests for ROI Pooling Layer

Author: Ronghang Hu
---
 include/caffe/layers/roi_pooling_layer.hpp |  30 ++++-
 src/caffe/layers/roi_pooling_layer.cpp     |  34 +++++-
 src/caffe/test/test_roi_pooling_layer.cpp  | 122 ++++++++++++++++-----
 3 files changed, 156 insertions(+), 30 deletions(-)

diff --git a/include/caffe/layers/roi_pooling_layer.hpp b/include/caffe/layers/roi_pooling_layer.hpp
index e9e39ad4f26..a082721ce4c 100644
--- a/include/caffe/layers/roi_pooling_layer.hpp
+++ b/include/caffe/layers/roi_pooling_layer.hpp
@@ -10,8 +10,34 @@
 
 namespace caffe {
 
-/**
- * @brief ROIPoolingLayer - Region of Interest Pooling Layer.
+/**
+ * @brief Perform max pooling on regions of interest specified by input, takes
+ *        as input N feature maps and a list of R regions of interest.
+ *
+ *   ROIPoolingLayer takes 2 inputs and produces 1 output. bottom[0] is
+ *   [N x C x H x W] feature maps on which pooling is performed. bottom[1] is
+ *   [R x 5] containing a list of R ROI tuples with batch index and coordinates
+ *   of regions of interest. Each row in bottom[1] is a ROI tuple in format
+ *   [batch_index x1 y1 x2 y2], where batch_index corresponds to the index of
+ *   instance in the first input and x1 y1 x2 y2 are 0-indexed coordinates
+ *   of ROI rectangle (including its boundaries).
+ *
+ *   For each of the R ROIs, max-pooling is performed over pooled_h x pooled_w
+ *   output bins (specified in roi_pooling_param). The pooling bin sizes are
+ *   adaptively set such that they tile ROI rectangle in the indexed feature
+ *   map. The pooling region of vertical bin ph in [0, pooled_h) is computed as
+ *
+ *    start_ph (included) = y1 + floor(ph * (y2 - y1 + 1) / pooled_h)
+ *    end_ph (excluded)   = y1 + ceil((ph + 1) * (y2 - y1 + 1) / pooled_h)
+ *
+ *   and similar horizontal bins.
+ *
+ * @param param provides ROIPoolingParameter roi_pooling_param,
+ *        with ROIPoolingLayer options:
+ *  - pooled_h. The pooled output height.
+ *  - pooled_w. The pooled output width.
+ *  - spatial_scale. Multiplicative spatial scale factor to translate ROI
+ *  coordinates from their input scale to the scale used when pooling.
  *
  * Fast R-CNN
  * Written by Ross Girshick
diff --git a/src/caffe/layers/roi_pooling_layer.cpp b/src/caffe/layers/roi_pooling_layer.cpp
index 3447dbf65ae..3394478223e 100644
--- a/src/caffe/layers/roi_pooling_layer.cpp
+++ b/src/caffe/layers/roi_pooling_layer.cpp
@@ -122,7 +122,39 @@ void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  NOT_IMPLEMENTED;
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+               << " Layer cannot backpropagate to roi inputs.";
+  }
+  if (!propagate_down[0]) {
+    return;
+  }
+  const Dtype* bottom_rois = bottom[1]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  caffe_set(bottom[0]->count(), Dtype(0.), bottom_diff);
+  const int* argmax_data = max_idx_.cpu_data();
+  const int num_rois = top[0]->num();
+
+  // Accumulate gradient over all ROIs
+  for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
+    int roi_batch_ind = bottom_rois[roi_n * 5];
+    // Accumulate gradients over each bin in this ROI
+    for (int c = 0; c < channels_; ++c) {
+      for (int ph = 0; ph < pooled_height_; ++ph) {
+        for (int pw = 0; pw < pooled_width_; ++pw) {
+          int offset_top = ((roi_n * channels_ + c) * pooled_height_ + ph)
+              * pooled_width_ + pw;
+          int argmax_index = argmax_data[offset_top];
+          if (argmax_index >= 0) {
+            int offset_bottom = (roi_batch_ind * channels_ + c) * height_
+                * width_ + argmax_index;
+            bottom_diff[offset_bottom] += top_diff[offset_top];
+          }
+        }
+      }
+    }
+  }
 }
 
 
diff --git a/src/caffe/test/test_roi_pooling_layer.cpp b/src/caffe/test/test_roi_pooling_layer.cpp
index 70d4d939a0b..f678bd1d10a 100644
--- a/src/caffe/test/test_roi_pooling_layer.cpp
+++ b/src/caffe/test/test_roi_pooling_layer.cpp
@@ -1,3 +1,5 @@
+#include <algorithm>
+#include <cfloat>
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
@@ -17,77 +19,143 @@ using boost::scoped_ptr;
 
 namespace caffe {
 
-#ifndef CPU_ONLY
-template <typename Dtype>
-class ROIPoolingLayerTest : public GPUDeviceTest<Dtype> {
+template <typename TypeParam>
+class ROIPoolingLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
  protected:
   ROIPoolingLayerTest()
-      : blob_bottom_data_(new Blob<Dtype>(4, 3, 12, 8)),
+      : blob_bottom_data_(new Blob<Dtype>(2, 2, 6, 8)),
         blob_bottom_rois_(new Blob<Dtype>(4, 5, 1, 1)),
-        blob_top_data_(new Blob<Dtype>()) {
+        blob_top_data_(new Blob<Dtype>()),
+        blob_bottom_data_2_(new Blob<Dtype>(2, 3, 12, 20)),
+        blob_bottom_rois_2_(new Blob<Dtype>(1, 5, 1, 1)),
+        blob_top_data_2_(new Blob<Dtype>()) {
     // fill the values
     FillerParameter filler_param;
     filler_param.set_std(10);
     GaussianFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_);
-
+    // for (int i = 0; i < blob_bottom_data_->count(); ++i) {
+    //   blob_bottom_data_->mutable_cpu_data()[i] = i;
+    // }
     blob_bottom_vec_.push_back(blob_bottom_data_);
-
     int i = 0;
-    // caffe_rng_rand() % 4;
     blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
-    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 1;  // x1 < 8
-    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 12
-    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 6;  // x2 < 8
-    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 6;  // y2 < 12
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 0;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 0;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 5;  // y2 < 6
     i = 1;
-    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 2;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
     blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 6;  // x1 < 8
-    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 2;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 2;  // y1 < 6
     blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7;  // x2 < 8
-    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 11;  // y2 < 12
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 5;  // y2 < 6
     i = 2;
     blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
     blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
-    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 12
-    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 5;  // x2 < 8
-    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 10;  // y2 < 12
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 6;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 4;  // y2 < 6
     i = 3;
     blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
     blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
-    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 3;  // y1 < 12
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 3;  // y1 < 6
     blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 3;  // x2 < 8
-    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 3;  // y2 < 12
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 3;  // y2 < 6
 
     blob_bottom_vec_.push_back(blob_bottom_rois_);
     blob_top_vec_.push_back(blob_top_data_);
+
+    filler.Fill(this->blob_bottom_data_2_);
+    blob_bottom_vec_2_.push_back(blob_bottom_data_2_);
+
+    // Pool over the entire bottom of feature map 1
+    blob_bottom_rois_2_->mutable_cpu_data()[0] = 1;
+    blob_bottom_rois_2_->mutable_cpu_data()[1] = 0;
+    blob_bottom_rois_2_->mutable_cpu_data()[2] = 0;
+    blob_bottom_rois_2_->mutable_cpu_data()[3] = 19;
+    blob_bottom_rois_2_->mutable_cpu_data()[4] = 11;
+
+    blob_bottom_vec_2_.push_back(blob_bottom_rois_2_);
+    blob_top_vec_2_.push_back(blob_top_data_2_);
   }
   virtual ~ROIPoolingLayerTest() {
     delete blob_bottom_data_;
     delete blob_bottom_rois_;
     delete blob_top_data_;
+    delete blob_bottom_data_2_;
+    delete blob_bottom_rois_2_;
+    delete blob_top_data_2_;
   }
   Blob<Dtype>* const blob_bottom_data_;
   Blob<Dtype>* const blob_bottom_rois_;
   Blob<Dtype>* const blob_top_data_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
+
+  Blob<Dtype>* const blob_bottom_data_2_;
+  Blob<Dtype>* const blob_bottom_rois_2_;
+  Blob<Dtype>* const blob_top_data_2_;
+  vector<Blob<Dtype>*> blob_bottom_vec_2_;
+  vector<Blob<Dtype>*> blob_top_vec_2_;
 };
 
-// Forward and Backward CPU not implemented yet
-TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypes);
+TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypesAndDevices);
 
-TYPED_TEST(ROIPoolingLayerTest, TestGradient) {
+TYPED_TEST(ROIPoolingLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   ROIPoolingParameter* roi_pooling_param =
       layer_param.mutable_roi_pooling_param();
+
+  // 12 x 20 pooling with bin_size_h == 1 && bin_size_w == 1
+  roi_pooling_param->set_pooled_h(12);
+  roi_pooling_param->set_pooled_w(20);
+  ROIPoolingLayer<Dtype> layer_2(layer_param);
+  layer_2.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  layer_2.Forward(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  for (int i = 0; i < this->blob_top_data_2_->count(); ++i) {
+    EXPECT_EQ(this->blob_top_data_2_->cpu_data()[i],
+        this->blob_bottom_data_2_->cpu_data()[i+3*12*20]);
+  }
+
+  // 6 x 10 pooling with bin_size_h == 2 && bin_size_w == 2
   roi_pooling_param->set_pooled_h(6);
-  roi_pooling_param->set_pooled_w(6);
-  ROIPoolingLayer<TypeParam> layer(layer_param);
-  GradientChecker<TypeParam> checker(1e-4, 1e-2);
+  roi_pooling_param->set_pooled_w(10);
+  ROIPoolingLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  layer.Forward(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  int n = 1;
+  for (int c = 0; c < 3; ++c) {
+    for (int ph = 0; ph < 6; ++ph) {
+      for (int pw = 0; pw < 10; ++pw) {
+        Dtype maxval = -FLT_MAX;
+        for (int h = 2 * ph; h < 2 * (ph + 1); ++h) {
+          for (int w = 2 * pw; w < 2 * (pw + 1); ++w) {
+            maxval = std::max(maxval, this->blob_bottom_data_2_->cpu_data()[
+                ((n * 3 + c) * 12 + h) * 20 + w]);
+          }
+        }
+        EXPECT_EQ(this->blob_top_data_2_->cpu_data()[(c * 6 + ph) * 10 + pw],
+            maxval);
+      }
+    }
+  }
+}
+
+TYPED_TEST(ROIPoolingLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ROIPoolingParameter* roi_pooling_param =
+      layer_param.mutable_roi_pooling_param();
+  roi_pooling_param->set_pooled_h(3);
+  roi_pooling_param->set_pooled_w(4);
+  ROIPoolingLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-4, 1e-2);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
 }
-#endif
 
 }  // namespace caffe

From 3c27a70ae7304feb48c60b268c70adf585879d50 Mon Sep 17 00:00:00 2001
From: Hugo Serrat <hu.serrat@gmail.com>
Date: Thu, 9 Jun 2016 15:20:54 +0200
Subject: [PATCH 03/14] Updated layer id in caffe.proto

---
 src/caffe/proto/caffe.proto | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index cf759a57efc..2b57b2da2d9 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -306,7 +306,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param)
+// LayerParameter next available layer-specific ID: 149 (last added: smooth_l1_loss_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -394,10 +394,10 @@ message LayerParameter {
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
-  optional ROIPoolingParameter roi_pooling_param = 146;
+  optional ROIPoolingParameter roi_pooling_param = 147;
   optional ScaleParameter scale_param = 142;
   optional SigmoidParameter sigmoid_param = 124;
-  optional SmoothL1LossParameter smooth_l1_loss_param = 147;
+  optional SmoothL1LossParameter smooth_l1_loss_param = 148;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
   optional SliceParameter slice_param = 126;

From 2a34aefb26a288355f4232848cc34c8bf5522478 Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Thu, 10 Nov 2016 15:17:22 -0500
Subject: [PATCH 04/14] CPU support for smooth_l1_loss

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 77 ++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index 6ddbc6edaf2..72ed1220883 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -43,18 +43,91 @@ void SmoothL1LossLayer<Dtype>::Reshape(
   }
 }
 
+
 template <typename Dtype>
 void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  NOT_IMPLEMENTED;
+    int count = bottom[0]->count();
+    caffe_sub(
+        count, 
+        bottom[0]->cpu_data(), 
+        bottom[1]->cpu_data(), 
+        diff_.mutable_cpu_data());    // d := b0 - b1
+    if (has_weights_) {
+        // apply "inside" weights
+        caffe_mul(
+            count, 
+            bottom[2]->cpu_data(), 
+            diff_.cpu_data(), 
+            diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
+    }
+    for(int index =0; index<count; index++) {
+        Dtype val = diff_.cpu_data()[index];
+        Dtype abs_val = abs(val);
+        if (abs_val < 1.0 / sigma2_) {
+           errors_.mutable_cpu_data()[index] = 0.5 * val * val * sigma2_;
+        } else {
+           errors_.mutable_cpu_data()[index] = abs_val - 0.5 / sigma2_;
+        }
+    }
+    if (has_weights_) {
+        // apply "outside" weights
+        caffe_mul(
+            count, 
+            bottom[3]->cpu_data(), 
+            errors_.cpu_data(), 
+            errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
+    }
+
+    Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
+    top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
 }
 
 template <typename Dtype>
 void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  NOT_IMPLEMENTED;
+    // after forwards, diff_ holds w_in * (b0 - b1)
+    int count = diff_.count();    
+    for(int index=0; index < count; index++) {
+        // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
+        //       = sign(x)                   otherwise
+        Dtype val = diff_.cpu_data()[index];
+        Dtype abs_val = abs(val);
+        if (abs_val < 1.0 / sigma2_) {
+          diff_.mutable_cpu_data()[index] = sigma2_ * val;
+        } else {
+          diff_.mutable_cpu_data()[index] = (Dtype(0) < val) - (val < Dtype(0));
+        }
+    }
+    for (int i = 0; i < 2; ++i) {
+        if (propagate_down[i]) {
+            const Dtype sign = (i == 0) ? 1 : -1;
+            const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+            caffe_cpu_axpby(
+              count,                           // count
+              alpha,                           // alpha
+              diff_.cpu_data(),                // x
+              Dtype(0),                        // beta
+              bottom[i]->mutable_cpu_diff());  // y
+            if (has_weights_) {
+                // Scale by "inside" weight
+                caffe_mul(
+                    count,
+                    bottom[2]->cpu_data(),
+                    bottom[i]->cpu_diff(),
+                    bottom[i]->mutable_cpu_diff());
+                // Scale by "outside" weight
+                caffe_mul(
+                    count,
+                    bottom[3]->cpu_data(),
+                    bottom[i]->cpu_diff(),
+                    bottom[i]->mutable_cpu_diff());
+            }
+        }
+    }
 }
 
+
 #ifdef CPU_ONLY
 STUB_GPU(SmoothL1LossLayer);
 #endif

From 5da89ea55842bb73612d3a5335ba9224499ecc46 Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Thu, 10 Nov 2016 17:40:55 -0500
Subject: [PATCH 05/14] Update smooth_l1_loss_layer.cpp

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index 72ed1220883..8ca8ca1ec2c 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -61,7 +61,7 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
             diff_.cpu_data(), 
             diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
     }
-    for(int index =0; index<count; index++) {
+    for (int index = 0; index < count; index++) {
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
         if (abs_val < 1.0 / sigma2_) {
@@ -76,7 +76,8 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
             count, 
             bottom[3]->cpu_data(), 
             errors_.cpu_data(), 
-            errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
+            errors_.mutable_cpu_data()); 
+        // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
     Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
@@ -88,7 +89,7 @@ void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     // after forwards, diff_ holds w_in * (b0 - b1)
     int count = diff_.count();    
-    for(int index=0; index < count; index++) {
+    for (int index = 0; index < count; index++) {
         // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
         //       = sign(x)                   otherwise
         Dtype val = diff_.cpu_data()[index];

From bacd0a0635f7a4ab887b5d154ba551c3b0924902 Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 13:28:46 -0500
Subject: [PATCH 06/14] Update smooth_l1_loss_layer.cpp

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index 8ca8ca1ec2c..353aa8bc7a7 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -49,16 +49,16 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
     int count = bottom[0]->count();
     caffe_sub(
-        count, 
-        bottom[0]->cpu_data(), 
-        bottom[1]->cpu_data(), 
+        count,
+        bottom[0]->cpu_data(),
+        bottom[1]->cpu_data(),
         diff_.mutable_cpu_data());    // d := b0 - b1
     if (has_weights_) {
         // apply "inside" weights
         caffe_mul(
-            count, 
-            bottom[2]->cpu_data(), 
-            diff_.cpu_data(), 
+            count,
+            bottom[2]->cpu_data(),
+            diff_.cpu_data(),
             diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
     }
     for (int index = 0; index < count; index++) {
@@ -73,10 +73,10 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     if (has_weights_) {
         // apply "outside" weights
         caffe_mul(
-            count, 
-            bottom[3]->cpu_data(), 
-            errors_.cpu_data(), 
-            errors_.mutable_cpu_data()); 
+            count,
+            bottom[3]->cpu_data(),
+            errors_.cpu_data(),
+            errors_.mutable_cpu_data());
         // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
@@ -88,7 +88,7 @@ template <typename Dtype>
 void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     // after forwards, diff_ holds w_in * (b0 - b1)
-    int count = diff_.count();    
+    int count = diff_.count();
     for (int index = 0; index < count; index++) {
         // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
         //       = sign(x)                   otherwise

From a0c85c1600a811621363c556e0a1bb7db2b1a94c Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 14:38:49 -0500
Subject: [PATCH 07/14] Update test_smooth_l1_loss_layer.cpp

---
 src/caffe/test/test_smooth_l1_loss_layer.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/caffe/test/test_smooth_l1_loss_layer.cpp b/src/caffe/test/test_smooth_l1_loss_layer.cpp
index 7f60a8d3a35..3bac3ec760a 100644
--- a/src/caffe/test/test_smooth_l1_loss_layer.cpp
+++ b/src/caffe/test/test_smooth_l1_loss_layer.cpp
@@ -14,9 +14,8 @@
 
 namespace caffe {
 
-#ifndef CPU_ONLY
 template <typename Dtype>
-class SmoothL1LossLayerTest : public GPUDeviceTest<Dtype> {
+class SmoothL1LossLayerTest : public MultiDeviceTest<Dtype> {
  protected:
   SmoothL1LossLayerTest()
       : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
@@ -80,6 +79,5 @@ TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 1);
 }
-#endif
 
 }  // namespace caffe

From a696d70166248cb2848ecc95e4ecd2daf427ce3f Mon Sep 17 00:00:00 2001
From: saiprabahakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 16:00:04 -0500
Subject: [PATCH 08/14] added unit test have to debug

---
 src/caffe/test/test_smooth_l1_loss_layer.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/caffe/test/test_smooth_l1_loss_layer.cpp b/src/caffe/test/test_smooth_l1_loss_layer.cpp
index 3bac3ec760a..2041a84b51e 100644
--- a/src/caffe/test/test_smooth_l1_loss_layer.cpp
+++ b/src/caffe/test/test_smooth_l1_loss_layer.cpp
@@ -14,8 +14,10 @@
 
 namespace caffe {
 
-template <typename Dtype>
-class SmoothL1LossLayerTest : public MultiDeviceTest<Dtype> {
+template <typename TypeParam>
+class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
  protected:
   SmoothL1LossLayerTest()
       : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
@@ -61,19 +63,20 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<Dtype> {
   vector<Blob<Dtype>*> blob_top_vec_;
 };
 
-TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypes);
+TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesAndDevices);
 
 TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   SmoothL1LossParameter* loss_param =
       layer_param.mutable_smooth_l1_loss_param();
   loss_param->set_sigma(2.4);
 
-  const TypeParam kLossWeight = 3.7;
+  const Dtype kLossWeight = 3.7;
   layer_param.add_loss_weight(kLossWeight);
-  SmoothL1LossLayer<TypeParam> layer(layer_param);
+  SmoothL1LossLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  GradientChecker<TypeParam> checker(1e-2, 1e-2, 1701);
+  GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,

From 7ba0516f902f1daccec703bbc0c29cc51cb96ba3 Mon Sep 17 00:00:00 2001
From: saiprabahakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 16:54:44 -0500
Subject: [PATCH 09/14] Reformat SmoothL1Loss CPU code and annotate test fillers

---
 src/caffe/layers/smooth_l1_loss_layer.cpp    | 69 ++++++++++----------
 src/caffe/test/test_smooth_l1_loss_layer.cpp |  8 ++-
 2 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index 353aa8bc7a7..a5fc149c64f 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -43,65 +43,66 @@ void SmoothL1LossLayer<Dtype>::Reshape(
   }
 }
 
-
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
+void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) 
+{
     int count = bottom[0]->count();
-    caffe_sub(
-        count,
-        bottom[0]->cpu_data(),
-        bottom[1]->cpu_data(),
-        diff_.mutable_cpu_data());    // d := b0 - b1
-    if (has_weights_) {
+    caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data());    // d := b0 - b1
+    if (has_weights_) 
+    {
         // apply "inside" weights
-        caffe_mul(
-            count,
-            bottom[2]->cpu_data(),
-            diff_.cpu_data(),
-            diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
+        caffe_mul(count, bottom[2]->cpu_data(), diff_.cpu_data(), diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
     }
-    for (int index = 0; index < count; index++) {
+    for(int index =0; index<count; index++)
+    {
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
-        if (abs_val < 1.0 / sigma2_) {
+        if (abs_val < 1.0 / sigma2_) 
+        {
            errors_.mutable_cpu_data()[index] = 0.5 * val * val * sigma2_;
-        } else {
+        } 
+        else 
+        {
            errors_.mutable_cpu_data()[index] = abs_val - 0.5 / sigma2_;
         }
     }
-    if (has_weights_) {
+    if (has_weights_) 
+    {
         // apply "outside" weights
-        caffe_mul(
-            count,
-            bottom[3]->cpu_data(),
-            errors_.cpu_data(),
-            errors_.mutable_cpu_data());
-        // d := w_out * SmoothL1(w_in * (b0 - b1))
+        caffe_mul(count, bottom[3]->cpu_data(), errors_.cpu_data(), errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
     Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
     top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
 }
 
+
+
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) 
+{
     // after forwards, diff_ holds w_in * (b0 - b1)
     int count = diff_.count();
-    for (int index = 0; index < count; index++) {
-        // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
+    
+    for(int index=0; index < count; index++)
+    {
+         // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
         //       = sign(x)                   otherwise
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
-        if (abs_val < 1.0 / sigma2_) {
+        if (abs_val < 1.0 / sigma2_) 
+        {
           diff_.mutable_cpu_data()[index] = sigma2_ * val;
-        } else {
+        } 
+        else 
+        {
           diff_.mutable_cpu_data()[index] = (Dtype(0) < val) - (val < Dtype(0));
         }
     }
-    for (int i = 0; i < 2; ++i) {
-        if (propagate_down[i]) {
+    for (int i = 0; i < 2; ++i) 
+    {
+        if (propagate_down[i]) 
+        {
             const Dtype sign = (i == 0) ? 1 : -1;
             const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
             caffe_cpu_axpby(
@@ -110,7 +111,8 @@ void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
               diff_.cpu_data(),                // x
               Dtype(0),                        // beta
               bottom[i]->mutable_cpu_diff());  // y
-            if (has_weights_) {
+            if (has_weights_) 
+            {
                 // Scale by "inside" weight
                 caffe_mul(
                     count,
@@ -128,7 +130,6 @@ void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     }
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(SmoothL1LossLayer);
 #endif
diff --git a/src/caffe/test/test_smooth_l1_loss_layer.cpp b/src/caffe/test/test_smooth_l1_loss_layer.cpp
index 2041a84b51e..fb11a0fe094 100644
--- a/src/caffe/test/test_smooth_l1_loss_layer.cpp
+++ b/src/caffe/test/test_smooth_l1_loss_layer.cpp
@@ -11,9 +11,10 @@
 #include "caffe/layers/smooth_l1_loss_layer.hpp"
 #include "caffe/test/test_caffe_main.hpp"
 #include "caffe/test/test_gradient_check_util.hpp"
-
 namespace caffe {
 
+//typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;
+
 template <typename TypeParam>
 class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -34,13 +35,13 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
 
     filler.Fill(this->blob_bottom_data_);
     blob_bottom_vec_.push_back(blob_bottom_data_);
-
     filler.Fill(this->blob_bottom_label_);
     blob_bottom_vec_.push_back(blob_bottom_label_);
 
+    //const_filler.Fill(this->blob_bottom_inside_weights_);
     filler.Fill(this->blob_bottom_inside_weights_);
     blob_bottom_vec_.push_back(blob_bottom_inside_weights_);
-
+    //const_filler.Fill(this->blob_bottom_outside_weights_);
     filler.Fill(this->blob_bottom_outside_weights_);
     blob_bottom_vec_.push_back(blob_bottom_outside_weights_);
 
@@ -63,6 +64,7 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
   vector<Blob<Dtype>*> blob_top_vec_;
 };
 
+//TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesGPU);
 TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesAndDevices);
 
 TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {

From a483a7badcb7c899d4a76e053fb18578fadfd3f2 Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 17:45:18 -0500
Subject: [PATCH 10/14] Update smooth_l1_loss_layer.cpp

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 48 +++++++++++------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index a5fc149c64f..a78e23a8f17 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -44,32 +44,35 @@ void SmoothL1LossLayer<Dtype>::Reshape(
 }
 
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) 
-{
+void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
     int count = bottom[0]->count();
-    caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data());    // d := b0 - b1
-    if (has_weights_) 
-    {
+    caffe_sub(
+      count,
+      bottom[0]->cpu_data(),
+      bottom[1]->cpu_data(),
+      diff_.mutable_cpu_data());    // d := b0 - b1
+    if (has_weights_) {
         // apply "inside" weights
-        caffe_mul(count, bottom[2]->cpu_data(), diff_.cpu_data(), diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
+        caffe_mul(count, 
+          bottom[2]->cpu_data(), 
+          diff_.cpu_data(), 
+          diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
     }
-    for(int index =0; index<count; index++)
-    {
+    for(int index =0; index<count; index++) {
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
-        if (abs_val < 1.0 / sigma2_) 
-        {
+        if (abs_val < 1.0 / sigma2_) {
            errors_.mutable_cpu_data()[index] = 0.5 * val * val * sigma2_;
-        } 
-        else 
-        {
+        } else {
            errors_.mutable_cpu_data()[index] = abs_val - 0.5 / sigma2_;
         }
     }
-    if (has_weights_) 
-    {
+    if (has_weights_) {
         // apply "outside" weights
-        caffe_mul(count, bottom[3]->cpu_data(), errors_.cpu_data(), errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
+        caffe_mul(count, 
+          bottom[3]->cpu_data(),
+          errors_.cpu_data(),
+          errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
     Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
@@ -90,19 +93,14 @@ void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, con
         //       = sign(x)                   otherwise
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
-        if (abs_val < 1.0 / sigma2_) 
-        {
+        if (abs_val < 1.0 / sigma2_) {
           diff_.mutable_cpu_data()[index] = sigma2_ * val;
-        } 
-        else 
-        {
+        } else {
           diff_.mutable_cpu_data()[index] = (Dtype(0) < val) - (val < Dtype(0));
         }
     }
-    for (int i = 0; i < 2; ++i) 
-    {
-        if (propagate_down[i]) 
-        {
+    for (int i = 0; i < 2; ++i) {
+      if (propagate_down[i]) {
             const Dtype sign = (i == 0) ? 1 : -1;
             const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
             caffe_cpu_axpby(

From b624df21abc024629724068ef145110694775b9a Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 17:47:32 -0500
Subject: [PATCH 11/14] Update test_smooth_l1_loss_layer.cpp

---
 src/caffe/test/test_smooth_l1_loss_layer.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/caffe/test/test_smooth_l1_loss_layer.cpp b/src/caffe/test/test_smooth_l1_loss_layer.cpp
index fb11a0fe094..c08bd8c3e8b 100644
--- a/src/caffe/test/test_smooth_l1_loss_layer.cpp
+++ b/src/caffe/test/test_smooth_l1_loss_layer.cpp
@@ -13,8 +13,6 @@
 #include "caffe/test/test_gradient_check_util.hpp"
 namespace caffe {
 
-//typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;
-
 template <typename TypeParam>
 class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -38,10 +36,10 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
     filler.Fill(this->blob_bottom_label_);
     blob_bottom_vec_.push_back(blob_bottom_label_);
 
-    //const_filler.Fill(this->blob_bottom_inside_weights_);
+    // const_filler.Fill(this->blob_bottom_inside_weights_);
     filler.Fill(this->blob_bottom_inside_weights_);
     blob_bottom_vec_.push_back(blob_bottom_inside_weights_);
-    //const_filler.Fill(this->blob_bottom_outside_weights_);
+    // const_filler.Fill(this->blob_bottom_outside_weights_);
     filler.Fill(this->blob_bottom_outside_weights_);
     blob_bottom_vec_.push_back(blob_bottom_outside_weights_);
 
@@ -64,7 +62,6 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
   vector<Blob<Dtype>*> blob_top_vec_;
 };
 
-//TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesGPU);
 TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesAndDevices);
 
 TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {

From 2b1352d21799ea3af600a11c8568a9fe7b71b1fb Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 18:17:50 -0500
Subject: [PATCH 12/14] Update smooth_l1_loss_layer.cpp

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 28 +++++++++++------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index a78e23a8f17..5b038955edc 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -44,7 +44,8 @@ void SmoothL1LossLayer<Dtype>::Reshape(
 }
 
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+                                           const vector<Blob<Dtype>*>& top) {
     int count = bottom[0]->count();
     caffe_sub(
       count,
@@ -53,12 +54,12 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, c
       diff_.mutable_cpu_data());    // d := b0 - b1
     if (has_weights_) {
         // apply "inside" weights
-        caffe_mul(count, 
-          bottom[2]->cpu_data(), 
-          diff_.cpu_data(), 
+        caffe_mul(count,
+          bottom[2]->cpu_data(),
+          diff_.cpu_data(),
           diff_.mutable_cpu_data());  // d := w_in * (b0 - b1)
     }
-    for(int index =0; index<count; index++) {
+    for (int index = 0; index < count; index++) {
         Dtype val = diff_.cpu_data()[index];
         Dtype abs_val = abs(val);
         if (abs_val < 1.0 / sigma2_) {
@@ -69,10 +70,11 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, c
     }
     if (has_weights_) {
         // apply "outside" weights
-        caffe_mul(count, 
+        caffe_mul(count,
           bottom[3]->cpu_data(),
           errors_.cpu_data(),
-          errors_.mutable_cpu_data());  // d := w_out * SmoothL1(w_in * (b0 - b1))
+          errors_.mutable_cpu_data());  
+          // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
     Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
@@ -82,13 +84,12 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, c
 
 
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) 
-{
+void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+                                            const vector<bool>& propagate_down,
+                                            const vector<Blob<Dtype>*>& bottom) {
     // after forwards, diff_ holds w_in * (b0 - b1)
     int count = diff_.count();
-    
-    for(int index=0; index < count; index++)
-    {
+    for(int index=0; index < count; index++) {
          // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
         //       = sign(x)                   otherwise
         Dtype val = diff_.cpu_data()[index];
@@ -109,8 +110,7 @@ void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, con
               diff_.cpu_data(),                // x
               Dtype(0),                        // beta
               bottom[i]->mutable_cpu_diff());  // y
-            if (has_weights_) 
-            {
+            if (has_weights_) {
                 // Scale by "inside" weight
                 caffe_mul(
                     count,

From 1441ddc05a96ace7717e831f5b8e725fa18b0ff8 Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Sat, 12 Nov 2016 18:41:36 -0500
Subject: [PATCH 13/14] Update smooth_l1_loss_layer.cpp

---
 src/caffe/layers/smooth_l1_loss_layer.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/caffe/layers/smooth_l1_loss_layer.cpp b/src/caffe/layers/smooth_l1_loss_layer.cpp
index 5b038955edc..696bbee12cf 100644
--- a/src/caffe/layers/smooth_l1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_l1_loss_layer.cpp
@@ -73,7 +73,7 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
         caffe_mul(count,
           bottom[3]->cpu_data(),
           errors_.cpu_data(),
-          errors_.mutable_cpu_data());  
+          errors_.mutable_cpu_data());
           // d := w_out * SmoothL1(w_in * (b0 - b1))
     }
 
@@ -84,12 +84,13 @@ void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
 
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-                                            const vector<bool>& propagate_down,
-                                            const vector<Blob<Dtype>*>& bottom) {
+void SmoothL1LossLayer<Dtype>::Backward_cpu(
+  const vector<Blob<Dtype>*>& top,
+  const vector<bool>& propagate_down,
+  const vector<Blob<Dtype>*>& bottom) {
     // after forwards, diff_ holds w_in * (b0 - b1)
     int count = diff_.count();
-    for(int index=0; index < count; index++) {
+    for (int index = 0; index < count; index++) {
          // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
         //       = sign(x)                   otherwise
         Dtype val = diff_.cpu_data()[index];

From 31d8bcd781a6666e9d3f1cee6d3bfd9c9840fd6f Mon Sep 17 00:00:00 2001
From: Sai Prabhakar <aps.prabhakar@gmail.com>
Date: Wed, 30 Jan 2019 19:02:02 -0500
Subject: [PATCH 14/14] Update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 44b9e62c157..211cb5f66c3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+Added CPU support for *smooth_l1_loss_layer*
+
+---
 # Caffe
 
 [![Build Status](https://travis-ci.org/BVLC/caffe.svg?branch=master)](https://travis-ci.org/BVLC/caffe)