diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 988cfdf..dad9ad4 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -45,7 +45,6 @@ class ConstantFiller : public Filler<Dtype> {
   }
 };
 
-#ifdef NO_CAFFE_MOBILE
 /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
 template <typename Dtype>
 class UniformFiller : public Filler<Dtype> {
@@ -124,7 +123,6 @@ class PositiveUnitballFiller : public Filler<Dtype> {
          << "Sparsity not supported by this Filler.";
   }
 };
-#endif
 
 /**
  * @brief Fills a Blob with values @f$ x \sim U(-a, +a) @f$ where @f$ a @f$ is
@@ -167,7 +165,6 @@ class XavierFiller : public Filler<Dtype> {
   }
 };
 
-#ifdef NO_CAFFE_MOBILE
 /**
  * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
  *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
@@ -263,7 +260,6 @@ class BilinearFiller : public Filler<Dtype> {
          << "Sparsity not supported by this Filler.";
   }
 };
-#endif
 
 /**
  * @brief Get a specific filler from the specification given in FillerParameter.
@@ -276,22 +272,18 @@ Filler<Dtype>* GetFiller(const FillerParameter& param) {
   const std::string& type = param.type();
   if (type == "constant") {
     return new ConstantFiller<Dtype>(param);
-#ifdef NO_CAFFE_MOBILE
   } else if (type == "gaussian") {
     return new GaussianFiller<Dtype>(param);
   } else if (type == "positive_unitball") {
     return new PositiveUnitballFiller<Dtype>(param);
   } else if (type == "uniform") {
     return new UniformFiller<Dtype>(param);
-#endif
   } else if (type == "xavier") {
     return new XavierFiller<Dtype>(param);
-#ifdef NO_CAFFE_MOBILE
   } else if (type == "msra") {
     return new MSRAFiller<Dtype>(param);
   } else if (type == "bilinear") {
     return new BilinearFiller<Dtype>(param);
-#endif
   } else {
     CHECK(false) << "Unknown filler name: " << param.type();
   }
diff --git a/include/caffe/fmath.hpp b/include/caffe/fmath.hpp
deleted file mode 100644
index 5f0274e..0000000
--- a/include/caffe/fmath.hpp
+++ /dev/null
@@ -1,837 +0,0 @@
-#pragma once
-/**
-	@brief fast math library for float
-	@author herumi
-	@url http://homepage1.nifty.com/herumi/
-	@note modified new BSD license
-	http://opensource.org/licenses/BSD-3-Clause
-
-	cl /Ox /Ob2 /arch:SSE2 /fp:fast bench.cpp -I../xbyak /EHsc /DNOMINMAX
-	g++ -O3 -fomit-frame-pointer -fno-operator-names -march=core2 -mssse3 -mfpmath=sse -ffast-math -fexcess-precision=fast
-*/
-/*
-	function prototype list
-
-	float fmath::exp(float);
-	double fmath::expd(double);
-	float fmath::log(float);
-
-	__m128 fmath::exp_ps(__m128);
-	__m256 fmath::exp_ps256(__m256);
-	__m128 fmath::log_ps(__m128);
-
-	double fmath::expd_v(double *, size_t n);
-
-	if FMATH_USE_XBYAK is defined then Xbyak version are used
-*/
-//#define FMATH_USE_XBYAK
-
-#include <math.h>
-#include <stddef.h>
-#include <assert.h>
-#include <limits>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> // for memcpy
-#if defined(_WIN32) && !defined(__GNUC__)
-	#include <intrin.h>
-	#ifndef MIE_ALIGN
-		#define MIE_ALIGN(x) __declspec(align(x))
-	#endif
-#else
-	#ifndef __GNUC_PREREQ
-	#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
-	#endif
-	#if __GNUC_PREREQ(4, 4) || !defined(__GNUC__)
-		/* GCC >= 4.4 and non-GCC compilers */
-		#include <x86intrin.h>
-	#elif __GNUC_PREREQ(4, 1)
-		/* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */
-		#include <emmintrin.h>
-	#endif
-	#ifndef MIE_ALIGN
-		#define MIE_ALIGN(x) __attribute__((aligned(x)))
-	#endif
-#endif
-#ifndef MIE_PACK
-	#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
-#endif
-#ifdef FMATH_USE_XBYAK
-	#define XBYAK_NO_OP_NAMES
-	#include "xbyak/xbyak.h"
-	#include "xbyak/xbyak_util.h"
-#endif
-
-namespace fmath {
-
-namespace local {
-
-const size_t EXP_TABLE_SIZE = 10;
-const size_t EXPD_TABLE_SIZE = 11;
-const size_t LOG_TABLE_SIZE = 12;
-
-typedef unsigned long long uint64_t;
-
-union fi {
-	float f;
-	unsigned int i;
-};
-
-union di {
-	double d;
-	uint64_t i;
-};
-
-inline unsigned int mask(int x)
-{
-	return (1U << x) - 1;
-}
-
-inline uint64_t mask64(int x)
-{
-	return (1ULL << x) - 1;
-}
-
-template<class T>
-inline const T* cast_to(const void *p)
-{
-	return reinterpret_cast<const T*>(p);
-}
-
-template<class T, size_t N>
-size_t NumOfArray(const T (&)[N]) { return N; }
-
-/*
-	exp(88.722839f) = inf ; 0x42b17218
-	exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
-	exp(-103.972081f) = 0 ; 0xc2cff1b5
-*/
-template<size_t N = EXP_TABLE_SIZE>
-struct ExpVar {
-	enum {
-		s = N,
-		n = 1 << s,
-		f88 = 0x42b00000 /* 88.0 */
-	};
-	float minX[8];
-	float maxX[8];
-	float a[8];
-	float b[8];
-	float f1[8];
-	unsigned int i127s[8];
-	unsigned int mask_s[8];
-	unsigned int i7fffffff[8];
-	unsigned int tbl[n];
-	ExpVar()
-	{
-		float log_2 = ::logf(2.0f);
-		for (int i = 0; i < 8; i++) {
-			maxX[i] = 88;
-			minX[i] = -88;
-			a[i] = n / log_2;
-			b[i] = log_2 / n;
-			f1[i] = 1.0f;
-			i127s[i] = 127 << s;
-			i7fffffff[i] = 0x7fffffff;
-			mask_s[i] = mask(s);
-		}
-
-		for (int i = 0; i < n; i++) {
-			float y = pow(2.0f, (float)i / n);
-			fi fi;
-			fi.f = y;
-			tbl[i] = fi.i & mask(23);
-		}
-	}
-};
-
-template<size_t sbit_ = EXPD_TABLE_SIZE>
-struct ExpdVar {
-	enum {
-		sbit = sbit_,
-		s = 1UL << sbit,
-		adj = (1UL << (sbit + 10)) - (1UL << sbit)
-	};
-	// A = 1, B = 1, C = 1/2, D = 1/6
-	double C1[2]; // A
-	double C2[2]; // D
-	double C3[2]; // C/D
-	uint64_t tbl[s];
-	double a;
-	double ra;
-	ExpdVar()
-		: a(s / ::log(2.0))
-		, ra(1 / a)
-	{
-		for (int i = 0; i < 2; i++) {
-#if 0
-			C1[i] = 1.0;
-			C2[i] = 0.16667794882310216;
-			C3[i] = 2.9997969303278795;
-#else
-			C1[i] = 1.0;
-			C2[i] = 0.16666666685227835064;
-			C3[i] = 3.0000000027955394;
-#endif
-		}
-		for (int i = 0; i < s; i++) {
-			di di;
-			di.d = ::pow(2.0, i * (1.0 / s));
-			tbl[i] = di.i & mask64(52);
-		}
-	}
-};
-
-template<size_t N = LOG_TABLE_SIZE>
-struct LogVar {
-	enum {
-		LEN = N - 1
-	};
-	unsigned int m1[4]; // 0
-	unsigned int m2[4]; // 16
-	unsigned int m3[4]; // 32
-	float m4[4];		// 48
-	unsigned int m5[4]; // 64
-	struct {
-		float app;
-		float rev;
-	} tbl[1 << LEN];
-	float c_log2;
-	LogVar()
-		: c_log2(::logf(2.0f) / (1 << 23))
-	{
-		const double e = 1 / double(1 << 24);
-		const double h = 1 / double(1 << LEN);
-		const size_t n = 1U << LEN;
-		for (size_t i = 0; i < n; i++) {
-			double x = 1 + double(i) / n;
-			double a = ::log(x);
-			tbl[i].app = (float)a;
-			if (i < n - 1) {
-				double b = ::log(x + h - e);
-				tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
-			} else {
-				tbl[i].rev = (float)(1 / (x * (1 << 23)));
-			}
-		}
-		for (int i = 0; i < 4; i++) {
-			m1[i] = mask(8) << 23;
-			m2[i] = mask(LEN) << (23 - LEN);
-			m3[i] = mask(23 - LEN);
-			m4[i] = c_log2;
-			m5[i] = 127U << 23;
-		}
-	}
-};
-
-#ifdef FMATH_USE_XBYAK
-struct ExpCode : public Xbyak::CodeGenerator {
-	float (*exp_)(float);
-	__m128 (*exp_ps_)(__m128);
-	template<size_t N>
-	ExpCode(const ExpVar<N> *self)
-	{
-		Xbyak::util::Cpu cpu;
-		try {
-			makeExp(self, cpu);
-			exp_ = (float(*)(float))getCode();
-			align(16);
-			exp_ps_ = (__m128(*)(__m128))getCurr();
-			makeExpPs(self, cpu);
-			return;
-		} catch (std::exception& e) {
-			fprintf(stderr, "ExpCode ERR:%s\n", e.what());
-		} catch (...) {
-			fprintf(stderr, "ExpCode ERR:unknown error\n");
-		}
-		::exit(1);
-	}
-	template<size_t N>
-	void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu& /*cpu*/)
-	{
-		typedef ExpVar<N> Self;
-		using namespace local;
-		using namespace Xbyak;
-
-		inLocalLabel();
-#ifdef XBYAK64
-		const Reg64& base = rcx;
-		const Reg64& a = rax;
-#else
-		const Reg32& base = ecx;
-		const Reg32& a = eax;
-#endif
-
-		mov(base, (size_t)self);
-
-#ifdef XBYAK32
-		movss(xm0, ptr [esp + 4]);
-#endif
-	L(".retry");
-		movaps(xm1, xm0);
-		movd(edx, xm0);
-		mulss(xm1, ptr [base + offsetof(Self, a)]); // t
-		and_(edx, 0x7fffffff);
-		cvtss2si(eax, xm1);
-		cmp(edx, ExpVar<N>::f88);
-		jg(".overflow");
-		lea(edx, ptr [eax + (127 << self->s)]);
-		cvtsi2ss(xm1, eax);
-		and_(eax, mask(self->s)); // v
-		mov(eax, ptr [base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v]
-		shr(edx, self->s);
-		mulss(xm1, ptr [base + offsetof(Self, b)]);
-		shl(edx, 23); // u
-		subss(xm0, xm1); // t
-		or_(eax, edx); // fi.f
-		addss(xm0, ptr [base + offsetof(Self, f1)]);
-		movd(xm1, eax);
-		mulss(xm0, xm1);
-#ifdef XBYAK32
-		movss(ptr[esp + 4], xm0);
-		fld(dword[esp + 4]);
-#endif
-		ret();
-	L(".overflow");
-		minss(xm0, ptr [base + offsetof(Self, maxX)]);
-		maxss(xm0, ptr [base + offsetof(Self, minX)]);
-		jmp(".retry");
-		outLocalLabel();
-	}
-	template<size_t N>
-	void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu)
-	{
-		typedef ExpVar<N> Self;
-		using namespace local;
-		using namespace Xbyak;
-
-		inLocalLabel();
-#ifdef XBYAK64
-		const Reg64& base = rcx;
-		const Reg64& a = rax;
-		const Reg64& d = rdx;
-#else
-		const Reg32& base = ecx;
-		const Reg32& a = eax;
-		const Reg32& d = edx;
-#endif
-
-/*
-	if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try
-	minps, maxps are very slow then avoid them
-*/
-		const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
-#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
-		movaps(xm0, ptr [rcx]);
-#endif
-		mov(base, (size_t)self);
-	L(".retry");
-		movaps(xm5, xm0);
-		andps(xm5, ptr [base + offsetof(Self, i7fffffff)]);
-		movaps(xm3, ptr [base + offsetof(Self, a)]);
-		movaps(xm4, ptr [base + offsetof(Self, b)]);
-		pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]);
-		mulps(xm3, xm0);
-		movaps(xm1, ptr [base + offsetof(Self, i127s)]);
-		pmovmskb(eax, xm5);
-		movaps(xm5, ptr [base + offsetof(Self, mask_s)]);
-		cvtps2dq(xm2, xm3);
-		pand(xm5, xm2);
-		cvtdq2ps(xm3, xm2);
-		test(eax, eax);
-		jnz(".overflow");
-		paddd(xm1, xm2);
-		movd(eax, xm5);
-		mulps(xm4, xm3);
-		pextrw(edx, xm5, 2);
-		subps(xm0, xm4);
-		movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]);
-		addps(xm0, ptr [base + offsetof(Self, f1)]);
-		pextrw(eax, xm5, 4);
-		if (useSSE41) {
-			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1);
-		} else {
-			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
-			movlhps(xm4, xm3);
-		}
-		pextrw(edx, xm5, 6);
-		psrld(xm1, self->s);
-		pslld(xm1, 23);
-		if (useSSE41) {
-			pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2);
-			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3);
-		} else {
-			movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]);
-			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
-			movlhps(xm2, xm3);
-			shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
-		}
-		por(xm1, xm4);
-		mulps(xm0, xm1);
-		ret();
-	L(".overflow");
-		minps(xm0, ptr [base + offsetof(Self, maxX)]);
-		maxps(xm0, ptr [base + offsetof(Self, minX)]);
-		jmp(".retry");
-		outLocalLabel();
-	}
-};
-#endif
-
-/* to define static variables in fmath.hpp */
-template<size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE, size_t EXPD_N = EXPD_TABLE_SIZE>
-struct C {
-	static const ExpVar<EXP_N> expVar;
-	static const LogVar<LOG_N> logVar;
-	static const ExpdVar<EXPD_N> expdVar;
-#ifdef FMATH_USE_XBYAK
-	static const ExpCode& getInstance() {
-		static const ExpCode expCode(&expVar);
-		return expCode;
-	}
-#endif
-};
-
-template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
-MIE_ALIGN(32) const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;
-
-template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
-MIE_ALIGN(32) const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;
-
-template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
-MIE_ALIGN(32) const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;
-
-} // fmath::local
-
-#ifdef FMATH_USE_XBYAK
-inline float expC(float x)
-#else
-inline float exp(float x)
-#endif
-{
-	using namespace local;
-	const ExpVar<>& expVar = C<>::expVar;
-
-#if 1
-	__m128 x1 = _mm_set_ss(x);
-
-	int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
-	if (limit > ExpVar<>::f88) {
-		x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
-		x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
-	}
-
-	int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
-	unsigned int v = r & mask(expVar.s);
-	float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
-	int u = r >> expVar.s;
-	fi fi;
-	fi.i = ((u + 127) << 23) | expVar.tbl[v];
-	return (1 + t) * fi.f;
-#else
-	x = std::min(x, expVar.maxX[0]);
-	x = std::max(x, expVar.minX[0]);
-	float t = x * expVar.a[0];
-	const float magic = (1 << 23) + (1 << 22); // to round
-	t += magic;
-	fi fi;
-	fi.f = t;
-	t = x - (t - magic) * expVar.b[0];
-	int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
-	unsigned int v = fi.i & mask(expVar.s);
-	fi.i = u | expVar.tbl[v];
-	return (1 + t) * fi.f;
-//	return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n);
-#endif
-}
-
-inline double expd(double x)
-{
-	if (x <= -708.39641853226408) return 0;
-	if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity();
-	using namespace local;
-	const ExpdVar<>& c = C<>::expdVar;
-#if 1
-	const double _b = double(uint64_t(3) << 51);
-	__m128d b = _mm_load_sd(&_b);
-	__m128d xx = _mm_load_sd(&x);
-	__m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
-	uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
-	uint64_t iax = c.tbl[di & mask(c.sbit)];
-	__m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
-	uint64_t u = ((di + c.adj) >> c.sbit) << 52;
-	double t;
-	_mm_store_sd(&t, _t);
-	double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
-	double did;
-	u |= iax;
-	memcpy(&did, &u, sizeof(did));
-	return y * did;
-#else
-/*
-	remark : -ffast-math option of gcc may generate bad code for fmath::expd
-*/
-	const uint64_t b = 3ULL << 51;
-	di di;
-	di.d = x * c.a + b;
-	uint64_t iax = c.tbl[di.i & mask(c.sbit)];
-
-	double t = (di.d - b) * c.ra - x;
-	uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
-	double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
-
-	di.i = u | iax;
-	return y * di.d;
-#endif
-}
-
-// not fast
-#if 0
-inline __m128d exp_pd(__m128d x)
-{
-	using namespace local;
-	const ExpdVar<>& c = C<>::expdVar;
-	const double b = double(3ULL << 51);
-	const __m128d mC1 = *cast_to<__m128d>(c.C1);
-	const __m128d mC2 = *cast_to<__m128d>(c.C2);
-	const __m128d mC3 = *cast_to<__m128d>(c.C3);
-	const __m128d ma = _mm_set1_pd(c.a);
-	const __m128d mra = _mm_set1_pd(c.ra);
-	const __m128i madj = _mm_set1_epi32(c.adj);
-	MIE_ALIGN(16) const double expMax[2] = { 709.78271289338397, 709.78271289338397 };
-	MIE_ALIGN(16) const double expMin[2] = { -708.39641853226408, -708.39641853226408 };
-	x = _mm_min_pd(x, *(const __m128d*)expMax);
-	x = _mm_max_pd(x, *(const __m128d*)expMin);
-
-	__m128d d = _mm_mul_pd(x, ma);
-	d = _mm_add_pd(d, _mm_set1_pd(b));
-	int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
-	int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
-__m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
-	__m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
-	iax = _mm_unpacklo_epi64(iaxL, iax);
-
-	__m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
-	__m128i u = _mm_castpd_si128(d);
-	u = _mm_add_epi64(u, madj);
-	u = _mm_srli_epi64(u, c.sbit);
-	u = _mm_slli_epi64(u, 52);
-	u = _mm_or_si128(u, iax);
-	__m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
-	y = _mm_mul_pd(y, mC2);
-	y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
-	y = _mm_mul_pd(y, _mm_castsi128_pd(u));
-	return y;
-}
-#endif
-
-inline void expd_v(double *px, size_t n)
-{
-	using namespace local;
-	const ExpdVar<>& c = C<>::expdVar;
-	const double b = double(3ULL << 51);
-#ifdef __AVX2__
-	assert((n % 4) == 0);
-	const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
-	const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
-	const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
-	const __m256d ma = _mm256_set1_pd(c.a);
-	const __m256d mra = _mm256_set1_pd(c.ra);
-	const __m256i madj = _mm256_set1_epi64x(c.adj);
-	const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
-	const __m256d expMax = _mm256_set1_pd(709.78272569338397);
-	const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
-	for (size_t i = 0; i < n; i += 4) {
-		__m256d x = _mm256_load_pd(px);
-		x = _mm256_min_pd(x, expMax);
-		x = _mm256_max_pd(x, expMin);
-
-		__m256d d = _mm256_mul_pd(x, ma);
-		d = _mm256_add_pd(d, _mm256_set1_pd(b));
-		__m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
-		__m256i iax = _mm256_i64gather_epi64((const long long*)c.tbl, adr, 8);
-		__m256d t = _mm256_sub_pd(_mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
-		__m256i u = _mm256_castpd_si256(d);
-		u = _mm256_add_epi64(u, madj);
-		u = _mm256_srli_epi64(u, c.sbit);
-		u = _mm256_slli_epi64(u, 52);
-		u = _mm256_or_si256(u, iax);
-		__m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
-		y = _mm256_mul_pd(y, mC2);
-		y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
-		_mm256_store_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
-		px += 4;
-	}
-#else
-	assert((n % 2) == 0);
-	const __m128d mC1 = _mm_set1_pd(c.C1[0]);
-	const __m128d mC2 = _mm_set1_pd(c.C2[0]);
-	const __m128d mC3 = _mm_set1_pd(c.C3[0]);
-	const __m128d ma = _mm_set1_pd(c.a);
-	const __m128d mra = _mm_set1_pd(c.ra);
-#if defined(__x86_64__) || defined(_WIN64)
-	const __m128i madj = _mm_set1_epi64x(c.adj);
-#else
-	const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
-#endif
-	const __m128d expMax = _mm_set1_pd(709.78272569338397);
-	const __m128d expMin = _mm_set1_pd(-708.39641853226408);
-	for (size_t i = 0; i < n; i += 2) {
-		__m128d x = _mm_load_pd(px);
-		x = _mm_min_pd(x, expMax);
-		x = _mm_max_pd(x, expMin);
-
-		__m128d d = _mm_mul_pd(x, ma);
-		d = _mm_add_pd(d, _mm_set1_pd(b));
-		int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
-		int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
-
-		__m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
-		__m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
-		iax = _mm_unpacklo_epi64(iaxL, iax);
-
-		__m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
-		__m128i u = _mm_castpd_si128(d);
-		u = _mm_add_epi64(u, madj);
-		u = _mm_srli_epi64(u, c.sbit);
-		u = _mm_slli_epi64(u, 52);
-		u = _mm_or_si128(u, iax);
-		__m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
-		y = _mm_mul_pd(y, mC2);
-		y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
-		_mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
-		px += 2;
-	}
-#endif
-}
-
-#ifdef FMATH_USE_XBYAK
-inline __m128 exp_psC(__m128 x)
-#else
-inline __m128 exp_ps(__m128 x)
-#endif
-{
-	using namespace local;
-	const ExpVar<>& expVar = C<>::expVar;
-
-	__m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
-	int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
-	if (over) {
-		x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
-		x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
-	}
-
-	__m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
-	__m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
-	t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));
-
-	__m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
-	__m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
-	u4 = _mm_srli_epi32(u4, expVar.s);
-	u4 = _mm_slli_epi32(u4, 23);
-
-#ifdef __AVX2__ // fast?
-	__m128i ti = _mm_i32gather_epi32((const int*)expVar.tbl, v4, 4);
-	__m128 t0 = _mm_castsi128_ps(ti);
-#else
-	unsigned int v0, v1, v2, v3;
-	v0 = _mm_cvtsi128_si32(v4);
-	v1 = _mm_extract_epi16(v4, 2);
-	v2 = _mm_extract_epi16(v4, 4);
-	v3 = _mm_extract_epi16(v4, 6);
-#if 1
-	__m128 t0, t1, t2, t3;
-
-	t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
-	t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
-	t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
-	t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));
-
-	t1 = _mm_movelh_ps(t1, t3);
-	t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
-	t0 = _mm_movelh_ps(t0, t2);
-	t0 = _mm_or_ps(t0, t1);
-#else
-	__m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0]));
-	ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
-	ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
-	ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
-	__m128 t0 = _mm_castsi128_ps(ti);
-#endif
-#endif
-	t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
-
-	t = _mm_mul_ps(t, t0);
-
-	return t;
-}
-#ifdef __AVX2__
-inline __m256 exp_ps256(__m256 x)
-{
-	using namespace local;
-	const ExpVar<>& expVar = C<>::expVar;
-
-	__m256i limit = _mm256_castps_si256(_mm256_and_ps(x, *(const __m256*)expVar.i7fffffff));
-	int over = _mm256_movemask_epi8(_mm256_cmpgt_epi32(limit, *(const __m256i*)expVar.maxX));
-	if (over) {
-		x = _mm256_min_ps(x, _mm256_load_ps(expVar.maxX));
-		x = _mm256_max_ps(x, _mm256_load_ps(expVar.minX));
-	}
-	__m256i r = _mm256_cvtps_epi32(_mm256_mul_ps(x, *(const __m256*)expVar.a));
-	__m256 t = _mm256_sub_ps(x, _mm256_mul_ps(_mm256_cvtepi32_ps(r), *(const __m256*)expVar.b));
-	t = _mm256_add_ps(t, *(const __m256*)expVar.f1);
-	__m256i v8 = _mm256_and_si256(r, *(const __m256i*)expVar.mask_s);
-	__m256i u8 = _mm256_add_epi32(r, *(const __m256i*)expVar.i127s);
-	u8 = _mm256_srli_epi32(u8, expVar.s);
-	u8 = _mm256_slli_epi32(u8, 23);
-#if 1
-	__m256i ti = _mm256_i32gather_epi32((const int*)expVar.tbl, v8, 4);
-#else
-	unsigned int v0, v1, v2, v3, v4, v5, v6, v7;
-	v0 = _mm256_extract_epi16(v8, 0);
-	v1 = _mm256_extract_epi16(v8, 2);
-	v2 = _mm256_extract_epi16(v8, 4);
-	v3 = _mm256_extract_epi16(v8, 6);
-	v4 = _mm256_extract_epi16(v8, 8);
-	v5 = _mm256_extract_epi16(v8, 10);
-	v6 = _mm256_extract_epi16(v8, 12);
-	v7 = _mm256_extract_epi16(v8, 14);
-	__m256i ti = _mm256_setzero_si256();
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v0], 0);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v1], 1);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v2], 2);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v3], 3);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v4], 4);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v5], 5);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v6], 6);
-	ti = _mm256_insert_epi32(ti, expVar.tbl[v7], 7);
-#endif
-	__m256 t0 = _mm256_castsi256_ps(ti);
-	t0 = _mm256_or_ps(t0, _mm256_castsi256_ps(u8));
-	t = _mm256_mul_ps(t, t0);
-	return t;
-}
-#endif
-
-inline float log(float x)
-{
-	using namespace local;
-	const LogVar<>& logVar = C<>::logVar;
-	const size_t logLen = logVar.LEN;
-
-	fi fi;
-	fi.f = x;
-	int a = fi.i & (mask(8) << 23);
-	unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
-	unsigned int b2 = fi.i & mask(23 - logLen);
-	int idx = b1 >> (23 - logLen);
-	float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app + float(b2) * logVar.tbl[idx].rev;
-	return f;
-}
-
-inline __m128 log_ps(__m128 x)
-{
-	using namespace local;
-	const LogVar<>& logVar = C<>::logVar;
-
-	__m128i xi = _mm_castps_si128(x);
-	__m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)), (23 - logVar.LEN));
-	__m128 a  = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)), *cast_to<__m128i>(logVar.m5)));
-	__m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));
-
-	a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2
-
-	unsigned int i0 = _mm_cvtsi128_si32(idx);
-
-#if 1
-	unsigned int i1 = _mm_extract_epi16(idx, 2);
-	unsigned int i2 = _mm_extract_epi16(idx, 4);
-	unsigned int i3 = _mm_extract_epi16(idx, 6);
-#else
-	idx = _mm_srli_si128(idx, 4);
-	unsigned int i1 = _mm_cvtsi128_si32(idx);
-
-	idx = _mm_srli_si128(idx, 4);
-	unsigned int i2 = _mm_cvtsi128_si32(idx);
-
-	idx = _mm_srli_si128(idx, 4);
-	unsigned int i3 = _mm_cvtsi128_si32(idx);
-#endif
-
-	__m128 app, rev;
-	__m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
-	__m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
-	__m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
-	L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
-	H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
-	rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
-	app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
-	rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));
-
-	a = _mm_add_ps(a, app);
-	rev = _mm_mul_ps(b2, rev);
-	return _mm_add_ps(a, rev);
-}
-
-#ifndef __CYGWIN__
-// cygwin defines log2() in global namespace!
-// log2(x) = log(x) / log(2)
-inline float log2(float x) { return fmath::log(x) * 1.442695f; }
-#endif
-
-/*
-	for given y > 0
-	get f_y(x) := pow(x, y) for x >= 0
-*/
-class PowGenerator {
-	enum {
-		N = 11
-	};
-	float tbl0_[256];
-	struct {
-		float app;
-		float rev;
-	} tbl1_[1 << N];
-public:
-	PowGenerator(float y)
-	{
-		for (int i = 0; i < 256; i++) {
-			tbl0_[i] = ::powf(2, (i - 127) * y);
-		}
-		const double e = 1 / double(1 << 24);
-		const double h = 1 / double(1 << N);
-		const size_t n = 1U << N;
-		for (size_t i = 0; i < n; i++) {
-			double x = 1 + double(i) / n;
-			double a = ::pow(x, (double)y);
-			tbl1_[i].app = (float)a;
-			double b = ::pow(x + h - e, (double)y);
-			tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
-		}
-	}
-	float get(float x) const
-	{
-		using namespace local;
-		fi fi;
-		fi.f = x;
-		int a = (fi.i >> 23) & mask(8);
-		unsigned int b = fi.i & mask(23);
-		unsigned int b1 = b & (mask(N) << (23 - N));
-		unsigned int b2 = b & mask(23 - N);
-		float f;
-		int idx = b1 >> (23 - N);
-		f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
-		return f;
-	}
-};
-
-// for Xbyak version
-#ifdef FMATH_USE_XBYAK
-float (*const exp)(float) = local::C<>::getInstance().exp_;
-__m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_;
-#endif
-
-// exp2(x) = pow(2, x)
-inline float exp2(float x) { return fmath::exp(x * 0.6931472f); }
-
-} // fmath
diff --git a/patch/include/caffe/filler.hpp.patch b/patch/include/caffe/filler.hpp.patch
deleted file mode 100644
index f1f3c4a..0000000
--- a/patch/include/caffe/filler.hpp.patch
+++ /dev/null
@@ -1,57 +0,0 @@
---- caffe/include/caffe/filler.hpp	2017-01-27 09:51:55.350123200 +0800
-+++ ../include/caffe/filler.hpp	2017-02-04 22:35:34.594153197 +0800
-@@ -45,6 +45,7 @@
-   }
- };
- 
-+#ifdef NO_CAFFE_MOBILE
- /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
- template <typename Dtype>
- class UniformFiller : public Filler<Dtype> {
-@@ -123,6 +124,7 @@
-          << "Sparsity not supported by this Filler.";
-   }
- };
-+#endif
- 
- /**
-  * @brief Fills a Blob with values @f$ x \sim U(-a, +a) @f$ where @f$ a @f$ is
-@@ -165,6 +167,7 @@
-   }
- };
- 
-+#ifdef NO_CAFFE_MOBILE
- /**
-  * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
-  *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
-@@ -260,6 +263,7 @@
-          << "Sparsity not supported by this Filler.";
-   }
- };
-+#endif
- 
- /**
-  * @brief Get a specific filler from the specification given in FillerParameter.
-@@ -272,18 +276,22 @@
-   const std::string& type = param.type();
-   if (type == "constant") {
-     return new ConstantFiller<Dtype>(param);
-+#ifdef NO_CAFFE_MOBILE
-   } else if (type == "gaussian") {
-     return new GaussianFiller<Dtype>(param);
-   } else if (type == "positive_unitball") {
-     return new PositiveUnitballFiller<Dtype>(param);
-   } else if (type == "uniform") {
-     return new UniformFiller<Dtype>(param);
-+#endif
-   } else if (type == "xavier") {
-     return new XavierFiller<Dtype>(param);
-+#ifdef NO_CAFFE_MOBILE
-   } else if (type == "msra") {
-     return new MSRAFiller<Dtype>(param);
-   } else if (type == "bilinear") {
-     return new BilinearFiller<Dtype>(param);
-+#endif
-   } else {
-     CHECK(false) << "Unknown filler name: " << param.type();
-   }
diff --git a/patch/src/caffe/common.cpp.patch b/patch/src/caffe/common.cpp.patch
index 7065a3d..416ec44 100644
--- a/patch/src/caffe/common.cpp.patch
+++ b/patch/src/caffe/common.cpp.patch
@@ -1,14 +1,16 @@
 --- caffe/src/caffe/common.cpp	2017-01-27 09:51:56.283123700 +0800
-+++ ../src/caffe/common.cpp	2017-02-04 22:35:34.654099433 +0800
-@@ -1,5 +1,6 @@
++++ ../src/caffe/common.cpp	2017-02-04 23:04:01.367500325 +0800
+@@ -1,5 +1,9 @@
 +#ifdef USE_BOOST
  #include <boost/thread.hpp>
--#include <glog/logging.h>
++#endif
++#ifdef USE_GLOG
+ #include <glog/logging.h>
 +#endif
  #include <cmath>
  #include <cstdio>
  #include <ctime>
-@@ -7,16 +8,36 @@
+@@ -7,16 +11,36 @@
  #include "caffe/common.hpp"
  #include "caffe/util/rng.hpp"
  
@@ -45,7 +47,7 @@
  }
  
  // random seeding
-@@ -41,12 +62,16 @@
+@@ -41,12 +65,16 @@
  
  
  void GlobalInit(int* pargc, char*** pargv) {
diff --git a/patch/src/caffe/layers/dropout_layer.cpp.patch b/patch/src/caffe/layers/dropout_layer.cpp.patch
deleted file mode 100644
index 623ae72..0000000
--- a/patch/src/caffe/layers/dropout_layer.cpp.patch
+++ /dev/null
@@ -1,16 +0,0 @@
---- caffe/src/caffe/layers/dropout_layer.cpp	2017-01-27 09:51:56.561122800 +0800
-+++ ../src/caffe/layers/dropout_layer.cpp	2017-02-04 22:35:34.654099433 +0800
-@@ -35,11 +35,13 @@
-   unsigned int* mask = rand_vec_.mutable_cpu_data();
-   const int count = bottom[0]->count();
-   if (this->phase_ == TRAIN) {
-+#ifdef NO_CAFFE_MOBILE
-     // Create random numbers
-     caffe_rng_bernoulli(count, 1. - threshold_, mask);
-     for (int i = 0; i < count; ++i) {
-       top_data[i] = bottom_data[i] * mask[i] * scale_;
-     }
-+#endif
-   } else {
-     caffe_copy(bottom[0]->count(), bottom_data, top_data);
-   }
diff --git a/patch/src/caffe/net.cpp.patch b/patch/src/caffe/net.cpp.patch
index c677d81..f9f70b0 100644
--- a/patch/src/caffe/net.cpp.patch
+++ b/patch/src/caffe/net.cpp.patch
@@ -1,5 +1,5 @@
 --- caffe/src/caffe/net.cpp	2017-01-27 09:51:56.935642700 +0800
-+++ ../src/caffe/net.cpp	2017-02-04 22:35:34.654099433 +0800
++++ ../src/caffe/net.cpp	2017-02-04 23:04:40.954733321 +0800
 @@ -5,19 +5,27 @@
  #include <utility>
  #include <vector>
@@ -390,33 +390,3 @@
  
  template <typename Dtype>
  void Net<Dtype>::ClearParamDiffs() {
-@@ -981,4 +1052,29 @@
- 
- INSTANTIATE_CLASS(Net);
- 
-+#if 0
-+// FIXME need?
-+/* force register */
-+#define FORCE_REG(type) \
-+	extern LayerRegistry<float> g_creator_f_##type; \
-+	extern LayerRegistry<double> g_creator_d_##type; \
-+	LayerRegistry<float> *__g_creator_f_##type = &g_creator_f_##type; \
-+	LayerRegistry<double> *__g_creator_d_##type = &g_creator_d_##type
-+
-+FORCE_REG(TanH);
-+FORCE_REG(Pooling);
-+FORCE_REG(ReLU);
-+FORCE_REG(Sigmoid);
-+FORCE_REG(Softmax);
-+FORCE_REG(Convolution);
-+
-+FORCE_REG(Concat);
-+FORCE_REG(BNLL);
-+FORCE_REG(Flatten);
-+FORCE_REG(InnerProduct);
-+FORCE_REG(LRN);
-+FORCE_REG(MemoryData);
-+
-+FORCE_REG(Split);
-+#endif
- }  // namespace caffe
diff --git a/patch/src/caffe/util/benchmark.cpp.patch b/patch/src/caffe/util/benchmark.cpp.patch
index 8169296..fbdc08c 100644
--- a/patch/src/caffe/util/benchmark.cpp.patch
+++ b/patch/src/caffe/util/benchmark.cpp.patch
@@ -1,24 +1,13 @@
 --- caffe/src/caffe/util/benchmark.cpp	2017-01-27 09:51:57.449642800 +0800
-+++ ../src/caffe/util/benchmark.cpp	2017-02-04 22:35:34.654099433 +0800
-@@ -1,10 +1,17 @@
++++ ../src/caffe/util/benchmark.cpp	2017-02-04 23:00:28.602281548 +0800
+@@ -1,4 +1,6 @@
 +#ifdef USE_BOOST
  #include <boost/date_time/posix_time/posix_time.hpp>
 +#endif
  
  #include "caffe/common.hpp"
  #include "caffe/util/benchmark.hpp"
- 
- namespace caffe {
- 
-+static float time_diff_ms(struct timeval start, struct timeval stop) {
-+    return (stop.tv_sec - start.tv_sec)*1000
-+        + (float)(stop.tv_usec - start.tv_usec)/1000;
-+}
-+
- Timer::Timer()
-     : initted_(false),
-       running_(false),
-@@ -32,7 +39,11 @@
+@@ -32,7 +34,11 @@
        NO_GPU;
  #endif
      } else {
@@ -30,7 +19,7 @@
      }
      running_ = true;
      has_run_at_least_once_ = true;
-@@ -48,7 +59,11 @@
+@@ -48,7 +54,11 @@
        NO_GPU;
  #endif
      } else {
@@ -42,7 +31,7 @@
      }
      running_ = false;
    }
-@@ -74,7 +89,12 @@
+@@ -74,7 +84,12 @@
        NO_GPU;
  #endif
    } else {
@@ -55,7 +44,7 @@
    }
    return elapsed_microseconds_;
  }
-@@ -96,7 +116,12 @@
+@@ -96,7 +111,12 @@
        NO_GPU;
  #endif
    } else {
@@ -68,7 +57,7 @@
    }
    return elapsed_milliseconds_;
  }
-@@ -127,7 +152,11 @@
+@@ -127,7 +147,11 @@
  
  void CPUTimer::Start() {
    if (!running()) {
@@ -80,7 +69,7 @@
      this->running_ = true;
      this->has_run_at_least_once_ = true;
    }
-@@ -135,7 +164,11 @@
+@@ -135,7 +159,11 @@
  
  void CPUTimer::Stop() {
    if (running()) {
@@ -92,7 +81,7 @@
      this->running_ = false;
    }
  }
-@@ -148,8 +181,13 @@
+@@ -148,8 +176,13 @@
    if (running()) {
      Stop();
    }
@@ -106,7 +95,7 @@
    return this->elapsed_milliseconds_;
  }
  
-@@ -161,8 +199,13 @@
+@@ -161,8 +194,13 @@
    if (running()) {
      Stop();
    }
diff --git a/patch/src/caffe/util/io.cpp.patch b/patch/src/caffe/util/io.cpp.patch
index bca071f..2da7aea 100644
--- a/patch/src/caffe/util/io.cpp.patch
+++ b/patch/src/caffe/util/io.cpp.patch
@@ -1,5 +1,5 @@
 --- caffe/src/caffe/util/io.cpp	2017-01-27 09:51:57.498643500 +0800
-+++ ../src/caffe/util/io.cpp	2017-02-04 22:35:34.654099433 +0800
++++ ../src/caffe/util/io.cpp	2017-02-04 23:01:28.591786478 +0800
 @@ -18,6 +18,9 @@
  #include "caffe/common.hpp"
  #include "caffe/proto/caffe.pb.h"
@@ -10,12 +10,3 @@
  
  const int kProtoReadBytesLimit = INT_MAX;  // Max size of 2 GB minus 1 byte.
  
-@@ -42,7 +45,7 @@
- }
- 
- void WriteProtoToTextFile(const Message& proto, const char* filename) {
--  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-+  int fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0644);
-   FileOutputStream* output = new FileOutputStream(fd);
-   CHECK(google::protobuf::TextFormat::Print(proto, output));
-   delete output;
diff --git a/patch/src/caffe/util/math_functions.cpp.patch b/patch/src/caffe/util/math_functions.cpp.patch
index 1d35c80..8b6ba4e 100644
--- a/patch/src/caffe/util/math_functions.cpp.patch
+++ b/patch/src/caffe/util/math_functions.cpp.patch
@@ -1,5 +1,5 @@
 --- caffe/src/caffe/util/math_functions.cpp	2017-01-27 09:51:57.502642700 +0800
-+++ ../src/caffe/util/math_functions.cpp	2017-02-04 22:35:34.654099433 +0800
++++ ../src/caffe/util/math_functions.cpp	2017-02-04 23:03:03.164175136 +0800
 @@ -1,5 +1,7 @@
 +#ifdef USE_BOOST
  #include <boost/math/special_functions/next.hpp>
@@ -8,29 +8,21 @@
  
  #include <limits>
  
-@@ -42,7 +44,7 @@
- void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-     const int N, const double alpha, const double* A, const double* x,
-     const double beta, double* y) {
--  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
-+ cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
- }
- 
- template <>
-@@ -232,8 +234,12 @@
+@@ -232,8 +234,13 @@
  
  template <typename Dtype>
  Dtype caffe_nextafter(const Dtype b) {
 +#ifdef USE_BOOST
    return boost::math::nextafter<Dtype>(
        b, std::numeric_limits<Dtype>::max());
-+#endif
++#else
 +  return std::nextafter(
 +      b, std::numeric_limits<Dtype>::max());
++#endif
  }
  
  template
-@@ -247,12 +253,19 @@
+@@ -247,12 +254,19 @@
    CHECK_GE(n, 0);
    CHECK(r);
    CHECK_LE(a, b);
@@ -50,7 +42,7 @@
  }
  
  template
-@@ -269,12 +282,19 @@
+@@ -269,12 +283,19 @@
    CHECK_GE(n, 0);
    CHECK(r);
    CHECK_GT(sigma, 0);
@@ -70,7 +62,7 @@
  }
  
  template
-@@ -291,12 +311,19 @@
+@@ -291,12 +312,19 @@
    CHECK(r);
    CHECK_GE(p, 0);
    CHECK_LE(p, 1);
@@ -90,7 +82,7 @@
  }
  
  template
-@@ -311,12 +338,19 @@
+@@ -311,12 +339,19 @@
    CHECK(r);
    CHECK_GE(p, 0);
    CHECK_LE(p, 1);
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index b1fcb33..4856192 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -1,6 +1,9 @@
 #ifdef USE_BOOST
 #include <boost/thread.hpp>
 #endif
+#ifdef USE_GLOG
+#include <glog/logging.h>
+#endif
 #include <cmath>
 #include <cstdio>
 #include <ctime>
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index d323dbe..533ab26 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -35,13 +35,11 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   unsigned int* mask = rand_vec_.mutable_cpu_data();
   const int count = bottom[0]->count();
   if (this->phase_ == TRAIN) {
-#ifdef NO_CAFFE_MOBILE
     // Create random numbers
     caffe_rng_bernoulli(count, 1. - threshold_, mask);
     for (int i = 0; i < count; ++i) {
       top_data[i] = bottom_data[i] * mask[i] * scale_;
     }
-#endif
   } else {
     caffe_copy(bottom[0]->count(), bottom_data, top_data);
   }
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 4a9c960..e904ac1 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -1052,29 +1052,4 @@ const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
 
 INSTANTIATE_CLASS(Net);
 
-#if 0
-// FIXME need?
-/* force register */
-#define FORCE_REG(type) \
-	extern LayerRegistry<float> g_creator_f_##type; \
-	extern LayerRegistry<double> g_creator_d_##type; \
-	LayerRegistry<float> *__g_creator_f_##type = &g_creator_f_##type; \
-	LayerRegistry<double> *__g_creator_d_##type = &g_creator_d_##type
-
-FORCE_REG(TanH);
-FORCE_REG(Pooling);
-FORCE_REG(ReLU);
-FORCE_REG(Sigmoid);
-FORCE_REG(Softmax);
-FORCE_REG(Convolution);
-
-FORCE_REG(Concat);
-FORCE_REG(BNLL);
-FORCE_REG(Flatten);
-FORCE_REG(InnerProduct);
-FORCE_REG(LRN);
-FORCE_REG(MemoryData);
-
-FORCE_REG(Split);
-#endif
 }  // namespace caffe
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 9de960c..8f46c88 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -7,11 +7,6 @@
 
 namespace caffe {
 
-static float time_diff_ms(struct timeval start, struct timeval stop) {
-    return (stop.tv_sec - start.tv_sec)*1000
-        + (float)(stop.tv_usec - start.tv_usec)/1000;
-}
-
 Timer::Timer()
     : initted_(false),
       running_(false),
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 17a74af..e65fd4c 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -45,7 +45,7 @@ bool ReadProtoFromTextFile(const char* filename, Message* proto) {
 }
 
 void WriteProtoToTextFile(const Message& proto, const char* filename) {
-  int fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
   FileOutputStream* output = new FileOutputStream(fd);
   CHECK(google::protobuf::TextFormat::Print(proto, output));
   delete output;
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 4a6af28..77a0cd9 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -44,7 +44,7 @@ template <>
 void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const double alpha, const double* A, const double* x,
     const double beta, double* y) {
- cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
 template <>
@@ -237,9 +237,10 @@ Dtype caffe_nextafter(const Dtype b) {
 #ifdef USE_BOOST
   return boost::math::nextafter<Dtype>(
       b, std::numeric_limits<Dtype>::max());
-#endif
+#else
   return std::nextafter(
       b, std::numeric_limits<Dtype>::max());
+#endif
 }
 
 template